├── src
│   └── pyphm
│       ├── __init__.py
│       └── datasets
│           ├── __init__.py
│           ├── auxilary_metadata
│           │   ├── __init__.py
│           │   ├── milling_labels_with_tool_class.csv
│           │   └── airbus_dfvalid_groundtruth.csv
│           ├── pyphm.py
│           ├── airbus.py
│           ├── ims.py
│           ├── milling.py
│           └── utils.py
├── tests
│   ├── integration
│   │   ├── __init__.py
│   │   ├── fixtures
│   │   │   ├── milling
│   │   │   │   ├── mill.mat
│   │   │   │   ├── milling_truncated_results.csv.gz
│   │   │   │   └── milling_labels_with_tool_class_truncated.csv
│   │   │   └── ims
│   │   │       ├── ims_truncated_results.csv.gz
│   │   │       └── 1st_test
│   │   │           ├── 2003.10.22.12.06.24
│   │   │           └── 2003.10.22.12.09.13
│   │   ├── test_integration_ims.py
│   │   └── test_integration_milling.py
│   └── conftest.py
├── notebooks
│   ├── scratch
│   │   ├── test._mill.ipynb
│   │   ├── test.py
│   │   ├── get_hash.ipynb
│   │   ├── import_package_resources.ipynb
│   │   ├── milling_examp.ipynb
│   │   ├── test.ipynb
│   │   ├── airbus_download.ipynb
│   │   └── ims_download.ipynb
│   └── images
│       ├── logo.png
│       ├── vae.png
│       ├── cut_signals.png
│       ├── flank_wear.png
│       ├── thresholds.png
│       ├── violin_plot.png
│       ├── face_milling.png
│       ├── simple_trend.png
│       ├── latent_space_cnc.png
│       ├── trend_spash_image.png
│       ├── vae_training_step3.jpg
│       ├── vae_training_random_search.png
│       ├── prauc_params_cnc.svg
│       ├── logo.svg
│       └── prauc_cnc.svg
├── .gitattributes
├── requirements.txt
├── setup.py
├── env_pyphm.yml
├── .github
│   └── workflows
│       └── main.yml
├── pyproject.toml
├── LICENSE
├── setup.cfg
├── README.md
├── .gitignore
└── references
    └── sources.bib
/src/pyphm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/integration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/notebooks/scratch/test._mill.ipynb:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/auxilary_metadata/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-documentation
2 |
--------------------------------------------------------------------------------
/notebooks/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/logo.png
--------------------------------------------------------------------------------
/notebooks/images/vae.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae.png
--------------------------------------------------------------------------------
/notebooks/images/cut_signals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/cut_signals.png
--------------------------------------------------------------------------------
/notebooks/images/flank_wear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/flank_wear.png
--------------------------------------------------------------------------------
/notebooks/images/thresholds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/thresholds.png
--------------------------------------------------------------------------------
/notebooks/images/violin_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/violin_plot.png
--------------------------------------------------------------------------------
/notebooks/images/face_milling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/face_milling.png
--------------------------------------------------------------------------------
/notebooks/images/simple_trend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/simple_trend.png
--------------------------------------------------------------------------------
/notebooks/images/latent_space_cnc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/latent_space_cnc.png
--------------------------------------------------------------------------------
/notebooks/images/trend_spash_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/trend_spash_image.png
--------------------------------------------------------------------------------
/notebooks/images/vae_training_step3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae_training_step3.jpg
--------------------------------------------------------------------------------
/tests/integration/fixtures/milling/mill.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/milling/mill.mat
--------------------------------------------------------------------------------
/notebooks/images/vae_training_random_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae_training_random_search.png
--------------------------------------------------------------------------------
/tests/integration/fixtures/ims/ims_truncated_results.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/ims/ims_truncated_results.csv.gz
--------------------------------------------------------------------------------
/tests/integration/fixtures/milling/milling_truncated_results.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/milling/milling_truncated_results.csv.gz
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # local package
2 | -e .
3 |
4 | # external requirements
5 | pandas
6 | numpy
7 | py7zr
8 | rarfile
9 | tqdm
10 | scipy
11 | requests
12 | h5py
13 | tables
14 | gdown
--------------------------------------------------------------------------------
/tests/integration/fixtures/milling/milling_labels_with_tool_class_truncated.csv:
--------------------------------------------------------------------------------
1 | case,run,VB,time,DOC,feed,material,cut_no,tool_class,window_start,window_end
2 | 1,1,0,2,1.5,0.5,1,0,0,2496,6976
3 | 1,2,,4,1.5,0.5,1,1,0,2496,6976
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | if __name__ == "__main__":
4 | setuptools.setup()
5 |
6 | # from setuptools import setup, find_packages
7 |
8 | # setup(
9 | # name="pyphm",
10 | # version="0.1.0",
11 | # packages=find_packages(),
12 | # )
13 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """
2 | Dummy conftest.py for pyphm.
3 |
4 | If you don't know what this is for, just leave it empty.
5 | Read more about conftest.py under:
6 | - https://docs.pytest.org/en/stable/fixture.html
7 | - https://docs.pytest.org/en/stable/writing_plugins.html
8 | """
9 |
10 | # import pytest
11 |
--------------------------------------------------------------------------------
/env_pyphm.yml:
--------------------------------------------------------------------------------
1 | name: pyphm
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python=3.11
6 | - conda
7 | - mamba
8 | - jupyterlab
9 | - ipywidgets
10 | - scipy
11 | - matplotlib
12 | - seaborn
13 | - pandas
14 | - scikit-learn
15 | - py7zr
16 | - rarfile
17 | - pytables
18 | - requests
19 | - gdown=4.6.0
20 |
21 |
22 |
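 23 | # Note (added for clarity): create this environment with `conda env create -f env_pyphm.yml`,
 24 | # then activate it with `conda activate pyphm`.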
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI/CD
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 |
11 | jobs:
12 | build:
13 | runs-on: ubuntu-latest
14 |
15 | strategy:
16 | matrix:
17 | python-version: [3.7, 3.8, 3.9]
18 |
19 | steps:
20 | - uses: actions/checkout@v2
21 |
 22 |       - name: Set up Python ${{ matrix.python-version }}
23 | uses: actions/setup-python@v2
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 | architecture: x64
27 |
28 | - name: Install dependencies
29 | run: pip install -r requirements.txt
30 |
31 | - name: Run Tests
32 | run: python -m unittest discover -s tests
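 33 |
 34 |       # Note (added for clarity): `pip install -r requirements.txt` also installs the
 35 |       # package itself in editable mode (the `-e .` line), which is what makes `pyphm`
 36 |       # importable during the unittest discovery above.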
--------------------------------------------------------------------------------
/src/pyphm/datasets/pyphm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from pathlib import Path
4 | from typing import Any, Callable, List, Optional, Tuple
5 | from .utils import download_and_extract_archive, extract_archive, check_integrity
6 |
7 |
8 | class PHMDataset:
9 | """
10 | Base class for making PyPHM data sets.
11 |
12 | Args:
13 | root (string): Root directory to place all the data sets.
14 |
15 | dataset_folder_name (string): Name of folder containing raw data.
16 | This folder will be created in the root directory if not present.
17 |
18 | """
19 |
20 | def __init__(
21 | self,
22 | root: Path,
23 | dataset_folder_name: str,
24 | ) -> None:
25 |
26 | self.root = Path(root)
27 | self.dataset_folder_name = dataset_folder_name
28 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pyphm"
3 | version = "0.0.5"
4 | description = "Machinery data, made easy"
5 | requires-python = ">=3.6"
6 | readme = "README.md"
7 | authors = [
8 | { name = "Tim von Hahn", email = "t.vonhahn@queensu.ca" },
9 | ]
10 | classifiers = [
11 | "Programming Language :: Python :: 3",
12 | "License :: OSI Approved :: MIT License",
13 | "Operating System :: OS Independent",
14 | ]
15 | dependencies = [
16 | "pandas",
17 | "numpy",
18 | "py7zr",
19 | "rarfile",
20 | "tqdm",
21 | "scipy",
22 | "requests",
23 | "h5py",
24 | "tables",
25 | "gdown"
26 | ]
27 |
28 | [project.urls]
29 | Homepage = "https://github.com/tvhahn/PyPHM"
30 | Repository = "https://github.com/tvhahn/PyPHM"
31 | Documentation = "https://github.com/tvhahn/PyPHM"
32 |
33 | [project.optional-dependencies]
34 | doc = ["sphinx~=4.4.0", "myst-parser"]
35 |
--------------------------------------------------------------------------------
/notebooks/scratch/test.py:
--------------------------------------------------------------------------------
1 | resources = [
2 | {
3 | "name": "aws",
4 | "url": "https://phm-datasets.s3.amazonaws.com/NASA/",
5 | "files": [
6 | {
7 | "filename": "3.+Milling.zip",
8 | "md5": "4da3afb0aa50cb3dcdd8e20ed1ed1c7c",
9 | },
10 | {
11 | "filename": "another_file.zip",
12 | "md5": "some_other_md5_checksum",
13 | },
14 | ],
15 | },
16 | {
17 | "name": "google_drive",
18 | "url": "https://drive.google.com/file/d/1_4Hm8RO_7Av1LzGtFnhx6cIN-zi-W40j/view?usp=sharing",
19 | "files": [
20 | {
21 | "filename": "mill.zip",
22 | "md5": "81d821fdef812183a7d38b6f83f7cefa",
23 | },
24 | {
25 | "filename": "another_file.zip",
26 | "md5": "some_other_md5_checksum",
27 | },
28 | ],
29 | },
30 | # Additional sources can be added here in the same format.
31 | ]
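 32 |
 33 | # Illustrative check (added; assumes `calculate_md5` from pyphm.datasets.utils,
 34 | # as used in notebooks/scratch/get_hash.ipynb):
 35 | #
 36 | #     from pathlib import Path
 37 | #     from pyphm.datasets.utils import calculate_md5
 38 | #
 39 | #     for source in resources:
 40 | #         for f in source["files"]:
 41 | #             local = Path("data/raw/milling") / f["filename"]
 42 | #             if local.exists():
 43 | #                 assert calculate_md5(local) == f["md5"]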
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | The MIT License (MIT)
3 | Copyright (c) 2022, Tim von Hahn
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
11 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = pyphm
3 | version = 0.0.5
4 | author = Tim von Hahn
5 | author_email = t.vonhahn@queensu.ca
6 | description = Machinery data, made easy
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/tvhahn/PyPHM
10 | project_urls =
11 | Bug Tracker = https://github.com/tvhahn/PyPHM/issues
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | License :: OSI Approved :: MIT License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 | package_dir =
19 | = src
20 | packages = find:
21 | python_requires = >=3.7
22 | include_package_data = True
23 |
24 | # Add here dependencies of your project (semicolon/line-separated)
25 | install_requires =
26 | pandas
27 | wheel
28 | scipy
29 | numpy
30 | py7zr
31 | rarfile
32 | tqdm
33 | requests
34 | versioned-hdf5
35 | h5py
36 | tables
37 | gdown
38 |
39 | [options.package_data]
40 | * = *.csv, *.mat
41 |
42 | [options.packages.find]
43 | where = src
44 |
45 |
46 |
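 47 | # Note (added for clarity): the [options.package_data] globs above, together with
 48 | # include_package_data = True, ship the CSV files in src/pyphm/datasets/auxilary_metadata
 49 | # with the installed package; airbus.py resolves its default ground-truth labels from
 50 | # there via pkg_resources.resource_filename.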
--------------------------------------------------------------------------------
/tests/integration/fixtures/ims/1st_test/2003.10.22.12.06.24:
--------------------------------------------------------------------------------
1 | -0.022 -0.039 -0.183 -0.054 -0.105 -0.134 -0.129 -0.142
2 | -0.105 -0.017 -0.164 -0.183 -0.049 0.029 -0.115 -0.122
3 | -0.183 -0.098 -0.195 -0.125 -0.005 -0.007 -0.171 -0.071
4 | -0.178 -0.161 -0.159 -0.178 -0.100 -0.115 -0.112 -0.078
5 | -0.208 -0.129 -0.261 -0.098 -0.151 -0.205 -0.063 -0.066
6 | -0.232 -0.061 -0.281 -0.125 0.046 -0.088 -0.078 -0.078
7 | -0.112 -0.132 -0.181 -0.186 -0.132 -0.051 -0.132 -0.076
8 | -0.054 -0.107 -0.173 -0.134 -0.164 0.002 -0.146 -0.125
9 | -0.159 -0.032 -0.161 -0.181 -0.110 -0.044 -0.173 -0.137
10 | -0.225 -0.044 -0.090 -0.159 -0.100 -0.151 -0.139 -0.076
11 | -0.093 -0.117 -0.039 -0.161 -0.132 -0.161 -0.090 -0.098
12 | -0.002 -0.161 -0.042 -0.054 -0.095 -0.232 -0.137 -0.042
13 | 0.000 -0.117 -0.081 -0.088 -0.142 -0.183 -0.117 -0.171
14 | -0.154 -0.142 -0.027 -0.093 -0.183 -0.251 -0.095 -0.083
15 | -0.129 -0.068 0.083 -0.071 -0.129 -0.117 -0.183 -0.071
16 | -0.015 -0.049 0.044 -0.088 -0.188 -0.081 -0.183 -0.020
17 | -0.015 -0.046 0.005 -0.061 -0.049 -0.098 -0.139 -0.085
18 | -0.090 -0.105 0.020 -0.012 -0.181 -0.186 -0.107 -0.037
19 | -0.088 -0.012 0.037 -0.093 -0.078 -0.105 -0.134 -0.039
20 | -0.127 -0.081 -0.051 -0.073 -0.100 -0.105 -0.115 -0.051
21 |
--------------------------------------------------------------------------------
/tests/integration/fixtures/ims/1st_test/2003.10.22.12.09.13:
--------------------------------------------------------------------------------
1 | -0.117 -0.076 -0.127 -0.144 -0.083 -0.002 -0.098 -0.051
2 | -0.132 -0.068 -0.117 -0.083 -0.132 -0.076 -0.117 -0.085
3 | -0.186 -0.120 -0.217 -0.212 -0.081 -0.112 -0.132 -0.054
4 | -0.098 -0.125 -0.117 -0.093 -0.022 -0.112 -0.090 -0.164
5 | -0.137 -0.120 -0.188 -0.142 -0.129 -0.046 -0.098 -0.129
6 | -0.103 -0.078 -0.127 -0.156 -0.110 -0.061 -0.061 -0.129
7 | -0.120 -0.046 -0.085 -0.056 -0.149 -0.042 -0.103 -0.039
8 | -0.110 -0.068 -0.076 -0.078 -0.168 -0.134 -0.146 -0.168
9 | -0.088 -0.110 -0.022 -0.044 -0.225 -0.083 -0.100 -0.044
10 | -0.120 -0.073 -0.034 -0.076 -0.217 -0.073 -0.107 -0.088
11 | -0.159 -0.129 0.034 -0.022 -0.090 -0.139 -0.107 -0.049
12 | -0.073 -0.090 -0.032 -0.044 -0.076 -0.132 -0.134 -0.049
13 | -0.105 -0.122 -0.073 0.015 -0.078 -0.107 -0.195 -0.027
14 | -0.139 -0.056 0.000 -0.154 -0.068 -0.146 -0.193 0.032
15 | -0.129 -0.095 -0.012 -0.078 0.034 -0.127 -0.110 0.046
16 | -0.134 -0.159 -0.139 -0.210 -0.112 -0.107 -0.112 -0.005
17 | -0.071 -0.129 -0.134 -0.024 -0.156 -0.042 -0.132 -0.049
18 | -0.183 -0.093 -0.090 -0.112 -0.054 -0.088 -0.127 -0.127
19 | -0.278 -0.010 -0.007 -0.007 0.066 -0.103 -0.078 -0.071
20 | -0.154 -0.046 -0.198 -0.129 -0.078 -0.046 -0.093 -0.051
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Machinery data, made easy
 4 | [arXiv:2205.15489](https://arxiv.org/abs/2205.15489)
5 |
6 |
7 | Datasets specific to PHM (prognostics and health management). Use Python to easily download and prepare the data, before feature engineering or model training.
8 |
9 | Current datasets:
10 | - **UC-Berkeley Milling Dataset**: [example notebook](https://github.com/tvhahn/PyPHM/blob/master/notebooks/milling_example.ipynb) ([open in Colab](https://colab.research.google.com/github/tvhahn/PyPHM/blob/master/notebooks/milling_example.ipynb)); [dataset source](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#milling)
11 | - **IMS Bearing Dataset**: [dataset source](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#bearing)
12 | - **Airbus Helicopter Accelerometer Dataset**: [dataset source](https://www.research-collection.ethz.ch/handle/20.500.11850/415151)
13 | - More coming soon!
14 |
15 |
16 | ## Alpha Notice
17 | PyPHM is in active development. Expect considerable changes in the near future.
18 |
19 | Our goals are to create:
20 |
21 | * A package that implements **common data preprocessing methods** used by others.
22 | * A package with a **coherent and thoughtful API**.
23 | * Thorough **documentation**, with plenty of **examples**.
24 | * A package that is well **tested**, with the use of **type hints**.
25 | * A package built with **continuous integration and continuous deployment**.
26 |
27 |
28 | ## Installation
29 | Install with pip: `pip install pyphm`
30 |
 31 | Install from the GitHub repository: clone with `git clone https://github.com/tvhahn/PyPHM.git`, then run `python -m pip install -e .` to install the package on your local machine.
32 |
33 | Run tests: `python -m unittest discover -s tests`
34 |
35 |
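 36 | ## Usage example
 37 | A minimal sketch of downloading and windowing the UC-Berkeley milling data, following `notebooks/scratch/milling_examp.ipynb` (the integration tests pass `window_len` rather than `window_size`, so check the signature of the version you have installed):
 38 |
 39 | ```python
 40 | from pathlib import Path
 41 | from pyphm.datasets.milling import MillingPrepMethodA
 42 |
 43 | # download the raw milling data (if needed) and window the cut signals
 44 | mill = MillingPrepMethodA(
 45 |     root=Path("data"),
 46 |     dataset_folder_name="milling",
 47 |     window_size=64,
 48 |     stride=64,
 49 |     cut_drop_list=[17, 94],
 50 |     download=True,
 51 | )
 52 |
 53 | x, y = mill.create_xy_arrays()   # numpy arrays of windowed signals and labels
 54 | df = mill.create_xy_dataframe()  # or a single labelled dataframe
 55 | ```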
--------------------------------------------------------------------------------
/tests/integration/test_integration_ims.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | from pathlib import Path
4 | import pandas as pd
5 | from pandas.testing import assert_frame_equal
6 | from pyphm.datasets.ims import ImsDataLoad
7 |
8 |
9 | class TestIms(unittest.TestCase):
10 |
11 | @classmethod
12 | def setUpClass(cls):
13 | pass
14 |
15 | def setUp(self):
 16 |         # path to the fixtures directory
17 | self.root = (
18 | Path(__file__).parent / "fixtures"
19 | )
20 |
21 | # path to ims_truncated_results.csv.gz
22 | self.results_path = (
23 | self.root / "ims/ims_truncated_results.csv.gz"
24 | )
25 |
26 | def tearDown(self):
27 | pass
28 |
 29 |     def test_ims_data_prep(self):
 30 |         """Test that the IMS data prep works as expected."""
31 |
32 | # load the data and instantiate the data prep class
33 | ims = ImsDataLoad(self.root, download=False)
34 |
35 | # create the results dataframe
36 | df = ims.load_run_as_df(1, n_jobs=1)
37 |
 38 |         # expected column names and dtypes of the results dataframe
39 | col_names_ordered = ["id", "run", "file", "time_step"] + ims.col_1st_names
40 |
41 | col_dtype = [
42 | str,
43 | int,
44 | str,
45 | np.float32,
46 | np.float32,
47 | np.float32,
48 | np.float32,
49 | np.float32,
50 | np.float32,
51 | np.float32,
52 | np.float32,
53 | np.float32,
54 | ]
55 |
56 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype))
57 |
58 | # load the ground truth results dataframe
59 | df_gt = pd.read_csv(
60 | self.results_path,
61 | compression="gzip",
62 | ).astype(col_dtype_dict)
63 |
64 | # compare the results
65 | assert_frame_equal(df, df_gt)
66 |
67 |
68 | if __name__ == "__main__":
69 |
70 | unittest.main()
71 |
--------------------------------------------------------------------------------
/notebooks/scratch/get_hash.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "The autoreload extension is already loaded. To reload it, use:\n",
13 | " %reload_ext autoreload\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import hashlib\n",
19 | "from pathlib import Path\n",
20 | "import pandas as pd\n",
21 | "from pyphm.datasets.utils import calculate_md5, check_md5\n",
22 | "\n",
23 | "\n",
24 | "%load_ext autoreload\n",
25 | "%autoreload 2"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 7,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "/home/tim/Documents/PyPHM\n",
38 | "/home/tim/Documents/PyPHM/data/raw/milling\n"
39 | ]
40 | }
41 | ],
42 | "source": [
43 | "root_dir = Path.cwd().parent.parent\n",
44 | "print(root_dir)\n",
45 | "path_data_raw_folder = Path(root_dir / 'data/raw/milling/' )\n",
46 | "print(path_data_raw_folder)"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 8,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "4da3afb0aa50cb3dcdd8e20ed1ed1c7c\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "hash_md5 = calculate_md5(path_data_raw_folder / \"3.+Milling.zip\")\n",
64 | "print(hash_md5)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": []
73 | }
74 | ],
75 | "metadata": {
76 | "interpreter": {
77 | "hash": "bb5c389ed065b0664b086eb1393fdb5729447cbf21b18fded646434c15c951b5"
78 | },
79 | "kernelspec": {
80 | "display_name": "Python 3.8.12 ('featstore')",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "3.11.7"
95 | },
96 | "orig_nbformat": 4
97 | },
98 | "nbformat": 4,
99 | "nbformat_minor": 2
100 | }
101 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # folders
132 | data/
133 |
--------------------------------------------------------------------------------
/tests/integration/test_integration_milling.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | from pathlib import Path
4 | import pandas as pd
5 | from pandas.testing import assert_frame_equal
6 | from pyphm.datasets.milling import MillingPrepMethodA
7 |
8 |
9 | class TestMilling(unittest.TestCase):
10 |
11 | @classmethod
12 | def setUpClass(cls):
13 | pass
14 |
15 |
16 | def setUp(self):
 17 |         # path to the fixtures directory (contains milling/mill.mat)
18 | self.root = (
19 | Path(__file__).parent / "fixtures"
20 | )
21 |
22 | # path to milling_labels_with_tool_class_truncated.csv
23 | self.labels_path = (
24 | self.root
25 | / "milling/milling_labels_with_tool_class_truncated.csv"
26 | )
27 |
28 | # path to milling_truncated_results.csv.gz
29 | self.results_path = (
30 | self.root / "milling/milling_truncated_results.csv.gz"
31 | )
32 |
33 | def test_load_run_as_df(self):
34 | """Test the loading of an individual run as a dataframe."""
35 |
36 | # load the data and instantiate the data prep class
37 | mill = MillingPrepMethodA(
38 | self.root,
39 | window_len=64,
40 | stride=64,
41 | cut_drop_list=[],
42 | path_csv_labels=self.labels_path,
43 | download=False,
44 | )
45 |
46 | # create the results dataframe
47 | df = mill.create_xy_dataframe()
48 |
 49 |         # expected column names and dtypes of the results dataframe
50 | col_names_ordered = [
51 | "cut_id",
52 | "cut_no",
53 | "case",
54 | "time",
55 | "ae_spindle",
56 | "ae_table",
57 | "vib_spindle",
58 | "vib_table",
59 | "smcdc",
60 | "smcac",
61 | "tool_class",
62 | ]
63 |
64 | col_dtype = [
65 | str,
66 | int,
67 | int,
68 | np.float32,
69 | np.float32,
70 | np.float32,
71 | np.float32,
72 | np.float32,
73 | np.float32,
74 | np.float32,
75 | int,
76 | ]
77 |
78 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype))
79 |
80 | # load the ground truth results dataframe
81 | df_gt = pd.read_csv(
82 | self.results_path,
83 | compression="gzip",
84 | ).astype(col_dtype_dict)
85 |
86 | # compare the results
87 | assert_frame_equal(df, df_gt)
88 |
89 |
90 | if __name__ == "__main__":
91 |
92 | unittest.main()
93 |
--------------------------------------------------------------------------------
/notebooks/images/prauc_params_cnc.svg:
--------------------------------------------------------------------------------
 1 | (SVG markup stripped during extraction)
--------------------------------------------------------------------------------
/notebooks/images/logo.svg:
--------------------------------------------------------------------------------
 1 | (SVG markup stripped during extraction)
--------------------------------------------------------------------------------
/notebooks/scratch/import_package_resources.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pkg_resources\n",
10 | "from pathlib import Path\n",
11 | "import pandas as pd\n"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 7,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "meta_data_path = Path(pkg_resources.resource_filename('pyphm', 'datasets/auxilary_metadata/'))"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 8,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/plain": [
31 | "WindowsPath('C:/Users/Tim/Anaconda3/envs/featstore/lib/site-packages/pyphm/datasets/auxilary_metadata')"
32 | ]
33 | },
34 | "execution_count": 8,
35 | "metadata": {},
36 | "output_type": "execute_result"
37 | }
38 | ],
39 | "source": [
40 | "meta_data_path"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 10,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
 50 |        "text/html": [
 51 |         "(HTML table rendering stripped during extraction; see the text/plain output below)"
156 | ],
157 | "text/plain": [
158 | " case run VB time DOC feed material cut_no tool_class \\\n",
159 | "0 1 1 0.00 2 1.5 0.5 1 0 0 \n",
160 | "1 1 2 NaN 4 1.5 0.5 1 1 0 \n",
161 | "2 1 3 NaN 6 1.5 0.5 1 2 0 \n",
162 | "3 1 4 0.11 7 1.5 0.5 1 3 0 \n",
163 | "4 1 5 NaN 11 1.5 0.5 1 4 0 \n",
164 | "\n",
165 | " window_start window_end \n",
166 | "0 2496 6976 \n",
167 | "1 2496 6976 \n",
168 | "2 2496 6976 \n",
169 | "3 2496 6976 \n",
170 | "4 2496 6976 "
171 | ]
172 | },
173 | "execution_count": 10,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "df = pd.read_csv(meta_data_path / 'milling_labels_with_tool_class.csv')\n",
180 | "df.head()"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": []
189 | }
190 | ],
191 | "metadata": {
192 | "interpreter": {
193 | "hash": "bb5c389ed065b0664b086eb1393fdb5729447cbf21b18fded646434c15c951b5"
194 | },
195 | "kernelspec": {
196 | "display_name": "Python 3.8.12 ('featstore')",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.8.12"
211 | },
212 | "orig_nbformat": 4
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 2
216 | }
217 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/auxilary_metadata/milling_labels_with_tool_class.csv:
--------------------------------------------------------------------------------
1 | case,run,VB,time,DOC,feed,material,cut_no,tool_class,window_start,window_end
2 | 1,1,0,2,1.5,0.5,1,0,0,2496,6976
3 | 1,2,,4,1.5,0.5,1,1,0,2496,6976
4 | 1,3,,6,1.5,0.5,1,2,0,2496,6976
5 | 1,4,0.11,7,1.5,0.5,1,3,0,2496,6976
6 | 1,5,,11,1.5,0.5,1,4,0,2496,6976
7 | 1,6,0.2,15,1.5,0.5,1,5,1,2496,6976
8 | 1,7,0.24,19,1.5,0.5,1,6,1,2496,6976
9 | 1,8,0.29,22,1.5,0.5,1,7,1,2496,6976
10 | 1,9,0.28,26,1.5,0.5,1,8,1,2496,6976
11 | 1,10,0.29,29,1.5,0.5,1,9,1,2496,6976
12 | 1,11,0.38,32,1.5,0.5,1,10,1,2496,6976
13 | 1,12,0.4,35,1.5,0.5,1,11,1,2496,6976
14 | 1,13,0.43,38,1.5,0.5,1,12,1,2496,6976
15 | 1,14,0.45,41,1.5,0.5,1,13,1,2496,6976
16 | 1,15,0.5,44,1.5,0.5,1,14,1,2496,6976
17 | 1,16,,46,1.5,0.5,1,15,1,2496,6976
18 | 1,17,0.44,48,1.5,0.5,1,16,1,2496,6976
19 | 2,1,0.08,3,0.75,0.5,1,17,0,64,128
20 | 2,2,0.14,9,0.75,0.5,1,18,0,2496,6976
21 | 2,3,0.14,12,0.75,0.5,1,19,0,2496,6976
22 | 2,4,0.14,15,0.75,0.5,1,20,0,2496,6976
23 | 2,5,0.15,22,0.75,0.5,1,21,0,4224,6976
24 | 2,6,,24,0.75,0.5,1,22,0,5056,6976
25 | 2,7,0.18,27,0.75,0.5,1,23,0,2496,6976
26 | 2,8,0.22,33,0.75,0.5,1,24,1,2496,6976
27 | 2,9,0.26,39,0.75,0.5,1,25,1,2496,6976
28 | 2,10,0.31,45,0.75,0.5,1,26,1,3520,8000
29 | 2,11,0.38,51,0.75,0.5,1,27,1,2496,6976
30 | 2,12,0.43,59,0.75,0.5,1,28,1,2496,6976
31 | 2,13,0.48,66,0.75,0.5,1,29,1,2496,6976
32 | 2,14,0.55,72,0.75,0.5,1,30,1,3520,8000
33 | 3,1,0,0,0.75,0.25,1,31,0,4480,8960
34 | 3,2,0.13,3,0.75,0.25,1,32,0,4480,8960
35 | 3,3,0.13,9,0.75,0.25,1,33,0,4480,8960
36 | 3,5,0.17,21,0.75,0.25,1,34,0,4480,8960
37 | 3,6,0.19,27,0.75,0.25,1,35,0,3520,8960
38 | 3,7,0.2,33,0.75,0.25,1,36,1,3520,8960
39 | 3,8,0.23,39,0.75,0.25,1,37,1,3520,8960
40 | 3,9,0.23,45,0.75,0.25,1,38,1,4480,8960
41 | 3,10,0.26,51,0.75,0.25,1,39,1,3520,8960
42 | 3,11,0.28,57,0.75,0.25,1,40,1,4160,8960
43 | 3,12,0.33,63,0.75,0.25,1,41,1,4160,8960
44 | 3,14,0.36,69,0.75,0.25,1,42,1,4480,8960
45 | 3,15,0.44,75,0.75,0.25,1,43,1,4480,8960
46 | 3,16,0.55,81,0.75,0.25,1,44,1,4480,8960
47 | 4,1,0.08,3,1.5,0.25,1,45,0,4480,8960
48 | 4,2,0.13,9,1.5,0.25,1,46,0,4480,8960
49 | 4,3,0.2,15,1.5,0.25,1,47,1,4160,8960
50 | 4,4,0.31,21,1.5,0.25,1,48,1,4160,8960
51 | 4,5,0.35,27,1.5,0.25,1,49,1,4160,8960
52 | 4,6,0.4,34,1.5,0.25,1,50,1,4160,8960
53 | 4,7,0.49,39,1.5,0.25,1,51,1,4160,8960
54 | 9,1,0,1,1.5,0.5,1,52,0,2112,6720
55 | 9,2,0.1,3,1.5,0.5,1,53,0,2112,6720
56 | 9,3,0.14,9,1.5,0.5,1,54,0,2112,6464
57 | 9,4,0.19,16,1.5,0.5,1,55,0,2496,6720
58 | 9,5,0.27,22,1.5,0.5,1,56,1,2496,6720
59 | 9,6,0.38,28,1.5,0.5,1,57,1,2496,6720
60 | 9,7,0.47,34,1.5,0.5,1,58,1,2496,6720
61 | 9,8,0.64,40,1.5,0.5,1,59,1,2496,6720
62 | 9,9,0.81,46,1.5,0.5,1,60,2,2112,6464
63 | 10,1,0,0,1.5,0.25,1,61,0,4480,8960
64 | 10,2,0.04,4,1.5,0.25,1,62,0,4480,8960
65 | 10,3,0.08,9,1.5,0.25,1,63,0,4480,8960
66 | 10,4,0.16,15,1.5,0.25,1,64,0,4160,8960
67 | 10,5,0.25,21,1.5,0.25,1,65,1,4160,8960
68 | 10,6,0.36,27,1.5,0.25,1,66,1,4160,8960
69 | 10,7,0.43,33,1.5,0.25,1,67,1,4160,8960
70 | 10,8,0.47,39,1.5,0.25,1,68,1,4160,8960
71 | 10,9,0.53,45,1.5,0.25,1,69,1,4160,8960
72 | 10,10,0.7,57,1.5,0.25,1,70,2,5056,8960
73 | 11,1,0,1,0.75,0.25,1,71,0,4160,8960
74 | 11,2,0.04,3,0.75,0.25,1,72,0,4160,8960
75 | 11,3,0.07,10,0.75,0.25,1,73,0,4160,8960
76 | 11,4,0.07,12,0.75,0.25,1,74,0,4160,8960
77 | 11,5,0.08,14,0.75,0.25,1,75,0,4160,8960
78 | 11,6,0.09,17,0.75,0.25,1,76,0,4160,8960
79 | 11,7,,19,0.75,0.25,1,77,0,4160,8960
80 | 11,8,0.12,21,0.75,0.25,1,78,0,4160,8960
81 | 11,9,0.16,27,0.75,0.25,1,79,0,4160,8960
82 | 11,10,0.18,33,0.75,0.25,1,80,0,4160,8960
83 | 11,11,0.2,39,0.75,0.25,1,81,1,4160,8960
84 | 11,12,0.23,45,0.75,0.25,1,82,1,4160,8960
85 | 11,13,0.26,51,0.75,0.25,1,83,1,4160,8960
86 | 11,14,,54,0.75,0.25,1,84,1,4160,8960
87 | 11,15,0.31,57,0.75,0.25,1,85,1,4160,8960
88 | 11,16,0.37,63,0.75,0.25,1,86,1,4160,8960
89 | 11,17,,67,0.75,0.25,1,87,1,4160,8960
90 | 11,18,0.42,72,0.75,0.25,1,88,1,4160,8960
91 | 11,19,0.47,80,0.75,0.25,1,89,1,4160,8960
92 | 11,20,0.57,86,0.75,0.25,1,90,1,4160,8960
93 | 11,21,0.65,93,0.75,0.25,1,91,1,4160,8960
94 | 11,22,0.68,100,0.75,0.25,1,92,1,4160,8960
95 | 11,23,0.76,105,0.75,0.25,1,93,2,4160,8960
96 | 12,1,,1,0.75,0.5,1,94,0,64,128
97 | 12,2,0.05,3,0.75,0.5,1,95,0,2496,6720
98 | 12,3,0.08,6,0.75,0.5,1,96,0,2496,6464
99 | 12,4,,12,0.75,0.5,1,97,0,2496,6464
100 | 12,5,0.12,19,0.75,0.5,1,98,0,3008,6464
101 | 12,6,0.17,24,0.75,0.5,1,99,0,2496,6720
102 | 12,7,0.2,30,0.75,0.5,1,100,1,2496,6720
103 | 12,8,0.24,36,0.75,0.5,1,101,1,2496,6720
104 | 12,9,0.32,42,0.75,0.5,1,102,1,2496,6720
105 | 12,10,,45,0.75,0.5,1,103,1,2496,6720
106 | 12,11,0.4,49,0.75,0.5,1,104,1,2496,6464
107 | 12,12,0.45,55,0.75,0.5,1,105,1,2496,3904
108 | 12,13,0.49,61,0.75,0.5,1,106,1,2496,6720
109 | 12,14,0.58,67,0.75,0.5,1,107,1,2496,6720
110 | 12,15,0.65,74,0.75,0.5,1,108,1,2496,6464
111 | 5,1,0,0,1.5,0.5,2,109,0,2496,6464
112 | 5,2,0.16,3,1.5,0.5,2,110,0,2496,6720
113 | 5,3,0.29,6,1.5,0.5,2,111,1,2496,6976
114 | 5,4,0.44,9,1.5,0.5,2,112,1,2496,6976
115 | 5,5,0.53,12,1.5,0.5,2,113,1,2496,6976
116 | 5,6,0.74,15,1.5,0.5,2,114,2,2496,6720
117 | 6,1,0,0,1.5,0.25,2,115,0,4160,8960
118 | 7,1,0,1,0.75,0.25,2,116,0,4160,8960
119 | 7,2,0.09,3,0.75,0.25,2,117,0,4160,8960
120 | 7,3,0.13,6,0.75,0.25,2,118,0,4160,8960
121 | 7,4,0.22,10,0.75,0.25,2,119,1,4160,8960
122 | 7,5,0.24,13,0.75,0.25,2,120,1,4480,8960
123 | 7,6,0.34,15,0.75,0.25,2,121,1,4160,8960
124 | 7,7,0.46,19,0.75,0.25,2,122,1,4160,8960
125 | 7,8,,21,0.75,0.25,2,123,1,4480,8960
126 | 8,1,0,0,0.75,0.5,2,124,0,2496,6720
127 | 8,2,0.18,3,0.75,0.5,2,125,0,2496,6720
128 | 8,3,0.3,6,0.75,0.5,2,126,1,2496,6720
129 | 8,4,,8,0.75,0.5,2,127,1,2496,6720
130 | 8,5,0.44,9,0.75,0.5,2,128,1,2496,6720
131 | 8,6,0.62,12,0.75,0.5,2,129,1,2496,6720
132 | 13,1,,1,0.75,0.25,2,130,0,4480,8960
133 | 13,2,,2,0.75,0.25,2,131,0,4480,8960
134 | 13,3,0.1,4,0.75,0.25,2,132,0,4480,8960
135 | 13,4,0.13,7,0.75,0.25,2,133,0,4480,8960
136 | 13,5,0.17,11,0.75,0.25,2,134,0,4160,8960
137 | 13,6,0.32,16,0.75,0.25,2,135,1,4160,8960
138 | 13,7,0.38,19,0.75,0.25,2,136,1,4160,8960
139 | 13,8,0.49,22,0.75,0.25,2,137,1,4160,8960
140 | 13,9,0.56,25,0.75,0.25,2,138,1,4160,8960
141 | 13,10,0.68,29,0.75,0.25,2,139,1,4160,8960
142 | 13,11,0.83,32,0.75,0.25,2,140,2,4160,8960
143 | 13,12,0.92,35,0.75,0.25,2,141,2,4160,8960
144 | 13,13,1.07,38,0.75,0.25,2,142,2,4160,8960
145 | 13,14,1.3,42,0.75,0.25,2,143,2,4160,8960
146 | 13,15,1.53,45,0.75,0.25,2,144,2,4160,8960
147 | 14,1,,1,0.75,0.5,2,145,0,2496,6976
148 | 14,2,0.09,3,0.75,0.5,2,146,0,2496,6464
149 | 14,3,0.17,6,0.75,0.5,2,147,0,2496,6720
150 | 14,4,0.24,9,0.75,0.5,2,148,1,2496,6720
151 | 14,5,,11,0.75,0.5,2,149,1,2496,6720
152 | 14,6,0.35,12,0.75,0.5,2,150,1,2496,6464
153 | 14,8,0.6,18,0.75,0.5,2,151,1,2496,6464
154 | 14,9,0.81,21,0.75,0.5,2,152,2,2496,6464
155 | 14,10,1.14,24,0.75,0.5,2,153,2,2496,6464
156 | 15,1,,1,1.5,0.25,2,154,0,4160,8960
157 | 15,2,0.15,3,1.5,0.25,2,155,0,4160,8960
158 | 15,3,0.28,6,1.5,0.25,2,156,1,4160,8960
159 | 15,4,0.37,9,1.5,0.25,2,157,1,4160,8960
160 | 15,5,0.48,13,1.5,0.25,2,158,1,4160,8960
161 | 15,6,0.56,16,1.5,0.25,2,159,1,4480,8960
162 | 15,7,0.7,19,1.5,0.25,2,160,2,4160,8960
163 | 16,1,,1,1.5,0.5,2,161,0,2496,6720
164 | 16,2,,2,1.5,0.5,2,162,0,2496,6720
165 | 16,3,0.24,3,1.5,0.5,2,163,1,2496,6720
166 | 16,4,,4,1.5,0.5,2,164,1,2496,6720
167 | 16,5,0.4,6,1.5,0.5,2,165,1,2496,6720
168 | 16,6,0.62,9,1.5,0.5,2,166,1,2496,6720
169 |
--------------------------------------------------------------------------------
/notebooks/scratch/milling_examp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyphm.datasets.milling import MillingDataLoad, MillingPrepMethodA\n",
10 | "import pandas as pd\n",
11 | "from pathlib import Path\n",
12 | "\n",
13 | "%load_ext autoreload\n",
14 | "%autoreload 2"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "name": "stdout",
24 | "output_type": "stream",
25 | "text": [
26 | "root_dir: /home/tim/Documents/PyPHM\n",
27 | "path_data_raw_folder: /home/tim/Documents/PyPHM/data\n"
28 | ]
29 | }
30 | ],
31 | "source": [
32 | "root_dir = Path.cwd().parent\n",
33 | "print('root_dir: ', root_dir)\n",
34 | "path_data_raw_folder = Path(root_dir / 'data' )\n",
35 | "print('path_data_raw_folder: ', path_data_raw_folder)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "mill = MillingPrepMethodA(root=path_data_raw_folder, dataset_folder_name='milling', window_size=64, stride=64, cut_drop_list=[17, 94], download=False)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 4,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/plain": [
55 | "PosixPath('/home/tim/Documents/PyPHM/data/milling')"
56 | ]
57 | },
58 | "execution_count": 4,
59 | "metadata": {},
60 | "output_type": "execute_result"
61 | }
62 | ],
63 | "source": [
64 | "mill.dataset_folder_path"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "x.shape (11570, 64, 6)\n",
77 | "y.shape (11570, 64, 3)\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "x, y = mill.create_xy_arrays()\n",
83 | "print(\"x.shape\", x.shape)\n",
84 | "print(\"y.shape\", y.shape)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 9,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/html": [
 95 |        "(HTML table rendering stripped during extraction; see the text/plain output below)"
200 | ],
201 | "text/plain": [
202 | " cut_id cut_no case time ae_spindle ae_table vib_spindle vib_table \\\n",
203 | "0 0_0 0 1 0.000 0.219727 0.272827 0.733643 2.116699 \n",
204 | "1 0_0 0 1 0.004 0.246582 0.322266 0.778809 2.277832 \n",
205 | "2 0_0 0 1 0.008 0.294189 0.283813 0.758057 2.343750 \n",
206 | "3 0_0 0 1 0.012 0.323486 0.260010 0.726318 2.448730 \n",
207 | "4 0_0 0 1 0.016 0.290527 0.253296 0.653076 2.546387 \n",
208 | "\n",
209 | " smcdc smcac tool_class \n",
210 | "0 6.840820 0.124512 0 \n",
211 | "1 6.660156 -0.561523 0 \n",
212 | "2 6.508789 -2.099609 0 \n",
213 | "3 6.542969 -2.731934 0 \n",
214 | "4 6.621094 -3.505859 0 "
215 | ]
216 | },
217 | "execution_count": 9,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "df = mill.create_xy_dataframe()\n",
224 | "df.head()"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": []
233 | }
234 | ],
235 | "metadata": {
236 | "interpreter": {
237 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16"
238 | },
239 | "kernelspec": {
240 | "display_name": "Python 3.8.12 64-bit ('featstore': conda)",
241 | "language": "python",
242 | "name": "python3"
243 | },
244 | "language_info": {
245 | "codemirror_mode": {
246 | "name": "ipython",
247 | "version": 3
248 | },
249 | "file_extension": ".py",
250 | "mimetype": "text/x-python",
251 | "name": "python",
252 | "nbconvert_exporter": "python",
253 | "pygments_lexer": "ipython3",
254 | "version": "3.8.12"
255 | },
256 | "orig_nbformat": 4
257 | },
258 | "nbformat": 4,
259 | "nbformat_minor": 2
260 | }
261 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/auxilary_metadata/airbus_dfvalid_groundtruth.csv:
--------------------------------------------------------------------------------
1 | seqID,anomaly
2 | 0,0.0
3 | 1,1.0
4 | 2,0.0
5 | 3,0.0
6 | 4,1.0
7 | 5,1.0
8 | 6,0.0
9 | 7,0.0
10 | 8,1.0
11 | 9,0.0
12 | 10,0.0
13 | 11,0.0
14 | 12,1.0
15 | 13,1.0
16 | 14,1.0
17 | 15,0.0
18 | 16,1.0
19 | 17,1.0
20 | 18,0.0
21 | 19,0.0
22 | 20,0.0
23 | 21,1.0
24 | 22,0.0
25 | 23,0.0
26 | 24,1.0
27 | 25,1.0
28 | 26,0.0
29 | 27,1.0
30 | 28,1.0
31 | 29,0.0
32 | 30,0.0
33 | 31,0.0
34 | 32,1.0
35 | 33,0.0
36 | 34,0.0
37 | 35,0.0
38 | 36,1.0
39 | 37,0.0
40 | 38,0.0
41 | 39,0.0
42 | 40,0.0
43 | 41,0.0
44 | 42,1.0
45 | 43,1.0
46 | 44,1.0
47 | 45,0.0
48 | 46,0.0
49 | 47,0.0
50 | 48,0.0
51 | 49,1.0
52 | 50,0.0
53 | 51,1.0
54 | 52,0.0
55 | 53,1.0
56 | 54,1.0
57 | 55,1.0
58 | 56,0.0
59 | 57,1.0
60 | 58,1.0
61 | 59,1.0
62 | 60,0.0
63 | 61,1.0
64 | 62,0.0
65 | 63,1.0
66 | 64,0.0
67 | 65,0.0
68 | 66,1.0
69 | 67,1.0
70 | 68,0.0
71 | 69,1.0
72 | 70,0.0
73 | 71,0.0
74 | 72,0.0
75 | 73,1.0
76 | 74,1.0
77 | 75,1.0
78 | 76,1.0
79 | 77,1.0
80 | 78,1.0
81 | 79,1.0
82 | 80,1.0
83 | 81,1.0
84 | 82,1.0
85 | 83,0.0
86 | 84,0.0
87 | 85,1.0
88 | 86,0.0
89 | 87,1.0
90 | 88,1.0
91 | 89,0.0
92 | 90,1.0
93 | 91,0.0
94 | 92,0.0
95 | 93,0.0
96 | 94,0.0
97 | 95,0.0
98 | 96,0.0
99 | 97,1.0
100 | 98,1.0
101 | 99,1.0
102 | 100,1.0
103 | 101,1.0
104 | 102,1.0
105 | 103,0.0
106 | 104,1.0
107 | 105,0.0
108 | 106,0.0
109 | 107,0.0
110 | 108,0.0
111 | 109,0.0
112 | 110,0.0
113 | 111,1.0
114 | 112,1.0
115 | 113,0.0
116 | 114,0.0
117 | 115,1.0
118 | 116,1.0
119 | 117,0.0
120 | 118,1.0
121 | 119,0.0
122 | 120,1.0
123 | 121,0.0
124 | 122,1.0
125 | 123,1.0
126 | 124,0.0
127 | 125,1.0
128 | 126,1.0
129 | 127,0.0
130 | 128,1.0
131 | 129,0.0
132 | 130,0.0
133 | 131,1.0
134 | 132,1.0
135 | 133,0.0
136 | 134,1.0
137 | 135,0.0
138 | 136,0.0
139 | 137,0.0
140 | 138,0.0
141 | 139,0.0
142 | 140,0.0
143 | 141,1.0
144 | 142,1.0
145 | 143,1.0
146 | 144,1.0
147 | 145,1.0
148 | 146,1.0
149 | 147,1.0
150 | 148,0.0
151 | 149,0.0
152 | 150,0.0
153 | 151,0.0
154 | 152,0.0
155 | 153,0.0
156 | 154,1.0
157 | 155,0.0
158 | 156,0.0
159 | 157,0.0
160 | 158,0.0
161 | 159,0.0
162 | 160,0.0
163 | 161,1.0
164 | 162,1.0
165 | 163,1.0
166 | 164,0.0
167 | 165,1.0
168 | 166,1.0
169 | 167,1.0
170 | 168,0.0
171 | 169,1.0
172 | 170,1.0
173 | 171,1.0
174 | 172,0.0
175 | 173,1.0
176 | 174,0.0
177 | 175,1.0
178 | 176,0.0
179 | 177,0.0
180 | 178,1.0
181 | 179,1.0
182 | 180,0.0
183 | 181,0.0
184 | 182,0.0
185 | 183,1.0
186 | 184,1.0
187 | 185,0.0
188 | 186,0.0
189 | 187,0.0
190 | 188,0.0
191 | 189,1.0
192 | 190,1.0
193 | 191,1.0
194 | 192,1.0
195 | 193,0.0
196 | 194,0.0
197 | 195,0.0
198 | 196,0.0
199 | 197,1.0
200 | 198,1.0
201 | 199,1.0
202 | 200,0.0
203 | 201,0.0
204 | 202,0.0
205 | 203,0.0
206 | 204,0.0
207 | 205,0.0
208 | 206,1.0
209 | 207,1.0
210 | 208,1.0
211 | 209,1.0
212 | 210,1.0
213 | 211,1.0
214 | 212,0.0
215 | 213,1.0
216 | 214,0.0
217 | 215,0.0
218 | 216,0.0
219 | 217,1.0
220 | 218,0.0
221 | 219,0.0
222 | 220,0.0
223 | 221,1.0
224 | 222,1.0
225 | 223,0.0
226 | 224,0.0
227 | 225,0.0
228 | 226,0.0
229 | 227,0.0
230 | 228,0.0
231 | 229,1.0
232 | 230,0.0
233 | 231,0.0
234 | 232,1.0
235 | 233,0.0
236 | 234,0.0
237 | 235,0.0
238 | 236,0.0
239 | 237,1.0
240 | 238,1.0
241 | 239,1.0
242 | 240,1.0
243 | 241,0.0
244 | 242,1.0
245 | 243,1.0
246 | 244,1.0
247 | 245,0.0
248 | 246,0.0
249 | 247,1.0
250 | 248,0.0
251 | 249,1.0
252 | 250,0.0
253 | 251,0.0
254 | 252,1.0
255 | 253,0.0
256 | 254,0.0
257 | 255,1.0
258 | 256,0.0
259 | 257,0.0
260 | 258,1.0
261 | 259,1.0
262 | 260,0.0
263 | 261,0.0
264 | 262,1.0
265 | 263,0.0
266 | 264,1.0
267 | 265,0.0
268 | 266,1.0
269 | 267,0.0
270 | 268,0.0
271 | 269,1.0
272 | 270,0.0
273 | 271,0.0
274 | 272,1.0
275 | 273,0.0
276 | 274,1.0
277 | 275,0.0
278 | 276,0.0
279 | 277,0.0
280 | 278,0.0
281 | 279,1.0
282 | 280,1.0
283 | 281,0.0
284 | 282,1.0
285 | 283,1.0
286 | 284,1.0
287 | 285,1.0
288 | 286,1.0
289 | 287,1.0
290 | 288,0.0
291 | 289,0.0
292 | 290,0.0
293 | 291,0.0
294 | 292,0.0
295 | 293,0.0
296 | 294,1.0
297 | 295,0.0
298 | 296,0.0
299 | 297,0.0
300 | 298,1.0
301 | 299,0.0
302 | 300,1.0
303 | 301,1.0
304 | 302,1.0
305 | 303,1.0
306 | 304,1.0
307 | 305,1.0
308 | 306,0.0
309 | 307,1.0
310 | 308,0.0
311 | 309,1.0
312 | 310,1.0
313 | 311,0.0
314 | 312,0.0
315 | 313,0.0
316 | 314,1.0
317 | 315,1.0
318 | 316,0.0
319 | 317,0.0
320 | 318,0.0
321 | 319,1.0
322 | 320,1.0
323 | 321,0.0
324 | 322,0.0
325 | 323,1.0
326 | 324,0.0
327 | 325,0.0
328 | 326,0.0
329 | 327,0.0
330 | 328,0.0
331 | 329,1.0
332 | 330,0.0
333 | 331,0.0
334 | 332,1.0
335 | 333,0.0
336 | 334,1.0
337 | 335,0.0
338 | 336,0.0
339 | 337,0.0
340 | 338,1.0
341 | 339,1.0
342 | 340,0.0
343 | 341,0.0
344 | 342,0.0
345 | 343,1.0
346 | 344,0.0
347 | 345,1.0
348 | 346,0.0
349 | 347,1.0
350 | 348,0.0
351 | 349,1.0
352 | 350,0.0
353 | 351,1.0
354 | 352,1.0
355 | 353,1.0
356 | 354,1.0
357 | 355,1.0
358 | 356,0.0
359 | 357,0.0
360 | 358,1.0
361 | 359,1.0
362 | 360,0.0
363 | 361,0.0
364 | 362,1.0
365 | 363,0.0
366 | 364,0.0
367 | 365,0.0
368 | 366,1.0
369 | 367,0.0
370 | 368,1.0
371 | 369,1.0
372 | 370,1.0
373 | 371,1.0
374 | 372,0.0
375 | 373,0.0
376 | 374,0.0
377 | 375,1.0
378 | 376,1.0
379 | 377,1.0
380 | 378,0.0
381 | 379,0.0
382 | 380,1.0
383 | 381,1.0
384 | 382,0.0
385 | 383,0.0
386 | 384,1.0
387 | 385,0.0
388 | 386,0.0
389 | 387,0.0
390 | 388,0.0
391 | 389,1.0
392 | 390,0.0
393 | 391,0.0
394 | 392,1.0
395 | 393,0.0
396 | 394,1.0
397 | 395,1.0
398 | 396,1.0
399 | 397,0.0
400 | 398,0.0
401 | 399,0.0
402 | 400,1.0
403 | 401,1.0
404 | 402,1.0
405 | 403,1.0
406 | 404,1.0
407 | 405,0.0
408 | 406,1.0
409 | 407,1.0
410 | 408,1.0
411 | 409,0.0
412 | 410,1.0
413 | 411,1.0
414 | 412,0.0
415 | 413,0.0
416 | 414,1.0
417 | 415,0.0
418 | 416,0.0
419 | 417,1.0
420 | 418,0.0
421 | 419,0.0
422 | 420,1.0
423 | 421,0.0
424 | 422,0.0
425 | 423,1.0
426 | 424,1.0
427 | 425,0.0
428 | 426,0.0
429 | 427,1.0
430 | 428,0.0
431 | 429,0.0
432 | 430,0.0
433 | 431,1.0
434 | 432,0.0
435 | 433,1.0
436 | 434,0.0
437 | 435,0.0
438 | 436,1.0
439 | 437,0.0
440 | 438,1.0
441 | 439,1.0
442 | 440,1.0
443 | 441,1.0
444 | 442,1.0
445 | 443,0.0
446 | 444,1.0
447 | 445,1.0
448 | 446,1.0
449 | 447,1.0
450 | 448,0.0
451 | 449,0.0
452 | 450,0.0
453 | 451,1.0
454 | 452,1.0
455 | 453,1.0
456 | 454,1.0
457 | 455,1.0
458 | 456,0.0
459 | 457,1.0
460 | 458,1.0
461 | 459,0.0
462 | 460,0.0
463 | 461,1.0
464 | 462,1.0
465 | 463,1.0
466 | 464,0.0
467 | 465,1.0
468 | 466,1.0
469 | 467,0.0
470 | 468,0.0
471 | 469,1.0
472 | 470,1.0
473 | 471,0.0
474 | 472,1.0
475 | 473,0.0
476 | 474,1.0
477 | 475,1.0
478 | 476,0.0
479 | 477,0.0
480 | 478,1.0
481 | 479,1.0
482 | 480,1.0
483 | 481,0.0
484 | 482,0.0
485 | 483,1.0
486 | 484,1.0
487 | 485,0.0
488 | 486,1.0
489 | 487,1.0
490 | 488,1.0
491 | 489,0.0
492 | 490,1.0
493 | 491,0.0
494 | 492,1.0
495 | 493,1.0
496 | 494,1.0
497 | 495,0.0
498 | 496,0.0
499 | 497,1.0
500 | 498,1.0
501 | 499,0.0
502 | 500,0.0
503 | 501,0.0
504 | 502,1.0
505 | 503,1.0
506 | 504,1.0
507 | 505,1.0
508 | 506,1.0
509 | 507,0.0
510 | 508,0.0
511 | 509,1.0
512 | 510,1.0
513 | 511,0.0
514 | 512,1.0
515 | 513,1.0
516 | 514,0.0
517 | 515,0.0
518 | 516,1.0
519 | 517,1.0
520 | 518,1.0
521 | 519,0.0
522 | 520,0.0
523 | 521,0.0
524 | 522,0.0
525 | 523,0.0
526 | 524,0.0
527 | 525,1.0
528 | 526,1.0
529 | 527,1.0
530 | 528,0.0
531 | 529,0.0
532 | 530,1.0
533 | 531,0.0
534 | 532,0.0
535 | 533,0.0
536 | 534,1.0
537 | 535,1.0
538 | 536,1.0
539 | 537,1.0
540 | 538,1.0
541 | 539,0.0
542 | 540,1.0
543 | 541,1.0
544 | 542,1.0
545 | 543,1.0
546 | 544,0.0
547 | 545,1.0
548 | 546,1.0
549 | 547,1.0
550 | 548,0.0
551 | 549,1.0
552 | 550,0.0
553 | 551,1.0
554 | 552,0.0
555 | 553,1.0
556 | 554,1.0
557 | 555,1.0
558 | 556,0.0
559 | 557,1.0
560 | 558,0.0
561 | 559,0.0
562 | 560,1.0
563 | 561,0.0
564 | 562,0.0
565 | 563,1.0
566 | 564,0.0
567 | 565,0.0
568 | 566,0.0
569 | 567,1.0
570 | 568,1.0
571 | 569,1.0
572 | 570,0.0
573 | 571,1.0
574 | 572,0.0
575 | 573,0.0
576 | 574,0.0
577 | 575,0.0
578 | 576,1.0
579 | 577,1.0
580 | 578,1.0
581 | 579,1.0
582 | 580,1.0
583 | 581,1.0
584 | 582,1.0
585 | 583,1.0
586 | 584,0.0
587 | 585,1.0
588 | 586,0.0
589 | 587,1.0
590 | 588,0.0
591 | 589,1.0
592 | 590,1.0
593 | 591,0.0
594 | 592,0.0
595 | 593,1.0
596 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/airbus.py:
--------------------------------------------------------------------------------
1 | import scipy.io as sio
2 | import numpy as np
3 | import pandas as pd
4 | from pathlib import Path
5 | from .pyphm import PHMDataset
6 | from typing import Any, Callable, List, Optional, Tuple
7 | import pkg_resources
8 | from .utils import (
9 | download_and_extract_archive,
10 | extract_archive,
11 | check_integrity,
12 | download_url,
13 | )
14 | import os
15 | from urllib.error import URLError
16 |
17 | """
18 | Contains the data prep class for the Airbus Helicopter Accelerometer Dataset.
19 |
20 | Also contains helper functions associated with the dataset.
21 | """
22 |
23 |
24 | ###############################################################################
25 | # Data Prep Classes
26 | ###############################################################################
27 | class AirbusDataLoad(PHMDataset):
28 | """
 29 |     Load the Airbus Helicopter Accelerometer Dataset from its .h5 files, downloading the data if necessary.
30 |
31 | Args:
32 | root (string): Root directory to place all the data sets.
33 |
34 | dataset_folder_name (string): Name of folder containing raw data.
35 | This folder will be created in the root directory if not present.
36 |
37 | download (bool): If True, the data will be downloaded from ETH Zurich.
38 |
39 | """
40 |
41 | mirrors = [
42 | "https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/",
43 | ]
44 |
45 | resources = [
46 | ("dftrain.h5", None),
47 | ("dfvalid.h5", None),
48 | ("dfvalid_groundtruth.csv", None),
49 | ]
50 |
51 | def __init__(
52 | self,
53 | root: Path,
54 | dataset_folder_name: str = "airbus",
55 | download: bool = False,
56 | path_df_labels: Path = None,
57 | ) -> None:
58 | super().__init__(root, dataset_folder_name)
59 |
60 | self.dataset_folder_path = self.root / self.dataset_folder_name
61 |
62 | if path_df_labels is not None:
63 | self.path_df_labels = path_df_labels
64 | else:
65 |             # default to the labels csv bundled with the pyphm package resources
66 | self.path_df_labels = Path(pkg_resources.resource_filename('pyphm', 'datasets/auxilary_metadata/airbus_dfvalid_groundtruth.csv'))
67 |
68 | if download:
69 | self.download()
70 |
71 | def _check_exists(self) -> bool:
72 | return all(
73 | check_integrity(self.dataset_folder_path / file_name)
74 | for file_name, _ in self.resources
75 | )
76 |
77 | def download(self) -> None:
78 | """Download the Airbus Helicopter Accelerometer Dataset if it doesn't exist already."""
79 |
80 | if self._check_exists():
81 | return
82 |
83 |         # create the dataset folder if it does not exist
84 | self.dataset_folder_path.mkdir(parents=True, exist_ok=True)
85 |
86 | # download files
87 | for filename, md5 in self.resources:
88 | for mirror in self.mirrors:
89 | url = f"{mirror}{filename}"
90 | try:
91 | print(f"Downloading {url}")
92 |
93 | download_url(url, self.dataset_folder_path, filename, md5)
94 |
95 | except URLError as error:
96 | print(f"Failed to download (trying next):\n{error}")
97 | continue
98 | finally:
99 | print()
100 | break
101 | else:
102 | raise RuntimeError(f"Error downloading {filename}")
103 |
104 | def load_df(
105 | self,
106 | train_or_val: str = "train",
107 |     ) -> pd.DataFrame:
108 | """Load the h5 file as df."""
109 |
110 | if train_or_val == "train":
111 | df = pd.read_hdf(self.dataset_folder_path / "dftrain.h5", "dftrain")
112 |
113 | # add y column of all zeros (indicating no anomaly)
114 | df["y"] = 0
115 |
116 | else: # val dataset
117 | df = pd.read_hdf(self.dataset_folder_path / "dfvalid.h5", "dfvalid")
118 |
119 | # load the dfvalid_groundtruth.csv as dataframe
120 | df_labels = pd.read_csv(
121 | self.path_df_labels,
122 | dtype={"seqID": int, "anomaly": int},
123 | )
124 |
125 | # append the anomaly label to the df_val dataframe
126 | df = df.merge(df_labels, left_index=True, right_on="seqID")
127 |
128 | # drop the seqID column and rename the anomaly column to y
129 | df = df.drop("seqID", axis=1).rename(columns={"anomaly": "y"})
130 |
131 | return df
132 |
133 |
134 | class AirbusPrepMethodA(AirbusDataLoad):
135 | """
136 |     Class used to prepare the Airbus Helicopter Accelerometer Dataset before feature engineering or machine learning.
137 | Method is described in the paper:
138 |
139 | `Temporal signals to images: Monitoring the condition of industrial assets with deep learning image processing algorithms`
140 | by Garcia et al., 2021 - https://arxiv.org/abs/2005.07031
141 |
142 | Args:
143 | root (string): Root directory to place all the data sets. (likely the raw data folder)
144 |
145 | dataset_folder_name (string): Name of folder (within root) containing raw data.
146 | This folder will be created in the root directory if not present.
147 |
148 | download (bool): If True, the data will be downloaded from the ETH Zurich website.
149 |
150 | path_df_labels (Path, optional): Path to the csv with the labels. If not provided, it
151 | will default to airbus_dfvalid_groundtruth.csv in the auxilary_metadata folder.
152 |
153 | window_size (int): Size of the window to be used for the sliding window.
154 |
155 | stride (int): Size of the stride to be used for the sliding window.
156 |
157 | """
158 |
159 | def __init__(
160 | self,
161 | root: Path,
162 | dataset_folder_name: str = "airbus",
163 | download: bool = False,
164 | path_df_labels: Path = None,
165 | window_size: int = 64,
166 | stride: int = 64,
167 | ) -> None:
168 | super().__init__(root, dataset_folder_name, download, path_df_labels)
169 |
170 | self.window_size = window_size # size of the window
171 | self.stride = stride # stride between windows
172 |
173 | def create_xy_arrays(self, train_or_val: str = "train"):
174 | """Create the x and y arrays used in deep learning.
175 |
176 | Parameters
177 | ----------
178 | train_or_val : str
179 | Either 'train' or 'val' to indicate which dataset to use. Default is 'train'.
180 |
181 | Returns
182 | -------
183 | x : ndarray
184 | Array of the signals (samples). Shape: (n_samples, n_windows, window_size)
185 |
186 | y : ndarray
187 | Array of the labels/meta-data for each signals. Shape: (n_samples, n_windows, window_size, label_columns)
188 | The label_columns (in order) are:
189 | time_increments (int) -- the index of each time increment in the window. e.g. (0, 1, 2, ...)
190 | sample_index (int) -- the index of each sample
191 | window_index (int) -- the index of each window
192 | label (int) -- the label of each windowed sample (0 for normal, 1 for anomaly)
193 |
194 | """
195 |
196 | # load the dataframe
197 | df = self.load_df(train_or_val)
198 |
199 | x = df.drop("y", axis=1).to_numpy()
200 | y = df["y"].to_numpy()
201 |
202 | # instantiate the "temporary" lists to store the windows and labels
203 | window_list = []
204 | y_sample_win_label_list = []
205 |
206 | n_samples = x.shape[0]
207 | len_sample = x.shape[1]
208 |
209 |         # slide the strided window across each sample, appending windows to the
210 |         # temporary list until the remaining segment is shorter than window_size
211 | for window_i in range(len_sample):
212 | windowed_signal = x[
213 | :, window_i * self.stride : window_i * self.stride + self.window_size
214 | ]
215 |
216 | # if the windowed signal is the proper length, add it to the list
217 | if windowed_signal.shape == (n_samples, self.window_size):
218 | window_list.append(windowed_signal)
219 |
220 | y_sample_win_label_list.append(
221 | [
222 | (int(sample_indices), int(window_indices), int(ys))
223 | for sample_indices, window_indices, ys in list(
224 | zip(list(range(0, n_samples)), [window_i] * n_samples, y)
225 | )
226 | ]
227 | )
228 |
229 | else:
230 | break
231 |
232 | x = np.array(window_list).reshape(n_samples, -1, self.window_size)
233 |
234 | y_sample_win_label_array = np.array(y_sample_win_label_list)[:, :, np.newaxis].repeat(
235 | self.window_size, axis=2
236 | )
237 |
238 | time_index = (
239 | np.arange(0, self.window_size, 1)[np.newaxis, np.newaxis, :]
240 | .repeat(n_samples, axis=1)
241 | .repeat(x.shape[1], axis=0)[:, :, :, np.newaxis]
242 | )
243 |
244 | y_time_sample_win_label_array = np.concatenate(
245 | (time_index, y_sample_win_label_array), axis=3
246 | ).reshape(n_samples, -1, self.window_size, 4)
247 | # window_id_array = np.expand_dims(np.array(window_id_list).reshape(-1), axis=1)
248 | # window_label_array = np.expand_dims(np.array(window_label_list).reshape(-1), axis=1)
249 |
250 | # x = np.vstack(window_list,)
251 |
252 | # y = np.hstack((window_label_array, window_id_array))
253 | # return np.vstack(x), np.vstack(y_time_sig_win_label_array)
254 | return x, y_time_sample_win_label_array
255 |
256 | def create_xy_dataframe(self, train_or_val: str = "train"):
257 | """
258 | Create a flat dataframe (2D array) of the x and y arrays.
259 |
260 | Amenable for use with TSFresh for feature engineering.
261 |
262 | Returns
263 | -------
264 | df : pd.DataFrame
265 | Single flat dataframe containing each sample and its labels.
266 |             columns: ['x', 'time_index', 'sample_index', 'win_index', 'y']
267 |
268 | """
269 |
270 | x, y = self.create_xy_arrays(train_or_val) # create the x and y arrays
271 |
272 | df = pd.DataFrame(np.vstack(x).reshape(-1,1), columns=['x'])
273 |
274 | # add the time_index, sample_index, window_index, and label columns
275 | # to the dataframe
276 | df = df.assign(time_index=np.vstack(y[:,:,:,0]).reshape(-1,1))
277 | df = df.assign(sample_index=np.vstack(y[:,:,:,1]).reshape(-1,1))
278 | df = df.assign(win_index=np.vstack(y[:,:,:,2]).reshape(-1,1))
279 | df = df.assign(y=np.vstack(y[:,:,:,3]).reshape(-1,1))
280 |
281 | return df
282 |
--------------------------------------------------------------------------------
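A minimal usage sketch for the Airbus classes above (not part of the repository source). It assumes the .h5 files have already been downloaded into a local "data/raw/airbus" folder; the root path below is hypothetical.

    from pathlib import Path
    from pyphm.datasets.airbus import AirbusPrepMethodA

    root = Path("./data/raw")  # hypothetical root folder holding all raw data sets

    # build the prep class; set download=True to fetch the .h5 files from ETH Zurich instead
    airbus = AirbusPrepMethodA(root, download=False, window_size=64, stride=64)

    # windowed arrays for deep learning, per the docstrings above:
    #   x -> (n_samples, n_windows, window_size)
    #   y -> (n_samples, n_windows, window_size, 4)
    x, y = airbus.create_xy_arrays(train_or_val="val")

    # flat dataframe (one row per time step), e.g. for feature engineering with TSFresh
    df = airbus.create_xy_dataframe(train_or_val="val")
    print(x.shape, y.shape, df.head())

--------------------------------------------------------------------------------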
/notebooks/scratch/test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "ename": "ModuleNotFoundError",
10 | "evalue": "No module named 'pyphm'",
11 | "output_type": "error",
12 | "traceback": [
13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
14 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
15 | "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[1;32m 2\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/tim/Documents/PyPHM\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyphm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _urlretrieve\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyphm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmilling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MillingDataLoad, MillingPrepMethodA\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n",
16 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyphm'"
17 | ]
18 | }
19 | ],
20 | "source": [
21 | "import sys\n",
22 | "sys.path.append(r'/home/tim/Documents/PyPHM')\n",
23 | "from pyphm.datasets.utils import _urlretrieve\n",
24 | "from pyphm.datasets.milling import MillingDataLoad, MillingPrepMethodA\n",
25 | "from pathlib import Path\n",
26 | "import hashlib\n",
27 | "\n",
28 | "%load_ext autoreload\n",
29 | "%autoreload 2"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "/home/tim/Documents/PyPHM\n",
42 | "/home/tim/Documents/PyPHM/data\n",
43 | "\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "root_dir = Path.cwd().parent\n",
49 | "print(root_dir)\n",
50 | "path_data_raw_folder = Path(root_dir / 'data' )\n",
51 | "print(path_data_raw_folder)\n",
52 | "print(type(path_data_raw_folder))"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 4,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "type(root) = \n",
65 | "Loading data!!!!\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "mill = MillingDataLoad(path_data_raw_folder, download=False)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 5,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "type(root) = \n",
83 | "Loading data!!!!\n",
84 | "type field names: \n",
85 | "type signal names: \n",
86 | "('case', 'run', 'VB', 'time', 'DOC', 'feed', 'material', 'smcAC', 'smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle')\n",
87 | "('AE_spindle', 'AE_table', 'vib_spindle', 'vib_table', 'smcDC', 'smcAC')\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "mill = MillingPrepMethodA(path_data_raw_folder, download=False)"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "x.shape (11570, 64, 6)\n",
105 | "y.shape (11570, 64, 3)\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "x, y = mill.create_xy_arrays()\n",
111 | "print(\"x.shape\", x.shape)\n",
112 | "print(\"y.shape\", y.shape)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 7,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "array(['0', '0_0', '0.0'], dtype='\n",
144 | "\n",
157 | "\n",
158 | " \n",
159 | " \n",
160 | " | \n",
161 | " cut_id | \n",
162 | " cut_no | \n",
163 | " case | \n",
164 | " time | \n",
165 | " ae_spindle | \n",
166 | " ae_table | \n",
167 | " vib_spindle | \n",
168 | " vib_table | \n",
169 | " smcdc | \n",
170 | " smcac | \n",
171 | " tool_class | \n",
172 | "
\n",
173 | " \n",
174 | " \n",
175 | " \n",
176 | " | 0 | \n",
177 | " 0_0 | \n",
178 | " 0 | \n",
179 | " 1 | \n",
180 | " 0.000 | \n",
181 | " 0.219727 | \n",
182 | " 0.272827 | \n",
183 | " 0.733643 | \n",
184 | " 2.116699 | \n",
185 | " 6.840820 | \n",
186 | " 0.124512 | \n",
187 | " 0 | \n",
188 | "
\n",
189 | " \n",
190 | " | 1 | \n",
191 | " 0_0 | \n",
192 | " 0 | \n",
193 | " 1 | \n",
194 | " 0.004 | \n",
195 | " 0.246582 | \n",
196 | " 0.322266 | \n",
197 | " 0.778809 | \n",
198 | " 2.277832 | \n",
199 | " 6.660156 | \n",
200 | " -0.561523 | \n",
201 | " 0 | \n",
202 | "
\n",
203 | " \n",
204 | " | 2 | \n",
205 | " 0_0 | \n",
206 | " 0 | \n",
207 | " 1 | \n",
208 | " 0.008 | \n",
209 | " 0.294189 | \n",
210 | " 0.283813 | \n",
211 | " 0.758057 | \n",
212 | " 2.343750 | \n",
213 | " 6.508789 | \n",
214 | " -2.099609 | \n",
215 | " 0 | \n",
216 | "
\n",
217 | " \n",
218 | " | 3 | \n",
219 | " 0_0 | \n",
220 | " 0 | \n",
221 | " 1 | \n",
222 | " 0.012 | \n",
223 | " 0.323486 | \n",
224 | " 0.260010 | \n",
225 | " 0.726318 | \n",
226 | " 2.448730 | \n",
227 | " 6.542969 | \n",
228 | " -2.731934 | \n",
229 | " 0 | \n",
230 | "
\n",
231 | " \n",
232 | " | 4 | \n",
233 | " 0_0 | \n",
234 | " 0 | \n",
235 | " 1 | \n",
236 | " 0.016 | \n",
237 | " 0.290527 | \n",
238 | " 0.253296 | \n",
239 | " 0.653076 | \n",
240 | " 2.546387 | \n",
241 | " 6.621094 | \n",
242 | " -3.505859 | \n",
243 | " 0 | \n",
244 | "
\n",
245 | " \n",
246 | "
\n",
247 | ""
248 | ],
249 | "text/plain": [
250 | " cut_id cut_no case time ae_spindle ae_table vib_spindle vib_table \\\n",
251 | "0 0_0 0 1 0.000 0.219727 0.272827 0.733643 2.116699 \n",
252 | "1 0_0 0 1 0.004 0.246582 0.322266 0.778809 2.277832 \n",
253 | "2 0_0 0 1 0.008 0.294189 0.283813 0.758057 2.343750 \n",
254 | "3 0_0 0 1 0.012 0.323486 0.260010 0.726318 2.448730 \n",
255 | "4 0_0 0 1 0.016 0.290527 0.253296 0.653076 2.546387 \n",
256 | "\n",
257 | " smcdc smcac tool_class \n",
258 | "0 6.840820 0.124512 0 \n",
259 | "1 6.660156 -0.561523 0 \n",
260 | "2 6.508789 -2.099609 0 \n",
261 | "3 6.542969 -2.731934 0 \n",
262 | "4 6.621094 -3.505859 0 "
263 | ]
264 | },
265 | "execution_count": 39,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "df = mill.create_xy_dataframe()\n",
272 | "df.head()"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 40,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "data": {
282 | "text/plain": [
283 | "(740480, 11)"
284 | ]
285 | },
286 | "execution_count": 40,
287 | "metadata": {},
288 | "output_type": "execute_result"
289 | }
290 | ],
291 | "source": [
292 | "df.shape"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 10,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "data": {
302 | "text/plain": [
303 | "(11570, 64, 3)"
304 | ]
305 | },
306 | "execution_count": 10,
307 | "metadata": {},
308 | "output_type": "execute_result"
309 | }
310 | ],
311 | "source": [
312 | "y.shape"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 8,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "(11570, 64, 6)"
324 | ]
325 | },
326 | "execution_count": 8,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "x.shape"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": []
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 8,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "# sys.path.append(root_dir / 'pyphm')\n",
349 | "from pyphm.datasets.utils import _urlretrieve"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": []
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 9,
362 | "metadata": {},
363 | "outputs": [
364 | {
365 | "name": "stdout",
366 | "output_type": "stream",
367 | "text": [
368 | "d3ca5a418c2ed0887d68bc3f91991f12\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "def file_as_bytes(file):\n",
374 | " with file:\n",
375 | " return file.read()\n",
376 | "\n",
377 | "print(hashlib.md5(file_as_bytes(open(path_data_raw_folder / 'IMS.7z', 'rb'))).hexdigest())"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "# _urlretrieve('https://files.realpython.com/media/Python-Imports_Watermarked.ae72c8a00197.jpg', 'test.jpg')"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "import sys\n",
396 | "sys.path"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "import pyphm"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": []
414 | }
415 | ],
416 | "metadata": {
417 | "interpreter": {
418 | "hash": "a445fd1dd59e042f3702a5878c89afe1dbbe900f3b58e4a7756e0c9feaaac4f1"
419 | },
420 | "kernelspec": {
421 | "display_name": "Python 3.8.12 64-bit ('ganzoo': conda)",
422 | "language": "python",
423 | "name": "python3"
424 | },
425 | "language_info": {
426 | "codemirror_mode": {
427 | "name": "ipython",
428 | "version": 3
429 | },
430 | "file_extension": ".py",
431 | "mimetype": "text/x-python",
432 | "name": "python",
433 | "nbconvert_exporter": "python",
434 | "pygments_lexer": "ipython3",
435 | "version": "3.11.7"
436 | },
437 | "orig_nbformat": 4
438 | },
439 | "nbformat": 4,
440 | "nbformat_minor": 2
441 | }
442 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/ims.py:
--------------------------------------------------------------------------------
1 | import scipy.io as sio
2 | import numpy as np
3 | import pandas as pd
4 | from pathlib import Path
5 | from .pyphm import PHMDataset
6 | import datetime
7 | import time
8 | import multiprocessing as mp
9 | from typing import Any, Callable, List, Optional, Tuple
10 | from .utils import (
11 | download_and_extract_archive,
12 | extract_archive,
13 | check_integrity,
14 | )
15 | import os
16 | from urllib.error import URLError
17 |
18 |
19 | class ImsDataLoad(PHMDataset):
20 | """
21 | Load the IMS bearing data set from .csv files, and download if necessary.
22 |
23 | Args:
24 | root (string): Root directory to place all the data sets.
25 |
26 | dataset_folder_name (string): Name of folder containing raw data.
27 | This folder will be created in the root directory if not present.
28 |
29 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository.
30 |
31 | """
32 |
33 | mirrors = [
34 | "https://drive.google.com/file/d/1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx/view?usp=sharing",
35 | "https://ti.arc.nasa.gov/m/project/prognostic-repository/",
36 | ]
37 |
38 | resources = [
39 | ("IMS.7z", "d3ca5a418c2ed0887d68bc3f91991f12"),
40 | ]
41 |
42 | col_1st_names = [
43 | "b1_ch1",
44 | "b1_ch2",
45 | "b2_ch3",
46 | "b2_ch4",
47 | "b3_ch5",
48 | "b3_ch6",
49 | "b4_ch7",
50 | "b4_ch8",
51 | ]
52 | col_2nd_names = col_3rd_names = ["b1_ch1", "b1_ch2", "b2_ch3", "b2_ch4"]
53 |
54 | def __init__(
55 | self,
56 | root: Path,
57 | dataset_folder_name: str = "ims",
58 | download: bool = False,
59 | dataset_path: Path = None,
60 | data: np.ndarray = None,
61 | sample_freq: float = 20480.0,
62 | ) -> None:
63 | super().__init__(root, dataset_folder_name)
64 |
65 | self.dataset_path = self.root / self.dataset_folder_name
66 |
67 | if download:
68 | self.download()
69 |
70 | if not self._check_exists():
71 | raise RuntimeError(
72 | "Dataset not found. You can use download=True to download it"
73 | )
74 |
75 | # set the paths for the three experiment run folders
76 | self.path_1st_folder = self.dataset_path / "1st_test"
77 | self.path_2nd_folder = self.dataset_path / "2nd_test"
78 |
79 | # the third test is labelled as the "4th_test" in the IMS.7z archive
80 | self.path_3rd_folder = self.dataset_path / "4th_test/txt"
81 |
82 | self.sample_freq = sample_freq
83 |
84 | def _check_exists(self) -> bool:
85 | return all(
86 | check_integrity(self.dataset_path / file_name)
87 | for file_name, _ in self.resources
88 | )
89 |
90 | def download(self) -> None:
91 | """Download the UC Berkeley milling data if it doesn't exist already."""
92 |
93 | if self._check_exists():
94 | print("IMS.7z already exists.")
95 | return
96 |
97 |         # create the dataset folder if it does not exist
98 | self.dataset_path.mkdir(parents=True, exist_ok=True)
99 |
100 | # download files
101 | for filename, md5 in self.resources:
102 | for mirror in self.mirrors:
103 | url = f"{mirror}{filename}"
104 | try:
105 | print(f"Downloading {url}")
106 | download_and_extract_archive(
107 | url, download_root=self.dataset_path, filename=filename, md5=md5
108 | )
109 |
110 | # sequentially extract the .rar files
111 | rar_list = ["1st_test.rar", "2nd_test.rar", "3rd_test.rar"]
112 | for rar_file in rar_list:
113 | print(f"Extracting {rar_file}")
114 | extract_archive(
115 | self.dataset_path / rar_file, remove_finished=True
116 | )
117 |
118 | except URLError as error:
119 | print(f"Failed to download (trying next):\n{error}")
120 | continue
121 | finally:
122 | print()
123 | break
124 | else:
125 | raise RuntimeError(f"Error downloading {filename}")
126 |
127 | def extract(self) -> None:
128 | """Extract the data set if it has already been dowloaded."""
129 |
130 | if not self._check_exists():
131 | print("IMS.7z does not exist. Please download.")
132 | return
133 |
134 | print("Extracting IMS.7z")
135 |
136 | # start with the .7z file
137 | extract_archive(self.dataset_path / "IMS.7z", remove_finished=False)
138 |
139 | # sequentially extract the .rar files
140 | rar_list = ["1st_test.rar", "2nd_test.rar", "3rd_test.rar"]
141 | for rar_file in rar_list:
142 | print(f"Extracting {rar_file}")
143 | extract_archive(self.dataset_path / rar_file, remove_finished=True)
144 |
145 | @staticmethod
146 |     def process_raw_csv_to_dict(file_info_dict) -> dict:
147 | """Load an individual sample (.csv file) of the IMS data set."""
148 |
149 | path_run_folder = file_info_dict["path_run_folder"]
150 | file_name = file_info_dict["file_name"]
151 | run_no = file_info_dict["run_no"]
152 | sample_index = file_info_dict["sample_index"]
153 |
154 | # load the .csv file
155 | signals_array = np.loadtxt(path_run_folder / file_name, delimiter="\t")
156 |
157 | # get the start time (for the first sample) and convert to unix timestamp
158 | start_time_unix = time.mktime(
159 | datetime.datetime.strptime(file_name, "%Y.%m.%d.%H.%M.%S").timetuple()
160 | )
161 |
162 | # create dictionary with the signals_array, id_list, run_list, file_list, time_step_array
163 | data_dict = {
164 | "signals_array": signals_array,
165 | "id": f"{run_no}_{sample_index}",
166 | "run_no": run_no,
167 | "file_name": file_name,
168 | "sample_index": sample_index,
169 | "start_time_unix": start_time_unix,
170 | }
171 |
172 | return data_dict
173 |
174 | def load_run_as_dict(
175 | self,
176 | run_no: int,
177 | n_jobs: int = None,
178 |     ) -> dict:
179 | if run_no == 1:
180 | col_names = self.col_1st_names
181 | path_run_folder = self.path_1st_folder
182 | elif run_no == 2:
183 | col_names = self.col_2nd_names
184 | path_run_folder = self.path_2nd_folder
185 | else:
186 | col_names = self.col_3rd_names
187 | path_run_folder = self.path_3rd_folder
188 |
189 | # create a list of dictionaries containing the metadata for each file
190 | file_info_list = []
191 | for i, file_name in enumerate(sorted(os.listdir(path_run_folder))):
192 | file_info_list.append(
193 | {
194 | "path_run_folder": path_run_folder,
195 | "file_name": file_name,
196 | "col_names": col_names,
197 | "run_no": run_no,
198 | "sample_index": i,
199 | }
200 | )
201 |
202 | # get number of cpu cores
203 | if n_jobs is None:
204 | n_jobs = mp.cpu_count() - 2
205 | if n_jobs < 1:
206 | n_jobs = 1
207 | print("n_jobs:", n_jobs)
208 | with mp.Pool(processes=n_jobs) as pool:
209 |
210 | # from https://stackoverflow.com/a/36590187
211 | data_list = pool.map(self.process_raw_csv_to_dict, file_info_list)
212 |
213 | # store the data from data_list as a dictionary, with the key being the file name
214 | data_dict = {}
215 | for data_dict_i in data_list:
216 | data_dict[data_dict_i["file_name"]] = data_dict_i
217 | return data_dict
218 |
219 | @staticmethod
220 |     def process_raw_csv_to_df(file_info_dict) -> pd.DataFrame:
221 | """Load an individual sample (.csv file) of the IMS data set."""
222 |
223 | path_run_folder = file_info_dict["path_run_folder"]
224 | file_name = file_info_dict["file_name"]
225 | sample_freq = file_info_dict["sample_freq"]
226 | col_names = file_info_dict["col_names"]
227 | run_no = file_info_dict["run_no"]
228 | sample_index = file_info_dict["sample_index"]
229 |
230 | # load the .csv file
231 | signals_array = np.loadtxt(path_run_folder / file_name, delimiter="\t")
232 |
233 | id_list = [f"{run_no}_{sample_index}"] * len(signals_array)
234 | run_list = [run_no] * len(signals_array)
235 | file_list = [file_name] * len(signals_array)
236 | time_step_array = np.linspace(
237 | 0.0, len(signals_array) / sample_freq, len(signals_array)
238 | )
239 |
240 | df = pd.DataFrame(np.vstack(signals_array), columns=col_names, dtype=np.float32)
241 | df["id"] = id_list
242 | df["run"] = run_list
243 | df["file"] = file_list
244 | df["time_step"] = np.hstack(time_step_array)
245 |
246 | return df.astype({"id": str, "run": int, "file": str, "time_step": np.float32})
247 |
248 | def load_run_as_df(
249 | self,
250 | run_no: int,
251 | n_jobs: int = None,
252 |     ) -> pd.DataFrame:
253 | """Load the three runs as individual dataframes."""
254 |
255 | if run_no == 1:
256 | col_names = self.col_1st_names
257 | path_run_folder = self.path_1st_folder
258 | elif run_no == 2:
259 | col_names = self.col_2nd_names
260 | path_run_folder = self.path_2nd_folder
261 | else:
262 | col_names = self.col_3rd_names
263 | path_run_folder = self.path_3rd_folder
264 |
265 | # get list of every file in the folder and sort by ascending date
266 | file_list = sorted(os.listdir(path_run_folder))
267 |
268 | # create a list of dictionaries containing the metadata for each file
269 | file_info_list = []
270 | for i, file_name in enumerate(sorted(os.listdir(path_run_folder))):
271 | file_info_list.append(
272 | {
273 | "path_run_folder": path_run_folder,
274 | "file_name": file_name,
275 | "sample_freq": self.sample_freq,
276 | "col_names": col_names,
277 | "run_no": run_no,
278 | "sample_index": i,
279 | }
280 | )
281 |
282 | # get number of cpu cores
283 | if n_jobs is None:
284 | n_jobs = mp.cpu_count() - 2
285 | if n_jobs < 1:
286 | n_jobs = 1
287 |
288 | # load the dataframes in parallel
289 | with mp.Pool(processes=n_jobs) as pool:
290 |
291 | # from https://stackoverflow.com/a/36590187
292 | df_run = pool.map(self.process_raw_csv_to_df, file_info_list)
293 | df = pd.concat(df_run, ignore_index=True)
294 |
295 | col_names_ordered = ["id", "run", "file", "time_step"] + col_names
296 |
297 | return df[col_names_ordered]
298 |
299 |
300 | class ImsPrepMethodA(ImsDataLoad):
301 | """
302 |     Class used to prepare the IMS bearing dataset before feature engineering or machine learning.
303 |
304 | Args:
305 | root (string): Root directory to place all the data sets. (likely the raw data folder)
306 |
307 | dataset_folder_name (string): Name of folder containing raw data.
308 | This folder will be created in the root directory if not present.
309 |
310 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository.
311 |
321 |
322 | def __init__(
323 | self,
324 | root: Path,
325 | dataset_folder_name: str = "ims",
326 | download: bool = False,
327 | ) -> None:
328 | super().__init__(
329 | root,
330 | dataset_folder_name,
331 | download,
332 | )
333 |
334 | def create_xy_arrays(
335 | self,
336 | run_no: int = 1,
337 | n_jobs: int = None,
338 |     ) -> Tuple[np.ndarray, np.ndarray]:
339 |
340 | # create a list to store the x and y arrays
341 | x = [] # instantiate X's
342 | y_ids_runs_files_times_ctimes = [] # instantiate y's
343 |
344 | # create the data dict storing the signals and metadata
345 | data_dict = self.load_run_as_dict(run_no, n_jobs)
346 |
347 | # get all the file names from the data_dict and sort them
348 | file_names = sorted(data_dict.keys())
349 |
350 | for i, file_name in enumerate(file_names):
351 |
352 | x.append(data_dict[file_name]["signals_array"])
353 | y_ids_runs_files_times_ctimes.append(
354 | [
355 | data_dict[file_name]["id"],
356 | data_dict[file_name]["run_no"],
357 | data_dict[file_name]["file_name"],
358 | data_dict[file_name]["sample_index"],
359 | data_dict[file_name]["start_time_unix"],
360 | ]
361 | )
362 |
363 | x = np.stack(x)
364 | n_samples = x.shape[0]
365 | n_signals = x.shape[2]
366 |
367 | return x, np.stack(y_ids_runs_files_times_ctimes).reshape(-1, 5)
368 |
369 | def create_xy_df(
370 | self,
371 | run_no: int = 1,
372 | n_jobs: int = None,
373 |     ) -> pd.DataFrame:
374 | return self.load_run_as_df(run_no, n_jobs)
375 |
--------------------------------------------------------------------------------
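A minimal usage sketch for the IMS classes above (not part of the repository source). It assumes IMS.7z has already been downloaded and extracted into a local "data/raw/ims" folder; the root path below is hypothetical.

    from pathlib import Path
    from pyphm.datasets.ims import ImsPrepMethodA

    root = Path("./data/raw")  # hypothetical root folder holding all raw data sets

    ims = ImsPrepMethodA(root, download=False)

    # load run 1 as arrays:
    #   x -> (n_files, n_time_steps, n_channels)
    #   y -> (n_files, 5) of metadata: (id, run_no, file_name, sample_index, start_time_unix)
    x, y = ims.create_xy_arrays(run_no=1, n_jobs=2)

    # or load run 1 as one long dataframe with id/run/file/time_step columns followed by the channels
    df = ims.create_xy_df(run_no=1, n_jobs=2)
    print(x.shape, y.shape, df.shape)

--------------------------------------------------------------------------------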
/notebooks/images/prauc_cnc.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/notebooks/scratch/airbus_download.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyphm.datasets.utils import _urlretrieve, download_url\n",
10 | "from pathlib import Path\n",
11 | "from pyphm.datasets.airbus import AirbusDataLoad\n",
12 | "\n",
13 | "%load_ext autoreload\n",
14 | "%autoreload 2"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 6,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "name": "stdout",
24 | "output_type": "stream",
25 | "text": [
26 | "/home/tim/Documents/PyPHM\n",
27 | "/home/tim/Documents/PyPHM/data/raw\n"
28 | ]
29 | }
30 | ],
31 | "source": [
32 | "root_dir = Path.cwd().parent.parent\n",
33 | "print(root_dir)\n",
34 | "path_data_raw_folder = Path(root_dir / 'data/raw/' )\n",
35 | "print(path_data_raw_folder)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 7,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "air = AirbusDataLoad(path_data_raw_folder, download=True)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 8,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "(1677, 61441)\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "df_train = air.load_df(train_or_val=\"train\")\n",
62 | "print(df_train.shape)"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 9,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "(594, 61441)\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "df_val = air.load_df(train_or_val=\"val\")\n",
80 | "print(df_val.shape)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": []
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 12,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "Downloading https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5 to c:\\_Python\\PyPHM\\notebooks\\dftrain.h5\n"
100 | ]
101 | },
102 | {
103 | "name": "stderr",
104 | "output_type": "stream",
105 | "text": [
106 | " 4%|▍ | 36639744/825280760 [00:04<01:32, 8542721.26it/s] \n"
107 | ]
108 | },
109 | {
110 | "ename": "KeyboardInterrupt",
111 | "evalue": "",
112 | "output_type": "error",
113 | "traceback": [
114 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
115 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
116 | "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_20668/1413174493.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m ]\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mdownload_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpath_cwd\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
117 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[1;34m(url, root, filename, md5, max_redirect_hops)\u001b[0m\n\u001b[0;32m 176\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 177\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Downloading \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0murl\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\" to \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mfpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 178\u001b[1;33m \u001b[0m_urlretrieve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 179\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mURLError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# type: ignore[attr-defined]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"https\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
118 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36m_urlretrieve\u001b[1;34m(url, filename, chunk_size)\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m\"User-Agent\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mUSER_AGENT\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpbar\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 71\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[1;32min\u001b[0m \u001b[0miter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
119 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m\"User-Agent\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mUSER_AGENT\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpbar\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 71\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[1;32min\u001b[0m \u001b[0miter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
120 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\http\\client.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, amt)\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;31m# Amount is given, implement using readinto\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[0mb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 460\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mn\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
121 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\http\\client.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 501\u001b[0m \u001b[1;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 502\u001b[0m \u001b[1;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 503\u001b[1;33m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 504\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 505\u001b[0m \u001b[1;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
122 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 668\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 669\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 670\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
123 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[1;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[0;32m 1239\u001b[0m \u001b[1;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[1;33m%\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1240\u001b[0m self.__class__)\n\u001b[1;32m-> 1241\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1242\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1243\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
124 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\ssl.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m 1097\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1098\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1099\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1100\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1101\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
125 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "mirrors = [\n",
131 | " \"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/\",\n",
132 | "]\n",
133 | "\n",
134 | "resources = [\n",
135 | " (\"dftrain.h5?sequence=1&isAllowed=y\",),\n",
136 | "]\n",
137 | "\n",
138 | "download_url(\"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5\", root=path_cwd,)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": []
147 | }
148 | ],
149 | "metadata": {
150 | "kernelspec": {
151 | "display_name": "Python 3.8.12 ('featstore')",
152 | "language": "python",
153 | "name": "python3"
154 | },
155 | "language_info": {
156 | "codemirror_mode": {
157 | "name": "ipython",
158 | "version": 3
159 | },
160 | "file_extension": ".py",
161 | "mimetype": "text/x-python",
162 | "name": "python",
163 | "nbconvert_exporter": "python",
164 | "pygments_lexer": "ipython3",
165 | "version": "3.8.12"
166 | },
167 | "orig_nbformat": 4,
168 | "vscode": {
169 | "interpreter": {
170 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16"
171 | }
172 | }
173 | },
174 | "nbformat": 4,
175 | "nbformat_minor": 2
176 | }
177 |
--------------------------------------------------------------------------------
/references/sources.bib:
--------------------------------------------------------------------------------
1 | @incollection{buckheit1995wavelab,
2 | title={Wavelab and reproducible research},
3 | author={Buckheit, Jonathan B and Donoho, David L},
4 | booktitle={Wavelets and statistics},
5 | pages={55--81},
6 | year={1995},
7 | publisher={Springer}
8 | }
9 |
10 | @Article{ harris2020array,
11 | title = {Array programming with {NumPy}},
12 | author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J.
13 | van der Walt and Ralf Gommers and Pauli Virtanen and David
14 | Cournapeau and Eric Wieser and Julian Taylor and Sebastian
15 | Berg and Nathaniel J. Smith and Robert Kern and Matti Picus
16 | and Stephan Hoyer and Marten H. van Kerkwijk and Matthew
17 | Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del
18 | R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre
19 | G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and
20 | Warren Weckesser and Hameer Abbasi and Christoph Gohlke and
21 | Travis E. Oliphant},
22 | year = {2020},
23 | month = sep,
24 | journal = {Nature},
25 | volume = {585},
26 | number = {7825},
27 | pages = {357--362},
28 | doi = {10.1038/s41586-020-2649-2},
29 | publisher = {Springer Science and Business Media {LLC}},
30 | url = {https://doi.org/10.1038/s41586-020-2649-2}
31 | }
32 |
33 | @ARTICLE{2020SciPy-NMeth,
34 | author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
35 | Haberland, Matt and Reddy, Tyler and Cournapeau, David and
36 | Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
37 | Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
38 | Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
39 | Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
40 | Kern, Robert and Larson, Eric and Carey, C J and
41 | Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
42 | {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
43 | Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
44 | Harris, Charles R. and Archibald, Anne M. and
45 | Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
46 | {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
47 | title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific
48 | Computing in Python}},
49 | journal = {Nature Methods},
50 | year = {2020},
51 | volume = {17},
52 | pages = {261--272},
53 | adsurl = {https://rdcu.be/b08Wh},
54 | doi = {10.1038/s41592-019-0686-2},
55 | }
56 |
57 | @InProceedings{ mckinney-proc-scipy-2010,
58 | author = { {W}es {M}c{K}inney },
59 | title = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython },
60 | booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference },
61 | pages = { 56 - 61 },
62 | year = { 2010 },
63 | editor = { {S}t\'efan van der {W}alt and {J}arrod {M}illman },
64 | doi = { 10.25080/Majora-92bf1922-00a }
65 | }
66 |
67 | @article{donoho2008reproducible,
68 | title={Reproducible research in computational harmonic analysis},
69 | author={Donoho, David L and Maleki, Arian and Rahman, Inam Ur and Shahram, Morteza and Stodden, Victoria},
70 | journal={Computing in Science \& Engineering},
71 | volume={11},
72 | number={1},
73 | pages={8--18},
74 | year={2008},
75 | publisher={IEEE}
76 | }
77 |
78 | @article{ince2012case,
79 | title={The case for open computer programs},
80 | author={Ince, Darrel C and Hatton, Leslie and Graham-Cumming, John},
81 | journal={Nature},
82 | volume={482},
83 | number={7386},
84 | pages={485--488},
85 | year={2012},
86 | publisher={Nature Publishing Group}
87 | }
88 |
89 |
90 | @article{trouble_lab_2013, ISSN={0013-0613},
91 | title={Trouble at the lab},
92 | url={https://www.economist.com/briefing/2013/10/18/trouble-at-the-lab},
93 | abstractNote={Scientists like to think of science as self-correcting. To an alarming degree, it is not},
94 | journal={The Economist},
95 | year={2013},
96 | month={Oct}}
97 |
98 | @article{hu2022prognostics,
99 | title={Prognostics and health management: A review from the perspectives of design, development and decision},
100 | author={Hu, Yang and Miao, Xuewen and Si, Yong and Pan, Ershun and Zio, Enrico},
101 | journal={Reliability Engineering \& System Safety},
102 | volume={217},
103 | pages={108063},
104 | year={2022},
105 | publisher={Elsevier}
106 | }
107 |
108 | @article{national2019reproducibility,
109 | title={Reproducibility and replicability in science},
110 | author={National Academies of Sciences, Engineering, and Medicine and others},
111 | year={2019},
112 | publisher={National Academies Press}
113 | }
114 |
115 | @inproceedings{stodden2018enabling,
116 | title={Enabling the verification of computational results: An empirical evaluation of computational reproducibility},
117 | author={Stodden, Victoria and Krafczyk, Matthew S and Bhaskar, Adhithya},
118 | booktitle={Proceedings of the First International Workshop on Practical Reproducible Evaluation of Computer Systems},
119 | pages={1--5},
120 | year={2018}
121 | }
122 |
123 | @article{gundersen2018reproducible,
124 | title={On reproducible AI: Towards reproducible research, open science, and digital scholarship in AI publications},
125 | author={Gundersen, Odd Erik and Gil, Yolanda and Aha, David W},
126 | journal={AI magazine},
127 | volume={39},
128 | number={3},
129 | pages={56--68},
130 | year={2018}
131 | }
132 |
133 | @book{chollet2021deep,
134 | title={Deep learning with Python},
135 | author={Chollet, Francois},
136 | year={2021},
137 | publisher={Simon and Schuster}
138 | }
139 |
140 | @inproceedings{astfalck2016modelling,
141 | title={A modelling ecosystem for prognostics},
142 | author={Astfalck, Lachlan and Hodkiewicz, Melinda and Keating, Adrian and Cripps, Edward and Pecht, Michael},
143 | booktitle={Annual Conference of the PHM Society},
144 | volume={8},
145 | number={1},
146 | year={2016}
147 | }
148 |
149 | @article{frachtenberg2022research,
150 | title={Research artifacts and citations in computer systems papers},
151 | author={Frachtenberg, Eitan},
152 | journal={PeerJ Computer Science},
153 | volume={8},
154 | pages={e887},
155 | year={2022},
156 | publisher={PeerJ Inc.}
157 | }
158 |
159 | @article{dorch2015data,
160 | title={The data sharing advantage in astrophysics},
161 | author={Dorch, Bertil F and Drachen, Thea M and Ellegaard, Ole},
162 | journal={Proceedings of the International Astronomical Union},
163 | volume={11},
164 | number={A29A},
165 | pages={172--175},
166 | year={2015},
167 | publisher={Cambridge University Press}
168 | }
169 |
170 | @article{henneken2011linking,
171 | title={Linking to data-effect on citation rates in astronomy},
172 | author={Henneken, Edwin A and Accomazzi, Alberto},
173 | journal={arXiv preprint arXiv:1111.3618},
174 | year={2011}
175 | }
176 |
177 | @article{piwowar2013data,
178 | title={Data reuse and the open data citation advantage},
179 | author={Piwowar, Heather A and Vision, Todd J},
180 | journal={PeerJ},
181 | volume={1},
182 | pages={e175},
183 | year={2013},
184 | publisher={PeerJ Inc.}
185 | }
186 |
187 | @article{piwowar2007sharing,
188 | title={Sharing detailed research data is associated with increased citation rate},
189 | author={Piwowar, Heather A and Day, Roger S and Fridsma, Douglas B},
190 | journal={PloS one},
191 | volume={2},
192 | number={3},
193 | pages={e308},
194 | year={2007},
195 | publisher={Public Library of Science San Francisco, USA}
196 | }
197 |
198 | @article{colavizza2020citation,
199 | title={The citation advantage of linking publications to research data},
200 | author={Colavizza, Giovanni and Hrynaszkiewicz, Iain and Staden, Isla and Whitaker, Kirstie and McGillivray, Barbara},
201 | journal={PloS one},
202 | volume={15},
203 | number={4},
204 | pages={e0230416},
205 | year={2020},
206 | publisher={Public Library of Science San Francisco, CA USA}
207 | }
208 |
209 | @article{fu2019meta,
210 | title={Meta-Research: Releasing a preprint is associated with more attention and citations for the peer-reviewed article},
211 | author={Fu, Darwin Y and Hughey, Jacob J},
212 | journal={Elife},
213 | volume={8},
214 | pages={e52646},
215 | year={2019},
216 | publisher={eLife Sciences Publications Limited}
217 | }
218 |
219 | @article{christensen2019study,
220 | title={A study of the impact of data sharing on article citations using journal policies as a natural experiment},
221 | author={Christensen, Garret and Dafoe, Allan and Miguel, Edward and Moore, Don A and Rose, Andrew K},
222 | journal={PLoS One},
223 | volume={14},
224 | number={12},
225 | pages={e0225883},
226 | year={2019},
227 | publisher={Public Library of Science San Francisco, CA USA}
228 | }
229 |
230 | @article{wahlquist2018dissemination,
231 | title={Dissemination of novel biostatistics methods: Impact of programming code availability and other characteristics on article citations},
232 | author={Wahlquist, Amy E and Muhammad, Lutfiyya N and Herbert, Teri Lynn and Ramakrishnan, Viswanathan and Nietert, Paul J},
233 | journal={PloS one},
234 | volume={13},
235 | number={8},
236 | pages={e0201590},
237 | year={2018},
238 | publisher={Public Library of Science San Francisco, CA USA}
239 | }
240 |
241 | @article{zilberman2021computer,
242 | title={Why computer occupations are behind strong STEM employment growth in the 2019--29 decade},
243 | author={Zilberman, Alan and Ice, Lindsey},
244 | journal={Computer},
245 | volume={4},
246 | number={5,164.6},
247 | pages={11--5},
248 | year={2021}
249 | }
250 |
251 | @article{rainie2017future,
252 | title={The Future of Jobs and Jobs Training.},
253 | author={Rainie, Lee and Anderson, Janna},
254 | journal={Pew Research Center},
255 | year={2017},
256 | publisher={ERIC}
257 | }
258 |
259 | @inproceedings{hars34working,
260 | title={Working for Free? Motivations of Participating in Open Source Projects},
261 | author={Hars, A and Ou, S},
262 | booktitle={34th Annual Hawaii International Conference on System Sciences (HICSS-34)},
263 | pages={25--39}, year={2001}
264 | }
265 |
266 | @article{bitzer2007intrinsic,
267 | title={Intrinsic motivation in open source software development},
268 | author={Bitzer, J{\"u}rgen and Schrettl, Wolfram and Schr{\"o}der, Philipp JH},
269 | journal={Journal of comparative economics},
270 | volume={35},
271 | number={1},
272 | pages={160--169},
273 | year={2007},
274 | publisher={Elsevier}
275 | }
276 |
277 | @misc{neurodatascience,
278 | url={https://neurodatascience.github.io/QLS612-Overview/},
279 | title={An introduction to the foundations of neuro data science},
280 | publisher={McGill University}, }
281 |
282 | @misc{ucberkeleyreproducible,
283 | title={Reproducible and Collaborative Data Science},
284 | url={https://berkeley-stat159-f17.github.io/stat159-f17/},
285 | abstractNote={A project-based introduction to statistical data science.
286 | Through lectures, computational laboratories, readings, homeworks, and a
287 | group project, you will learn practical techniques and tools for producing statistically sound and appropriate, reproducible, and verifiable computational answers to scientific
288 | questions. The course emphasizes version control, testing, process
289 | automation, code review, and collaborative programming. Software tools
290 | include Bash, Git, Python, Jupyter and LATEX},
291 | publisher={University of California, Berkeley} }
292 |
293 | @misc{harvard2017reproducible, url={https://pll.harvard.edu/course/principles-statistical-and-computational-tools-reproducible-data-science},
294 | title={Principles, Statistical and Computational Tools for Reproducible Data Science},
295 | abstractNote={Learn skills and tools that support data science and reproducible research, to ensure you can trust your own research
296 | results, reproduce them yourself, and communicate them to others.},
297 | publisher={Harvard University},
298 | year={2017},
299 | month={Oct} }
300 |
301 | @article{stodden2013toward,
302 | title={Toward reproducible computational research: an empirical analysis of data and code policy adoption by journals},
303 | author={Stodden, Victoria and Guo, Peixuan and Ma, Zhaokun},
304 | journal={PloS one},
305 | volume={8},
306 | number={6},
307 | pages={e67111},
308 | year={2013},
309 | publisher={Public Library of Science San Francisco, USA}
310 | }
311 |
312 | @article{zhao2019deep,
313 | title={Deep learning and its applications to machine health monitoring},
314 | author={Zhao, Rui and Yan, Ruqiang and Chen, Zhenghua and Mao, Kezhi and Wang, Peng and Gao, Robert X},
315 | journal={Mechanical Systems and Signal Processing},
316 | volume={115},
317 | pages={213--237},
318 | year={2019},
319 | publisher={Elsevier}
320 | }
321 |
322 | @article{wang2021recent,
323 | title={Recent Advancement of Deep Learning Applications to Machine Condition Monitoring Part 1: A Critical Review},
324 | author={Wang, Wenyi and Taylor, John and Rees, Robert J},
325 | journal={Acoustics Australia},
326 | pages={1--13},
327 | year={2021},
328 | publisher={Springer}
329 | }
330 |
331 | @article{lee2007bearing,
332 | title={Bearing data set},
333 | author={Lee, J and Qiu, H and Yu, G and Lin, Ja and others},
334 | journal={IMS, University of Cincinnati, NASA Ames Prognostics Data Repository, Rexnord Technical Services},
335 | year={2007}
336 | }
337 |
338 | @article{agogino2007milling,
339 | title={Milling data set. NASA Ames Prognostics Data Repository},
340 | author={Agogino, A and Goebel, K},
341 | journal={Moffett Field, CA},
342 | year={2007},
343 | url={https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/}
344 | }
345 |
346 | @article{garcia2021temporal,
347 | title={Temporal signals to images: Monitoring the condition of industrial assets with deep learning image processing algorithms},
348 | author={Garcia, Gabriel Rodriguez and Michau, Gabriel and Ducoffe, M{\'e}lanie and Gupta, Jayant Sen and Fink, Olga},
349 | journal={Proceedings of the Institution of Mechanical Engineers, Part O: Journal of Risk and Reliability},
350 | pages={1748006X21994446},
351 | year={2021},
352 | publisher={SAGE Publications Sage UK: London, England}
353 | }
354 |
355 | @article{esteban2019fmriprep,
356 | title={fMRIPrep: a robust preprocessing pipeline for functional MRI},
357 | author={Esteban, Oscar and Markiewicz, Christopher J and Blair, Ross W and Moodie, Craig A and Isik, A Ilkay and Erramuzpe, Asier and Kent, James D and Goncalves, Mathias and DuPre, Elizabeth and Snyder, Madeleine and others},
358 | journal={Nature methods},
359 | volume={16},
360 | number={1},
361 | pages={111--116},
362 | year={2019},
363 | publisher={Nature Publishing Group}
364 | }
365 |
366 | @software{christian_s_perone_2018_1495335,
367 | author = {Christian S. Perone and
368 | cclauss and
369 | Elvis Saravia and
370 | Pedro Lemos Ballester and
371 | MohitTare},
372 | title = {perone/medicaltorch: Release v0.2},
373 | month = nov,
374 | year = 2018,
375 | publisher = {Zenodo},
376 | version = {v0.2},
377 | doi = {10.5281/zenodo.1495335},
378 | url = {https://doi.org/10.5281/zenodo.1495335}
379 | }
380 |
381 | @INPROCEEDINGS{astroML,
382 | author={{Vanderplas}, J.T. and {Connolly}, A.J.
383 | and {Ivezi{\'c}}, {\v Z}. and {Gray}, A.},
384 | booktitle={Conference on Intelligent Data Understanding (CIDU)},
385 | title={Introduction to astroML: Machine learning for astrophysics},
386 | month={oct.},
387 | pages={47 -54},
388 | doi={10.1109/CIDU.2012.6382200},
389 | year={2012}}
390 |
391 | @incollection{NEURIPS2019_9015,
392 | title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},
393 | author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
394 | booktitle = {Advances in Neural Information Processing Systems 32},
395 | editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
396 | pages = {8024--8035},
397 | year = {2019},
398 | publisher = {Curran Associates, Inc.},
399 | url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf}
400 | }
401 |
402 | @book{Bird_Natural_Language_Processing_2009,
403 | author = {Bird, Steven and Klein, Ewan and Loper, Edward},
404 | publisher = {O'Reilly Media, Inc.},
405 | title = {{Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit}},
406 | year = {2009}}
407 |
408 | @article{wilson2014software,
409 | title={Software Carpentry: lessons learned},
410 | author={Wilson, Greg},
411 | journal={F1000Research},
412 | volume={3},
413 | year={2014},
414 | publisher={Faculty of 1000 Ltd}
415 | }
416 |
417 |
--------------------------------------------------------------------------------
/src/pyphm/datasets/milling.py:
--------------------------------------------------------------------------------
1 | import scipy.io as sio
2 | import numpy as np
3 | import pandas as pd
4 | from pathlib import Path
5 | from .pyphm import PHMDataset
6 | from typing import Any, Callable, List, Optional, Tuple
7 | import pkg_resources
8 | from .utils import (
9 | download_and_extract_archive,
10 | extract_archive,
11 | check_integrity,
12 | )
13 | import os
14 | from urllib.error import URLError
15 |
16 | """
17 | Contains the data prep class for the UC Berkeley milling data set.
18 |
19 | Also contains helper functions associated with the milling data set.
20 | """
21 |
22 |
23 | ###############################################################################
24 | # Data Prep Classes
25 | ###############################################################################
26 | class MillingDataLoad(PHMDataset):
27 | """
28 | Load the UC Berkeley milling data set from the .mat file, and download it if necessary.
29 |
30 | Args:
31 | root (string): Root directory to place all the data sets.
32 |
33 | dataset_folder_name (string): Name of folder containing raw data.
34 | This folder will be created in the root directory if not present.
35 |
36 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository.
37 |
38 | """
39 |
40 | resources = [
41 | {
42 | "name": "aws",
43 | "url": "https://phm-datasets.s3.amazonaws.com/NASA/",
44 | "files": [
45 | {
46 | "filename": "3.+Milling.zip",
47 | "md5": "4da3afb0aa50cb3dcdd8e20ed1ed1c7c",
48 | }
49 | ],
50 | },
51 | {
52 | "name": "github",
53 | "url": "https://github.com/tvhahn/Manufacturing-Data-Science-with-Python/raw/master/Data%20Sets/milling_uc_berkeley/raw/",
54 | "files": [
55 | {
56 | "filename": "mill.zip",
57 | "md5": "81d821fdef812183a7d38b6f83f7cefa",
58 | }
59 | ],
60 | },
61 | ]
62 |
63 | def __init__(
64 | self,
65 | root: Path,
66 | dataset_folder_name: str = "milling",
67 | data_file_name: str = "mill.mat",
68 | download: bool = False,
69 | data: np.ndarray = None,
70 | ) -> None:
71 | super().__init__(root, dataset_folder_name)
72 |
73 | self.dataset_folder_path = self.root / self.dataset_folder_name
74 | self.data_file_name = data_file_name
75 |
76 | if download:
77 | self.download()
78 |
79 | data_file_path = self.dataset_folder_path / self.data_file_name
80 | # assert that data_file_path exists
81 | assert data_file_path.exists(), f"{data_file_path} does not exist."
82 |
83 | self.data = self.load_mat()
84 |
85 | def _check_exists(self) -> bool:
86 | for source in self.resources:
87 | for file in source["files"]:
88 | file_name = file["filename"]
89 | file_path = self.dataset_folder_path / file_name
90 | if not check_integrity(file_path, file["md5"]):
91 | return False
92 | return True
93 |
94 |
95 | def download(self) -> None:
96 | """Download the data files from their sources if they don't exist already."""
97 |
98 | if self._check_exists():
99 | print("Files already downloaded and verified.")
100 | return
101 |
102 | # Ensure the dataset folder exists
103 | self.dataset_folder_path.mkdir(parents=True, exist_ok=True)
104 |
105 | successful_download = False
106 |
107 | for source in self.resources:
108 | all_files_downloaded = True # Assume success, prove otherwise
109 |
110 | for file in source["files"]:
111 | file_name = file["filename"]
112 | md5 = file["md5"]
113 | file_path = self.dataset_folder_path / file_name
114 |
115 | # Check if the file already exists and is verified
116 | if check_integrity(file_path, md5):
117 | print(f"{file_name} already exists and is verified.")
118 | continue # Skip to the next file as this one is already handled
119 |
120 | # Construct the URL for downloading
121 | url = f"{source['url']}{file_name}"
122 |
123 | try:
124 | print(f"Attempting to download {url}")
125 | download_and_extract_archive(
126 | url,
127 | download_root=str(self.dataset_folder_path),
128 | filename=file_name,
129 | md5=md5,
130 | remove_finished=True,
131 | )
132 | # After successful download and extraction, check for and extract any nested archive
133 | self.check_and_extract_nested(file_path.parent)
134 |
135 | except URLError as error:
136 | print(f"Failed to download {file_name} from {source['name']}:\n{error}")
137 | all_files_downloaded = False # Mark as failed to trigger another source attempt
138 | break # Exit the file loop to try the next source
139 |
140 | if all_files_downloaded:
141 | successful_download = True
142 | print(f"Successfully downloaded all files from {source['name']}")
143 | break # Exit the source loop since we've successfully downloaded from this source
144 |
145 | if not successful_download:
146 | raise RuntimeError("Failed to download files from all sources.")
147 |
148 | def check_and_extract_nested(self, directory: Path) -> None:
149 | """Check for and extract any nested archives in the given directory."""
150 | for item in directory.iterdir():
151 | if item.is_dir():
152 | # Check each directory for nested archives
153 | for nested_item in item.iterdir():
154 | if nested_item.suffix in ['.zip', '.tar', '.gz']:
155 | print(f"Found nested archive: {nested_item}")
156 | extract_archive(str(nested_item), str(directory), remove_finished=True)
157 |
158 |
159 |
160 | def load_mat(self) -> np.ndarray:
161 | """Load the mat file and return the data as a numpy array."""
162 | data = sio.loadmat(self.dataset_folder_path / self.data_file_name, struct_as_record=True)
163 | return data["mill"]
164 |
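# Usage sketch for MillingDataLoad (illustrative only; the root path below is
# an assumption, not something the package requires):
#
#     from pathlib import Path
#     from pyphm.datasets.milling import MillingDataLoad
#
#     loader = MillingDataLoad(root=Path("data/raw"), download=True)
#     mill = loader.data  # structured array, one record per cut (expected shape (1, 167))
#     print(mill.dtype.names)  # seven label fields followed by the six signal channels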
165 |
166 | class MillingPrepMethodA(MillingDataLoad):
167 | """
168 | Class used to prepare the UC Berkeley milling dataset before feature engineering or machine learning.
169 | Method is described in the paper:
170 |
171 | `Self-supervised learning for tool wear monitoring with a disentangled-variational-autoencoder`
172 | by von Hahn and Mechefske, 2021
173 |
174 | Args:
175 | root (string): Root directory to place all the data sets. (likely the raw data folder)
176 |
177 | dataset_folder_name (string): Name of folder (within root) containing raw data.
178 | This folder will be created in the root directory if not present.
179 |
180 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository.
181 |
182 | path_csv_labels (Path, optional): Path to the csv of the label dataframe.
183 | If not provided, the 'milling_labels_with_tool_class.csv' file shipped with the
184 | PyPHM package will be used.
185 |
186 | window_len (int): Length of the window to be used for the sliding window.
187 |
188 | stride (int): Amount to move (stride) between individual windows of data.
189 |
190 | cut_drop_list (list, optional): List of cut numbers to drop. cut_no 17 and 94 are erroneous and
191 | will be dropped by default.
192 | """
193 |
194 | def __init__(
195 | self,
196 | root: Path,
197 | dataset_folder_name: str = "milling",
198 | dataset_folder_path: Path = None,
199 | data_file_name: str = "mill.mat",
200 | download: bool = False,
201 | data: np.ndarray = None,
202 | path_csv_labels: Path = None,
203 | window_len: int = 64,
204 | stride: int = 64,
205 | cut_drop_list: List[int] = [17, 94],
206 | ) -> None:
207 | super().__init__(root, dataset_folder_name, data_file_name, download, data)
208 |
209 | self.window_len = window_len # size of the window
210 | self.stride = stride # stride between windows
211 | self.cut_drop_list = cut_drop_list # list of cut numbers to be dropped
212 |
213 | if path_csv_labels is not None:
214 | self.path_csv_labels = path_csv_labels
215 | else:
216 | # path of pyphm source directory using pathlib
217 | self.path_csv_labels = Path(
218 | pkg_resources.resource_filename(
219 | "pyphm", "datasets/auxilary_metadata/milling_labels_with_tool_class.csv"
220 | )
221 | )
222 |
223 | # load the labels dataframe
224 | self.df_labels = pd.read_csv(self.path_csv_labels)
225 |
226 | if self.cut_drop_list is not None:
227 | self.df_labels.drop(self.cut_drop_list, inplace=True) # drop the cuts that are bad
228 |
229 | self.df_labels.reset_index(drop=True, inplace=True) # reset the index
230 |
231 | self.field_names = self.data.dtype.names
232 |
233 | self.signal_names = self.field_names[7:][::-1]
234 |
235 | def create_labels(self):
236 | """Function that will create the label dataframe from the mill data set
237 |
238 | Only needed if the dataframe with the labels is not provided.
239 | """
240 |
241 | # create empty dataframe for the labels
242 | df_labels = pd.DataFrame()
243 |
244 | # get the labels from the original .mat file and put in dataframe
245 | for i in range(7):
246 | # list for storing the label data for each field
247 | x = []
248 |
249 | # iterate through each of the unique cuts
250 | for j in range(167):
251 | x.append(self.data[0, j][i][0][0])
252 | x = np.array(x)
253 | df_labels[str(i)] = x
254 |
255 | # add column names to the dataframe
256 | df_labels.columns = self.field_names[0:7]
257 |
258 | # create a column with the unique cut number
259 | df_labels["cut_no"] = [i for i in range(167)]
260 |
261 | def tool_state(cols):
262 | """Add the label to the cut.
263 |
264 | Categories are:
265 | Healthy State (label=0): 0~0.2mm flank wear
266 | Degradation State (label=1): 0.2~0.7mm flank wear
267 | Failure State (label=2): >0.7mm flank wear
268 | """
269 | # pass in the tool wear, VB, column
270 | vb = cols
271 |
272 | if vb < 0.2:
273 | return 0
274 | elif vb >= 0.2 and vb < 0.7:
275 | return 1
276 | elif pd.isnull(vb):
277 | pass
278 | else:
279 | return 2
280 |
281 | # apply the label to the dataframe
282 | df_labels["tool_class"] = df_labels["VB"].apply(tool_state)
283 |
284 | return df_labels
285 |
286 | def create_data_array(self, cut_no):
287 | """Create an array from an individual cut sample.
288 |
289 | Parameters
290 | ===========
291 | cut_no : int
292 | Index of the cut to be used.
293 |
294 | Returns
295 | ===========
296 | sub_cut_array : np.array
297 | Array of the cut samples. Shape of [no. samples, sample len, features/sample]
298 |
299 | sub_cut_labels : np.array
301 | Array of the labels for the cut samples. Shape of [no. samples, sample len]
301 |
302 | """
303 |
304 | assert cut_no in self.df_labels["cut_no"].values, "Cut number must be in the dataframe"
305 |
306 | # create a numpy array of the cut
307 | # with a final array shape like [no. cuts, len cuts, no. signals]
308 | cut = self.data[0, cut_no]
309 | for i, signal_name in enumerate(self.signal_names):
310 | if i == 0:
311 | cut_array = cut[signal_name].reshape((9000, 1))
312 | else:
313 | cut_array = np.concatenate((cut_array, cut[signal_name].reshape((9000, 1))), axis=1)
314 |
315 | # select the start and end of the cut
316 | start = self.df_labels[self.df_labels["cut_no"] == cut_no]["window_start"].values[0]
317 | end = self.df_labels[self.df_labels["cut_no"] == cut_no]["window_end"].values[0]
318 | cut_array = cut_array[start:end, :]
319 |
320 | # instantiate the "temporary" lists to store the sub-cuts and metadata
321 | sub_cut_list = []
322 | sub_cut_id_list = []
323 | sub_cut_label_list = []
324 |
325 | # get the labels for the cut
326 | label = self.df_labels[self.df_labels["cut_no"] == cut_no]["tool_class"].values[0]
327 |
328 | # slide strided windows across the cut; stop at the first window that is
329 | # shorter than window_len
330 | for i in range(cut_array.shape[0]):
331 | windowed_signal = cut_array[i * self.stride : i * self.stride + self.window_len]
332 |
333 | # if the windowed signal is the proper length, add it to the list
334 | if windowed_signal.shape == (self.window_len, 6):
335 | sub_cut_list.append(windowed_signal)
336 |
337 | # create sub_cut_id fstring to keep track of the cut_id and the window_id
338 | sub_cut_id_list.append(f"{cut_no}_{i}")
339 |
340 | # create the sub_cut_label and append it to the list
341 | sub_cut_label_list.append(int(label))
342 |
343 | else:
344 | break
345 |
346 | sub_cut_array = np.array(sub_cut_list)
347 |
348 | sub_cut_ids = np.expand_dims(np.array(sub_cut_id_list, dtype=str), axis=1)
349 | sub_cut_ids = np.repeat(sub_cut_ids, sub_cut_array.shape[1], axis=1)
350 |
351 | sub_cut_labels = np.expand_dims(np.array(sub_cut_label_list, dtype=int), axis=1)
352 | sub_cut_labels = np.repeat(sub_cut_labels, sub_cut_array.shape[1], axis=1)
353 |
354 | # take the length of the signals in the sub_cut_array
355 | # and divide it by the frequency (250 Hz) to get the time (seconds) of each sub-cut
356 | sub_cut_times = np.expand_dims(np.arange(0, sub_cut_array.shape[1]) / 250.0, axis=0)
357 | sub_cut_times = np.repeat(
358 | sub_cut_times,
359 | sub_cut_array.shape[0],
360 | axis=0,
361 | )
362 |
363 | sub_cut_labels_ids_times = np.stack((sub_cut_labels, sub_cut_ids, sub_cut_times), axis=2)
364 |
365 | return (
366 | sub_cut_array,
367 | sub_cut_labels,
368 | sub_cut_ids,
369 | sub_cut_times,
370 | sub_cut_labels_ids_times,
371 | )
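
# Worked example of the windowing above (numbers are illustrative): with the
# defaults window_len=64 and stride=64 the windows are non-overlapping, and the
# loop breaks at the first window shorter than window_len. A cut trimmed to
# 7000 samples therefore yields (7000 - 64) // 64 + 1 = 109 windows, so
# sub_cut_array would have shape (109, 64, 6).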
372 |
373 | def create_xy_arrays(self):
374 | """Create the x and y arrays used in deep learning.
375 |
376 | Returns
377 | ===========
378 | x_array : np.array
379 | Array of the cut samples. Shape of [no. samples, sample len, features/sample]
380 |
381 | y_array : np.array
382 | Array of the labels for the cut samples. Shape of [no. samples, sample len, label/ids/times]
383 | Use y[:,0,:], for example, to get the y in a shape of [no. samples, label/ids/times]
384 | ( e.g. will be shape (no. samples, 3) )
385 |
386 | """
387 |
388 | # create a list to store the x and y arrays
389 | x = [] # instantiate X's
390 | y_labels_ids_times = [] # instantiate y's
391 |
392 | # iterate through the df_labels
393 | for i in self.df_labels.itertuples():
394 | (
395 | sub_cut_array,
396 | sub_cut_labels,
397 | sub_cut_ids,
398 | sub_cut_times,
399 | sub_cut_labels_ids_times,
400 | ) = self.create_data_array(i.cut_no)
401 |
402 | x.append(sub_cut_array)
403 | y_labels_ids_times.append(sub_cut_labels_ids_times)
404 |
405 | return np.vstack(x), np.vstack(y_labels_ids_times)
406 |
407 | def create_xy_dataframe(self):
408 | """
409 | Create a flat dataframe (2D array) of the x and y arrays.
410 |
411 | Amenable for use with TSFresh for feature engineering.
412 |
413 | Returns
414 | ===========
415 | df : pd.DataFrame
416 | Single flat dataframe containing each sample and its labels.
417 |
418 | """
419 |
420 | x, y_labels_ids_times = self.create_xy_arrays() # create the x and y arrays
421 |
422 | # concatenate the x and y arrays and reshape them to be a flat array (2D)
423 | x_labels = np.reshape(np.concatenate((x, y_labels_ids_times), axis=2), (-1, 9))
424 |
425 | # define the column names and the data types
426 | col_names = [s.lower() for s in list(self.signal_names)] + [
427 | "tool_class",
428 | "cut_id",
429 | "time",
430 | ]
431 |
432 | col_names_ordered = [
433 | "cut_id",
434 | "cut_no",
435 | "case",
436 | "time",
437 | "ae_spindle",
438 | "ae_table",
439 | "vib_spindle",
440 | "vib_table",
441 | "smcdc",
442 | "smcac",
443 | "tool_class",
444 | ]
445 |
446 | col_dtype = [
447 | str,
448 | int,
449 | int,
450 | np.float32,
451 | np.float32,
452 | np.float32,
453 | np.float32,
454 | np.float32,
455 | np.float32,
456 | np.float32,
457 | int,
458 | ]
459 |
460 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype))
461 |
462 | # create a dataframe from the x and y arrays
463 | df = pd.DataFrame(x_labels, columns=col_names, dtype=str)
464 |
465 | # split the cut_id by "_" and take the first element (cut_no)
466 | df["cut_no"] = df["cut_id"].str.split("_").str[0]
467 |
468 | # get the case from each cut_no using the df_labels
469 | df = df.merge(
470 | self.df_labels[["cut_no", "case"]].astype(dtype=str),
471 | on="cut_no",
472 | how="left",
473 | )
474 |
475 | df = df[col_names_ordered].astype(col_dtype_dict) # reorder the columns
476 |
477 | return df
478 |
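# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (assumptions: the package is importable as `pyphm`
# and ./data/raw is writable; the path and prints are illustrative only).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # download the raw data (if needed), window the signals with the defaults,
    # and build both the arrays and the flat dataframe
    mill_prep = MillingPrepMethodA(root=Path("data/raw"), download=True)

    x, y = mill_prep.create_xy_arrays()
    print(x.shape, y.shape)  # (no. windows, 64, 6) and (no. windows, 64, 3)

    df = mill_prep.create_xy_dataframe()
    print(df.head())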
--------------------------------------------------------------------------------
/src/pyphm/datasets/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | BSD 3-Clause License
3 |
4 | The utils.py is Copyright (c) Soumith Chintala 2016, (from pytorch/vision)
5 | All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without
8 | modification, are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 |
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 |
17 | * Neither the name of the copyright holder nor the names of its
18 | contributors may be used to endorse or promote products derived from
19 | this software without specific prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | """
32 |
33 | import bz2
34 | import gzip
35 | import hashlib
36 | import itertools
37 | import lzma
38 | import os
39 | import os.path
40 | import pathlib
41 | from pathlib import Path
42 | import re
43 | import tarfile
44 | import rarfile # needed for IMS dataset
45 | import py7zr # needed for IMS dataset
46 | import urllib
47 | import urllib.error
48 | import urllib.request
49 | import zipfile
50 | from typing import Any, Callable, List, Iterable, Optional, TypeVar, Dict, IO, Tuple, Iterator
51 | from urllib.parse import urlparse
52 | import gdown
53 |
54 | import requests
55 | from tqdm.auto import tqdm
56 |
57 |
58 | def _download_file_from_remote_location(fpath: str, url: str) -> None:
59 | pass
60 |
61 |
62 | def _is_remote_location_available() -> bool:
63 | return False
64 |
65 | USER_AGENT = "PyPHM"
66 |
67 |
68 | def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
69 | with open(filename, "wb") as fh:
70 | with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response:
71 | with tqdm(total=response.length) as pbar:
72 | for chunk in iter(lambda: response.read(chunk_size), b""):
73 | if not chunk:
74 | break
75 | pbar.update(len(chunk))
76 | fh.write(chunk)
77 |
78 |
79 | def gen_bar_updater() -> Callable[[int, int, int], None]:
80 | pbar = tqdm(total=None)
81 |
82 | def bar_update(count, block_size, total_size):
83 | if pbar.total is None and total_size:
84 | pbar.total = total_size
85 | progress_bytes = count * block_size
86 | pbar.update(progress_bytes - pbar.n)
87 |
88 | return bar_update
89 |
90 |
91 | def calculate_md5(fpath: Path, chunk_size: int = 1024 * 1024) -> str:
92 | md5 = hashlib.md5()
93 | with open(fpath, "rb") as f:
94 | for chunk in iter(lambda: f.read(chunk_size), b""):
95 | md5.update(chunk)
96 | return md5.hexdigest()
97 |
98 |
99 | def check_md5(fpath: Path, md5: str, **kwargs: Any) -> bool:
100 | return md5 == calculate_md5(fpath, **kwargs)
101 |
102 |
103 | def check_integrity(fpath: Path, md5: Optional[str] = None) -> bool:
104 | fpath = Path(fpath)
105 | if not fpath.is_file():
106 | return False
107 | if md5 is None:
108 | return True
109 | return check_md5(fpath, md5)
110 |
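# Example of the intended use (the path and hash below are placeholders, not
# values shipped with the package):
#
#     if check_integrity(Path("data/raw/milling/mill.zip"), md5="<expected-md5>"):
#         print("archive already present and verified")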
111 |
112 | def _get_redirect_url(url: str, max_hops: int = 3) -> str:
113 | initial_url = url
114 | headers = {"Method": "HEAD", "User-Agent": USER_AGENT}
115 |
116 | for _ in range(max_hops + 1):
117 | with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as response:
118 | if response.url == url or response.url is None:
119 | return url
120 |
121 | url = response.url
122 | else:
123 | raise RecursionError(
124 | f"Request to {initial_url} exceeded {max_hops} redirects. The last redirect points to {url}."
125 | )
126 |
127 |
128 | def _get_google_drive_file_id(url: str) -> Optional[str]:
129 | parts = urlparse(url)
130 |
131 | if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
132 | return None
133 |
134 | match = re.match(r"/file/d/(?P<id>[^/]*)", parts.path)
135 | if match is None:
136 | return None
137 |
138 | return match.group("id")
139 |
140 |
141 | def download_url(
142 | url: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None, max_redirect_hops: int = 3
143 | ) -> None:
144 | """Download a file from a url and place it in root.
145 |
146 | Args:
147 | url (str): URL to download file from
148 | root (str): Directory to place downloaded file in
149 | filename (str, optional): Name to save the file under. If None, use the basename of the URL
150 | md5 (str, optional): MD5 checksum of the download. If None, do not check
151 | max_redirect_hops (int, optional): Maximum number of redirect hops allowed
152 | """
153 | root = os.path.expanduser(root)
154 | if not filename:
155 | filename = os.path.basename(url)
156 | fpath = os.path.join(root, filename)
157 |
158 | os.makedirs(root, exist_ok=True)
159 |
160 | # check if file is already present locally
161 | if check_integrity(fpath, md5):
162 | print("Using downloaded and verified file: " + fpath)
163 | return
164 |
165 | if _is_remote_location_available():
166 | _download_file_from_remote_location(fpath, url)
167 | else:
168 | # expand redirect chain if needed
169 | url = _get_redirect_url(url, max_hops=max_redirect_hops)
170 |
171 | # check if file is located on Google Drive
172 | file_id = _get_google_drive_file_id(url)
173 | if file_id is not None:
174 | print("Goolgle drive file id:", file_id)
175 | return gdown.download(id=file_id, output=str(Path(root) / filename), quiet=False)
176 | # return download_file_from_google_drive(file_id, root, filename, md5)
177 |
178 | # download the file
179 | try:
180 | print("Downloading " + url + " to " + fpath)
181 | _urlretrieve(url, fpath)
182 | except (urllib.error.URLError, OSError) as e: # type: ignore[attr-defined]
183 | if url[:5] == "https":
184 | url = url.replace("https:", "http:")
185 | print("Failed download. Trying https -> http instead. Downloading " + url + " to " + fpath)
186 | _urlretrieve(url, fpath)
187 | else:
188 | raise e
189 |
190 | # check integrity of downloaded file
191 | if not check_integrity(fpath, md5):
192 | raise RuntimeError("File not found or corrupted.")
193 |
194 |
195 | def list_dir(root: str, prefix: bool = False) -> List[str]:
196 | """List all directories at a given root
197 |
198 | Args:
199 | root (str): Path to directory whose folders need to be listed
200 | prefix (bool, optional): If true, prepends the path to each result, otherwise
201 | only returns the name of the directories found
202 | """
203 | root = os.path.expanduser(root)
204 | directories = [p for p in os.listdir(root) if os.path.isdir(os.path.join(root, p))]
205 | if prefix is True:
206 | directories = [os.path.join(root, d) for d in directories]
207 | return directories
208 |
209 |
210 | def list_files(root: str, suffix: str, prefix: bool = False) -> List[str]:
211 | """List all files ending with a suffix at a given root
212 |
213 | Args:
214 | root (str): Path to directory whose folders need to be listed
215 | suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png').
216 | It uses the Python "str.endswith" method and is passed directly
217 | prefix (bool, optional): If true, prepends the path to each result, otherwise
218 | only returns the name of the files found
219 | """
220 | root = os.path.expanduser(root)
221 | files = [p for p in os.listdir(root) if os.path.isfile(os.path.join(root, p)) and p.endswith(suffix)]
222 | if prefix is True:
223 | files = [os.path.join(root, d) for d in files]
224 | return files
225 |
226 |
227 | def _quota_exceeded(first_chunk: bytes) -> bool:
228 | try:
229 | return "Google Drive - Quota exceeded" in first_chunk.decode()
230 | except UnicodeDecodeError:
231 | return False
232 |
233 |
234 | def download_file_from_google_drive(file_id: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None):
235 | """Download a Google Drive file from and place it in root.
236 |
237 | Args:
238 | file_id (str): id of file to be downloaded
239 | root (str): Directory to place downloaded file in
240 | filename (str, optional): Name to save the file under. If None, use the id of the file.
241 | md5 (str, optional): MD5 checksum of the download. If None, do not check
242 | """
243 | # Based on https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
244 |
245 | url = "https://docs.google.com/uc?export=download"
246 |
247 | root = os.path.expanduser(root)
248 | if not filename:
249 | filename = file_id
250 | fpath = os.path.join(root, filename)
251 |
252 | os.makedirs(root, exist_ok=True)
253 |
254 | if os.path.isfile(fpath) and check_integrity(fpath, md5):
255 | print("Using downloaded and verified file: " + fpath)
256 | else:
257 | session = requests.Session()
258 |
259 | response = session.get(url, params={"id": file_id}, stream=True)
260 | token = _get_confirm_token(response)
261 |
262 | if token:
263 | params = {"id": file_id, "confirm": token}
264 | response = session.get(url, params=params, stream=True)
265 |
266 | # Ideally, one would use response.status_code to check for quota limits, but google drive is not consistent
267 | # with their own API, refer https://github.com/pytorch/vision/issues/2992#issuecomment-730614517.
268 | # Should this be fixed at some place in future, one could refactor the following to no longer rely on decoding
269 | # the first_chunk of the payload
270 | response_content_generator = response.iter_content(32768)
271 | first_chunk = None
272 | while not first_chunk: # filter out keep-alive new chunks
273 | first_chunk = next(response_content_generator)
274 |
275 | if _quota_exceeded(first_chunk):
276 | msg = (
277 | f"The daily quota of the file {filename} is exceeded and it "
278 | f"can't be downloaded. This is a limitation of Google Drive "
279 | f"and can only be overcome by trying again later."
280 | )
281 | raise RuntimeError(msg)
282 |
283 | _save_response_content(itertools.chain((first_chunk,), response_content_generator), fpath)
284 | response.close()
285 |
286 |
287 | def _get_confirm_token(response: requests.models.Response) -> Optional[str]:
288 | for key, value in response.cookies.items():
289 | if key.startswith("download_warning"):
290 | return value
291 |
292 | return None
293 |
294 |
295 | def _save_response_content(
296 | response_gen: Iterator[bytes],
297 | destination: str,
298 | ) -> None:
299 | with open(destination, "wb") as f:
300 | pbar = tqdm(total=None)
301 | progress = 0
302 |
303 | for chunk in response_gen:
304 | if chunk: # filter out keep-alive new chunks
305 | f.write(chunk)
306 | progress += len(chunk)
307 | pbar.update(progress - pbar.n)
308 | pbar.close()
309 |
310 |
311 | def _extract_tar(from_path: str, to_path: str, compression: Optional[str]) -> None:
312 | with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar:
313 | tar.extractall(to_path)
314 |
315 |
316 | def _extract_rar(from_path: str, to_path: str, compression: Optional[str]) -> None:
317 | with rarfile.RarFile(from_path, f"r:{compression[1:]}" if compression else "r") as rar:
318 | rar.extractall(to_path)
319 |
320 |
321 | def _extract_7z(from_path: str, to_path: str, compression: Optional[str]) -> None:
322 | with py7zr.SevenZipFile(from_path, f"r:{compression[1:]}" if compression else "r") as z:
323 | z.extractall(to_path)
324 |
325 |
326 | _ZIP_COMPRESSION_MAP: Dict[str, int] = {
327 | ".bz2": zipfile.ZIP_BZIP2,
328 | ".xz": zipfile.ZIP_LZMA,
329 | }
330 |
331 |
332 | def _extract_zip(from_path: str, to_path: str, compression: Optional[str]) -> None:
333 | with zipfile.ZipFile(
334 | from_path, "r", compression=_ZIP_COMPRESSION_MAP[compression] if compression else zipfile.ZIP_STORED
335 | ) as zip:
336 | zip.extractall(to_path)
337 |
338 |
339 | _ARCHIVE_EXTRACTORS: Dict[str, Callable[[str, str, Optional[str]], None]] = {
340 | ".tar": _extract_tar,
341 | ".zip": _extract_zip,
342 | ".rar": _extract_rar,
343 | ".7z": _extract_7z,
344 | }
345 | _COMPRESSED_FILE_OPENERS: Dict[str, Callable[..., IO]] = {
346 | ".bz2": bz2.open,
347 | ".gz": gzip.open,
348 | ".xz": lzma.open,
349 | }
350 | _FILE_TYPE_ALIASES: Dict[str, Tuple[Optional[str], Optional[str]]] = {
351 | ".tbz": (".tar", ".bz2"),
352 | ".tbz2": (".tar", ".bz2"),
353 | ".tgz": (".tar", ".gz"),
354 | }
355 |
356 |
357 | def _detect_file_type(file: str) -> Tuple[str, Optional[str], Optional[str]]:
358 | """Detect the archive type and/or compression of a file.
359 |
360 | Args:
361 | file (str): the filename
362 |
363 | Returns:
364 | (tuple): tuple of suffix, archive type, and compression
365 |
366 | Raises:
367 | RuntimeError: if file has no suffix or suffix is not supported
368 | """
369 | suffixes = pathlib.Path(file).suffixes
370 | if not suffixes:
371 | raise RuntimeError(
372 | f"File '{file}' has no suffixes that could be used to detect the archive type and compression."
373 | )
374 | suffix = suffixes[-1]
375 |
376 | # check if the suffix is a known alias
377 | if suffix in _FILE_TYPE_ALIASES:
378 | return (suffix, *_FILE_TYPE_ALIASES[suffix])
379 |
380 | # check if the suffix is an archive type
381 | if suffix in _ARCHIVE_EXTRACTORS:
382 | return suffix, suffix, None
383 |
384 | # check if the suffix is a compression
385 | if suffix in _COMPRESSED_FILE_OPENERS:
386 | # check for suffix hierarchy
387 | if len(suffixes) > 1:
388 | suffix2 = suffixes[-2]
389 |
390 | # check if the suffix2 is an archive type
391 | if suffix2 in _ARCHIVE_EXTRACTORS:
392 | return suffix2 + suffix, suffix2, suffix
393 |
394 | return suffix, None, suffix
395 |
396 | valid_suffixes = sorted(set(_FILE_TYPE_ALIASES) | set(_ARCHIVE_EXTRACTORS) | set(_COMPRESSED_FILE_OPENERS))
397 | raise RuntimeError(f"Unknown compression or archive type: '{suffix}'.\nKnown suffixes are: '{valid_suffixes}'.")
398 |
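# A few example inputs and the tuples returned above (file names are arbitrary):
#     _detect_file_type("mill.zip")        -> (".zip",    ".zip", None)
#     _detect_file_type("bearings.tar.gz") -> (".tar.gz", ".tar", ".gz")
#     _detect_file_type("runs.tgz")        -> (".tgz",    ".tar", ".gz")
#     _detect_file_type("log.gz")          -> (".gz",     None,   ".gz")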
399 |
400 | def _decompress(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str:
401 | r"""Decompress a file.
402 |
403 | The compression is automatically detected from the file name.
404 |
405 | Args:
406 | from_path (str): Path to the file to be decompressed.
407 | to_path (str): Path to the decompressed file. If omitted, ``from_path`` without compression extension is used.
408 | remove_finished (bool): If ``True``, remove the file after the extraction.
409 |
410 | Returns:
411 | (str): Path to the decompressed file.
412 | """
413 | suffix, archive_type, compression = _detect_file_type(from_path)
414 | if not compression:
415 | raise RuntimeError(f"Couldn't detect a compression from suffix {suffix}.")
416 |
417 | if to_path is None:
418 | to_path = from_path.replace(suffix, archive_type if archive_type is not None else "")
419 |
420 | # We don't need to check for a missing key here, since this was already done in _detect_file_type()
421 | compressed_file_opener = _COMPRESSED_FILE_OPENERS[compression]
422 |
423 | with compressed_file_opener(from_path, "rb") as rfh, open(to_path, "wb") as wfh:
424 | wfh.write(rfh.read())
425 |
426 | if remove_finished:
427 | os.remove(from_path)
428 |
429 | return to_path
430 |
431 |
432 | def extract_archive(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str:
433 | """Extract an archive.
434 |
435 | The archive type and a possible compression is automatically detected from the file name. If the file is compressed
436 | but not an archive the call is dispatched to :func:`_decompress`.
437 |
438 | Args:
439 | from_path (str): Path to the file to be extracted.
440 | to_path (str): Path to the directory the file will be extracted to. If omitted, the directory of the file is
441 | used.
442 | remove_finished (bool): If ``True``, remove the file after the extraction.
443 |
444 | Returns:
445 | (str): Path to the directory the file was extracted to.
446 | """
447 | if to_path is None:
448 | to_path = os.path.dirname(from_path)
449 |
450 | suffix, archive_type, compression = _detect_file_type(from_path)
451 | if not archive_type:
452 | return _decompress(
453 | from_path,
454 | os.path.join(to_path, os.path.basename(from_path).replace(suffix, "")),
455 | remove_finished=remove_finished,
456 | )
457 |
458 | # We don't need to check for a missing key here, since this was already done in _detect_file_type()
459 | extractor = _ARCHIVE_EXTRACTORS[archive_type]
460 |
461 | extractor(from_path, to_path, compression)
462 | if remove_finished:
463 | os.remove(from_path)
464 |
465 | return to_path
466 |
467 |
468 | def download_and_extract_archive(
469 | url: str,
470 | download_root: str,
471 | extract_root: Optional[str] = None,
472 | filename: Optional[str] = None,
473 | md5: Optional[str] = None,
474 | remove_finished: bool = False,
475 | ) -> None:
476 | download_root = os.path.expanduser(download_root)
477 | if extract_root is None:
478 | extract_root = download_root
479 | if not filename:
480 | filename = os.path.basename(url)
481 |
482 | download_url(url, download_root, filename, md5)
483 |
484 | archive = os.path.join(download_root, filename)
485 | print(f"Extracting {archive} to {extract_root}")
486 | extract_archive(archive, extract_root, remove_finished)
487 |
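# This is the helper the dataset classes call; a typical invocation looks
# roughly like the sketch below (the URL is a placeholder, and md5 should be
# the expected hash of the archive if verification is wanted):
#
#     download_and_extract_archive(
#         url="https://example.com/datasets/mill.zip",
#         download_root="data/raw/milling",
#         filename="mill.zip",
#         md5=None,
#         remove_finished=True,
#     )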
488 |
489 | def iterable_to_str(iterable: Iterable) -> str:
490 | return "'" + "', '".join([str(item) for item in iterable]) + "'"
491 |
492 |
493 | T = TypeVar("T", str, bytes)
494 |
495 |
496 | # def verify_str_arg(
497 | # value: T,
498 | # arg: Optional[str] = None,
499 | # valid_values: Iterable[T] = None,
500 | # custom_msg: Optional[str] = None,
501 | # ) -> T:
502 | # if not isinstance(value, torch._six.string_classes):
503 | # if arg is None:
504 | # msg = "Expected type str, but got type {type}."
505 | # else:
506 | # msg = "Expected type str for argument {arg}, but got type {type}."
507 | # msg = msg.format(type=type(value), arg=arg)
508 | # raise ValueError(msg)
509 |
510 | # if valid_values is None:
511 | # return value
512 |
513 | # if value not in valid_values:
514 | # if custom_msg is not None:
515 | # msg = custom_msg
516 | # else:
517 | # msg = "Unknown value '{value}' for argument {arg}. Valid values are {{{valid_values}}}."
518 | # msg = msg.format(value=value, arg=arg, valid_values=iterable_to_str(valid_values))
519 | # raise ValueError(msg)
520 |
521 | # return value
522 |
--------------------------------------------------------------------------------
/notebooks/scratch/ims_download.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "\n",
10 | "from pyphm.datasets.ims import ImsPrepMethodA\n",
11 | "from pathlib import Path\n",
12 | "import pandas as pd\n",
13 | "import os\n",
14 | "import numpy as np\n",
15 | "import time\n",
16 | "import datetime\n",
17 | "import csv\n",
18 | "\n",
19 | "\n",
20 | "%load_ext autoreload\n",
21 | "%autoreload 2"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | "/home/tim/Documents/PyPHM/data/raw\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "# define the location of where the raw data folders will be kept.\n",
39 | "# e.g. the ims data will be in path_data_raw_folder/ims/ \n",
40 | "path_data_raw_folder = Path(Path.cwd().parent.parent / 'data/raw/' )\n",
41 | "print(path_data_raw_folder)\n",
42 | "\n",
43 | "# create the path_data_raw_folder if it does not exist\n",
44 | "path_data_raw_folder.mkdir(parents=True, exist_ok=True)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "Downloading https://drive.google.com/file/d/1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx/view?usp=sharingIMS.7z\n",
57 | "Goolgle drive file id: 1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx\n"
58 | ]
59 | },
60 | {
61 | "name": "stderr",
62 | "output_type": "stream",
63 | "text": [
64 | "Downloading...\n",
65 | "From: https://drive.google.com/uc?id=1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx\n",
66 | "To: /home/tim/Documents/PyPHM/data/raw/ims/IMS.7z\n",
67 | " 49%|████▉ | 532M/1.08G [00:12<00:19, 28.4MB/s] "
68 | ]
69 | },
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "\n"
75 | ]
76 | },
77 | {
78 | "ename": "KeyboardInterrupt",
79 | "evalue": "",
80 | "output_type": "error",
81 | "traceback": [
82 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
83 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
84 | "\u001b[0;32m/tmp/ipykernel_93187/765225230.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# instantiate the ImsPrepMethodA class and download data if it does not exist\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mims\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImsPrepMethodA\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath_data_raw_folder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
85 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, dataset_folder_name, download)\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m ) -> None:\n\u001b[0;32m--> 328\u001b[0;31m super().__init__(\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0mdataset_folder_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
86 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, dataset_folder_name, download, dataset_path, data, sample_freq)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_exists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
87 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Downloading {url}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m download_and_extract_archive(\n\u001b[0m\u001b[1;32m 107\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_root\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmd5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m )\n",
88 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_and_extract_archive\u001b[0;34m(url, download_root, extract_root, filename, md5, remove_finished)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0mfilename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbasename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m \u001b[0mdownload_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_root\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 483\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[0marchive\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdownload_root\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
89 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[0;34m(url, root, filename, md5, max_redirect_hops)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfile_id\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Goolgle drive file id:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mgdown\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfile_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquiet\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0;31m# return download_file_from_google_drive(file_id, root, filename, md5)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
90 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/gdown/download.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(url, output, quiet, proxy, speed, use_cookies, verify, id, fuzzy, resume)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mpbar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit_scale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0mt_start\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miter_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mCHUNK_SIZE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mquiet\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
91 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/requests/models.py\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'stream'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 757\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 758\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 759\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 760\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mProtocolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
92 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mstream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 574\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_fp_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 576\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdecode_content\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 577\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
93 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[0mcache_content\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 519\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfp_closed\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34mb\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 520\u001b[0m if (\n\u001b[1;32m 521\u001b[0m \u001b[0mamt\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
94 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0;31m# Amount is given, implement using readinto\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 458\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 459\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 460\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
95 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
96 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
97 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1239\u001b[0m \u001b[0;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1240\u001b[0m self.__class__)\n\u001b[0;32m-> 1241\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1242\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1243\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
98 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1099\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1100\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
99 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
100 | ]
101 | },
102 | {
103 | "name": "stderr",
104 | "output_type": "stream",
105 | "text": [
106 | " 49%|████▉ | 532M/1.08G [00:29<00:19, 28.4MB/s]"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "# instantiate the ImsPrepMethodA class and download data if it does not exist\n",
112 | "ims = ImsPrepMethodA(root=path_data_raw_folder, download=True)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": []
121 | }
122 | ],
123 | "metadata": {
124 | "kernelspec": {
125 | "display_name": "Python 3.8.12 ('featstore')",
126 | "language": "python",
127 | "name": "python3"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 3
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython3",
139 | "version": "3.8.12"
140 | },
141 | "orig_nbformat": 4,
142 | "vscode": {
143 | "interpreter": {
144 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16"
145 | }
146 | }
147 | },
148 | "nbformat": 4,
149 | "nbformat_minor": 2
150 | }
151 |
--------------------------------------------------------------------------------