├── src └── pyphm │ ├── __init__.py │ └── datasets │ ├── __init__.py │ ├── auxilary_metadata │ ├── __init__.py │ ├── milling_labels_with_tool_class.csv │ └── airbus_dfvalid_groundtruth.csv │ ├── pyphm.py │ ├── airbus.py │ ├── ims.py │ ├── milling.py │ └── utils.py ├── tests ├── integration │ ├── __init__.py │ ├── fixtures │ │ ├── milling │ │ │ ├── mill.mat │ │ │ ├── milling_truncated_results.csv.gz │ │ │ └── milling_labels_with_tool_class_truncated.csv │ │ └── ims │ │ │ ├── ims_truncated_results.csv.gz │ │ │ └── 1st_test │ │ │ ├── 2003.10.22.12.06.24 │ │ │ └── 2003.10.22.12.09.13 │ ├── test_integration_ims.py │ └── test_integration_milling.py └── conftest.py ├── notebooks ├── scratch │ ├── test._mill.ipynb │ ├── test.py │ ├── get_hash.ipynb │ ├── import_package_resources.ipynb │ ├── milling_examp.ipynb │ ├── test.ipynb │ ├── airbus_download.ipynb │ └── ims_download.ipynb └── images │ ├── logo.png │ ├── vae.png │ ├── cut_signals.png │ ├── flank_wear.png │ ├── thresholds.png │ ├── violin_plot.png │ ├── face_milling.png │ ├── simple_trend.png │ ├── latent_space_cnc.png │ ├── trend_spash_image.png │ ├── vae_training_step3.jpg │ ├── vae_training_random_search.png │ ├── prauc_params_cnc.svg │ ├── logo.svg │ └── prauc_cnc.svg ├── .gitattributes ├── requirements.txt ├── setup.py ├── env_pyphm.yml ├── .github └── workflows │ └── main.yml ├── pyproject.toml ├── LICENSE ├── setup.cfg ├── README.md ├── .gitignore └── references └── sources.bib /src/pyphm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pyphm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/scratch/test._mill.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pyphm/datasets/auxilary_metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-documentation 2 | -------------------------------------------------------------------------------- /notebooks/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/logo.png -------------------------------------------------------------------------------- /notebooks/images/vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae.png -------------------------------------------------------------------------------- /notebooks/images/cut_signals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/cut_signals.png -------------------------------------------------------------------------------- /notebooks/images/flank_wear.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/flank_wear.png -------------------------------------------------------------------------------- /notebooks/images/thresholds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/thresholds.png -------------------------------------------------------------------------------- /notebooks/images/violin_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/violin_plot.png -------------------------------------------------------------------------------- /notebooks/images/face_milling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/face_milling.png -------------------------------------------------------------------------------- /notebooks/images/simple_trend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/simple_trend.png -------------------------------------------------------------------------------- /notebooks/images/latent_space_cnc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/latent_space_cnc.png -------------------------------------------------------------------------------- /notebooks/images/trend_spash_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/trend_spash_image.png -------------------------------------------------------------------------------- /notebooks/images/vae_training_step3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae_training_step3.jpg -------------------------------------------------------------------------------- /tests/integration/fixtures/milling/mill.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/milling/mill.mat -------------------------------------------------------------------------------- /notebooks/images/vae_training_random_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae_training_random_search.png -------------------------------------------------------------------------------- /tests/integration/fixtures/ims/ims_truncated_results.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/ims/ims_truncated_results.csv.gz -------------------------------------------------------------------------------- /tests/integration/fixtures/milling/milling_truncated_results.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/milling/milling_truncated_results.csv.gz 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # local package 2 | -e . 3 | 4 | # external requirements 5 | pandas 6 | numpy 7 | py7zr 8 | rarfile 9 | tqdm 10 | scipy 11 | requests 12 | h5py 13 | tables 14 | gdown -------------------------------------------------------------------------------- /tests/integration/fixtures/milling/milling_labels_with_tool_class_truncated.csv: -------------------------------------------------------------------------------- 1 | case,run,VB,time,DOC,feed,material,cut_no,tool_class,window_start,window_end 2 | 1,1,0,2,1.5,0.5,1,0,0,2496,6976 3 | 1,2,,4,1.5,0.5,1,1,0,2496,6976 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == "__main__": 4 | setuptools.setup() 5 | 6 | # from setuptools import setup, find_packages 7 | 8 | # setup( 9 | # name="pyphm", 10 | # version="0.1.0", 11 | # packages=find_packages(), 12 | # ) 13 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dummy conftest.py for pyphm. 3 | 4 | If you don't know what this is for, just leave it empty. 5 | Read more about conftest.py under: 6 | - https://docs.pytest.org/en/stable/fixture.html 7 | - https://docs.pytest.org/en/stable/writing_plugins.html 8 | """ 9 | 10 | # import pytest 11 | -------------------------------------------------------------------------------- /env_pyphm.yml: -------------------------------------------------------------------------------- 1 | name: pyphm 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - conda 7 | - mamba 8 | - jupyterlab 9 | - ipywidgets 10 | - scipy 11 | - matplotlib 12 | - seaborn 13 | - pandas 14 | - scikit-learn 15 | - py7zr 16 | - rarfile 17 | - pytables 18 | - requests 19 | - gdown=4.6.0 20 | 21 | 22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI/CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | strategy: 16 | matrix: 17 | python-version: [3.7, 3.8, 3.9] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - name: Set up Python all python versions 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | architecture: x64 27 | 28 | - name: Install dependencies 29 | run: pip install -r requirements.txt 30 | 31 | - name: Run Tests 32 | run: python -m unittest discover -s tests -------------------------------------------------------------------------------- /src/pyphm/datasets/pyphm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pathlib import Path 4 | from typing import Any, Callable, List, Optional, Tuple 5 | from .utils import download_and_extract_archive, extract_archive, check_integrity 6 | 7 | 8 | class PHMDataset: 9 | """ 10 | Base class for making PyPHM data sets. 11 | 12 | Args: 13 | root (string): Root directory to place all the data sets. 
14 | 15 | dataset_folder_name (string): Name of folder containing raw data. 16 | This folder will be created in the root directory if not present. 17 | 18 | """ 19 | 20 | def __init__( 21 | self, 22 | root: Path, 23 | dataset_folder_name: str, 24 | ) -> None: 25 | 26 | self.root = Path(root) 27 | self.dataset_folder_name = dataset_folder_name 28 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pyphm" 3 | version = "0.0.5" 4 | description = "Machinery data, made easy" 5 | requires-python = ">=3.6" 6 | readme = "README.md" 7 | authors = [ 8 | { name = "Tim von Hahn", email = "t.vonhahn@queensu.ca" }, 9 | ] 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "License :: OSI Approved :: MIT License", 13 | "Operating System :: OS Independent", 14 | ] 15 | dependencies = [ 16 | "pandas", 17 | "numpy", 18 | "py7zr", 19 | "rarfile", 20 | "tqdm", 21 | "scipy", 22 | "requests", 23 | "h5py", 24 | "tables", 25 | "gdown" 26 | ] 27 | 28 | [project.urls] 29 | Homepage = "https://github.com/tvhahn/PyPHM" 30 | Repository = "https://github.com/tvhahn/PyPHM" 31 | Documentation = "https://github.com/tvhahn/PyPHM" 32 | 33 | [project.optional-dependencies] 34 | doc = ["sphinx~=4.4.0", "myst-parser"] 35 | -------------------------------------------------------------------------------- /notebooks/scratch/test.py: -------------------------------------------------------------------------------- 1 | resources = [ 2 | { 3 | "name": "aws", 4 | "url": "https://phm-datasets.s3.amazonaws.com/NASA/", 5 | "files": [ 6 | { 7 | "filename": "3.+Milling.zip", 8 | "md5": "4da3afb0aa50cb3dcdd8e20ed1ed1c7c", 9 | }, 10 | { 11 | "filename": "another_file.zip", 12 | "md5": "some_other_md5_checksum", 13 | }, 14 | ], 15 | }, 16 | { 17 | "name": "google_drive", 18 | "url": "https://drive.google.com/file/d/1_4Hm8RO_7Av1LzGtFnhx6cIN-zi-W40j/view?usp=sharing", 19 | "files": [ 20 | { 21 | "filename": "mill.zip", 22 | "md5": "81d821fdef812183a7d38b6f83f7cefa", 23 | }, 24 | { 25 | "filename": "another_file.zip", 26 | "md5": "some_other_md5_checksum", 27 | }, 28 | ], 29 | }, 30 | # Additional sources can be added here in the same format. 31 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | Copyright (c) 2022, Tim von Hahn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pyphm 3 | version = 0.0.5 4 | author = Tim von Hahn 5 | author_email = t.vonhahn@queensu.ca 6 | description = Machinery data, made easy 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/tvhahn/PyPHM 10 | project_urls = 11 | Bug Tracker = https://github.com/tvhahn/PyPHM/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.7 22 | include_package_data = True 23 | 24 | # Add here dependencies of your project (semicolon/line-separated) 25 | install_requires = 26 | pandas 27 | wheel 28 | scipy 29 | numpy 30 | py7zr 31 | rarfile 32 | tqdm 33 | requests 34 | versioned-hdf5 35 | h5py 36 | tables 37 | gdown 38 | 39 | [options.package_data] 40 | * = *.csv, *.mat 41 | 42 | [options.packages.find] 43 | where = src 44 | 45 | 46 | -------------------------------------------------------------------------------- /tests/integration/fixtures/ims/1st_test/2003.10.22.12.06.24: -------------------------------------------------------------------------------- 1 | -0.022 -0.039 -0.183 -0.054 -0.105 -0.134 -0.129 -0.142 2 | -0.105 -0.017 -0.164 -0.183 -0.049 0.029 -0.115 -0.122 3 | -0.183 -0.098 -0.195 -0.125 -0.005 -0.007 -0.171 -0.071 4 | -0.178 -0.161 -0.159 -0.178 -0.100 -0.115 -0.112 -0.078 5 | -0.208 -0.129 -0.261 -0.098 -0.151 -0.205 -0.063 -0.066 6 | -0.232 -0.061 -0.281 -0.125 0.046 -0.088 -0.078 -0.078 7 | -0.112 -0.132 -0.181 -0.186 -0.132 -0.051 -0.132 -0.076 8 | -0.054 -0.107 -0.173 -0.134 -0.164 0.002 -0.146 -0.125 9 | -0.159 -0.032 -0.161 -0.181 -0.110 -0.044 -0.173 -0.137 10 | -0.225 -0.044 -0.090 -0.159 -0.100 -0.151 -0.139 -0.076 11 | -0.093 -0.117 -0.039 -0.161 -0.132 -0.161 -0.090 -0.098 12 | -0.002 -0.161 -0.042 -0.054 -0.095 -0.232 -0.137 -0.042 13 | 0.000 -0.117 -0.081 -0.088 -0.142 -0.183 -0.117 -0.171 14 | -0.154 -0.142 -0.027 -0.093 -0.183 -0.251 -0.095 -0.083 15 | -0.129 -0.068 0.083 -0.071 -0.129 -0.117 -0.183 -0.071 16 | -0.015 -0.049 0.044 -0.088 -0.188 -0.081 -0.183 -0.020 17 | -0.015 -0.046 0.005 -0.061 -0.049 -0.098 -0.139 -0.085 18 | -0.090 -0.105 0.020 -0.012 -0.181 -0.186 -0.107 -0.037 19 | -0.088 -0.012 0.037 -0.093 -0.078 -0.105 -0.134 -0.039 20 | -0.127 -0.081 -0.051 -0.073 -0.100 -0.105 -0.115 -0.051 21 | -------------------------------------------------------------------------------- /tests/integration/fixtures/ims/1st_test/2003.10.22.12.09.13: -------------------------------------------------------------------------------- 1 | -0.117 -0.076 -0.127 -0.144 -0.083 -0.002 -0.098 -0.051 2 | -0.132 -0.068 -0.117 -0.083 -0.132 -0.076 -0.117 -0.085 3 | -0.186 -0.120 -0.217 -0.212 -0.081 -0.112 -0.132 -0.054 4 | -0.098 -0.125 -0.117 -0.093 -0.022 -0.112 -0.090 -0.164 5 | -0.137 -0.120 -0.188 -0.142 -0.129 -0.046 -0.098 -0.129 6 | -0.103 -0.078 -0.127 -0.156 -0.110 -0.061 -0.061 -0.129 7 | -0.120 -0.046 -0.085 -0.056 -0.149 -0.042 -0.103 -0.039 8 | -0.110 -0.068 
-0.076 -0.078 -0.168 -0.134 -0.146 -0.168 9 | -0.088 -0.110 -0.022 -0.044 -0.225 -0.083 -0.100 -0.044 10 | -0.120 -0.073 -0.034 -0.076 -0.217 -0.073 -0.107 -0.088 11 | -0.159 -0.129 0.034 -0.022 -0.090 -0.139 -0.107 -0.049 12 | -0.073 -0.090 -0.032 -0.044 -0.076 -0.132 -0.134 -0.049 13 | -0.105 -0.122 -0.073 0.015 -0.078 -0.107 -0.195 -0.027 14 | -0.139 -0.056 0.000 -0.154 -0.068 -0.146 -0.193 0.032 15 | -0.129 -0.095 -0.012 -0.078 0.034 -0.127 -0.110 0.046 16 | -0.134 -0.159 -0.139 -0.210 -0.112 -0.107 -0.112 -0.005 17 | -0.071 -0.129 -0.134 -0.024 -0.156 -0.042 -0.132 -0.049 18 | -0.183 -0.093 -0.090 -0.112 -0.054 -0.088 -0.127 -0.127 19 | -0.278 -0.010 -0.007 -0.007 0.066 -0.103 -0.078 -0.071 20 | -0.154 -0.046 -0.198 -0.129 -0.078 -0.046 -0.093 -0.051 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![PyPHM Logo](./notebooks/images/logo.png) 2 | 3 | # Machinery data, made easy 4 | ![example workflow](https://github.com/tvhahn/PyPHM/actions/workflows/main.yml/badge.svg) [![arXiv](https://img.shields.io/badge/arXiv-2205.15489-b31b1b.svg)](https://arxiv.org/abs/2205.15489) 5 | 6 | 7 | Datasets specific to PHM (prognostics and health management). Use Python to easily download and prepare the data, before feature engineering or model training. 8 | 9 | Current datasets: 10 | - **UC-Berkeley Milling Dataset**: [example notebook](https://github.com/tvhahn/PyPHM/blob/master/notebooks/milling_example.ipynb) ([open in Colab](https://colab.research.google.com/github/tvhahn/PyPHM/blob/master/notebooks/milling_example.ipynb)); [dataset source](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#milling) 11 | - **IMS Bearing Dataset**: [dataset source](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#bearing) 12 | - **Airbus Helicopter Accelerometer Dataset**: [dataset source](https://www.research-collection.ethz.ch/handle/20.500.11850/415151) 13 | - More coming soon! 14 | 15 | 16 | ## Alpha Notice 17 | PyPHM is in active development. Expect considerable changes in the near future. 18 | 19 | Our goals are to create: 20 | 21 | * A package that implements **common data preprocessing methods** used by others. 22 | * A package with a **coherent and thoughtful API**. 23 | * Thorough **documentation**, with plenty of **examples**. 24 | * A package that is well **tested**, with the use of **type hints**. 25 | * A package built with **continuous integration and continuous deployment**. 26 | 27 | 28 | ## Installation 29 | Install with pip: `pip install pyphm` 30 | 31 | Install from github repository: clone with git `clone https://github.com/tvhahn/PyPHM.git`. Then run `python -m pip install -e .` to install the package on your local machine. 
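Once installed, a dataset can be downloaded and windowed in a few lines. Below is a minimal sketch adapted from `notebooks/scratch/milling_examp.ipynb` and `tests/integration/test_integration_milling.py`. The argument names follow the integration test (the scratch notebook uses `window_size` where the test uses `window_len`), and it assumes that `download=True` fetches the raw data and that the labels csv bundled with the package is used when `path_csv_labels` is not given — expect details to shift while the package is in alpha:

```python
from pathlib import Path
from pyphm.datasets.milling import MillingPrepMethodA

root = Path("./data")  # raw data is placed under this folder

# Prepare the UC Berkeley milling dataset with 64-sample windows and a stride of 64
# (set download=False if the raw mill.mat archive is already in place).
mill = MillingPrepMethodA(root, window_len=64, stride=64, cut_drop_list=[], download=True)

# Flat dataframe of windowed signals with labels (cut_id, cut_no, case, time,
# the six signal channels, and tool_class) ...
df = mill.create_xy_dataframe()

# ... or numpy arrays: x shaped (n_windows, window_len, 6 signals) and
# y shaped (n_windows, window_len, 3 label columns), as in the example notebook.
x, y = mill.create_xy_arrays()
```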
32 | 33 | Run tests: `python -m unittest discover -s tests` 34 | 35 | -------------------------------------------------------------------------------- /tests/integration/test_integration_ims.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from pathlib import Path 4 | import pandas as pd 5 | from pandas.testing import assert_frame_equal 6 | from pyphm.datasets.ims import ImsDataLoad 7 | 8 | 9 | class TestIms(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | pass 14 | 15 | def setUp(self): 16 | # path to mill_truncated.mat 17 | self.root = ( 18 | Path(__file__).parent / "fixtures" 19 | ) 20 | 21 | # path to ims_truncated_results.csv.gz 22 | self.results_path = ( 23 | self.root / "ims/ims_truncated_results.csv.gz" 24 | ) 25 | 26 | def tearDown(self): 27 | pass 28 | 29 | def test_milling_data_prep(self): 30 | """Test that the milling data prep works as expected.""" 31 | 32 | # load the data and instantiate the data prep class 33 | ims = ImsDataLoad(self.root, download=False) 34 | 35 | # create the results dataframe 36 | df = ims.load_run_as_df(1, n_jobs=1) 37 | 38 | # load the ground truth results dataframe 39 | col_names_ordered = ["id", "run", "file", "time_step"] + ims.col_1st_names 40 | 41 | col_dtype = [ 42 | str, 43 | int, 44 | str, 45 | np.float32, 46 | np.float32, 47 | np.float32, 48 | np.float32, 49 | np.float32, 50 | np.float32, 51 | np.float32, 52 | np.float32, 53 | np.float32, 54 | ] 55 | 56 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype)) 57 | 58 | # load the ground truth results dataframe 59 | df_gt = pd.read_csv( 60 | self.results_path, 61 | compression="gzip", 62 | ).astype(col_dtype_dict) 63 | 64 | # compare the results 65 | assert_frame_equal(df, df_gt) 66 | 67 | 68 | if __name__ == "__main__": 69 | 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /notebooks/scratch/get_hash.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "The autoreload extension is already loaded. 
To reload it, use:\n", 13 | " %reload_ext autoreload\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import hashlib\n", 19 | "from pathlib import Path\n", 20 | "import pandas as pd\n", 21 | "from pyphm.datasets.utils import calculate_md5, check_md5\n", 22 | "\n", 23 | "\n", 24 | "%load_ext autoreload\n", 25 | "%autoreload 2" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 7, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "/home/tim/Documents/PyPHM\n", 38 | "/home/tim/Documents/PyPHM/data/raw/milling\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "root_dir = Path.cwd().parent.parent\n", 44 | "print(root_dir)\n", 45 | "path_data_raw_folder = Path(root_dir / 'data/raw/milling/' )\n", 46 | "print(path_data_raw_folder)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "4da3afb0aa50cb3dcdd8e20ed1ed1c7c\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "hash_md5 = calculate_md5(path_data_raw_folder / \"3.+Milling.zip\")\n", 64 | "print(hash_md5)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "interpreter": { 77 | "hash": "bb5c389ed065b0664b086eb1393fdb5729447cbf21b18fded646434c15c951b5" 78 | }, 79 | "kernelspec": { 80 | "display_name": "Python 3.8.12 ('featstore')", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.11.7" 95 | }, 96 | "orig_nbformat": 4 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 2 100 | } 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # folders 132 | data/ 133 | -------------------------------------------------------------------------------- /tests/integration/test_integration_milling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from pathlib import Path 4 | import pandas as pd 5 | from pandas.testing import assert_frame_equal 6 | from pyphm.datasets.milling import MillingPrepMethodA 7 | 8 | 9 | class TestMilling(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | pass 14 | 15 | 16 | def setUp(self): 17 | # path to mill_truncated.mat 18 | self.root = ( 19 | Path(__file__).parent / "fixtures" 20 | ) 21 | 22 | # path to milling_labels_with_tool_class_truncated.csv 23 | self.labels_path = ( 24 | self.root 25 | / "milling/milling_labels_with_tool_class_truncated.csv" 26 | ) 27 | 28 | # path to milling_truncated_results.csv.gz 29 | self.results_path = ( 30 | self.root / "milling/milling_truncated_results.csv.gz" 31 | ) 32 | 33 | def test_load_run_as_df(self): 34 | """Test the loading of an individual run as a dataframe.""" 35 | 36 | # load the data and instantiate the data prep class 37 | mill = MillingPrepMethodA( 38 | self.root, 39 | window_len=64, 40 | stride=64, 41 | cut_drop_list=[], 42 | path_csv_labels=self.labels_path, 43 | download=False, 44 | ) 45 | 46 | # create the results dataframe 47 | df = mill.create_xy_dataframe() 48 | 49 | # load the ground truth results dataframe 50 | col_names_ordered = [ 51 | "cut_id", 52 | "cut_no", 53 | "case", 54 | "time", 55 | "ae_spindle", 56 | "ae_table", 57 | "vib_spindle", 58 | "vib_table", 59 | "smcdc", 60 | "smcac", 61 | "tool_class", 62 | ] 63 | 64 | col_dtype = [ 65 | str, 66 | 
int, 67 | int, 68 | np.float32, 69 | np.float32, 70 | np.float32, 71 | np.float32, 72 | np.float32, 73 | np.float32, 74 | np.float32, 75 | int, 76 | ] 77 | 78 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype)) 79 | 80 | # load the ground truth results dataframe 81 | df_gt = pd.read_csv( 82 | self.results_path, 83 | compression="gzip", 84 | ).astype(col_dtype_dict) 85 | 86 | # compare the results 87 | assert_frame_equal(df, df_gt) 88 | 89 | 90 | if __name__ == "__main__": 91 | 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /notebooks/images/prauc_params_cnc.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | image/svg+xml5 4 | 2 5 | 4 6 | 6 7 | 3 8 | all 9 | Sub-Cut Number 10 | 0.406 11 | 0.358 12 | 0.292 13 | 0.262 14 | 0.233 15 | 0.149 16 | PR-AUC Score in the Latent Space for the Most Common Manufactured Part 17 | 18 | -------------------------------------------------------------------------------- /notebooks/images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 22 | 24 | 46 | 48 | 49 | 51 | image/svg+xml 52 | 54 | 55 | 56 | 57 | 58 | 63 | 68 | PyPHM 81 | 93 | 94 | -------------------------------------------------------------------------------- /notebooks/scratch/import_package_resources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pkg_resources\n", 10 | "from pathlib import Path\n", 11 | "import pandas as pd\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 7, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "meta_data_path = Path(pkg_resources.resource_filename('pyphm', 'datasets/auxilary_metadata/'))" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 8, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "WindowsPath('C:/Users/Tim/Anaconda3/envs/featstore/lib/site-packages/pyphm/datasets/auxilary_metadata')" 32 | ] 33 | }, 34 | "execution_count": 8, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "meta_data_path" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 10, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
caserunVBtimeDOCfeedmaterialcut_notool_classwindow_startwindow_end
0110.0021.50.510024966976
112NaN41.50.511024966976
213NaN61.50.512024966976
3140.1171.50.513024966976
415NaN111.50.514024966976
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " case run VB time DOC feed material cut_no tool_class \\\n", 159 | "0 1 1 0.00 2 1.5 0.5 1 0 0 \n", 160 | "1 1 2 NaN 4 1.5 0.5 1 1 0 \n", 161 | "2 1 3 NaN 6 1.5 0.5 1 2 0 \n", 162 | "3 1 4 0.11 7 1.5 0.5 1 3 0 \n", 163 | "4 1 5 NaN 11 1.5 0.5 1 4 0 \n", 164 | "\n", 165 | " window_start window_end \n", 166 | "0 2496 6976 \n", 167 | "1 2496 6976 \n", 168 | "2 2496 6976 \n", 169 | "3 2496 6976 \n", 170 | "4 2496 6976 " 171 | ] 172 | }, 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "df = pd.read_csv(meta_data_path / 'milling_labels_with_tool_class.csv')\n", 180 | "df.head()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "interpreter": { 193 | "hash": "bb5c389ed065b0664b086eb1393fdb5729447cbf21b18fded646434c15c951b5" 194 | }, 195 | "kernelspec": { 196 | "display_name": "Python 3.8.12 ('featstore')", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.8.12" 211 | }, 212 | "orig_nbformat": 4 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 2 216 | } 217 | -------------------------------------------------------------------------------- /src/pyphm/datasets/auxilary_metadata/milling_labels_with_tool_class.csv: -------------------------------------------------------------------------------- 1 | case,run,VB,time,DOC,feed,material,cut_no,tool_class,window_start,window_end 2 | 1,1,0,2,1.5,0.5,1,0,0,2496,6976 3 | 1,2,,4,1.5,0.5,1,1,0,2496,6976 4 | 1,3,,6,1.5,0.5,1,2,0,2496,6976 5 | 1,4,0.11,7,1.5,0.5,1,3,0,2496,6976 6 | 1,5,,11,1.5,0.5,1,4,0,2496,6976 7 | 1,6,0.2,15,1.5,0.5,1,5,1,2496,6976 8 | 1,7,0.24,19,1.5,0.5,1,6,1,2496,6976 9 | 1,8,0.29,22,1.5,0.5,1,7,1,2496,6976 10 | 1,9,0.28,26,1.5,0.5,1,8,1,2496,6976 11 | 1,10,0.29,29,1.5,0.5,1,9,1,2496,6976 12 | 1,11,0.38,32,1.5,0.5,1,10,1,2496,6976 13 | 1,12,0.4,35,1.5,0.5,1,11,1,2496,6976 14 | 1,13,0.43,38,1.5,0.5,1,12,1,2496,6976 15 | 1,14,0.45,41,1.5,0.5,1,13,1,2496,6976 16 | 1,15,0.5,44,1.5,0.5,1,14,1,2496,6976 17 | 1,16,,46,1.5,0.5,1,15,1,2496,6976 18 | 1,17,0.44,48,1.5,0.5,1,16,1,2496,6976 19 | 2,1,0.08,3,0.75,0.5,1,17,0,64,128 20 | 2,2,0.14,9,0.75,0.5,1,18,0,2496,6976 21 | 2,3,0.14,12,0.75,0.5,1,19,0,2496,6976 22 | 2,4,0.14,15,0.75,0.5,1,20,0,2496,6976 23 | 2,5,0.15,22,0.75,0.5,1,21,0,4224,6976 24 | 2,6,,24,0.75,0.5,1,22,0,5056,6976 25 | 2,7,0.18,27,0.75,0.5,1,23,0,2496,6976 26 | 2,8,0.22,33,0.75,0.5,1,24,1,2496,6976 27 | 2,9,0.26,39,0.75,0.5,1,25,1,2496,6976 28 | 2,10,0.31,45,0.75,0.5,1,26,1,3520,8000 29 | 2,11,0.38,51,0.75,0.5,1,27,1,2496,6976 30 | 2,12,0.43,59,0.75,0.5,1,28,1,2496,6976 31 | 2,13,0.48,66,0.75,0.5,1,29,1,2496,6976 32 | 2,14,0.55,72,0.75,0.5,1,30,1,3520,8000 33 | 3,1,0,0,0.75,0.25,1,31,0,4480,8960 34 | 3,2,0.13,3,0.75,0.25,1,32,0,4480,8960 35 | 3,3,0.13,9,0.75,0.25,1,33,0,4480,8960 36 | 3,5,0.17,21,0.75,0.25,1,34,0,4480,8960 37 | 3,6,0.19,27,0.75,0.25,1,35,0,3520,8960 38 | 3,7,0.2,33,0.75,0.25,1,36,1,3520,8960 39 | 3,8,0.23,39,0.75,0.25,1,37,1,3520,8960 40 | 3,9,0.23,45,0.75,0.25,1,38,1,4480,8960 41 | 3,10,0.26,51,0.75,0.25,1,39,1,3520,8960 42 | 
3,11,0.28,57,0.75,0.25,1,40,1,4160,8960 43 | 3,12,0.33,63,0.75,0.25,1,41,1,4160,8960 44 | 3,14,0.36,69,0.75,0.25,1,42,1,4480,8960 45 | 3,15,0.44,75,0.75,0.25,1,43,1,4480,8960 46 | 3,16,0.55,81,0.75,0.25,1,44,1,4480,8960 47 | 4,1,0.08,3,1.5,0.25,1,45,0,4480,8960 48 | 4,2,0.13,9,1.5,0.25,1,46,0,4480,8960 49 | 4,3,0.2,15,1.5,0.25,1,47,1,4160,8960 50 | 4,4,0.31,21,1.5,0.25,1,48,1,4160,8960 51 | 4,5,0.35,27,1.5,0.25,1,49,1,4160,8960 52 | 4,6,0.4,34,1.5,0.25,1,50,1,4160,8960 53 | 4,7,0.49,39,1.5,0.25,1,51,1,4160,8960 54 | 9,1,0,1,1.5,0.5,1,52,0,2112,6720 55 | 9,2,0.1,3,1.5,0.5,1,53,0,2112,6720 56 | 9,3,0.14,9,1.5,0.5,1,54,0,2112,6464 57 | 9,4,0.19,16,1.5,0.5,1,55,0,2496,6720 58 | 9,5,0.27,22,1.5,0.5,1,56,1,2496,6720 59 | 9,6,0.38,28,1.5,0.5,1,57,1,2496,6720 60 | 9,7,0.47,34,1.5,0.5,1,58,1,2496,6720 61 | 9,8,0.64,40,1.5,0.5,1,59,1,2496,6720 62 | 9,9,0.81,46,1.5,0.5,1,60,2,2112,6464 63 | 10,1,0,0,1.5,0.25,1,61,0,4480,8960 64 | 10,2,0.04,4,1.5,0.25,1,62,0,4480,8960 65 | 10,3,0.08,9,1.5,0.25,1,63,0,4480,8960 66 | 10,4,0.16,15,1.5,0.25,1,64,0,4160,8960 67 | 10,5,0.25,21,1.5,0.25,1,65,1,4160,8960 68 | 10,6,0.36,27,1.5,0.25,1,66,1,4160,8960 69 | 10,7,0.43,33,1.5,0.25,1,67,1,4160,8960 70 | 10,8,0.47,39,1.5,0.25,1,68,1,4160,8960 71 | 10,9,0.53,45,1.5,0.25,1,69,1,4160,8960 72 | 10,10,0.7,57,1.5,0.25,1,70,2,5056,8960 73 | 11,1,0,1,0.75,0.25,1,71,0,4160,8960 74 | 11,2,0.04,3,0.75,0.25,1,72,0,4160,8960 75 | 11,3,0.07,10,0.75,0.25,1,73,0,4160,8960 76 | 11,4,0.07,12,0.75,0.25,1,74,0,4160,8960 77 | 11,5,0.08,14,0.75,0.25,1,75,0,4160,8960 78 | 11,6,0.09,17,0.75,0.25,1,76,0,4160,8960 79 | 11,7,,19,0.75,0.25,1,77,0,4160,8960 80 | 11,8,0.12,21,0.75,0.25,1,78,0,4160,8960 81 | 11,9,0.16,27,0.75,0.25,1,79,0,4160,8960 82 | 11,10,0.18,33,0.75,0.25,1,80,0,4160,8960 83 | 11,11,0.2,39,0.75,0.25,1,81,1,4160,8960 84 | 11,12,0.23,45,0.75,0.25,1,82,1,4160,8960 85 | 11,13,0.26,51,0.75,0.25,1,83,1,4160,8960 86 | 11,14,,54,0.75,0.25,1,84,1,4160,8960 87 | 11,15,0.31,57,0.75,0.25,1,85,1,4160,8960 88 | 11,16,0.37,63,0.75,0.25,1,86,1,4160,8960 89 | 11,17,,67,0.75,0.25,1,87,1,4160,8960 90 | 11,18,0.42,72,0.75,0.25,1,88,1,4160,8960 91 | 11,19,0.47,80,0.75,0.25,1,89,1,4160,8960 92 | 11,20,0.57,86,0.75,0.25,1,90,1,4160,8960 93 | 11,21,0.65,93,0.75,0.25,1,91,1,4160,8960 94 | 11,22,0.68,100,0.75,0.25,1,92,1,4160,8960 95 | 11,23,0.76,105,0.75,0.25,1,93,2,4160,8960 96 | 12,1,,1,0.75,0.5,1,94,0,64,128 97 | 12,2,0.05,3,0.75,0.5,1,95,0,2496,6720 98 | 12,3,0.08,6,0.75,0.5,1,96,0,2496,6464 99 | 12,4,,12,0.75,0.5,1,97,0,2496,6464 100 | 12,5,0.12,19,0.75,0.5,1,98,0,3008,6464 101 | 12,6,0.17,24,0.75,0.5,1,99,0,2496,6720 102 | 12,7,0.2,30,0.75,0.5,1,100,1,2496,6720 103 | 12,8,0.24,36,0.75,0.5,1,101,1,2496,6720 104 | 12,9,0.32,42,0.75,0.5,1,102,1,2496,6720 105 | 12,10,,45,0.75,0.5,1,103,1,2496,6720 106 | 12,11,0.4,49,0.75,0.5,1,104,1,2496,6464 107 | 12,12,0.45,55,0.75,0.5,1,105,1,2496,3904 108 | 12,13,0.49,61,0.75,0.5,1,106,1,2496,6720 109 | 12,14,0.58,67,0.75,0.5,1,107,1,2496,6720 110 | 12,15,0.65,74,0.75,0.5,1,108,1,2496,6464 111 | 5,1,0,0,1.5,0.5,2,109,0,2496,6464 112 | 5,2,0.16,3,1.5,0.5,2,110,0,2496,6720 113 | 5,3,0.29,6,1.5,0.5,2,111,1,2496,6976 114 | 5,4,0.44,9,1.5,0.5,2,112,1,2496,6976 115 | 5,5,0.53,12,1.5,0.5,2,113,1,2496,6976 116 | 5,6,0.74,15,1.5,0.5,2,114,2,2496,6720 117 | 6,1,0,0,1.5,0.25,2,115,0,4160,8960 118 | 7,1,0,1,0.75,0.25,2,116,0,4160,8960 119 | 7,2,0.09,3,0.75,0.25,2,117,0,4160,8960 120 | 7,3,0.13,6,0.75,0.25,2,118,0,4160,8960 121 | 7,4,0.22,10,0.75,0.25,2,119,1,4160,8960 122 | 7,5,0.24,13,0.75,0.25,2,120,1,4480,8960 123 | 
7,6,0.34,15,0.75,0.25,2,121,1,4160,8960 124 | 7,7,0.46,19,0.75,0.25,2,122,1,4160,8960 125 | 7,8,,21,0.75,0.25,2,123,1,4480,8960 126 | 8,1,0,0,0.75,0.5,2,124,0,2496,6720 127 | 8,2,0.18,3,0.75,0.5,2,125,0,2496,6720 128 | 8,3,0.3,6,0.75,0.5,2,126,1,2496,6720 129 | 8,4,,8,0.75,0.5,2,127,1,2496,6720 130 | 8,5,0.44,9,0.75,0.5,2,128,1,2496,6720 131 | 8,6,0.62,12,0.75,0.5,2,129,1,2496,6720 132 | 13,1,,1,0.75,0.25,2,130,0,4480,8960 133 | 13,2,,2,0.75,0.25,2,131,0,4480,8960 134 | 13,3,0.1,4,0.75,0.25,2,132,0,4480,8960 135 | 13,4,0.13,7,0.75,0.25,2,133,0,4480,8960 136 | 13,5,0.17,11,0.75,0.25,2,134,0,4160,8960 137 | 13,6,0.32,16,0.75,0.25,2,135,1,4160,8960 138 | 13,7,0.38,19,0.75,0.25,2,136,1,4160,8960 139 | 13,8,0.49,22,0.75,0.25,2,137,1,4160,8960 140 | 13,9,0.56,25,0.75,0.25,2,138,1,4160,8960 141 | 13,10,0.68,29,0.75,0.25,2,139,1,4160,8960 142 | 13,11,0.83,32,0.75,0.25,2,140,2,4160,8960 143 | 13,12,0.92,35,0.75,0.25,2,141,2,4160,8960 144 | 13,13,1.07,38,0.75,0.25,2,142,2,4160,8960 145 | 13,14,1.3,42,0.75,0.25,2,143,2,4160,8960 146 | 13,15,1.53,45,0.75,0.25,2,144,2,4160,8960 147 | 14,1,,1,0.75,0.5,2,145,0,2496,6976 148 | 14,2,0.09,3,0.75,0.5,2,146,0,2496,6464 149 | 14,3,0.17,6,0.75,0.5,2,147,0,2496,6720 150 | 14,4,0.24,9,0.75,0.5,2,148,1,2496,6720 151 | 14,5,,11,0.75,0.5,2,149,1,2496,6720 152 | 14,6,0.35,12,0.75,0.5,2,150,1,2496,6464 153 | 14,8,0.6,18,0.75,0.5,2,151,1,2496,6464 154 | 14,9,0.81,21,0.75,0.5,2,152,2,2496,6464 155 | 14,10,1.14,24,0.75,0.5,2,153,2,2496,6464 156 | 15,1,,1,1.5,0.25,2,154,0,4160,8960 157 | 15,2,0.15,3,1.5,0.25,2,155,0,4160,8960 158 | 15,3,0.28,6,1.5,0.25,2,156,1,4160,8960 159 | 15,4,0.37,9,1.5,0.25,2,157,1,4160,8960 160 | 15,5,0.48,13,1.5,0.25,2,158,1,4160,8960 161 | 15,6,0.56,16,1.5,0.25,2,159,1,4480,8960 162 | 15,7,0.7,19,1.5,0.25,2,160,2,4160,8960 163 | 16,1,,1,1.5,0.5,2,161,0,2496,6720 164 | 16,2,,2,1.5,0.5,2,162,0,2496,6720 165 | 16,3,0.24,3,1.5,0.5,2,163,1,2496,6720 166 | 16,4,,4,1.5,0.5,2,164,1,2496,6720 167 | 16,5,0.4,6,1.5,0.5,2,165,1,2496,6720 168 | 16,6,0.62,9,1.5,0.5,2,166,1,2496,6720 169 | -------------------------------------------------------------------------------- /notebooks/scratch/milling_examp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyphm.datasets.milling import MillingDataLoad, MillingPrepMethodA\n", 10 | "import pandas as pd\n", 11 | "from pathlib import Path\n", 12 | "\n", 13 | "%load_ext autoreload\n", 14 | "%autoreload 2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "root_dir: /home/tim/Documents/PyPHM\n", 27 | "path_data_raw_folder: /home/tim/Documents/PyPHM/data\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "root_dir = Path.cwd().parent\n", 33 | "print('root_dir: ', root_dir)\n", 34 | "path_data_raw_folder = Path(root_dir / 'data' )\n", 35 | "print('path_data_raw_folder: ', path_data_raw_folder)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "mill = MillingPrepMethodA(root=path_data_raw_folder, dataset_folder_name='milling', window_size=64, stride=64, cut_drop_list=[17, 94], download=False)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | 
"data": { 54 | "text/plain": [ 55 | "PosixPath('/home/tim/Documents/PyPHM/data/milling')" 56 | ] 57 | }, 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "mill.dataset_folder_path" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "x.shape (11570, 64, 6)\n", 77 | "y.shape (11570, 64, 3)\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "x, y = mill.create_xy_arrays()\n", 83 | "print(\"x.shape\", x.shape)\n", 84 | "print(\"y.shape\", y.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/html": [ 95 | "
\n", 96 | "\n", 109 | "\n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
cut_idcut_nocasetimeae_spindleae_tablevib_spindlevib_tablesmcdcsmcactool_class
00_0010.0000.2197270.2728270.7336432.1166996.8408200.1245120
10_0010.0040.2465820.3222660.7788092.2778326.660156-0.5615230
20_0010.0080.2941890.2838130.7580572.3437506.508789-2.0996090
30_0010.0120.3234860.2600100.7263182.4487306.542969-2.7319340
40_0010.0160.2905270.2532960.6530762.5463876.621094-3.5058590
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " cut_id cut_no case time ae_spindle ae_table vib_spindle vib_table \\\n", 203 | "0 0_0 0 1 0.000 0.219727 0.272827 0.733643 2.116699 \n", 204 | "1 0_0 0 1 0.004 0.246582 0.322266 0.778809 2.277832 \n", 205 | "2 0_0 0 1 0.008 0.294189 0.283813 0.758057 2.343750 \n", 206 | "3 0_0 0 1 0.012 0.323486 0.260010 0.726318 2.448730 \n", 207 | "4 0_0 0 1 0.016 0.290527 0.253296 0.653076 2.546387 \n", 208 | "\n", 209 | " smcdc smcac tool_class \n", 210 | "0 6.840820 0.124512 0 \n", 211 | "1 6.660156 -0.561523 0 \n", 212 | "2 6.508789 -2.099609 0 \n", 213 | "3 6.542969 -2.731934 0 \n", 214 | "4 6.621094 -3.505859 0 " 215 | ] 216 | }, 217 | "execution_count": 9, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "df = mill.create_xy_dataframe()\n", 224 | "df.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "interpreter": { 237 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16" 238 | }, 239 | "kernelspec": { 240 | "display_name": "Python 3.8.12 64-bit ('featstore': conda)", 241 | "language": "python", 242 | "name": "python3" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 3 248 | }, 249 | "file_extension": ".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython3", 254 | "version": "3.8.12" 255 | }, 256 | "orig_nbformat": 4 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /src/pyphm/datasets/auxilary_metadata/airbus_dfvalid_groundtruth.csv: -------------------------------------------------------------------------------- 1 | seqID,anomaly 2 | 0,0.0 3 | 1,1.0 4 | 2,0.0 5 | 3,0.0 6 | 4,1.0 7 | 5,1.0 8 | 6,0.0 9 | 7,0.0 10 | 8,1.0 11 | 9,0.0 12 | 10,0.0 13 | 11,0.0 14 | 12,1.0 15 | 13,1.0 16 | 14,1.0 17 | 15,0.0 18 | 16,1.0 19 | 17,1.0 20 | 18,0.0 21 | 19,0.0 22 | 20,0.0 23 | 21,1.0 24 | 22,0.0 25 | 23,0.0 26 | 24,1.0 27 | 25,1.0 28 | 26,0.0 29 | 27,1.0 30 | 28,1.0 31 | 29,0.0 32 | 30,0.0 33 | 31,0.0 34 | 32,1.0 35 | 33,0.0 36 | 34,0.0 37 | 35,0.0 38 | 36,1.0 39 | 37,0.0 40 | 38,0.0 41 | 39,0.0 42 | 40,0.0 43 | 41,0.0 44 | 42,1.0 45 | 43,1.0 46 | 44,1.0 47 | 45,0.0 48 | 46,0.0 49 | 47,0.0 50 | 48,0.0 51 | 49,1.0 52 | 50,0.0 53 | 51,1.0 54 | 52,0.0 55 | 53,1.0 56 | 54,1.0 57 | 55,1.0 58 | 56,0.0 59 | 57,1.0 60 | 58,1.0 61 | 59,1.0 62 | 60,0.0 63 | 61,1.0 64 | 62,0.0 65 | 63,1.0 66 | 64,0.0 67 | 65,0.0 68 | 66,1.0 69 | 67,1.0 70 | 68,0.0 71 | 69,1.0 72 | 70,0.0 73 | 71,0.0 74 | 72,0.0 75 | 73,1.0 76 | 74,1.0 77 | 75,1.0 78 | 76,1.0 79 | 77,1.0 80 | 78,1.0 81 | 79,1.0 82 | 80,1.0 83 | 81,1.0 84 | 82,1.0 85 | 83,0.0 86 | 84,0.0 87 | 85,1.0 88 | 86,0.0 89 | 87,1.0 90 | 88,1.0 91 | 89,0.0 92 | 90,1.0 93 | 91,0.0 94 | 92,0.0 95 | 93,0.0 96 | 94,0.0 97 | 95,0.0 98 | 96,0.0 99 | 97,1.0 100 | 98,1.0 101 | 99,1.0 102 | 100,1.0 103 | 101,1.0 104 | 102,1.0 105 | 103,0.0 106 | 104,1.0 107 | 105,0.0 108 | 106,0.0 109 | 107,0.0 110 | 108,0.0 111 | 109,0.0 112 | 110,0.0 113 | 111,1.0 114 | 112,1.0 115 | 113,0.0 116 | 114,0.0 117 | 115,1.0 118 | 116,1.0 119 | 117,0.0 120 | 118,1.0 121 | 119,0.0 122 | 120,1.0 123 | 121,0.0 124 | 122,1.0 125 | 123,1.0 126 | 124,0.0 127 | 125,1.0 128 | 126,1.0 129 | 127,0.0 130 | 128,1.0 131 | 129,0.0 
132 | 130,0.0 133 | 131,1.0 134 | 132,1.0 135 | 133,0.0 136 | 134,1.0 137 | 135,0.0 138 | 136,0.0 139 | 137,0.0 140 | 138,0.0 141 | 139,0.0 142 | 140,0.0 143 | 141,1.0 144 | 142,1.0 145 | 143,1.0 146 | 144,1.0 147 | 145,1.0 148 | 146,1.0 149 | 147,1.0 150 | 148,0.0 151 | 149,0.0 152 | 150,0.0 153 | 151,0.0 154 | 152,0.0 155 | 153,0.0 156 | 154,1.0 157 | 155,0.0 158 | 156,0.0 159 | 157,0.0 160 | 158,0.0 161 | 159,0.0 162 | 160,0.0 163 | 161,1.0 164 | 162,1.0 165 | 163,1.0 166 | 164,0.0 167 | 165,1.0 168 | 166,1.0 169 | 167,1.0 170 | 168,0.0 171 | 169,1.0 172 | 170,1.0 173 | 171,1.0 174 | 172,0.0 175 | 173,1.0 176 | 174,0.0 177 | 175,1.0 178 | 176,0.0 179 | 177,0.0 180 | 178,1.0 181 | 179,1.0 182 | 180,0.0 183 | 181,0.0 184 | 182,0.0 185 | 183,1.0 186 | 184,1.0 187 | 185,0.0 188 | 186,0.0 189 | 187,0.0 190 | 188,0.0 191 | 189,1.0 192 | 190,1.0 193 | 191,1.0 194 | 192,1.0 195 | 193,0.0 196 | 194,0.0 197 | 195,0.0 198 | 196,0.0 199 | 197,1.0 200 | 198,1.0 201 | 199,1.0 202 | 200,0.0 203 | 201,0.0 204 | 202,0.0 205 | 203,0.0 206 | 204,0.0 207 | 205,0.0 208 | 206,1.0 209 | 207,1.0 210 | 208,1.0 211 | 209,1.0 212 | 210,1.0 213 | 211,1.0 214 | 212,0.0 215 | 213,1.0 216 | 214,0.0 217 | 215,0.0 218 | 216,0.0 219 | 217,1.0 220 | 218,0.0 221 | 219,0.0 222 | 220,0.0 223 | 221,1.0 224 | 222,1.0 225 | 223,0.0 226 | 224,0.0 227 | 225,0.0 228 | 226,0.0 229 | 227,0.0 230 | 228,0.0 231 | 229,1.0 232 | 230,0.0 233 | 231,0.0 234 | 232,1.0 235 | 233,0.0 236 | 234,0.0 237 | 235,0.0 238 | 236,0.0 239 | 237,1.0 240 | 238,1.0 241 | 239,1.0 242 | 240,1.0 243 | 241,0.0 244 | 242,1.0 245 | 243,1.0 246 | 244,1.0 247 | 245,0.0 248 | 246,0.0 249 | 247,1.0 250 | 248,0.0 251 | 249,1.0 252 | 250,0.0 253 | 251,0.0 254 | 252,1.0 255 | 253,0.0 256 | 254,0.0 257 | 255,1.0 258 | 256,0.0 259 | 257,0.0 260 | 258,1.0 261 | 259,1.0 262 | 260,0.0 263 | 261,0.0 264 | 262,1.0 265 | 263,0.0 266 | 264,1.0 267 | 265,0.0 268 | 266,1.0 269 | 267,0.0 270 | 268,0.0 271 | 269,1.0 272 | 270,0.0 273 | 271,0.0 274 | 272,1.0 275 | 273,0.0 276 | 274,1.0 277 | 275,0.0 278 | 276,0.0 279 | 277,0.0 280 | 278,0.0 281 | 279,1.0 282 | 280,1.0 283 | 281,0.0 284 | 282,1.0 285 | 283,1.0 286 | 284,1.0 287 | 285,1.0 288 | 286,1.0 289 | 287,1.0 290 | 288,0.0 291 | 289,0.0 292 | 290,0.0 293 | 291,0.0 294 | 292,0.0 295 | 293,0.0 296 | 294,1.0 297 | 295,0.0 298 | 296,0.0 299 | 297,0.0 300 | 298,1.0 301 | 299,0.0 302 | 300,1.0 303 | 301,1.0 304 | 302,1.0 305 | 303,1.0 306 | 304,1.0 307 | 305,1.0 308 | 306,0.0 309 | 307,1.0 310 | 308,0.0 311 | 309,1.0 312 | 310,1.0 313 | 311,0.0 314 | 312,0.0 315 | 313,0.0 316 | 314,1.0 317 | 315,1.0 318 | 316,0.0 319 | 317,0.0 320 | 318,0.0 321 | 319,1.0 322 | 320,1.0 323 | 321,0.0 324 | 322,0.0 325 | 323,1.0 326 | 324,0.0 327 | 325,0.0 328 | 326,0.0 329 | 327,0.0 330 | 328,0.0 331 | 329,1.0 332 | 330,0.0 333 | 331,0.0 334 | 332,1.0 335 | 333,0.0 336 | 334,1.0 337 | 335,0.0 338 | 336,0.0 339 | 337,0.0 340 | 338,1.0 341 | 339,1.0 342 | 340,0.0 343 | 341,0.0 344 | 342,0.0 345 | 343,1.0 346 | 344,0.0 347 | 345,1.0 348 | 346,0.0 349 | 347,1.0 350 | 348,0.0 351 | 349,1.0 352 | 350,0.0 353 | 351,1.0 354 | 352,1.0 355 | 353,1.0 356 | 354,1.0 357 | 355,1.0 358 | 356,0.0 359 | 357,0.0 360 | 358,1.0 361 | 359,1.0 362 | 360,0.0 363 | 361,0.0 364 | 362,1.0 365 | 363,0.0 366 | 364,0.0 367 | 365,0.0 368 | 366,1.0 369 | 367,0.0 370 | 368,1.0 371 | 369,1.0 372 | 370,1.0 373 | 371,1.0 374 | 372,0.0 375 | 373,0.0 376 | 374,0.0 377 | 375,1.0 378 | 376,1.0 379 | 377,1.0 380 | 378,0.0 381 | 379,0.0 382 | 380,1.0 383 | 381,1.0 384 | 382,0.0 385 | 
383,0.0 386 | 384,1.0 387 | 385,0.0 388 | 386,0.0 389 | 387,0.0 390 | 388,0.0 391 | 389,1.0 392 | 390,0.0 393 | 391,0.0 394 | 392,1.0 395 | 393,0.0 396 | 394,1.0 397 | 395,1.0 398 | 396,1.0 399 | 397,0.0 400 | 398,0.0 401 | 399,0.0 402 | 400,1.0 403 | 401,1.0 404 | 402,1.0 405 | 403,1.0 406 | 404,1.0 407 | 405,0.0 408 | 406,1.0 409 | 407,1.0 410 | 408,1.0 411 | 409,0.0 412 | 410,1.0 413 | 411,1.0 414 | 412,0.0 415 | 413,0.0 416 | 414,1.0 417 | 415,0.0 418 | 416,0.0 419 | 417,1.0 420 | 418,0.0 421 | 419,0.0 422 | 420,1.0 423 | 421,0.0 424 | 422,0.0 425 | 423,1.0 426 | 424,1.0 427 | 425,0.0 428 | 426,0.0 429 | 427,1.0 430 | 428,0.0 431 | 429,0.0 432 | 430,0.0 433 | 431,1.0 434 | 432,0.0 435 | 433,1.0 436 | 434,0.0 437 | 435,0.0 438 | 436,1.0 439 | 437,0.0 440 | 438,1.0 441 | 439,1.0 442 | 440,1.0 443 | 441,1.0 444 | 442,1.0 445 | 443,0.0 446 | 444,1.0 447 | 445,1.0 448 | 446,1.0 449 | 447,1.0 450 | 448,0.0 451 | 449,0.0 452 | 450,0.0 453 | 451,1.0 454 | 452,1.0 455 | 453,1.0 456 | 454,1.0 457 | 455,1.0 458 | 456,0.0 459 | 457,1.0 460 | 458,1.0 461 | 459,0.0 462 | 460,0.0 463 | 461,1.0 464 | 462,1.0 465 | 463,1.0 466 | 464,0.0 467 | 465,1.0 468 | 466,1.0 469 | 467,0.0 470 | 468,0.0 471 | 469,1.0 472 | 470,1.0 473 | 471,0.0 474 | 472,1.0 475 | 473,0.0 476 | 474,1.0 477 | 475,1.0 478 | 476,0.0 479 | 477,0.0 480 | 478,1.0 481 | 479,1.0 482 | 480,1.0 483 | 481,0.0 484 | 482,0.0 485 | 483,1.0 486 | 484,1.0 487 | 485,0.0 488 | 486,1.0 489 | 487,1.0 490 | 488,1.0 491 | 489,0.0 492 | 490,1.0 493 | 491,0.0 494 | 492,1.0 495 | 493,1.0 496 | 494,1.0 497 | 495,0.0 498 | 496,0.0 499 | 497,1.0 500 | 498,1.0 501 | 499,0.0 502 | 500,0.0 503 | 501,0.0 504 | 502,1.0 505 | 503,1.0 506 | 504,1.0 507 | 505,1.0 508 | 506,1.0 509 | 507,0.0 510 | 508,0.0 511 | 509,1.0 512 | 510,1.0 513 | 511,0.0 514 | 512,1.0 515 | 513,1.0 516 | 514,0.0 517 | 515,0.0 518 | 516,1.0 519 | 517,1.0 520 | 518,1.0 521 | 519,0.0 522 | 520,0.0 523 | 521,0.0 524 | 522,0.0 525 | 523,0.0 526 | 524,0.0 527 | 525,1.0 528 | 526,1.0 529 | 527,1.0 530 | 528,0.0 531 | 529,0.0 532 | 530,1.0 533 | 531,0.0 534 | 532,0.0 535 | 533,0.0 536 | 534,1.0 537 | 535,1.0 538 | 536,1.0 539 | 537,1.0 540 | 538,1.0 541 | 539,0.0 542 | 540,1.0 543 | 541,1.0 544 | 542,1.0 545 | 543,1.0 546 | 544,0.0 547 | 545,1.0 548 | 546,1.0 549 | 547,1.0 550 | 548,0.0 551 | 549,1.0 552 | 550,0.0 553 | 551,1.0 554 | 552,0.0 555 | 553,1.0 556 | 554,1.0 557 | 555,1.0 558 | 556,0.0 559 | 557,1.0 560 | 558,0.0 561 | 559,0.0 562 | 560,1.0 563 | 561,0.0 564 | 562,0.0 565 | 563,1.0 566 | 564,0.0 567 | 565,0.0 568 | 566,0.0 569 | 567,1.0 570 | 568,1.0 571 | 569,1.0 572 | 570,0.0 573 | 571,1.0 574 | 572,0.0 575 | 573,0.0 576 | 574,0.0 577 | 575,0.0 578 | 576,1.0 579 | 577,1.0 580 | 578,1.0 581 | 579,1.0 582 | 580,1.0 583 | 581,1.0 584 | 582,1.0 585 | 583,1.0 586 | 584,0.0 587 | 585,1.0 588 | 586,0.0 589 | 587,1.0 590 | 588,0.0 591 | 589,1.0 592 | 590,1.0 593 | 591,0.0 594 | 592,0.0 595 | 593,1.0 596 | -------------------------------------------------------------------------------- /src/pyphm/datasets/airbus.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import numpy as np 3 | import pandas as pd 4 | from pathlib import Path 5 | from .pyphm import PHMDataset 6 | from typing import Any, Callable, List, Optional, Tuple 7 | import pkg_resources 8 | from .utils import ( 9 | download_and_extract_archive, 10 | extract_archive, 11 | check_integrity, 12 | download_url, 13 | ) 14 | import os 15 | from urllib.error import URLError 16 
| 17 | """ 18 | Contains the data prep class for the Airbus Helicopter Accelerometer Dataset. 19 | 20 | Also contains helper functions associated with the dataset. 21 | """ 22 | 23 | 24 | ############################################################################### 25 | # Data Prep Classes 26 | ############################################################################### 27 | class AirbusDataLoad(PHMDataset): 28 | """ 29 | Airbus Helicopter Accelerometer Dataset from .h5 file, and download if necessary. 30 | 31 | Args: 32 | root (string): Root directory to place all the data sets. 33 | 34 | dataset_folder_name (string): Name of folder containing raw data. 35 | This folder will be created in the root directory if not present. 36 | 37 | download (bool): If True, the data will be downloaded from ETH Zurich. 38 | 39 | """ 40 | 41 | mirrors = [ 42 | "https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/", 43 | ] 44 | 45 | resources = [ 46 | ("dftrain.h5", None), 47 | ("dfvalid.h5", None), 48 | ("dfvalid_groundtruth.csv", None), 49 | ] 50 | 51 | def __init__( 52 | self, 53 | root: Path, 54 | dataset_folder_name: str = "airbus", 55 | download: bool = False, 56 | path_df_labels: Path = None, 57 | ) -> None: 58 | super().__init__(root, dataset_folder_name) 59 | 60 | self.dataset_folder_path = self.root / self.dataset_folder_name 61 | 62 | if path_df_labels is not None: 63 | self.path_df_labels = path_df_labels 64 | else: 65 | # path of pyphm source directory using pathlib 66 | self.path_df_labels = Path(pkg_resources.resource_filename('pyphm', 'datasets/auxilary_metadata/airbus_dfvalid_groundtruth.csv')) 67 | 68 | if download: 69 | self.download() 70 | 71 | def _check_exists(self) -> bool: 72 | return all( 73 | check_integrity(self.dataset_folder_path / file_name) 74 | for file_name, _ in self.resources 75 | ) 76 | 77 | def download(self) -> None: 78 | """Download the Airbus Helicopter Accelerometer Dataset if it doesn't exist already.""" 79 | 80 | if self._check_exists(): 81 | return 82 | 83 | # pathlib makdir if not exists 84 | self.dataset_folder_path.mkdir(parents=True, exist_ok=True) 85 | 86 | # download files 87 | for filename, md5 in self.resources: 88 | for mirror in self.mirrors: 89 | url = f"{mirror}{filename}" 90 | try: 91 | print(f"Downloading {url}") 92 | 93 | download_url(url, self.dataset_folder_path, filename, md5) 94 | 95 | except URLError as error: 96 | print(f"Failed to download (trying next):\n{error}") 97 | continue 98 | finally: 99 | print() 100 | break 101 | else: 102 | raise RuntimeError(f"Error downloading {filename}") 103 | 104 | def load_df( 105 | self, 106 | train_or_val: str = "train", 107 | ) -> None: 108 | """Load the h5 file as df.""" 109 | 110 | if train_or_val == "train": 111 | df = pd.read_hdf(self.dataset_folder_path / "dftrain.h5", "dftrain") 112 | 113 | # add y column of all zeros (indicating no anomaly) 114 | df["y"] = 0 115 | 116 | else: # val dataset 117 | df = pd.read_hdf(self.dataset_folder_path / "dfvalid.h5", "dfvalid") 118 | 119 | # load the dfvalid_groundtruth.csv as dataframe 120 | df_labels = pd.read_csv( 121 | self.path_df_labels, 122 | dtype={"seqID": int, "anomaly": int}, 123 | ) 124 | 125 | # append the anomaly label to the df_val dataframe 126 | df = df.merge(df_labels, left_index=True, right_on="seqID") 127 | 128 | # drop the seqID column and rename the anomaly column to y 129 | df = df.drop("seqID", axis=1).rename(columns={"anomaly": "y"}) 130 | 131 | return df 132 | 133 | 134 | class 
AirbusPrepMethodA(AirbusDataLoad): 135 | """ 136 | Class used to prepare the Airbus Helicopter Accelerometer Dataset before feature engining or machine learning. 137 | Method is described in the paper: 138 | 139 | `Temporal signals to images: Monitoring the condition of industrial assets with deep learning image processing algorithms` 140 | by Garcia et al., 2021 - https://arxiv.org/abs/2005.07031 141 | 142 | Args: 143 | root (string): Root directory to place all the data sets. (likely the raw data folder) 144 | 145 | dataset_folder_name (string): Name of folder (within root) containing raw data. 146 | This folder will be created in the root directory if not present. 147 | 148 | download (bool): If True, the data will be downloaded from the ETH Zurich website. 149 | 150 | path_df_labels (Path, optional): Path to the csv with the labels. If not provided, it 151 | will default to airbus_dfvalid_groundtruth.csv in the auxilary_metadata folder. 152 | 153 | window_size (int): Size of the window to be used for the sliding window. 154 | 155 | stride (int): Size of the stride to be used for the sliding window. 156 | 157 | """ 158 | 159 | def __init__( 160 | self, 161 | root: Path, 162 | dataset_folder_name: str = "airbus", 163 | download: bool = False, 164 | path_df_labels: Path = None, 165 | window_size: int = 64, 166 | stride: int = 64, 167 | ) -> None: 168 | super().__init__(root, dataset_folder_name, download, path_df_labels) 169 | 170 | self.window_size = window_size # size of the window 171 | self.stride = stride # stride between windows 172 | 173 | def create_xy_arrays(self, train_or_val: str = "train"): 174 | """Create the x and y arrays used in deep learning. 175 | 176 | Parameters 177 | ---------- 178 | train_or_val : str 179 | Either 'train' or 'val' to indicate which dataset to use. Default is 'train'. 180 | 181 | Returns 182 | ------- 183 | x : ndarray 184 | Array of the signals (samples). Shape: (n_samples, n_windows, window_size) 185 | 186 | y : ndarray 187 | Array of the labels/meta-data for each signals. Shape: (n_samples, n_windows, window_size, label_columns) 188 | The label_columns (in order) are: 189 | time_increments (int) -- the index of each time increment in the window. e.g. (0, 1, 2, ...) 190 | sample_index (int) -- the index of each sample 191 | window_index (int) -- the index of each window 192 | label (int) -- the label of each windowed sample (0 for normal, 1 for anomaly) 193 | 194 | """ 195 | 196 | # load the dataframe 197 | df = self.load_df(train_or_val) 198 | 199 | x = df.drop("y", axis=1).to_numpy() 200 | y = df["y"].to_numpy() 201 | 202 | # instantiate the "temporary" lists to store the windows and labels 203 | window_list = [] 204 | y_sample_win_label_list = [] 205 | 206 | n_samples = x.shape[0] 207 | len_sample = x.shape[1] 208 | 209 | # fit the strided windows into the temporary list until the length 210 | # of the window does not equal the proper length (better way to do this???) 
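        # Worked example (illustrative sketch only; the figures assume the default
        # window_size=64 and stride=64 and the Airbus training set): the loaded train
        # dataframe has 61441 columns (including the added "y"), so len_sample = 61440
        # and the loop below collects 61440 // 64 = 960 non-overlapping windows per
        # sample, giving x a final shape of (n_samples, 960, 64). In general the number
        # of full windows is (len_sample - self.window_size) // self.stride + 1, so an
        # equivalent, break-free version of the loop could be sketched as:
        #
        #     n_windows = (len_sample - self.window_size) // self.stride + 1
        #     window_list = [
        #         x[:, i * self.stride : i * self.stride + self.window_size]
        #         for i in range(n_windows)
        #     ]
        #
        # The loop with the explicit length check is kept below.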
211 | for window_i in range(len_sample): 212 | windowed_signal = x[ 213 | :, window_i * self.stride : window_i * self.stride + self.window_size 214 | ] 215 | 216 | # if the windowed signal is the proper length, add it to the list 217 | if windowed_signal.shape == (n_samples, self.window_size): 218 | window_list.append(windowed_signal) 219 | 220 | y_sample_win_label_list.append( 221 | [ 222 | (int(sample_indices), int(window_indices), int(ys)) 223 | for sample_indices, window_indices, ys in list( 224 | zip(list(range(0, n_samples)), [window_i] * n_samples, y) 225 | ) 226 | ] 227 | ) 228 | 229 | else: 230 | break 231 | 232 | x = np.array(window_list).reshape(n_samples, -1, self.window_size) 233 | 234 | y_sample_win_label_array = np.array(y_sample_win_label_list)[:, :, np.newaxis].repeat( 235 | self.window_size, axis=2 236 | ) 237 | 238 | time_index = ( 239 | np.arange(0, self.window_size, 1)[np.newaxis, np.newaxis, :] 240 | .repeat(n_samples, axis=1) 241 | .repeat(x.shape[1], axis=0)[:, :, :, np.newaxis] 242 | ) 243 | 244 | y_time_sample_win_label_array = np.concatenate( 245 | (time_index, y_sample_win_label_array), axis=3 246 | ).reshape(n_samples, -1, self.window_size, 4) 247 | # window_id_array = np.expand_dims(np.array(window_id_list).reshape(-1), axis=1) 248 | # window_label_array = np.expand_dims(np.array(window_label_list).reshape(-1), axis=1) 249 | 250 | # x = np.vstack(window_list,) 251 | 252 | # y = np.hstack((window_label_array, window_id_array)) 253 | # return np.vstack(x), np.vstack(y_time_sig_win_label_array) 254 | return x, y_time_sample_win_label_array 255 | 256 | def create_xy_dataframe(self, train_or_val: str = "train"): 257 | """ 258 | Create a flat dataframe (2D array) of the x and y arrays. 259 | 260 | Amenable for use with TSFresh for feature engineering. 261 | 262 | Returns 263 | ------- 264 | df : pd.DataFrame 265 | Single flat dataframe containing each sample and its labels. 
266 | columns: ['x', 'time_index', 'sample_index', 'window_index', 'y'] 267 | 268 | """ 269 | 270 | x, y = self.create_xy_arrays(train_or_val) # create the x and y arrays 271 | 272 | df = pd.DataFrame(np.vstack(x).reshape(-1,1), columns=['x']) 273 | 274 | # add the time_index, sample_index, window_index, and label columns 275 | # to the dataframe 276 | df = df.assign(time_index=np.vstack(y[:,:,:,0]).reshape(-1,1)) 277 | df = df.assign(sample_index=np.vstack(y[:,:,:,1]).reshape(-1,1)) 278 | df = df.assign(win_index=np.vstack(y[:,:,:,2]).reshape(-1,1)) 279 | df = df.assign(y=np.vstack(y[:,:,:,3]).reshape(-1,1)) 280 | 281 | return df 282 | -------------------------------------------------------------------------------- /notebooks/scratch/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ModuleNotFoundError", 10 | "evalue": "No module named 'pyphm'", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 15 | "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[1;32m 2\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/tim/Documents/PyPHM\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyphm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _urlretrieve\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyphm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmilling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MillingDataLoad, MillingPrepMethodA\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n", 16 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyphm'" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "import sys\n", 22 | "sys.path.append(r'/home/tim/Documents/PyPHM')\n", 23 | "from pyphm.datasets.utils import _urlretrieve\n", 24 | "from pyphm.datasets.milling import MillingDataLoad, MillingPrepMethodA\n", 25 | "from pathlib import Path\n", 26 | "import hashlib\n", 27 | "\n", 28 | "%load_ext autoreload\n", 29 | "%autoreload 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "/home/tim/Documents/PyPHM\n", 42 | "/home/tim/Documents/PyPHM/data\n", 43 | "\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "root_dir = Path.cwd().parent\n", 49 | "print(root_dir)\n", 50 | "path_data_raw_folder = Path(root_dir / 'data' )\n", 51 | "print(path_data_raw_folder)\n", 52 | "print(type(path_data_raw_folder))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 
64 | "type(root) = \n", 65 | "Loading data!!!!\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "mill = MillingDataLoad(path_data_raw_folder, download=False)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "type(root) = \n", 83 | "Loading data!!!!\n", 84 | "type field names: \n", 85 | "type signal names: \n", 86 | "('case', 'run', 'VB', 'time', 'DOC', 'feed', 'material', 'smcAC', 'smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle')\n", 87 | "('AE_spindle', 'AE_table', 'vib_spindle', 'vib_table', 'smcDC', 'smcAC')\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "mill = MillingPrepMethodA(path_data_raw_folder, download=False)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "x.shape (11570, 64, 6)\n", 105 | "y.shape (11570, 64, 3)\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "x, y = mill.create_xy_arrays()\n", 111 | "print(\"x.shape\", x.shape)\n", 112 | "print(\"y.shape\", y.shape)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "array(['0', '0_0', '0.0'], dtype='\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
\n", 247 | "" 248 | ], 249 | "text/plain": [ 250 | " cut_id cut_no case time ae_spindle ae_table vib_spindle vib_table \\\n", 251 | "0 0_0 0 1 0.000 0.219727 0.272827 0.733643 2.116699 \n", 252 | "1 0_0 0 1 0.004 0.246582 0.322266 0.778809 2.277832 \n", 253 | "2 0_0 0 1 0.008 0.294189 0.283813 0.758057 2.343750 \n", 254 | "3 0_0 0 1 0.012 0.323486 0.260010 0.726318 2.448730 \n", 255 | "4 0_0 0 1 0.016 0.290527 0.253296 0.653076 2.546387 \n", 256 | "\n", 257 | " smcdc smcac tool_class \n", 258 | "0 6.840820 0.124512 0 \n", 259 | "1 6.660156 -0.561523 0 \n", 260 | "2 6.508789 -2.099609 0 \n", 261 | "3 6.542969 -2.731934 0 \n", 262 | "4 6.621094 -3.505859 0 " 263 | ] 264 | }, 265 | "execution_count": 39, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "df = mill.create_xy_dataframe()\n", 272 | "df.head()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 40, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "(740480, 11)" 284 | ] 285 | }, 286 | "execution_count": 40, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "df.shape" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "(11570, 64, 3)" 304 | ] 305 | }, 306 | "execution_count": 10, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "y.shape" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "(11570, 64, 6)" 324 | ] 325 | }, 326 | "execution_count": 8, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "x.shape" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 8, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# sys.path.append(root_dir / 'pyphm')\n", 349 | "from pyphm.datasets.utils import _urlretrieve" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 9, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "d3ca5a418c2ed0887d68bc3f91991f12\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "def file_as_bytes(file):\n", 374 | " with file:\n", 375 | " return file.read()\n", 376 | "\n", 377 | "print(hashlib.md5(file_as_bytes(open(path_data_raw_folder / 'IMS.7z', 'rb'))).hexdigest())" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# _urlretrieve('https://files.realpython.com/media/Python-Imports_Watermarked.ae72c8a00197.jpg', 'test.jpg')" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "import sys\n", 396 | "sys.path" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "import pyphm" 406 | ] 407 | }, 408 | { 409 | 
"cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [] 414 | } 415 | ], 416 | "metadata": { 417 | "interpreter": { 418 | "hash": "a445fd1dd59e042f3702a5878c89afe1dbbe900f3b58e4a7756e0c9feaaac4f1" 419 | }, 420 | "kernelspec": { 421 | "display_name": "Python 3.8.12 64-bit ('ganzoo': conda)", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.11.7" 436 | }, 437 | "orig_nbformat": 4 438 | }, 439 | "nbformat": 4, 440 | "nbformat_minor": 2 441 | } 442 | -------------------------------------------------------------------------------- /src/pyphm/datasets/ims.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import numpy as np 3 | import pandas as pd 4 | from pathlib import Path 5 | from .pyphm import PHMDataset 6 | import datetime 7 | import time 8 | import multiprocessing as mp 9 | from typing import Any, Callable, List, Optional, Tuple 10 | from .utils import ( 11 | download_and_extract_archive, 12 | extract_archive, 13 | check_integrity, 14 | ) 15 | import os 16 | from urllib.error import URLError 17 | 18 | 19 | class ImsDataLoad(PHMDataset): 20 | """ 21 | Load the IMS bearing data set from .csv files, and download if necessary. 22 | 23 | Args: 24 | root (string): Root directory to place all the data sets. 25 | 26 | dataset_folder_name (string): Name of folder containing raw data. 27 | This folder will be created in the root directory if not present. 28 | 29 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 30 | 31 | """ 32 | 33 | mirrors = [ 34 | "https://drive.google.com/file/d/1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx/view?usp=sharing", 35 | "https://ti.arc.nasa.gov/m/project/prognostic-repository/", 36 | ] 37 | 38 | resources = [ 39 | ("IMS.7z", "d3ca5a418c2ed0887d68bc3f91991f12"), 40 | ] 41 | 42 | col_1st_names = [ 43 | "b1_ch1", 44 | "b1_ch2", 45 | "b2_ch3", 46 | "b2_ch4", 47 | "b3_ch5", 48 | "b3_ch6", 49 | "b4_ch7", 50 | "b4_ch8", 51 | ] 52 | col_2nd_names = col_3rd_names = ["b1_ch1", "b1_ch2", "b2_ch3", "b2_ch4"] 53 | 54 | def __init__( 55 | self, 56 | root: Path, 57 | dataset_folder_name: str = "ims", 58 | download: bool = False, 59 | dataset_path: Path = None, 60 | data: np.ndarray = None, 61 | sample_freq: float = 20480.0, 62 | ) -> None: 63 | super().__init__(root, dataset_folder_name) 64 | 65 | self.dataset_path = self.root / self.dataset_folder_name 66 | 67 | if download: 68 | self.download() 69 | 70 | if not self._check_exists(): 71 | raise RuntimeError( 72 | "Dataset not found. 
You can use download=True to download it" 73 | ) 74 | 75 | # set the paths for the three experiment run folders 76 | self.path_1st_folder = self.dataset_path / "1st_test" 77 | self.path_2nd_folder = self.dataset_path / "2nd_test" 78 | 79 | # the third test is labelled as the "4th_test" in the IMS.7z archive 80 | self.path_3rd_folder = self.dataset_path / "4th_test/txt" 81 | 82 | self.sample_freq = sample_freq 83 | 84 | def _check_exists(self) -> bool: 85 | return all( 86 | check_integrity(self.dataset_path / file_name) 87 | for file_name, _ in self.resources 88 | ) 89 | 90 | def download(self) -> None: 91 | """Download the UC Berkeley milling data if it doesn't exist already.""" 92 | 93 | if self._check_exists(): 94 | print("IMS.7z already exists.") 95 | return 96 | 97 | # pathlib makdir if not exists 98 | self.dataset_path.mkdir(parents=True, exist_ok=True) 99 | 100 | # download files 101 | for filename, md5 in self.resources: 102 | for mirror in self.mirrors: 103 | url = f"{mirror}{filename}" 104 | try: 105 | print(f"Downloading {url}") 106 | download_and_extract_archive( 107 | url, download_root=self.dataset_path, filename=filename, md5=md5 108 | ) 109 | 110 | # sequentially extract the .rar files 111 | rar_list = ["1st_test.rar", "2nd_test.rar", "3rd_test.rar"] 112 | for rar_file in rar_list: 113 | print(f"Extracting {rar_file}") 114 | extract_archive( 115 | self.dataset_path / rar_file, remove_finished=True 116 | ) 117 | 118 | except URLError as error: 119 | print(f"Failed to download (trying next):\n{error}") 120 | continue 121 | finally: 122 | print() 123 | break 124 | else: 125 | raise RuntimeError(f"Error downloading {filename}") 126 | 127 | def extract(self) -> None: 128 | """Extract the data set if it has already been dowloaded.""" 129 | 130 | if not self._check_exists(): 131 | print("IMS.7z does not exist. 
Please download.") 132 | return 133 | 134 | print("Extracting IMS.7z") 135 | 136 | # start with the .7z file 137 | extract_archive(self.dataset_path / "IMS.7z", remove_finished=False) 138 | 139 | # sequentially extract the .rar files 140 | rar_list = ["1st_test.rar", "2nd_test.rar", "3rd_test.rar"] 141 | for rar_file in rar_list: 142 | print(f"Extracting {rar_file}") 143 | extract_archive(self.dataset_path / rar_file, remove_finished=True) 144 | 145 | @staticmethod 146 | def process_raw_csv_to_dict(file_info_dict) -> None: 147 | """Load an individual sample (.csv file) of the IMS data set.""" 148 | 149 | path_run_folder = file_info_dict["path_run_folder"] 150 | file_name = file_info_dict["file_name"] 151 | run_no = file_info_dict["run_no"] 152 | sample_index = file_info_dict["sample_index"] 153 | 154 | # load the .csv file 155 | signals_array = np.loadtxt(path_run_folder / file_name, delimiter="\t") 156 | 157 | # get the start time (for the first sample) and convert to unix timestamp 158 | start_time_unix = time.mktime( 159 | datetime.datetime.strptime(file_name, "%Y.%m.%d.%H.%M.%S").timetuple() 160 | ) 161 | 162 | # create dictionary with the signals_array, id_list, run_list, file_list, time_step_array 163 | data_dict = { 164 | "signals_array": signals_array, 165 | "id": f"{run_no}_{sample_index}", 166 | "run_no": run_no, 167 | "file_name": file_name, 168 | "sample_index": sample_index, 169 | "start_time_unix": start_time_unix, 170 | } 171 | 172 | return data_dict 173 | 174 | def load_run_as_dict( 175 | self, 176 | run_no: int, 177 | n_jobs: int = None, 178 | ) -> None: 179 | if run_no == 1: 180 | col_names = self.col_1st_names 181 | path_run_folder = self.path_1st_folder 182 | elif run_no == 2: 183 | col_names = self.col_2nd_names 184 | path_run_folder = self.path_2nd_folder 185 | else: 186 | col_names = self.col_3rd_names 187 | path_run_folder = self.path_3rd_folder 188 | 189 | # create a list of dictionaries containing the metadata for each file 190 | file_info_list = [] 191 | for i, file_name in enumerate(sorted(os.listdir(path_run_folder))): 192 | file_info_list.append( 193 | { 194 | "path_run_folder": path_run_folder, 195 | "file_name": file_name, 196 | "col_names": col_names, 197 | "run_no": run_no, 198 | "sample_index": i, 199 | } 200 | ) 201 | 202 | # get number of cpu cores 203 | if n_jobs is None: 204 | n_jobs = mp.cpu_count() - 2 205 | if n_jobs < 1: 206 | n_jobs = 1 207 | print("n_jobs:", n_jobs) 208 | with mp.Pool(processes=n_jobs) as pool: 209 | 210 | # from https://stackoverflow.com/a/36590187 211 | data_list = pool.map(self.process_raw_csv_to_dict, file_info_list) 212 | 213 | # store the data from data_list as a dictionary, with the key being the file name 214 | data_dict = {} 215 | for data_dict_i in data_list: 216 | data_dict[data_dict_i["file_name"]] = data_dict_i 217 | return data_dict 218 | 219 | @staticmethod 220 | def process_raw_csv_to_df(file_info_dict) -> None: 221 | """Load an individual sample (.csv file) of the IMS data set.""" 222 | 223 | path_run_folder = file_info_dict["path_run_folder"] 224 | file_name = file_info_dict["file_name"] 225 | sample_freq = file_info_dict["sample_freq"] 226 | col_names = file_info_dict["col_names"] 227 | run_no = file_info_dict["run_no"] 228 | sample_index = file_info_dict["sample_index"] 229 | 230 | # load the .csv file 231 | signals_array = np.loadtxt(path_run_folder / file_name, delimiter="\t") 232 | 233 | id_list = [f"{run_no}_{sample_index}"] * len(signals_array) 234 | run_list = [run_no] * len(signals_array) 235 | 
file_list = [file_name] * len(signals_array) 236 | time_step_array = np.linspace( 237 | 0.0, len(signals_array) / sample_freq, len(signals_array) 238 | ) 239 | 240 | df = pd.DataFrame(np.vstack(signals_array), columns=col_names, dtype=np.float32) 241 | df["id"] = id_list 242 | df["run"] = run_list 243 | df["file"] = file_list 244 | df["time_step"] = np.hstack(time_step_array) 245 | 246 | return df.astype({"id": str, "run": int, "file": str, "time_step": np.float32}) 247 | 248 | def load_run_as_df( 249 | self, 250 | run_no: int, 251 | n_jobs: int = None, 252 | ) -> None: 253 | """Load the three runs as individual dataframes.""" 254 | 255 | if run_no == 1: 256 | col_names = self.col_1st_names 257 | path_run_folder = self.path_1st_folder 258 | elif run_no == 2: 259 | col_names = self.col_2nd_names 260 | path_run_folder = self.path_2nd_folder 261 | else: 262 | col_names = self.col_3rd_names 263 | path_run_folder = self.path_3rd_folder 264 | 265 | # get list of every file in the folder and sort by ascending date 266 | file_list = sorted(os.listdir(path_run_folder)) 267 | 268 | # create a list of dictionaries containing the metadata for each file 269 | file_info_list = [] 270 | for i, file_name in enumerate(sorted(os.listdir(path_run_folder))): 271 | file_info_list.append( 272 | { 273 | "path_run_folder": path_run_folder, 274 | "file_name": file_name, 275 | "sample_freq": self.sample_freq, 276 | "col_names": col_names, 277 | "run_no": run_no, 278 | "sample_index": i, 279 | } 280 | ) 281 | 282 | # get number of cpu cores 283 | if n_jobs is None: 284 | n_jobs = mp.cpu_count() - 2 285 | if n_jobs < 1: 286 | n_jobs = 1 287 | 288 | # load the dataframes in parallel 289 | with mp.Pool(processes=n_jobs) as pool: 290 | 291 | # from https://stackoverflow.com/a/36590187 292 | df_run = pool.map(self.process_raw_csv_to_df, file_info_list) 293 | df = pd.concat(df_run, ignore_index=True) 294 | 295 | col_names_ordered = ["id", "run", "file", "time_step"] + col_names 296 | 297 | return df[col_names_ordered] 298 | 299 | 300 | class ImsPrepMethodA(ImsDataLoad): 301 | """ 302 | Class used to prepare the IMS bearing dataset before feature engining or machine learning. 303 | 304 | Args: 305 | root (string): Root directory to place all the data sets. (likely the raw data folder) 306 | 307 | dataset_folder_name (string): Name of folder containing raw data. 308 | This folder will be created in the root directory if not present. 309 | 310 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 311 | 312 | path_df_labels (Path, optional): Path to the dataframe with the labels (as a string). 313 | If not provided, the dataframe must be created. 314 | 315 | window_size (int): Size of the window to be used for the sliding window. 316 | 317 | stride (int): Size of the stride to be used for the sliding window. 318 | 319 | cut_drop_list (list, optional): List of cut numbers to drop. cut_no 17 and 94 are erroneous. 
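        Example (illustrative sketch; the data root below is hypothetical and assumes
        the IMS archive has already been downloaded and extracted):

            >>> from pathlib import Path
            >>> from pyphm.datasets.ims import ImsPrepMethodA
            >>> ims = ImsPrepMethodA(root=Path("data/raw"), download=False)
            >>> x, y = ims.create_xy_arrays(run_no=1)  # y columns: id, run, file, sample index, start time (unix)
            >>> df = ims.create_xy_df(run_no=1)        # same run as a single flat dataframe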
320 | """ 321 | 322 | def __init__( 323 | self, 324 | root: Path, 325 | dataset_folder_name: str = "ims", 326 | download: bool = False, 327 | ) -> None: 328 | super().__init__( 329 | root, 330 | dataset_folder_name, 331 | download, 332 | ) 333 | 334 | def create_xy_arrays( 335 | self, 336 | run_no: int = 1, 337 | n_jobs: int = None, 338 | ) -> None: 339 | 340 | # create a list to store the x and y arrays 341 | x = [] # instantiate X's 342 | y_ids_runs_files_times_ctimes = [] # instantiate y's 343 | 344 | # create the data dict storing the signals and metadata 345 | data_dict = self.load_run_as_dict(run_no, n_jobs) 346 | 347 | # get all the file names from the data_dict and sort them 348 | file_names = sorted(data_dict.keys()) 349 | 350 | for i, file_name in enumerate(file_names): 351 | 352 | x.append(data_dict[file_name]["signals_array"]) 353 | y_ids_runs_files_times_ctimes.append( 354 | [ 355 | data_dict[file_name]["id"], 356 | data_dict[file_name]["run_no"], 357 | data_dict[file_name]["file_name"], 358 | data_dict[file_name]["sample_index"], 359 | data_dict[file_name]["start_time_unix"], 360 | ] 361 | ) 362 | 363 | x = np.stack(x) 364 | n_samples = x.shape[0] 365 | n_signals = x.shape[2] 366 | 367 | return x, np.stack(y_ids_runs_files_times_ctimes).reshape(-1, 5) 368 | 369 | def create_xy_df( 370 | self, 371 | run_no: int = 1, 372 | n_jobs: int = None, 373 | ) -> None: 374 | return self.load_run_as_df(run_no, n_jobs) 375 | -------------------------------------------------------------------------------- /notebooks/images/prauc_cnc.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | image/svg+xml0.0 4 | 0.2 5 | 0.4 6 | 0.6 7 | 0.8 8 | 1.0 9 | Recall 10 | 0.0 11 | 0.2 12 | 0.4 13 | 0.6 14 | 0.8 15 | 1.0 16 | Precision 17 | Precision-Recall Area-Under-Curve = 0.044 18 | Precision-Recall Curve 19 | Best model 20 | No skill model 21 | 0.0 22 | 0.2 23 | 0.4 24 | 0.6 25 | 0.8 26 | 1.0 27 | False Positive Rate 28 | 0.0 29 | 0.2 30 | 0.4 31 | 0.6 32 | 0.8 33 | 1.0 34 | True Positive Rate 35 | ROC Area-Under-Curve = 0.617 36 | ROC Curve 37 | 38 | -------------------------------------------------------------------------------- /notebooks/scratch/airbus_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyphm.datasets.utils import _urlretrieve, download_url\n", 10 | "from pathlib import Path\n", 11 | "from pyphm.datasets.airbus import AirbusDataLoad\n", 12 | "\n", 13 | "%load_ext autoreload\n", 14 | "%autoreload 2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "/home/tim/Documents/PyPHM\n", 27 | "/home/tim/Documents/PyPHM/data/raw\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "root_dir = Path.cwd().parent.parent\n", 33 | "print(root_dir)\n", 34 | "path_data_raw_folder = Path(root_dir / 'data/raw/' )\n", 35 | "print(path_data_raw_folder)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 7, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "air = AirbusDataLoad(path_data_raw_folder, download=True)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 8, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | 
"(1677, 61441)\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "df_train = air.load_df(train_or_val=\"train\")\n", 62 | "print(df_train.shape)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "(594, 61441)\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "df_val = air.load_df(train_or_val=\"val\")\n", 80 | "print(df_val.shape)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 12, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "Downloading https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5 to c:\\_Python\\PyPHM\\notebooks\\dftrain.h5\n" 100 | ] 101 | }, 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | " 4%|▍ | 36639744/825280760 [00:04<01:32, 8542721.26it/s] \n" 107 | ] 108 | }, 109 | { 110 | "ename": "KeyboardInterrupt", 111 | "evalue": "", 112 | "output_type": "error", 113 | "traceback": [ 114 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 115 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 116 | "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_20668/1413174493.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m ]\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mdownload_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpath_cwd\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 117 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[1;34m(url, root, filename, md5, max_redirect_hops)\u001b[0m\n\u001b[0;32m 176\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 177\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Downloading \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0murl\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\" to \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mfpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 178\u001b[1;33m \u001b[0m_urlretrieve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 179\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mURLError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# type: ignore[attr-defined]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[1;34m\"https\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 118 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36m_urlretrieve\u001b[1;34m(url, filename, chunk_size)\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m\"User-Agent\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mUSER_AGENT\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpbar\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 71\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[1;32min\u001b[0m \u001b[0miter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 119 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m\"User-Agent\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mUSER_AGENT\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpbar\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 71\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[1;32min\u001b[0m \u001b[0miter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m 
\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 120 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\http\\client.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, amt)\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;31m# Amount is given, implement using readinto\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[0mb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 460\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mn\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 121 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\http\\client.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 501\u001b[0m \u001b[1;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 502\u001b[0m \u001b[1;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 503\u001b[1;33m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 504\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 505\u001b[0m \u001b[1;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 122 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 668\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 669\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 670\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 123 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[1;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[0;32m 1239\u001b[0m \u001b[1;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[1;33m%\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1240\u001b[0m self.__class__)\n\u001b[1;32m-> 1241\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1242\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1243\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 124 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\ssl.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m 1097\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1098\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1099\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1100\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1101\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 125 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "mirrors = [\n", 131 | " \"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/\",\n", 132 | "]\n", 133 | "\n", 134 | "resources = [\n", 135 | " (\"dftrain.h5?sequence=1&isAllowed=y\",),\n", 136 | "]\n", 137 | "\n", 138 | "download_url(\"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5\", root=path_cwd,)" 139 | ] 140 | }, 141 | { 142 | 
"cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3.8.12 ('featstore')", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.8.12" 166 | }, 167 | "orig_nbformat": 4, 168 | "vscode": { 169 | "interpreter": { 170 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16" 171 | } 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 2 176 | } 177 | -------------------------------------------------------------------------------- /references/sources.bib: -------------------------------------------------------------------------------- 1 | @incollection{buckheit1995wavelab, 2 | title={Wavelab and reproducible research}, 3 | author={Buckheit, Jonathan B and Donoho, David L}, 4 | booktitle={Wavelets and statistics}, 5 | pages={55--81}, 6 | year={1995}, 7 | publisher={Springer} 8 | } 9 | 10 | @Article{ harris2020array, 11 | title = {Array programming with {NumPy}}, 12 | author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. 13 | van der Walt and Ralf Gommers and Pauli Virtanen and David 14 | Cournapeau and Eric Wieser and Julian Taylor and Sebastian 15 | Berg and Nathaniel J. Smith and Robert Kern and Matti Picus 16 | and Stephan Hoyer and Marten H. van Kerkwijk and Matthew 17 | Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del 18 | R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre 19 | G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and 20 | Warren Weckesser and Hameer Abbasi and Christoph Gohlke and 21 | Travis E. Oliphant}, 22 | year = {2020}, 23 | month = sep, 24 | journal = {Nature}, 25 | volume = {585}, 26 | number = {7825}, 27 | pages = {357--362}, 28 | doi = {10.1038/s41586-020-2649-2}, 29 | publisher = {Springer Science and Business Media {LLC}}, 30 | url = {https://doi.org/10.1038/s41586-020-2649-2} 31 | } 32 | 33 | @ARTICLE{2020SciPy-NMeth, 34 | author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and 35 | Haberland, Matt and Reddy, Tyler and Cournapeau, David and 36 | Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and 37 | Bright, Jonathan and {van der Walt}, St{\'e}fan J. and 38 | Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and 39 | Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and 40 | Kern, Robert and Larson, Eric and Carey, C J and 41 | Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and 42 | {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and 43 | Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and 44 | Harris, Charles R. and Archibald, Anne M. and 45 | Ribeiro, Ant{\^o}nio H. 
and Pedregosa, Fabian and 46 | {van Mulbregt}, Paul and {SciPy 1.0 Contributors}}, 47 | title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific 48 | Computing in Python}}, 49 | journal = {Nature Methods}, 50 | year = {2020}, 51 | volume = {17}, 52 | pages = {261--272}, 53 | adsurl = {https://rdcu.be/b08Wh}, 54 | doi = {10.1038/s41592-019-0686-2}, 55 | } 56 | 57 | @InProceedings{ mckinney-proc-scipy-2010, 58 | author = { {W}es {M}c{K}inney }, 59 | title = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython }, 60 | booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference }, 61 | pages = { 56 - 61 }, 62 | year = { 2010 }, 63 | editor = { {S}t\'efan van der {W}alt and {J}arrod {M}illman }, 64 | doi = { 10.25080/Majora-92bf1922-00a } 65 | } 66 | 67 | @article{donoho2008reproducible, 68 | title={Reproducible research in computational harmonic analysis}, 69 | author={Donoho, David L and Maleki, Arian and Rahman, Inam Ur and Shahram, Morteza and Stodden, Victoria}, 70 | journal={Computing in Science \& Engineering}, 71 | volume={11}, 72 | number={1}, 73 | pages={8--18}, 74 | year={2008}, 75 | publisher={IEEE} 76 | } 77 | 78 | @article{ince2012case, 79 | title={The case for open computer programs}, 80 | author={Ince, Darrel C and Hatton, Leslie and Graham-Cumming, John}, 81 | journal={Nature}, 82 | volume={482}, 83 | number={7386}, 84 | pages={485--488}, 85 | year={2012}, 86 | publisher={Nature Publishing Group} 87 | } 88 | 89 | 90 | @article{trouble_lab_2013, ISSN={0013-0613}, 91 | title={Trouble at the lab}, 92 | url={https://www.economist.com/briefing/2013/10/18/trouble-at-the-lab}, 93 | abstractNote={Scientists like to think of science as self-correcting. To an alarming degree, it is not}, 94 | journal={The Economist}, 95 | year={2013}, 96 | month={Oct}} 97 | 98 | @article{hu2022prognostics, 99 | title={Prognostics and health management: A review from the perspectives of design, development and decision}, 100 | author={Hu, Yang and Miao, Xuewen and Si, Yong and Pan, Ershun and Zio, Enrico}, 101 | journal={Reliability Engineering \& System Safety}, 102 | volume={217}, 103 | pages={108063}, 104 | year={2022}, 105 | publisher={Elsevier} 106 | } 107 | 108 | @article{national2019reproducibility, 109 | title={Reproducibility and replicability in science}, 110 | author={National Academies of Sciences, Engineering, and Medicine and others}, 111 | year={2019}, 112 | publisher={National Academies Press} 113 | } 114 | 115 | @inproceedings{stodden2018enabling, 116 | title={Enabling the verification of computational results: An empirical evaluation of computational reproducibility}, 117 | author={Stodden, Victoria and Krafczyk, Matthew S and Bhaskar, Adhithya}, 118 | booktitle={Proceedings of the First International Workshop on Practical Reproducible Evaluation of Computer Systems}, 119 | pages={1--5}, 120 | year={2018} 121 | } 122 | 123 | @article{gundersen2018reproducible, 124 | title={On reproducible AI: Towards reproducible research, open science, and digital scholarship in AI publications}, 125 | author={Gundersen, Odd Erik and Gil, Yolanda and Aha, David W}, 126 | journal={AI magazine}, 127 | volume={39}, 128 | number={3}, 129 | pages={56--68}, 130 | year={2018} 131 | } 132 | 133 | @book{chollet2021deep, 134 | title={Deep learning with Python}, 135 | author={Chollet, Francois}, 136 | year={2021}, 137 | publisher={Simon and Schuster} 138 | } 139 | 140 | @inproceedings{astfalck2016modelling, 141 | title={A modelling ecosystem for prognostics}, 142 | author={Astfalck, 
Lachlan and Hodkiewicz, Melinda and Keating, Adrian and Cripps, Edward and Pecht, Michael}, 143 | booktitle={Annual Conference of the PHM Society}, 144 | volume={8}, 145 | number={1}, 146 | year={2016} 147 | } 148 | 149 | @article{frachtenberg2022research, 150 | title={Research artifacts and citations in computer systems papers}, 151 | author={Frachtenberg, Eitan}, 152 | journal={PeerJ Computer Science}, 153 | volume={8}, 154 | pages={e887}, 155 | year={2022}, 156 | publisher={PeerJ Inc.} 157 | } 158 | 159 | @article{dorch2015data, 160 | title={The data sharing advantage in astrophysics}, 161 | author={Dorch, Bertil F and Drachen, Thea M and Ellegaard, Ole}, 162 | journal={Proceedings of the International Astronomical Union}, 163 | volume={11}, 164 | number={A29A}, 165 | pages={172--175}, 166 | year={2015}, 167 | publisher={Cambridge University Press} 168 | } 169 | 170 | @article{henneken2011linking, 171 | title={Linking to data-effect on citation rates in astronomy}, 172 | author={Henneken, Edwin A and Accomazzi, Alberto}, 173 | journal={arXiv preprint arXiv:1111.3618}, 174 | year={2011} 175 | } 176 | 177 | @article{piwowar2013data, 178 | title={Data reuse and the open data citation advantage}, 179 | author={Piwowar, Heather A and Vision, Todd J}, 180 | journal={PeerJ}, 181 | volume={1}, 182 | pages={e175}, 183 | year={2013}, 184 | publisher={PeerJ Inc.} 185 | } 186 | 187 | @article{piwowar2007sharing, 188 | title={Sharing detailed research data is associated with increased citation rate}, 189 | author={Piwowar, Heather A and Day, Roger S and Fridsma, Douglas B}, 190 | journal={PloS one}, 191 | volume={2}, 192 | number={3}, 193 | pages={e308}, 194 | year={2007}, 195 | publisher={Public Library of Science San Francisco, USA} 196 | } 197 | 198 | @article{colavizza2020citation, 199 | title={The citation advantage of linking publications to research data}, 200 | author={Colavizza, Giovanni and Hrynaszkiewicz, Iain and Staden, Isla and Whitaker, Kirstie and McGillivray, Barbara}, 201 | journal={PloS one}, 202 | volume={15}, 203 | number={4}, 204 | pages={e0230416}, 205 | year={2020}, 206 | publisher={Public Library of Science San Francisco, CA USA} 207 | } 208 | 209 | @article{fu2019meta, 210 | title={Meta-Research: Releasing a preprint is associated with more attention and citations for the peer-reviewed article}, 211 | author={Fu, Darwin Y and Hughey, Jacob J}, 212 | journal={Elife}, 213 | volume={8}, 214 | pages={e52646}, 215 | year={2019}, 216 | publisher={eLife Sciences Publications Limited} 217 | } 218 | 219 | @article{christensen2019study, 220 | title={A study of the impact of data sharing on article citations using journal policies as a natural experiment}, 221 | author={Christensen, Garret and Dafoe, Allan and Miguel, Edward and Moore, Don A and Rose, Andrew K}, 222 | journal={PLoS One}, 223 | volume={14}, 224 | number={12}, 225 | pages={e0225883}, 226 | year={2019}, 227 | publisher={Public Library of Science San Francisco, CA USA} 228 | } 229 | 230 | @article{wahlquist2018dissemination, 231 | title={Dissemination of novel biostatistics methods: Impact of programming code availability and other characteristics on article citations}, 232 | author={Wahlquist, Amy E and Muhammad, Lutfiyya N and Herbert, Teri Lynn and Ramakrishnan, Viswanathan and Nietert, Paul J}, 233 | journal={PloS one}, 234 | volume={13}, 235 | number={8}, 236 | pages={e0201590}, 237 | year={2018}, 238 | publisher={Public Library of Science San Francisco, CA USA} 239 | } 240 | 241 | @article{zilberman2021computer, 242 
| title={Why computer occupations are behind strong STEM employment growth in the 2019--29 decade}, 243 | author={Zilberman, Alan and Ice, Lindsey}, 244 | journal={Computer}, 245 | volume={4}, 246 | number={5,164.6}, 247 | pages={11--5}, 248 | year={2021} 249 | } 250 | 251 | @article{rainie2017future, 252 | title={The Future of Jobs and Jobs Training.}, 253 | author={Rainie, Lee and Anderson, Janna}, 254 | journal={Pew Research Center}, 255 | year={2017}, 256 | publisher={ERIC} 257 | } 258 | 259 | @inproceedings{hars34working, 260 | title={Working for Free?--Motivations of Participating in Open Source Projects; 2001}, 261 | author={Hars, A and Ou, S}, 262 | booktitle={34th Annual Hawaii International Conference on System Sciences (HICSS-34), Hava{\'\i}}, 263 | pages={25--39} 264 | } 265 | 266 | @article{bitzer2007intrinsic, 267 | title={Intrinsic motivation in open source software development}, 268 | author={Bitzer, J{\"u}rgen and Schrettl, Wolfram and Schr{\"o}der, Philipp JH}, 269 | journal={Journal of comparative economics}, 270 | volume={35}, 271 | number={1}, 272 | pages={160--169}, 273 | year={2007}, 274 | publisher={Elsevier} 275 | } 276 | 277 | @misc{neurodatascience, 278 | url={https://neurodatascience.github.io/QLS612-Overview/}, 279 | title={An introduction to the foundations of neuro data science}, 280 | publisher={McGill University}, } 281 | 282 | @misc{ucberkeleyreproducible, 283 | title={Reproducible and Collaborative Data Science}, 284 | url={https://berkeley-stat159-f17.github.io/stat159-f17/}, 285 | abstractNote={A project-based introduction to statistical data science. 286 | Through lectures, computational laboratories, readings, homeworks, and a 287 | group project, you will learn practical techniques and tools for producing statistically sound and appropriate, reproducible, and verifiable computational answers to scientific 288 | questions. The course emphasizes version control, testing, process 289 | automation, code review, and collaborative programming. 
Software tools 290 | include Bash, Git, Python, Jupyter and LATEX}, 291 | publisher={University of California, Berkeley} } 292 | 293 | @misc{harvard2017reproducible, url={https://pll.harvard.edu/course/principles-statistical-and-computational-tools-reproducible-data-science}, 294 | title={Principles, Statistical and Computational Tools for Reproducible Data Science}, 295 | abstractNote={Learn skills and tools that support data science and reproducible research, to ensure you can trust your own research 296 | results, reproduce them yourself, and communicate them to others.}, 297 | publisher={Harvard University}, 298 | year={2017}, 299 | month={Oct} } 300 | 301 | @article{stodden2013toward, 302 | title={Toward reproducible computational research: an empirical analysis of data and code policy adoption by journals}, 303 | author={Stodden, Victoria and Guo, Peixuan and Ma, Zhaokun}, 304 | journal={PloS one}, 305 | volume={8}, 306 | number={6}, 307 | pages={e67111}, 308 | year={2013}, 309 | publisher={Public Library of Science San Francisco, USA} 310 | } 311 | 312 | @article{zhao2019deep, 313 | title={Deep learning and its applications to machine health monitoring}, 314 | author={Zhao, Rui and Yan, Ruqiang and Chen, Zhenghua and Mao, Kezhi and Wang, Peng and Gao, Robert X}, 315 | journal={Mechanical Systems and Signal Processing}, 316 | volume={115}, 317 | pages={213--237}, 318 | year={2019}, 319 | publisher={Elsevier} 320 | } 321 | 322 | @article{wang2021recent, 323 | title={Recent Advancement of Deep Learning Applications to Machine Condition Monitoring Part 1: A Critical Review}, 324 | author={Wang, Wenyi and Taylor, John and Rees, Robert J}, 325 | journal={Acoustics Australia}, 326 | pages={1--13}, 327 | year={2021}, 328 | publisher={Springer} 329 | } 330 | 331 | @article{lee2007bearing, 332 | title={Bearing data set}, 333 | author={Lee, J and Qiu, H and Yu, G and Lin, Ja and others}, 334 | journal={IMS, University of Cincinnati, NASA Ames Prognostics Data Repository, Rexnord Technical Services}, 335 | year={2007} 336 | } 337 | 338 | @article{agogino2007milling, 339 | title={Milling data set. NASA Ames Prognostics Data Repository}, 340 | author={Agogino, A and Goebel, K}, 341 | journal={Moffett Field, CA}, 342 | year={2007}, 343 | url={https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/} 344 | } 345 | 346 | @article{garcia2021temporal, 347 | title={Temporal signals to images: Monitoring the condition of industrial assets with deep learning image processing algorithms}, 348 | author={Garcia, Gabriel Rodriguez and Michau, Gabriel and Ducoffe, M{\'e}lanie and Gupta, Jayant Sen and Fink, Olga}, 349 | journal={Proceedings of the Institution of Mechanical Engineers, Part O: Journal of Risk and Reliability}, 350 | pages={1748006X21994446}, 351 | year={2021}, 352 | publisher={SAGE Publications Sage UK: London, England} 353 | } 354 | 355 | @article{esteban2019fmriprep, 356 | title={fMRIPrep: a robust preprocessing pipeline for functional MRI}, 357 | author={Esteban, Oscar and Markiewicz, Christopher J and Blair, Ross W and Moodie, Craig A and Isik, A Ilkay and Erramuzpe, Asier and Kent, James D and Goncalves, Mathias and DuPre, Elizabeth and Snyder, Madeleine and others}, 358 | journal={Nature methods}, 359 | volume={16}, 360 | number={1}, 361 | pages={111--116}, 362 | year={2019}, 363 | publisher={Nature Publishing Group} 364 | } 365 | 366 | @software{christian_s_perone_2018_1495335, 367 | author = {Christian S. 
Perone and 368 | cclauss and 369 | Elvis Saravia and 370 | Pedro Lemos Ballester and 371 | MohitTare}, 372 | title = {perone/medicaltorch: Release v0.2}, 373 | month = nov, 374 | year = 2018, 375 | publisher = {Zenodo}, 376 | version = {v0.2}, 377 | doi = {10.5281/zenodo.1495335}, 378 | url = {https://doi.org/10.5281/zenodo.1495335} 379 | } 380 | 381 | @INPROCEEDINGS{astroML, 382 | author={{Vanderplas}, J.T. and {Connolly}, A.J. 383 | and {Ivezi{\'c}}, {\v Z}. and {Gray}, A.}, 384 | booktitle={Conference on Intelligent Data Understanding (CIDU)}, 385 | title={Introduction to astroML: Machine learning for astrophysics}, 386 | month={oct.}, 387 | pages={47 -54}, 388 | doi={10.1109/CIDU.2012.6382200}, 389 | year={2012}} 390 | 391 | @incollection{NEURIPS2019_9015, 392 | title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library}, 393 | author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith}, 394 | booktitle = {Advances in Neural Information Processing Systems 32}, 395 | editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, 396 | pages = {8024--8035}, 397 | year = {2019}, 398 | publisher = {Curran Associates, Inc.}, 399 | url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf} 400 | } 401 | 402 | @book{Bird_Natural_Language_Processing_2009, 403 | author = {Bird, Steven and Klein, Ewan and Loper, Edward}, 404 | publisher = {O'Reilly Media, Inc.}, 405 | title = {{Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit}}, 406 | year = {2009}} 407 | 408 | @article{wilson2014software, 409 | title={Software Carpentry: lessons learned}, 410 | author={Wilson, Greg}, 411 | journal={F1000Research}, 412 | volume={3}, 413 | year={2014}, 414 | publisher={Faculty of 1000 Ltd} 415 | } 416 | 417 | -------------------------------------------------------------------------------- /src/pyphm/datasets/milling.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import numpy as np 3 | import pandas as pd 4 | from pathlib import Path 5 | from .pyphm import PHMDataset 6 | from typing import Any, Callable, List, Optional, Tuple 7 | import pkg_resources 8 | from .utils import ( 9 | download_and_extract_archive, 10 | extract_archive, 11 | check_integrity, 12 | ) 13 | import os 14 | from urllib.error import URLError 15 | 16 | """ 17 | Contains the data prep class for the UC-Berkely milling data set. 18 | 19 | Also contains helper functions associated with the milling data set. 20 | """ 21 | 22 | 23 | ############################################################################### 24 | # Data Prep Classes 25 | ############################################################################### 26 | class MillingDataLoad(PHMDataset): 27 | """ 28 | Load the UC Berkely milling data set from .mat file, and download if necessary. 29 | 30 | Args: 31 | root (string): Root directory to place all the data sets. 32 | 33 | dataset_folder_name (string): Name of folder containing raw data. 34 | This folder will be created in the root directory if not present. 
35 | 36 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 37 | 38 | """ 39 | 40 | resources = [ 41 | { 42 | "name": "aws", 43 | "url": "https://phm-datasets.s3.amazonaws.com/NASA/", 44 | "files": [ 45 | { 46 | "filename": "3.+Milling.zip", 47 | "md5": "4da3afb0aa50cb3dcdd8e20ed1ed1c7c", 48 | } 49 | ], 50 | }, 51 | { 52 | "name": "github", 53 | "url": "https://github.com/tvhahn/Manufacturing-Data-Science-with-Python/raw/master/Data%20Sets/milling_uc_berkeley/raw/", 54 | "files": [ 55 | { 56 | "filename": "mill.zip", 57 | "md5": "81d821fdef812183a7d38b6f83f7cefa", 58 | } 59 | ], 60 | }, 61 | ] 62 | 63 | def __init__( 64 | self, 65 | root: Path, 66 | dataset_folder_name: str = "milling", 67 | data_file_name: str = "mill.mat", 68 | download: bool = False, 69 | data: np.ndarray = None, 70 | ) -> None: 71 | super().__init__(root, dataset_folder_name) 72 | 73 | self.dataset_folder_path = self.root / self.dataset_folder_name 74 | self.data_file_name = data_file_name 75 | 76 | if download: 77 | self.download() 78 | 79 | data_file_path = self.dataset_folder_path / self.data_file_name 80 | # assert that data_file_path exists 81 | assert data_file_path.exists(), f"{data_file_path} does not exist." 82 | 83 | self.data = self.load_mat() 84 | 85 | def _check_exists(self) -> bool: 86 | for source in self.resources: 87 | for file in source["files"]: 88 | file_name = file["filename"] 89 | file_path = self.dataset_folder_path / file_name 90 | if not check_integrity(file_path, file["md5"]): 91 | return False 92 | return True 93 | 94 | 95 | def download(self) -> None: 96 | """Download the data files from their sources if they don't exist already.""" 97 | 98 | if self._check_exists(): 99 | print("Files already downloaded and verified.") 100 | return 101 | 102 | # Ensure the dataset folder exists 103 | self.dataset_folder_path.mkdir(parents=True, exist_ok=True) 104 | 105 | successful_download = False 106 | 107 | for source in self.resources: 108 | all_files_downloaded = True # Assume success, prove otherwise 109 | 110 | for file in source["files"]: 111 | file_name = file["filename"] 112 | md5 = file["md5"] 113 | file_path = self.dataset_folder_path / file_name 114 | 115 | # Check if the file already exists and is verified 116 | if check_integrity(file_path, md5): 117 | print(f"{file_name} already exists and is verified.") 118 | continue # Skip to the next file as this one is already handled 119 | 120 | # Construct the URL for downloading 121 | url = f"{source['url']}{file_name}" 122 | 123 | try: 124 | print(f"Attempting to download {url}") 125 | download_and_extract_archive( 126 | url, 127 | download_root=str(self.dataset_folder_path), 128 | filename=file_name, 129 | md5=md5, 130 | remove_finished=True, 131 | ) 132 | # After successful download and extraction, check for and extract any nested archive 133 | self.check_and_extract_nested(file_path.parent) 134 | 135 | except URLError as error: 136 | print(f"Failed to download {file_name} from {source['name']}:\n{error}") 137 | all_files_downloaded = False # Mark as failed to trigger another source attempt 138 | break # Exit the file loop to try the next source 139 | 140 | if all_files_downloaded: 141 | successful_download = True 142 | print(f"Successfully downloaded all files from {source['name']}") 143 | break # Exit the source loop since we've successfully downloaded from this source 144 | 145 | if not successful_download: 146 | raise RuntimeError("Failed to download files from all sources.") 147 | 148 | def 
check_and_extract_nested(self, directory: Path) -> None: 149 | """Check for and extract any nested archives in the given directory.""" 150 | for item in directory.iterdir(): 151 | if item.is_dir(): 152 | # Check each directory for nested archives 153 | for nested_item in item.iterdir(): 154 | if nested_item.suffix in ['.zip', '.tar', '.gz']: 155 | print(f"Found nested archive: {nested_item}") 156 | extract_archive(str(nested_item), str(directory), remove_finished=True) 157 | 158 | 159 | 160 | def load_mat(self) -> np.ndarray: 161 | """Load the mat file and return the data as a numpy array.""" 162 | data = sio.loadmat(self.dataset_folder_path / self.data_file_name, struct_as_record=True) 163 | return data["mill"] 164 | 165 | 166 | class MillingPrepMethodA(MillingDataLoad): 167 | """ 168 | Class used to prepare the UC Berkeley milling dataset before feature engineering or machine learning. 169 | Method is described in the paper: 170 | 171 | `Self-supervised learning for tool wear monitoring with a disentangled-variational-autoencoder` 172 | by von Hahn and Mechefske, 2021 173 | 174 | Args: 175 | root (string): Root directory to place all the data sets. (likely the raw data folder) 176 | 177 | dataset_folder_name (string): Name of folder (within root) containing raw data. 178 | This folder will be created in the root directory if not present. 179 | 180 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 181 | 182 | path_csv_labels (Path, optional): Path to the csv of the label dataframe. 183 | If not provided, the 'milling_labels_with_tool_class.csv' will be used, provided in the 184 | PyPHM package. 185 | 186 | window_len (int): Length of the window to be used for the sliding window. 187 | 188 | stride (int): Amount to move (stride) between individual windows of data. 189 | 190 | cut_drop_list (list, optional): List of cut numbers to drop. cut_no 17 and 94 are erroneous and 191 | will be dropped by default.
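    Example:
        A minimal usage sketch (the root path below is hypothetical; the remaining
        arguments keep the defaults documented above)::

            from pathlib import Path
            from pyphm.datasets.milling import MillingPrepMethodA

            # download mill.mat if needed, then window each cut into fixed-length samples
            mill = MillingPrepMethodA(root=Path("data/raw"), download=True, window_len=64, stride=64)

            x, y = mill.create_xy_arrays()    # x shape: [no. samples, window_len, 6 signals]
            df = mill.create_xy_dataframe()   # flat dataframe, e.g. for TSFresh feature engineering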
192 | """ 193 | 194 | def __init__( 195 | self, 196 | root: Path, 197 | dataset_folder_name: str = "milling", 198 | dataset_folder_path: Path = None, 199 | data_file_name: str = "mill.mat", 200 | download: bool = False, 201 | data: np.ndarray = None, 202 | path_csv_labels: Path = None, 203 | window_len: int = 64, 204 | stride: int = 64, 205 | cut_drop_list: List[int] = [17, 94], 206 | ) -> None: 207 | super().__init__(root, dataset_folder_name, data_file_name, download, data) 208 | 209 | self.window_len = window_len # size of the window 210 | self.stride = stride # stride between windows 211 | self.cut_drop_list = cut_drop_list # list of cut numbers to be dropped 212 | 213 | if path_csv_labels is not None: 214 | self.path_csv_labels = path_csv_labels 215 | else: 216 | # path of pyphm source directory using pathlib 217 | self.path_csv_labels = Path( 218 | pkg_resources.resource_filename( 219 | "pyphm", "datasets/auxilary_metadata/milling_labels_with_tool_class.csv" 220 | ) 221 | ) 222 | 223 | # load the labels dataframe 224 | self.df_labels = pd.read_csv(self.path_csv_labels) 225 | 226 | if self.cut_drop_list is not None: 227 | self.df_labels.drop(self.cut_drop_list, inplace=True) # drop the cuts that are bad 228 | 229 | self.df_labels.reset_index(drop=True, inplace=True) # reset the index 230 | 231 | self.field_names = self.data.dtype.names 232 | 233 | self.signal_names = self.field_names[7:][::-1] 234 | 235 | def create_labels(self): 236 | """Function that will create the label dataframe from the mill data set 237 | 238 | Only needed if the dataframe with the labels is not provided. 239 | """ 240 | 241 | # create empty dataframe for the labels 242 | df_labels = pd.DataFrame() 243 | 244 | # get the labels from the original .mat file and put in dataframe 245 | for i in range(7): 246 | # list for storing the label data for each field 247 | x = [] 248 | 249 | # iterate through each of the unique cuts 250 | for j in range(167): 251 | x.append(self.data[0, j][i][0][0]) 252 | x = np.array(x) 253 | df_labels[str(i)] = x 254 | 255 | # add column names to the dataframe 256 | df_labels.columns = self.field_names[0:7] 257 | 258 | # create a column with the unique cut number 259 | df_labels["cut_no"] = [i for i in range(167)] 260 | 261 | def tool_state(cols): 262 | """Add the label to the cut. 263 | 264 | Categories are: 265 | Healthy State (label=0): 0~0.2mm flank wear 266 | Degradation State (label=1): 0.2~0.7mm flank wear 267 | Failure State (label=2): >0.7mm flank wear 268 | """ 269 | # pass in the tool wear, VB, column 270 | vb = cols 271 | 272 | if vb < 0.2: 273 | return 0 274 | elif vb >= 0.2 and vb < 0.7: 275 | return 1 276 | elif pd.isnull(vb): 277 | pass 278 | else: 279 | return 2 280 | 281 | # apply the label to the dataframe 282 | df_labels["tool_class"] = df_labels["VB"].apply(tool_state) 283 | 284 | return df_labels 285 | 286 | def create_data_array(self, cut_no): 287 | """Create an array from an individual cut sample. 288 | 289 | Parameters 290 | =========== 291 | cut_no : int 292 | Index of the cut to be used. 293 | 294 | Returns 295 | =========== 296 | sub_cut_array : np.array 297 | Array of the cut samples. Shape of [no. samples, sample len, features/sample] 298 | 299 | sub_cut_labels : np.array 300 | Array of the labels for the cut samples. Shape of [# samples, # features/sample] 301 | 302 | """ 303 | 304 | assert cut_no in self.df_labels["cut_no"].values, "Cut number must be in the dataframe" 305 | 306 | # create a numpy array of the cut 307 | # with a final array shape like [no.
cuts, len cuts, no. signals] 308 | cut = self.data[0, cut_no] 309 | for i, signal_name in enumerate(self.signal_names): 310 | if i == 0: 311 | cut_array = cut[signal_name].reshape((9000, 1)) 312 | else: 313 | cut_array = np.concatenate((cut_array, cut[signal_name].reshape((9000, 1))), axis=1) 314 | 315 | # select the start and end of the cut 316 | start = self.df_labels[self.df_labels["cut_no"] == cut_no]["window_start"].values[0] 317 | end = self.df_labels[self.df_labels["cut_no"] == cut_no]["window_end"].values[0] 318 | cut_array = cut_array[start:end, :] 319 | 320 | # instantiate the "temporary" lists to store the sub-cuts and metadata 321 | sub_cut_list = [] 322 | sub_cut_id_list = [] 323 | sub_cut_label_list = [] 324 | 325 | # get the labels for the cut 326 | label = self.df_labels[self.df_labels["cut_no"] == cut_no]["tool_class"].values[0] 327 | 328 | # fit the strided windows into the dummy_array until the length 329 | # of the window does not equal the proper length (better way to do this???) 330 | for i in range(cut_array.shape[0]): 331 | windowed_signal = cut_array[i * self.stride : i * self.stride + self.window_len] 332 | 333 | # if the windowed signal is the proper length, add it to the list 334 | if windowed_signal.shape == (self.window_len, 6): 335 | sub_cut_list.append(windowed_signal) 336 | 337 | # create sub_cut_id fstring to keep track of the cut_id and the window_id 338 | sub_cut_id_list.append(f"{cut_no}_{i}") 339 | 340 | # create the sub_cut_label and append it to the list 341 | sub_cut_label_list.append(int(label)) 342 | 343 | else: 344 | break 345 | 346 | sub_cut_array = np.array(sub_cut_list) 347 | 348 | sub_cut_ids = np.expand_dims(np.array(sub_cut_id_list, dtype=str), axis=1) 349 | sub_cut_ids = np.repeat(sub_cut_ids, sub_cut_array.shape[1], axis=1) 350 | 351 | sub_cut_labels = np.expand_dims(np.array(sub_cut_label_list, dtype=int), axis=1) 352 | sub_cut_labels = np.repeat(sub_cut_labels, sub_cut_array.shape[1], axis=1) 353 | 354 | # take the length of the signals in the sub_cut_array 355 | # and divide it by the frequency (250 Hz) to get the time (seconds) of each sub-cut 356 | sub_cut_times = np.expand_dims(np.arange(0, sub_cut_array.shape[1]) / 250.0, axis=0) 357 | sub_cut_times = np.repeat( 358 | sub_cut_times, 359 | sub_cut_array.shape[0], 360 | axis=0, 361 | ) 362 | 363 | sub_cut_labels_ids_times = np.stack((sub_cut_labels, sub_cut_ids, sub_cut_times), axis=2) 364 | 365 | return ( 366 | sub_cut_array, 367 | sub_cut_labels, 368 | sub_cut_ids, 369 | sub_cut_times, 370 | sub_cut_labels_ids_times, 371 | ) 372 | 373 | def create_xy_arrays(self): 374 | """Create the x and y arrays used in deep learning. 375 | 376 | Returns 377 | =========== 378 | x_array : np.array 379 | Array of the cut samples. Shape of [no. samples, sample len, features/sample] 380 | 381 | y_array : np.array 382 | Array of the labels for the cut samples. Shape of [no. samples, sample len, label/ids/times] 383 | Use y[:,0,:], for example, to get the y in a shape of [no. samples, label/ids/times] 384 | ( e.g. will be shape (no. 
samples, 3) ) 385 | 386 | """ 387 | 388 | # create a list to store the x and y arrays 389 | x = [] # instantiate X's 390 | y_labels_ids_times = [] # instantiate y's 391 | 392 | # iterate throught the df_labels 393 | for i in self.df_labels.itertuples(): 394 | ( 395 | sub_cut_array, 396 | sub_cut_labels, 397 | sub_cut_ids, 398 | sub_cut_times, 399 | sub_cut_labels_ids_times, 400 | ) = self.create_data_array(i.cut_no) 401 | 402 | x.append(sub_cut_array) 403 | y_labels_ids_times.append(sub_cut_labels_ids_times) 404 | 405 | return np.vstack(x), np.vstack(y_labels_ids_times) 406 | 407 | def create_xy_dataframe(self): 408 | """ 409 | Create a flat dataframe (2D array) of the x and y arrays. 410 | 411 | Amenable for use with TSFresh for feature engineering. 412 | 413 | Returns 414 | =========== 415 | df : pd.DataFrame 416 | Single flat dataframe containing each sample and its labels. 417 | 418 | """ 419 | 420 | x, y_labels_ids_times = self.create_xy_arrays() # create the x and y arrays 421 | 422 | # concatenate the x and y arrays and reshape them to be a flat array (2D) 423 | x_labels = np.reshape(np.concatenate((x, y_labels_ids_times), axis=2), (-1, 9)) 424 | 425 | # define the column names and the data types 426 | col_names = [s.lower() for s in list(self.signal_names)] + [ 427 | "tool_class", 428 | "cut_id", 429 | "time", 430 | ] 431 | 432 | col_names_ordered = [ 433 | "cut_id", 434 | "cut_no", 435 | "case", 436 | "time", 437 | "ae_spindle", 438 | "ae_table", 439 | "vib_spindle", 440 | "vib_table", 441 | "smcdc", 442 | "smcac", 443 | "tool_class", 444 | ] 445 | 446 | col_dtype = [ 447 | str, 448 | int, 449 | int, 450 | np.float32, 451 | np.float32, 452 | np.float32, 453 | np.float32, 454 | np.float32, 455 | np.float32, 456 | np.float32, 457 | int, 458 | ] 459 | 460 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype)) 461 | 462 | # create a dataframe from the x and y arrays 463 | df = pd.DataFrame(x_labels, columns=col_names, dtype=str) 464 | 465 | # split the cut_id by "_" and take the first element (cut_no) 466 | df["cut_no"] = df["cut_id"].str.split("_").str[0] 467 | 468 | # get the case from each cut_no using the df_labels 469 | df = df.merge( 470 | self.df_labels[["cut_no", "case"]].astype(dtype=str), 471 | on="cut_no", 472 | how="left", 473 | ) 474 | 475 | df = df[col_names_ordered].astype(col_dtype_dict) # reorder the columns 476 | 477 | return df 478 | -------------------------------------------------------------------------------- /src/pyphm/datasets/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | The utils.py is Copyright (c) Soumith Chintala 2016, (from pytorch/vision) 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import bz2 34 | import gzip 35 | import hashlib 36 | import itertools 37 | import lzma 38 | import os 39 | import os.path 40 | import pathlib 41 | from pathlib import Path 42 | import re 43 | import tarfile 44 | import rarfile # needed for IMS dataset 45 | import py7zr # needed for IMS dataset 46 | import urllib 47 | import urllib.error 48 | import urllib.request 49 | import zipfile 50 | from typing import Any, Callable, List, Iterable, Optional, TypeVar, Dict, IO, Tuple, Iterator 51 | from urllib.parse import urlparse 52 | import gdown 53 | 54 | import requests 55 | from tqdm.auto import tqdm 56 | 57 | 58 | def _download_file_from_remote_location(fpath: str, url: str) -> None: 59 | pass 60 | 61 | 62 | def _is_remote_location_available() -> bool: 63 | return False 64 | 65 | USER_AGENT = "PyPHM" 66 | 67 | 68 | def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None: 69 | with open(filename, "wb") as fh: 70 | with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response: 71 | with tqdm(total=response.length) as pbar: 72 | for chunk in iter(lambda: response.read(chunk_size), ""): 73 | if not chunk: 74 | break 75 | pbar.update(chunk_size) 76 | fh.write(chunk) 77 | 78 | 79 | def gen_bar_updater() -> Callable[[int, int, int], None]: 80 | pbar = tqdm(total=None) 81 | 82 | def bar_update(count, block_size, total_size): 83 | if pbar.total is None and total_size: 84 | pbar.total = total_size 85 | progress_bytes = count * block_size 86 | pbar.update(progress_bytes - pbar.n) 87 | 88 | return bar_update 89 | 90 | 91 | def calculate_md5(fpath: Path, chunk_size: int = 1024 * 1024) -> str: 92 | md5 = hashlib.md5() 93 | with open(fpath, "rb") as f: 94 | for chunk in iter(lambda: f.read(chunk_size), b""): 95 | md5.update(chunk) 96 | return md5.hexdigest() 97 | 98 | 99 | def check_md5(fpath: Path, md5: str, **kwargs: Any) -> bool: 100 | return md5 == calculate_md5(fpath, **kwargs) 101 | 102 | 103 | def check_integrity(fpath: Path, md5: Optional[str] = None) -> bool: 104 | fpath = Path(fpath) 105 | if not fpath.exists() and not fpath.is_file(): 106 | return False 107 | if md5 is None: 108 | return True 109 | return check_md5(fpath, md5) 110 | 111 | 112 | def _get_redirect_url(url: str, max_hops: int = 3) -> str: 113 | initial_url = url 114 | headers = {"Method": "HEAD", "User-Agent": USER_AGENT} 115 | 116 | for _ in range(max_hops + 1): 117 | with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as response: 118 | if response.url == url or response.url is None: 119 | return url 120 | 121 | url = response.url 122 | else: 123 | raise RecursionError( 124 | f"Request to {initial_url} exceeded {max_hops} redirects. 
The last redirect points to {url}." 125 | ) 126 | 127 | 128 | def _get_google_drive_file_id(url: str) -> Optional[str]: 129 | parts = urlparse(url) 130 | 131 | if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None: 132 | return None 133 | 134 | match = re.match(r"/file/d/(?P[^/]*)", parts.path) 135 | if match is None: 136 | return None 137 | 138 | return match.group("id") 139 | 140 | 141 | def download_url( 142 | url: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None, max_redirect_hops: int = 3 143 | ) -> None: 144 | """Download a file from a url and place it in root. 145 | 146 | Args: 147 | url (str): URL to download file from 148 | root (str): Directory to place downloaded file in 149 | filename (str, optional): Name to save the file under. If None, use the basename of the URL 150 | md5 (str, optional): MD5 checksum of the download. If None, do not check 151 | max_redirect_hops (int, optional): Maximum number of redirect hops allowed 152 | """ 153 | root = os.path.expanduser(root) 154 | if not filename: 155 | filename = os.path.basename(url) 156 | fpath = os.path.join(root, filename) 157 | 158 | os.makedirs(root, exist_ok=True) 159 | 160 | # check if file is already present locally 161 | if check_integrity(fpath, md5): 162 | print("Using downloaded and verified file: " + fpath) 163 | return 164 | 165 | if _is_remote_location_available(): 166 | _download_file_from_remote_location(fpath, url) 167 | else: 168 | # expand redirect chain if needed 169 | url = _get_redirect_url(url, max_hops=max_redirect_hops) 170 | 171 | # check if file is located on Google Drive 172 | file_id = _get_google_drive_file_id(url) 173 | if file_id is not None: 174 | print("Goolgle drive file id:", file_id) 175 | return gdown.download(id=file_id, output=str(Path(root) / filename), quiet=False) 176 | # return download_file_from_google_drive(file_id, root, filename, md5) 177 | 178 | # download the file 179 | try: 180 | print("Downloading " + url + " to " + fpath) 181 | _urlretrieve(url, fpath) 182 | except (urllib.error.URLError, OSError) as e: # type: ignore[attr-defined] 183 | if url[:5] == "https": 184 | url = url.replace("https:", "http:") 185 | print("Failed download. Trying https -> http instead. Downloading " + url + " to " + fpath) 186 | _urlretrieve(url, fpath) 187 | else: 188 | raise e 189 | 190 | # check integrity of downloaded file 191 | if not check_integrity(fpath, md5): 192 | raise RuntimeError("File not found or corrupted.") 193 | 194 | 195 | def list_dir(root: str, prefix: bool = False) -> List[str]: 196 | """List all directories at a given root 197 | 198 | Args: 199 | root (str): Path to directory whose folders need to be listed 200 | prefix (bool, optional): If true, prepends the path to each result, otherwise 201 | only returns the name of the directories found 202 | """ 203 | root = os.path.expanduser(root) 204 | directories = [p for p in os.listdir(root) if os.path.isdir(os.path.join(root, p))] 205 | if prefix is True: 206 | directories = [os.path.join(root, d) for d in directories] 207 | return directories 208 | 209 | 210 | def list_files(root: str, suffix: str, prefix: bool = False) -> List[str]: 211 | """List all files ending with a suffix at a given root 212 | 213 | Args: 214 | root (str): Path to directory whose folders need to be listed 215 | suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png'). 
216 | It uses the Python "str.endswith" method and is passed directly 217 | prefix (bool, optional): If true, prepends the path to each result, otherwise 218 | only returns the name of the files found 219 | """ 220 | root = os.path.expanduser(root) 221 | files = [p for p in os.listdir(root) if os.path.isfile(os.path.join(root, p)) and p.endswith(suffix)] 222 | if prefix is True: 223 | files = [os.path.join(root, d) for d in files] 224 | return files 225 | 226 | 227 | def _quota_exceeded(first_chunk: bytes) -> bool: 228 | try: 229 | return "Google Drive - Quota exceeded" in first_chunk.decode() 230 | except UnicodeDecodeError: 231 | return False 232 | 233 | 234 | def download_file_from_google_drive(file_id: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None): 235 | """Download a Google Drive file from and place it in root. 236 | 237 | Args: 238 | file_id (str): id of file to be downloaded 239 | root (str): Directory to place downloaded file in 240 | filename (str, optional): Name to save the file under. If None, use the id of the file. 241 | md5 (str, optional): MD5 checksum of the download. If None, do not check 242 | """ 243 | # Based on https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url 244 | 245 | url = "https://docs.google.com/uc?export=download" 246 | 247 | root = os.path.expanduser(root) 248 | if not filename: 249 | filename = file_id 250 | fpath = os.path.join(root, filename) 251 | 252 | os.makedirs(root, exist_ok=True) 253 | 254 | if os.path.isfile(fpath) and check_integrity(fpath, md5): 255 | print("Using downloaded and verified file: " + fpath) 256 | else: 257 | session = requests.Session() 258 | 259 | response = session.get(url, params={"id": file_id}, stream=True) 260 | token = _get_confirm_token(response) 261 | 262 | if token: 263 | params = {"id": file_id, "confirm": token} 264 | response = session.get(url, params=params, stream=True) 265 | 266 | # Ideally, one would use response.status_code to check for quota limits, but google drive is not consistent 267 | # with their own API, refer https://github.com/pytorch/vision/issues/2992#issuecomment-730614517. 268 | # Should this be fixed at some place in future, one could refactor the following to no longer rely on decoding 269 | # the first_chunk of the payload 270 | response_content_generator = response.iter_content(32768) 271 | first_chunk = None 272 | while not first_chunk: # filter out keep-alive new chunks 273 | first_chunk = next(response_content_generator) 274 | 275 | if _quota_exceeded(first_chunk): 276 | msg = ( 277 | f"The daily quota of the file {filename} is exceeded and it " 278 | f"can't be downloaded. This is a limitation of Google Drive " 279 | f"and can only be overcome by trying again later." 
280 | ) 281 | raise RuntimeError(msg) 282 | 283 | _save_response_content(itertools.chain((first_chunk,), response_content_generator), fpath) 284 | response.close() 285 | 286 | 287 | def _get_confirm_token(response: requests.models.Response) -> Optional[str]: 288 | for key, value in response.cookies.items(): 289 | if key.startswith("download_warning"): 290 | return value 291 | 292 | return None 293 | 294 | 295 | def _save_response_content( 296 | response_gen: Iterator[bytes], 297 | destination: str, 298 | ) -> None: 299 | with open(destination, "wb") as f: 300 | pbar = tqdm(total=None) 301 | progress = 0 302 | 303 | for chunk in response_gen: 304 | if chunk: # filter out keep-alive new chunks 305 | f.write(chunk) 306 | progress += len(chunk) 307 | pbar.update(progress - pbar.n) 308 | pbar.close() 309 | 310 | 311 | def _extract_tar(from_path: str, to_path: str, compression: Optional[str]) -> None: 312 | with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar: 313 | tar.extractall(to_path) 314 | 315 | 316 | def _extract_rar(from_path: str, to_path: str, compression: Optional[str]) -> None: 317 | with rarfile.RarFile(from_path, f"r:{compression[1:]}" if compression else "r") as rar: 318 | rar.extractall(to_path) 319 | 320 | 321 | def _extract_7z(from_path: str, to_path: str, compression: Optional[str]) -> None: 322 | with py7zr.SevenZipFile(from_path, f"r:{compression[1:]}" if compression else "r") as z: 323 | z.extractall(to_path) 324 | 325 | 326 | _ZIP_COMPRESSION_MAP: Dict[str, int] = { 327 | ".bz2": zipfile.ZIP_BZIP2, 328 | ".xz": zipfile.ZIP_LZMA, 329 | } 330 | 331 | 332 | def _extract_zip(from_path: str, to_path: str, compression: Optional[str]) -> None: 333 | with zipfile.ZipFile( 334 | from_path, "r", compression=_ZIP_COMPRESSION_MAP[compression] if compression else zipfile.ZIP_STORED 335 | ) as zip: 336 | zip.extractall(to_path) 337 | 338 | 339 | _ARCHIVE_EXTRACTORS: Dict[str, Callable[[str, str, Optional[str]], None]] = { 340 | ".tar": _extract_tar, 341 | ".zip": _extract_zip, 342 | ".rar": _extract_rar, 343 | ".7z": _extract_7z, 344 | } 345 | _COMPRESSED_FILE_OPENERS: Dict[str, Callable[..., IO]] = { 346 | ".bz2": bz2.open, 347 | ".gz": gzip.open, 348 | ".xz": lzma.open, 349 | } 350 | _FILE_TYPE_ALIASES: Dict[str, Tuple[Optional[str], Optional[str]]] = { 351 | ".tbz": (".tar", ".bz2"), 352 | ".tbz2": (".tar", ".bz2"), 353 | ".tgz": (".tar", ".gz"), 354 | } 355 | 356 | 357 | def _detect_file_type(file: str) -> Tuple[str, Optional[str], Optional[str]]: 358 | """Detect the archive type and/or compression of a file. 359 | 360 | Args: 361 | file (str): the filename 362 | 363 | Returns: 364 | (tuple): tuple of suffix, archive type, and compression 365 | 366 | Raises: 367 | RuntimeError: if file has no suffix or suffix is not supported 368 | """ 369 | suffixes = pathlib.Path(file).suffixes 370 | if not suffixes: 371 | raise RuntimeError( 372 | f"File '{file}' has no suffixes that could be used to detect the archive type and compression." 
373 | ) 374 | suffix = suffixes[-1] 375 | 376 | # check if the suffix is a known alias 377 | if suffix in _FILE_TYPE_ALIASES: 378 | return (suffix, *_FILE_TYPE_ALIASES[suffix]) 379 | 380 | # check if the suffix is an archive type 381 | if suffix in _ARCHIVE_EXTRACTORS: 382 | return suffix, suffix, None 383 | 384 | # check if the suffix is a compression 385 | if suffix in _COMPRESSED_FILE_OPENERS: 386 | # check for suffix hierarchy 387 | if len(suffixes) > 1: 388 | suffix2 = suffixes[-2] 389 | 390 | # check if the suffix2 is an archive type 391 | if suffix2 in _ARCHIVE_EXTRACTORS: 392 | return suffix2 + suffix, suffix2, suffix 393 | 394 | return suffix, None, suffix 395 | 396 | valid_suffixes = sorted(set(_FILE_TYPE_ALIASES) | set(_ARCHIVE_EXTRACTORS) | set(_COMPRESSED_FILE_OPENERS)) 397 | raise RuntimeError(f"Unknown compression or archive type: '{suffix}'.\nKnown suffixes are: '{valid_suffixes}'.") 398 | 399 | 400 | def _decompress(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str: 401 | r"""Decompress a file. 402 | 403 | The compression is automatically detected from the file name. 404 | 405 | Args: 406 | from_path (str): Path to the file to be decompressed. 407 | to_path (str): Path to the decompressed file. If omitted, ``from_path`` without compression extension is used. 408 | remove_finished (bool): If ``True``, remove the file after the extraction. 409 | 410 | Returns: 411 | (str): Path to the decompressed file. 412 | """ 413 | suffix, archive_type, compression = _detect_file_type(from_path) 414 | if not compression: 415 | raise RuntimeError(f"Couldn't detect a compression from suffix {suffix}.") 416 | 417 | if to_path is None: 418 | to_path = from_path.replace(suffix, archive_type if archive_type is not None else "") 419 | 420 | # We don't need to check for a missing key here, since this was already done in _detect_file_type() 421 | compressed_file_opener = _COMPRESSED_FILE_OPENERS[compression] 422 | 423 | with compressed_file_opener(from_path, "rb") as rfh, open(to_path, "wb") as wfh: 424 | wfh.write(rfh.read()) 425 | 426 | if remove_finished: 427 | os.remove(from_path) 428 | 429 | return to_path 430 | 431 | 432 | def extract_archive(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str: 433 | """Extract an archive. 434 | 435 | The archive type and a possible compression is automatically detected from the file name. If the file is compressed 436 | but not an archive the call is dispatched to :func:`decompress`. 437 | 438 | Args: 439 | from_path (str): Path to the file to be extracted. 440 | to_path (str): Path to the directory the file will be extracted to. If omitted, the directory of the file is 441 | used. 442 | remove_finished (bool): If ``True``, remove the file after the extraction. 443 | 444 | Returns: 445 | (str): Path to the directory the file was extracted to. 
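    Example:
        A minimal sketch (the archive path below is hypothetical)::

            # extract mill.zip into the directory that contains it, keeping the original archive
            extracted_dir = extract_archive("data/raw/milling/mill.zip", remove_finished=False)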
446 | """ 447 | if to_path is None: 448 | to_path = os.path.dirname(from_path) 449 | 450 | suffix, archive_type, compression = _detect_file_type(from_path) 451 | if not archive_type: 452 | return _decompress( 453 | from_path, 454 | os.path.join(to_path, os.path.basename(from_path).replace(suffix, "")), 455 | remove_finished=remove_finished, 456 | ) 457 | 458 | # We don't need to check for a missing key here, since this was already done in _detect_file_type() 459 | extractor = _ARCHIVE_EXTRACTORS[archive_type] 460 | 461 | extractor(from_path, to_path, compression) 462 | if remove_finished: 463 | os.remove(from_path) 464 | 465 | return to_path 466 | 467 | 468 | def download_and_extract_archive( 469 | url: str, 470 | download_root: str, 471 | extract_root: Optional[str] = None, 472 | filename: Optional[str] = None, 473 | md5: Optional[str] = None, 474 | remove_finished: bool = False, 475 | ) -> None: 476 | download_root = os.path.expanduser(download_root) 477 | if extract_root is None: 478 | extract_root = download_root 479 | if not filename: 480 | filename = os.path.basename(url) 481 | 482 | download_url(url, download_root, filename, md5) 483 | 484 | archive = os.path.join(download_root, filename) 485 | print(f"Extracting {archive} to {extract_root}") 486 | extract_archive(archive, extract_root, remove_finished) 487 | 488 | 489 | def iterable_to_str(iterable: Iterable) -> str: 490 | return "'" + "', '".join([str(item) for item in iterable]) + "'" 491 | 492 | 493 | T = TypeVar("T", str, bytes) 494 | 495 | 496 | # def verify_str_arg( 497 | # value: T, 498 | # arg: Optional[str] = None, 499 | # valid_values: Iterable[T] = None, 500 | # custom_msg: Optional[str] = None, 501 | # ) -> T: 502 | # if not isinstance(value, torch._six.string_classes): 503 | # if arg is None: 504 | # msg = "Expected type str, but got type {type}." 505 | # else: 506 | # msg = "Expected type str for argument {arg}, but got type {type}." 507 | # msg = msg.format(type=type(value), arg=arg) 508 | # raise ValueError(msg) 509 | 510 | # if valid_values is None: 511 | # return value 512 | 513 | # if value not in valid_values: 514 | # if custom_msg is not None: 515 | # msg = custom_msg 516 | # else: 517 | # msg = "Unknown value '{value}' for argument {arg}. Valid values are {{{valid_values}}}." 518 | # msg = msg.format(value=value, arg=arg, valid_values=iterable_to_str(valid_values)) 519 | # raise ValueError(msg) 520 | 521 | # return value 522 | -------------------------------------------------------------------------------- /notebooks/scratch/ims_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\n", 10 | "from pyphm.datasets.ims import ImsPrepMethodA\n", 11 | "from pathlib import Path\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "import time\n", 16 | "import datetime\n", 17 | "import csv\n", 18 | "\n", 19 | "\n", 20 | "%load_ext autoreload\n", 21 | "%autoreload 2" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "/home/tim/Documents/PyPHM/data/raw\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# define the location of where the raw data folders will be kept.\n", 39 | "# e.g. 
the ims data will be in path_data_raw_folder/ims/ \n", 40 | "path_data_raw_folder = Path(Path.cwd().parent.parent / 'data/raw/' )\n", 41 | "print(path_data_raw_folder)\n", 42 | "\n", 43 | "# create the path_data_raw_folder if it does not exist\n", 44 | "path_data_raw_folder.mkdir(parents=True, exist_ok=True)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Downloading https://drive.google.com/file/d/1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx/view?usp=sharingIMS.7z\n", 57 | "Goolgle drive file id: 1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx\n" 58 | ] 59 | }, 60 | { 61 | "name": "stderr", 62 | "output_type": "stream", 63 | "text": [ 64 | "Downloading...\n", 65 | "From: https://drive.google.com/uc?id=1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx\n", 66 | "To: /home/tim/Documents/PyPHM/data/raw/ims/IMS.7z\n", 67 | " 49%|████▉ | 532M/1.08G [00:12<00:19, 28.4MB/s] " 68 | ] 69 | }, 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "\n" 75 | ] 76 | }, 77 | { 78 | "ename": "KeyboardInterrupt", 79 | "evalue": "", 80 | "output_type": "error", 81 | "traceback": [ 82 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 83 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 84 | "\u001b[0;32m/tmp/ipykernel_93187/765225230.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# instantiate the ImsPrepMethodA class and download data if it does not exist\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mims\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImsPrepMethodA\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath_data_raw_folder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 85 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, dataset_folder_name, download)\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m ) -> None:\n\u001b[0;32m--> 328\u001b[0;31m super().__init__(\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0mdataset_folder_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 86 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, dataset_folder_name, download, dataset_path, data, sample_freq)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_exists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Downloading {url}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m download_and_extract_archive(\n\u001b[0m\u001b[1;32m 107\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_root\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmd5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m )\n", 88 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_and_extract_archive\u001b[0;34m(url, download_root, extract_root, filename, md5, remove_finished)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0mfilename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbasename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m \u001b[0mdownload_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_root\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 483\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[0marchive\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdownload_root\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 89 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[0;34m(url, root, filename, md5, max_redirect_hops)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfile_id\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Goolgle drive file id:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mgdown\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfile_id\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquiet\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0;31m# return download_file_from_google_drive(file_id, root, filename, md5)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 90 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/gdown/download.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(url, output, quiet, proxy, speed, use_cookies, verify, id, fuzzy, resume)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mpbar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit_scale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0mt_start\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miter_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mCHUNK_SIZE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mquiet\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 91 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/requests/models.py\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'stream'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 757\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 758\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 759\u001b[0m \u001b[0;32myield\u001b[0m 
\u001b[0mchunk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 760\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mProtocolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mstream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 574\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_fp_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 576\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdecode_content\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 577\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 93 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[0mcache_content\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 519\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfp_closed\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34mb\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 520\u001b[0m if (\n\u001b[1;32m 521\u001b[0m \u001b[0mamt\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 94 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0;31m# Amount is given, implement using readinto\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 458\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 459\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 460\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 95 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 96 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 97 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1239\u001b[0m \u001b[0;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1240\u001b[0m self.__class__)\n\u001b[0;32m-> 1241\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1242\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
1243\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 98 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1099\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1100\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 99 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 100 | ] 101 | }, 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | " 49%|████▉ | 532M/1.08G [00:29<00:19, 28.4MB/s]" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# instantiate the ImsPrepMethodA class and download data if it does not exist\n", 112 | "ims = ImsPrepMethodA(root=path_data_raw_folder, download=True)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3.8.12 ('featstore')", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.8.12" 140 | }, 141 | "orig_nbformat": 4, 142 | "vscode": { 143 | "interpreter": { 144 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16" 145 | } 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | --------------------------------------------------------------------------------
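A short sketch of how the helpers in src/pyphm/datasets/utils.py compose, following the pattern used by the dataset classes above. The URL, folder, filename, and MD5 checksum below are placeholders, not real values:

    from pathlib import Path
    from pyphm.datasets.utils import download_and_extract_archive, check_integrity

    url = "https://example.com/archives/sample_data.zip"  # placeholder URL
    root = Path("data/raw/sample")                        # placeholder download folder
    md5 = "0123456789abcdef0123456789abcdef"              # placeholder checksum

    # download the archive (skipped if a verified copy already exists), check its MD5,
    # and extract it into `root`; the archive is kept since remove_finished defaults to False
    download_and_extract_archive(url, download_root=str(root), filename="sample_data.zip", md5=md5)

    # the retained archive can be re-verified against the same checksum
    assert check_integrity(root / "sample_data.zip", md5)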