├── .gitignore ├── LICENSE ├── README.md ├── data ├── interim │ ├── labeled_data_CN7.csv │ ├── labeled_data_RG3.csv │ └── labeled_data_preprocessed.csv ├── processed │ ├── labeled_data_CN7.csv │ └── labeled_data_RG3.csv └── raw │ └── labeled_data.csv ├── env.yml ├── img ├── clf_report_cn7_AE.png ├── clf_report_cn7_MD.png ├── clf_report_cn7_ml.png ├── clf_report_rg3_AE.png ├── clf_report_rg3_MD.png ├── clf_report_rg3_ml.png ├── cn7_parameter_distribution.png ├── feature_importance_1.png ├── feature_importance_2.png ├── feature_importance_3.png ├── feature_importance_4.png ├── process_parameter_distribution.png └── rg3_parameter_distribution.png ├── notebooks ├── 0_data_preprocessing.ipynb ├── 1_exploratory_data_analysis.ipynb ├── 2-1_EDA_on_CN7.ipynb ├── 2-2_EDA_on_RG3.ipynb ├── 3-1_ML_on_CN7.ipynb ├── 3-2_ML_on_CN7_MD.ipynb ├── 3-3_ML_on_CN7_Variational_AE.ipynb ├── 4-1_ML_on_RG3.ipynb ├── 4-2_ML_on_RG3_MD.ipynb ├── 4-3_ML_on_RG3_Variational_AE.ipynb └── 5_feature_importances_CN7.ipynb └── src └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 John W.S. 
Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![license 2 | status](https://img.shields.io/github/license/johnwslee/injection_molding_analysis) 3 | 4 | # Classification of Defective Parts in Injection Molding Using Various Machine Learning Approaches 5 | 6 | **Author:** John W.S. Lee 7 | 8 | ## 1. Introduction 9 | 10 | In this study, efforts were made to classify defects in parts produced by injection molding processes. Three different modeling approaches, namely supervised learning models, Mahalanobis Distance model, and Variational AutoEncoder model, were implemented and their performances were compared. 
11 | 12 | The dataset used in this study was downloaded from the Korea AI Manufacturing Platform, also known as [KAMP](https://www.kamp-ai.kr/aidataDetail?AI_SEARCH=&page=1&DATASET_SEQ=4&EQUIP_SEL=&GUBUN_SEL=&FILE_TYPE_SEL=&WDATE_SEL=). Although the website is written in Korean, the contents of the dataset were mostly written in English. The dataset consisted of 7,996 rows with 45 columns. One of the columns represented the target label, `PassOrFail`. 13 | 14 | The following is a summary of this study. For the detailed code and notebooks used in this study, please refer to the [notebooks folder](https://github.com/johnwslee/injection_molding_analysis/tree/main/notebooks). 15 | 16 | ## 2. Summary of Study 17 | 18 | ### 2.1. Basic Exploratory Data Analysis 19 | 20 | The dataset covered 4 different injection-molded parts: two part types, `CN7` and `RG3`, each with Left-Hand and Right-Hand components. The figure below shows the distribution of the processing parameters for each of these 4 combinations. As shown in the figure, the processing parameters for `CN7` and `RG3` exhibited very different distributions, whereas the difference between the Left-Hand and Right-Hand components was small. Therefore, it was reasonable to proceed with two separate models, one for `CN7` and one for `RG3`. 21 | 22 | 23 | 24 | ### 2.2. Exploratory Data Analysis for `CN7` and `RG3` 25 | 26 | For each type of injection-molded part, the distributions of the processing parameters were compared for passed parts (i.e., good parts) and failed parts (i.e., defective parts). In the case of `CN7`, there seemed to be some difference in the distributions of the processing parameters between passed and failed parts. However, the difference for `RG3` seemed to be less obvious than that for `CN7`. 27 | 28 | #### Distribution of Processing Parameters for `CN7` 29 | 30 | 31 | 32 | #### Distribution of Processing Parameters for `RG3` 33 | 34 | 35 | 36 | ### 2.3. 
Classification of Defective Parts for `CN7` and `RG3` 37 | 38 | As mentioned above, 3 different machine learning approaches were implemented for the purpose of classifying the defective injection-molded parts. The detailed code can be found in the [notebooks folder](https://github.com/johnwslee/injection_molding_analysis/tree/main/notebooks). Since there was a significant class imbalance, the f1-score was used as the evaluation metric. 39 | 40 | For `CN7`, the f1-scores for the supervised learning models, the Mahalanobis Distance model, and the Variational AutoEncoder model were 0.67, 0.55, and 0.73, respectively. 41 | 42 | For `RG3`, the f1-scores for the supervised learning models, the Mahalanobis Distance model, and the Variational AutoEncoder model were 0, 0.3, and 0.24, respectively. 43 | 44 | 

45 | 46 | 47 |

48 |

49 | 50 | 51 |

52 |

53 | 54 | 55 |

56 | 57 | Clearly, the effectiveness of the models was different for `CN7` and `RG3` parts. In particular, it was surprising that the f1-score could be improved from 0 to 0.27 by switching from the supervised learning models to the Mahalanobis Distance model. It should also be noted that the choice of the thresholds for the Mahalanobis Distance model and the Variational AutoEncoder model played a significant role in determining their performance. 58 | 59 | ### 2.4. Feature Importances 60 | 61 | Feature importances for `CN7` parts were checked on 3 models (i.e., SVC, RandomForest, and LightGBM) using the models' built-in functions and the `shap` library. 62 | 63 | The importance of each feature appeared to be slightly different depending on the models and the methods used. It turned out that "Max Injection Speed", "Filling Time", "Mold Temperature 4", "Barrel Temperature 1", and "Plasticizing Position" were the processing parameters that the models considered important. 64 | 65 | 

66 | 67 | 68 |

69 |

70 | 71 | 72 |

73 | 74 | 75 | ## 3. Conclusion 76 | 77 | This study demonstrated how various machine learning approaches performed in the classification of defective parts in injection molding. It turned out that the performance of each approach varied depending on the dataset, `CN7` or `RG3` in this study. For `CN7` parts, the Variational AutoEncoder model performed best, whereas the Mahalanobis Distance model performed best for `RG3`. This suggests that it is important to try several machine learning approaches to find the best-performing approach for a given dataset. 78 | 79 | ## How to Run the Notebooks Locally 80 | 81 | To download the contents of this GitHub repository onto your local machine, follow these steps: 82 | 83 | 1. Copy and paste the following command into your terminal: `git clone https://github.com/johnwslee/injection_molding_analysis.git` 84 | 85 | 2. In your terminal, type: `cd injection_molding_analysis`. 86 | 87 | 3. Create a conda environment by typing: `conda env create -f env.yml` 88 | 89 | 4. Activate the conda environment by typing: `conda activate inj_env` 90 | 91 | 5. Run the notebooks in the `notebooks` folder in order. 
-------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | # install with conda env create -f env.yml 2 | name: 'inj_env' 3 | channels: 4 | - conda-forge 5 | - defaults 6 | - pytorch 7 | dependencies: 8 | - python=3.10.8 9 | - ipykernel 10 | - pip 11 | - pandas=1.5.2 12 | - matplotlib=3.6.2 13 | - scikit-learn=1.2.0 14 | - scipy=1.9.3 15 | - seaborn=0.12.1 16 | - black=22.10.0 17 | - flake8=6.0.0 18 | - pytorch=1.12.1 19 | - torchvision=0.13.1 20 | - lightgbm=3.3.3 21 | - ipywidgets 22 | - mkl=2021.4.0 23 | - shap=0.41.0 24 | - pip: 25 | - plotly==5.11.0 26 | - kaleido==0.2.1 27 | - tslearn==0.5.2 28 | - xgboost==1.7.2 29 | - catboost==1.1.1 30 | - torch-summary==1.4.5 31 | - otter-grader 32 | 33 | -------------------------------------------------------------------------------- /img/clf_report_cn7_AE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/clf_report_cn7_AE.png -------------------------------------------------------------------------------- /img/clf_report_cn7_MD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/clf_report_cn7_MD.png -------------------------------------------------------------------------------- /img/clf_report_cn7_ml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/clf_report_cn7_ml.png -------------------------------------------------------------------------------- /img/clf_report_rg3_AE.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/clf_report_rg3_AE.png -------------------------------------------------------------------------------- /img/clf_report_rg3_MD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/clf_report_rg3_MD.png -------------------------------------------------------------------------------- /img/clf_report_rg3_ml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/clf_report_rg3_ml.png -------------------------------------------------------------------------------- /img/cn7_parameter_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/cn7_parameter_distribution.png -------------------------------------------------------------------------------- /img/feature_importance_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/feature_importance_1.png -------------------------------------------------------------------------------- /img/feature_importance_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/feature_importance_2.png -------------------------------------------------------------------------------- /img/feature_importance_3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/feature_importance_3.png -------------------------------------------------------------------------------- /img/feature_importance_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/feature_importance_4.png -------------------------------------------------------------------------------- /img/process_parameter_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/process_parameter_distribution.png -------------------------------------------------------------------------------- /img/rg3_parameter_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnwslee/injection_molding_analysis/faa04b2923d0e75eda569eda522ba1f383cbd728/img/rg3_parameter_distribution.png -------------------------------------------------------------------------------- /notebooks/0_data_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bfeabc3c-3ebe-4c8f-85ad-e3c058209de0", 6 | "metadata": {}, 7 | "source": [ 8 | "# 0. Imports" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "97be1e08-f742-4664-9aae-cd34068183dc", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "2ef4e5d7-3179-45ca-8d50-052c561a3e03", 24 | "metadata": {}, 25 | "source": [ 26 | "# 1. 
Data Read In" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "cf967899-6162-455a-be84-0a4a53366a5d", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df = pd.read_csv(\"../data/raw/labeled_data.csv\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "d8224552-77f4-459c-93e8-b2094e2e24b1", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "
\n", 49 | "\n", 62 | "\n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | "
_idTimeStampPART_FACT_PLAN_DATEPART_FACT_SERIALPART_NAMEEQUIP_CDEQUIP_NAMEPassOrFailReasonInjection_Time...Mold_Temperature_3Mold_Temperature_4Mold_Temperature_5Mold_Temperature_6Mold_Temperature_7Mold_Temperature_8Mold_Temperature_9Mold_Temperature_10Mold_Temperature_11Mold_Temperature_12
05f8928bb9c0189cc666ef19b2020-10-16 04:57:472020-10-16 오전 12:00:0024CN7 W/S SIDE MLD'G RHS14650톤-우진2호기YNone9.59...24.79999927.50.00.00.00.00.00.00.00.0
15f8928de9c0189cc666ef20b2020-10-16 04:58:482020-10-16 오전 12:00:0024CN7 W/S SIDE MLD'G RHS14650톤-우진2호기YNone9.60...24.79999927.60.00.00.00.00.00.00.00.0
25f8928df9c0189cc666ef2132020-10-16 04:58:482020-10-16 오전 12:00:0023CN7 W/S SIDE MLD'G LHS14650톤-우진2호기YNone9.60...24.79999927.60.00.00.00.00.00.00.00.0
35f8928f39c0189cc666ef25e2020-10-16 04:59:482020-10-16 오전 12:00:0023CN7 W/S SIDE MLD'G LHS14650톤-우진2호기YNone9.59...25.00000027.60.00.00.00.00.00.00.00.0
45f8928f59c0189cc666ef2652020-10-16 04:59:482020-10-16 오전 12:00:0024CN7 W/S SIDE MLD'G RHS14650톤-우진2호기YNone9.59...25.00000027.60.00.00.00.00.00.00.00.0
\n", 212 | "

5 rows × 45 columns

\n", 213 | "
" 214 | ], 215 | "text/plain": [ 216 | " _id TimeStamp PART_FACT_PLAN_DATE \\\n", 217 | "0 5f8928bb9c0189cc666ef19b 2020-10-16 04:57:47 2020-10-16 오전 12:00:00 \n", 218 | "1 5f8928de9c0189cc666ef20b 2020-10-16 04:58:48 2020-10-16 오전 12:00:00 \n", 219 | "2 5f8928df9c0189cc666ef213 2020-10-16 04:58:48 2020-10-16 오전 12:00:00 \n", 220 | "3 5f8928f39c0189cc666ef25e 2020-10-16 04:59:48 2020-10-16 오전 12:00:00 \n", 221 | "4 5f8928f59c0189cc666ef265 2020-10-16 04:59:48 2020-10-16 오전 12:00:00 \n", 222 | "\n", 223 | " PART_FACT_SERIAL PART_NAME EQUIP_CD EQUIP_NAME PassOrFail \\\n", 224 | "0 24 CN7 W/S SIDE MLD'G RH S14 650톤-우진2호기 Y \n", 225 | "1 24 CN7 W/S SIDE MLD'G RH S14 650톤-우진2호기 Y \n", 226 | "2 23 CN7 W/S SIDE MLD'G LH S14 650톤-우진2호기 Y \n", 227 | "3 23 CN7 W/S SIDE MLD'G LH S14 650톤-우진2호기 Y \n", 228 | "4 24 CN7 W/S SIDE MLD'G RH S14 650톤-우진2호기 Y \n", 229 | "\n", 230 | " Reason Injection_Time ... Mold_Temperature_3 Mold_Temperature_4 \\\n", 231 | "0 None 9.59 ... 24.799999 27.5 \n", 232 | "1 None 9.60 ... 24.799999 27.6 \n", 233 | "2 None 9.60 ... 24.799999 27.6 \n", 234 | "3 None 9.59 ... 25.000000 27.6 \n", 235 | "4 None 9.59 ... 
25.000000 27.6 \n", 236 | "\n", 237 | " Mold_Temperature_5 Mold_Temperature_6 Mold_Temperature_7 \\\n", 238 | "0 0.0 0.0 0.0 \n", 239 | "1 0.0 0.0 0.0 \n", 240 | "2 0.0 0.0 0.0 \n", 241 | "3 0.0 0.0 0.0 \n", 242 | "4 0.0 0.0 0.0 \n", 243 | "\n", 244 | " Mold_Temperature_8 Mold_Temperature_9 Mold_Temperature_10 \\\n", 245 | "0 0.0 0.0 0.0 \n", 246 | "1 0.0 0.0 0.0 \n", 247 | "2 0.0 0.0 0.0 \n", 248 | "3 0.0 0.0 0.0 \n", 249 | "4 0.0 0.0 0.0 \n", 250 | "\n", 251 | " Mold_Temperature_11 Mold_Temperature_12 \n", 252 | "0 0.0 0.0 \n", 253 | "1 0.0 0.0 \n", 254 | "2 0.0 0.0 \n", 255 | "3 0.0 0.0 \n", 256 | "4 0.0 0.0 \n", 257 | "\n", 258 | "[5 rows x 45 columns]" 259 | ] 260 | }, 261 | "execution_count": 3, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "df.head()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 4, 273 | "id": "03206431-93f9-4a43-97e8-dd7d7d42099a", 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "\n", 281 | "RangeIndex: 7996 entries, 0 to 7995\n", 282 | "Data columns (total 45 columns):\n", 283 | " # Column Non-Null Count Dtype \n", 284 | "--- ------ -------------- ----- \n", 285 | " 0 _id 7996 non-null object \n", 286 | " 1 TimeStamp 7996 non-null object \n", 287 | " 2 PART_FACT_PLAN_DATE 7996 non-null object \n", 288 | " 3 PART_FACT_SERIAL 7996 non-null int64 \n", 289 | " 4 PART_NAME 7996 non-null object \n", 290 | " 5 EQUIP_CD 7996 non-null object \n", 291 | " 6 EQUIP_NAME 7996 non-null object \n", 292 | " 7 PassOrFail 7996 non-null object \n", 293 | " 8 Reason 7996 non-null object \n", 294 | " 9 Injection_Time 7996 non-null float64\n", 295 | " 10 Filling_Time 7996 non-null float64\n", 296 | " 11 Plasticizing_Time 7996 non-null float64\n", 297 | " 12 Cycle_Time 7996 non-null float64\n", 298 | " 13 Clamp_Close_Time 7996 non-null float64\n", 299 | " 14 Cushion_Position 7996 non-null float64\n", 
300 | " 15 Switch_Over_Position 7996 non-null float64\n", 301 | " 16 Plasticizing_Position 7996 non-null float64\n", 302 | " 17 Clamp_Open_Position 7996 non-null float64\n", 303 | " 18 Max_Injection_Speed 7996 non-null float64\n", 304 | " 19 Max_Screw_RPM 7996 non-null float64\n", 305 | " 20 Average_Screw_RPM 7996 non-null float64\n", 306 | " 21 Max_Injection_Pressure 7996 non-null float64\n", 307 | " 22 Max_Switch_Over_Pressure 7996 non-null float64\n", 308 | " 23 Max_Back_Pressure 7996 non-null float64\n", 309 | " 24 Average_Back_Pressure 7996 non-null float64\n", 310 | " 25 Barrel_Temperature_1 7996 non-null float64\n", 311 | " 26 Barrel_Temperature_2 7996 non-null float64\n", 312 | " 27 Barrel_Temperature_3 7996 non-null float64\n", 313 | " 28 Barrel_Temperature_4 7996 non-null float64\n", 314 | " 29 Barrel_Temperature_5 7996 non-null float64\n", 315 | " 30 Barrel_Temperature_6 7996 non-null float64\n", 316 | " 31 Barrel_Temperature_7 7996 non-null float64\n", 317 | " 32 Hopper_Temperature 7996 non-null float64\n", 318 | " 33 Mold_Temperature_1 7996 non-null float64\n", 319 | " 34 Mold_Temperature_2 7996 non-null float64\n", 320 | " 35 Mold_Temperature_3 7996 non-null float64\n", 321 | " 36 Mold_Temperature_4 7996 non-null float64\n", 322 | " 37 Mold_Temperature_5 7996 non-null float64\n", 323 | " 38 Mold_Temperature_6 7996 non-null float64\n", 324 | " 39 Mold_Temperature_7 7996 non-null float64\n", 325 | " 40 Mold_Temperature_8 7996 non-null float64\n", 326 | " 41 Mold_Temperature_9 7996 non-null float64\n", 327 | " 42 Mold_Temperature_10 7996 non-null float64\n", 328 | " 43 Mold_Temperature_11 7996 non-null float64\n", 329 | " 44 Mold_Temperature_12 7996 non-null float64\n", 330 | "dtypes: float64(36), int64(1), object(8)\n", 331 | "memory usage: 2.7+ MB\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "df.info()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "f7ad5237-31b7-417b-9e13-9e5f42ac52a8", 342 | "metadata": {}, 343 | 
"source": [ 344 | "# 2. Data Preprocessing" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "e1c53ea4-ba06-43ff-9a31-7706603a1711", 350 | "metadata": {}, 351 | "source": [ 352 | "## 2.1. Dropping the columns that have only one unique value" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 5, 358 | "id": "ac3966d1-bdb2-47e4-9e6f-c8590dabd979", 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/plain": [ 364 | "_id 5232\n", 365 | "TimeStamp 2625\n", 366 | "PART_FACT_PLAN_DATE 13\n", 367 | "PART_FACT_SERIAL 17\n", 368 | "PART_NAME 6\n", 369 | "EQUIP_CD 3\n", 370 | "EQUIP_NAME 3\n", 371 | "PassOrFail 2\n", 372 | "Reason 4\n", 373 | "Injection_Time 36\n", 374 | "Filling_Time 35\n", 375 | "Plasticizing_Time 121\n", 376 | "Cycle_Time 48\n", 377 | "Clamp_Close_Time 13\n", 378 | "Cushion_Position 22\n", 379 | "Switch_Over_Position 3\n", 380 | "Plasticizing_Position 49\n", 381 | "Clamp_Open_Position 5\n", 382 | "Max_Injection_Speed 55\n", 383 | "Max_Screw_RPM 11\n", 384 | "Average_Screw_RPM 16\n", 385 | "Max_Injection_Pressure 35\n", 386 | "Max_Switch_Over_Pressure 69\n", 387 | "Max_Back_Pressure 68\n", 388 | "Average_Back_Pressure 53\n", 389 | "Barrel_Temperature_1 67\n", 390 | "Barrel_Temperature_2 60\n", 391 | "Barrel_Temperature_3 45\n", 392 | "Barrel_Temperature_4 79\n", 393 | "Barrel_Temperature_5 56\n", 394 | "Barrel_Temperature_6 37\n", 395 | "Barrel_Temperature_7 2\n", 396 | "Hopper_Temperature 94\n", 397 | "Mold_Temperature_1 1\n", 398 | "Mold_Temperature_2 1\n", 399 | "Mold_Temperature_3 56\n", 400 | "Mold_Temperature_4 55\n", 401 | "Mold_Temperature_5 1\n", 402 | "Mold_Temperature_6 1\n", 403 | "Mold_Temperature_7 1\n", 404 | "Mold_Temperature_8 1\n", 405 | "Mold_Temperature_9 1\n", 406 | "Mold_Temperature_10 1\n", 407 | "Mold_Temperature_11 1\n", 408 | "Mold_Temperature_12 1\n", 409 | "dtype: int64" 410 | ] 411 | }, 412 | "execution_count": 5, 413 | "metadata": {}, 414 | "output_type": 
"execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "# Check on the Unique value\n", 419 | "\n", 420 | "df.nunique()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 6, 426 | "id": "f25568ca-d13f-4db1-8cbc-4b7980f91dd1", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# Mold_Temperature_s that have only one unique value are dropped\n", 431 | "\n", 432 | "df.drop(\n", 433 | " columns=[\"Mold_Temperature_1\", \"Mold_Temperature_2\", \"Mold_Temperature_5\", \n", 434 | " \"Mold_Temperature_6\", \"Mold_Temperature_7\", \"Mold_Temperature_8\",\n", 435 | " \"Mold_Temperature_9\", \"Mold_Temperature_10\", \"Mold_Temperature_11\",\n", 436 | " \"Mold_Temperature_12\"],\n", 437 | " inplace=True\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "2401c4fd-8cd8-434f-ac18-dfeadce7b0ac", 444 | "metadata": {}, 445 | "source": [ 446 | "## 2.2. Removal of duplicated rows" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 7, 452 | "id": "f2ace337-690b-4776-9a5c-712cc7198e53", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "# The number of unique _id is smaller than the number of rows --> Duplicated rows are removed\n", 457 | "\n", 458 | "df.drop_duplicates(keep=\"first\", inplace=True)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 8, 464 | "id": "a3ef0eda-b637-4083-b3fa-3a133e89ed7c", 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "name": "stdout", 469 | "output_type": "stream", 470 | "text": [ 471 | "\n", 472 | "Int64Index: 5232 entries, 0 to 5231\n", 473 | "Data columns (total 35 columns):\n", 474 | " # Column Non-Null Count Dtype \n", 475 | "--- ------ -------------- ----- \n", 476 | " 0 _id 5232 non-null object \n", 477 | " 1 TimeStamp 5232 non-null object \n", 478 | " 2 PART_FACT_PLAN_DATE 5232 non-null object \n", 479 | " 3 PART_FACT_SERIAL 5232 non-null int64 \n", 480 | " 4 PART_NAME 5232 non-null object 
\n", 481 | " 5 EQUIP_CD 5232 non-null object \n", 482 | " 6 EQUIP_NAME 5232 non-null object \n", 483 | " 7 PassOrFail 5232 non-null object \n", 484 | " 8 Reason 5232 non-null object \n", 485 | " 9 Injection_Time 5232 non-null float64\n", 486 | " 10 Filling_Time 5232 non-null float64\n", 487 | " 11 Plasticizing_Time 5232 non-null float64\n", 488 | " 12 Cycle_Time 5232 non-null float64\n", 489 | " 13 Clamp_Close_Time 5232 non-null float64\n", 490 | " 14 Cushion_Position 5232 non-null float64\n", 491 | " 15 Switch_Over_Position 5232 non-null float64\n", 492 | " 16 Plasticizing_Position 5232 non-null float64\n", 493 | " 17 Clamp_Open_Position 5232 non-null float64\n", 494 | " 18 Max_Injection_Speed 5232 non-null float64\n", 495 | " 19 Max_Screw_RPM 5232 non-null float64\n", 496 | " 20 Average_Screw_RPM 5232 non-null float64\n", 497 | " 21 Max_Injection_Pressure 5232 non-null float64\n", 498 | " 22 Max_Switch_Over_Pressure 5232 non-null float64\n", 499 | " 23 Max_Back_Pressure 5232 non-null float64\n", 500 | " 24 Average_Back_Pressure 5232 non-null float64\n", 501 | " 25 Barrel_Temperature_1 5232 non-null float64\n", 502 | " 26 Barrel_Temperature_2 5232 non-null float64\n", 503 | " 27 Barrel_Temperature_3 5232 non-null float64\n", 504 | " 28 Barrel_Temperature_4 5232 non-null float64\n", 505 | " 29 Barrel_Temperature_5 5232 non-null float64\n", 506 | " 30 Barrel_Temperature_6 5232 non-null float64\n", 507 | " 31 Barrel_Temperature_7 5232 non-null float64\n", 508 | " 32 Hopper_Temperature 5232 non-null float64\n", 509 | " 33 Mold_Temperature_3 5232 non-null float64\n", 510 | " 34 Mold_Temperature_4 5232 non-null float64\n", 511 | "dtypes: float64(26), int64(1), object(8)\n", 512 | "memory usage: 1.4+ MB\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "df.info()" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "id": "3104e75a-9359-46cb-b9e1-2ad56bdbe492", 523 | "metadata": {}, 524 | "source": [ 525 | "## 2.3. 
Change the time format for the column, `PART_FACT_PLAN_DATE`" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 9, 531 | "id": "e225f665-c1f6-4733-b6fc-833eb76eacf7", 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "array(['2020-10-16 오전 12:00:00', '2020-10-20 오전 12:00:00',\n", 538 | " '2020-10-21 오전 12:00:00', '2020-10-22 오전 12:00:00',\n", 539 | " '2020-10-23 오전 12:00:00', '2020-10-27 오전 12:00:00',\n", 540 | " '2020-10-28 오전 12:00:00', '2020-10-29 오전 12:00:00',\n", 541 | " '2020-10-30 오전 12:00:00', '2020-11-03 오전 12:00:00',\n", 542 | " '2020-11-04 오전 12:00:00', '2020-11-05 오전 12:00:00',\n", 543 | " '2020-11-06 오전 12:00:00'], dtype=object)" 544 | ] 545 | }, 546 | "execution_count": 9, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "df[\"PART_FACT_PLAN_DATE\"].unique()" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 10, 558 | "id": "f6c432b8-d445-4bc7-8f05-ee3ae002e87e", 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "df[\"PART_FACT_PLAN_DATE\"] = df[\"PART_FACT_PLAN_DATE\"].str.replace(\"오전 12\", \"00\")" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 11, 568 | "id": "9ba9f9b3-0203-45ec-828c-5f4639f3d840", 569 | "metadata": {}, 570 | "outputs": [ 571 | { 572 | "data": { 573 | "text/plain": [ 574 | "array(['2020-10-16 00:00:00', '2020-10-20 00:00:00',\n", 575 | " '2020-10-21 00:00:00', '2020-10-22 00:00:00',\n", 576 | " '2020-10-23 00:00:00', '2020-10-27 00:00:00',\n", 577 | " '2020-10-28 00:00:00', '2020-10-29 00:00:00',\n", 578 | " '2020-10-30 00:00:00', '2020-11-03 00:00:00',\n", 579 | " '2020-11-04 00:00:00', '2020-11-05 00:00:00',\n", 580 | " '2020-11-06 00:00:00'], dtype=object)" 581 | ] 582 | }, 583 | "execution_count": 11, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | 
"df[\"PART_FACT_PLAN_DATE\"].unique()" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "id": "bb2d9157-15c7-4eaa-bd9c-e268451f4bc1", 595 | "metadata": {}, 596 | "source": [ 597 | "## 2.4. Transformation of the values in `PassOrFail`" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 12, 603 | "id": "48642f31-e21a-4022-b303-71fd2d1da081", 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "data": { 608 | "text/plain": [ 609 | "Y 5172\n", 610 | "N 60\n", 611 | "Name: PassOrFail, dtype: int64" 612 | ] 613 | }, 614 | "execution_count": 12, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "df[\"PassOrFail\"].value_counts()" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 13, 626 | "id": "c6739f67-c643-4f79-8231-5209f2323ad2", 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "# Change Y/N to 1/0\n", 631 | "\n", 632 | "df[\"PassOrFail\"] = df[\"PassOrFail\"].apply(lambda x: 1 if x == \"N\" else 0) " 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "id": "26b01a20-7da5-43c7-9418-def53b4f9b29", 638 | "metadata": {}, 639 | "source": [ 640 | "# 3. 
Saving the preprocessed dataframe as csv" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 14, 646 | "id": "0567f5c2-6dee-469c-a434-f1cd8489111c", 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "df.to_csv(\"../data/interim/labeled_data_preprocessed.csv\", index=False)" 651 | ] 652 | } 653 | ], 654 | "metadata": { 655 | "kernelspec": { 656 | "display_name": "Python [conda env:inj_env]", 657 | "language": "python", 658 | "name": "conda-env-inj_env-py" 659 | }, 660 | "language_info": { 661 | "codemirror_mode": { 662 | "name": "ipython", 663 | "version": 3 664 | }, 665 | "file_extension": ".py", 666 | "mimetype": "text/x-python", 667 | "name": "python", 668 | "nbconvert_exporter": "python", 669 | "pygments_lexer": "ipython3", 670 | "version": "3.10.8" 671 | } 672 | }, 673 | "nbformat": 4, 674 | "nbformat_minor": 5 675 | } 676 | -------------------------------------------------------------------------------- /notebooks/3-2_ML_on_CN7_MD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bfeabc3c-3ebe-4c8f-85ad-e3c058209de0", 6 | "metadata": {}, 7 | "source": [ 8 | "# 0. 
Imports" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "97be1e08-f742-4664-9aae-cd34068183dc", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import scipy as sp\n", 21 | "from scipy.stats import chi2\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import seaborn as sns\n", 24 | "\n", 25 | "from collections import defaultdict\n", 26 | "import time\n", 27 | "from datetime import timedelta\n", 28 | "\n", 29 | "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.metrics import confusion_matrix, classification_report" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "6749c29f-62cb-4c13-96a5-980a90d12888", 37 | "metadata": {}, 38 | "source": [ 39 | "# 1. Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "id": "11d01b97-0d88-470d-b99a-72626c8b3e34", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "cn7 = pd.read_csv(\"../data/processed/labeled_data_cn7.csv\", parse_dates=True)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "0ffc6350-b5b1-4bc6-9ee3-e9c15362bb36", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "RangeIndex: 3974 entries, 0 to 3973\n", 64 | "Data columns (total 27 columns):\n", 65 | " # Column Non-Null Count Dtype \n", 66 | "--- ------ -------------- ----- \n", 67 | " 0 TimeStamp 3974 non-null object \n", 68 | " 1 PassOrFail 3974 non-null int64 \n", 69 | " 2 Hopper_Temperature 3974 non-null float64\n", 70 | " 3 Plasticizing_Position 3974 non-null float64\n", 71 | " 4 Barrel_Temperature_3 3974 non-null float64\n", 72 | " 5 Reason 3974 non-null object \n", 73 | " 6 Injection_Time 3974 non-null float64\n", 74 | " 7 Max_Injection_Pressure 3974 non-null float64\n", 75 
| " 8 Barrel_Temperature_6 3974 non-null float64\n", 76 | " 9 Barrel_Temperature_2 3974 non-null float64\n", 77 | " 10 Cushion_Position 3974 non-null float64\n", 78 | " 11 Max_Screw_RPM 3974 non-null float64\n", 79 | " 12 Barrel_Temperature_5 3974 non-null float64\n", 80 | " 13 Average_Screw_RPM 3974 non-null float64\n", 81 | " 14 _id 3974 non-null object \n", 82 | " 15 Average_Back_Pressure 3974 non-null float64\n", 83 | " 16 Plasticizing_Time 3974 non-null float64\n", 84 | " 17 Max_Back_Pressure 3974 non-null float64\n", 85 | " 18 Filling_Time 3974 non-null float64\n", 86 | " 19 Max_Switch_Over_Pressure 3974 non-null float64\n", 87 | " 20 Barrel_Temperature_1 3974 non-null float64\n", 88 | " 21 Barrel_Temperature_4 3974 non-null float64\n", 89 | " 22 Cycle_Time 3974 non-null float64\n", 90 | " 23 Clamp_Close_Time 3974 non-null float64\n", 91 | " 24 Mold_Temperature_4 3974 non-null float64\n", 92 | " 25 Mold_Temperature_3 3974 non-null float64\n", 93 | " 26 Max_Injection_Speed 3974 non-null float64\n", 94 | "dtypes: float64(23), int64(1), object(3)\n", 95 | "memory usage: 838.4+ KB\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "cn7.info()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "d476ba58-0785-4370-b416-c80dfc0cad52", 106 | "metadata": {}, 107 | "source": [ 108 | "# 2. Data Preprocessing" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "id": "ca4ce13b-11ca-46db-aeba-08de5d94ef80", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Preparation of features for model training\n", 119 | "numerical_features = [x for x in cn7.columns if np.dtype(cn7[x]) == \"float64\"]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "7509fb43-d58a-4dda-afec-5c702ff26081", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "No. 
of passed CN7 parts: 3946\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# Data for passed parts\n", 138 | "cn7_Y = cn7[cn7[\"PassOrFail\"] == 0]\n", 139 | "cn7_Y = cn7_Y[numerical_features]\n", 140 | "print(\"No. of passed CN7 parts:\", len(cn7_Y))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "id": "0ac9183e-d9b0-4380-a2c7-19126573a000", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "No. of failed CN7 parts: 28\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "# Data for failed parts\n", 159 | "cn7_N = cn7[cn7[\"PassOrFail\"] == 1]\n", 160 | "cn7_N = cn7_N[numerical_features]\n", 161 | "print(\"No. of failed CN7 parts:\", len(cn7_N))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 7, 167 | "id": "20110606-19d5-4e81-baaa-8eba73ca661a", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "No. of Train Set (Passed Parts): 3551\n", 175 | "No. of Test Set (Passed Parts): 395\n", 176 | "No. of Test Set (Failed Parts): 28\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# The model using Mahalanobis Distance is trained by Data for passed parts (i.e., data with a majority class)\n", 182 | "cn7_train_Y, cn7_test_Y = train_test_split(cn7_Y, test_size=0.1)\n", 183 | "\n", 184 | "# Test set with failed parts\n", 185 | "cn7_test_N = cn7_N\n", 186 | "\n", 187 | "print(f\"No. of Train Set (Passed Parts): {len(cn7_train_Y)}\")\n", 188 | "print(f\"No. of Test Set (Passed Parts): {len(cn7_test_Y)}\")\n", 189 | "print(f\"No. 
of Test Set (Failed Parts): {len(cn7_test_N)}\")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 8, 195 | "id": "d2032b24-477d-4a02-87e8-a990aef8fea2", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Data normalization\n", 200 | "\n", 201 | "scaler = StandardScaler()\n", 202 | "\n", 203 | "cn7_train_Y = scaler.fit_transform(cn7_train_Y)\n", 204 | "cn7_test_Y = scaler.transform(cn7_test_Y)\n", 205 | "cn7_test_N = scaler.transform(cn7_test_N)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "0acdf843-6077-4018-b3ea-423452b90e39", 211 | "metadata": {}, 212 | "source": [ 213 | "# 3. Mahalanobis Distance" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 9, 219 | "id": "0ed8a91b-47be-49cc-945f-3c2a8ce09780", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def mahalanobis(x=None, data=None, cov=None):\n", 224 | " \"\"\"\n", 225 | " Compute the Mahalanobis Distance between each row of x and the data \n", 226 | " x : vector or matrix of data with, say, p columns.\n", 227 | " data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.\n", 228 | " cov : covariance matrix (p x p) of the distribution. 
If None, will be computed from data.\n", 229 | " \"\"\"\n", 230 | " x_minus_mu = x - np.mean(data, axis=0)\n", 231 | " if not cov:\n", 232 | " cov = np.cov(data.T)\n", 233 | " # cov = np.cov(data.values.T)\n", 234 | " inv_covmat = sp.linalg.inv(cov)\n", 235 | " left_term = np.dot(x_minus_mu, inv_covmat)\n", 236 | " mahal = np.dot(left_term, x_minus_mu.T)\n", 237 | " return mahal.diagonal() # mahal is an n x n matrix of quadratic forms; diagonal entry i is the value for row i of x paired with itself (off-diagonal entries mix two different rows)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "id": "cd7152de-3dd5-485c-ad8f-9db5628b3257", 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "class MahalanobisOneclassClassifier():\n", 248 | " def __init__(self, xtrain, significance_level=0.01):\n", 249 | " self.xtrain = xtrain\n", 250 | " self.critical_value = chi2.ppf((1-significance_level), df=xtrain.shape[1] - 1) # df = degrees of freedom\n", 251 | " print('Critical value is: ', self.critical_value)\n", 252 | "\n", 253 | " def predict_proba(self, xtest):\n", 254 | " mahalanobis_dist = mahalanobis(xtest, self.xtrain)\n", 255 | " self.pvalues = 1 - chi2.cdf(mahalanobis_dist, 2)\n", 256 | " return mahalanobis_dist\n", 257 | "\n", 258 | " def predict(self, xtest):\n", 259 | " return np.array([int(i) for i in self.predict_proba(xtest) > self.critical_value])" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "7a439320-3074-4e43-b4c8-450c1b4fde4a", 265 | "metadata": {}, 266 | "source": [ 267 | "# 4. 
Setup of Threshold" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "id": "260c938a-f2fb-4c64-99fb-fbf59d80887f", 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "Critical value is: 30.813282343953027\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "clf = MahalanobisOneclassClassifier(cn7_train_Y, significance_level=0.1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "id": "8ee61460-5e70-46e8-aba0-7527f193dcb7", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "threshold = clf.critical_value" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "id": "6e674593-3220-409c-82bf-08b3f330cf2e", 301 | "metadata": {}, 302 | "source": [ 303 | "# 5. Classification of Test Set by Mahalanobis Distance" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "bcb19903-b68f-4fb5-9169-2f1c2f4bde9e", 309 | "metadata": {}, 310 | "source": [ 311 | "## 5.1. Evaluation Using Test Set (Passed Parts)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "id": "f13a656e-0223-41e7-8c88-a854000bcb48", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# Prediction of Mahalanobis Distance\n", 322 | "cn7_MD_Y = clf.predict_proba(cn7_test_Y)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 14, 328 | "id": "5e57b830-3e15-4684-bdb7-ef7d5e1af119", 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "image/png": "", 334 | "text/plain": [ 335 | "
" 336 | ] 337 | }, 338 | "metadata": {}, 339 | "output_type": "display_data" 340 | } 341 | ], 342 | "source": [ 343 | "# Data Visualization \n", 344 | "# Log was used for better visualization\n", 345 | "\n", 346 | "plt.hist(np.log(cn7_MD_Y), bins=200)\n", 347 | "plt.xlabel(\"log(Mahalanobis Distance)\")\n", 348 | "plt.ylabel(\"No of samples\")\n", 349 | "plt.vlines(np.log(threshold), 0, 25, color=\"red\")\n", 350 | "plt.show();" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 15, 356 | "id": "1bbc188b-403f-4751-a161-ed63f8abc49f", 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "No. of Failed Parts: 24\n", 364 | "Accuracy: 0.9392405063291139\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "# Check on the data that were predicted as failed parts\n", 370 | "\n", 371 | "cn7_test_Y_anomalies = cn7_MD_Y > threshold\n", 372 | "print(\"No. of Failed Parts:\", np.sum(cn7_test_Y_anomalies))\n", 373 | "print(\"Accuracy:\", (cn7_test_Y.shape[0]-np.sum(cn7_test_Y_anomalies))/cn7_test_Y.shape[0])" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "id": "055407b1-4ea7-40c4-abca-00580ba44869", 379 | "metadata": {}, 380 | "source": [ 381 | "## 5.2. Evaluation Using Test Set (Failed Parts)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 16, 387 | "id": "2dd8090b-ff11-4c2e-a7df-d23c4a1af6d3", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# Prediction of Mahalanobis Distance\n", 392 | "cn7_MD_N = clf.predict_proba(cn7_test_N)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 17, 398 | "id": "81d6064b-f17e-43cf-a8fc-bbc96900e8af", 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "image/png": "", 404 | "text/plain": [ 405 | "
" 406 | ] 407 | }, 408 | "metadata": {}, 409 | "output_type": "display_data" 410 | } 411 | ], 412 | "source": [ 413 | "# Data visualization \n", 414 | "# Log was used for better visualization\n", 415 | "\n", 416 | "plt.hist(np.log(cn7_MD_N), bins=50)\n", 417 | "plt.xlabel(\"Mahalanobis Distance\")\n", 418 | "plt.ylabel(\"No of samples\")\n", 419 | "plt.vlines(np.log(threshold), 0, 5, color=\"red\")\n", 420 | "plt.show();" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 18, 426 | "id": "695fe8ac-cdda-4b8b-b41a-649f41993637", 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "No. of Failed Parts: 20\n", 434 | "Accuracy: 0.7142857142857143\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "# Check on the data that were predicted as failed parts\n", 440 | "\n", 441 | "cn7_test_N_anomalies = cn7_MD_N > threshold\n", 442 | "print(\"No. of Failed Parts:\", np.sum(cn7_test_N_anomalies))\n", 443 | "print(\"Accuracy:\", np.sum(cn7_test_N_anomalies)/cn7_test_N.shape[0])" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "id": "d950e685-039b-415c-95e4-cccf396de964", 449 | "metadata": {}, 450 | "source": [ 451 | "# 6. 
Result Analysis" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 19, 457 | "id": "9068d6e4-eda8-464f-8f6d-ff38bd2cad87", 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "# True values for the test set\n", 462 | "\n", 463 | "cn7_true = np.concatenate(\n", 464 | " [np.zeros(len(cn7_test_Y_anomalies)), np.ones(len(cn7_test_N_anomalies))]\n", 465 | ")" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 20, 471 | "id": "0130a09a-8455-4d42-884b-ae31fbb4e326", 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "# Predicted values for the test set\n", 476 | "\n", 477 | "cn7_prediction = np.concatenate(\n", 478 | " [cn7_test_Y_anomalies, cn7_test_N_anomalies]\n", 479 | ")" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 21, 485 | "id": "09bf7653-cf80-414b-936c-16282d427e8a", 486 | "metadata": {}, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "array([[371, 24],\n", 492 | " [ 8, 20]], dtype=int64)" 493 | ] 494 | }, 495 | "execution_count": 21, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "confusion_matrix(cn7_true, cn7_prediction)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 22, 507 | "id": "295332c3-ad08-4c02-84ce-1bdcc6194e2e", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "image/png": "", 513 | "text/plain": [ 514 | "
" 515 | ] 516 | }, 517 | "metadata": {}, 518 | "output_type": "display_data" 519 | } 520 | ], 521 | "source": [ 522 | "target_names = [\"Pass\", \"Fail\"]\n", 523 | "clf_report = classification_report(\n", 524 | " cn7_true, cn7_prediction, target_names=target_names, output_dict=True\n", 525 | ")\n", 526 | "clf_plot = sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)\n", 527 | "plt.title('Classification Report for Mahalanobis Distance on CN7')\n", 528 | "clf_plot.figure.savefig(\"../img/clf_report_cn7_MD.png\")" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "id": "5946b2a7-fb65-4f04-b33d-dbee8550b438", 534 | "metadata": {}, 535 | "source": [ 536 | "- Performance of the model using Mahalanobis Distance was slightly less effective than the machine learning models." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "id": "c093eb72-959e-453e-91d8-d525a6ede8fe", 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [] 546 | } 547 | ], 548 | "metadata": { 549 | "kernelspec": { 550 | "display_name": "Python [conda env:inj_env]", 551 | "language": "python", 552 | "name": "conda-env-inj_env-py" 553 | }, 554 | "language_info": { 555 | "codemirror_mode": { 556 | "name": "ipython", 557 | "version": 3 558 | }, 559 | "file_extension": ".py", 560 | "mimetype": "text/x-python", 561 | "name": "python", 562 | "nbconvert_exporter": "python", 563 | "pygments_lexer": "ipython3", 564 | "version": "3.10.8" 565 | } 566 | }, 567 | "nbformat": 4, 568 | "nbformat_minor": 5 569 | } 570 | -------------------------------------------------------------------------------- /notebooks/4-2_ML_on_RG3_MD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bfeabc3c-3ebe-4c8f-85ad-e3c058209de0", 6 | "metadata": {}, 7 | "source": [ 8 | "# 0. 
Imports" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "97be1e08-f742-4664-9aae-cd34068183dc", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import scipy as sp\n", 21 | "from scipy.stats import chi2\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import seaborn as sns\n", 24 | "\n", 25 | "from collections import defaultdict\n", 26 | "import time\n", 27 | "from datetime import timedelta\n", 28 | "\n", 29 | "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.metrics import confusion_matrix, classification_report" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "6749c29f-62cb-4c13-96a5-980a90d12888", 37 | "metadata": {}, 38 | "source": [ 39 | "# 1. Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "id": "11d01b97-0d88-470d-b99a-72626c8b3e34", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "rg3 = pd.read_csv(\"../data/processed/labeled_data_rg3.csv\", parse_dates=True)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "0ffc6350-b5b1-4bc6-9ee3-e9c15362bb36", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "RangeIndex: 1256 entries, 0 to 1255\n", 64 | "Data columns (total 27 columns):\n", 65 | " # Column Non-Null Count Dtype \n", 66 | "--- ------ -------------- ----- \n", 67 | " 0 TimeStamp 1256 non-null object \n", 68 | " 1 Clamp_Close_Time 1256 non-null float64\n", 69 | " 2 Barrel_Temperature_4 1256 non-null float64\n", 70 | " 3 Hopper_Temperature 1256 non-null float64\n", 71 | " 4 Reason 1256 non-null object \n", 72 | " 5 Injection_Time 1256 non-null float64\n", 73 | " 6 Barrel_Temperature_1 1256 non-null float64\n", 74 | " 7 Plasticizing_Time 1256 non-null float64\n", 75 
| " 8 Max_Back_Pressure 1256 non-null float64\n", 76 | " 9 Filling_Time 1256 non-null float64\n", 77 | " 10 Max_Injection_Pressure 1256 non-null float64\n", 78 | " 11 Plasticizing_Position 1256 non-null float64\n", 79 | " 12 Barrel_Temperature_6 1256 non-null float64\n", 80 | " 13 Cushion_Position 1256 non-null float64\n", 81 | " 14 Max_Injection_Speed 1256 non-null float64\n", 82 | " 15 _id 1256 non-null object \n", 83 | " 16 Average_Back_Pressure 1256 non-null float64\n", 84 | " 17 Mold_Temperature_3 1256 non-null float64\n", 85 | " 18 Barrel_Temperature_3 1256 non-null float64\n", 86 | " 19 Barrel_Temperature_5 1256 non-null float64\n", 87 | " 20 Cycle_Time 1256 non-null float64\n", 88 | " 21 Mold_Temperature_4 1256 non-null float64\n", 89 | " 22 Average_Screw_RPM 1256 non-null float64\n", 90 | " 23 Max_Screw_RPM 1256 non-null float64\n", 91 | " 24 Max_Switch_Over_Pressure 1256 non-null float64\n", 92 | " 25 Barrel_Temperature_2 1256 non-null float64\n", 93 | " 26 PassOrFail 1256 non-null int64 \n", 94 | "dtypes: float64(23), int64(1), object(3)\n", 95 | "memory usage: 265.1+ KB\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "rg3.info()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "d476ba58-0785-4370-b416-c80dfc0cad52", 106 | "metadata": {}, 107 | "source": [ 108 | "# 2. Data Preprocessing" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "id": "ca4ce13b-11ca-46db-aeba-08de5d94ef80", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Preparation of features for model training\n", 119 | "numerical_features = [x for x in rg3.columns if np.dtype(rg3[x]) == \"float64\"]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "7509fb43-d58a-4dda-afec-5c702ff26081", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "No. 
of passed RG3 parts: 1224\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# Data for passed parts\n", 138 | "rg3_Y = rg3[rg3[\"PassOrFail\"] == 0]\n", 139 | "rg3_Y = rg3_Y[numerical_features]\n", 140 | "print(\"No. of passed RG3 parts:\", len(rg3_Y))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "id": "0ac9183e-d9b0-4380-a2c7-19126573a000", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "No. of failed RG3 parts: 32\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "# Data for failed parts\n", 159 | "rg3_N = rg3[rg3[\"PassOrFail\"] == 1]\n", 160 | "rg3_N = rg3_N[numerical_features]\n", 161 | "print(\"No. of failed RG3 parts:\", len(rg3_N))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 7, 167 | "id": "20110606-19d5-4e81-baaa-8eba73ca661a", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "No. of Train Set (Passed Parts): 1101\n", 175 | "No. of Test Set (Passed Parts): 123\n", 176 | "No. of Test Set (Failed Parts): 32\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# The model using Mahalanobis Distance is trained by Data for passed parts (i.e., data with a majority class)\n", 182 | "rg3_train_Y, rg3_test_Y = train_test_split(rg3_Y, test_size=0.1)\n", 183 | "\n", 184 | "# Test set with failed parts\n", 185 | "rg3_test_N = rg3_N\n", 186 | "\n", 187 | "print(f\"No. of Train Set (Passed Parts): {len(rg3_train_Y)}\")\n", 188 | "print(f\"No. of Test Set (Passed Parts): {len(rg3_test_Y)}\")\n", 189 | "print(f\"No. 
of Test Set (Failed Parts): {len(rg3_test_N)}\")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 8, 195 | "id": "d2032b24-477d-4a02-87e8-a990aef8fea2", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Data normalization\n", 200 | "\n", 201 | "scaler = StandardScaler()\n", 202 | "\n", 203 | "rg3_train_Y = scaler.fit_transform(rg3_train_Y)\n", 204 | "rg3_test_Y = scaler.transform(rg3_test_Y)\n", 205 | "rg3_test_N = scaler.transform(rg3_test_N)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "0acdf843-6077-4018-b3ea-423452b90e39", 211 | "metadata": {}, 212 | "source": [ 213 | "# 3. Mahalanobis Distance" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 9, 219 | "id": "0ed8a91b-47be-49cc-945f-3c2a8ce09780", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def mahalanobis(x=None, data=None, cov=None):\n", 224 | " \"\"\"\n", 225 | " Compute the Mahalanobis Distance between each row of x and the data \n", 226 | " x : vector or matrix of data with, say, p columns.\n", 227 | " data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.\n", 228 | " cov : covariance matrix (p x p) of the distribution. 
If None, will be computed from data.\n", 229 | " \"\"\"\n", 230 | " x_minus_mu = x - np.mean(data, axis=0)\n", 231 | " if not cov:\n", 232 | " cov = np.cov(data.T)\n", 233 | " # cov = np.cov(data.values.T)\n", 234 | " inv_covmat = sp.linalg.inv(cov)\n", 235 | " left_term = np.dot(x_minus_mu, inv_covmat)\n", 236 | " mahal = np.dot(left_term, x_minus_mu.T)\n", 237 | " return mahal.diagonal() # mahal is an n x n matrix of quadratic forms; diagonal entry i is the value for row i of x paired with itself (off-diagonal entries mix two different rows)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "id": "cd7152de-3dd5-485c-ad8f-9db5628b3257", 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "class MahalanobisOneclassClassifier():\n", 248 | " def __init__(self, xtrain, significance_level=0.01):\n", 249 | " self.xtrain = xtrain\n", 250 | " self.critical_value = chi2.ppf((1-significance_level), df=xtrain.shape[1] - 1) # df = degrees of freedom\n", 251 | " print('Critical value is: ', self.critical_value)\n", 252 | "\n", 253 | " def predict_proba(self, xtest):\n", 254 | " mahalanobis_dist = mahalanobis(xtest, self.xtrain)\n", 255 | " self.pvalues = 1 - chi2.cdf(mahalanobis_dist, 2)\n", 256 | " return mahalanobis_dist\n", 257 | "\n", 258 | " def predict(self, xtest):\n", 259 | " return np.array([int(i) for i in self.predict_proba(xtest) > self.critical_value])" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "7a439320-3074-4e43-b4c8-450c1b4fde4a", 265 | "metadata": {}, 266 | "source": [ 267 | "# 4. 
Setup of Threshold" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "id": "260c938a-f2fb-4c64-99fb-fbf59d80887f", 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "Critical value is: 30.813282343953027\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "clf = MahalanobisOneclassClassifier(rg3_train_Y, significance_level=0.1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "id": "8ee61460-5e70-46e8-aba0-7527f193dcb7", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "threshold = clf.critical_value" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "id": "6e674593-3220-409c-82bf-08b3f330cf2e", 301 | "metadata": {}, 302 | "source": [ 303 | "# 5. Classification of Test Set by Mahalanobis Distance" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "bcb19903-b68f-4fb5-9169-2f1c2f4bde9e", 309 | "metadata": {}, 310 | "source": [ 311 | "## 5.1. Evaluation Using Test Set (Passed Parts)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "id": "f13a656e-0223-41e7-8c88-a854000bcb48", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# Prediction of Mahalanobis Distance\n", 322 | "rg3_MD_Y = clf.predict_proba(rg3_test_Y)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 14, 328 | "id": "5e57b830-3e15-4684-bdb7-ef7d5e1af119", 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "image/png": "", 334 | "text/plain": [ 335 | "
" 336 | ] 337 | }, 338 | "metadata": {}, 339 | "output_type": "display_data" 340 | } 341 | ], 342 | "source": [ 343 | "# Data Visualization \n", 344 | "# Log was used for better visualization\n", 345 | "\n", 346 | "plt.hist(np.log(rg3_MD_Y), bins=200)\n", 347 | "plt.xlabel(\"log(Mahalanobis Distance)\")\n", 348 | "plt.ylabel(\"No of samples\")\n", 349 | "plt.vlines(np.log(threshold), 0, 5, color=\"red\")\n", 350 | "plt.show();" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 15, 356 | "id": "1bbc188b-403f-4751-a161-ed63f8abc49f", 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "No. of Failed Parts: 19\n", 364 | "Accuracy: 0.8455284552845529\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "# Check on the data that were predicted as failed parts\n", 370 | "\n", 371 | "rg3_test_Y_anomalies = rg3_MD_Y > threshold\n", 372 | "print(\"No. of Failed Parts:\", np.sum(rg3_test_Y_anomalies))\n", 373 | "print(\"Accuracy:\", (rg3_test_Y.shape[0]-np.sum(rg3_test_Y_anomalies))/rg3_test_Y.shape[0])" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "id": "055407b1-4ea7-40c4-abca-00580ba44869", 379 | "metadata": {}, 380 | "source": [ 381 | "## 5.2. Evaluation Using Test Set (Failed Parts)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 16, 387 | "id": "2dd8090b-ff11-4c2e-a7df-d23c4a1af6d3", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# Prediction of Mahalanobis Distance\n", 392 | "rg3_MD_N = clf.predict_proba(rg3_test_N)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 17, 398 | "id": "81d6064b-f17e-43cf-a8fc-bbc96900e8af", 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "image/png": "", 404 | "text/plain": [ 405 | "
" 406 | ] 407 | }, 408 | "metadata": {}, 409 | "output_type": "display_data" 410 | } 411 | ], 412 | "source": [ 413 | "# Data visualization \n", 414 | "# Log was used for better visualization\n", 415 | "\n", 416 | "plt.hist(np.log(rg3_MD_N), bins=50)\n", 417 | "plt.xlabel(\"Mahalanobis Distance\")\n", 418 | "plt.ylabel(\"No of samples\")\n", 419 | "plt.vlines(np.log(threshold), 0, 5, color=\"red\")\n", 420 | "plt.show();" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 18, 426 | "id": "695fe8ac-cdda-4b8b-b41a-649f41993637", 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "No. of Failed Parts: 8\n", 434 | "Accuracy: 0.25\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "# Check on the data that were predicted as failed parts\n", 440 | "\n", 441 | "rg3_test_N_anomalies = rg3_MD_N > threshold\n", 442 | "print(\"No. of Failed Parts:\", np.sum(rg3_test_N_anomalies))\n", 443 | "print(\"Accuracy:\", np.sum(rg3_test_N_anomalies)/rg3_test_N.shape[0])" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "id": "d950e685-039b-415c-95e4-cccf396de964", 449 | "metadata": {}, 450 | "source": [ 451 | "# 6. 
Result Analysis" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 19, 457 | "id": "9068d6e4-eda8-464f-8f6d-ff38bd2cad87", 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "# True values for the test set\n", 462 | "\n", 463 | "rg3_true = np.concatenate(\n", 464 | " [np.zeros(len(rg3_test_Y_anomalies)), np.ones(len(rg3_test_N_anomalies))]\n", 465 | ")" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 20, 471 | "id": "0130a09a-8455-4d42-884b-ae31fbb4e326", 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "# Predicted values for the test set\n", 476 | "\n", 477 | "rg3_prediction = np.concatenate(\n", 478 | " [rg3_test_Y_anomalies, rg3_test_N_anomalies]\n", 479 | ")" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 21, 485 | "id": "09bf7653-cf80-414b-936c-16282d427e8a", 486 | "metadata": {}, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "array([[104, 19],\n", 492 | " [ 24, 8]], dtype=int64)" 493 | ] 494 | }, 495 | "execution_count": 21, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "confusion_matrix(rg3_true, rg3_prediction)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 22, 507 | "id": "cff3821f-1959-42d7-96d1-2b7c4d727726", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "image/png": "", 513 | "text/plain": [ 514 | "
" 515 | ] 516 | }, 517 | "metadata": {}, 518 | "output_type": "display_data" 519 | } 520 | ], 521 | "source": [ 522 | "target_names = [\"Pass\", \"Fail\"]\n", 523 | "clf_report = classification_report(\n", 524 | " rg3_true, rg3_prediction, target_names=target_names, output_dict=True\n", 525 | ")\n", 526 | "clf_plot = sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)\n", 527 | "plt.title('Classification Report for Mahalanobis Distance on RG3')\n", 528 | "clf_plot.figure.savefig(\"../img/clf_report_rg3_MD.png\")" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "id": "5946b2a7-fb65-4f04-b33d-dbee8550b438", 534 | "metadata": {}, 535 | "source": [ 536 | "- While it was impossible to classify the failed parts using machine learning models, the model using Mahalanobis Distance could classify the passed and failed parts for `RG3` parts.\n", 537 | "- However, the f1 score for `RG3` was much lower than that for `CN7`" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "c093eb72-959e-453e-91d8-d525a6ede8fe", 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [] 547 | } 548 | ], 549 | "metadata": { 550 | "kernelspec": { 551 | "display_name": "Python [conda env:inj_env]", 552 | "language": "python", 553 | "name": "conda-env-inj_env-py" 554 | }, 555 | "language_info": { 556 | "codemirror_mode": { 557 | "name": "ipython", 558 | "version": 3 559 | }, 560 | "file_extension": ".py", 561 | "mimetype": "text/x-python", 562 | "name": "python", 563 | "nbconvert_exporter": "python", 564 | "pygments_lexer": "ipython3", 565 | "version": "3.10.8" 566 | } 567 | }, 568 | "nbformat": 4, 569 | "nbformat_minor": 5 570 | } 571 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import cross_validate 3 | import matplotlib.pyplot as plt 4
from sklearn.metrics import roc_curve


def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to cross_validate

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)", indexed by
        the column names of the cross_validate result
    """

    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Both Series are labelled by metric name, so pair their values with zip
    # rather than indexing with an integer: integer keys on a label-indexed
    # Series rely on a positional fallback that pandas has deprecated.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)


def plot_roc_curve(true_y, y_prob):
    """
    Plots the ROC curve based on the probabilities

    Parameters
    ----------
    true_y :
        true labels, passed as the first argument to roc_curve
    y_prob :
        predicted probabilities, passed as the second argument to roc_curve

    Returns
    ----------
        None; the figure is shown with plt.show()
    """
    plt.figure(figsize=(3, 2.5))
    # roc_curve also returns the thresholds, which the plot does not need
    fpr, tpr, _ = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()