├── DataSeperation.py ├── README.md ├── main.py ├── procfunc.py ├── screenshots ├── current_work.PNG ├── dataset_structure.PNG ├── results_1.PNG └── results_2.PNG └── utils_laj.py /DataSeperation.py: -------------------------------------------------------------------------------- 1 | import os, os.path 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import math 6 | 7 | 8 | def movingavg(data, window): 9 | w = np.repeat(1.0, window) / window 10 | smas = np.convolve(data, w, 'valid') 11 | return smas 12 | 13 | 14 | root = r'D:\LAHIRU\Work\KeySight\OriginalData' 15 | 16 | months = ['Sep'] 17 | machines = ['192_168_28_28', 'hp36'] 18 | 19 | for month in months: 20 | sensor_file = 'sensorData' + month + '.csv' 21 | event_file = 'machineEvent' + month + '.csv' 22 | 23 | SensorData_all = pd.read_csv(os.path.join(root, sensor_file), index_col='cf:timestamp', 24 | usecols=['cf:sensorType', 'cf:timestamp', 'cf:value', 'cf:equipmentId', 'cf:partId', 25 | 'cf:moduleId'], 26 | dtype={'cf:sensorType': str, 'cf:timestamp': str, 'cf:value': str, 27 | 'cf:equipmentId': str, 'cf:partId': str, 'cf:moduleId': str}) 28 | 29 | SensorData_all.rename(columns={'cf:sensorType': 'sensorType', 'cf:timestamp': 'timestamp', 'cf:value': 'value', 30 | 'cf:equipmentId': 'equipmentId', 'cf:partId': 'partId', 'cf:moduleId': 'moduleId'}, 31 | inplace=True) 32 | 33 | SensorData_all = SensorData_all.drop(SensorData_all[SensorData_all.sensorType == 'cf:sensorType'].index) 34 | 35 | SensorData_all.index = pd.to_datetime((SensorData_all.index).str[0:-6], format='%Y-%m-%dT%H:%M:%S.%f', 36 | errors='coerce') 37 | 38 | for machine in machines: 39 | eq_id = machine 40 | save_file = 'filtered_events_' + month + '_' + machine + '.csv' 41 | print(save_file) 42 | 43 | if eq_id == 'hp36': 44 | df_filterd = SensorData_all.loc[ 45 | (SensorData_all['equipmentId'] == '192.168.28.252') | (SensorData_all['equipmentId'] == 'hp36')] 46 | elif eq_id == '192_168_28_28': 47 | df_filterd = SensorData_all.loc[SensorData_all['equipmentId'] == '192.168.28.28'] # filter by equipment ID 48 | else: 49 | df_filterd = SensorData_all # fallback: keep every equipment when the machine is not listed above 50 | 51 | df_filterd = df_filterd.loc[df_filterd.index.notnull()] 52 | df_filterd = df_filterd.loc[df_filterd.value.notnull()] 53 | df_filterd.fillna('Blank', inplace=True) 54 | 55 | print(df_filterd.equipmentId.unique()) 56 | 57 | df_filterd.replace('BF1', 'BF', inplace=True) 58 | df_filterd.replace('VV1', 'VV', inplace=True) 59 | module_list = df_filterd.moduleId.unique() 60 | tmp = [] 61 | for _module in module_list: 62 | df_moduled = df_filterd.loc[df_filterd['moduleId'] == _module] # filter by module ID 63 | partId_list = df_moduled.partId.unique() 64 | print(_module, partId_list) 65 | for _part in partId_list: 66 | df_parted = df_moduled.loc[df_moduled['partId'] == _part] # filter by part ID 67 | sensor_list = df_parted.sensorType.unique() 68 | # print(_part, sensor_list) 69 | for _sensor in sensor_list: 70 | # if _sensor == 'Orientation X' or _sensor == 'Orientation Y' or _sensor == 'Orientation Z' or _sensor == '3.3V' or _sensor == '5.0V': 71 | # continue 72 | # print(_sensor) 73 | df_sensor = df_parted.loc[df_parted['sensorType'] == _sensor] # filter by sensor type 74 | # print(df_sensor.shape) 75 | sensor_values = df_sensor[['value']] 76 | sensor_values = sensor_values.loc[sensor_values.value.notnull()] 77 | sensor_values['value'] = pd.to_numeric(sensor_values['value'], errors='coerce') 78 | sensor_values = sensor_values[~sensor_values.index.duplicated(keep='first')] 79 | 
sensor_values.sort_index(inplace=True) 80 | sensor_values.rename(columns={'value': _module + "_" + _part + "_" + _sensor}, inplace=True) 81 | # print(_sensor,'start',sensor_values.index[0],'end',sensor_values.index[-1]) 82 | tmp.append(sensor_values) 83 | print("***********") 84 | # print(len(tmp)) 85 | 86 | ########################### Machine Evens ############################# 87 | machine_event = pd.read_csv(os.path.join(root, event_file), index_col='cf:timestamp', 88 | dtype={'cf:eventType': str}) 89 | 90 | machine_event.rename(columns={'Unnamed: 0': 'hashId', 'cf:eventType': 'event'}, inplace=True) 91 | machine_event.index = pd.to_datetime((machine_event.index).str[0:-6], format='%Y-%m-%dT%H:%M:%S.%f', 92 | errors='coerce') 93 | 94 | machine_event = machine_event.loc[machine_event.index.notnull()] 95 | 96 | hash_split = machine_event['hashId'].str.split(':', expand=True) 97 | _tmp = pd.concat([machine_event, hash_split], axis=1) 98 | event_data = _tmp.loc[:, ['event', 2]] 99 | event_data.rename(columns={2: 'equipmentId'}, inplace=True) 100 | 101 | # print("equipments : ",event_data.equipmentId.unique()) 102 | 103 | print('before unique machine events : ', machine_event.event.unique()) 104 | 105 | clearnup_nums = {'event': {'User input received': 1, 'Machine ready': 2, 'Machine alarm': 3, 106 | 'Wait for user input': 4, 'Log out': 5, 'DUT all tests completed': 6, 107 | 'DUT test time': 7, 108 | 'DUT overall test result': 8, 'DUT serial number': 9, 'Board file loaded': 10, 109 | 'Start running testplan': 11, 'Fixture locked': 12, 'Testplan loaded': 13, 110 | 'Fixture unlocked': 14, 'Finish running diagnostic': 15, 'Machine busy/idle': 16, 111 | 'Stop running testplan': 17, 'Start running diagnostic': 18}} # 112 | event_data.replace(clearnup_nums, inplace=True) 113 | 114 | if eq_id == 'hp36': 115 | event_data_filtered = event_data.loc[(event_data['equipmentId'] == '192.168.28.252') | ( 116 | event_data['equipmentId'] == 'hp36')] # filter by equipment ID 117 | elif eq_id == '192_168_28_28': 118 | event_data_filtered = event_data.loc[event_data['equipmentId'] == '192.168.28.28'] # filter by equipment ID 119 | 120 | event_data_filtered = event_data_filtered[~event_data_filtered.index.duplicated(keep='first')] 121 | event_data_filtered.sort_index(inplace=True) 122 | event_values = event_data_filtered[['event']] 123 | # print('after unique machine events : ', event_values.event.unique()) 124 | ###################################################################### 125 | 126 | newindex = tmp[0].index.union(event_values.index) 127 | for i in range(1, len(tmp)): 128 | newindex = tmp[i].index.union(newindex) 129 | 130 | for i in range(0, len(tmp)): 131 | # print(tmp[i].shape) 132 | tmp[i] = tmp[i].reindex(newindex, method='nearest') 133 | event_values = event_values.reindex(newindex, method='nearest') 134 | print(tmp[0].shape) 135 | 136 | sync_results = pd.concat([tmp[0], tmp[1]], axis=1) 137 | for i in range(2, len(tmp)): 138 | sync_results = pd.concat([sync_results, tmp[i]], axis=1) 139 | sync_results = pd.concat([sync_results, event_values], axis=1) 140 | # print(list(sync_results)) 141 | print('after unique machine events : ', sync_results.event.unique()) 142 | unique, counts = np.unique(sync_results.event.values, return_counts=True) 143 | event_count = np.asarray((unique, counts)).T 144 | print(event_count) 145 | # show_col = ['Module 3_BF_Temperature Blower'] 146 | # sync_results.plot(y=show_col) 147 | 148 | # print(sync_results.event.value_counts()) 149 | # sync_results.to_csv(save_file) 
150 | sync_results.plot() 151 | plt.show() 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predictive-Maintenance-Model 2 | 3 | The goal is to implement a degradation model for an industrial machine and predict failures before they occur. 4 | Machine malfunctions are captured here as anomalies, and failures and their related data are captured here as outliers. 5 | 6 | ### Dependencies 7 | 8 | - numpy 9 | - scikit-learn > 0.19.1 10 | - pandas > 0.20.3 11 | 12 | ### Dataset 13 | 14 | Pickle files of the dataset: https://www.dropbox.com/s/jt0nsqsmqxm8wz4/pickle.rar?dl=0 15 | 16 | ### Dataset structure (Time Synchronized) 17 | 18 | ![Screenshot](screenshots/dataset_structure.PNG) 19 | 20 | ### Architecture 21 | 22 | ![Screenshot](screenshots/current_work.PNG) 23 | 24 | ### Results of anomaly and outlier detection 25 | 26 | ![Screenshot](screenshots/results_1.PNG) 27 | ![Screenshot](screenshots/results_2.PNG) 28 | 29 | ### Degradation Model 30 | 31 | The degradation model for remaining useful life estimation can be found [here](https://github.com/LahiruJayasinghe/RUL-Net) 32 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn.preprocessing import StandardScaler 6 | from utils_laj import cache 7 | 8 | from procfunc import plot_data 9 | from procfunc import clustering 10 | from procfunc import get_outlier_data 11 | from procfunc import get_pca_components 12 | from procfunc import eigenvalue_analysis 13 | from procfunc import show_plots 14 | 15 | if __name__ == "__main__": 16 | 17 | # TODO: add a module-level description 18 | 19 | #### folder structure ############################################################################################## 20 | root = r'D:\LAHIRU\Work\KeySight\DataVisualization' 21 | data_folder = 'synchronized_data' 22 | anomaly_folder = 'first_round_anomaly_detection' 23 | #################################################################################################################### 24 | 25 | #### data parameters ############################################################################################### 26 | months = ['Nov'] 27 | 28 | machines = ['192_168_28_28'] 29 | # machines = ['hp36'] 30 | 31 | discrete_sensors = True 32 | # discrete_sensors = True 33 | #################################################################################################################### 34 | 35 | #### clustering parameters ######################################################################################### 36 | # cluster = 'dbscan' 37 | distance = 0.6 38 | min_samples = 20 39 | 40 | cluster = 'kmeans' 41 | n_clusters = 6 42 | #################################################################################################################### 43 | 44 | remove_outliers = False 45 | show_anomalies = False 46 | show_noise = False 47 | 48 | plot_eig_vals = False 49 | plot_events_based = False 50 | plot_cluster_based = True 51 | save_fig = False 52 | 53 | if discrete_sensors: 54 | file_tag = 'filtered_events_' 55 | pickle_tag = '' 56 | else: 57 | file_tag = 'filtered_events_sensors_' 58 | pickle_tag = 'SensorFiltered_' 59 | 60 | for month in months: 61 | for machine in machines: 62 | data_file = file_tag + month + '_' + 
machine + '.csv' 63 | data_fpath = os.path.join(data_folder, data_file) 64 | ftag = pickle_tag + month + '_' + machine 65 | path_half = os.path.join('pickle', ftag) 66 | print('data file: ', data_file, '\nfile tag: ', ftag) 67 | 68 | #### pre processing ######################################################################################## 69 | # data_all = pd.read_csv(data_fpath, index_col='cf:timestamp') 70 | # data_all = cache(path_half+'_DataAll.pkl',data_all.values) 71 | # data_all = cache(path_half+'_DataAll.pkl') 72 | ############################################################################################################ 73 | 74 | #### make events as classes ################################################################################ 75 | # col = data_all.shape[1] 76 | # cls = data_all[:,col-1].astype(np.int32) # take events as classes 77 | # cls = cache(path_half + '_EventCls.pkl',cls) 78 | ############################################################################################################ 79 | cls = cache(path_half + '_EventCls.pkl') 80 | 81 | #### evaluating number of events ########################################################################### 82 | unique, counts = np.unique(cls, return_counts=True) 83 | event_count = np.asarray((unique, counts)).T 84 | num_events = event_count[-1, 0] 85 | # print(event_count,num_events) 86 | ############################################################################################################ 87 | 88 | #### pre processing ######################################################################################## 89 | # data_all = data_all[:,0:col-1] # exclude events for PCA 90 | # print("excluding events", data_all.shape) 91 | 92 | # data_all = cache(path_half+'_ExcludeEvent.pkl',data_all) 93 | # data_all = cache(path_half+'_ExcludeEvent.pkl') 94 | 95 | # data_ma = movingavg(data_all,window=250) 96 | # data_ma = movingavg(data_all,window=1) 97 | # data_ma=data_all 98 | # print("data_ma shape : ",data_ma.shape) 99 | ############################################################################################################ 100 | 101 | #### standerdization ####################################################################################### 102 | # mean = np.mean(data_ma,axis=0) 103 | # data_ma_std = data_ma - mean 104 | # data_ma_std = StandardScaler().fit_transform(data_ma) #[n_samples, n_features] 105 | data_ma_std = cache(path_half + '_ExcludeEvent_normalized.pkl') 106 | ############################################################################################################ 107 | 108 | if plot_eig_vals: 109 | eigenvalue_analysis(data_ma_std, save_fig, ftag + '_eigenvalues') 110 | 111 | #### show anomalies ######################################################################################## 112 | if show_anomalies: 113 | for_anomaly = pd.read_csv(data_fpath, index_col='cf:timestamp') 114 | for_anomaly.plot().legend(loc='upper right') 115 | anomaly_fpath = os.path.join(anomaly_folder, ftag + '_outliers.csv') 116 | print(anomaly_fpath) 117 | anomaly_data = pd.read_csv(anomaly_fpath) 118 | anomaly_indexes = anomaly_data['index'] 119 | for i in anomaly_indexes: 120 | plt.axvspan(i, i + 1, color='green', alpha=0.3) 121 | # plt.axvline(i, color='green', alpha=0.5) 122 | if show_noise: 123 | noise_i = pd.read_csv(os.path.join(anomaly_folder, ftag + '_noise.csv'))['index'] 124 | for i in noise_i: 125 | plt.axvspan(i, i + 1, color='blue', alpha=0.5) 126 | plt.show() 127 | 
############################################################################################################ 128 | 129 | data_all = pd.read_csv(data_fpath).reset_index() # for anomaly analysis 130 | 131 | #### remove_outliers ####################################################################################### 132 | if remove_outliers: 133 | if pickle_tag == 'SensorFiltered_': 134 | max_index = 181759 135 | min_index = 181435 136 | else: 137 | max_index = 115694 138 | min_index = 114428 139 | 140 | data_ma_std = np.delete(data_ma_std, slice(min_index, max_index), axis=0) 141 | cls = np.delete(cls, slice(min_index, max_index)) 142 | data_all.drop(data_all.index[min_index:max_index], inplace=True) 143 | ############################################################################################################ 144 | 145 | Z, x, y, z = get_pca_components(data_ma_std) # perform pca on standardized data 146 | 147 | if cluster == 'kmeans': 148 | labels, event_count, k_cls = clustering(Z, cluster, n_clusters) 149 | outlier_data = get_outlier_data(data_all, labels, cluster, event_count) 150 | if save_fig: 151 | outlier_data.to_csv(ftag + '_outliers.csv') 152 | elif cluster == 'dbscan': 153 | Z = Z[(Z[:, 0] > 5.5)] 154 | x = Z[:, 0] 155 | y = Z[:, 1] 156 | z = Z[:, 2] 157 | labels, event_count, k_cls = clustering(Z, cluster, distance=distance, min_samples=min_samples) 158 | outlier_data, noise_data = get_outlier_data(data_all, labels, cluster, event_count) 159 | if save_fig: 160 | noise_data.to_csv(ftag + '_noise.csv') 161 | outlier_data.to_csv(ftag + '_outliers.csv') 162 | print(event_count.T) 163 | 164 | if plot_events_based: 165 | plot_data(x, y, None, 3, cls, num_events, save_fig, 'first three Eigenvalues' + '_' + ftag) 166 | plot_data(x, y, None, 2, cls, num_events, save_fig, 'x-y' + '_' + ftag) 167 | plot_data(x, z, None, 2, cls, num_events, save_fig, 'x-z' + '_' + ftag) 168 | plot_data(y, z, None, 2, cls, num_events, save_fig, 'y-z' + '_' + ftag) 169 | 170 | if plot_cluster_based: 171 | plot_data(x, y, z, 3, labels, k_cls, save_fig, ftag + '_outliers_3d') 172 | plot_data(x, y, None, 2, labels, k_cls, save_fig, ftag + '_outliers_XY') 173 | plot_data(y, z, None, 2, labels, k_cls, save_fig, ftag + '_outliers_YZ') 174 | plot_data(x, z, None, 2, labels, k_cls, save_fig, ftag + '_outliers_XZ') 175 | 176 | show_plots(save_fig) 177 | -------------------------------------------------------------------------------- /procfunc.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import PCA 2 | import os, os.path 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import matplotlib.cm as cm 7 | from mpl_toolkits.mplot3d import Axes3D 8 | from sklearn.preprocessing import StandardScaler 9 | from utils_laj import movingavg 10 | from utils_laj import cache 11 | from sklearn.cluster import KMeans 12 | from sklearn.cluster import DBSCAN 13 | 14 | 15 | def plot_data(x, y, z, dim, labels_array, num_classes, save_fig, image_name): 16 | """ 17 | Scatter plot for data distribution 18 | 19 | :param z: 20 | :param y: 21 | :param x: 22 | :param dim: dim=3 or dim=2, indicate the dimentions of the plot / type(dim)=int 23 | :param labels_array: an numpy array, which indicate the label for each and every data eg: [0 0 1 ... 
2 2 2] / type(labels_array)=np.array 24 | :param num_classes: number of classes or clusters that the data belong to / type(num_classes)=int 25 | :param save_fig: boolean value, save figure if True / type(save_fig)=bool 26 | :param image_name: string variable for the plot title and the file name in case it is saved / type(image_name)=string 27 | 28 | """ 29 | 30 | cmap = cm.get_cmap('gist_ncar', num_classes) 31 | if dim == 3: 32 | fig = plt.figure() 33 | ax = Axes3D(fig) 34 | sc = ax.scatter(x, y, z, c=labels_array.astype(float), edgecolor='k', cmap=cmap) 35 | plt.ylabel('y') 36 | plt.xlabel('x') 37 | plt.colorbar(sc, ticks=range(0, num_classes)) 38 | plt.title(image_name) 39 | if save_fig: 40 | fig.savefig(image_name + '.png', bbox_inches='tight') 41 | plt.close(fig) 42 | elif dim == 2: 43 | fig = plt.figure() 44 | sc = plt.scatter(x, y, c=labels_array.astype(float), edgecolor='k', cmap=cmap) 45 | plt.ylabel('y') 46 | plt.xlabel('x') 47 | plt.colorbar(sc, ticks=range(0, num_classes)) 48 | plt.title(image_name) 49 | if save_fig: 50 | fig.savefig(image_name + '.png', bbox_inches='tight') 51 | plt.close(fig) 52 | else: 53 | raise ValueError("invalid dimension, valid values are dim=2 or dim=3") 54 | 55 | 56 | def clustering(Z, cluster_method, n_clusters=0, distance=0, min_samples=0): 57 | """ 58 | Clustering function; supports only kmeans and dbscan 59 | Eg: 60 | labels : [0 0 1 ... 2 2 2] 61 | event_count : [[ -1 72] 62 | [ 0 398] 63 | [ 1 26] 64 | [ 2 2520]] 65 | k_cls : 4 66 | 67 | :param Z: the data to be clustered 68 | :param cluster_method: should be either 'kmeans' or 'dbscan' type(cluster_method)=string 69 | :param n_clusters: only for kmeans, type(n_clusters)=int 70 | :param distance: only for dbscan type(distance)=float 71 | :param min_samples: only for dbscan type(min_samples)=int 72 | :return: 'labels' represents the cluster label for each point in 'Z' based on the 'cluster_method' 73 | 'event_count[:,0]' column represents the different cluster labels calculated by 'cluster_method' 74 | 'k_cls' total number of different clusters that exist in 'Z' 75 | """ 76 | 77 | if cluster_method == 'kmeans': 78 | if isinstance(n_clusters, int) and n_clusters > 0: 79 | # kmeans = KMeans(n_clusters=k_cls,init='k-means++',n_init=k_cls) 80 | kmeans = KMeans(n_clusters=n_clusters) 81 | kmeans.fit(Z) 82 | labels = kmeans.labels_ 83 | else: 84 | raise ValueError("invalid 'n_clusters'") 85 | elif cluster_method == 'dbscan': 86 | if isinstance(min_samples, int) and min_samples > 0 and distance > 0: 87 | np.random.seed(42) 88 | db = DBSCAN(eps=distance, min_samples=min_samples).fit(Z) 89 | labels = db.labels_ 90 | else: 91 | raise ValueError("invalid 'distance' or 'min_samples'") 92 | else: 93 | raise ValueError("Undefined cluster method, valid values are cluster_method='kmeans' or cluster_method='dbscan'") 94 | 95 | unique, counts = np.unique(labels, return_counts=True) 96 | event_count = np.asarray((unique, counts)).T 97 | k_cls = len(unique) 98 | return labels, event_count, k_cls 99 | 100 | 101 | def get_outlier_data(data_all, labels, cluster_method, event_count): 102 | """ 103 | Hypothesis: the outliers form the cluster that contains the smallest number of data points 104 | Since this implementation focuses on removing anomalies, and considering our data distribution, 105 | this hypothesis has been almost accurate so far 106 | 107 | for dbscan, the noise label (-1) and the label which belongs to the smallest cluster are identified separately 108 | 109 | :param data_all: raw data 110 | :param labels: cluster labels for each point in the raw data, based on the 'cluster_method' 111 | :param cluster_method: should be either 'kmeans' or 'dbscan' type(cluster_method)=string 112 | :param event_count: an array containing the number of data points for each cluster label type(event_count)=np.array, [None,2] 113 | :return: outlier data (and, for dbscan, noise data) as slices of 'data_all' 114 | """ 115 | 116 | if cluster_method == 'kmeans': 117 | smallest_cluster_label = np.argmin(event_count[:, 1]) 118 | outliers_indexes = np.where(labels == smallest_cluster_label)[0] 119 | print('number of outlier datapoints : ', len(outliers_indexes), '\noutlier label : ', smallest_cluster_label) 120 | return data_all.iloc[outliers_indexes] 121 | elif cluster_method == 'dbscan': 122 | smallest_cluster_label = np.argmin(event_count[1:, 1]) 123 | outliers_indexes = np.where(labels == smallest_cluster_label)[0] 124 | print('number of outlier datapoints : ', len(outliers_indexes), '\noutlier label : ', smallest_cluster_label) 125 | noise_indexes = np.where(labels == -1)[0] 126 | # print('number of noise indexes : ', len(noise_indexes)) 127 | return data_all.iloc[outliers_indexes], data_all.iloc[noise_indexes] 128 | else: 129 | raise ValueError("Undefined cluster method, valid values are cluster_method='kmeans' or cluster_method='dbscan'") 130 | 131 | 132 | def get_pca_components(Z): 133 | """ 134 | 135 | :param Z: accepts normalized data, type(Z)=np.array shape(Z)=[n_samples, n_features] 136 | :return: the PCA-transformed data and its first three components (x, y, and z) 137 | """ 138 | pca = PCA(n_components=3) 139 | Z = pca.fit_transform(Z) 140 | x = Z[:, 0] 141 | y = Z[:, 1] 142 | z = Z[:, 2] 143 | return Z, x, y, z 144 | 145 | 146 | def eigenvalue_analysis(data_std, save_fig, image_name): 147 | """ 148 | plot covariance and correlation graphs 149 | plot cumulative explained variance and individual explained variance of the eigenvalues 150 | 151 | :param data_std: data should be standardized type(data_std)=np.array, shape(data_std)=[n_samples, n_features] 152 | :param save_fig: whether to save the plot type(save_fig)=bool 153 | :param image_name: plot title and the file name used in case save_fig=True 154 | :return: 155 | """ 156 | ############### covariance/correlation ######## 157 | cov_mat = np.cov(data_std.T) 158 | # mean_vec = np.mean(data_ma_std, axis=0) 159 | # cov_mat = (data_ma_std - mean_vec).T.dot((data_ma_std - mean_vec)) / (data_ma_std.shape[0]-1) 160 | plt.figure() 161 | plt.imshow(cov_mat, label='covariance of data') 162 | plt.title('covariance of data') 163 | corr_mat = np.corrcoef(data_std.T) 164 | plt.figure() 165 | plt.imshow(corr_mat, label='correlation of data') 166 | plt.title('correlation of data') 167 | ############################################## 168 | 169 | ############### eigenvalues ################## 170 | eig_vals, eig_vecs = np.linalg.eig(cov_mat) 171 | for ev in eig_vecs: 172 | np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev)) 173 | print('\nEigenvalues \n%s' % eig_vals) 174 | 175 | tot = sum(eig_vals) 176 | var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)] 177 | cum_var_exp = np.cumsum(var_exp) 178 | with plt.style.context('seaborn-whitegrid'): 179 | fig = plt.figure() 180 | plt.bar(range(data_std.shape[1]), var_exp, alpha=0.5, align='center', 181 | label='individual explained variance') 182 | plt.step(range(data_std.shape[1]), cum_var_exp, where='mid', 183 | label='cumulative explained variance') 184 | plt.ylabel('Explained variance ratio') 185 | plt.xlabel('Principal components') 186 | 
plt.legend(loc='best') 187 | fig.savefig(image_name + '.png', bbox_inches='tight') 188 | plt.tight_layout() 189 | plt.show() 190 | ############################################## 191 | 192 | 193 | def show_plots(save_fig): 194 | if not save_fig: 195 | plt.show() 196 | -------------------------------------------------------------------------------- /screenshots/current_work.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LahiruJayasinghe/machine-failure-detection/e982c55bf675b5d80f715ab3105fb0a4ab445d9f/screenshots/current_work.PNG -------------------------------------------------------------------------------- /screenshots/dataset_structure.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LahiruJayasinghe/machine-failure-detection/e982c55bf675b5d80f715ab3105fb0a4ab445d9f/screenshots/dataset_structure.PNG -------------------------------------------------------------------------------- /screenshots/results_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LahiruJayasinghe/machine-failure-detection/e982c55bf675b5d80f715ab3105fb0a4ab445d9f/screenshots/results_1.PNG -------------------------------------------------------------------------------- /screenshots/results_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LahiruJayasinghe/machine-failure-detection/e982c55bf675b5d80f715ab3105fb0a4ab445d9f/screenshots/results_2.PNG -------------------------------------------------------------------------------- /utils_laj.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | 5 | def cache(cache_path,obj=0): 6 | if os.path.exists(cache_path) and obj==0: 7 | with open(cache_path, mode='rb') as file: 8 | obj = pickle.load(file) 9 | print("- Data loaded from cache-file: " + cache_path) 10 | else: 11 | with open(cache_path, mode='wb') as file: 12 | pickle.dump(obj, file) 13 | print("- Data saved to cache-file: " + cache_path) 14 | 15 | return obj 16 | 17 | def movingavg(data,window): #[n_samples, n_features] 18 | data = np.transpose(data) 19 | if data.ndim > 1 : 20 | tmp = [] 21 | for i in range(data.shape[0]): 22 | ma = movingavg(np.squeeze(data[i]), window) 23 | tmp.append(ma) 24 | smas = np.array(tmp) 25 | else : 26 | w = np.repeat(1.0,window)/window 27 | smas = np.convolve(data,w,'valid') 28 | smas = np.transpose(smas) 29 | return smas #[n_samples, n_features] --------------------------------------------------------------------------------
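
The commented-out preprocessing steps in main.py rely on the two helpers above, cache and movingavg. As a rough orientation, the sketch below shows how they are typically chained on one synchronized CSV produced by DataSeperation.py. The CSV and pickle names are assumptions derived from the settings in main.py (month='Nov', machine='192_168_28_28', discrete sensors), so adjust them to your own folder layout; cache() writes the object on the first call and can later be called without the object argument to reload it from disk.

# Minimal preprocessing sketch (assumed file names, mirroring the commented-out steps in main.py)
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from utils_laj import cache, movingavg

data_all = pd.read_csv('synchronized_data/filtered_events_Nov_192_168_28_28.csv',
                       index_col='cf:timestamp')
values = data_all.values
cls = values[:, -1].astype(np.int32)          # last column is the machine-event class (used later for event-based plots)
features = values[:, :-1].astype(np.float64)  # remaining columns are the sensor readings

data_ma = movingavg(features, window=250)               # smooth each sensor column with a moving average
data_ma_std = StandardScaler().fit_transform(data_ma)   # standardize to zero mean and unit variance

# First run writes the pickle; later runs reload it with cache('pickle/Nov_192_168_28_28_ExcludeEvent_normalized.pkl')
data_ma_std = cache('pickle/Nov_192_168_28_28_ExcludeEvent_normalized.pkl', data_ma_std)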