├── Chapter_ANN ├── CCPP_FFNN.ipynb ├── CCPP_FFNN.py ├── CCPP_FFNN_gridSearch.ipynb ├── CCPP_data_explore.ipynb ├── CCPP_data_explore.py ├── Folds5x2_pp.xlsx ├── ccpp_FFNN_gridSearch.py ├── debutanizer_FFNN.ipynb ├── debutanizer_FFNN.py ├── debutanizer_PLS.ipynb ├── debutanizer_PLS.py ├── debutanizer_data.txt ├── debutanizer_dataExplore.ipynb ├── debutanizer_dataExplore.py ├── info.txt ├── kamyr-digester.csv ├── kamyr_data_FFNN_earlyStopping.ipynb ├── kamyr_data_FFNN_earlyStopping.py ├── quadratic_function_singleLayer.ipynb └── quadratic_function_singleLayer.py ├── Chapter_BestPractices ├── 3Way_Holdout_Method.ipynb ├── 3wayHoldout_Method.py ├── Centering_Scaling.ipynb ├── Centering_Scaling.py ├── Feature_Engineering_OneHotEncoding.ipynb ├── Feature_Engineering_OneHotEncoding.py ├── Feature_Engineering_quadraticFit.ipynb ├── Feature_Engineering_quadraticFit.py ├── GridSearchCV.ipynb ├── GridSearchCV.py ├── Holdout_Method.ipynb ├── Holdout_Method.py ├── Pipeline_quadraticFit.ipynb ├── Pipeline_quadraticFit.py ├── Regularization.ipynb ├── ValidationCurve.ipynb ├── ValidationCurve.py ├── info.txt ├── kFold_CrossValidation.ipynb ├── kFold_CrossValidation.py ├── quadratic_raw_data.csv └── regularization.py ├── Chapter_Clustering_GMM ├── DBSCAN_clustering.ipynb ├── DBSCAN_clustering.py ├── DBSCAN_illustration.ipynb ├── DBSCAN_illustration.py ├── Etch_data_explore.ipynb ├── Etch_data_explore.py ├── GMM_clustering.ipynb ├── GMM_clustering.py ├── GMM_illustration.ipynb ├── GMM_illustration.py ├── MACHINE_Data.mat ├── Metal_etch_complete_data_visualize.ipynb ├── Metal_etch_complete_data_visualize.py ├── ProcessMonitoring_GMM.ipynb ├── ProcessMonitoring_GMM.py ├── k_means_clustering.ipynb ├── k_means_clustering.py ├── k_means_failure.ipynb └── k_means_failure.py ├── Chapter_DecisionTrees_EnsembleLearning ├── Bagging_illustration.ipynb ├── Bagging_illustration.py ├── DT_illustration.ipynb ├── DT_illustration.py ├── RF_illustration.ipynb ├── RF_illustration.py ├── SoftSensing_ConcreteStrength_PLS.ipynb ├── SoftSensing_ConcreteStrength_PLS.py ├── SoftSensing_ConcreteStrength_RF.ipynb ├── SoftSensing_ConcreteStrength_RF.py ├── SoftSensing_WastewaterPlant_PLS.ipynb ├── SoftSensing_WastewaterPlant_PLS.py ├── SoftSensing_WastewaterPlant_XGBoost.ipynb ├── SoftSensing_WastewaterPlant_XGBoost.py ├── cement_strength.txt ├── info.txt ├── water-treatment.data └── water-treatment.names ├── Chapter_LatentVariable1 ├── DimensionalityReduction.ipynb ├── DimensionalityReduction.py ├── DynamicPCA.ipynb ├── DynamicPCA.py ├── KernelPCA.ipynb ├── KernelPCA.py ├── LDPE.csv ├── ProcessMonitoring_PCA.ipynb ├── ProcessMonitoring_PCA.py ├── ProcessMonitoring_PLS.ipynb ├── ProcessMonitoring_PLS.py ├── SoftSensor_PLS.ipynb ├── kamyr-digester.csv ├── proc1a.xls └── softSensor_PLS.py ├── Chapter_LatentVariable2 ├── DimensionalityReduction_FDA.ipynb ├── DimensionalityReduction_FDA.py ├── DimensionalityReduction_ICA.ipynb ├── DimensionalityReduction_ICA.py ├── FDA_illustration.ipynb ├── FDA_illustration.py ├── FaultClassification_FDA.ipynb ├── FaultClassification_FDA.py ├── ICA_illustration.ipynb ├── ICA_illustration.py ├── ProcessMonitoring_ICA.ipynb ├── ProcessMonitoring_ICA.py ├── ProcessMonitoring_PCA.ipynb ├── ProcessMonitoring_PCA.py ├── TEP_data_explore.ipynb ├── TE_processData_explore.py ├── d00.dat ├── d00_te.dat ├── d01.dat ├── d01_te.dat ├── d02.dat ├── d02_te.dat ├── d03.dat ├── d03_te.dat ├── d04.dat ├── d04_te.dat ├── d05.dat ├── d05_te.dat ├── d06.dat ├── d06_te.dat ├── d07.dat ├── d07_te.dat ├── d08.dat ├── 
d08_te.dat ├── d09.dat ├── d09_te.dat ├── d10.dat ├── d10_te.dat ├── d11.dat ├── d11_te.dat ├── d12.dat ├── d12_te.dat ├── d13.dat ├── d13_te.dat ├── d14.dat ├── d14_te.dat ├── d15.dat ├── d15_te.dat ├── d16.dat ├── d16_te.dat ├── d17.dat ├── d17_te.dat ├── d18.dat ├── d18_te.dat ├── d19.dat ├── d19_te.dat ├── d20.dat ├── d20_te.dat ├── d21.dat ├── d21_te.dat └── info.txt ├── Chapter_OtherUsefulMethods ├── FD-kNN.ipynb ├── FD-kNN.py ├── KDE_ControlLimits_for_ICAmonitoring.ipynb ├── KDE_ControlLimits_for_ICAmonitoring.py ├── KDE_GridSearchCV.ipynb ├── KDE_GridSearchCV.py └── info.txt ├── Chapter_Preprocessing ├── EmbeddedMethods_Lasso.py ├── Embedded_Method_Lasso.ipynb ├── MLR_VSdata.py ├── Missing_data_imputation.ipynb ├── Missing_data_imputation.py ├── MultivariateLinearRegression_VSdata.ipynb ├── Multivariate_Outliers_MCD.ipynb ├── Multivariate_Outliers_MahalanobisDistance.ipynb ├── Multivariate_outliers_MCD.py ├── Multivariate_outliers_Mahalanobis_distance.py ├── Univariate_Outliers.ipynb ├── Univariate_Outliers.py ├── VSdata.csv ├── VSdata_val.csv ├── WrapperMethods_backward_SFS.py ├── Wrapper_Methods_backward_SFS.ipynb ├── complex2D_outlier.csv ├── deNoising_process_signals.py ├── denoising_process_signals.ipynb ├── filterMethods.py ├── filter_Methods.ipynb ├── info.txt ├── noisy_flow_signal.csv └── simple2D_outlier.csv ├── Chapter_RNN ├── AircraftEngine_dataExploration.ipynb ├── AircraftEngine_dataExplore.py ├── PM_test.txt ├── PM_train.txt ├── PM_truth.txt ├── SISO_Heater_system_RNN.ipynb ├── SISO_Heater_system_RNN.py ├── TCLab_test_data.txt ├── TCLab_train_data.txt ├── TEP_dataExploration.ipynb ├── TEP_dataExploration.py ├── TEP_faultClassification_RNN.ipynb ├── TEPclassification_RNN.py ├── info.txt ├── predictiveMaint_Regression_RNN.py ├── predictiveMaint_binaryClassification_RNN.ipynb ├── predictiveMaint_binaryClassification_RNN.py └── predictiveMaint_regression_RNN.ipynb ├── Chapter_ReinforcementLearning ├── RL_agent_train_test.ipynb ├── RL_agent_train_test.py ├── Tank_Environment.py ├── actor_saved │ ├── keras_metadata.pb │ ├── saved_model.pb │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index ├── disturbance_200.csv └── info.txt ├── Chapter_ScriptingEnvironment ├── NumpyBasics.ipynb ├── NumpyBasics.py ├── PandasBasics.ipynb ├── PandasBasics.py ├── PythonBasics.ipynb ├── PythonBasics.py ├── info.txt ├── quadratic_raw_data.csv ├── typicalML_script.ipynb └── typicalML_script.py ├── Chapter_SupportVectorMachines ├── Metal_etch_2DPCA_testData.csv ├── Metal_etch_2DPCA_trainingData.csv ├── SVDD_FaultDetection.ipynb ├── SVDD_FaultDetection.py ├── SVDD_OneClassClassification.ipynb ├── SVDD_OneClassClassification.py ├── SVDD_toyDataset.csv ├── SVM_BinaryClassification.ipynb ├── SVM_BinaryClassification.py ├── SVM_Kernel_BinaryClassification.ipynb ├── SVM_Kernel_BinaryClassification.py ├── SVM_Kernel_BinaryClassification_noGridSearch.py ├── SVM_SoftMarginClassification.ipynb ├── SVM_SoftMarginClassification.py ├── SVR_illustration.ipynb ├── SVR_illustration.py ├── debutanizer_Softsensing_PLS.ipynb ├── debutanizer_Softsensing_PLS.py ├── debutanizer_Softsensing_SVR.ipynb ├── debutanizer_Softsensing_SVR.py ├── debutanizer_data.txt ├── info.txt ├── polymer.dat ├── polymerPlantData_Softsensing_PLS.ipynb ├── polymerPlantData_Softsensing_PLS.py ├── polymerPlantData_Softsensing_SVR.ipynb ├── polymerPlantData_Softsensing_SVR.py ├── toyDataset.csv └── toyDataset2.csv ├── Chapter_WebDeployment ├── FDD.py ├── FDD_withHTML.py ├── FDD_withoutHTML.py ├── 
PCAmetrics_history.pickle ├── PCAmodelData.pickle ├── ProcessMonitoring_PCA.py ├── contributionPlot.png ├── frontEndTemplate.html ├── helloWorld.py ├── info.txt ├── metricPlot.png ├── proc1a.xlsx ├── processLatestDatabase_local.csv └── sample.html ├── Images ├── Book3_coverPage.JPG └── ML-for-PSE-2023Edition-CoverPage.JPG ├── LICENSE └── README.md /Chapter_ANN/CCPP_FFNN.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN modeling of CCPP 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np, pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values 11 | X = data[:,0:4] 12 | y = data[:,4][:,np.newaxis] 13 | 14 | #%% separate train and test data 15 | from sklearn.model_selection import train_test_split 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 17 | 18 | #%% scale data 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | X_scaler = StandardScaler() 22 | X_train_scaled = X_scaler.fit_transform(X_train) 23 | X_test_scaled = X_scaler.transform(X_test) 24 | 25 | y_scaler = StandardScaler() 26 | y_train_scaled = y_scaler.fit_transform(y_train) 27 | y_test_scaled = y_scaler.transform(y_test) 28 | 29 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 30 | ## Define & Fit FFNN model 31 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 32 | 33 | #%% import Keras libraries 34 | from tensorflow.keras import Sequential 35 | from tensorflow.keras.layers import Dense 36 | 37 | #%% define model 38 | model = Sequential() 39 | model.add(Dense(8, activation='relu', kernel_initializer='he_normal', input_shape=(4,))) # 8 neurons in 1st hidden layer; this hidden layer accepts data from a 4 dimensional input 40 | model.add(Dense(5, activation='relu', kernel_initializer='he_normal')) # 5 neurons in 2nd layer 41 | model.add(Dense(1)) # output layer 42 | 43 | #%% compile model 44 | model.compile(loss='mse', optimizer='Adam') # mean-squared error is to be minimized 45 | 46 | #%% fit model 47 | model.fit(X_train_scaled, y_train_scaled, epochs=25, batch_size=50) 48 | 49 | #%% predict y_test 50 | y_test_scaled_pred = model.predict(X_test_scaled) 51 | y_test_pred = y_scaler.inverse_transform(y_test_scaled_pred) 52 | 53 | plt.figure() 54 | plt.plot(y_test, y_test_pred, '*') 55 | plt.xlabel('y_test') 56 | plt.ylabel('y_test_pred') 57 | 58 | #%% metrics 59 | from sklearn.metrics import r2_score 60 | print('R2:', r2_score(y_test, y_test_pred)) 61 | 62 | #%% model summary 63 | model.summary() -------------------------------------------------------------------------------- /Chapter_ANN/CCPP_data_explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Feedforward Neural Networks\n", 8 | "\n", 9 | "\n", 10 | "# Topic: Combined Cycle Power Plant data exploration" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# import required packages\n", 20 | "import numpy as np, pandas as pd\n", 21 | "import matplotlib.pyplot as plt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 
| "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# read data\n", 31 | "data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values\n", 32 | "X = data[:,0:4]\n", 33 | "y = data[:,4][:,np.newaxis]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "#%% plot input vs output for each input\n", 43 | "plt.figure()\n", 44 | "plt.plot(X[:,0], y, '*')\n", 45 | "plt.title('AT vs EP')\n", 46 | "\n", 47 | "plt.figure()\n", 48 | "plt.plot(X[:,1], y, '*')\n", 49 | "plt.title('V vs EP')\n", 50 | "\n", 51 | "plt.figure()\n", 52 | "plt.plot(X[:,2], y, '*')\n", 53 | "plt.title('AP vs EP')\n", 54 | "\n", 55 | "plt.figure()\n", 56 | "plt.plot(X[:,3], y, '*')\n", 57 | "plt.title('RH vs EP')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.7.4" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /Chapter_ANN/CCPP_data_explore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## exploration of CCPP data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np, pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values 11 | X = data[:,0:4] 12 | y = data[:,4][:,np.newaxis] 13 | 14 | #%% plot input vs output for each input 15 | plt.figure() 16 | plt.plot(X[:,0], y, '*') 17 | plt.title('AT vs EP') 18 | 19 | plt.figure() 20 | plt.plot(X[:,1], y, '*') 21 | plt.title('V vs EP') 22 | 23 | plt.figure() 24 | plt.plot(X[:,2], y, '*') 25 | plt.title('AP vs EP') 26 | 27 | plt.figure() 28 | plt.plot(X[:,3], y, '*') 29 | plt.title('RH vs EP') -------------------------------------------------------------------------------- /Chapter_ANN/Folds5x2_pp.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ANN/Folds5x2_pp.xlsx -------------------------------------------------------------------------------- /Chapter_ANN/ccpp_FFNN_gridSearch.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## grid search-based FFNN model for ccpp data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np, pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values 11 | X = data[:,0:4] 12 | y = data[:,4][:,np.newaxis] 13 | 14 | #%% separate training, validation, test data 15 | from 
sklearn.model_selection import train_test_split 16 | 17 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 18 | X_est, X_val, y_est, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state = 100) 19 | 20 | #%% scale data 21 | from sklearn.preprocessing import StandardScaler 22 | 23 | X_scaler = StandardScaler() 24 | X_est_scaled = X_scaler.fit_transform(X_est) 25 | X_val_scaled = X_scaler.transform(X_val) 26 | X_test_scaled = X_scaler.transform(X_test) 27 | 28 | y_scaler = StandardScaler() 29 | y_est_scaled = y_scaler.fit_transform(y_est) 30 | y_val_scaled = y_scaler.transform(y_val) 31 | y_test_scaled = y_scaler.transform(y_test) 32 | 33 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 34 | ## Define FFNN model 35 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 36 | 37 | #%% import packages 38 | from tensorflow.keras import Sequential 39 | from tensorflow.keras.layers import Dense 40 | from tensorflow.keras import regularizers 41 | from tensorflow.keras.optimizers import Adam 42 | 43 | #%% model function 44 | def FFNN_model(hidden_layers, layer_size, regularizationValue, learningRate): 45 | model = Sequential() 46 | model.add(Dense(layer_size, kernel_regularizer=regularizers.L1(regularizationValue), activation='relu', kernel_initializer='he_normal', input_shape=(4,))) 47 | 48 | for _ in range(hidden_layers-1): 49 | model.add(Dense(layer_size, kernel_regularizer=regularizers.L1(regularizationValue), activation='relu', kernel_initializer='he_normal')) 50 | 51 | model.add(Dense(1)) 52 | model.compile(loss='mse', optimizer=Adam(learning_rate=learningRate)) 53 | 54 | return model 55 | 56 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 57 | ## KerasRegressor wrapper and gridSearchCV 58 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | 60 | #%% KerasRegressor 61 | from tensorflow.keras.wrappers.scikit_learn import KerasRegressor 62 | model = KerasRegressor(build_fn=FFNN_model, epochs=25, batch_size=50) 63 | 64 | #%% gridSearchCV 65 | from sklearn.model_selection import GridSearchCV 66 | 67 | param_grid={ 68 | "hidden_layers":[1, 2], 69 | "layer_size":np.arange(1,10), 70 | "regularizationValue": [0.001, 0.01, 0.1], 71 | "learningRate":[0.05, 0.01, 0.1] 72 | } 73 | 74 | grid_searchcv = GridSearchCV(model, param_grid) 75 | grid_searchcv.fit(X_est_scaled, y_est_scaled, validation_data=(X_val_scaled, y_val_scaled)) 76 | 77 | print("The best parameters obtained are:", grid_searchcv.best_params_) 78 | 79 | #%% best model 80 | model = grid_searchcv.best_estimator_.model 81 | 82 | #%% predict y_test 83 | y_test_scaled_pred = model.predict(X_test_scaled) 84 | y_test_pred = y_scaler.inverse_transform(y_test_scaled_pred) 85 | 86 | plt.figure() 87 | plt.plot(y_test, y_test_pred, '*') 88 | plt.xlabel('y_test') 89 | plt.ylabel('y_test_pred') 90 | 91 | #%% metrics 92 | from sklearn.metrics import r2_score 93 | print('R2:', r2_score(y_test, y_test_pred)) 94 | 95 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 96 | ## save model 97 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | model.save('CCPP_FFNN_bestModel') -------------------------------------------------------------------------------- /Chapter_ANN/debutanizer_FFNN.py: -------------------------------------------------------------------------------- 1 | 
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% random number seed for result reproducibility 10 | from numpy.random import seed 11 | seed(1) 12 | import tensorflow 13 | tensorflow.random.set_seed(2) 14 | 15 | #%% read data 16 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 17 | 18 | #%% separate training, validation, and test data 19 | from sklearn.model_selection import train_test_split 20 | X = data[:,0:-1] 21 | y = data[:,-1][:,np.newaxis] 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 23 | X_est, X_val, y_est, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 100) 24 | 25 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 26 | ## Fit FFNN model 27 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | 29 | #%% import packages 30 | from tensorflow.keras import Sequential 31 | from tensorflow.keras.layers import Dense 32 | from tensorflow.keras import regularizers 33 | from tensorflow.keras.callbacks import EarlyStopping 34 | from tensorflow.keras.optimizers import Adam 35 | 36 | #%% define model 37 | model = Sequential() 38 | model.add(Dense(60, kernel_regularizer=regularizers.L1(0.0000001), activation='relu', kernel_initializer='he_normal', input_shape=(7,))) 39 | model.add(Dense(30, kernel_regularizer=regularizers.L1(0.0000001), activation='relu', kernel_initializer='he_normal')) 40 | model.add(Dense(1, kernel_regularizer=regularizers.L1(0.0000001))) 41 | 42 | #%% compile model 43 | model.compile(loss='mse', optimizer=Adam(learning_rate=0.005)) 44 | 45 | #%% fit model 46 | es = EarlyStopping(monitor='val_loss', patience=200) 47 | history = model.fit(X_est, y_est, epochs=2000, batch_size=32, validation_data=(X_val, y_val), callbacks=es) 48 | 49 | #%% plot validation curve 50 | plt.figure() 51 | plt.title('Validation Curves') 52 | plt.xlabel('Epoch') 53 | plt.ylabel('MSE') 54 | plt.plot(history.history['loss'], label='train') 55 | plt.plot(history.history['val_loss'], label='val') 56 | plt.legend() 57 | plt.grid() 58 | plt.show() 59 | 60 | #%% predict y 61 | y_test_pred = model.predict(X_test) 62 | y_val_pred = model.predict(X_val) 63 | y_est_pred = model.predict(X_est) 64 | 65 | #%% plots of raw and predicted data 66 | plt.figure() 67 | plt.plot(y_test, y_test_pred, '*') 68 | plt.xlabel('C4 content (test data)') 69 | plt.ylabel('C4 content (prediction)') 70 | 71 | plt.figure() 72 | plt.plot(y_test, 'b', label='Raw data') 73 | plt.plot(y_test_pred, 'r', label='FFNN prediction') 74 | plt.ylabel('C4 content (test data)') 75 | plt.xlabel('Sample #') 76 | plt.legend() 77 | 78 | #%% residuals 79 | plt.figure() 80 | plt.plot(y_test, y_test-y_test_pred, '*') 81 | plt.xlabel('C4 content test data') 82 | plt.ylabel('residual (raw data- prediction)') 83 | plt.title('residual plot') 84 | 85 | #%% metrics 86 | from sklearn.metrics import r2_score 87 | print('R2 for test dataset:', r2_score(y_test, y_test_pred)) 88 | print('R2 for validation dataset:', r2_score(y_val, y_val_pred)) 89 | print('R2 for estimation dataset:', r2_score(y_est, y_est_pred)) -------------------------------------------------------------------------------- /Chapter_ANN/debutanizer_PLS.py:
-------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% separate train and test data 13 | from sklearn.model_selection import train_test_split 14 | X = data[:,0:-1] 15 | y = data[:,-1][:,np.newaxis] 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 17 | 18 | #%% scale data 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | X_scaler = StandardScaler() 22 | X_train_normal = X_scaler.fit_transform(X_train) 23 | X_test_normal = X_scaler.transform(X_test) 24 | 25 | y_scaler = StandardScaler() 26 | y_train_normal = y_scaler.fit_transform(y_train) 27 | y_test_normal = y_scaler.transform(y_test) 28 | 29 | #%% Finding # latents using kFold cross validation 30 | from sklearn.model_selection import KFold 31 | from sklearn.metrics import mean_squared_error 32 | from sklearn.cross_decomposition import PLSRegression 33 | 34 | scaler = StandardScaler() 35 | 36 | fit_MSE = [] 37 | validate_MSE = [] 38 | for n_comp in range(1,8): 39 | local_fit_MSE = [] 40 | local_validate_MSE = [] 41 | 42 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 43 | for fit_index, validate_index in kfold.split(y_train): 44 | X_fit_normal = scaler.fit_transform(X_train[fit_index]) 45 | X_validate_normal = scaler.transform(X_train[validate_index]) 46 | 47 | y_fit_normal = scaler.fit_transform(y_train[fit_index]) 48 | y_validate_normal = scaler.transform(y_train[validate_index]) 49 | 50 | pls = PLSRegression(n_components = n_comp) 51 | pls.fit(X_fit_normal, y_fit_normal) 52 | 53 | local_fit_MSE.append(mean_squared_error(y_fit_normal, pls.predict(X_fit_normal))) 54 | local_validate_MSE.append(mean_squared_error(y_validate_normal, 55 | pls.predict(X_validate_normal))) 56 | 57 | fit_MSE.append(np.mean(local_fit_MSE)) 58 | validate_MSE.append(np.mean(local_validate_MSE)) 59 | 60 | 61 | # plot 62 | plt.figure() 63 | plt.plot(range(1,8), fit_MSE, 'b*', label = 'Training MSE') 64 | plt.plot(range(1,8), validate_MSE, 'r*', label = 'Validation MSE') 65 | plt.xticks(range(1,8)) 66 | plt.ylabel('Mean Squared Error (MSE)') 67 | plt.xlabel('# of latents') 68 | plt.legend() 69 | 70 | #%% build PLS model 71 | pls = PLSRegression(n_components = 5) 72 | pls.fit(X_train_normal, y_train_normal) 73 | 74 | #%% check training vs test accuracy 75 | print('Accuracy over training data: ', pls.score(X_train_normal, y_train_normal)) 76 | print('Accuracy over test data: ', pls.score(X_test_normal, y_test_normal)) 77 | 78 | #%% plots of raw and predicted data 79 | y_train_normal_predict = pls.predict(X_train_normal) 80 | y_test_normal_predict = pls.predict(X_test_normal) 81 | 82 | y_train_predict = y_scaler.inverse_transform(y_train_normal_predict) 83 | y_test_predict = y_scaler.inverse_transform(y_test_normal_predict) 84 | 85 | 86 | plt.figure() 87 | plt.plot(y_train, 'b', label = 'Raw data') 88 | plt.plot(y_train_predict, 'r', label = 'PLS prediction') 89 | plt.ylabel('C4 content (training data)') 90 | plt.xlabel('Sample #') 91 | plt.legend() 92 | 93 | 94 | plt.figure() 95 | plt.plot(y_test, 'b', label = 'Raw data') 96 | plt.plot(y_test_predict, 'r', label = 'PLS 
prediction') 97 | plt.ylabel('C4 content (test data)') 98 | plt.xlabel('Sample #') 99 | plt.legend() 100 | 101 | plt.figure() 102 | plt.plot(y_test, y_test_predict, '*') 103 | plt.xlabel('C4 content (test data)') 104 | plt.ylabel('C4 content (prediction)') 105 | 106 | #%% residuals 107 | plt.figure() 108 | plt.plot(y_test, y_test-y_test_predict, '*') 109 | plt.xlabel('C4 content test data') 110 | plt.ylabel('residual (raw data- prediction)') 111 | plt.title('residual plot') -------------------------------------------------------------------------------- /Chapter_ANN/debutanizer_dataExplore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Exploration of debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% plot each variable 13 | plt.figure() 14 | plt.plot(data[:,0]) 15 | plt.ylabel('top Temperature') 16 | plt.xlabel('samples') 17 | plt.xlim((0,2500)) 18 | 19 | plt.figure() 20 | plt.plot(data[:,1]) 21 | plt.ylabel('top Pressure') 22 | plt.xlabel('samples') 23 | plt.xlim((0,2500)) 24 | 25 | plt.figure() 26 | plt.plot(data[:,2]) 27 | plt.ylabel('reflux flow') 28 | plt.xlabel('samples') 29 | plt.xlim((0,2500)) 30 | 31 | plt.figure() 32 | plt.plot(data[:,3]) 33 | plt.ylabel('flow to next process') 34 | plt.xlabel('samples') 35 | plt.xlim((0,2500)) 36 | 37 | plt.figure() 38 | plt.plot(data[:,4]) 39 | plt.ylabel('6th tray Temperature') 40 | plt.xlabel('samples') 41 | plt.xlim((0,2500)) 42 | 43 | plt.figure() 44 | plt.plot(data[:,5]) 45 | plt.ylabel('bottom Temperature 1') 46 | plt.xlabel('samples') 47 | plt.xlim((0,2500)) 48 | 49 | plt.figure() 50 | plt.plot(data[:,6]) 51 | plt.ylabel('bottom Temperature 2') 52 | plt.xlabel('samples') 53 | plt.xlim((0,2500)) 54 | 55 | plt.figure() 56 | plt.plot(data[:,7]) 57 | plt.ylabel('C4 content') 58 | plt.xlabel('samples') 59 | plt.xlim((0,2500)) -------------------------------------------------------------------------------- /Chapter_ANN/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ANN/kamyr_data_FFNN_earlyStopping.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN-based Soft Sensor for kamyr dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | 10 | #%% random number seed for result reproducibility 11 | from numpy.random import seed 12 | seed(10) 13 | import tensorflow 14 | tensorflow.random.set_seed(20) 15 | 16 | #%% fetch data 17 | data = pd.read_csv('kamyr-digester.csv', usecols = range(1,23)) 18 | 19 | #%% pre-process 20 | # find the # of nan entries in each column 21 | na_counts = data.isna().sum(axis = 0) 22 | 23 | # remove columns that have a lot of nan entries 24 | data_cleaned = data.drop(columns = ['AAWhiteSt-4 ','SulphidityL-4 ']) 25 | 26 | # remove any row that have any nan entry 27 | data_cleaned = data_cleaned.dropna(axis = 0) 28 | 29 | # separate X, y 30 | 
y = data_cleaned.iloc[:,0].values[:,np.newaxis] # StandardScaler requires 2D array 31 | X = data_cleaned.iloc[:,1:].values 32 | 33 | print('Number of samples left: ', X.shape[0]) 34 | 35 | #%% separate train and test data 36 | from sklearn.model_selection import train_test_split 37 | 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 39 | X_est, X_val, y_est, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state = 100) 40 | 41 | #%% scale data 42 | from sklearn.preprocessing import StandardScaler 43 | 44 | X_scaler = StandardScaler() 45 | X_est_scaled = X_scaler.fit_transform(X_est) 46 | X_val_scaled = X_scaler.transform(X_val) 47 | X_test_scaled = X_scaler.transform(X_test) 48 | 49 | y_scaler = StandardScaler() 50 | y_est_scaled = y_scaler.fit_transform(y_est) 51 | y_val_scaled = y_scaler.transform(y_val) 52 | y_test_scaled = y_scaler.transform(y_test) 53 | 54 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | ## Define & Fit FFNN model without early stopping 56 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 57 | 58 | #%% import packages 59 | from tensorflow.keras import Sequential 60 | from tensorflow.keras.layers import Dense 61 | 62 | #%% define model 63 | def FFNN_model(): 64 | model = Sequential() 65 | model.add(Dense(20, activation='tanh', kernel_initializer='he_normal', input_shape=(19,))) 66 | model.add(Dense(5, activation='tanh', kernel_initializer='he_normal')) 67 | model.add(Dense(1)) 68 | model.compile(loss='mse', optimizer='Adam') 69 | return model 70 | 71 | #%% fit model 72 | history = FFNN_model().fit(X_est_scaled, y_est_scaled, epochs=250, batch_size=32, validation_data=(X_val_scaled, y_val_scaled)) 73 | 74 | #%% plot validation curve 75 | plt.figure() 76 | plt.title('Validation Curves') 77 | plt.xlabel('Epoch') 78 | plt.ylabel('MSE') 79 | plt.plot(history.history['loss'], label='training') 80 | plt.plot(history.history['val_loss'], label='validation') 81 | plt.legend() 82 | plt.grid() 83 | plt.show() 84 | 85 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 86 | ## Define & Fit FFNN model with early stopping 87 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 88 | 89 | #%% random number seed for result reproducibility 90 | from numpy.random import seed 91 | seed(10) 92 | import tensorflow 93 | tensorflow.random.set_seed(20) 94 | 95 | #%% fit model again with early stopping 96 | from tensorflow.keras.callbacks import EarlyStopping 97 | es = EarlyStopping(monitor='val_loss', patience=15) 98 | 99 | history = FFNN_model().fit(X_est_scaled, y_est_scaled, epochs=250, batch_size=32, validation_data=(X_val_scaled, y_val_scaled), callbacks=es) 100 | 101 | #%% plot validation curve 102 | plt.figure() 103 | plt.title('Validation Curves') 104 | plt.xlabel('Epoch') 105 | plt.ylabel('MSE') 106 | plt.plot(history.history['loss'], label='training') 107 | plt.plot(history.history['val_loss'], label='validation') 108 | plt.legend() 109 | plt.grid() 110 | plt.show() -------------------------------------------------------------------------------- /Chapter_ANN/quadratic_function_singleLayer.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN modeling of y = x*x 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import 
required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% random number seed for result reproducibility 10 | from numpy.random import seed 11 | seed(1) 12 | import tensorflow 13 | tensorflow.random.set_seed(2) 14 | 15 | #%% generate data 16 | x = np.linspace(-1,1,500) 17 | y = x*x 18 | plt.plot(x,y) 19 | 20 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | ## Define & Fit FFNN model 22 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | #%% import Keras libraries 25 | from tensorflow.keras import Sequential 26 | from tensorflow.keras.layers import Dense 27 | from tensorflow.keras.optimizers import Adam 28 | 29 | #%% define model 30 | n_nodes = 5 31 | 32 | model = Sequential() 33 | model.add(Dense(n_nodes, activation='relu', input_shape=(1,))) 34 | model.add(Dense(1)) 35 | 36 | #%% compile model 37 | model.compile(loss='mse', optimizer=Adam(learning_rate=0.05)) 38 | 39 | #%% fit model 40 | history = model.fit(x, y, epochs=400, batch_size=50) 41 | 42 | plt.figure() 43 | plt.xlabel('Epoch') 44 | plt.ylabel('MSE') 45 | plt.plot(history.history['loss'], label='train') 46 | plt.show() 47 | 48 | #%% predict y_test 49 | y_pred = model.predict(x) 50 | 51 | plt.figure() 52 | plt.plot(x, y, '--b', label='y=x^2') 53 | plt.plot(x, y_pred, '--r', label='Approximation') 54 | plt.xlabel('x') 55 | plt.title('y_pred vs y') 56 | plt.legend() 57 | 58 | plt.figure() 59 | plt.plot(y_pred, 'r') 60 | plt.title('y_pred') 61 | 62 | #%% metrics 63 | from sklearn.metrics import r2_score 64 | print('R2:', r2_score(y, y_pred)) 65 | 66 | #%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 67 | ## inner layer activations 68 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 69 | import tensorflow.keras.backend as K 70 | activations = [] 71 | for layer in model.layers: 72 | keras_function = K.function([model.input], [layer.output]) 73 | activations.append(keras_function(x)) 74 | 75 | #%% plot activations 76 | layer1_activations = activations[0][0] 77 | for node in range(n_nodes): 78 | plt.figure() 79 | plt.plot(x, layer1_activations[:,node]) 80 | plt.title('node ' + str(node+1) + ' activation') -------------------------------------------------------------------------------- /Chapter_BestPractices/3Way_Holdout_Method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Best Practices\n", 8 | "\n", 9 | "# Topic: 3Way Holdout Method" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "data = np.loadtxt('quadratic_raw_data.csv', delimiter=',')\n", 21 | "x = data[:,0,None]; y = data[:,1,None]" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# create pipeline for quadratic fit via linear model \n", 31 | "# import relevant classes\n", 32 | "from sklearn.pipeline import Pipeline\n", 33 | "from sklearn.preprocessing import PolynomialFeatures\n", 34 | "from sklearn.preprocessing import StandardScaler\n", 35 | "from sklearn.linear_model import LinearRegression\n", 36 | "\n", 37 | "# add transformers and estimators sequentially as list of tuples\n", 38 | "# the names ‘poly’, ‘scaler’, ‘model’ can be used 
to access the individual elements of pipeline later \n", 39 | "pipe = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),\n", 40 | " ('scaler', StandardScaler()),\n", 41 | " ('model', LinearRegression())])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Number of samples in fitting set: 14\n", 54 | "Number of samples in validation set: 6\n", 55 | "Number of samples in test set: 5\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# train-validate-test split\n", 61 | "from sklearn.model_selection import train_test_split\n", 62 | "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)\n", 63 | "x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=1)\n", 64 | "\n", 65 | "print('Number of samples in fitting set: ', x_fit.shape[0])\n", 66 | "print('Number of samples in validation set: ', x_val.shape[0])\n", 67 | "print('Number of samples in test set: ', x_test.shape[0])" 68 | ] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3 (ipykernel)", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.9.7" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /Chapter_BestPractices/3wayHoldout_Method.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Split dataset nto training, validation, and test sets 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | #%% read data 5 | import numpy as np 6 | data = np.loadtxt('quadratic_raw_data.csv', delimiter=',') 7 | x = data[:,0,None]; y = data[:,1,None] 8 | 9 | #%% create pipeline for quadratic fit via linear model 10 | # import relevant classes 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import PolynomialFeatures 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.linear_model import LinearRegression 15 | 16 | # add transformers and estimators sequentially as list of tuples 17 | # the names ‘poly’, ‘scaler’, ‘model’ can be used to access the individual elements of pipeline later 18 | pipe = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)), 19 | ('scaler', StandardScaler()), 20 | ('model', LinearRegression())]) 21 | 22 | #%% train-validate-test split 23 | from sklearn.model_selection import train_test_split 24 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) 25 | x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=1) 26 | 27 | print('Number of samples in fitting set: ', x_fit.shape[0]) 28 | print('Number of samples in validation set: ', x_val.shape[0]) 29 | print('Number of samples in test set: ', x_test.shape[0]) 30 | 31 | -------------------------------------------------------------------------------- /Chapter_BestPractices/Centering_Scaling.py: 
-------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Centering & Scaling 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% Standard scaling 6 | import numpy as np 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | X = np.array([[ 1000, 0.01, 300], 10 | [ 1200, 0.06, 350], 11 | [ 1500, 0.1, 320]]) 12 | scaler = StandardScaler().fit(X) # computes mean & std column-wise 13 | X_scaled = scaler.transform(X) # transform using computed mean and std 14 | 15 | # check mean = 0 and variance = 1 for every variable/column after scaling 16 | print(X_scaled.mean(axis=0)) # returns 1D array of shape (3,) 17 | print(X_scaled.std(axis=0)) # returns 1D array of shape (3,) 18 | 19 | # access mean and variance via object properties 20 | print(scaler.mean_) # returns 1D array of shape (3,) 21 | print(scaler.var_) # returns 1D array of shape (3,) 22 | 23 | #%% Normalization 24 | from sklearn.preprocessing import MinMaxScaler 25 | 26 | scaler = MinMaxScaler() # create object 27 | X_scaled = scaler.fit_transform(X) # fit & transform 28 | 29 | # check min = 0 and max = 1 for every variable/column after scaling 30 | print(X_scaled.min(axis=0)) 31 | print(X_scaled.max(axis=0)) 32 | 33 | # access min and max via object properties 34 | print(scaler.data_min_) 35 | print(scaler.data_max_) 36 | 37 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 38 | ## Robust Centering & Scaling 39 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 40 | 41 | #%% Generate outlier-infested data 42 | X = np.random.normal(40, 1, (1500,1)) 43 | X[200:300] = X[200:300] +8; X[1000:1150] = X[1000:1150] + 8 44 | 45 | # plot 46 | import matplotlib.pyplot as plt 47 | plt.plot(X, '.-') 48 | plt.xlabel('sample #'), plt.ylabel('variable measurement') 49 | plt.title('Raw measurements') 50 | 51 | #%% Transform via standard scaling 52 | scaler = StandardScaler().fit(X) 53 | X_scaled = scaler.transform(X) 54 | 55 | # mean and std 56 | print('Estimated mean = ', scaler.mean_[0]) 57 | print('Estimated standard deviation = ', np.sqrt(scaler.var_[0])) 58 | 59 | # plot 60 | plt.figure() 61 | plt.plot(X_scaled, '.-') 62 | plt.xlabel('sample #'), plt.ylabel('scaled variable measurement') 63 | plt.xlim((0,1500)) 64 | plt.title('Standard scaling') 65 | 66 | #%% Transform via robust MAD scaling 67 | # compute median and MAD 68 | from scipy import stats 69 | median = np.median(X) 70 | MAD = stats.median_absolute_deviation(X) 71 | 72 | # scale 73 | X_scaled = (X - median)/MAD[0] 74 | 75 | # median and MAD 76 | print('Estimated robust location = ', median) 77 | print('Estimated robust spread = ', MAD) 78 | 79 | # plot 80 | plt.figure() 81 | plt.plot(X_scaled, '.-') 82 | plt.xlabel('sample #'), plt.ylabel('scaled variable measurement') 83 | plt.xlim((0,1500)) 84 | plt.title('Robust MAD scaling') 85 | 86 | -------------------------------------------------------------------------------- /Chapter_BestPractices/Feature_Engineering_OneHotEncoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Best Practices\n", 8 | "\n", 9 | "# Topic: Feature Engineering (one-hot encoding)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 |
"name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "[[1. 0. 0.]\n", 22 | " [0. 0. 1.]\n", 23 | " [0. 1. 0.]\n", 24 | " [0. 0. 1.]]\n", 25 | "[array(['type A', 'type B', 'type C'], dtype='= 90) + 1 45 | score_train_reduced = score_train[:,0:n_comp] 46 | 47 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 48 | 49 | #%% confirm that only about 10% of original information is lost 50 | from sklearn.metrics import r2_score 51 | 52 | V_matrix = pca.components_.T 53 | P_matrix = V_matrix[:,0:n_comp] 54 | 55 | data_train_normal_reconstruct = np.dot(score_train_reduced, P_matrix.T) 56 | R2_score = r2_score(data_train_normal, data_train_normal_reconstruct) 57 | 58 | print('% information lost = ', 100*(1-R2_score)) 59 | 60 | #%% alternative approach 61 | pca = PCA(n_components = 0.9) 62 | score_train_reduced = pca.fit_transform(data_train_normal) 63 | 64 | data_train_normal_reconstruct = pca.inverse_transform(score_train_reduced) 65 | R2_score = r2_score(data_train_normal, data_train_normal_reconstruct) 66 | 67 | print('% information lost = ', 100*(1-R2_score)) 68 | 69 | #%% plot to compare original and reconstructed variables 70 | var = 32 71 | plt.figure() 72 | plt.plot(data_train_normal[:,var],label = 'Measured data') 73 | plt.plot(data_train_normal_reconstruct[:,var],label = 'Reconstructed data') 74 | plt.ylabel('Variable # '+ str(var)) 75 | plt.xlabel('sample #') 76 | plt.legend() -------------------------------------------------------------------------------- /Chapter_LatentVariable1/DynamicPCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Dimension Reduction and Latent Variable Methods (Part 1)\n", 8 | "\n", 9 | "\n", 10 | "# Topic: Dynamic PCA" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# import required packages\n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "from sklearn.preprocessing import StandardScaler\n", 23 | "from sklearn.decomposition import PCA" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# fetch data\n", 33 | "data = pd.read_excel('proc1a.xls', skiprows = 1,usecols = 'C:AI')\n", 34 | "\n", 35 | "# separate train data\n", 36 | "data_train = data.iloc[0:69,]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# augment training data\n", 46 | "lag = 5\n", 47 | "N = data_train.shape[0]\n", 48 | "m = data_train.shape[1]\n", 49 | "\n", 50 | "data_train_augmented = np.zeros((N-lag,(lag+1)*m))\n", 51 | "\n", 52 | "for sample in range(lag, N):\n", 53 | " dataBlock = data_train.iloc[sample-lag:sample+1,:].values # converting from pandas dataframe to numpy array\n", 54 | " data_train_augmented[sample-lag,:] = np.reshape(dataBlock, (1,-1), order = 'F')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# scale data\n", 64 | "scaler = StandardScaler()\n", 65 | "data_train_augmented_normal = scaler.fit_transform(data_train_augmented)\n", 66 | "\n", 67 | "# PCA\n", 68 | "pca = PCA()\n", 69 | "score_train = pca.fit_transform(data_train_augmented_normal)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | 
"execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.7.4" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /Chapter_LatentVariable1/DynamicPCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | """ 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.decomposition import PCA 10 | 11 | #%% fetch data 12 | data = pd.read_excel('proc1a.xls', skiprows = 1,usecols = 'C:AI') 13 | 14 | #%% separate train data 15 | data_train = data.iloc[0:69,] 16 | 17 | # %% augment training data 18 | lag = 5 19 | N = data_train.shape[0] 20 | m = data_train.shape[1] 21 | 22 | data_train_augmented = np.zeros((N-lag,(lag+1)*m)) 23 | 24 | for sample in range(lag, N): 25 | dataBlock = data_train.iloc[sample-lag:sample+1,:].values # converting from pandas dataframe to numpy array 26 | data_train_augmented[sample-lag,:] = np.reshape(dataBlock, (1,-1), order = 'F') 27 | 28 | #%% scale data 29 | scaler = StandardScaler() 30 | data_train_augmented_normal = scaler.fit_transform(data_train_augmented) 31 | 32 | #%% PCA 33 | pca = PCA() 34 | score_train = pca.fit_transform(data_train_augmented_normal) 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /Chapter_LatentVariable1/KernelPCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | """ 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.decomposition import KernelPCA 10 | 11 | #%% fetch data 12 | data = pd.read_excel('KPCA_example.xlsx') 13 | 14 | #%% scale data 15 | scaler = StandardScaler() 16 | data_train_normal = scaler.fit_transform(data) 17 | 18 | #%% PCA 19 | kpca = KernelPCA(kernel='rbf', gamma = 1) 20 | score = kpca.fit_transform(data) 21 | 22 | #%% visualize explained variance 23 | import matplotlib.pyplot as plt 24 | 25 | lambdas = kpca.lambdas_ 26 | explained_variance_ratio = lambdas/np.sum(lambdas) 27 | 28 | explained_variance = 100*explained_variance_ratio # in percentage 29 | cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained 30 | 31 | plt.figure() 32 | plt.plot(cum_explained_variance, 'r+', label = 'cumulative % variance explained') 33 | plt.plot(explained_variance, 'b+' , label = '% variance explained by each PC') 34 | plt.ylabel('Explained variance (in %)') 35 | plt.xlabel('Principal component number') 36 | plt.legend() 37 | plt.show() 38 | 39 | #%% decide # of PCs to retain and compute reduced data in PC space 40 | n_comp = np.argmax(cum_explained_variance >= 90) + 1 41 | score_reduced = score[:,0:n_comp] 42 | 43 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 
-------------------------------------------------------------------------------- /Chapter_LatentVariable1/proc1a.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_LatentVariable1/proc1a.xls -------------------------------------------------------------------------------- /Chapter_LatentVariable1/softSensor_PLS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS-based Soft Sensor 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | 9 | #%% fetch data 10 | data = pd.read_csv('kamyr-digester.csv', usecols = range(1,23)) 11 | 12 | #%% pre-process 13 | # find the # of nan entries in each column 14 | na_counts = data.isna().sum(axis = 0) 15 | 16 | # remove columns that have a lot of nan entries 17 | data_cleaned = data.drop(columns = ['AAWhiteSt-4 ','SulphidityL-4 ']) 18 | 19 | # remove any row that have any nan entry 20 | data_cleaned = data_cleaned.dropna(axis = 0) 21 | 22 | # separate X, y 23 | y = data_cleaned.iloc[:,0].values[:,np.newaxis] # StandardScaler requires 2D array 24 | X = data_cleaned.iloc[:,1:].values 25 | 26 | print('Number of samples left: ', X.shape[0]) 27 | 28 | #%% separate train and test data 29 | from sklearn.model_selection import train_test_split 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 31 | 32 | #%% scale data 33 | from sklearn.preprocessing import StandardScaler 34 | 35 | X_scaler = StandardScaler() 36 | X_train_normal = X_scaler.fit_transform(X_train) 37 | X_test_normal = X_scaler.transform(X_test) 38 | 39 | y_scaler = StandardScaler() 40 | y_train_normal = y_scaler.fit_transform(y_train) 41 | y_test_normal = y_scaler.transform(y_test) 42 | 43 | #%% Finding # latents using kFold cross validation 44 | from sklearn.model_selection import KFold 45 | from sklearn.metrics import mean_squared_error 46 | from sklearn.cross_decomposition import PLSRegression 47 | import matplotlib.pyplot as plt 48 | 49 | scaler = StandardScaler() 50 | 51 | fit_MSE = [] 52 | validate_MSE = [] 53 | for n_comp in range(1,20): 54 | local_fit_MSE = [] 55 | local_validate_MSE = [] 56 | 57 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 58 | for fit_index, validate_index in kfold.split(y_train): 59 | X_fit_normal = scaler.fit_transform(X_train[fit_index]) 60 | X_validate_normal = scaler.transform(X_train[validate_index]) 61 | 62 | y_fit_normal = scaler.fit_transform(y_train[fit_index]) 63 | y_validate_normal = scaler.transform(y_train[validate_index]) 64 | 65 | pls = PLSRegression(n_components = n_comp) 66 | pls.fit(X_fit_normal, y_fit_normal) 67 | 68 | local_fit_MSE.append(mean_squared_error(y_fit_normal, pls.predict(X_fit_normal))) 69 | local_validate_MSE.append(mean_squared_error(y_validate_normal, 70 | pls.predict(X_validate_normal))) 71 | 72 | fit_MSE.append(np.mean(local_fit_MSE)) 73 | validate_MSE.append(np.mean(local_validate_MSE)) 74 | 75 | 76 | # plot 77 | plt.figure() 78 | plt.plot(range(1,20), fit_MSE, 'b*', label = 'Training MSE') 79 | plt.plot(range(1,20), validate_MSE, 'r*', label = 'Validation MSE') 80 | plt.xticks(range(1,20)) 81 | plt.ylabel('Mean Squared Error (MSE)') 82 | plt.xlabel('# of latents') 83 | 
plt.legend() 84 | 85 | #%% build PLS model 86 | pls = PLSRegression(n_components = 9) 87 | pls.fit(X_train_normal, y_train_normal) 88 | 89 | #%% check training vs test accuracy 90 | y_train_normal_predict = pls.predict(X_train_normal) 91 | y_test_normal_predict = pls.predict(X_test_normal) 92 | 93 | print('Accuracy over training data: ', pls.score(X_train_normal, y_train_normal)) 94 | print('Accuracy over test data: ', pls.score(X_test_normal, y_test_normal)) 95 | 96 | #%% plots of raw and predicted data 97 | y_train_predict = y_scaler.inverse_transform(y_train_normal_predict) 98 | y_test_predict = y_scaler.inverse_transform(y_test_normal_predict) 99 | 100 | 101 | plt.figure() 102 | plt.plot(y_train, 'b', label = 'Raw data') 103 | plt.plot(y_train_predict, 'r', label = 'PLS prediction') 104 | plt.ylabel('Kappa number (training data)') 105 | plt.xlabel('Sample #') 106 | plt.legend() 107 | 108 | 109 | plt.figure() 110 | plt.plot(y_test, 'b', label = 'Raw data') 111 | plt.plot(y_test_predict, 'r', label = 'PLS prediction') 112 | plt.ylabel('Kappa number (test data)') 113 | plt.xlabel('Sample #') 114 | plt.legend() -------------------------------------------------------------------------------- /Chapter_LatentVariable2/DimensionalityReduction_ICA.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## ICA model for TEP data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% fetch TE data 10 | TEdata_noFault_train = np.loadtxt('d00.dat').T # data arrangement in d00.dat is different than that in other files 11 | 12 | # select variables as done in Lee et al. 
13 | xmeas = TEdata_noFault_train[:,0:22] 14 | xmv = TEdata_noFault_train[:,41:52] 15 | data_noFault_train = np.hstack((xmeas, xmv)) 16 | 17 | #%% scale data 18 | from sklearn.preprocessing import StandardScaler 19 | scaler = StandardScaler() 20 | data_train_normal = scaler.fit_transform(data_noFault_train) 21 | 22 | #%% fit ICA model 23 | from sklearn.decomposition import FastICA 24 | ica = FastICA(max_iter=1000, tol=0.005, random_state=1).fit(data_train_normal) 25 | W = ica.components_ 26 | 27 | #%% confirm L2 norm of all IC scores is 1 28 | S = ica.transform(data_train_normal) 29 | S_L2_norms = np.linalg.norm(S, 2, axis = 0) 30 | 31 | #%% sort the ICs in importance order using L2 norm of each row 32 | L2_norm = np.linalg.norm(W, 2, axis=1) 33 | sort_order = np.flip(np.argsort(L2_norm)) # descending order 34 | L2_norm_sorted_pct = 100*L2_norm[sort_order]/np.sum(L2_norm) 35 | 36 | plt.figure() 37 | plt.plot(L2_norm, 'b') 38 | plt.xlabel('IC number (unsorted)') 39 | plt.ylabel('L2 norm') 40 | 41 | plt.figure() 42 | plt.plot(L2_norm_sorted_pct, 'b+') 43 | plt.xlabel('IC number (sorted)') 44 | plt.ylabel('% L2 norm') 45 | 46 | W_sorted = W[sort_order,:] # row 1 now corresponds to the most important IC and so on 47 | 48 | #%% decide # of ICs to retain via PCA variance method and compute ICs 49 | from sklearn.decomposition import PCA 50 | pca = PCA().fit(data_train_normal) 51 | 52 | explained_variance = 100*pca.explained_variance_ratio_ # in percentage 53 | cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained 54 | 55 | n_comp = np.argmax(cum_explained_variance >= 90) + 1 56 | 57 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 58 | 59 | #%% compute ICs with reduced dimension 60 | Wd = W_sorted[0:n_comp,:] 61 | Sd = np.dot(Wd, data_train_normal.T) # row 1 contains scores of the most important IC 62 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/FDA_illustration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Illustration example for FDA/LDA 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 9 | from sklearn.preprocessing import StandardScaler 10 | import matplotlib.pyplot as plt 11 | 12 | #%% generate data 13 | x1_class1 = np.random.uniform(1, 6, 100) 14 | x2_class1 = x1_class1 + 1 + np.random.normal(0,0.5,100) 15 | X_class1 = np.column_stack((x1_class1, x2_class1)) 16 | 17 | 18 | x1_class2 = np.random.uniform(2, 7, 100) 19 | x2_class2 = x1_class2 - 1 + np.random.normal(0,0.5,100) 20 | X_class2 = np.column_stack((x1_class2, x2_class2)) 21 | 22 | plt.figure() 23 | plt.plot(x1_class1, x2_class1, 'b.', label='Class 1') 24 | plt.plot(x1_class2, x2_class2, 'r.', label='Class 2') 25 | plt.xlabel('x1') 26 | plt.ylabel('x2') 27 | plt.legend() 28 | plt.show() 29 | 30 | X = np.vstack((X_class1, X_class2)) 31 | y = np.concatenate((np.ones(100,), 2*np.ones(100,))) 32 | 33 | #%% scale data 34 | scalar = StandardScaler() 35 | X_normal = scalar.fit_transform(X) 36 | 37 | #%% extract latent variables via PCA 38 | pca = PCA(n_components=1) 39 | score_pca = pca.fit_transform(X_normal) 40 | 41 | plt.figure() 42 | plt.plot(score_pca[0:100], np.zeros((100,)), 
'b.') 43 | plt.plot(score_pca[100:], np.zeros((100,)), 'r.') 44 | plt.ylim((-2,100)) 45 | plt.xlabel('PCA score') 46 | plt.ylabel('sample #') 47 | 48 | #%% extract latent variables via LDA 49 | lda = LinearDiscriminantAnalysis(n_components=1) 50 | score_lda = lda.fit_transform(X_normal, y) 51 | 52 | plt.figure() 53 | plt.plot(score_lda[0:100], np.zeros((100,)), 'b.') 54 | plt.plot(score_lda[100:], np.zeros((100,)), 'r.') 55 | plt.ylim((-2,100)) 56 | plt.xlabel('LDA score') 57 | plt.ylabel('sample #') -------------------------------------------------------------------------------- /Chapter_LatentVariable2/FaultClassification_FDA.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Fault classification via FDA 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% fetch TEP data for faults 5,10,19 10 | TEdata_Fault5_train = np.loadtxt('d05.dat') 11 | TEdata_Fault10_train = np.loadtxt('d10.dat') 12 | TEdata_Fault19_train = np.loadtxt('d19.dat') 13 | TEdata_Faulty_train = np.vstack((TEdata_Fault5_train, TEdata_Fault10_train, TEdata_Fault19_train)) 14 | 15 | # select variables as done in Lee et al. 16 | xmeas = TEdata_Faulty_train[:,0:22] 17 | xmv = TEdata_Faulty_train[:,41:52] 18 | data_Faulty_train = np.hstack((xmeas, xmv)) 19 | 20 | # generate sample labels 21 | n_rows_train = TEdata_Fault5_train.shape[0] 22 | y_train = np.concatenate((5*np.ones(n_rows_train,), 10*np.ones(n_rows_train,), 19*np.ones(n_rows_train,))) 23 | 24 | #%% scale data 25 | from sklearn.preprocessing import StandardScaler 26 | scaler = StandardScaler() 27 | Faultydata_train_scaled = scaler.fit_transform(data_Faulty_train) 28 | 29 | #%% visualize all scaled variables 30 | plt.figure() 31 | plt.plot(Faultydata_train_scaled) 32 | plt.show() 33 | 34 | #%% fit LDA model 35 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 36 | lda = LinearDiscriminantAnalysis() 37 | scores_train_lda = lda.fit_transform(Faultydata_train_scaled, y_train) 38 | 39 | #%% visualize LDA scores 40 | plt.figure() 41 | plt.plot(scores_train_lda[0:n_rows_train,0], scores_train_lda[0:n_rows_train,1], 'b.', label='Fault 5') 42 | plt.plot(scores_train_lda[n_rows_train:2*n_rows_train,0], scores_train_lda[n_rows_train:2*n_rows_train,1], 'r.', label='Fault 10') 43 | plt.plot(scores_train_lda[2*n_rows_train:3*n_rows_train,0], scores_train_lda[2*n_rows_train:3*n_rows_train,1], 'm.', label='Fault 19') 44 | plt.legend() 45 | plt.xlabel('FD1 (training data)') 46 | plt.ylabel('FD2 (training data)') 47 | 48 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 49 | ## Control limit determination for fault5 class 50 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 51 | import scipy.stats 52 | Nj = n_rows_train 53 | k = 2 54 | 55 | alpha = 0.01# 99% control limit 56 | T2_CL = k*(Nj**2-1)*scipy.stats.f.ppf(1-alpha,k,Nj-k)/(Nj*(Nj-k)) 57 | 58 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | ## Fault classification with fault 5 test data 60 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 61 | # mean and covariance for Fault 5 class 62 | scores_train_lda_Fault5 = scores_train_lda[0:n_rows_train,:] 63 | cov_scores_train_Fault5 = np.cov(scores_train_lda_Fault5.T) 64 | 
mean_scores_train_Fault5 = np.mean(scores_train_lda_Fault5, axis = 0) 65 | 66 | #%% fetch TE test dta for fault 5 67 | TEdata_Fault5_test = np.loadtxt('d05_te.dat') 68 | TEdata_Fault5_test = TEdata_Fault5_test[160:,:] 69 | n_rows_test = TEdata_Fault5_test.shape[0] 70 | 71 | # select variables as done in Lee et al. 72 | xmeas = TEdata_Fault5_test[:,0:22] 73 | xmv = TEdata_Fault5_test[:,41:52] 74 | data_Faulty_test = np.hstack((xmeas, xmv)) 75 | 76 | #%% scale data and transform 77 | Faultydata_test_scaled = scaler.transform(data_Faulty_test) 78 | scores_test_lda = lda.transform(Faultydata_test_scaled) 79 | 80 | #%% compute T2 statistic for test data for Fault 5 class 81 | T2_test = np.zeros((n_rows_test,)) 82 | for sample in range(n_rows_test): 83 | score_sample = scores_test_lda[sample,:] 84 | score_sample_centered = score_sample - mean_scores_train_Fault5 85 | T2_test[sample] = np.dot(np.dot(score_sample_centered[np.newaxis,:],np.linalg.inv(cov_scores_train_Fault5)),score_sample_centered[np.newaxis,:].T) 86 | 87 | #%% plot test prediction 88 | outsideCL_flag = T2_test > T2_CL 89 | insideCL_flag = T2_test <= T2_CL 90 | plt.figure() 91 | plt.plot(scores_test_lda[outsideCL_flag,0], scores_test_lda[outsideCL_flag,1], 'k.', label='outside Fault 5 boundary') 92 | plt.plot(scores_test_lda[insideCL_flag,0], scores_test_lda[insideCL_flag,1], 'b.', label='inside Fault 5 boundary') 93 | plt.xlabel('FD1 (test data)') 94 | plt.ylabel('FD2 (test data)') 95 | plt.legend() 96 | 97 | print('Percentage of samples correctly diagnosed as Fault 5: ', 100*np.sum(T2_test < T2_CL)/n_rows_test) 98 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/ICA_illustration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Illustration example for ICA vs PCA 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.decomposition import FastICA 9 | import matplotlib.pyplot as plt 10 | 11 | #%% generate independent data 12 | s1 = 2*np.sin(2*np.pi*8*np.arange(500)/500) 13 | s2 = np.random.uniform(-2, 2, 500) 14 | 15 | plt.figure() 16 | plt.plot(s1) 17 | plt.xlabel('sample #') 18 | plt.ylabel('s1') 19 | 20 | plt.figure() 21 | plt.plot(s2) 22 | plt.xlabel('sample #') 23 | plt.ylabel('s2') 24 | 25 | plt.figure() 26 | plt.scatter(s1, s2) 27 | plt.xlabel('s1') 28 | plt.ylabel('s2') 29 | 30 | #%% generate transformed observed data 31 | x1 = (2/3)*s1 + s2 32 | x2 = (2/3)*s1 + (1/3)*s2 33 | 34 | X = np.column_stack((x1,x2)) 35 | 36 | plt.figure() 37 | plt.plot(x1) 38 | plt.xlabel('sample #') 39 | plt.ylabel('x1') 40 | 41 | plt.figure() 42 | plt.plot(x2) 43 | plt.xlabel('sample #') 44 | plt.ylabel('x2') 45 | 46 | plt.figure() 47 | plt.scatter(x1, x2) 48 | plt.xlabel('x1') 49 | plt.ylabel('x2') 50 | 51 | #%% extract latent variables via PCA 52 | pca = PCA() 53 | T = pca.fit_transform(X) 54 | 55 | plt.figure() 56 | plt.plot(T[:,0]) 57 | plt.xlabel('sample #') 58 | plt.ylabel('t1') 59 | 60 | plt.figure() 61 | plt.plot(T[:,1]) 62 | plt.xlabel('sample #') 63 | plt.ylabel('t2') 64 | 65 | plt.figure() 66 | plt.scatter(T[:,0], T[:,1]) 67 | plt.xlabel('t1') 68 | plt.ylabel('t2') 69 | 70 | #%% extract latent variables via ICA 71 | ica = FastICA() 72 | U = ica.fit_transform(X) 73 | 74 | plt.figure() 75 | plt.plot(U[:,0]) 76 
| plt.xlabel('sample #') 77 | plt.ylabel('u1') 78 | 79 | plt.figure() 80 | plt.plot(U[:,1]) 81 | plt.xlabel('sample #') 82 | plt.ylabel('u2') 83 | 84 | plt.figure() 85 | plt.scatter(U[:,0], U[:,1]) 86 | plt.xlabel('u1') 87 | plt.ylabel('u2') 88 | 89 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/TE_processData_explore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## TE data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% fetch TE data 10 | TEdata_noFault_train = np.loadtxt('d00.dat').T # data arrangement in d00.dat is different from that in other files 11 | TEdata_Fault_train = np.loadtxt('d10.dat') 12 | 13 | #%% quick visualize 14 | plt.figure() 15 | plt.plot(TEdata_noFault_train[:,17]) 16 | plt.xlabel('sample #') 17 | plt.ylabel('Stripper Temperature') 18 | plt.title('Normal operation') 19 | 20 | plt.figure() 21 | plt.plot(TEdata_Fault_train[:,17]) 22 | plt.xlabel('sample #') 23 | plt.ylabel('Stripper Temperature') 24 | plt.title('Faulty operation') 25 | 26 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 27 | ## Visualize normal and faulty data in PC space 28 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | 30 | #%% scale data 31 | from sklearn.preprocessing import StandardScaler 32 | scaler = StandardScaler() 33 | TEdata_noFault_scaled = scaler.fit_transform(TEdata_noFault_train) 34 | TEdata_Fault_scaled = scaler.transform(TEdata_Fault_train) 35 | 36 | #%% build PCA model and compute PC scores 37 | from sklearn.decomposition import PCA 38 | pca = PCA(n_components = 3).fit(TEdata_noFault_scaled) 39 | TEdata_noFault_scores = pca.transform(TEdata_noFault_scaled) 40 | TEdata_Fault_scores = pca.transform(TEdata_Fault_scaled) 41 | 42 | #%% visualize in 3D plot 43 | from mpl_toolkits.mplot3d import Axes3D 44 | fig = plt.figure() 45 | ax = Axes3D(fig) 46 | ax.scatter(TEdata_noFault_scores[:,0],TEdata_noFault_scores[:,1],TEdata_noFault_scores[:,2], c='blue', alpha=0.1, label='Normal operation') 47 | ax.scatter(TEdata_Fault_scores[:,0],TEdata_Fault_scores[:,1],TEdata_Fault_scores[:,2], c='red', marker = '*', label='Faulty operation') 48 | ax.set_xlabel('PC1 scores') 49 | ax.set_ylabel('PC2 scores') 50 | ax.set_zlabel('PC3 scores') 51 | ax.legend() 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_OtherUsefulMethods/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/EmbeddedMethods_Lasso.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing embedded method (Lasso) on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv',
delimiter=',') 8 | 9 | #%% separate X and y 10 | y = VSdata[:,0] 11 | X = VSdata[:,1:] 12 | 13 | #%% scale data 14 | from sklearn.preprocessing import StandardScaler 15 | xscaler = StandardScaler() 16 | X_scaled = xscaler.fit_transform(X) 17 | 18 | yscaler = StandardScaler() 19 | y_scaled = yscaler.fit_transform(y[:,None]) 20 | 21 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 22 | ## Lasso-based variable selection 23 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | 25 | #%% fit Lasso model 26 | from sklearn.linear_model import LassoCV 27 | Lasso_model = LassoCV(cv=5).fit(X_scaled, y_scaled.ravel()) # ravel() provides the 1D target array expected by LassoCV 28 | 29 | #%% find the relevant inputs using model coefficients 30 | top_k_inputs = np.argsort(abs(Lasso_model.coef_))[::-1][:10] + 1 31 | print('Relevant inputs: ', top_k_inputs) 32 | 33 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Embedded_Method_Lasso.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Embedded Method: Lasso" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "VSdata = np.loadtxt('VSdata.csv', delimiter=',')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# separate X and y\n", 30 | "y = VSdata[:,0]\n", 31 | "X = VSdata[:,1:]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# scale data\n", 41 | "from sklearn.preprocessing import StandardScaler\n", 42 | "xscaler = StandardScaler()\n", 43 | "X_scaled = xscaler.fit_transform(X)\n", 44 | "\n", 45 | "yscaler = StandardScaler()\n", 46 | "y_scaled = yscaler.fit_transform(y[:,None])" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# fit Lasso model \n", 56 | "from sklearn.linear_model import LassoCV\n", 57 | "Lasso_model = LassoCV(cv=5).fit(X_scaled, y_scaled.ravel())" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Relevant inputs: [21 22 20 23 24 19 25 18 33 14]\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "# find the relevant inputs using model coefficients\n", 75 | "top_k_inputs = np.argsort(abs(Lasso_model.coef_))[::-1][:10] + 1\n", 76 | "print('Relevant inputs: ', top_k_inputs)" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.9.7" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/MLR_VSdata.py:
-------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing MLR on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv', delimiter=',') 8 | VSdata_val = np.loadtxt('VSdata_val.csv', delimiter=',') 9 | 10 | #%% separate X and y 11 | y_train = VSdata[:,0] 12 | X_train = VSdata[:,1:] 13 | 14 | y_val = VSdata_val[:,0] 15 | X_val = VSdata_val[:,1:] 16 | 17 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | ## MLR using all variables 19 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 20 | 21 | #%% fit model on training data 22 | from sklearn.linear_model import LinearRegression 23 | from sklearn.preprocessing import StandardScaler 24 | from matplotlib import pyplot as plt 25 | 26 | # scale X 27 | scaler_all = StandardScaler() 28 | X_train_scaled = scaler_all.fit_transform(X_train) 29 | X_val_scaled = scaler_all.transform(X_val) 30 | 31 | # fit 32 | MLR_all = LinearRegression().fit(X_train_scaled, y_train) 33 | 34 | # predict 35 | y_val_pred = MLR_all.predict(X_val_scaled) 36 | 37 | # score 38 | R2_all_train = MLR_all.score(X_train_scaled, y_train) 39 | R2_all = MLR_all.score(X_val_scaled, y_val) 40 | 41 | # plot raw vs predicted target 42 | plt.figure() 43 | plt.plot(y_val, y_val_pred, '.') 44 | plt.title('Using all variables') 45 | 46 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 47 | ## MLR using only 10 relevant variables 48 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 49 | # select only relevant inputs 50 | k = 10 51 | X_train_rel = X_train[:,16:16+k] 52 | X_val_rel = X_val[:,16:16+k] 53 | 54 | # scale X 55 | scaler_rel = StandardScaler() 56 | X_train_rel_scaled = scaler_rel.fit_transform(X_train_rel) 57 | X_val_rel_scaled = scaler_rel.transform(X_val_rel) 58 | 59 | # fit 60 | MLR_rel = LinearRegression().fit(X_train_rel_scaled, y_train) 61 | 62 | # predict 63 | y_val_rel_pred = MLR_rel.predict(X_val_rel_scaled) 64 | 65 | # score 66 | R2_rel_train = MLR_rel.score(X_train_rel_scaled, y_train) 67 | R2_rel = MLR_rel.score(X_val_rel_scaled, y_val) 68 | 69 | # plot raw vs predicted target 70 | plt.figure() 71 | plt.plot(y_val, y_val_rel_pred, '.') 72 | plt.title('Using relevant variables') 73 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Missing_data_imputation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Data imputation" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "[[1. 2. 5.]\n", 22 | " [3. 4. 3.]\n", 23 | " [4. 6. 5.]\n", 24 | " [8. 8. 
7.]]\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# Mean imputation\n", 30 | "import numpy as np\n", 31 | "from sklearn.impute import SimpleImputer\n", 32 | "\n", 33 | "sample_data = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]\n", 34 | "mean_imputeModel = SimpleImputer(missing_values=np.nan, strategy='mean')\n", 35 | "\n", 36 | "print(mean_imputeModel.fit_transform(sample_data))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "[[1. 2. 4. ]\n", 49 | " [3. 4. 3. ]\n", 50 | " [5.5 6. 5. ]\n", 51 | " [8. 8. 7. ]]\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "# KNN imputation\n", 57 | "from sklearn.impute import KNNImputer\n", 58 | "\n", 59 | "knn_imputeModel = KNNImputer(n_neighbors=2)\n", 60 | "print(knn_imputeModel.fit_transform(sample_data))" 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3 (ipykernel)", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.9.7" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Missing_data_imputation.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## data imputation 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% Mean imputation 6 | import numpy as np 7 | from sklearn.impute import SimpleImputer 8 | 9 | sample_data = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]] 10 | mean_imputeModel = SimpleImputer(missing_values=np.nan, strategy='mean') 11 | 12 | print(mean_imputeModel.fit_transform(sample_data)) 13 | 14 | #%% KNN imputation 15 | from sklearn.impute import KNNImputer 16 | 17 | knn_imputeModel = KNNImputer(n_neighbors=2) 18 | print(knn_imputeModel.fit_transform(sample_data)) 19 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Multivariate_outliers_MCD.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Multivariate outlier detection via MCD-based Mahalanobis distances 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | data_2Doutlier = np.loadtxt('complex2D_outlier.csv', delimiter=',') 8 | 9 | # plot 10 | import matplotlib.pyplot as plt 11 | plt.plot(data_2Doutlier[:-30,0], data_2Doutlier[:-30,1], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 12 | plt.plot(data_2Doutlier[-30:,0], data_2Doutlier[-30:,1], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 13 | plt.xlabel('x1'), plt.ylabel('x2') 14 | plt.title('Raw measurements') 15 | 16 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | ## Non-robust Mahalanobis distances 18 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 19 | 20 | #%% compute Mahalanobis distances 21 | from 
sklearn.covariance import EmpiricalCovariance 22 | 23 | emp_cov = EmpiricalCovariance().fit(data_2Doutlier) 24 | MD_emp_cov = emp_cov.mahalanobis(data_2Doutlier) 25 | 26 | #%% transform Mahalanobis distances into normal distribution via cubic-root 27 | MD_emp_cov_cubeRoot = np.power(MD_emp_cov, 0.333) 28 | 29 | #%% find hampel identifier bounds 30 | from scipy import stats 31 | 32 | median = np.median(MD_emp_cov_cubeRoot) 33 | sigma_MAD = stats.median_absolute_deviation(MD_emp_cov_cubeRoot) 34 | 35 | upperBound_MD_emp_cov = np.power(median+3*sigma_MAD, 3) 36 | lowerBound_MD_emp_cov = np.power(median-3*sigma_MAD, 3) 37 | 38 | #%% plot Mahalanobis distances with bounds 39 | plt.figure() 40 | plt.plot(MD_emp_cov[:-30], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 41 | plt.plot(np.arange(300,330), MD_emp_cov[-30:], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 42 | 43 | plt.hlines(upperBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashdot', label='Upper bound') 44 | plt.hlines(lowerBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashed', label='Lower bound') 45 | 46 | plt.xlabel('sample #'), plt.ylabel('Mahalanobis distance') 47 | plt.title('Mahalanobis distances of raw measurements') 48 | plt.legend(loc='upper left') 49 | 50 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 51 | ## MCD-based robust Mahalanobis distances 52 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 53 | from sklearn.covariance import MinCovDet 54 | 55 | MCD_cov = MinCovDet().fit(data_2Doutlier) 56 | MD_MCD = MCD_cov.mahalanobis(data_2Doutlier) 57 | 58 | #%% transform Mahalanobis distances into normal distribution via cubic-root 59 | MD_MCD_cubeRoot = np.power(MD_MCD, 0.333) 60 | 61 | #%% find hampel identifier bounds 62 | from scipy import stats 63 | 64 | median = np.median(MD_MCD_cubeRoot) 65 | sigma_MAD = stats.median_absolute_deviation(MD_MCD_cubeRoot) 66 | 67 | upperBound_MD_emp_cov = np.power(median+3*sigma_MAD, 3) 68 | lowerBound_MD_emp_cov = np.power(median-3*sigma_MAD, 3) 69 | 70 | #%% plot Mahalanobis distances with bounds 71 | plt.figure() 72 | plt.plot(MD_MCD[:-30], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 73 | plt.plot(np.arange(300,330), MD_MCD[-30:], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 74 | 75 | plt.hlines(upperBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashdot', label='Upper bound') 76 | plt.hlines(lowerBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashed', label='Lower bound') 77 | 78 | plt.xlabel('sample #'), plt.ylabel('Mahalanobis distance') 79 | plt.title('MCD_based Mahalanobis distances of raw measurements') 80 | plt.legend(loc='upper left') 81 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Multivariate_outliers_Mahalanobis_distance.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Multivariate outlier detection via Mahalanobis distances 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | data_2Doutlier = np.loadtxt('simple2D_outlier.csv', delimiter=',') 8 | 9 | # plot 10 | import matplotlib.pyplot as plt 11 | plt.plot(data_2Doutlier[:-5,0], data_2Doutlier[:-5,1], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 12 | plt.plot(data_2Doutlier[-5:,0], data_2Doutlier[-5:,1], '.r', markeredgecolor='k', 
markeredgewidth=0.5, ms=11) 13 | plt.xlabel('x1'), plt.ylabel('x2') 14 | plt.title('Raw measurements') 15 | 16 | #%% compute Mahalanobis distances 17 | from sklearn.covariance import EmpiricalCovariance 18 | 19 | emp_cov = EmpiricalCovariance().fit(data_2Doutlier) 20 | MD_emp_cov = emp_cov.mahalanobis(data_2Doutlier) 21 | 22 | #%% transform Mahalanobis distances into normal distribution via cubic-root 23 | MD_cubeRoot = np.power(MD_emp_cov, 0.333) 24 | 25 | #%% find Hampel identifier bounds 26 | from scipy import stats 27 | 28 | median = np.median(MD_cubeRoot) 29 | sigma_MAD = stats.median_absolute_deviation(MD_cubeRoot) 30 | 31 | upperBound_MD_emp_cov = np.power(median+3*sigma_MAD, 3) 32 | lowerBound_MD_emp_cov = np.power(median-3*sigma_MAD, 3) 33 | 34 | #%% plot Mahalanobis distances with bounds (last 5 samples are the outliers) 35 | plt.figure(), plt.plot(MD_emp_cov[:-5], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 36 | plt.plot(np.arange(300,305), MD_emp_cov[-5:], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 37 | 38 | plt.hlines(upperBound_MD_emp_cov, 0, 305, colors='r', linestyles='dashdot', label='Upper bound') 39 | plt.hlines(lowerBound_MD_emp_cov, 0, 305, colors='r', linestyles='dashed', label='Lower bound') 40 | 41 | plt.xlabel('sample #'), plt.ylabel('Mahalanobis distance') 42 | plt.title('Mahalanobis distances of raw measurements') 43 | plt.legend(loc='upper left') -------------------------------------------------------------------------------- /Chapter_Preprocessing/Univariate_Outliers.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Univariate outlier detection 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% Generate outlier-infested data 6 | import numpy as np 7 | 8 | X = np.random.normal(40, 1, (1500,1)) 9 | X[200:300] = X[200:300] +8; X[1000:1150] = X[1000:1150] + 8 10 | 11 | # plot 12 | import matplotlib.pyplot as plt 13 | plt.plot(X, '.-') 14 | plt.xlabel('sample #'), plt.ylabel('variable measurement') 15 | plt.title('Raw measurements') 16 | 17 | #%% 3-sigma rule 18 | # location & spread 19 | mu = np.mean(X) 20 | sigma = np.std(X) 21 | 22 | # mean and std 23 | print('Estimated mean = ', mu) 24 | print('Estimated standard deviation = ', sigma) 25 | 26 | # plot 27 | plt.figure() 28 | plt.plot(X, '.-', alpha=0.8, markeredgecolor='k', markeredgewidth=0.1, ms=3) 29 | plt.hlines(mu, 0, 1500, colors='m', linestyles='dashdot', label='Mean') 30 | plt.hlines(mu+3*sigma, 0, 1500, colors='r', linestyles='dashdot', label='Upper bound') 31 | plt.hlines(mu-3*sigma, 0, 1500, colors='r', linestyles='dashed', label='Lower bound') 32 | 33 | plt.xlabel('sample #'), plt.ylabel('Variable measurement') 34 | plt.xlim((0,1500)) 35 | plt.title('3-sigma bounds') 36 | plt.legend(loc='upper right') 37 | 38 | #%% Hampel identifier 39 | # compute median and MAD 40 | from scipy import stats 41 | 42 | median = np.median(X) 43 | sigma_MAD = stats.median_absolute_deviation(X) # default scaling of 1.4826 is built-in 44 | 45 | # median & sigma_MAD 46 | print('Estimated robust location = ', median) 47 | print('Estimated robust spread = ', sigma_MAD) 48 | 49 | # plot 50 | plt.figure() 51 | plt.plot(X, '.-', alpha=0.8, markeredgecolor='k', markeredgewidth=0.1, ms=3) 52 | plt.hlines(median, 0, 1500, colors='m', linestyles='dashdot', label='Median') 53 | plt.hlines(median+3*sigma_MAD, 0, 1500, colors='r',
linestyles='dashdot', label='Upper bound') 54 | plt.hlines(median-3*sigma_MAD, 0, 1500, colors='r', linestyles='dashed', label='Lower bound') 55 | 56 | plt.xlabel('sample #'), plt.ylabel('Variable measurement') 57 | plt.xlim((0,1500)) 58 | plt.title('Hampel identifier bounds') 59 | plt.legend(loc='upper right') -------------------------------------------------------------------------------- /Chapter_Preprocessing/WrapperMethods_backward_SFS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing backward SFS on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv', delimiter=',') 8 | 9 | #%% separate X and y 10 | y = VSdata[:,0] 11 | X = VSdata[:,1:] 12 | 13 | #%% scale data 14 | from sklearn.preprocessing import StandardScaler 15 | xscaler = StandardScaler() 16 | X_scaled = xscaler.fit_transform(X) 17 | 18 | yscaler = StandardScaler() 19 | y_scaled = yscaler.fit_transform(y[:,None]) 20 | 21 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 22 | ## SFS-based variable selection 23 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | from sklearn.feature_selection import SequentialFeatureSelector 25 | from sklearn.linear_model import LinearRegression 26 | 27 | BSFS = SequentialFeatureSelector(LinearRegression(), n_features_to_select=10, direction='backward', cv=5).fit(X_scaled, y_scaled) 28 | 29 | #%% check selected inputs 30 | print('Inputs selected: ', BSFS.get_support(indices=True)+1) # returns integer index of the features selected 31 | 32 | #%% reduce X to only top relevant inputs 33 | X_relevant = BSFS.transform(X) -------------------------------------------------------------------------------- /Chapter_Preprocessing/Wrapper_Methods_backward_SFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Wrapper Method: Backward SFS" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "VSdata = np.loadtxt('VSdata.csv', delimiter=',')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# separate X and y\n", 30 | "y = VSdata[:,0]\n", 31 | "X = VSdata[:,1:]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# scale data\n", 41 | "from sklearn.preprocessing import StandardScaler\n", 42 | "xscaler = StandardScaler()\n", 43 | "X_scaled = xscaler.fit_transform(X)\n", 44 | "\n", 45 | "yscaler = StandardScaler()\n", 46 | "y_scaled = yscaler.fit_transform(y[:,None])" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 7, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# SFS-based variable selection\n", 56 | "from sklearn.feature_selection import SequentialFeatureSelector\n", 57 | "from sklearn.linear_model import LinearRegression\n", 58 | "\n", 59 | "BSFS = SequentialFeatureSelector(LinearRegression(), n_features_to_select=10, 
direction='backward', cv=5).fit(X_scaled, y_scaled)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Inputs selected: [18 19 20 21 22 23 24 25 31 33]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "# check selected inputs\n", 77 | "print('Inputs selected: ', BSFS.get_support(indices=True)+1) # returns integer index of the features selected" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 9, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# reduce X to only top relevant inputs\n", 87 | "X_relevant = BSFS.transform(X)" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3 (ipykernel)", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.9.7" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/deNoising_process_signals.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## De-noising Process Signals 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | noisy_signal = np.loadtxt('noisy_flow_signal.csv', delimiter=',') 8 | 9 | #%% SMA filter 10 | import pandas as pd 11 | 12 | windowSize = 15 13 | smoothed_signal_MA = pd.DataFrame(noisy_signal).rolling(windowSize).mean().values 14 | 15 | #%% SG filter 16 | from scipy.signal import savgol_filter 17 | 18 | smoothed_signal_SG = savgol_filter(noisy_signal, window_length = 15, polyorder = 2) 19 | 20 | #%% plots 21 | from matplotlib import pyplot as plt 22 | 23 | plt.figure(figsize=(11,3)) 24 | plt.plot(noisy_signal, alpha=0.3, label='Noisy signal') 25 | plt.plot(smoothed_signal_MA, color='m', label='SMA smoothed signal') 26 | plt.plot(smoothed_signal_SG, color='orange', label='SG smoothed signal') 27 | plt.xlabel('Sample #'), plt.ylabel('Value') 28 | plt.legend() -------------------------------------------------------------------------------- /Chapter_Preprocessing/filterMethods.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing filter methods on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv', delimiter=',') 8 | 9 | #%% separate X and y 10 | y = VSdata[:,0] 11 | X = VSdata[:,1:] 12 | 13 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 14 | ## Linear correlation-based variable selection 15 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 16 | 17 | # compute linear correlation based scores 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_regression 20 | 21 | VSmodel_Correlation = SelectKBest(f_regression, k=10).fit(X, y) 22 | 
input_scores = VSmodel_Correlation.scores_ 23 | 24 | # find the top ranked inputs 25 | top_k_inputs_Correlation = np.argsort(input_scores)[::-1][:10] + 1 # [::-1] reverses the array returned by argsort() and [:n] gives the first n elements, i.e., the top-ranked inputs 26 | print(top_k_inputs_Correlation) 27 | 28 | # reduce X to only top relevant inputs 29 | X_relevant = VSmodel_Correlation.transform(X) 30 | 31 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 32 | ## MI-based variable selection 33 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 34 | 35 | # compute mutual information (MI) based scores 36 | from sklearn.feature_selection import mutual_info_regression 37 | 38 | VSmodel_MI = SelectKBest(mutual_info_regression, k=10).fit(X, y) 39 | input_scores = VSmodel_MI.scores_ 40 | 41 | # find the top ranked inputs 42 | top_k_inputs_MI = np.argsort(input_scores)[::-1][:10] # [::-1] reverses the array returned by argsort() and [:n] gives the first n elements, i.e., the top-ranked inputs 43 | print(top_k_inputs_MI) 44 | 45 | # reduce X to only top relevant inputs 46 | X_relevant = VSmodel_MI.transform(X) -------------------------------------------------------------------------------- /Chapter_Preprocessing/filter_Methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Filter Methods for Variable Selection" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "VSdata = np.loadtxt('VSdata.csv', delimiter=',')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# separate X and y\n", 30 | "y = VSdata[:,0]\n", 31 | "X = VSdata[:,1:]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n", 41 | "## Linear correlation-based variable selection\n", 42 | "## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 7, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "[22 21 24 20 30 29 32 31 28 27]\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "# compute linear correlation based scores \n", 60 | "from sklearn.feature_selection import SelectKBest\n", 61 | "from sklearn.feature_selection import f_regression\n", 62 | "\n", 63 | "VSmodel_Correlation = SelectKBest(f_regression, k=10).fit(X, y)\n", 64 | "input_scores = VSmodel_Correlation.scores_\n", 65 | "\n", 66 | "# find the top ranked inputs\n", 67 | "top_k_inputs_Correlation = np.argsort(input_scores)[::-1][:10] + 1 # [::-1] reverses the array returned by argsort() and [:n] gives the first n elements, i.e., the top-ranked inputs\n", 68 | "print(top_k_inputs_Correlation)\n", 69 | "\n", 70 | "# reduce X to only top relevant inputs\n", 71 | "X_relevant = VSmodel_Correlation.transform(X)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n", 81 | "## MI-based variable selection\n",
82 | "## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 8, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "[21 8 0 4 5 37 1 30 13 16]\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "# compute linear correlation based scores \n", 100 | "from sklearn.feature_selection import mutual_info_regression\n", 101 | "\n", 102 | "VSmodel_MI = SelectKBest(mutual_info_regression, k=10).fit(X, y)\n", 103 | "input_scores = VSmodel_MI.scores_\n", 104 | "\n", 105 | "# find the top ranked inputs\n", 106 | "top_k_inputs_MI = np.argsort(input_scores)[::-1][:10] # [::-1] reverses the array returned by argsort() and [:n] gives that last n elements\n", 107 | "print(top_k_inputs_MI)\n", 108 | "\n", 109 | "# reduce X to only top relevant inputs\n", 110 | "X_relevant = VSmodel_MI.transform(X)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3 (ipykernel)", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.9.7" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_RNN/AircraftEngine_dataExplore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Exploring aircraft engine data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | #%% read data 11 | # training 12 | train_df = pd.read_csv('PM_train.txt', sep=" ", header=None) 13 | train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True) # last two columns are blank 14 | train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 15 | 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 16 | 's15', 's16', 's17', 's18', 's19', 's20', 's21'] 17 | 18 | # test 19 | test_df = pd.read_csv('PM_test.txt', sep=" ", header=None) 20 | test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True) 21 | test_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 22 | 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 23 | 's15', 's16', 's17', 's18', 's19', 's20', 's21'] 24 | 25 | # actual RUL for each engine-id in the test data 26 | truth_df = pd.read_csv('PM_truth.txt', sep=" ", header=None) 27 | truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True) 28 | 29 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 30 | ## exploratory graphs (training) 31 | ## 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 32 | # get all sensor data for an engine ID 33 | engineID = 1 34 | engineDataAll = train_df.loc[train_df['id'] == engineID] 35 | engineDataSensor = engineDataAll.iloc[:, 5:] 36 | 37 | # normalize 38 | scalar = StandardScaler() 39 | engineDataSensor_scaled = scalar.fit_transform(engineDataSensor.values) 40 | 41 | # plot all sensor data for an engine ID 42 | plt.figure() 43 | plt.plot(engineDataSensor_scaled) 44 | plt.xlabel('Engine cycle') 45 | plt.ylabel('Scaled sensor values') 46 | plt.title('Training sensor Data for engineID ' + str(engineID)) 47 | plt.box(False) 48 | 49 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | ## exploratory graphs (test) 51 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 52 | # get all sensor data for an engine ID 53 | engineID = 90 54 | engineDataAll = test_df.loc[test_df['id'] == engineID] 55 | engineDataSensor = engineDataAll.iloc[:, 5:] 56 | 57 | # normalize 58 | scalar = StandardScaler() 59 | engineDataSensor_scaled = scalar.fit_transform(engineDataSensor.values) 60 | 61 | # plot all sensor data for an engine ID 62 | plt.figure() 63 | plt.plot(engineDataSensor_scaled) 64 | plt.xlabel('Engine cycle') 65 | plt.ylabel('Scaled sensor values') 66 | plt.title('Test sensor Data for engineID ' + str(engineID)) 67 | plt.box(False) 68 | -------------------------------------------------------------------------------- /Chapter_RNN/PM_truth.txt: -------------------------------------------------------------------------------- 1 | 112 2 | 98 3 | 69 4 | 82 5 | 91 6 | 93 7 | 91 8 | 95 9 | 111 10 | 96 11 | 97 12 | 124 13 | 95 14 | 107 15 | 83 16 | 84 17 | 50 18 | 28 19 | 87 20 | 16 21 | 57 22 | 111 23 | 113 24 | 20 25 | 145 26 | 119 27 | 66 28 | 97 29 | 90 30 | 115 31 | 8 32 | 48 33 | 106 34 | 7 35 | 11 36 | 19 37 | 21 38 | 50 39 | 142 40 | 28 41 | 18 42 | 10 43 | 59 44 | 109 45 | 114 46 | 47 47 | 135 48 | 92 49 | 21 50 | 79 51 | 114 52 | 29 53 | 26 54 | 97 55 | 137 56 | 15 57 | 103 58 | 37 59 | 114 60 | 100 61 | 21 62 | 54 63 | 72 64 | 28 65 | 128 66 | 14 67 | 77 68 | 8 69 | 121 70 | 94 71 | 118 72 | 50 73 | 131 74 | 126 75 | 113 76 | 10 77 | 34 78 | 107 79 | 63 80 | 90 81 | 8 82 | 9 83 | 137 84 | 58 85 | 118 86 | 89 87 | 116 88 | 115 89 | 136 90 | 28 91 | 38 92 | 20 93 | 85 94 | 55 95 | 128 96 | 137 97 | 82 98 | 59 99 | 117 100 | 20 101 | -------------------------------------------------------------------------------- /Chapter_RNN/TEP_dataExploration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Exploring TEP data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import pyreadr 7 | fault_free_training_data = pyreadr.read_r('TEP_FaultFree_Training.RData')['fault_free_training'] # pandas dataframe 8 | fault_free_testing_data = pyreadr.read_r('TEP_FaultFree_Testing.RData')['fault_free_testing'] 9 | faulty_training_data = pyreadr.read_r('TEP_Faulty_Training.RData')['faulty_training'] 10 | faulty_testing_data = pyreadr.read_r('TEP_Faulty_Testing.RData')['faulty_testing'] 11 | 12 | #%% remove fault 3,9,15 data from faulty dataset 13 | faulty_training_data = faulty_training_data[faulty_training_data['faultNumber'] != 3] 14 | faulty_training_data = faulty_training_data[faulty_training_data['faultNumber'] != 9] 15 | faulty_training_data = 
faulty_training_data[faulty_training_data['faultNumber'] != 15] 16 | 17 | faulty_testing_data = faulty_testing_data[faulty_testing_data['faultNumber'] != 3] 18 | faulty_testing_data = faulty_testing_data[faulty_testing_data['faultNumber'] != 9] 19 | faulty_testing_data = faulty_testing_data[faulty_testing_data['faultNumber'] != 15] 20 | 21 | #%% sample process values from selected simulation runs 22 | # fault free 23 | faultFree_simulationData = fault_free_training_data[fault_free_training_data['simulationRun'] == 1] 24 | faultFree_simulationData = faultFree_simulationData.iloc[:,3:13] 25 | 26 | # faulty 27 | faulty_simulationData_1 = faulty_training_data[faulty_training_data['simulationRun'] == 1] 28 | faulty_simulationData_1 = faulty_simulationData_1[faulty_simulationData_1['faultNumber'] == 1] 29 | faulty_simulationData_1 = faulty_simulationData_1.iloc[:,3:13] 30 | 31 | faulty_simulationData_2 = faulty_training_data[faulty_training_data['simulationRun'] == 1] 32 | faulty_simulationData_2 = faulty_simulationData_2[faulty_simulationData_2['faultNumber'] == 8] 33 | faulty_simulationData_2 = faulty_simulationData_2.iloc[:,3:13] 34 | 35 | faulty_simulationData_3 = faulty_training_data[faulty_training_data['simulationRun'] == 1] 36 | faulty_simulationData_3 = faulty_simulationData_3[faulty_simulationData_3['faultNumber'] == 12] 37 | faulty_simulationData_3 = faulty_simulationData_3.iloc[:,3:13] 38 | 39 | #%% scale data 40 | from sklearn.preprocessing import StandardScaler 41 | 42 | scaler = StandardScaler() 43 | faultFree_simulationData_scaled = scaler.fit_transform(faultFree_simulationData) 44 | faulty_simulationData_1_scaled = scaler.transform(faulty_simulationData_1) 45 | faulty_simulationData_2_scaled = scaler.transform(faulty_simulationData_2) 46 | faulty_simulationData_3_scaled = scaler.transform(faulty_simulationData_3) 47 | 48 | #%% plots 49 | import matplotlib.pyplot as plt 50 | legendNames = ['signal' + str(i+1) for i in range(10)] 51 | 52 | plt.figure() 53 | plt.plot(faultFree_simulationData_scaled) 54 | plt.xlabel('Time step') 55 | plt.ylabel('Scaled values') 56 | plt.title('Training measurements for non-faulty data') 57 | plt.legend(legendNames, loc='upper left') 58 | 59 | plt.figure() 60 | plt.plot(faulty_simulationData_1_scaled) 61 | plt.xlabel('Time step') 62 | plt.ylabel('Scaled values') 63 | plt.title('Training measurements for fault 1 data') 64 | plt.legend(legendNames, loc='upper left') 65 | 66 | plt.figure() 67 | plt.plot(faulty_simulationData_2_scaled) 68 | plt.xlabel('Time step') 69 | plt.ylabel('Scaled values') 70 | plt.title('Training measurements for fault 8 data') 71 | plt.legend(legendNames, loc='upper left') 72 | 73 | plt.figure() 74 | plt.plot(faulty_simulationData_3_scaled) 75 | plt.xlabel('Time step') 76 | plt.ylabel('Scaled values') 77 | plt.title('Training measurements for fault 12 data') 78 | plt.legend(legendNames, loc='upper left') -------------------------------------------------------------------------------- /Chapter_RNN/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/keras_metadata.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/keras_metadata.pb
-------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/saved_model.pb -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/variables/variables.index -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/NumpyBasics.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Numpy Basics 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | # create a 2D array 5 | import numpy as np 6 | 7 | arr2D = np.array([[1,4,6],[2,5,7]]) 8 | 9 | # getting information about arr2D 10 | print(arr2D.size) # returns 6, the no. of items 11 | print(arr2D.ndim) # returns 2, the no. of dimensions 12 | print(arr2D.shape) # returns tuple(2,3) corresponding to 2 rows & 3 columns 13 | 14 | # create a 1D array 15 | arr1D = np.array([1,4,6]) 16 | 17 | # getting information about arr1D 18 | print(arr1D.size) # returns 3, the no. of items 19 | print(arr1D.ndim) # returns 1, the no. 
of dimensions 20 | print(arr1D.shape) # returns tuple(3,) corresponding to 3 items 21 | 22 | #%% creating numpy arrays 23 | # creating sequence of numbers 24 | arr1 = np.arange(3, 6) # same as Python range function; results in array([3,4,5]) 25 | arr2 = np.arange(3, 9, 2) # the 3rd argument defines the step size; results in array([3,5,7]) 26 | arr3 = np.linspace(1,7,3) # creates evenly spaced 3 values from 1 to 7; results in array([1,4,7]) 27 | 28 | # creating special arrays 29 | arr4 = np.ones((2,1)) # array of shape (2,1) with all items as 1 30 | arr5 = np.zeros((2,2)) # all items as zero; often used as placeholder array at beginning of script 31 | arr6 = np.eye(2) # diagonal items as 1 32 | 33 | # adding axis to existing arrays (e.g., converting 1D array to 2D array) 34 | print(arr1[:, np.newaxis]) 35 | arr7 = arr1[:, None] # same as above 36 | 37 | # combining / stacking arrays 38 | print(np.hstack((arr1, arr2))) # horizontally stacks passed arrays 39 | print(np.vstack((arr1, arr2))) # vertically stacks passed arrays 40 | print(np.hstack((arr5,arr4))) # array 4 added as a column into arr5 41 | print(np.vstack((arr5,arr6))) # rows of array 6 added onto arr5 42 | 43 | #%% basic numpy functions 44 | print(arr2D.sum(axis=0)) 45 | print(arr2D.sum(axis=1)) 46 | 47 | #%% indexing arrays 48 | # accessing individual items 49 | print(arr2D[1,2]) # returns 7 50 | 51 | # slicing 52 | arr8 = np.arange(10).reshape((2,5)) # rearrange the 1D array into shape (2,5) 53 | print((arr8[0:1,1:3])) 54 | print((arr8[0,1:3])) # note that a 1D array is returned here instead of the 2D array above 55 | 56 | # accessing entire row or column 57 | print(arr8[1]) # returns 2nd row as array([5,6,7,8,9]); same as arr8[1,:] 58 | print(arr8[:, 4]) # returns items of 5th column as a 1D array 59 | 60 | # extract a subarray from arr8 and modify it 61 | arr8_sub = arr8[:, :2] # columns 0 and 1 from all rows 62 | arr8_sub[1, 1] = 1000 63 | print(arr8) # arr8 gets modified as well!! 
64 | 65 | # use copy method for a separate copy 66 | arr8 = np.arange(10).reshape((2,5)) 67 | arr8_sub2 = arr8[:, :2].copy() 68 | arr8_sub2[1, 1] = 100 69 | print(arr8) 70 | 71 | # Fancy indexing 72 | # combination of simple and fancy indexing 73 | arr8_sub3 = arr8[:, [0, 1]] # note how columns are indexed via a list 74 | arr8_sub3[1, 1] = 100 # arr8_sub3 becomes same as arr8_sub2 but arr8 is not modified here 75 | print(arr8) 76 | 77 | # use boolean mask to select subarray 78 | arr8_sub4 = arr8[arr8 > 5] # returns array([6,7,8,9]), i.e., all values > 5 79 | arr8_sub4[0] = 0 # again, arr8 is not affected 80 | print(arr8) 81 | 82 | #%% vectorized operations 83 | vec1 = np.array([1,2,3,4]) 84 | vec2 = np.array([5,6,7,8]) 85 | vec_sum = vec1 + vec2 # returns array([6,8,10,12]); no need to loop through index 0 to 3 86 | 87 | # slightly more complex operation (computing distance between vectors) 88 | vec_distance = np.sqrt(np.sum((vec1 - vec2)**2)) # vec_distance = 8.0 89 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/PandasBasics.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Pandas Basics 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | # create a series (1D structure) 5 | import pandas as pd 6 | 7 | data = [10,8,6] 8 | s = pd.Series(data) # can pass numpy array as well 9 | print(s) 10 | 11 | # create a dataframe 12 | data = [[1,10],[1,8],[1,6]] 13 | df = pd.DataFrame(data, columns=['id', 'value']) 14 | print(df) 15 | 16 | # dataframe from series 17 | s2 = pd.Series([1,1,1]) 18 | df = pd.DataFrame({'id':s2, 'value':s}) 19 | print(df) 20 | 21 | #%% data access 22 | # column(s) selection 23 | print(df['id']) # returns column 'id' as a series 24 | print(df.id) # same as above 25 | print(df[['id']]) # returns specified columns in the list as a dataframe 26 | 27 | # row selection 28 | df.index = [100, 101, 102] # changing row indices from [0,1,2] to [100,101,102] 29 | print(df) 30 | print(df.loc[101]) # returns 2nd row as a series; can provide a list for multiple rows selection 31 | print(df.iloc[1]) # integer location-based selection; same result as above 32 | 33 | # individual item selection 34 | print(df.loc[101, 'value']) # returns 8 35 | print(df.iloc[1, 1]) # same as above 36 | 37 | #%% data aggregation example 38 | # create another dataframe using df 39 | df2 = df.copy() 40 | df2.id = 2 # make all items in column 'id' as 2 41 | df2.value *= 4 # multiply all items in column 'value' by 4 42 | print(df2) 43 | 44 | # combine df and df2 45 | df3 = df.append(df2) # a new object is returned unlike Python’s append function 46 | print(df3) 47 | 48 | # id-based mean values computation 49 | print(df3.groupby('id').mean()) # returns a dataframe 50 | 51 | #%% file I/O 52 | # reading from excel and csv files 53 | dataset1 = pd.read_excel('filename.xlsx') # several parameter options are available to customize what data is read 54 | dataset2 = pd.read_csv('filename.csv') 55 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/PythonBasics.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Python Basics 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 |
#%% basic data types 6 | i = 2 # integer; type(i) = int 7 | f = 1.2 # floating-point number; type(f) = float 8 | s = 'two' # string; type(s) = str 9 | b = True # boolean; type(b) = bool 10 | 11 | # basic operations 12 | print(i+2) # displays 4 13 | print(f*2) # displays 2.4 14 | print(not b) # displays False 15 | 16 | #%% ordered sequences 17 | # different ways of creating lists 18 | list1 = [2,4,6] 19 | list2 = ['air',3,1,5] 20 | list3 = list(range(4)) # equals [0,1,2,3]; range function returns a sequence of numbers starting from 0 (default) with increments of 1 (default) 21 | list3.append(8) # returns [0,1,2,3,8]; append function adds new items to existing list 22 | list4 = list1 + list2 # equals [2,4,6,'air',3,1,5] 23 | list5 = [list2, list3] # nested list [['air', 3, 1, 5], [0, 1, 2, 3,8]] 24 | 25 | # creating tuples 26 | tuple1 = (0,1,'two') 27 | tuple2 = (list1, list2) # equals ([2, 4, 6], ['air', 3, 1, 5]) 28 | 29 | #%% list comprehension 30 | # return powers of list items 31 | newList1 = [item**2 for item in list3] # equals [0,1,4,9, 64] 32 | # nested list comprehension 33 | newList2 = [item2**2 for item2 in [item**2 for item in list3]] # equals [0,1,16,81, 4096] 34 | 35 | #%% Indexing and slicing sequences 36 | # working with single item using positive or negative indexes 37 | print(list1[0]) # displays 2, the 1st item in list1 38 | list2[1] = 1 # list2 becomes ['air',1,1,5] 39 | print(list2[-2]) # displays 1, the 2nd last element in list2 40 | 41 | # accessing multiple items through slicing 42 | # Syntax: givenList[start:stop:step]; if unspecified, start=0, stop=list length, step=1 43 | print(list4[0:3]) # displays [2,4,6], the 1st, 2nd, 3rd items; note that index 3 item is excluded 44 | print(list4[:3]) # same as above 45 | print(list4[4:len(list4)]) # displays [3,1,5]; len() function returns the number of items in list 46 | print(list4[4:]) # same as above 47 | print(list4[::3]) # displays [2, 'air', 5] 48 | print(list4[::-1]) # displays list 4 backwards [5, 1, 3, 'air', 6, 4, 2] 49 | list4[2:4] = [0,0,0] # list 4 becomes [2, 4, 0, 0, 0, 3, 1, 5] 50 | 51 | #%% Execution control statements 52 | # conditional execution 53 | # selectively execute code based on condition 54 | if list1[0] > 0: 55 | list1[0] = 'positive' 56 | else: 57 | list1[0] = 'negative' 58 | 59 | # loop execution 60 | # code below computes sum of squares of numbers in list 3 61 | sum_of_squares = 0 62 | for i in range(len(list3)): 63 | sum_of_squares += list3[i]**2 64 | 65 | print(sum_of_squares) # displays 78 66 | 67 | #%% custom functions 68 | # define function instructions 69 | def sumSquares(givenList): 70 | sum_of_squares = 0 71 | for i in range(len(givenList)): 72 | sum_of_squares += givenList[i]**2 73 | 74 | return sum_of_squares 75 | 76 | # call/re-use the custom function multiple times 77 | print(sumSquares(list3)) # displays 78 78 | print(sumSquares(list4)) # displays 55 79 | 80 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/quadratic_raw_data.csv: -------------------------------------------------------------------------------- 1 | -5.000000000000000000e+00,1.900000000000000000e+01 2 | -4.400000000000000355e+00,1.800000000000000000e+01 3 | -4.349999999999999645e+00,1.780000000000000071e+01 4 | 
-4.099999999999999645e+00,1.600000000000000000e+01 5 | -4.049999999999999822e+00,1.219999999999999929e+01 6 | -4.000000000000000000e+00,1.200000000000000000e+01 7 | -3.799999999999999822e+00,7.000000000000000000e+00 8 | -3.700000000000000178e+00,6.000000000000000000e+00 9 | -3.500000000000000000e+00,5.500000000000000000e+00 10 | -3.000000000000000000e+00,6.500000000000000000e+00 11 | -2.899999999999999911e+00,6.480000000000000426e+00 12 | -2.799999999999999822e+00,6.450000000000000178e+00 13 | -2.000000000000000000e+00,2.500000000000000000e+00 14 | -1.949999999999999956e+00,2.450000000000000178e+00 15 | -1.350000000000000089e+00,0.000000000000000000e+00 16 | -1.300000000000000044e+00,-2.500000000000000000e-01 17 | -1.149999999999999911e+00,-3.000000000000000000e+00 18 | 0.000000000000000000e+00,-1.000000000000000000e+00 19 | 2.000000000000000111e-01,-1.000000000000000056e-01 20 | 2.999999999999999889e-01,-1.199999999999999956e-01 21 | 5.000000000000000000e-01,-1.100000000000000006e-01 22 | 1.199999999999999956e+00,-2.000000000000000000e+00 23 | 1.699999999999999956e+00,1.000000000000000000e+00 24 | 1.800000000000000044e+00,1.399999999999999911e+00 25 | 1.899999999999999911e+00,1.449999999999999956e+00 26 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/typicalML_script.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Linear regression model 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import libraries 6 | import numpy as np 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.metrics import r2_score 11 | import matplotlib.pyplot as plt 12 | 13 | #%% read data 14 | data = np.loadtxt('quadratic_raw_data.csv', delimiter=',') 15 | x = data[:,0:1]; y = data[:,1:] # equivalent to y = data[:,1,None] which returns 2D array 16 | 17 | #%% Pre-process / Feature engineering 18 | poly = PolynomialFeatures(degree=2, include_bias=False) 19 | X_poly = poly.fit_transform(x) # X_poly: 1st column is x, 2nd column is x^2 20 | 21 | #%% scale model input variables 22 | scaler = StandardScaler() 23 | X_scaled = scaler.fit_transform(X_poly) 24 | 25 | #%% fit linear model & predict 26 | model = LinearRegression() 27 | model.fit(X_poly, y) 28 | y_predicted = model.predict(X_poly) 29 | 30 | #%% Assess model accuracy 31 | print('Fit accuracy = ', r2_score(y, y_predicted)) 32 | 33 | #%% plot predictions 34 | plt.figure(figsize=(4, 2)) 35 | plt.plot(x, y, 'o', label='raw data') 36 | plt.plot(x, y_predicted, label='quadratic fit') 37 | plt.legend() 38 | plt.xlabel('x'), plt.ylabel('y') 39 | 40 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/Metal_etch_2DPCA_testData.csv: -------------------------------------------------------------------------------- 1 | 1.723544992180245927e+00,9.056845222188112388e+00 2 | -1.195362798389852976e+01,1.971340850279771573e+01 3 | -1.405180395370675228e+01,1.003786454500131242e+01 4 | -2.296861786600156208e+01,5.985086921969785578e+01 5 | -8.375268274972489380e+00,1.115643566056243685e+01 6 | -1.173779751252286374e+01,1.000032875202828109e+01 7 | -1.077657111425542347e+01,-3.574611061778244903e+01 8 | -1.546675594585076574e+01,2.381405848082144772e+01 9 | 
-1.152423531101815612e+01,8.593634005642147855e+00 10 | 6.834573733321335220e-01,-1.356739897215637569e+01 11 | -1.144209951547643556e+01,-1.816846025077951765e+01 12 | -9.969315306557533063e+00,-3.025019322744546102e+01 13 | -1.444956309528516236e+01,2.780732221277049376e+01 14 | -1.618541901915454417e+01,-1.541740007231091170e+01 15 | 1.236806249975188088e+01,5.187754758901448326e+00 16 | 1.774901749020017405e+01,1.940141924912499860e+01 17 | 1.876992176399419776e+01,8.031281916261185927e+00 18 | 2.659171665908793258e+01,-1.896470000938777289e+01 19 | 2.257578315399075564e+01,1.854098261967717676e+01 20 | 3.469430765643569714e+01,-1.916772089195522444e+00 21 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVDD_FaultDetection.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Process Fault Detection via SVDD in metal etch dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | 8 | X_train = np.loadtxt('Metal_etch_2DPCA_trainingData.csv', delimiter=',') 9 | 10 | #%% bandwidth via modified mean criteria 11 | import scipy.spatial 12 | 13 | N = X_train.shape[0] 14 | phi = 1/np.log(N-1) 15 | delta = -0.14818008*np.power(phi,4) + 0.2846623624*np.power(phi,3) - 0.252853808*np.power(phi,2) + 0.159059498*phi - 0.001381145 16 | D2 = np.sum(scipy.spatial.distance.pdist(X_train, 'sqeuclidean'))/(N*(N-1)/2) # pdist computes pairwise distances between observations 17 | sigma = np.sqrt(D2/np.log((N-1)/delta*delta)) 18 | gamma = 1/(2*sigma*sigma) 19 | 20 | #%% SVM fit 21 | from sklearn.svm import OneClassSVM 22 | 23 | model = OneClassSVM(nu=0.01, gamma=0.025).fit(X_train) # nu corresponds to f 24 | 25 | #%% predict for test data 26 | X_test = np.loadtxt('Metal_etch_2DPCA_testData.csv', delimiter=',') 27 | y_test = model.predict(X_test) # y=-1 for outliers 28 | 29 | print('Number of faults identified: ', np.sum(y_test == -1), ' out of ', len(y_test)) 30 | 31 | #%% plot SVDD boundaries 32 | import matplotlib.pyplot as plt 33 | 34 | plt.figure() 35 | plt.scatter(X_train[:, 0], X_train[:, 1], edgecolors='k', alpha=0.8) 36 | plt.xlabel('PC1 scores') 37 | plt.ylabel('PC2 scores') 38 | 39 | # get axis limits 40 | ax = plt.gca() 41 | xlim = ax.get_xlim() 42 | ylim = ax.get_ylim() 43 | 44 | # create grid to evaluate model 45 | xx = np.linspace(xlim[0], xlim[1], 100) 46 | yy = np.linspace(ylim[0], ylim[1], 100) 47 | YY, XX = np.meshgrid(yy, xx) 48 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 49 | Z = model.decision_function(xy).reshape(XX.shape) 50 | 51 | # plot decision boundary and supporting planes 52 | ax.contour(XX, YY, Z, levels=[0], alpha=0.9, linestyles=['-'], colors=['red']) 53 | 54 | #%% plot test data 55 | plt.scatter(X_test[y_test==-1, 0],X_test[y_test==-1,1], c='red', marker = '*', label='True Positive') 56 | plt.scatter(X_test[y_test==1, 0],X_test[y_test==1,1], c='magenta', marker = '*', label='False Negative') 57 | plt.legend() 58 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVDD_OneClassClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Nonlinear boundary generation via One Class SVM / SVDD 3 | ## 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% generate data 6 | import numpy as np 7 | 8 | X = np.loadtxt('SVDD_toyDataset.csv', delimiter=',') 9 | 10 | #%% compute bandwidth via modified mean criteria 11 | import scipy.spatial 12 | 13 | N = X.shape[0] 14 | phi = 1/np.log(N-1) 15 | delta = -0.14818008*np.power(phi,4) + 0.2846623624*np.power(phi,3) - 0.252853808*np.power(phi,2) + 0.159059498*phi - 0.001381145 16 | D2 = np.sum(scipy.spatial.distance.pdist(X, 'sqeuclidean'))/(N*(N-1)/2) # pdist computes pairwise distances between observations 17 | sigma = np.sqrt(D2/np.log((N-1)/delta*delta)) 18 | gamma = 1/(2*sigma*sigma) 19 | 20 | #%% SVM fit 21 | from sklearn.svm import OneClassSVM 22 | 23 | model = OneClassSVM(nu=0.01, gamma=5) 24 | model.fit(X) 25 | 26 | #%% plot SVM boundaries 27 | import matplotlib.pyplot as plt 28 | 29 | plt.figure() 30 | plt.scatter(X[:, 0], X[:, 1], edgecolors='k', alpha=0.8) 31 | plt.xlabel('x1') 32 | plt.ylabel('x2') 33 | 34 | # get axis limits 35 | ax = plt.gca() 36 | xlim = ax.get_xlim() 37 | ylim = ax.get_ylim() 38 | 39 | # create grid to evaluate model 40 | xx = np.linspace(xlim[0], xlim[1], 100) 41 | yy = np.linspace(ylim[0], ylim[1], 100) 42 | YY, XX = np.meshgrid(yy, xx) 43 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 44 | Z = model.decision_function(xy).reshape(XX.shape) 45 | 46 | # plot decision boundary and supporting planes 47 | ax.contour(XX, YY, Z, levels=[0], alpha=0.9, linestyles=['-'], colors=['red']) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVM_BinaryClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Binary classification via SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | 8 | data = np.loadtxt('toyDataset.csv', delimiter=',') 9 | X = data[:, [0, 1]]; y = data[:, 2] 10 | 11 | #%% scale model inputs 12 | from sklearn.preprocessing import StandardScaler 13 | 14 | scaler = StandardScaler() 15 | X_scaled = scaler.fit_transform(X) 16 | 17 | #%% fit SVM model 18 | from sklearn.svm import SVC # for large datasets LinearSVC class is preferable 19 | 20 | model = SVC(kernel='linear', C=100) 21 | model.fit(X_scaled, y) 22 | 23 | #%% get details of support vectors 24 | print('# of support vectors:', len(model.support_)) 25 | 26 | #%% plot SVM boundaries 27 | import matplotlib.pyplot as plt 28 | 29 | plt.figure() 30 | plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 31 | plt.xlabel('X1'), plt.ylabel('X2') 32 | 33 | # get axis limits 34 | ax = plt.gca() 35 | xlim = ax.get_xlim() 36 | ylim = ax.get_ylim() 37 | 38 | # create grid to evaluate model 39 | xx = np.linspace(xlim[0], xlim[1], 100) 40 | yy = np.linspace(ylim[0], ylim[1], 100) 41 | YY, XX = np.meshgrid(yy, xx) 42 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 43 | Z = model.decision_function(xy).reshape(XX.shape) 44 | 45 | # plot decision boundary and supporting planes 46 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) 47 | 48 | # highlight support vectors 49 | ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=200, linewidth=2, alpha=0.25) 50 | 51 | -------------------------------------------------------------------------------- 
/Chapter_SupportVectorMachines/SVM_Kernel_BinaryClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Nonlinear binary classification via kernel SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% generate data 6 | import matplotlib.pyplot as plt 7 | from sklearn.datasets import make_circles 8 | 9 | X, y = make_circles(500, factor=.08, noise=.1, random_state=1) 10 | # note that y = 0,1 here and need not be +-1; SVM does internal transformation accordingly 11 | 12 | # plot 13 | plt.figure() 14 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 15 | plt.xlabel('x1') 16 | plt.ylabel('x2') 17 | plt.title('raw data') 18 | 19 | #%% find optimal hyperparameter via GridSearchCV 20 | from sklearn.svm import SVC 21 | from sklearn.model_selection import GridSearchCV 22 | 23 | param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[0.01, 0.1, 1, 10, 100]} 24 | gs = GridSearchCV(SVC(), param_grid, cv=5).fit(X, y) # no scaling required as input variables are already scaled 25 | 26 | print('Optimal hyperparameter:', gs.best_params_) 27 | 28 | #%% plot model predictions 29 | y_predicted = gs.predict(X) 30 | 31 | # plot 32 | plt.figure() 33 | plt.scatter(X[:, 0], X[:, 1], c=y_predicted, cmap=plt.cm.Paired, edgecolors='k') 34 | plt.xlabel('x1') 35 | plt.ylabel('x2') 36 | plt.title('predictions') 37 | 38 | #%% plot SVM boundaries 39 | plt.figure() 40 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 41 | plt.xlabel('X1'), plt.ylabel('X2') 42 | 43 | # get axis limits 44 | ax = plt.gca() 45 | xlim = ax.get_xlim() 46 | ylim = ax.get_ylim() 47 | 48 | # create grid to evaluate model 49 | import numpy as np 50 | xx = np.linspace(xlim[0], xlim[1], 100) 51 | yy = np.linspace(ylim[0], ylim[1], 100) 52 | YY, XX = np.meshgrid(yy, xx) 53 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 54 | Z = gs.decision_function(xy).reshape(XX.shape) 55 | 56 | # plot decision boundary and supporting planes 57 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVM_Kernel_BinaryClassification_noGridSearch.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Nonlinear binary classification via kernel SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% generate data 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from sklearn.datasets import make_circles 9 | 10 | X, y = make_circles(500, factor=.08, noise=.1, random_state=1) 11 | # note that y = 0,1 here and need not be +-1; SVM does internal transformation accordingly 12 | 13 | # plot 14 | plt.figure() 15 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 16 | plt.xlabel('x1') 17 | plt.ylabel('x2') 18 | plt.title('raw data') 19 | 20 | #%% SVM fit 21 | from sklearn.svm import SVC 22 | 23 | model = SVC(C=100, gamma=1) 24 | model.fit(X, y) # no scaling required as input variables are already scaled 25 | 26 | #%% plot model predictions 27 | y_predicted = model.predict(X) 28 | 29 | # plot 30 | plt.figure() 31 | plt.scatter(X[:, 0], X[:, 1], c=y_predicted, 
cmap=plt.cm.Paired, edgecolors='k') 32 | plt.xlabel('x1') 33 | plt.ylabel('x2') 34 | plt.title('predictions') 35 | 36 | #%% plot SVM boundaries 37 | plt.figure() 38 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 39 | plt.xlabel('x1') 40 | plt.ylabel('x2') 41 | 42 | # get axis limits 43 | ax = plt.gca() 44 | xlim = ax.get_xlim() 45 | ylim = ax.get_ylim() 46 | 47 | # create grid to evaluate model 48 | xx = np.linspace(xlim[0], xlim[1], 100) 49 | yy = np.linspace(ylim[0], ylim[1], 100) 50 | YY, XX = np.meshgrid(yy, xx) 51 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 52 | Z = model.decision_function(xy).reshape(XX.shape) 53 | 54 | # plot decision boundary and supporting planes 55 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVM_SoftMarginClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Binary classification via soft margin SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | 8 | data = np.loadtxt('toyDataset2.csv', delimiter=',') 9 | X = data[:,0:2]; y = data[:,2] 10 | 11 | #%% scale model inputs 12 | from sklearn.preprocessing import StandardScaler 13 | 14 | scaler = StandardScaler() 15 | X_scaled = scaler.fit_transform(X) 16 | 17 | #%% SVM fit 18 | from sklearn.svm import SVC 19 | 20 | model = SVC(kernel='linear', C=100) 21 | model.fit(X_scaled, y) 22 | 23 | #%% get details of support vectors 24 | print('# of support vectors:', len(model.support_)) 25 | # The BAD sample lying on the wrong side of the support plane is also a support vector 26 | 27 | #%% plot SVM boundaries 28 | import matplotlib.pyplot as plt 29 | 30 | plt.figure() 31 | plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 32 | 33 | # get axis limits 34 | ax = plt.gca() 35 | xlim = ax.get_xlim() 36 | ylim = ax.get_ylim() 37 | 38 | # create grid to evaluate model 39 | xx = np.linspace(xlim[0], xlim[1], 100) 40 | yy = np.linspace(ylim[0], ylim[1], 100) 41 | YY, XX = np.meshgrid(yy, xx) 42 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 43 | Z = model.decision_function(xy).reshape(XX.shape) 44 | 45 | # plot decision boundary and supporting planes 46 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) 47 | 48 | # highlight support vectors 49 | ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=200, linewidth=2, alpha=0.25) 50 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVR_illustration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## SVR quadratic fitting 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import 6 | import numpy as np 7 | np.random.seed(1) 8 | 9 | #%% generate data 10 | x = np.linspace(-1, 1, 50)[:, None] 11 | y = x*x + 0.25 12 | y = y + np.random.normal(0, 0.15, (50,1)) 13 | 14 | #%% plot 15 | import matplotlib.pyplot as plt 16 | plt.figure() 17 | plt.scatter(x,y,edgecolors='k', alpha=0.8) 18 | plt.xlabel('x'), 
plt.ylabel('y') 19 | 20 | #%% fit SVR model 21 | from sklearn.svm import SVR 22 | 23 | epsilon = 0.1 24 | model = SVR(gamma=0.5, C=10, epsilon=epsilon) 25 | model.fit(x, y) 26 | 27 | #%% predict 28 | xx = np.linspace(-1, 1, 200)[:, None] 29 | yy_predicted = model.predict(xx) 30 | yy_epsilon_tube_upper = yy_predicted + epsilon 31 | yy_epsilon_tube_lower = yy_predicted - epsilon 32 | 33 | #%% get support vectors 34 | x_SVs = model.support_vectors_ 35 | y_SVs = y[model.support_] 36 | 37 | #%% plot 38 | plt.figure() 39 | plt.scatter(x,y,edgecolors='k', alpha=0.8) 40 | plt.plot(xx, yy_predicted, 'r') 41 | plt.plot(xx, yy_epsilon_tube_upper, '--g') 42 | plt.plot(xx, yy_epsilon_tube_lower, '--g') 43 | plt.scatter(x_SVs, y_SVs, s=200, linewidth=2, edgecolors='m', alpha=0.15) 44 | plt.xlabel('x'), plt.ylabel('y') 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/debutanizer_Softsensing_PLS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% separate train and test data 13 | from sklearn.model_selection import train_test_split 14 | X = data[:,0:-1] 15 | y = data[:,-1][:,np.newaxis] 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 17 | 18 | #%% scale data 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | X_scaler = StandardScaler() 22 | X_train_normal = X_scaler.fit_transform(X_train) 23 | X_test_normal = X_scaler.transform(X_test) 24 | 25 | y_scaler = StandardScaler() 26 | y_train_normal = y_scaler.fit_transform(y_train) 27 | y_test_normal = y_scaler.transform(y_test) 28 | 29 | #%% Finding # latents using kFold cross validation 30 | from sklearn.model_selection import KFold 31 | from sklearn.metrics import mean_squared_error 32 | from sklearn.cross_decomposition import PLSRegression 33 | 34 | scaler = StandardScaler() 35 | 36 | fit_MSE = [] 37 | validate_MSE = [] 38 | for n_comp in range(1,8): 39 | local_fit_MSE = [] 40 | local_validate_MSE = [] 41 | 42 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 43 | for fit_index, validate_index in kfold.split(y_train): 44 | X_fit_normal = scaler.fit_transform(X_train[fit_index]) 45 | X_validate_normal = scaler.transform(X_train[validate_index]) 46 | 47 | y_fit_normal = scaler.fit_transform(y_train[fit_index]) 48 | y_validate_normal = scaler.transform(y_train[validate_index]) 49 | 50 | pls = PLSRegression(n_components = n_comp) 51 | pls.fit(X_fit_normal, y_fit_normal) 52 | 53 | local_fit_MSE.append(mean_squared_error(y_fit_normal, pls.predict(X_fit_normal))) 54 | local_validate_MSE.append(mean_squared_error(y_validate_normal, 55 | pls.predict(X_validate_normal))) 56 | 57 | fit_MSE.append(np.mean(local_fit_MSE)) 58 | validate_MSE.append(np.mean(local_validate_MSE)) 59 | 60 | 61 | # plot 62 | plt.figure() 63 | plt.plot(range(1,8), fit_MSE, 'b*', label = 'Training MSE') 64 | plt.plot(range(1,8), validate_MSE, 'r*', label = 'Validation MSE') 65 | plt.xticks(range(1,8)) 66 | plt.ylabel('Mean Squared Error (MSE)') 67 | plt.xlabel('# of latents') 68 | plt.legend() 69 | 70 | 
#%% build PLS model 71 | pls = PLSRegression(n_components = 5) 72 | pls.fit(X_train_normal, y_train_normal) 73 | 74 | #%% check training vs test accuracy 75 | print('Accuracy over training data: ', pls.score(X_train_normal, y_train_normal)) 76 | print('Accuracy over test data: ', pls.score(X_test_normal, y_test_normal)) 77 | 78 | #%% plots of raw and predicted data 79 | y_train_normal_predict = pls.predict(X_train_normal) 80 | y_test_normal_predict = pls.predict(X_test_normal) 81 | 82 | y_train_predict = y_scaler.inverse_transform(y_train_normal_predict) 83 | y_test_predict = y_scaler.inverse_transform(y_test_normal_predict) 84 | 85 | 86 | plt.figure() 87 | plt.plot(y_train, 'b', label = 'Raw data') 88 | plt.plot(y_train_predict, 'r', label = 'PLS prediction') 89 | plt.ylabel('C4 content (training data)') 90 | plt.xlabel('Sample #') 91 | plt.legend() 92 | 93 | 94 | plt.figure() 95 | plt.plot(y_test, 'b', label = 'Raw data') 96 | plt.plot(y_test_predict, 'r', label = 'PLS prediction') 97 | plt.ylabel('C4 content (test data)') 98 | plt.xlabel('Sample #') 99 | plt.legend() 100 | 101 | plt.figure() 102 | plt.plot(y_train, y_train_predict, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 103 | plt.plot(y_train, y_train, '-r', linewidth=0.5) 104 | plt.xlabel('C4 content (raw training data)') 105 | plt.ylabel('C4 content (prediction)') 106 | 107 | plt.figure() 108 | plt.plot(y_test, y_test_predict, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 109 | plt.plot(y_test, y_test, '-r', linewidth=0.5) 110 | plt.xlabel('C4 content (raw test data)') 111 | plt.ylabel('C4 content (prediction)') 112 | 113 | #%% residuals 114 | plt.figure() 115 | plt.plot(y_test, y_test-y_test_predict, '*') 116 | plt.xlabel('C4 content test data') 117 | plt.ylabel('residual (raw data- prediction)') 118 | plt.title('residual plot') -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/debutanizer_Softsensing_SVR.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## SVR model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% separate train and test data 13 | from sklearn.model_selection import train_test_split 14 | X = data[:,0:-1] 15 | y = data[:,-1] 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 17 | 18 | #%% fit SVR model via grid-search 19 | from sklearn.svm import SVR 20 | from sklearn.model_selection import GridSearchCV 21 | 22 | model = SVR(epsilon=0.05) 23 | param_grid = [{'gamma': np.linspace(1,10,10), 'C': np.linspace(0.01,500,10)}] 24 | gs = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=10, verbose=2) 25 | 26 | gs.fit(X_train, y_train) 27 | print('Optimal hyperparameter:', gs.best_params_) 28 | 29 | #%% predict using the best model 30 | y_train_predicted = gs.predict(X_train) 31 | y_test_predicted = gs.predict(X_test) 32 | 33 | #%% plots of raw and predicted data 34 | plt.figure() 35 | plt.plot(y_train, 'b', label = 'Raw data') 36 | plt.plot(y_train_predicted, 'r', label = 'SVR prediction') 37 | plt.ylabel('C4 content (training data)') 38 | plt.xlabel('Sample #') 39 | plt.legend() 40 | 41 | 42 | plt.figure() 43 
| plt.plot(y_test, 'b', label = 'Raw data') 44 | plt.plot(y_test_predicted, 'r', label = 'SVR prediction') 45 | plt.ylabel('C4 content (test data)') 46 | plt.xlabel('Sample #') 47 | plt.legend() 48 | 49 | plt.figure() 50 | plt.plot(y_train, y_train_predicted, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 51 | plt.plot(y_train, y_train, '-r', linewidth=0.5) 52 | plt.xlabel('C4 content (raw training data)') 53 | plt.ylabel('C4 content (prediction)') 54 | 55 | plt.figure() 56 | plt.plot(y_test, y_test_predicted, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 57 | plt.plot(y_test, y_test, '-r', linewidth=0.5) 58 | plt.xlabel('C4 content (raw test data)') 59 | plt.ylabel('C4 content (prediction)') 60 | 61 | #%% residuals 62 | plt.figure() 63 | plt.plot(y_test, y_test-y_test_predicted, '*') 64 | plt.xlabel('C4 content test data') 65 | plt.ylabel('residual (raw data- prediction)') 66 | plt.title('residual plot') 67 | 68 | #%% check training vs test accuracy 69 | from sklearn.metrics import r2_score 70 | print('Accuracy over training data: ', r2_score(y_train, y_train_predicted)) 71 | print('Accuracy over test data: ', r2_score(y_test, y_test_predicted)) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/polymerPlantData_Softsensing_PLS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS model with polymer plant data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('polymer.dat') 11 | X = data[:,0:10] 12 | Y = data[:,10:] 13 | y = Y[:,3:] 14 | 15 | #%% scale data 16 | from sklearn.preprocessing import StandardScaler 17 | 18 | X_scaler = StandardScaler() 19 | X_scaled = X_scaler.fit_transform(X) 20 | 21 | y_scaler = StandardScaler() 22 | y_scaled = y_scaler.fit_transform(y) 23 | 24 | #%% Finding # latents using kFold cross validation 25 | from sklearn.model_selection import KFold 26 | from sklearn.metrics import mean_squared_error 27 | from sklearn.cross_decomposition import PLSRegression 28 | 29 | scaler = StandardScaler() 30 | 31 | fit_MSE = [] 32 | validate_MSE = [] 33 | for n_comp in range(1,10): 34 | local_fit_MSE = [] 35 | local_validate_MSE = [] 36 | 37 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 38 | for fit_index, validate_index in kfold.split(y): 39 | X_fit_scaled = scaler.fit_transform(X[fit_index]) 40 | X_validate_scaled = scaler.transform(X[validate_index]) 41 | 42 | y_fit_scaled = scaler.fit_transform(y[fit_index]) 43 | y_validate_scaled = scaler.transform(y[validate_index]) 44 | 45 | pls = PLSRegression(n_components = n_comp) 46 | pls.fit(X_fit_scaled, y_fit_scaled) 47 | 48 | local_fit_MSE.append(mean_squared_error(y_fit_scaled, pls.predict(X_fit_scaled))) 49 | local_validate_MSE.append(mean_squared_error(y_validate_scaled, 50 | pls.predict(X_validate_scaled))) 51 | 52 | fit_MSE.append(np.mean(local_fit_MSE)) 53 | validate_MSE.append(np.mean(local_validate_MSE)) 54 | 55 | 56 | # plot 57 | plt.figure() 58 | plt.plot(range(1,10), fit_MSE, 'b*', label = 'Training MSE') 59 | 
plt.plot(range(1,10), validate_MSE, 'r*', label = 'Validation MSE') 60 | plt.xticks(range(1,10)) 61 | plt.ylabel('Mean Squared Error (MSE)') 62 | plt.xlabel('# of latents') 63 | plt.legend() 64 | 65 | 66 | #%% build PLS model and predict 67 | from sklearn.cross_decomposition import PLSRegression 68 | 69 | pls = PLSRegression(n_components = 5) 70 | pls.fit(X_scaled, y_scaled) 71 | 72 | y_predicted_PLS = y_scaler.inverse_transform(pls.predict(X_scaled)) 73 | 74 | #%% plots of raw and predicted data 75 | plt.figure() 76 | plt.plot(y, y_predicted_PLS, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9, markerfacecolor = 'C4') 77 | plt.plot(y, y, '-r', linewidth=0.5) 78 | plt.xlabel('measured data') 79 | plt.ylabel('predicted data ') 80 | 81 | #%% metrics 82 | from sklearn.metrics import r2_score 83 | print('R2:', r2_score(y, y_predicted_PLS)) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/polymerPlantData_Softsensing_SVR.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## SVR model with polymer plant data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('polymer.dat') 11 | X = data[:,0:10] 12 | Y = data[:,10:] 13 | y = Y[:,2] 14 | 15 | #%% fit SVR model 16 | from sklearn.svm import SVR 17 | from sklearn.model_selection import GridSearchCV 18 | 19 | model = SVR(epsilon=0.01) # default epsilon = 0.1 20 | param_grid = [{'gamma': np.linspace(0.1e-05,5,100), 'C': np.linspace(0.01,5000,100)}] 21 | gs = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=10, verbose=2) 22 | 23 | gs.fit(X, y) 24 | print('Optimal hyperparameter:', gs.best_params_) 25 | 26 | #%% predict using the best model 27 | y_predicted_SVR = gs.predict(X) 28 | 29 | #%% plots of raw and predicted data 30 | plt.figure() 31 | plt.plot(y, y_predicted_SVR, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 32 | plt.plot(y, y, '-r', linewidth=0.5) 33 | plt.xlabel('measured data'), plt.ylabel('predicted data ') 34 | 35 | #%% metrics 36 | from sklearn.metrics import r2_score 37 | print('R2:', r2_score(y, y_predicted_SVR)) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/toyDataset.csv: -------------------------------------------------------------------------------- 1 | 1.178862847343031817e+00,1.043650985051199021e+00,-1.000000000000000000e+00 2 | 1.009649746807200765e+00,8.136507296635508979e-01,-1.000000000000000000e+00 3 | 9.722611797485600782e-01,9.645241020731013526e-01,-1.000000000000000000e+00 4 | 9.917258518517539922e-01,9.372999323176152142e-01,-1.000000000000000000e+00 5 | 9.956181831024071283e-01,9.522781969640496946e-01,-1.000000000000000000e+00 6 | 8.686135246637317620e-01,1.088462238049958453e+00,-1.000000000000000000e+00 7 | 1.088131804220753063e+00,1.170957306365294937e+00,-1.000000000000000000e+00 8 | 1.005003364217686102e+00,9.595322585399108650e-01,-1.000000000000000000e+00 9 | 9.454640052380469672e-01,8.453522684417031918e-01,-1.000000000000000000e+00 10 | 1.098236743425816009e+00,8.898932369888523652e-01,-1.000000000000000000e+00 11 | 8.814953472979827342e-01,9.794350100577459139e-01,-1.000000000000000000e+00 12 | 
1.148614835507459020e+00,1.023671626722691297e+00,-1.000000000000000000e+00 13 | 8.976214860073531421e-01,9.287006799887950192e-01,-1.000000000000000000e+00 14 | 1.062524496616283010e+00,9.839486636813076226e-01,-1.000000000000000000e+00 15 | 9.231163649680770300e-01,9.769969277722061474e-01,-1.000000000000000000e+00 16 | 1.789406751968644516e+00,1.487133293975156256e+00,1.000000000000000000e+00 17 | 1.550705200525287486e+00,1.174829970657395695e+00,1.000000000000000000e+00 18 | 1.603548068650810787e+00,9.597100192185596956e-01,1.000000000000000000e+00 19 | 1.589144957396505298e+00,1.127134908698859572e+00,1.000000000000000000e+00 20 | 1.834877355074896244e+00,1.234170292063891949e+00,1.000000000000000000e+00 21 | 1.505205746499770347e+00,1.327601054272420589e+00,1.000000000000000000e+00 22 | 1.657247508866390495e+00,1.040823075565584954e+00,1.000000000000000000e+00 23 | 1.628402042997896038e+00,1.179368674437410780e+00,1.000000000000000000e+00 24 | 1.595134124268523967e+00,1.253565657843322079e+00,1.000000000000000000e+00 25 | 1.430209067890807262e+00,1.217868576218473331e+00,1.000000000000000000e+00 26 | 1.821582013026379343e+00,1.352335740914497819e+00,1.000000000000000000e+00 27 | 1.832982499992199088e+00,1.384326878638266978e+00,1.000000000000000000e+00 28 | 1.878505175839104702e+00,1.115803917871956319e+00,1.000000000000000000e+00 29 | 1.801500008846861789e+00,1.026693256526946429e+00,1.000000000000000000e+00 30 | 1.627653787519133699e+00,1.020263354791302257e+00,1.000000000000000000e+00 31 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/toyDataset2.csv: -------------------------------------------------------------------------------- 1 | 1.178862847343031817e+00,1.043650985051199021e+00,-1.000000000000000000e+00 2 | 1.009649746807200765e+00,8.136507296635508979e-01,-1.000000000000000000e+00 3 | 9.722611797485600782e-01,9.645241020731013526e-01,-1.000000000000000000e+00 4 | 9.917258518517539922e-01,9.372999323176152142e-01,-1.000000000000000000e+00 5 | 9.956181831024071283e-01,9.522781969640496946e-01,-1.000000000000000000e+00 6 | 8.686135246637317620e-01,1.088462238049958453e+00,-1.000000000000000000e+00 7 | 1.088131804220753063e+00,1.170957306365294937e+00,-1.000000000000000000e+00 8 | 1.005003364217686102e+00,9.595322585399108650e-01,-1.000000000000000000e+00 9 | 9.454640052380469672e-01,8.453522684417031918e-01,-1.000000000000000000e+00 10 | 1.098236743425816009e+00,8.898932369888523652e-01,-1.000000000000000000e+00 11 | 8.814953472979827342e-01,9.794350100577459139e-01,-1.000000000000000000e+00 12 | 1.148614835507459020e+00,1.023671626722691297e+00,-1.000000000000000000e+00 13 | 8.976214860073531421e-01,9.287006799887950192e-01,-1.000000000000000000e+00 14 | 1.062524496616283010e+00,9.839486636813076226e-01,-1.000000000000000000e+00 15 | 9.231163649680770300e-01,9.769969277722061474e-01,-1.000000000000000000e+00 16 | 1.789406751968644516e+00,1.487133293975156256e+00,1.000000000000000000e+00 17 | 1.550705200525287486e+00,1.174829970657395695e+00,1.000000000000000000e+00 18 | 1.603548068650810787e+00,9.597100192185596956e-01,1.000000000000000000e+00 19 | 1.589144957396505298e+00,1.127134908698859572e+00,1.000000000000000000e+00 20 | 1.834877355074896244e+00,1.234170292063891949e+00,1.000000000000000000e+00 21 | 1.505205746499770347e+00,1.327601054272420589e+00,1.000000000000000000e+00 22 | 1.657247508866390495e+00,1.040823075565584954e+00,1.000000000000000000e+00 23 | 
1.628402042997896038e+00,1.179368674437410780e+00,1.000000000000000000e+00 24 | 1.595134124268523967e+00,1.253565657843322079e+00,1.000000000000000000e+00 25 | 1.430209067890807262e+00,1.217868576218473331e+00,1.000000000000000000e+00 26 | 1.821582013026379343e+00,1.352335740914497819e+00,1.000000000000000000e+00 27 | 1.832982499992199088e+00,1.384326878638266978e+00,1.000000000000000000e+00 28 | 1.878505175839104702e+00,1.115803917871956319e+00,1.000000000000000000e+00 29 | 1.801500008846861789e+00,1.026693256526946429e+00,1.000000000000000000e+00 30 | 1.627653787519133699e+00,1.020263354791302257e+00,1.000000000000000000e+00 31 | 1.699999999999999956e+00,1.100000000000000089e+00,-1.000000000000000000e+00 32 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/FDD.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Hello World Web App 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import packages 6 | import cherrypy 7 | 8 | #%% FDD tool Web application 9 | class FDDapp(object): 10 | @cherrypy.expose 11 | def getResults(self): 12 | processState = runPCAmodel() # returns 'All good' or 'Issue detected' 13 | return processState 14 | 15 | #%% execution settings 16 | cherrypy.config.update({'server.socket_host': '0.0.0.0'}) 17 | 18 | if __name__ == '__main__': 19 | cherrypy.quickstart(FDDapp()) # when this script is executed, host FDDapp app 20 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/PCAmetrics_history.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/PCAmetrics_history.pickle -------------------------------------------------------------------------------- /Chapter_WebDeployment/PCAmodelData.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/PCAmodelData.pickle -------------------------------------------------------------------------------- /Chapter_WebDeployment/ProcessMonitoring_PCA.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Train PCA model 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.decomposition import PCA 10 | import matplotlib.pyplot as plt 11 | 12 | #%% fetch data 13 | data = pd.read_excel('proc1a.xlsx', skiprows = 1,usecols = 'C:AI') 14 | 15 | #%% separate train data 16 | data_train = data.iloc[0:69,] 17 | 18 | #%% scale data 19 | scaler = StandardScaler() 20 | data_train_normal = scaler.fit_transform(data_train) 21 | 22 | #%% PCA 23 | pca = PCA() 24 | score_train = pca.fit_transform(data_train_normal) 25 | 26 | #%% decide # of PCs to retain and compute reduced data in PC space 27 | explained_variance = 100*pca.explained_variance_ratio_ # in percentage 28 | cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained 29 | 
30 | n_comp = np.argmax(cum_explained_variance >= 90) + 1 31 | score_train_reduced = score_train[:,0:n_comp] 32 | 33 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 34 | 35 | #%% reconstruct original data 36 | V_matrix = pca.components_.T 37 | P_matrix = V_matrix[:,0:n_comp] 38 | 39 | data_train_normal_reconstruct = np.dot(score_train_reduced, P_matrix.T) 40 | 41 | #%% calculate T2 for training data 42 | lambda_k = np.diag(pca.explained_variance_[0:n_comp]) # eigenvalue = explained variance 43 | lambda_k_inv = np.linalg.inv(lambda_k) 44 | 45 | T2_train = np.zeros((data_train_normal.shape[0],)) 46 | 47 | for i in range(data_train_normal.shape[0]): 48 | T2_train[i] = np.dot(np.dot(score_train_reduced[i,:],lambda_k_inv),score_train_reduced[i,:].T) 49 | 50 | #%% calculate Q for training data 51 | error_train = data_train_normal - data_train_normal_reconstruct 52 | Q_train = np.sum(error_train*error_train, axis = 1) 53 | 54 | #%% T2_train control limit 55 | import scipy.stats 56 | 57 | N = data_train_normal.shape[0] 58 | k = n_comp 59 | 60 | alpha = 0.01# 99% control limit 61 | T2_CL = k*(N**2-1)*scipy.stats.f.ppf(1-alpha,k,N-k)/(N*(N-k)) 62 | 63 | #%% Q_train control limit 64 | eig_vals = pca.explained_variance_ 65 | m = data_train_normal.shape[1] 66 | 67 | theta1 = np.sum(eig_vals[k:]) 68 | theta2 = np.sum([eig_vals[j]**2 for j in range(k,m)]) 69 | theta3 = np.sum([eig_vals[j]**3 for j in range(k,m)]) 70 | h0 = 1-2*theta1*theta3/(3*theta2**2) 71 | 72 | z_alpha = scipy.stats.norm.ppf(1-alpha) 73 | Q_CL = theta1*(z_alpha*np.sqrt(2*theta2*h0**2)/theta1+ 1 + theta2*h0*(1-h0)/theta1**2)**2 74 | 75 | #%% Q_train plot with CL 76 | plt.figure() 77 | plt.plot(Q_train) 78 | plt.plot([1,len(Q_train)],[Q_CL,Q_CL], color='red') 79 | plt.xlabel('Sample #') 80 | plt.ylabel('Q for training data') 81 | plt.show() 82 | 83 | #%% T2_train plot with CL 84 | plt.figure() 85 | plt.plot(T2_train) 86 | plt.plot([1,len(T2_train)],[T2_CL,T2_CL], color='red') 87 | plt.xlabel('Sample #') 88 | plt.ylabel('T$^2$ for training data') 89 | plt.show() 90 | 91 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 92 | ## Save model for later use 93 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 94 | import pickle 95 | PCAmodelData = {"PCAmodel": pca, 96 | "scaler": scaler, 97 | "n_comp": n_comp, 98 | "P_matrix": P_matrix, 99 | "lambda_k_inv": lambda_k_inv, 100 | "Q_CL": Q_CL, 101 | "T2_CL": T2_CL} # dictionary data structure uses key-value pairs 102 | 103 | with open('PCAmodelData.pickle', 'wb') as f: 104 | pickle.dump(PCAmodelData, f, pickle.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /Chapter_WebDeployment/contributionPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/contributionPlot.png -------------------------------------------------------------------------------- /Chapter_WebDeployment/frontEndTemplate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 21 | 22 | 31 | 32 | 33 | 34 | 35 |

[frontEndTemplate.html: the HTML markup of this file was lost during text extraction and only text fragments survive. The recoverable content is a page header reading "Smart Process Monitoring Tool" and a Jinja-style conditional block: {% if state == 0 %} the page shows "All Good", {% else %} it shows "Issue Detected" {% endif %}. The remaining template elements (originally on template lines 42-47) could not be recovered.]
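For orientation, the sketch below shows one way such a template could be served from a CherryPy handler of the kind used in FDD.py and helloWorld.py. This is an editor's illustration under stated assumptions, not the repository's actual implementation: the use of jinja2 is inferred from the {% if %} tags above, the template is assumed to sit in the working directory, and the hard-coded state value stands in for the PCA-based fault-detection logic.

import cherrypy
from jinja2 import Environment, FileSystemLoader  # jinja2 assumed from the {% if %} tags in the template

env = Environment(loader=FileSystemLoader('.'))   # assumes frontEndTemplate.html is in the working directory

class MonitoringUI(object):
    @cherrypy.expose
    def index(self):
        state = 0  # placeholder; a real app would set this from the process monitoring results
        template = env.get_template('frontEndTemplate.html')
        return template.render(state=state)  # renders 'All Good' when state == 0, else 'Issue Detected'

if __name__ == '__main__':
    cherrypy.config.update({'server.socket_host': '0.0.0.0'})
    cherrypy.quickstart(MonitoringUI())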
48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/helloWorld.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Hello World Web App 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import packages 6 | import cherrypy 7 | 8 | #%% Web application will be written as a Python class. 9 | # Methods of the class will be used to respond to client requests 10 | class HelloWorld(object): 11 | @cherrypy.expose 12 | def index(self): 13 | return "Hello world!" 14 | 15 | #%% execution settings 16 | cherrypy.config.update({'server.socket_host': '0.0.0.0'}) 17 | 18 | if __name__ == '__main__': 19 | cherrypy.quickstart(HelloWorld()) # when this script is executed, host HelloWorld app 20 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/metricPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/metricPlot.png -------------------------------------------------------------------------------- /Chapter_WebDeployment/proc1a.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/proc1a.xlsx -------------------------------------------------------------------------------- /Chapter_WebDeployment/processLatestDatabase_local.csv: -------------------------------------------------------------------------------- 1 | 0.51,-0.04,0.13,-0.79,0.26,0.29,-2.74,-0.19,-0.17,0.21,-0.1,0.38,-0.04,0.07,0,-0.4,-0.25,-0.14,-0.05,-1.27,-0.97,-0.1,0.91,1.15,-0.11,-0.01,-0.65,-0.03,-0.19,-0.31,0.07,0.46,-0.74 2 | 0.1,0.05,-0.2,0.1,-0.25,0.4,-0.59,-0.92,-0.88,-4.76,-0.81,-0.03,-0.79,0.28,-0.35,-0.24,-0.53,-0.28,-0.48,-0.74,-0.65,-0.22,0.5,0.42,-0.14,-0.03,-1.1,-0.2,-0.24,-0.53,-0.1,-0.1,-1.09 3 | -0.51,0.6,-0.22,0,-0.24,0.3,0.29,-0.38,-0.42,0.31,-0.29,0.3,-0.22,0.12,0.13,-0.37,-0.66,-0.26,-0.04,-0.67,-0.49,-0.75,0.83,0.17,-0.07,-0.01,-0.73,-0.06,-0.11,0.19,0.07,-0.14,-0.99 4 | -0.5,0.67,-0.16,-0.01,-0.23,0.24,0.32,-0.34,-0.36,0.25,-0.23,0.34,-0.16,0.1,0.14,-0.39,6.57,-0.25,0.01,-0.7,-0.47,-0.77,0.88,0.22,-0.1,-0.02,-0.66,0.01,-0.31,-0.08,0.07,-0.22,-0.72 5 | -1.19,-0.81,-1.74,0.08,-0.17,0.23,0.34,-0.15,0.01,0.02,0.04,0.29,0.05,0.18,0.38,-0.37,-0.62,-0.16,-0.14,-0.89,-0.42,-0.79,0.34,0.58,-0.12,-0.06,-0.68,-0.13,-0.31,-0.15,0.04,-0.28,-0.98 6 | -0.88,-1.78,-1.42,0.06,-0.51,-0.37,-4.28,0.01,0.46,-0.28,0.49,0.72,0.47,-0.09,0.64,-0.54,-0.14,-0.05,0.39,-0.99,-0.52,-0.97,0.3,0.38,-0.5,-0.25,-0.49,-0.29,-0.21,0.26,0.17,-0.19,-0.36 7 | -0.31,-0.81,-1.22,0.62,-0.43,-0.04,-2.43,0.26,0.45,-0.27,0.58,-0.12,0.37,0,1.46,-0.49,-0.28,-0.12,0.24,-1.04,-1.11,-0.8,0.19,0.24,-0.39,-0.19,-0.61,-0.23,-0.52,0.04,0.1,-0.57,-0.58 8 | -0.61,0.14,-0.5,0.24,-0.33,0.33,0.38,-0.47,-0.12,0.23,0,0.46,-0.07,0.08,0.13,-0.43,-0.69,-0.28,-0.02,-0.79,-0.83,-0.62,0.51,0.12,-0.07,0.02,-0.45,0.03,-0.43,-0.07,0.06,-0.6,-1.08 9 | 
-1.14,-0.56,-0.71,0.96,-0.28,0.27,0.91,-0.46,-0.04,0.21,0.03,0.47,0.03,0.09,-0.45,-0.44,-1,-0.38,0,-0.77,-0.03,-0.74,0.05,-0.4,0.23,-5.06,0.09,0.32,-0.17,0.01,0.15,-0.59,-1.09 10 | -2.47,0.41,-0.38,0.47,-0.21,0.32,0.67,-0.9,-0.36,0.49,-0.24,0.51,-0.14,0.07,-0.78,-0.78,-0.98,-0.47,0.14,-0.43,0.18,-0.96,-0.39,-0.4,0.6,0.47,0.71,0.76,0,0.14,0.42,-0.81,-1.7 11 | -2.26,0.08,-1.54,-0.56,-0.11,0.26,0.73,-0.73,-0.27,0.34,-0.12,-0.16,0.02,0.04,-0.45,-0.8,-0.84,-0.43,0.19,0.23,0.96,-1.5,-0.8,-0.94,0.86,0.63,0.99,1.03,0.31,0.41,0.59,-0.88,-1.73 12 | -1.94,-0.95,-0.94,0.07,-0.07,0.33,0.73,-0.48,-0.06,0.23,0.01,-0.27,0.07,0.11,-0.58,-0.76,-0.59,-0.38,0.08,0.41,1.57,-1.86,-0.91,-1.1,0.81,0.57,0.83,0.9,0.39,0.44,0.49,-0.87,-1.67 13 | -1.22,0.53,0.29,-0.27,-0.01,0.27,0.79,-0.46,0.12,-0.04,0.23,-0.25,0.39,0.11,-0.83,-0.77,-0.54,-0.32,0.13,0.92,1.53,-1.7,-1.28,-0.96,0.78,0.6,1.02,0.98,0.61,0.45,-4.51,-0.71,-1.39 14 | -1.03,-0.25,-0.6,-0.24,0.05,0.27,0.46,-0.22,-0.02,0.11,0.04,0.22,0.24,0.25,-0.55,-0.99,-0.36,-0.22,-0.12,1.28,1.48,-1.43,-1.54,-0.95,1,0.73,1.22,1.1,0.98,0.18,0.48,-0.86,-0.64 15 | -0.77,-0.45,0.16,-0.54,-0.02,0.39,0.51,-0.24,0.16,0.01,0.24,-0.54,0.34,0.27,-0.56,-0.98,-0.51,-0.22,0,1.05,1.86,-1.14,-1.57,-1.08,0.94,0.69,1.13,1.04,0.94,0.14,0.44,-1.28,-0.92 16 | -0.6,-0.34,-1.56,-0.37,-0.14,0.29,0.42,0.02,0.53,-0.35,0.6,0.08,0.55,0.34,-0.66,-0.94,-0.66,-0.29,-0.12,1.03,1.73,-1.14,-1.67,-1.7,0.66,0.59,0.99,0.85,0.88,0.01,0.28,-1.26,-0.26 17 | -1.32,-0.21,-0.24,-0.9,-0.28,0.03,0.5,-0.04,0.88,-0.52,0.94,0.02,0.68,0.38,-1.22,-0.92,-0.8,-0.41,-0.28,0.84,1.46,-0.88,-1.73,-1.73,-8.36,0.72,1.16,1.03,0.96,-0.12,0.26,-1.46,-0.52 18 | -1.48,-0.57,-0.12,-1.08,-0.3,0.21,0.36,-0.3,0.55,-0.24,0.55,-0.01,0.35,0.4,-0.96,-0.91,-0.87,-0.63,-0.19,0.56,1.77,-1.06,-1.76,-1.83,0.82,0.77,1.35,1.12,1.17,-0.05,0.29,-1.49,-0.81 19 | -1.37,-0.77,-0.02,-1.36,-0.11,0.22,0.4,-0.26,0.69,-0.3,0.7,-0.68,0.5,0.38,-0.73,-1.34,-0.79,-0.57,-0.09,0.89,1.93,-1.32,-1.71,-1.86,0.66,0.69,1.37,1.01,1.18,-0.07,0.28,-1.69,-0.54 20 | -1.3,-0.63,-0.19,-1.09,-0.06,-5.7,0.37,-0.38,0.61,-0.23,0.6,-0.69,0.48,0.4,-0.87,-1.17,-0.78,-0.79,-0.14,1.54,1.92,-1.28,-1.72,-2.11,1.07,0.95,1.67,1.4,1.28,0.04,0.35,-1.82,-1.13 21 | -1.19,-0.03,0.11,-1.05,-0.09,0.09,0.66,-0.01,1.33,-0.97,1.31,-0.55,0.99,0.33,-1.11,-1.23,-0.69,-0.82,0.03,1.27,2.46,-1.58,-1.57,-1.97,0.92,0.83,1.62,1.27,1.1,-4.16,0.37,-1.54,-0.92 22 | -0.94,-0.06,0.71,-1.15,-0.06,-5.7,0.65,-0.29,0.79,-0.56,0.84,-0.72,0.59,0.44,-1.09,-2.06,-0.69,-7.34,-0.2,1.78,2.11,-1.19,-1.84,-2.07,0.83,0.78,1.28,1.12,1.13,-0.08,0.26,-1.85,-0.81 23 | -1.62,0.12,0.61,-0.97,0.13,0.25,0.46,-0.21,0.95,-0.58,0.95,-0.77,0.67,0.46,-1.08,-2.05,-0.89,-1.13,-0.27,1.7,1.96,-1.41,-2.36,-2.28,0.83,0.75,1.36,1.08,1.18,-0.16,0.22,-1.9,-0.79 24 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 18 | 19 | 20 |

[sample.html: the HTML markup of this file was likewise lost during extraction. The surviving fragments show a page headed "Smart Process Monitoring Tool" with the static status text "Issue detected".]

22 | 23 | 24 | -------------------------------------------------------------------------------- /Images/Book3_coverPage.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Images/Book3_coverPage.JPG -------------------------------------------------------------------------------- /Images/ML-for-PSE-2023Edition-CoverPage.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Images/ML-for-PSE-2023Edition-CoverPage.JPG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine_Learning_for_PSE 2 | 3 | Chapter-wise code repository for the book 'Machine Learning in Python for Process Systems Engineering' 4 | 5 | ![](/Images/Book3_coverPage.JPG) 6 | 7 | ## Book Links: 8 | - *Google Play*: https://play.google.com/store/books/details?id=K_NjEAAAQBAJ 9 | - *LeanPub*: https://leanpub.com/machineLearningPSE 10 | 11 | ## Original data sources for datasets used in this book: 12 | [Weblinks mentioned below may change or may no longer exist in future. Relevant data files have been provided in the respective folders in this repository. If you plan to share or use any dataset, please abide by the license policy (and/or the citation requests, if any) for the dataset.] 13 | 14 | - *Polymer Manufacturing Process Data*: 15 | 16 | Originally obtained from https://landing.umetrics.com/downloads-other-downloads (unfortunately this link no longer seems to work; data file is provided in the respective folder in this repository). 17 | Dataset also referenced at https://www.academia.edu/38630159/Multivariate_data_analysis_wiki 18 | 19 | 20 | - *Pulp & Paper Manufacturing Process Data*: 21 | 22 | Obtained from https://openmv.net. 23 | 24 | Citation: Dayal et al. "Application of feedforward neural networks and partial least squares regression for modelling Kappa number in a continuous Kamyr digester", Pulp and Paper Canada, 95, 1994, p T7-T13. 25 | 26 | 27 | - *Low-Density Polyethylene (LDPE) Process Data*: 28 | 29 | Obtained from https://openmv.net. 30 | 31 | 32 | - *Tennessee Eastman Process Data*: 33 | 34 | Available at https://github.com/camaramm/tennessee-eastman-profBraatz. Bigger dataset available at https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/6C3JR1. 35 | 36 | Citation: Reith, C.A., B.D. Amsel, R. Tran., and B. Maia. Additional Tennessee Eastman process simulation data for anomaly detection evaluation. Harvard Dataverse, Version 1, 2017 37 | 38 | - *Semiconductor Manufacturing Process Data*: 39 | 40 | Obtained from http://www.eigenvector.com/data/Etch/. 41 | 42 | Citation: B.M. Wise, N.B. Gallagher, S.W. Butler, D.D. White, Jr. and G.G. Barna, "A Comparison of Principal Components Analysis, Multi-way Principal Components analysis, Tri-linear Decomposition and Parallel Factor Analysis for Fault Detection in a Semiconductor Etch Process", J. Chemometrics (1999). 43 | 44 | - *Polymer Pilot Plant Data*: 45 | 46 | Originally obtained from ftp://ftp.cis.upenn.edu/pub/ungar/chemdata/ 47 | 48 | - *Debutanizer Column Data from a Petroleum Refinery*: 49 | 50 | Available as supplementary material at https://link.springer.com/book/10.1007/978-1-84628-480-9. 51 | 52 | Citation: Fortuna et. 
al., Soft sensors for monitoring and control of industrial processes, Springer, 2007 53 | 54 | - *Concrete Compressive Strength Data*: 55 | 56 | Available at the UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength 57 | 58 | Copyright: Prof. I-Cheng Yeh 59 | Citation: I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998) 60 | 61 | - *Wastewater Treatment Plant Data*: 62 | 63 | Available at the UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/water+treatment+plant 64 | 65 | - *Combined Cycle Power Plant data*: 66 | 67 | Available at the UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/combined+cycle+power+plant 68 | 69 | Citation: Pınar Tüfekci, Prediction of full load electrical power output of a base load operated combined cycle power plant using machine learning methods, International Journal of Electrical Power & Energy Systems, Volume 60, September 2014, Pages 126-140, ISSN 0142-0615 70 | 71 | - *SISO Heater System Data*: 72 | 73 | Provided by Prof. John Hedengren at https://apmonitor.com/do/index.php/Main/LSTMNetwork. Direct links for the training and validation data: https://apmonitor.com/do/uploads/Main/tclab_dyn_data3.txt and https://apmonitor.com/pdc/uploads/Main/tclab_data4.txt. File names will need to be changed to match the ones used in the book. 74 | 75 | - *Gas Turbine Data*: 76 | 77 | Originally available at NASA prognostics data repository https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/. Data available at https://data.nasa.gov/Aerospace/CMAPSS-Jet-Engine-Simulated-Data/ff5v-kuh6/about_data. 78 | Training and validation data file names used in the text are different than the original file names. 79 | 80 | Citation: A. Saxena and K. Goebel (2008). "Turbofan Engine Degradation Simulation Data Set", NASA Ames Prognostics Data Repository (http://ti.arc.nasa.gov/project/prognostic-data-repository), NASA Ames Research Center, Moffett Field, CA 81 | 82 | License: CC0: Public Domain (https://creativecommons.org/publicdomain/zero/1.0/) 83 | 84 | --------------------------------------------------------------------------------
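For readers new to the repository, the snippet below sketches the data-loading pattern that most chapter scripts follow, illustrated here with the debutanizer soft-sensing data used in Chapter_SupportVectorMachines. It is a minimal sketch assuming the script is run from the folder containing debutanizer_data.txt, exactly as the chapter scripts themselves assume.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# debutanizer_data.txt has 5 header rows; the last column is the quality variable (C4 content)
data = np.loadtxt('debutanizer_data.txt', skiprows=5)
X, y = data[:, 0:-1], data[:, -1]

# the chapter scripts fix random_state so that the train/test split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100)
print(X_train.shape, X_test.shape)
```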