├── Chapter_ANN ├── CCPP_FFNN.ipynb ├── CCPP_FFNN.py ├── CCPP_FFNN_gridSearch.ipynb ├── CCPP_data_explore.ipynb ├── CCPP_data_explore.py ├── Folds5x2_pp.xlsx ├── ccpp_FFNN_gridSearch.py ├── debutanizer_FFNN.ipynb ├── debutanizer_FFNN.py ├── debutanizer_PLS.ipynb ├── debutanizer_PLS.py ├── debutanizer_data.txt ├── debutanizer_dataExplore.ipynb ├── debutanizer_dataExplore.py ├── info.txt ├── kamyr-digester.csv ├── kamyr_data_FFNN_earlyStopping.ipynb ├── kamyr_data_FFNN_earlyStopping.py ├── quadratic_function_singleLayer.ipynb └── quadratic_function_singleLayer.py ├── Chapter_BestPractices ├── 3Way_Holdout_Method.ipynb ├── 3wayHoldout_Method.py ├── Centering_Scaling.ipynb ├── Centering_Scaling.py ├── Feature_Engineering_OneHotEncoding.ipynb ├── Feature_Engineering_OneHotEncoding.py ├── Feature_Engineering_quadraticFit.ipynb ├── Feature_Engineering_quadraticFit.py ├── GridSearchCV.ipynb ├── GridSearchCV.py ├── Holdout_Method.ipynb ├── Holdout_Method.py ├── Pipeline_quadraticFit.ipynb ├── Pipeline_quadraticFit.py ├── Regularization.ipynb ├── ValidationCurve.ipynb ├── ValidationCurve.py ├── info.txt ├── kFold_CrossValidation.ipynb ├── kFold_CrossValidation.py ├── quadratic_raw_data.csv └── regularization.py ├── Chapter_Clustering_GMM ├── DBSCAN_clustering.ipynb ├── DBSCAN_clustering.py ├── DBSCAN_illustration.ipynb ├── DBSCAN_illustration.py ├── Etch_data_explore.ipynb ├── Etch_data_explore.py ├── GMM_clustering.ipynb ├── GMM_clustering.py ├── GMM_illustration.ipynb ├── GMM_illustration.py ├── MACHINE_Data.mat ├── Metal_etch_complete_data_visualize.ipynb ├── Metal_etch_complete_data_visualize.py ├── ProcessMonitoring_GMM.ipynb ├── ProcessMonitoring_GMM.py ├── k_means_clustering.ipynb ├── k_means_clustering.py ├── k_means_failure.ipynb └── k_means_failure.py ├── Chapter_DecisionTrees_EnsembleLearning ├── Bagging_illustration.ipynb ├── Bagging_illustration.py ├── DT_illustration.ipynb ├── DT_illustration.py ├── RF_illustration.ipynb ├── RF_illustration.py ├── SoftSensing_ConcreteStrength_PLS.ipynb ├── SoftSensing_ConcreteStrength_PLS.py ├── SoftSensing_ConcreteStrength_RF.ipynb ├── SoftSensing_ConcreteStrength_RF.py ├── SoftSensing_WastewaterPlant_PLS.ipynb ├── SoftSensing_WastewaterPlant_PLS.py ├── SoftSensing_WastewaterPlant_XGBoost.ipynb ├── SoftSensing_WastewaterPlant_XGBoost.py ├── cement_strength.txt ├── info.txt ├── water-treatment.data └── water-treatment.names ├── Chapter_LatentVariable1 ├── DimensionalityReduction.ipynb ├── DimensionalityReduction.py ├── DynamicPCA.ipynb ├── DynamicPCA.py ├── KernelPCA.ipynb ├── KernelPCA.py ├── LDPE.csv ├── ProcessMonitoring_PCA.ipynb ├── ProcessMonitoring_PCA.py ├── ProcessMonitoring_PLS.ipynb ├── ProcessMonitoring_PLS.py ├── SoftSensor_PLS.ipynb ├── kamyr-digester.csv ├── proc1a.xls └── softSensor_PLS.py ├── Chapter_LatentVariable2 ├── DimensionalityReduction_FDA.ipynb ├── DimensionalityReduction_FDA.py ├── DimensionalityReduction_ICA.ipynb ├── DimensionalityReduction_ICA.py ├── FDA_illustration.ipynb ├── FDA_illustration.py ├── FaultClassification_FDA.ipynb ├── FaultClassification_FDA.py ├── ICA_illustration.ipynb ├── ICA_illustration.py ├── ProcessMonitoring_ICA.ipynb ├── ProcessMonitoring_ICA.py ├── ProcessMonitoring_PCA.ipynb ├── ProcessMonitoring_PCA.py ├── TEP_data_explore.ipynb ├── TE_processData_explore.py ├── d00.dat ├── d00_te.dat ├── d01.dat ├── d01_te.dat ├── d02.dat ├── d02_te.dat ├── d03.dat ├── d03_te.dat ├── d04.dat ├── d04_te.dat ├── d05.dat ├── d05_te.dat ├── d06.dat ├── d06_te.dat ├── d07.dat ├── d07_te.dat ├── d08.dat ├── 
d08_te.dat ├── d09.dat ├── d09_te.dat ├── d10.dat ├── d10_te.dat ├── d11.dat ├── d11_te.dat ├── d12.dat ├── d12_te.dat ├── d13.dat ├── d13_te.dat ├── d14.dat ├── d14_te.dat ├── d15.dat ├── d15_te.dat ├── d16.dat ├── d16_te.dat ├── d17.dat ├── d17_te.dat ├── d18.dat ├── d18_te.dat ├── d19.dat ├── d19_te.dat ├── d20.dat ├── d20_te.dat ├── d21.dat ├── d21_te.dat └── info.txt ├── Chapter_OtherUsefulMethods ├── FD-kNN.ipynb ├── FD-kNN.py ├── KDE_ControlLimits_for_ICAmonitoring.ipynb ├── KDE_ControlLimits_for_ICAmonitoring.py ├── KDE_GridSearchCV.ipynb ├── KDE_GridSearchCV.py └── info.txt ├── Chapter_Preprocessing ├── EmbeddedMethods_Lasso.py ├── Embedded_Method_Lasso.ipynb ├── MLR_VSdata.py ├── Missing_data_imputation.ipynb ├── Missing_data_imputation.py ├── MultivariateLinearRegression_VSdata.ipynb ├── Multivariate_Outliers_MCD.ipynb ├── Multivariate_Outliers_MahalanobisDistance.ipynb ├── Multivariate_outliers_MCD.py ├── Multivariate_outliers_Mahalanobis_distance.py ├── Univariate_Outliers.ipynb ├── Univariate_Outliers.py ├── VSdata.csv ├── VSdata_val.csv ├── WrapperMethods_backward_SFS.py ├── Wrapper_Methods_backward_SFS.ipynb ├── complex2D_outlier.csv ├── deNoising_process_signals.py ├── denoising_process_signals.ipynb ├── filterMethods.py ├── filter_Methods.ipynb ├── info.txt ├── noisy_flow_signal.csv └── simple2D_outlier.csv ├── Chapter_RNN ├── AircraftEngine_dataExploration.ipynb ├── AircraftEngine_dataExplore.py ├── PM_test.txt ├── PM_train.txt ├── PM_truth.txt ├── SISO_Heater_system_RNN.ipynb ├── SISO_Heater_system_RNN.py ├── TCLab_test_data.txt ├── TCLab_train_data.txt ├── TEP_dataExploration.ipynb ├── TEP_dataExploration.py ├── TEP_faultClassification_RNN.ipynb ├── TEPclassification_RNN.py ├── info.txt ├── predictiveMaint_Regression_RNN.py ├── predictiveMaint_binaryClassification_RNN.ipynb ├── predictiveMaint_binaryClassification_RNN.py └── predictiveMaint_regression_RNN.ipynb ├── Chapter_ReinforcementLearning ├── RL_agent_train_test.ipynb ├── RL_agent_train_test.py ├── Tank_Environment.py ├── actor_saved │ ├── keras_metadata.pb │ ├── saved_model.pb │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index ├── disturbance_200.csv └── info.txt ├── Chapter_ScriptingEnvironment ├── NumpyBasics.ipynb ├── NumpyBasics.py ├── PandasBasics.ipynb ├── PandasBasics.py ├── PythonBasics.ipynb ├── PythonBasics.py ├── info.txt ├── quadratic_raw_data.csv ├── typicalML_script.ipynb └── typicalML_script.py ├── Chapter_SupportVectorMachines ├── Metal_etch_2DPCA_testData.csv ├── Metal_etch_2DPCA_trainingData.csv ├── SVDD_FaultDetection.ipynb ├── SVDD_FaultDetection.py ├── SVDD_OneClassClassification.ipynb ├── SVDD_OneClassClassification.py ├── SVDD_toyDataset.csv ├── SVM_BinaryClassification.ipynb ├── SVM_BinaryClassification.py ├── SVM_Kernel_BinaryClassification.ipynb ├── SVM_Kernel_BinaryClassification.py ├── SVM_Kernel_BinaryClassification_noGridSearch.py ├── SVM_SoftMarginClassification.ipynb ├── SVM_SoftMarginClassification.py ├── SVR_illustration.ipynb ├── SVR_illustration.py ├── debutanizer_Softsensing_PLS.ipynb ├── debutanizer_Softsensing_PLS.py ├── debutanizer_Softsensing_SVR.ipynb ├── debutanizer_Softsensing_SVR.py ├── debutanizer_data.txt ├── info.txt ├── polymer.dat ├── polymerPlantData_Softsensing_PLS.ipynb ├── polymerPlantData_Softsensing_PLS.py ├── polymerPlantData_Softsensing_SVR.ipynb ├── polymerPlantData_Softsensing_SVR.py ├── toyDataset.csv └── toyDataset2.csv ├── Chapter_WebDeployment ├── FDD.py ├── FDD_withHTML.py ├── FDD_withoutHTML.py ├── 
PCAmetrics_history.pickle ├── PCAmodelData.pickle ├── ProcessMonitoring_PCA.py ├── contributionPlot.png ├── frontEndTemplate.html ├── helloWorld.py ├── info.txt ├── metricPlot.png ├── proc1a.xlsx ├── processLatestDatabase_local.csv └── sample.html ├── Images ├── Book3_coverPage.JPG └── ML-for-PSE-2023Edition-CoverPage.JPG ├── LICENSE └── README.md /Chapter_ANN/CCPP_FFNN.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN modeling of CCPP 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np, pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values 11 | X = data[:,0:4] 12 | y = data[:,4][:,np.newaxis] 13 | 14 | #%% separate train and test data 15 | from sklearn.model_selection import train_test_split 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 17 | 18 | #%% scale data 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | X_scaler = StandardScaler() 22 | X_train_scaled = X_scaler.fit_transform(X_train) 23 | X_test_scaled = X_scaler.transform(X_test) 24 | 25 | y_scaler = StandardScaler() 26 | y_train_scaled = y_scaler.fit_transform(y_train) 27 | y_test_scaled = y_scaler.transform(y_test) 28 | 29 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 30 | ## Define & Fit FFNN model 31 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 32 | 33 | #%% import Keras libraries 34 | from tensorflow.keras import Sequential 35 | from tensorflow.keras.layers import Dense 36 | 37 | #%% define model 38 | model = Sequential() 39 | model.add(Dense(8, activation='relu', kernel_initializer='he_normal', input_shape=(4,))) # 8 neurons in 1st hidden layer; this hidden layer accepts data from a 4 dimensional input 40 | model.add(Dense(5, activation='relu', kernel_initializer='he_normal')) # 5 neurons in 2nd layer 41 | model.add(Dense(1)) # output layer 42 | 43 | #%% compile model 44 | model.compile(loss='mse', optimizer='Adam') # mean-squared error is to be minimized 45 | 46 | #%% fit model 47 | model.fit(X_train_scaled, y_train_scaled, epochs=25, batch_size=50) 48 | 49 | #%% predict y_test 50 | y_test_scaled_pred = model.predict(X_test_scaled) 51 | y_test_pred = y_scaler.inverse_transform(y_test_scaled_pred) 52 | 53 | plt.figure() 54 | plt.plot(y_test, y_test_pred, '*') 55 | plt.xlabel('y_test') 56 | plt.ylabel('y_test_pred') 57 | 58 | #%% metrics 59 | from sklearn.metrics import r2_score 60 | print('R2:', r2_score(y_test, y_test_pred)) 61 | 62 | #%% model summary 63 | model.summary() -------------------------------------------------------------------------------- /Chapter_ANN/CCPP_data_explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Feedforward Neural Networks\n", 8 | "\n", 9 | "\n", 10 | "# Topic: Combined Cycle Power Plant data exploration" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# import required packages\n", 20 | "import numpy as np, pandas as pd\n", 21 | "import matplotlib.pyplot as plt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 
| "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# read data\n", 31 | "data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values\n", 32 | "X = data[:,0:4]\n", 33 | "y = data[:,4][:,np.newaxis]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "#%% plot input vs output for each input\n", 43 | "plt.figure()\n", 44 | "plt.plot(X[:,0], y, '*')\n", 45 | "plt.title('AT vs EP')\n", 46 | "\n", 47 | "plt.figure()\n", 48 | "plt.plot(X[:,1], y, '*')\n", 49 | "plt.title('V vs EP')\n", 50 | "\n", 51 | "plt.figure()\n", 52 | "plt.plot(X[:,2], y, '*')\n", 53 | "plt.title('AP vs EP')\n", 54 | "\n", 55 | "plt.figure()\n", 56 | "plt.plot(X[:,3], y, '*')\n", 57 | "plt.title('RH vs EP')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.7.4" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /Chapter_ANN/CCPP_data_explore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## exploration of CCPP data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np, pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values 11 | X = data[:,0:4] 12 | y = data[:,4][:,np.newaxis] 13 | 14 | #%% plot input vs output for each input 15 | plt.figure() 16 | plt.plot(X[:,0], y, '*') 17 | plt.title('AT vs EP') 18 | 19 | plt.figure() 20 | plt.plot(X[:,1], y, '*') 21 | plt.title('V vs EP') 22 | 23 | plt.figure() 24 | plt.plot(X[:,2], y, '*') 25 | plt.title('AP vs EP') 26 | 27 | plt.figure() 28 | plt.plot(X[:,3], y, '*') 29 | plt.title('RH vs EP') -------------------------------------------------------------------------------- /Chapter_ANN/Folds5x2_pp.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ANN/Folds5x2_pp.xlsx -------------------------------------------------------------------------------- /Chapter_ANN/ccpp_FFNN_gridSearch.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## grid search-based FFNN model for ccpp data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np, pandas as pd 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = pd.read_excel('Folds5x2_pp.xlsx', usecols = 'A:E').values 11 | X = data[:,0:4] 12 | y = data[:,4][:,np.newaxis] 13 | 14 | #%% separate training, validation, test data 15 | from 
sklearn.model_selection import train_test_split 16 | 17 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 18 | X_est, X_val, y_est, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state = 100) 19 | 20 | #%% scale data 21 | from sklearn.preprocessing import StandardScaler 22 | 23 | X_scaler = StandardScaler() 24 | X_est_scaled = X_scaler.fit_transform(X_est) 25 | X_val_scaled = X_scaler.transform(X_val) 26 | X_test_scaled = X_scaler.transform(X_test) 27 | 28 | y_scaler = StandardScaler() 29 | y_est_scaled = y_scaler.fit_transform(y_est) 30 | y_val_scaled = y_scaler.transform(y_val) 31 | y_test_scaled = y_scaler.transform(y_test) 32 | 33 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 34 | ## Define FFNN model 35 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 36 | 37 | #%% import packages 38 | from tensorflow.keras import Sequential 39 | from tensorflow.keras.layers import Dense 40 | from tensorflow.keras import regularizers 41 | from tensorflow.keras.optimizers import Adam 42 | 43 | #%% model function 44 | def FFNN_model(hidden_layers, layer_size, regularizationValue, learningRate): 45 | model = Sequential() 46 | model.add(Dense(layer_size, kernel_regularizer=regularizers.L1(regularizationValue), activation='relu', kernel_initializer='he_normal', input_shape=(4,))) 47 | 48 | for _ in range(hidden_layers-1): 49 | model.add(Dense(layer_size, kernel_regularizer=regularizers.L1(regularizationValue), activation='relu', kernel_initializer='he_normal')) 50 | 51 | model.add(Dense(1)) 52 | model.compile(loss='mse', optimizer=Adam(learning_rate=learningRate)) 53 | 54 | return model 55 | 56 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 57 | ## KerasRegressor wrapper and gridSearchCV 58 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | 60 | #%% KerasRegressor 61 | from tensorflow.keras.wrappers.scikit_learn import KerasRegressor 62 | model = KerasRegressor(build_fn=FFNN_model, epochs=25, batch_size=50) 63 | 64 | #%% gridSearchCV 65 | from sklearn.model_selection import GridSearchCV 66 | 67 | param_grid={ 68 | "hidden_layers":[1, 2], 69 | "layer_size":np.arange(1,10), 70 | "regularizationValue": [0.001, 0.01, 0.1], 71 | "learningRate":[0.05, 0.01, 0.1] 72 | } 73 | 74 | grid_searchcv = GridSearchCV(model, param_grid) 75 | grid_searchcv.fit(X_est_scaled, y_est_scaled, validation_data=(X_val_scaled, y_val_scaled)) 76 | 77 | print("The best parameters obtained are:", grid_searchcv.best_params_) 78 | 79 | #%% best model 80 | model = grid_searchcv.best_estimator_.model 81 | 82 | #%% predict y_test 83 | y_test_scaled_pred = model.predict(X_test_scaled) 84 | y_test_pred = y_scaler.inverse_transform(y_test_scaled_pred) 85 | 86 | plt.figure() 87 | plt.plot(y_test, y_test_pred, '*') 88 | plt.xlabel('y_test') 89 | plt.ylabel('y_test_pred') 90 | 91 | #%% metrics 92 | from sklearn.metrics import r2_score 93 | print('R2:', r2_score(y_test, y_test_pred)) 94 | 95 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 96 | ## save model 97 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | model.save('CCPP_FFNN_bestModel') -------------------------------------------------------------------------------- /Chapter_ANN/debutanizer_FFNN.py: -------------------------------------------------------------------------------- 1 | 
##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% random number seed for result reproducibility 10 | from numpy.random import seed 11 | seed(1) 12 | import tensorflow 13 | tensorflow.random.set_seed(2) 14 | 15 | #%% read data 16 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 17 | 18 | #%% separate training, validation, and test data 19 | from sklearn.model_selection import train_test_split 20 | X = data[:,0:-1] 21 | y = data[:,-1][:,np.newaxis] 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 23 | X_est, X_val, y_est, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 100) 24 | 25 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 26 | ## Fit FFNN model 27 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | 29 | #%% import packages 30 | from tensorflow.keras import Sequential 31 | from tensorflow.keras.layers import Dense 32 | from tensorflow.keras import regularizers 33 | from tensorflow.keras.callbacks import EarlyStopping 34 | from tensorflow.keras.optimizers import Adam 35 | 36 | #%% define model 37 | model = Sequential() 38 | model.add(Dense(60, kernel_regularizer=regularizers.L1(0.0000001), activation='relu', kernel_initializer='he_normal', input_shape=(7,))) 39 | model.add(Dense(30, kernel_regularizer=regularizers.L1(0.0000001), activation='relu', kernel_initializer='he_normal')) 40 | model.add(Dense(1, kernel_regularizer=regularizers.L1(0.0000001))) 41 | 42 | #%% compile model 43 | model.compile(loss='mse', optimizer=Adam(learning_rate=0.005)) 44 | 45 | #%% fit model 46 | es = EarlyStopping(monitor='val_loss', patience=200) 47 | history = model.fit(X_est, y_est, epochs=2000, batch_size=32, validation_data=(X_val, y_val), callbacks=es) 48 | 49 | #%% plot validation curve 50 | plt.figure() 51 | plt.title('Validation Curves') 52 | plt.xlabel('Epoch') 53 | plt.ylabel('MSE') 54 | plt.plot(history.history['loss'], label='train') 55 | plt.plot(history.history['val_loss'], label='val') 56 | plt.legend() 57 | plt.grid() 58 | plt.show() 59 | 60 | #%% predict y 61 | y_test_pred = model.predict(X_test) 62 | y_val_pred = model.predict(X_val) 63 | y_est_pred = model.predict(X_est) 64 | 65 | #%% plots of raw and predicted data 66 | plt.figure() 67 | plt.plot(y_test, y_test_pred, '*') 68 | plt.xlabel('C4 content (test data)') 69 | plt.ylabel('C4 content (prediction)') 70 | 71 | plt.figure() 72 | plt.plot(y_test, 'b', label='Raw data') 73 | plt.plot(y_test_pred, 'r', label='FFNN prediction') 74 | plt.ylabel('C4 content (test data)') 75 | plt.xlabel('Sample #') 76 | plt.legend() 77 | 78 | #%% residuals 79 | plt.figure() 80 | plt.plot(y_test, y_test-y_test_pred, '*') 81 | plt.xlabel('C4 content test data') 82 | plt.ylabel('residual (raw data- prediction)') 83 | plt.title('residual plot') 84 | 85 | #%% metrics 86 | from sklearn.metrics import r2_score 87 | print('R2 for test dataset:', r2_score(y_test, y_test_pred)) 88 | print('R2 for validation dataset:', r2_score(y_val, y_val_pred)) 89 | print('R2 for estimation dataset:', r2_score(y_est, y_est_pred)) -------------------------------------------------------------------------------- /Chapter_ANN/debutanizer_PLS.py:
-------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% separate train and test data 13 | from sklearn.model_selection import train_test_split 14 | X = data[:,0:-1] 15 | y = data[:,-1][:,np.newaxis] 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 17 | 18 | #%% scale data 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | X_scaler = StandardScaler() 22 | X_train_normal = X_scaler.fit_transform(X_train) 23 | X_test_normal = X_scaler.transform(X_test) 24 | 25 | y_scaler = StandardScaler() 26 | y_train_normal = y_scaler.fit_transform(y_train) 27 | y_test_normal = y_scaler.transform(y_test) 28 | 29 | #%% Finding # latents using kFold cross validation 30 | from sklearn.model_selection import KFold 31 | from sklearn.metrics import mean_squared_error 32 | from sklearn.cross_decomposition import PLSRegression 33 | 34 | scaler = StandardScaler() 35 | 36 | fit_MSE = [] 37 | validate_MSE = [] 38 | for n_comp in range(1,8): 39 | local_fit_MSE = [] 40 | local_validate_MSE = [] 41 | 42 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 43 | for fit_index, validate_index in kfold.split(y_train): 44 | X_fit_normal = scaler.fit_transform(X_train[fit_index]) 45 | X_validate_normal = scaler.transform(X_train[validate_index]) 46 | 47 | y_fit_normal = scaler.fit_transform(y_train[fit_index]) 48 | y_validate_normal = scaler.transform(y_train[validate_index]) 49 | 50 | pls = PLSRegression(n_components = n_comp) 51 | pls.fit(X_fit_normal, y_fit_normal) 52 | 53 | local_fit_MSE.append(mean_squared_error(y_fit_normal, pls.predict(X_fit_normal))) 54 | local_validate_MSE.append(mean_squared_error(y_validate_normal, 55 | pls.predict(X_validate_normal))) 56 | 57 | fit_MSE.append(np.mean(local_fit_MSE)) 58 | validate_MSE.append(np.mean(local_validate_MSE)) 59 | 60 | 61 | # plot 62 | plt.figure() 63 | plt.plot(range(1,8), fit_MSE, 'b*', label = 'Training MSE') 64 | plt.plot(range(1,8), validate_MSE, 'r*', label = 'Validation MSE') 65 | plt.xticks(range(1,8)) 66 | plt.ylabel('Mean Squared Error (MSE)') 67 | plt.xlabel('# of latents') 68 | plt.legend() 69 | 70 | #%% build PLS model 71 | pls = PLSRegression(n_components = 5) 72 | pls.fit(X_train_normal, y_train_normal) 73 | 74 | #%% check training vs test accuracy 75 | print('Accuracy over training data: ', pls.score(X_train_normal, y_train_normal)) 76 | print('Accuracy over test data: ', pls.score(X_test_normal, y_test_normal)) 77 | 78 | #%% plots of raw and predicted data 79 | y_train_normal_predict = pls.predict(X_train_normal) 80 | y_test_normal_predict = pls.predict(X_test_normal) 81 | 82 | y_train_predict = y_scaler.inverse_transform(y_train_normal_predict) 83 | y_test_predict = y_scaler.inverse_transform(y_test_normal_predict) 84 | 85 | 86 | plt.figure() 87 | plt.plot(y_train, 'b', label = 'Raw data') 88 | plt.plot(y_train_predict, 'r', label = 'PLS prediction') 89 | plt.ylabel('C4 content (training data)') 90 | plt.xlabel('Sample #') 91 | plt.legend() 92 | 93 | 94 | plt.figure() 95 | plt.plot(y_test, 'b', label = 'Raw data') 96 | plt.plot(y_test_predict, 'r', label = 'PLS 
prediction') 97 | plt.ylabel('C4 content (test data)') 98 | plt.xlabel('Sample #') 99 | plt.legend() 100 | 101 | plt.figure() 102 | plt.plot(y_test, y_test_predict, '*') 103 | plt.xlabel('C4 content (test data)') 104 | plt.ylabel('C4 content (prediction)') 105 | 106 | #%% residuals 107 | plt.figure() 108 | plt.plot(y_test, y_test-y_test_predict, '*') 109 | plt.xlabel('C4 content test data') 110 | plt.ylabel('residual (raw data- prediction)') 111 | plt.title('residual plot') -------------------------------------------------------------------------------- /Chapter_ANN/debutanizer_dataExplore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Exploration of debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% plot each variable 13 | plt.figure() 14 | plt.plot(data[:,0]) 15 | plt.ylabel('top Temperature') 16 | plt.xlabel('samples') 17 | plt.xlim((0,2500)) 18 | 19 | plt.figure() 20 | plt.plot(data[:,1]) 21 | plt.ylabel('top Pressure') 22 | plt.xlabel('samples') 23 | plt.xlim((0,2500)) 24 | 25 | plt.figure() 26 | plt.plot(data[:,2]) 27 | plt.ylabel('reflux flow') 28 | plt.xlabel('samples') 29 | plt.xlim((0,2500)) 30 | 31 | plt.figure() 32 | plt.plot(data[:,3]) 33 | plt.ylabel('flow to next process') 34 | plt.xlabel('samples') 35 | plt.xlim((0,2500)) 36 | 37 | plt.figure() 38 | plt.plot(data[:,4]) 39 | plt.ylabel('6th tray Temperature') 40 | plt.xlabel('samples') 41 | plt.xlim((0,2500)) 42 | 43 | plt.figure() 44 | plt.plot(data[:,5]) 45 | plt.ylabel('bottom Temperature 1') 46 | plt.xlabel('samples') 47 | plt.xlim((0,2500)) 48 | 49 | plt.figure() 50 | plt.plot(data[:,6]) 51 | plt.ylabel('bottom Temperature 2') 52 | plt.xlabel('samples') 53 | plt.xlim((0,2500)) 54 | 55 | plt.figure() 56 | plt.plot(data[:,7]) 57 | plt.ylabel('C4 content') 58 | plt.xlabel('samples') 59 | plt.xlim((0,2500)) -------------------------------------------------------------------------------- /Chapter_ANN/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ANN/kamyr_data_FFNN_earlyStopping.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN-based Soft Sensor for kamyr dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | 10 | #%% random number seed for result reproducibility 11 | from numpy.random import seed 12 | seed(10) 13 | import tensorflow 14 | tensorflow.random.set_seed(20) 15 | 16 | #%% fetch data 17 | data = pd.read_csv('kamyr-digester.csv', usecols = range(1,23)) 18 | 19 | #%% pre-process 20 | # find the # of nan entries in each column 21 | na_counts = data.isna().sum(axis = 0) 22 | 23 | # remove columns that have a lot of nan entries 24 | data_cleaned = data.drop(columns = ['AAWhiteSt-4 ','SulphidityL-4 ']) 25 | 26 | # remove any row that have any nan entry 27 | data_cleaned = data_cleaned.dropna(axis = 0) 28 | 29 | # separate X, y 30 | 
y = data_cleaned.iloc[:,0].values[:,np.newaxis] # StandardScaler requires 2D array 31 | X = data_cleaned.iloc[:,1:].values 32 | 33 | print('Number of samples left: ', X.shape[0]) 34 | 35 | #%% separate train and test data 36 | from sklearn.model_selection import train_test_split 37 | 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 39 | X_est, X_val, y_est, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state = 100) 40 | 41 | #%% scale data 42 | from sklearn.preprocessing import StandardScaler 43 | 44 | X_scaler = StandardScaler() 45 | X_est_scaled = X_scaler.fit_transform(X_est) 46 | X_val_scaled = X_scaler.transform(X_val) 47 | X_test_scaled = X_scaler.transform(X_test) 48 | 49 | y_scaler = StandardScaler() 50 | y_est_scaled = y_scaler.fit_transform(y_est) 51 | y_val_scaled = y_scaler.transform(y_val) 52 | y_test_scaled = y_scaler.transform(y_test) 53 | 54 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | ## Define & Fit FFNN model without early stopping 56 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 57 | 58 | #%% import packages 59 | from tensorflow.keras import Sequential 60 | from tensorflow.keras.layers import Dense 61 | 62 | #%% define model 63 | def FFNN_model(): 64 | model = Sequential() 65 | model.add(Dense(20, activation='tanh', kernel_initializer='he_normal', input_shape=(19,))) 66 | model.add(Dense(5, activation='tanh', kernel_initializer='he_normal')) 67 | model.add(Dense(1)) 68 | model.compile(loss='mse', optimizer='Adam') 69 | return model 70 | 71 | #%% fit model 72 | history = FFNN_model().fit(X_est_scaled, y_est_scaled, epochs=250, batch_size=32, validation_data=(X_val_scaled, y_val_scaled)) 73 | 74 | #%% plot validation curve 75 | plt.figure() 76 | plt.title('Validation Curves') 77 | plt.xlabel('Epoch') 78 | plt.ylabel('MSE') 79 | plt.plot(history.history['loss'], label='training') 80 | plt.plot(history.history['val_loss'], label='validation') 81 | plt.legend() 82 | plt.grid() 83 | plt.show() 84 | 85 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 86 | ## Define & Fit FFNN model with early stopping 87 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 88 | 89 | #%% random number seed for result reproducibility 90 | from numpy.random import seed 91 | seed(10) 92 | import tensorflow 93 | tensorflow.random.set_seed(20) 94 | 95 | #%% fit model again with early stopping 96 | from tensorflow.keras.callbacks import EarlyStopping 97 | es = EarlyStopping(monitor='val_loss', patience=15) 98 | 99 | history = FFNN_model().fit(X_est_scaled, y_est_scaled, epochs=250, batch_size=32, validation_data=(X_val_scaled, y_val_scaled), callbacks=es) 100 | 101 | #%% plot validation curve 102 | plt.figure() 103 | plt.title('Validation Curves') 104 | plt.xlabel('Epoch') 105 | plt.ylabel('MSE') 106 | plt.plot(history.history['loss'], label='training') 107 | plt.plot(history.history['val_loss'], label='validation') 108 | plt.legend() 109 | plt.grid() 110 | plt.show() -------------------------------------------------------------------------------- /Chapter_ANN/quadratic_function_singleLayer.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## FFNN modeling of y = x*x 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import 
required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% random number seed for result reproducibility 10 | from numpy.random import seed 11 | seed(1) 12 | import tensorflow 13 | tensorflow.random.set_seed(2) 14 | 15 | #%% generate data 16 | x = np.linspace(-1,1,500) 17 | y = x*x 18 | plt.plot(x,y) 19 | 20 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | ## Define & Fit FFNN model 22 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | 24 | #%% import Keras libraries 25 | from tensorflow.keras import Sequential 26 | from tensorflow.keras.layers import Dense 27 | from tensorflow.keras.optimizers import Adam 28 | 29 | #%% define model 30 | n_nodes = 5 31 | 32 | model = Sequential() 33 | model.add(Dense(n_nodes, activation='relu', input_shape=(1,))) 34 | model.add(Dense(1)) 35 | 36 | #%% compile model 37 | model.compile(loss='mse', optimizer=Adam(learning_rate=0.05)) 38 | 39 | #%% fit model 40 | history = model.fit(x, y, epochs=400, batch_size=50) 41 | 42 | plt.figure() 43 | plt.xlabel('Epoch') 44 | plt.ylabel('MSE') 45 | plt.plot(history.history['loss'], label='train') 46 | plt.show() 47 | 48 | #%% predict y_test 49 | y_pred = model.predict(x) 50 | 51 | plt.figure() 52 | plt.plot(x, y, '--b', label='y=x^2') 53 | plt.plot(x, y_pred, '--r', label='Approximation') 54 | plt.xlabel('x') 55 | plt.title('y_pred vs y') 56 | plt.legend() 57 | 58 | plt.figure() 59 | plt.plot(y_pred, 'r') 60 | plt.title('y_pred') 61 | 62 | #%% metrics 63 | from sklearn.metrics import r2_score 64 | print('R2:', r2_score(y, y_pred)) 65 | 66 | #%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 67 | ## inner layer activations 68 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 69 | import tensorflow.keras.backend as K 70 | activations = [] 71 | for layer in model.layers: 72 | keras_function = K.function([model.input], [layer.output]) 73 | activations.append(keras_function(x)) 74 | 75 | #%% plot activations 76 | layer1_activations = activations[0][0] 77 | for node in range(n_nodes): 78 | plt.figure() 79 | plt.plot(x, layer1_activations[:,node]) 80 | plt.title('node ' + str(node+1) + ' activation') -------------------------------------------------------------------------------- /Chapter_BestPractices/3Way_Holdout_Method.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Best Practices\n", 8 | "\n", 9 | "# Topic: 3Way Holdout Method" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "data = np.loadtxt('quadratic_raw_data.csv', delimiter=',')\n", 21 | "x = data[:,0,None]; y = data[:,1,None]" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# create pipeline for quadratic fit via linear model \n", 31 | "# import relevant classes\n", 32 | "from sklearn.pipeline import Pipeline\n", 33 | "from sklearn.preprocessing import PolynomialFeatures\n", 34 | "from sklearn.preprocessing import StandardScaler\n", 35 | "from sklearn.linear_model import LinearRegression\n", 36 | "\n", 37 | "# add transformers and estimators sequentially as list of tuples\n", 38 | "# the names ‘poly’, ‘scaler’, ‘model’ can be used 
to access the individual elements of pipeline later \n", 39 | "pipe = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),\n", 40 | " ('scaler', StandardScaler()),\n", 41 | " ('model', LinearRegression())])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Number of samples in fitting set: 14\n", 54 | "Number of samples in validation set: 6\n", 55 | "Number of samples in test set: 5\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# train-validate-test split\n", 61 | "from sklearn.model_selection import train_test_split\n", 62 | "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)\n", 63 | "x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=1)\n", 64 | "\n", 65 | "print('Number of samples in fitting set: ', x_fit.shape[0])\n", 66 | "print('Number of samples in validation set: ', x_val.shape[0])\n", 67 | "print('Number of samples in test set: ', x_test.shape[0])" 68 | ] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3 (ipykernel)", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.9.7" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /Chapter_BestPractices/3wayHoldout_Method.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Split dataset nto training, validation, and test sets 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | #%% read data 5 | import numpy as np 6 | data = np.loadtxt('quadratic_raw_data.csv', delimiter=',') 7 | x = data[:,0,None]; y = data[:,1,None] 8 | 9 | #%% create pipeline for quadratic fit via linear model 10 | # import relevant classes 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import PolynomialFeatures 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.linear_model import LinearRegression 15 | 16 | # add transformers and estimators sequentially as list of tuples 17 | # the names ‘poly’, ‘scaler’, ‘model’ can be used to access the individual elements of pipeline later 18 | pipe = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)), 19 | ('scaler', StandardScaler()), 20 | ('model', LinearRegression())]) 21 | 22 | #%% train-validate-test split 23 | from sklearn.model_selection import train_test_split 24 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) 25 | x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=1) 26 | 27 | print('Number of samples in fitting set: ', x_fit.shape[0]) 28 | print('Number of samples in validation set: ', x_val.shape[0]) 29 | print('Number of samples in test set: ', x_test.shape[0]) 30 | 31 | -------------------------------------------------------------------------------- /Chapter_BestPractices/Centering_Scaling.py: 
-------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Centering & Scaling 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% Standard scaling 6 | import numpy as np 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | X = np.array([[ 1000, 0.01, 300], 10 | [ 1200, 0.06, 350], 11 | [ 1500, 0.1, 320]]) 12 | scaler = StandardScaler().fit(X) # computes mean & std column-wise 13 | X_scaled = scaler.transform(X) # transform using computed mean and std 14 | 15 | # check mean = 0 and variance = 1 for every variable/column after scaling 16 | print(X_scaled.mean(axis=0)) # returns 1D array of shape (3,) 17 | print(X_scaled.std(axis=0)) # returns 1D array of shape (3,) 18 | 19 | # access mean and variance via object properties 20 | print(scaler.mean_) # returns 1D array of shape (3,) 21 | print(scaler.var_) # returns 1D array of shape (3,) 22 | 23 | #%% Normalization 24 | from sklearn.preprocessing import MinMaxScaler 25 | 26 | scaler = MinMaxScaler() # create object 27 | X_scaled = scaler.fit_transform(X) # fit & transform 28 | 29 | # check min = 0 and max = 1 for every variable/column after scaling 30 | print(X_scaled.min(axis=0)) 31 | print(X_scaled.max(axis=0)) 32 | 33 | # access min and max via object properties 34 | print(scaler.data_min_) 35 | print(scaler.data_max_) 36 | 37 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 38 | ## Robust Centering & Scaling 39 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 40 | 41 | #%% Generate outlier-infested data 42 | X = np.random.normal(40, 1, (1500,1)) 43 | X[200:300] = X[200:300] +8; X[1000:1150] = X[1000:1150] + 8 44 | 45 | # plot 46 | import matplotlib.pyplot as plt 47 | plt.plot(X, '.-') 48 | plt.xlabel('sample #'), plt.ylabel('variable measurement') 49 | plt.title('Raw measurements') 50 | 51 | #%% Transform via standard scaling 52 | scaler = StandardScaler().fit(X) 53 | X_scaled = scaler.transform(X) 54 | 55 | # mean and std 56 | print('Estimated mean = ', scaler.mean_[0]) 57 | print('Estimated standard deviation = ', np.sqrt(scaler.var_[0])) 58 | 59 | # plot 60 | plt.figure() 61 | plt.plot(X_scaled, '.-') 62 | plt.xlabel('sample #'), plt.ylabel('scaled variable measurement') 63 | plt.xlim((0,1500)) 64 | plt.title('Standard scaling') 65 | 66 | #%% Transform via robust MAD scaling 67 | # compute median and MAD 68 | from scipy import stats 69 | median = np.median(X) 70 | MAD = stats.median_absolute_deviation(X) 71 | 72 | # scale 73 | X_scaled = (X - median)/MAD[0] 74 | 75 | # median and MAD 76 | print('Estimated robust location = ', median) 77 | print('Estimated robust spread = ', MAD) 78 | 79 | # plot 80 | plt.figure() 81 | plt.plot(X_scaled, '.-') 82 | plt.xlabel('sample #'), plt.ylabel('scaled variable measurement') 83 | plt.xlim((0,1500)) 84 | plt.title('Robust MAD scaling') 85 | 86 | -------------------------------------------------------------------------------- /Chapter_BestPractices/Feature_Engineering_OneHotEncoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Best Practices\n", 8 | "\n", 9 | "# Topic: Feature Engineering (one-hot encoding)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 |
"name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "[[1. 0. 0.]\n", 22 | " [0. 0. 1.]\n", 23 | " [0. 1. 0.]\n", 24 | " [0. 0. 1.]]\n", 25 | "[array(['type A', 'type B', 'type C'], dtype='= 90) + 1 45 | score_train_reduced = score_train[:,0:n_comp] 46 | 47 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 48 | 49 | #%% confirm that only about 10% of original information is lost 50 | from sklearn.metrics import r2_score 51 | 52 | V_matrix = pca.components_.T 53 | P_matrix = V_matrix[:,0:n_comp] 54 | 55 | data_train_normal_reconstruct = np.dot(score_train_reduced, P_matrix.T) 56 | R2_score = r2_score(data_train_normal, data_train_normal_reconstruct) 57 | 58 | print('% information lost = ', 100*(1-R2_score)) 59 | 60 | #%% alternative approach 61 | pca = PCA(n_components = 0.9) 62 | score_train_reduced = pca.fit_transform(data_train_normal) 63 | 64 | data_train_normal_reconstruct = pca.inverse_transform(score_train_reduced) 65 | R2_score = r2_score(data_train_normal, data_train_normal_reconstruct) 66 | 67 | print('% information lost = ', 100*(1-R2_score)) 68 | 69 | #%% plot to compare original and reconstructed variables 70 | var = 32 71 | plt.figure() 72 | plt.plot(data_train_normal[:,var],label = 'Measured data') 73 | plt.plot(data_train_normal_reconstruct[:,var],label = 'Reconstructed data') 74 | plt.ylabel('Variable # '+ str(var)) 75 | plt.xlabel('sample #') 76 | plt.legend() -------------------------------------------------------------------------------- /Chapter_LatentVariable1/DynamicPCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Dimension Reduction and Latent Variable Methods (Part 1)\n", 8 | "\n", 9 | "\n", 10 | "# Topic: Dynamic PCA" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# import required packages\n", 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "from sklearn.preprocessing import StandardScaler\n", 23 | "from sklearn.decomposition import PCA" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# fetch data\n", 33 | "data = pd.read_excel('proc1a.xls', skiprows = 1,usecols = 'C:AI')\n", 34 | "\n", 35 | "# separate train data\n", 36 | "data_train = data.iloc[0:69,]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# augment training data\n", 46 | "lag = 5\n", 47 | "N = data_train.shape[0]\n", 48 | "m = data_train.shape[1]\n", 49 | "\n", 50 | "data_train_augmented = np.zeros((N-lag,(lag+1)*m))\n", 51 | "\n", 52 | "for sample in range(lag, N):\n", 53 | " dataBlock = data_train.iloc[sample-lag:sample+1,:].values # converting from pandas dataframe to numpy array\n", 54 | " data_train_augmented[sample-lag,:] = np.reshape(dataBlock, (1,-1), order = 'F')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# scale data\n", 64 | "scaler = StandardScaler()\n", 65 | "data_train_augmented_normal = scaler.fit_transform(data_train_augmented)\n", 66 | "\n", 67 | "# PCA\n", 68 | "pca = PCA()\n", 69 | "score_train = pca.fit_transform(data_train_augmented_normal)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | 
"execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.7.4" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /Chapter_LatentVariable1/DynamicPCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | """ 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.decomposition import PCA 10 | 11 | #%% fetch data 12 | data = pd.read_excel('proc1a.xls', skiprows = 1,usecols = 'C:AI') 13 | 14 | #%% separate train data 15 | data_train = data.iloc[0:69,] 16 | 17 | # %% augment training data 18 | lag = 5 19 | N = data_train.shape[0] 20 | m = data_train.shape[1] 21 | 22 | data_train_augmented = np.zeros((N-lag,(lag+1)*m)) 23 | 24 | for sample in range(lag, N): 25 | dataBlock = data_train.iloc[sample-lag:sample+1,:].values # converting from pandas dataframe to numpy array 26 | data_train_augmented[sample-lag,:] = np.reshape(dataBlock, (1,-1), order = 'F') 27 | 28 | #%% scale data 29 | scaler = StandardScaler() 30 | data_train_augmented_normal = scaler.fit_transform(data_train_augmented) 31 | 32 | #%% PCA 33 | pca = PCA() 34 | score_train = pca.fit_transform(data_train_augmented_normal) 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /Chapter_LatentVariable1/KernelPCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | """ 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.decomposition import KernelPCA 10 | 11 | #%% fetch data 12 | data = pd.read_excel('KPCA_example.xlsx') 13 | 14 | #%% scale data 15 | scaler = StandardScaler() 16 | data_train_normal = scaler.fit_transform(data) 17 | 18 | #%% PCA 19 | kpca = KernelPCA(kernel='rbf', gamma = 1) 20 | score = kpca.fit_transform(data) 21 | 22 | #%% visualize explained variance 23 | import matplotlib.pyplot as plt 24 | 25 | lambdas = kpca.lambdas_ 26 | explained_variance_ratio = lambdas/np.sum(lambdas) 27 | 28 | explained_variance = 100*explained_variance_ratio # in percentage 29 | cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained 30 | 31 | plt.figure() 32 | plt.plot(cum_explained_variance, 'r+', label = 'cumulative % variance explained') 33 | plt.plot(explained_variance, 'b+' , label = '% variance explained by each PC') 34 | plt.ylabel('Explained variance (in %)') 35 | plt.xlabel('Principal component number') 36 | plt.legend() 37 | plt.show() 38 | 39 | #%% decide # of PCs to retain and compute reduced data in PC space 40 | n_comp = np.argmax(cum_explained_variance >= 90) + 1 41 | score_reduced = score[:,0:n_comp] 42 | 43 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 
-------------------------------------------------------------------------------- /Chapter_LatentVariable1/proc1a.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_LatentVariable1/proc1a.xls -------------------------------------------------------------------------------- /Chapter_LatentVariable1/softSensor_PLS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS-based Soft Sensor 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | 9 | #%% fetch data 10 | data = pd.read_csv('kamyr-digester.csv', usecols = range(1,23)) 11 | 12 | #%% pre-process 13 | # find the # of nan entries in each column 14 | na_counts = data.isna().sum(axis = 0) 15 | 16 | # remove columns that have a lot of nan entries 17 | data_cleaned = data.drop(columns = ['AAWhiteSt-4 ','SulphidityL-4 ']) 18 | 19 | # remove any row that have any nan entry 20 | data_cleaned = data_cleaned.dropna(axis = 0) 21 | 22 | # separate X, y 23 | y = data_cleaned.iloc[:,0].values[:,np.newaxis] # StandardScaler requires 2D array 24 | X = data_cleaned.iloc[:,1:].values 25 | 26 | print('Number of samples left: ', X.shape[0]) 27 | 28 | #%% separate train and test data 29 | from sklearn.model_selection import train_test_split 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100) 31 | 32 | #%% scale data 33 | from sklearn.preprocessing import StandardScaler 34 | 35 | X_scaler = StandardScaler() 36 | X_train_normal = X_scaler.fit_transform(X_train) 37 | X_test_normal = X_scaler.transform(X_test) 38 | 39 | y_scaler = StandardScaler() 40 | y_train_normal = y_scaler.fit_transform(y_train) 41 | y_test_normal = y_scaler.transform(y_test) 42 | 43 | #%% Finding # latents using kFold cross validation 44 | from sklearn.model_selection import KFold 45 | from sklearn.metrics import mean_squared_error 46 | from sklearn.cross_decomposition import PLSRegression 47 | import matplotlib.pyplot as plt 48 | 49 | scaler = StandardScaler() 50 | 51 | fit_MSE = [] 52 | validate_MSE = [] 53 | for n_comp in range(1,20): 54 | local_fit_MSE = [] 55 | local_validate_MSE = [] 56 | 57 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 58 | for fit_index, validate_index in kfold.split(y_train): 59 | X_fit_normal = scaler.fit_transform(X_train[fit_index]) 60 | X_validate_normal = scaler.transform(X_train[validate_index]) 61 | 62 | y_fit_normal = scaler.fit_transform(y_train[fit_index]) 63 | y_validate_normal = scaler.transform(y_train[validate_index]) 64 | 65 | pls = PLSRegression(n_components = n_comp) 66 | pls.fit(X_fit_normal, y_fit_normal) 67 | 68 | local_fit_MSE.append(mean_squared_error(y_fit_normal, pls.predict(X_fit_normal))) 69 | local_validate_MSE.append(mean_squared_error(y_validate_normal, 70 | pls.predict(X_validate_normal))) 71 | 72 | fit_MSE.append(np.mean(local_fit_MSE)) 73 | validate_MSE.append(np.mean(local_validate_MSE)) 74 | 75 | 76 | # plot 77 | plt.figure() 78 | plt.plot(range(1,20), fit_MSE, 'b*', label = 'Training MSE') 79 | plt.plot(range(1,20), validate_MSE, 'r*', label = 'Validation MSE') 80 | plt.xticks(range(1,20)) 81 | plt.ylabel('Mean Squared Error (MSE)') 82 | plt.xlabel('# of latents') 83 | 
plt.legend() 84 | 85 | #%% build PLS model 86 | pls = PLSRegression(n_components = 9) 87 | pls.fit(X_train_normal, y_train_normal) 88 | 89 | #%% check training vs test accuracy 90 | y_train_normal_predict = pls.predict(X_train_normal) 91 | y_test_normal_predict = pls.predict(X_test_normal) 92 | 93 | print('Accuracy over training data: ', pls.score(X_train_normal, y_train_normal)) 94 | print('Accuracy over test data: ', pls.score(X_test_normal, y_test_normal)) 95 | 96 | #%% plots of raw and predicted data 97 | y_train_predict = y_scaler.inverse_transform(y_train_normal_predict) 98 | y_test_predict = y_scaler.inverse_transform(y_test_normal_predict) 99 | 100 | 101 | plt.figure() 102 | plt.plot(y_train, 'b', label = 'Raw data') 103 | plt.plot(y_train_predict, 'r', label = 'PLS prediction') 104 | plt.ylabel('Kappa number (training data)') 105 | plt.xlabel('Sample #') 106 | plt.legend() 107 | 108 | 109 | plt.figure() 110 | plt.plot(y_test, 'b', label = 'Raw data') 111 | plt.plot(y_test_predict, 'r', label = 'PLS prediction') 112 | plt.ylabel('Kappa number (test data)') 113 | plt.xlabel('Sample #') 114 | plt.legend() -------------------------------------------------------------------------------- /Chapter_LatentVariable2/DimensionalityReduction_ICA.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## ICA model for TEP data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% fetch TE data 10 | TEdata_noFault_train = np.loadtxt('d00.dat').T # data arrangement in d00.dat is different than that in other files 11 | 12 | # select variables as done in Lee et al. 
13 | xmeas = TEdata_noFault_train[:,0:22] 14 | xmv = TEdata_noFault_train[:,41:52] 15 | data_noFault_train = np.hstack((xmeas, xmv)) 16 | 17 | #%% scale data 18 | from sklearn.preprocessing import StandardScaler 19 | scaler = StandardScaler() 20 | data_train_normal = scaler.fit_transform(data_noFault_train) 21 | 22 | #%% fit ICA model 23 | from sklearn.decomposition import FastICA 24 | ica = FastICA(max_iter=1000, tol=0.005, random_state=1).fit(data_train_normal) 25 | W = ica.components_ 26 | 27 | #%% confirm L2 norm of all IC scores is 1 28 | S = ica.transform(data_train_normal) 29 | S_L2_norms = np.linalg.norm(S, 2, axis = 0) 30 | 31 | #%% sort the ICs in importance order using L2 norm of each row 32 | L2_norm = np.linalg.norm(W, 2, axis=1) 33 | sort_order = np.flip(np.argsort(L2_norm)) # descending order 34 | L2_norm_sorted_pct = 100*L2_norm[sort_order]/np.sum(L2_norm) 35 | 36 | plt.figure() 37 | plt.plot(L2_norm, 'b') 38 | plt.xlabel('IC number (unsorted)') 39 | plt.ylabel('L2 norm') 40 | 41 | plt.figure() 42 | plt.plot(L2_norm_sorted_pct, 'b+') 43 | plt.xlabel('IC number (sorted)') 44 | plt.ylabel('% L2 norm') 45 | 46 | W_sorted = W[sort_order,:] # row 1 now corresponds to the most important IC and so on 47 | 48 | #%% decide # of ICs to retain via PCA variance method and compute ICs 49 | from sklearn.decomposition import PCA 50 | pca = PCA().fit(data_train_normal) 51 | 52 | explained_variance = 100*pca.explained_variance_ratio_ # in percentage 53 | cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained 54 | 55 | n_comp = np.argmax(cum_explained_variance >= 90) + 1 56 | 57 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 58 | 59 | #%% compute ICs with reduced dimension 60 | Wd = W_sorted[0:n_comp,:] 61 | Sd = np.dot(Wd, data_train_normal.T) # row 1 contains scores of the most important IC 62 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/FDA_illustration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Illustration example for FDA/LDA 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 9 | from sklearn.preprocessing import StandardScaler 10 | import matplotlib.pyplot as plt 11 | 12 | #%% generate data 13 | x1_class1 = np.random.uniform(1, 6, 100) 14 | x2_class1 = x1_class1 + 1 + np.random.normal(0,0.5,100) 15 | X_class1 = np.column_stack((x1_class1, x2_class1)) 16 | 17 | 18 | x1_class2 = np.random.uniform(2, 7, 100) 19 | x2_class2 = x1_class2 - 1 + np.random.normal(0,0.5,100) 20 | X_class2 = np.column_stack((x1_class2, x2_class2)) 21 | 22 | plt.figure() 23 | plt.plot(x1_class1, x2_class1, 'b.', label='Class 1') 24 | plt.plot(x1_class2, x2_class2, 'r.', label='Class 2') 25 | plt.xlabel('x1') 26 | plt.ylabel('x2') 27 | plt.legend() 28 | plt.show() 29 | 30 | X = np.vstack((X_class1, X_class2)) 31 | y = np.concatenate((np.ones(100,), 2*np.ones(100,))) 32 | 33 | #%% scale data 34 | scalar = StandardScaler() 35 | X_normal = scalar.fit_transform(X) 36 | 37 | #%% extract latent variables via PCA 38 | pca = PCA(n_components=1) 39 | score_pca = pca.fit_transform(X_normal) 40 | 41 | plt.figure() 42 | plt.plot(score_pca[0:100], np.zeros((100,)), 
'b.') 43 | plt.plot(score_pca[100:], np.zeros((100,)), 'r.') 44 | plt.ylim((-2,100)) 45 | plt.xlabel('PCA score') 46 | plt.ylabel('sample #') 47 | 48 | #%% extract latent variables via LDA 49 | lda = LinearDiscriminantAnalysis(n_components=1) 50 | score_lda = lda.fit_transform(X_normal, y) 51 | 52 | plt.figure() 53 | plt.plot(score_lda[0:100], np.zeros((100,)), 'b.') 54 | plt.plot(score_lda[100:], np.zeros((100,)), 'r.') 55 | plt.ylim((-2,100)) 56 | plt.xlabel('LDA score') 57 | plt.ylabel('sample #') -------------------------------------------------------------------------------- /Chapter_LatentVariable2/FaultClassification_FDA.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Fault classification via FDA 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% fetch TEP data for faults 5,10,19 10 | TEdata_Fault5_train = np.loadtxt('d05.dat') 11 | TEdata_Fault10_train = np.loadtxt('d10.dat') 12 | TEdata_Fault19_train = np.loadtxt('d19.dat') 13 | TEdata_Faulty_train = np.vstack((TEdata_Fault5_train, TEdata_Fault10_train, TEdata_Fault19_train)) 14 | 15 | # select variables as done in Lee et al. 16 | xmeas = TEdata_Faulty_train[:,0:22] 17 | xmv = TEdata_Faulty_train[:,41:52] 18 | data_Faulty_train = np.hstack((xmeas, xmv)) 19 | 20 | # generate sample labels 21 | n_rows_train = TEdata_Fault5_train.shape[0] 22 | y_train = np.concatenate((5*np.ones(n_rows_train,), 10*np.ones(n_rows_train,), 19*np.ones(n_rows_train,))) 23 | 24 | #%% scale data 25 | from sklearn.preprocessing import StandardScaler 26 | scaler = StandardScaler() 27 | Faultydata_train_scaled = scaler.fit_transform(data_Faulty_train) 28 | 29 | #%% visualize all scaled variables 30 | plt.figure() 31 | plt.plot(Faultydata_train_scaled) 32 | plt.show() 33 | 34 | #%% fit LDA model 35 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 36 | lda = LinearDiscriminantAnalysis() 37 | scores_train_lda = lda.fit_transform(Faultydata_train_scaled, y_train) 38 | 39 | #%% visualize LDA scores 40 | plt.figure() 41 | plt.plot(scores_train_lda[0:n_rows_train,0], scores_train_lda[0:n_rows_train,1], 'b.', label='Fault 5') 42 | plt.plot(scores_train_lda[n_rows_train:2*n_rows_train,0], scores_train_lda[n_rows_train:2*n_rows_train,1], 'r.', label='Fault 10') 43 | plt.plot(scores_train_lda[2*n_rows_train:3*n_rows_train,0], scores_train_lda[2*n_rows_train:3*n_rows_train,1], 'm.', label='Fault 19') 44 | plt.legend() 45 | plt.xlabel('FD1 (training data)') 46 | plt.ylabel('FD2 (training data)') 47 | 48 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 49 | ## Control limit determination for fault5 class 50 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 51 | import scipy.stats 52 | Nj = n_rows_train 53 | k = 2 54 | 55 | alpha = 0.01# 99% control limit 56 | T2_CL = k*(Nj**2-1)*scipy.stats.f.ppf(1-alpha,k,Nj-k)/(Nj*(Nj-k)) 57 | 58 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | ## Fault classification with fault 5 test data 60 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 61 | # mean and covariance for Fault 5 class 62 | scores_train_lda_Fault5 = scores_train_lda[0:n_rows_train,:] 63 | cov_scores_train_Fault5 = np.cov(scores_train_lda_Fault5.T) 64 | 
mean_scores_train_Fault5 = np.mean(scores_train_lda_Fault5, axis = 0) 65 | 66 | #%% fetch TE test dta for fault 5 67 | TEdata_Fault5_test = np.loadtxt('d05_te.dat') 68 | TEdata_Fault5_test = TEdata_Fault5_test[160:,:] 69 | n_rows_test = TEdata_Fault5_test.shape[0] 70 | 71 | # select variables as done in Lee et al. 72 | xmeas = TEdata_Fault5_test[:,0:22] 73 | xmv = TEdata_Fault5_test[:,41:52] 74 | data_Faulty_test = np.hstack((xmeas, xmv)) 75 | 76 | #%% scale data and transform 77 | Faultydata_test_scaled = scaler.transform(data_Faulty_test) 78 | scores_test_lda = lda.transform(Faultydata_test_scaled) 79 | 80 | #%% compute T2 statistic for test data for Fault 5 class 81 | T2_test = np.zeros((n_rows_test,)) 82 | for sample in range(n_rows_test): 83 | score_sample = scores_test_lda[sample,:] 84 | score_sample_centered = score_sample - mean_scores_train_Fault5 85 | T2_test[sample] = np.dot(np.dot(score_sample_centered[np.newaxis,:],np.linalg.inv(cov_scores_train_Fault5)),score_sample_centered[np.newaxis,:].T) 86 | 87 | #%% plot test prediction 88 | outsideCL_flag = T2_test > T2_CL 89 | insideCL_flag = T2_test <= T2_CL 90 | plt.figure() 91 | plt.plot(scores_test_lda[outsideCL_flag,0], scores_test_lda[outsideCL_flag,1], 'k.', label='outside Fault 5 boundary') 92 | plt.plot(scores_test_lda[insideCL_flag,0], scores_test_lda[insideCL_flag,1], 'b.', label='inside Fault 5 boundary') 93 | plt.xlabel('FD1 (test data)') 94 | plt.ylabel('FD2 (test data)') 95 | plt.legend() 96 | 97 | print('Percentage of samples correctly diagnosed as Fault 5: ', 100*np.sum(T2_test < T2_CL)/n_rows_test) 98 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/ICA_illustration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Illustration example for ICA vs PCA 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | from sklearn.decomposition import FastICA 9 | import matplotlib.pyplot as plt 10 | 11 | #%% generate independent data 12 | s1 = 2*np.sin(2*np.pi*8*np.arange(500)/500) 13 | s2 = np.random.uniform(-2, 2, 500) 14 | 15 | plt.figure() 16 | plt.plot(s1) 17 | plt.xlabel('sample #') 18 | plt.ylabel('s1') 19 | 20 | plt.figure() 21 | plt.plot(s2) 22 | plt.xlabel('sample #') 23 | plt.ylabel('s2') 24 | 25 | plt.figure() 26 | plt.scatter(s1, s2) 27 | plt.xlabel('s1') 28 | plt.ylabel('s2') 29 | 30 | #%% generate transformed observed data 31 | x1 = (2/3)*s1 + s2 32 | x2 = (2/3)*s1 + (1/3)*s2 33 | 34 | X = np.column_stack((x1,x2)) 35 | 36 | plt.figure() 37 | plt.plot(x1) 38 | plt.xlabel('sample #') 39 | plt.ylabel('x1') 40 | 41 | plt.figure() 42 | plt.plot(x2) 43 | plt.xlabel('sample #') 44 | plt.ylabel('x2') 45 | 46 | plt.figure() 47 | plt.scatter(x1, x2) 48 | plt.xlabel('x1') 49 | plt.ylabel('x2') 50 | 51 | #%% extract latent variables via PCA 52 | pca = PCA() 53 | T = pca.fit_transform(X) 54 | 55 | plt.figure() 56 | plt.plot(T[:,0]) 57 | plt.xlabel('sample #') 58 | plt.ylabel('t1') 59 | 60 | plt.figure() 61 | plt.plot(T[:,1]) 62 | plt.xlabel('sample #') 63 | plt.ylabel('t2') 64 | 65 | plt.figure() 66 | plt.scatter(T[:,0], T[:,1]) 67 | plt.xlabel('t1') 68 | plt.ylabel('t2') 69 | 70 | #%% extract latent variables via ICA 71 | ica = FastICA() 72 | U = ica.fit_transform(X) 73 | 74 | plt.figure() 75 | plt.plot(U[:,0]) 76 
| plt.xlabel('sample #') 77 | plt.ylabel('u1') 78 | 79 | plt.figure() 80 | plt.plot(U[:,1]) 81 | plt.xlabel('sample #') 82 | plt.ylabel('u2') 83 | 84 | plt.figure() 85 | plt.scatter(U[:,0], U[:,1]) 86 | plt.xlabel('u1') 87 | plt.ylabel('u2') 88 | 89 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/TE_processData_explore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## TE data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% fetch TE data 10 | TEdata_noFault_train = np.loadtxt('d00.dat').T # data arrangement in d00.dat is different from that in other files 11 | TEdata_Fault_train = np.loadtxt('d10.dat') 12 | 13 | #%% quick visualize 14 | plt.figure() 15 | plt.plot(TEdata_noFault_train[:,17]) 16 | plt.xlabel('sample #') 17 | plt.ylabel('Stripper Temperature') 18 | plt.title('Normal operation') 19 | 20 | plt.figure() 21 | plt.plot(TEdata_Fault_train[:,17]) 22 | plt.xlabel('sample #') 23 | plt.ylabel('Stripper Temperature') 24 | plt.title('Faulty operation') 25 | 26 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 27 | ## Visualize normal and faulty data in PC space 28 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | 30 | #%% scale data 31 | from sklearn.preprocessing import StandardScaler 32 | scaler = StandardScaler() 33 | TEdata_noFault_scaled = scaler.fit_transform(TEdata_noFault_train) 34 | TEdata_Fault_scaled = scaler.transform(TEdata_Fault_train) 35 | 36 | #%% build PCA model and compute PC scores 37 | from sklearn.decomposition import PCA 38 | pca = PCA(n_components = 3).fit(TEdata_noFault_scaled) 39 | TEdata_noFault_scores = pca.transform(TEdata_noFault_scaled) 40 | TEdata_Fault_scores = pca.transform(TEdata_Fault_scaled) 41 | 42 | #%% visualize in 3D plot 43 | from mpl_toolkits.mplot3d import Axes3D 44 | fig = plt.figure() 45 | ax = Axes3D(fig) 46 | ax.scatter(TEdata_noFault_scores[:,0],TEdata_noFault_scores[:,1],TEdata_noFault_scores[:,2], c='blue', alpha=0.1, label='Normal operation') 47 | ax.scatter(TEdata_Fault_scores[:,0],TEdata_Fault_scores[:,1],TEdata_Fault_scores[:,2], c='red', marker = '*', label='Faulty operation') 48 | ax.set_xlabel('PC1 scores') 49 | ax.set_ylabel('PC2 scores') 50 | ax.set_zlabel('PC3 scores') 51 | ax.legend() 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Chapter_LatentVariable2/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_OtherUsefulMethods/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/EmbeddedMethods_Lasso.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing embedded method (Lasso) on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv',
delimiter=',') 8 | 9 | #%% separate X and y 10 | y = VSdata[:,0] 11 | X = VSdata[:,1:] 12 | 13 | #%% scale data 14 | from sklearn.preprocessing import StandardScaler 15 | xscaler = StandardScaler() 16 | X_scaled = xscaler.fit_transform(X) 17 | 18 | yscaler = StandardScaler() 19 | y_scaled = yscaler.fit_transform(y[:,None]) 20 | 21 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 22 | ## Lasso-based variable selection 23 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | 25 | #%% fit Lasso model 26 | from sklearn.linear_model import LassoCV 27 | Lasso_model = LassoCV(cv=5).fit(X_scaled, y_scaled.ravel()) # ravel() provides the 1D target array expected by LassoCV 28 | 29 | #%% find the relevant inputs using model coefficients 30 | top_k_inputs = np.argsort(abs(Lasso_model.coef_))[::-1][:10] + 1 31 | print('Relevant inputs: ', top_k_inputs) 32 | 33 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Embedded_Method_Lasso.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Embedded Method: Lasso" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "VSdata = np.loadtxt('VSdata.csv', delimiter=',')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# separate X and y\n", 30 | "y = VSdata[:,0]\n", 31 | "X = VSdata[:,1:]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# scale data\n", 41 | "from sklearn.preprocessing import StandardScaler\n", 42 | "xscaler = StandardScaler()\n", 43 | "X_scaled = xscaler.fit_transform(X)\n", 44 | "\n", 45 | "yscaler = StandardScaler()\n", 46 | "y_scaled = yscaler.fit_transform(y[:,None])" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# fit Lasso model \n", 56 | "from sklearn.linear_model import LassoCV\n", 57 | "Lasso_model = LassoCV(cv=5).fit(X_scaled, y_scaled.ravel())" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Relevant inputs: [21 22 20 23 24 19 25 18 33 14]\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "# find the relevant inputs using model coefficients\n", 75 | "top_k_inputs = np.argsort(abs(Lasso_model.coef_))[::-1][:10] + 1\n", 76 | "print('Relevant inputs: ', top_k_inputs)" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.9.7" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/MLR_VSdata.py:
-------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing MLR on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv', delimiter=',') 8 | VSdata_val = np.loadtxt('VSdata_val.csv', delimiter=',') 9 | 10 | #%% separate X and y 11 | y_train = VSdata[:,0] 12 | X_train = VSdata[:,1:] 13 | 14 | y_val = VSdata_val[:,0] 15 | X_val = VSdata_val[:,1:] 16 | 17 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | ## MLR using all variables 19 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 20 | 21 | #%% fit model on training data 22 | from sklearn.linear_model import LinearRegression 23 | from sklearn.preprocessing import StandardScaler 24 | from matplotlib import pyplot as plt 25 | 26 | # scale X 27 | scaler_all = StandardScaler() 28 | X_train_scaled = scaler_all.fit_transform(X_train) 29 | X_val_scaled = scaler_all.transform(X_val) 30 | 31 | # fit 32 | MLR_all = LinearRegression().fit(X_train_scaled, y_train) 33 | 34 | # predict 35 | y_val_pred = MLR_all.predict(X_val_scaled) 36 | 37 | # score 38 | R2_all_train = MLR_all.score(X_train_scaled, y_train) 39 | R2_all = MLR_all.score(X_val_scaled, y_val) 40 | 41 | # plot raw vs predicted target 42 | plt.figure() 43 | plt.plot(y_val, y_val_pred, '.') 44 | plt.title('Using all variables') 45 | 46 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 47 | ## MLR using only 10 relevant variables 48 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 49 | # select only relevant inputs 50 | k = 10 51 | X_train_rel = X_train[:,16:16+k] 52 | X_val_rel = X_val[:,16:16+k] 53 | 54 | # scale X 55 | scaler_rel = StandardScaler() 56 | X_train_rel_scaled = scaler_rel.fit_transform(X_train_rel) 57 | X_val_rel_scaled = scaler_rel.transform(X_val_rel) 58 | 59 | # fit 60 | MLR_rel = LinearRegression().fit(X_train_rel_scaled, y_train) 61 | 62 | # predict 63 | y_val_rel_pred = MLR_rel.predict(X_val_rel_scaled) 64 | 65 | # score 66 | R2_rel_train = MLR_rel.score(X_train_rel_scaled, y_train) 67 | R2_rel = MLR_rel.score(X_val_rel_scaled, y_val) 68 | 69 | # plot raw vs predicted target 70 | plt.figure() 71 | plt.plot(y_val, y_val_rel_pred, '.') 72 | plt.title('Using relevant variables') 73 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Missing_data_imputation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Data imputation" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "[[1. 2. 5.]\n", 22 | " [3. 4. 3.]\n", 23 | " [4. 6. 5.]\n", 24 | " [8. 8. 
7.]]\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# Mean imputation\n", 30 | "import numpy as np\n", 31 | "from sklearn.impute import SimpleImputer\n", 32 | "\n", 33 | "sample_data = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]\n", 34 | "mean_imputeModel = SimpleImputer(missing_values=np.nan, strategy='mean')\n", 35 | "\n", 36 | "print(mean_imputeModel.fit_transform(sample_data))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "[[1. 2. 4. ]\n", 49 | " [3. 4. 3. ]\n", 50 | " [5.5 6. 5. ]\n", 51 | " [8. 8. 7. ]]\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "# KNN imputation\n", 57 | "from sklearn.impute import KNNImputer\n", 58 | "\n", 59 | "knn_imputeModel = KNNImputer(n_neighbors=2)\n", 60 | "print(knn_imputeModel.fit_transform(sample_data))" 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3 (ipykernel)", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.9.7" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Missing_data_imputation.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## data imputation 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% Mean imputation 6 | import numpy as np 7 | from sklearn.impute import SimpleImputer 8 | 9 | sample_data = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]] 10 | mean_imputeModel = SimpleImputer(missing_values=np.nan, strategy='mean') 11 | 12 | print(mean_imputeModel.fit_transform(sample_data)) 13 | 14 | #%% KNN imputation 15 | from sklearn.impute import KNNImputer 16 | 17 | knn_imputeModel = KNNImputer(n_neighbors=2) 18 | print(knn_imputeModel.fit_transform(sample_data)) 19 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Multivariate_outliers_MCD.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Multivariate outlier detection via MCD-based Mahalanobis distances 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | data_2Doutlier = np.loadtxt('complex2D_outlier.csv', delimiter=',') 8 | 9 | # plot 10 | import matplotlib.pyplot as plt 11 | plt.plot(data_2Doutlier[:-30,0], data_2Doutlier[:-30,1], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 12 | plt.plot(data_2Doutlier[-30:,0], data_2Doutlier[-30:,1], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 13 | plt.xlabel('x1'), plt.ylabel('x2') 14 | plt.title('Raw measurements') 15 | 16 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | ## Non-robust Mahalanobis distances 18 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 19 | 20 | #%% compute Mahalanobis distances 21 | from 
sklearn.covariance import EmpiricalCovariance 22 | 23 | emp_cov = EmpiricalCovariance().fit(data_2Doutlier) 24 | MD_emp_cov = emp_cov.mahalanobis(data_2Doutlier) 25 | 26 | #%% transform Mahalanobis distances into normal distribution via cubic-root 27 | MD_emp_cov_cubeRoot = np.power(MD_emp_cov, 0.333) 28 | 29 | #%% find hampel identifier bounds 30 | from scipy import stats 31 | 32 | median = np.median(MD_emp_cov_cubeRoot) 33 | sigma_MAD = stats.median_absolute_deviation(MD_emp_cov_cubeRoot) 34 | 35 | upperBound_MD_emp_cov = np.power(median+3*sigma_MAD, 3) 36 | lowerBound_MD_emp_cov = np.power(median-3*sigma_MAD, 3) 37 | 38 | #%% plot Mahalanobis distances with bounds 39 | plt.figure() 40 | plt.plot(MD_emp_cov[:-30], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 41 | plt.plot(np.arange(300,330), MD_emp_cov[-30:], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 42 | 43 | plt.hlines(upperBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashdot', label='Upper bound') 44 | plt.hlines(lowerBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashed', label='Lower bound') 45 | 46 | plt.xlabel('sample #'), plt.ylabel('Mahalanobis distance') 47 | plt.title('Mahalanobis distances of raw measurements') 48 | plt.legend(loc='upper left') 49 | 50 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 51 | ## MCD-based robust Mahalanobis distances 52 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 53 | from sklearn.covariance import MinCovDet 54 | 55 | MCD_cov = MinCovDet().fit(data_2Doutlier) 56 | MD_MCD = MCD_cov.mahalanobis(data_2Doutlier) 57 | 58 | #%% transform Mahalanobis distances into normal distribution via cubic-root 59 | MD_MCD_cubeRoot = np.power(MD_MCD, 0.333) 60 | 61 | #%% find hampel identifier bounds 62 | from scipy import stats 63 | 64 | median = np.median(MD_MCD_cubeRoot) 65 | sigma_MAD = stats.median_absolute_deviation(MD_MCD_cubeRoot) 66 | 67 | upperBound_MD_emp_cov = np.power(median+3*sigma_MAD, 3) 68 | lowerBound_MD_emp_cov = np.power(median-3*sigma_MAD, 3) 69 | 70 | #%% plot Mahalanobis distances with bounds 71 | plt.figure() 72 | plt.plot(MD_MCD[:-30], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 73 | plt.plot(np.arange(300,330), MD_MCD[-30:], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 74 | 75 | plt.hlines(upperBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashdot', label='Upper bound') 76 | plt.hlines(lowerBound_MD_emp_cov, 0, 330, colors='r', linestyles='dashed', label='Lower bound') 77 | 78 | plt.xlabel('sample #'), plt.ylabel('Mahalanobis distance') 79 | plt.title('MCD_based Mahalanobis distances of raw measurements') 80 | plt.legend(loc='upper left') 81 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/Multivariate_outliers_Mahalanobis_distance.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Multivariate outlier detection via Mahalanobis distances 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | data_2Doutlier = np.loadtxt('simple2D_outlier.csv', delimiter=',') 8 | 9 | # plot 10 | import matplotlib.pyplot as plt 11 | plt.plot(data_2Doutlier[:-5,0], data_2Doutlier[:-5,1], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 12 | plt.plot(data_2Doutlier[-5:,0], data_2Doutlier[-5:,1], '.r', markeredgecolor='k', 
markeredgewidth=0.5, ms=11) 13 | plt.xlabel('x1'), plt.ylabel('x2') 14 | plt.title('Raw measurements') 15 | 16 | #%% compute Mahalanobis distances 17 | from sklearn.covariance import EmpiricalCovariance 18 | 19 | emp_cov = EmpiricalCovariance().fit(data_2Doutlier) 20 | MD_emp_cov = emp_cov.mahalanobis(data_2Doutlier) 21 | 22 | #%% transform Mahalanobis distances into normal distribution via cubic-root 23 | MD_cubeRoot = np.power(MD_emp_cov, 0.333) 24 | 25 | #%% find Hampel identifier bounds 26 | from scipy import stats 27 | 28 | median = np.median(MD_cubeRoot) 29 | sigma_MAD = stats.median_absolute_deviation(MD_cubeRoot) 30 | 31 | upperBound_MD_emp_cov = np.power(median+3*sigma_MAD, 3) 32 | lowerBound_MD_emp_cov = np.power(median-3*sigma_MAD, 3) 33 | 34 | #%% plot Mahalanobis distances with bounds (last 5 samples are the outliers) 35 | plt.figure(), plt.plot(MD_emp_cov[:-5], '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 36 | plt.plot(np.arange(300,305), MD_emp_cov[-5:], '.r', markeredgecolor='k', markeredgewidth=0.5, ms=11) 37 | 38 | plt.hlines(upperBound_MD_emp_cov, 0, 305, colors='r', linestyles='dashdot', label='Upper bound') 39 | plt.hlines(lowerBound_MD_emp_cov, 0, 305, colors='r', linestyles='dashed', label='Lower bound') 40 | 41 | plt.xlabel('sample #'), plt.ylabel('Mahalanobis distance') 42 | plt.title('Mahalanobis distances of raw measurements') 43 | plt.legend(loc='upper left') -------------------------------------------------------------------------------- /Chapter_Preprocessing/Univariate_Outliers.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Univariate outlier detection 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% Generate outlier-infested data 6 | import numpy as np 7 | 8 | X = np.random.normal(40, 1, (1500,1)) 9 | X[200:300] = X[200:300] +8; X[1000:1150] = X[1000:1150] + 8 10 | 11 | # plot 12 | import matplotlib.pyplot as plt 13 | plt.plot(X, '.-') 14 | plt.xlabel('sample #'), plt.ylabel('variable measurement') 15 | plt.title('Raw measurements') 16 | 17 | #%% 3-sigma rule 18 | # location & spread 19 | mu = np.mean(X) 20 | sigma = np.std(X) 21 | 22 | # mean and std 23 | print('Estimated mean = ', mu) 24 | print('Estimated standard deviation = ', sigma) 25 | 26 | # plot 27 | plt.figure() 28 | plt.plot(X, '.-', alpha=0.8, markeredgecolor='k', markeredgewidth=0.1, ms=3) 29 | plt.hlines(mu, 0, 1500, colors='m', linestyles='dashdot', label='Mean') 30 | plt.hlines(mu+3*sigma, 0, 1500, colors='r', linestyles='dashdot', label='Upper bound') 31 | plt.hlines(mu-3*sigma, 0, 1500, colors='r', linestyles='dashed', label='Lower bound') 32 | 33 | plt.xlabel('sample #'), plt.ylabel('Variable measurement') 34 | plt.xlim((0,1500)) 35 | plt.title('3-sigma bounds') 36 | plt.legend(loc='upper right') 37 | 38 | #%% Hampel identifier 39 | # compute median and MAD 40 | from scipy import stats 41 | 42 | median = np.median(X) 43 | sigma_MAD = stats.median_absolute_deviation(X) # default scaling of 1.4826 is built-in 44 | 45 | # median & sigma_MAD 46 | print('Estimated robust location = ', median) 47 | print('Estimated robust spread = ', sigma_MAD) 48 | 49 | # plot 50 | plt.figure() 51 | plt.plot(X, '.-', alpha=0.8, markeredgecolor='k', markeredgewidth=0.1, ms=3) 52 | plt.hlines(median, 0, 1500, colors='m', linestyles='dashdot', label='Median') 53 | plt.hlines(median+3*sigma_MAD, 0, 1500, colors='r',
linestyles='dashdot', label='Upper bound') 54 | plt.hlines(median-3*sigma_MAD, 0, 1500, colors='r', linestyles='dashed', label='Lower bound') 55 | 56 | plt.xlabel('sample #'), plt.ylabel('Variable measurement') 57 | plt.xlim((0,1500)) 58 | plt.title('Hampel identifier bounds') 59 | plt.legend(loc='upper right') -------------------------------------------------------------------------------- /Chapter_Preprocessing/WrapperMethods_backward_SFS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing backward SFS on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv', delimiter=',') 8 | 9 | #%% separate X and y 10 | y = VSdata[:,0] 11 | X = VSdata[:,1:] 12 | 13 | #%% scale data 14 | from sklearn.preprocessing import StandardScaler 15 | xscaler = StandardScaler() 16 | X_scaled = xscaler.fit_transform(X) 17 | 18 | yscaler = StandardScaler() 19 | y_scaled = yscaler.fit_transform(y[:,None]) 20 | 21 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 22 | ## SFS-based variable selection 23 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | from sklearn.feature_selection import SequentialFeatureSelector 25 | from sklearn.linear_model import LinearRegression 26 | 27 | BSFS = SequentialFeatureSelector(LinearRegression(), n_features_to_select=10, direction='backward', cv=5).fit(X_scaled, y_scaled) 28 | 29 | #%% check selected inputs 30 | print('Inputs selected: ', BSFS.get_support(indices=True)+1) # returns integer index of the features selected 31 | 32 | #%% reduce X to only top relevant inputs 33 | X_relevant = BSFS.transform(X) -------------------------------------------------------------------------------- /Chapter_Preprocessing/Wrapper_Methods_backward_SFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Wrapper Method: Backward SFS" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "VSdata = np.loadtxt('VSdata.csv', delimiter=',')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# separate X and y\n", 30 | "y = VSdata[:,0]\n", 31 | "X = VSdata[:,1:]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# scale data\n", 41 | "from sklearn.preprocessing import StandardScaler\n", 42 | "xscaler = StandardScaler()\n", 43 | "X_scaled = xscaler.fit_transform(X)\n", 44 | "\n", 45 | "yscaler = StandardScaler()\n", 46 | "y_scaled = yscaler.fit_transform(y[:,None])" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 7, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# SFS-based variable selection\n", 56 | "from sklearn.feature_selection import SequentialFeatureSelector\n", 57 | "from sklearn.linear_model import LinearRegression\n", 58 | "\n", 59 | "BSFS = SequentialFeatureSelector(LinearRegression(), n_features_to_select=10, 
direction='backward', cv=5).fit(X_scaled, y_scaled)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Inputs selected: [18 19 20 21 22 23 24 25 31 33]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "# check selected inputs\n", 77 | "print('Inputs selected: ', BSFS.get_support(indices=True)+1) # returns integer index of the features selected" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 9, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# reduce X to only top relevant inputs\n", 87 | "X_relevant = BSFS.transform(X)" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3 (ipykernel)", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.9.7" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/deNoising_process_signals.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## De-noising Process Signals 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | noisy_signal = np.loadtxt('noisy_flow_signal.csv', delimiter=',') 8 | 9 | #%% SMA filter 10 | import pandas as pd 11 | 12 | windowSize = 15 13 | smoothed_signal_MA = pd.DataFrame(noisy_signal).rolling(windowSize).mean().values 14 | 15 | #%% SG filter 16 | from scipy.signal import savgol_filter 17 | 18 | smoothed_signal_SG = savgol_filter(noisy_signal, window_length = 15, polyorder = 2) 19 | 20 | #%% plots 21 | from matplotlib import pyplot as plt 22 | 23 | plt.figure(figsize=(11,3)) 24 | plt.plot(noisy_signal, alpha=0.3, label='Noisy signal') 25 | plt.plot(smoothed_signal_MA, color='m', label='SMA smoothed signal') 26 | plt.plot(smoothed_signal_SG, color='orange', label='SG smoothed signal') 27 | plt.xlabel('Sample #'), plt.ylabel('Value') 28 | plt.legend() -------------------------------------------------------------------------------- /Chapter_Preprocessing/filterMethods.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Implementing filter methods on simulated process data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | VSdata = np.loadtxt('VSdata.csv', delimiter=',') 8 | 9 | #%% separate X and y 10 | y = VSdata[:,0] 11 | X = VSdata[:,1:] 12 | 13 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 14 | ## Linear correlation-based variable selection 15 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 16 | 17 | # compute linear correlation based scores 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_regression 20 | 21 | VSmodel_Correlation = SelectKBest(f_regression, k=10).fit(X, y) 22 | 
input_scores = VSmodel_Correlation.scores_ 23 | 24 | # find the top ranked inputs 25 | top_k_inputs_Correlation = np.argsort(input_scores)[::-1][:10] + 1 # [::-1] reverses the array returned by argsort() and [:n] gives the first n elements, i.e., the top-ranked inputs 26 | print(top_k_inputs_Correlation) 27 | 28 | # reduce X to only top relevant inputs 29 | X_relevant = VSmodel_Correlation.transform(X) 30 | 31 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 32 | ## MI-based variable selection 33 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 34 | 35 | # compute mutual information (MI) based scores 36 | from sklearn.feature_selection import mutual_info_regression 37 | 38 | VSmodel_MI = SelectKBest(mutual_info_regression, k=10).fit(X, y) 39 | input_scores = VSmodel_MI.scores_ 40 | 41 | # find the top ranked inputs 42 | top_k_inputs_MI = np.argsort(input_scores)[::-1][:10] # [::-1] reverses the array returned by argsort() and [:n] gives the first n elements, i.e., the top-ranked inputs 43 | print(top_k_inputs_MI) 44 | 45 | # reduce X to only top relevant inputs 46 | X_relevant = VSmodel_MI.transform(X) -------------------------------------------------------------------------------- /Chapter_Preprocessing/filter_Methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter: Data Preprocessing\n", 8 | "\n", 9 | "# Topic: Filter Methods for Variable Selection" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# read data\n", 19 | "import numpy as np\n", 20 | "VSdata = np.loadtxt('VSdata.csv', delimiter=',')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# separate X and y\n", 30 | "y = VSdata[:,0]\n", 31 | "X = VSdata[:,1:]" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n", 41 | "## Linear correlation-based variable selection\n", 42 | "## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 7, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "[22 21 24 20 30 29 32 31 28 27]\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "# compute linear correlation based scores \n", 60 | "from sklearn.feature_selection import SelectKBest\n", 61 | "from sklearn.feature_selection import f_regression\n", 62 | "\n", 63 | "VSmodel_Correlation = SelectKBest(f_regression, k=10).fit(X, y)\n", 64 | "input_scores = VSmodel_Correlation.scores_\n", 65 | "\n", 66 | "# find the top ranked inputs\n", 67 | "top_k_inputs_Correlation = np.argsort(input_scores)[::-1][:10] + 1 # [::-1] reverses the array returned by argsort() and [:n] gives the first n elements, i.e., the top-ranked inputs\n", 68 | "print(top_k_inputs_Correlation)\n", 69 | "\n", 70 | "# reduce X to only top relevant inputs\n", 71 | "X_relevant = VSmodel_Correlation.transform(X)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n", 81 | "## MI-based variable selection\n",
82 | "## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 8, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "[21 8 0 4 5 37 1 30 13 16]\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "# compute linear correlation based scores \n", 100 | "from sklearn.feature_selection import mutual_info_regression\n", 101 | "\n", 102 | "VSmodel_MI = SelectKBest(mutual_info_regression, k=10).fit(X, y)\n", 103 | "input_scores = VSmodel_MI.scores_\n", 104 | "\n", 105 | "# find the top ranked inputs\n", 106 | "top_k_inputs_MI = np.argsort(input_scores)[::-1][:10] # [::-1] reverses the array returned by argsort() and [:n] gives that last n elements\n", 107 | "print(top_k_inputs_MI)\n", 108 | "\n", 109 | "# reduce X to only top relevant inputs\n", 110 | "X_relevant = VSmodel_MI.transform(X)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3 (ipykernel)", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.9.7" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /Chapter_Preprocessing/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_RNN/AircraftEngine_dataExplore.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Exploring aircraft engine data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | #%% read data 11 | # training 12 | train_df = pd.read_csv('PM_train.txt', sep=" ", header=None) 13 | train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True) # last two columns are blank 14 | train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 15 | 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 16 | 's15', 's16', 's17', 's18', 's19', 's20', 's21'] 17 | 18 | # test 19 | test_df = pd.read_csv('PM_test.txt', sep=" ", header=None) 20 | test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True) 21 | test_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 22 | 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14', 23 | 's15', 's16', 's17', 's18', 's19', 's20', 's21'] 24 | 25 | # actual RUL for each engine-id in the test data 26 | truth_df = pd.read_csv('PM_truth.txt', sep=" ", header=None) 27 | truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True) 28 | 29 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 30 | ## exploratory graphs (training) 31 | ## 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 32 | # get all sensor data for an engine ID 33 | engineID = 1 34 | engineDataAll = train_df.loc[train_df['id'] == engineID] 35 | engineDataSensor = engineDataAll.iloc[:, 5:] 36 | 37 | # normalize 38 | scalar = StandardScaler() 39 | engineDataSensor_scaled = scalar.fit_transform(engineDataSensor.values) 40 | 41 | # plot all sensor data for an engine ID 42 | plt.figure() 43 | plt.plot(engineDataSensor_scaled) 44 | plt.xlabel('Engine cycle') 45 | plt.ylabel('Scaled sensor values') 46 | plt.title('Training sensor Data for engineID ' + str(engineID)) 47 | plt.box(False) 48 | 49 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | ## exploratory graphs (test) 51 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 52 | # get all sensor data for an engine ID 53 | engineID = 90 54 | engineDataAll = test_df.loc[test_df['id'] == engineID] 55 | engineDataSensor = engineDataAll.iloc[:, 5:] 56 | 57 | # normalize 58 | scalar = StandardScaler() 59 | engineDataSensor_scaled = scalar.fit_transform(engineDataSensor.values) 60 | 61 | # plot all sensor data for an engine ID 62 | plt.figure() 63 | plt.plot(engineDataSensor_scaled) 64 | plt.xlabel('Engine cycle') 65 | plt.ylabel('Scaled sensor values') 66 | plt.title('Test sensor Data for engineID ' + str(engineID)) 67 | plt.box(False) 68 | -------------------------------------------------------------------------------- /Chapter_RNN/PM_truth.txt: -------------------------------------------------------------------------------- 1 | 112 2 | 98 3 | 69 4 | 82 5 | 91 6 | 93 7 | 91 8 | 95 9 | 111 10 | 96 11 | 97 12 | 124 13 | 95 14 | 107 15 | 83 16 | 84 17 | 50 18 | 28 19 | 87 20 | 16 21 | 57 22 | 111 23 | 113 24 | 20 25 | 145 26 | 119 27 | 66 28 | 97 29 | 90 30 | 115 31 | 8 32 | 48 33 | 106 34 | 7 35 | 11 36 | 19 37 | 21 38 | 50 39 | 142 40 | 28 41 | 18 42 | 10 43 | 59 44 | 109 45 | 114 46 | 47 47 | 135 48 | 92 49 | 21 50 | 79 51 | 114 52 | 29 53 | 26 54 | 97 55 | 137 56 | 15 57 | 103 58 | 37 59 | 114 60 | 100 61 | 21 62 | 54 63 | 72 64 | 28 65 | 128 66 | 14 67 | 77 68 | 8 69 | 121 70 | 94 71 | 118 72 | 50 73 | 131 74 | 126 75 | 113 76 | 10 77 | 34 78 | 107 79 | 63 80 | 90 81 | 8 82 | 9 83 | 137 84 | 58 85 | 118 86 | 89 87 | 116 88 | 115 89 | 136 90 | 28 91 | 38 92 | 20 93 | 85 94 | 55 95 | 128 96 | 137 97 | 82 98 | 59 99 | 117 100 | 20 101 | -------------------------------------------------------------------------------- /Chapter_RNN/TEP_dataExploration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Exploring TEP data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import pyreadr 7 | fault_free_training_data = pyreadr.read_r('TEP_FaultFree_Training.RData')['fault_free_training'] # pandas dataframe 8 | fault_free_testing_data = pyreadr.read_r('TEP_FaultFree_Testing.RData')['fault_free_testing'] 9 | faulty_training_data = pyreadr.read_r('TEP_Faulty_Training.RData')['faulty_training'] 10 | faulty_testing_data = pyreadr.read_r('TEP_Faulty_Testing.RData')['faulty_testing'] 11 | 12 | #%% remove fault 3,9,15 data from faulty dataset 13 | faulty_training_data = faulty_training_data[faulty_training_data['faultNumber'] != 3] 14 | faulty_training_data = faulty_training_data[faulty_training_data['faultNumber'] != 9] 15 | faulty_training_data = 
faulty_training_data[faulty_training_data['faultNumber'] != 15] 16 | 17 | faulty_testing_data = faulty_testing_data[faulty_testing_data['faultNumber'] != 3] 18 | faulty_testing_data = faulty_testing_data[faulty_testing_data['faultNumber'] != 9] 19 | faulty_testing_data = faulty_testing_data[faulty_testing_data['faultNumber'] != 15] 20 | 21 | #%% sample process values from selected simulation runs 22 | # fault free 23 | faultFree_simulationData = fault_free_training_data[fault_free_training_data['simulationRun'] == 1] 24 | faultFree_simulationData = faultFree_simulationData.iloc[:,3:13] 25 | 26 | # faulty 27 | faulty_simulationData_1 = faulty_training_data[faulty_training_data['simulationRun'] == 1] 28 | faulty_simulationData_1 = faulty_simulationData_1[faulty_simulationData_1['faultNumber'] == 1] 29 | faulty_simulationData_1 = faulty_simulationData_1.iloc[:,3:13] 30 | 31 | faulty_simulationData_2 = faulty_training_data[faulty_training_data['simulationRun'] == 1] 32 | faulty_simulationData_2 = faulty_simulationData_2[faulty_simulationData_2['faultNumber'] == 8] 33 | faulty_simulationData_2 = faulty_simulationData_2.iloc[:,3:13] 34 | 35 | faulty_simulationData_3 = faulty_training_data[faulty_training_data['simulationRun'] == 1] 36 | faulty_simulationData_3 = faulty_simulationData_3[faulty_simulationData_3['faultNumber'] == 12] 37 | faulty_simulationData_3 = faulty_simulationData_3.iloc[:,3:13] 38 | 39 | #%% scale data 40 | from sklearn.preprocessing import StandardScaler 41 | 42 | scaler = StandardScaler() 43 | faultFree_simulationData_scaled = scaler.fit_transform(faultFree_simulationData) 44 | faulty_simulationData_1_scaled = scaler.transform(faulty_simulationData_1) 45 | faulty_simulationData_2_scaled = scaler.transform(faulty_simulationData_2) 46 | faulty_simulationData_3_scaled = scaler.transform(faulty_simulationData_3) 47 | 48 | #%% plots 49 | import matplotlib.pyplot as plt 50 | legendNames = ['signal' + str(i+1) for i in range(10)] 51 | 52 | plt.figure() 53 | plt.plot(faultFree_simulationData_scaled) 54 | plt.xlabel('Time step') 55 | plt.ylabel('Scaled values') 56 | plt.title('Training measurements for non-faulty data') 57 | plt.legend(legendNames, loc='upper left') 58 | 59 | plt.figure() 60 | plt.plot(faulty_simulationData_1_scaled) 61 | plt.xlabel('Time step') 62 | plt.ylabel('Scaled values') 63 | plt.title('Training measurements for fault 1 data') 64 | plt.legend(legendNames, loc='upper left') 65 | 66 | plt.figure() 67 | plt.plot(faulty_simulationData_2_scaled) 68 | plt.xlabel('Time step') 69 | plt.ylabel('Scaled values') 70 | plt.title('Training measurements for fault 8 data') 71 | plt.legend(legendNames, loc='upper left') 72 | 73 | plt.figure() 74 | plt.plot(faulty_simulationData_3_scaled) 75 | plt.xlabel('Time step') 76 | plt.ylabel('Scaled values') 77 | plt.title('Training measurements for fault 12 data') 78 | plt.legend(legendNames, loc='upper left') -------------------------------------------------------------------------------- /Chapter_RNN/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/keras_metadata.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/keras_metadata.pb
-------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/saved_model.pb -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/actor_saved/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_ReinforcementLearning/actor_saved/variables/variables.index -------------------------------------------------------------------------------- /Chapter_ReinforcementLearning/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/NumpyBasics.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Numpy Basics 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | # create a 2D array 5 | import numpy as np 6 | 7 | arr2D = np.array([[1,4,6],[2,5,7]]) 8 | 9 | # getting information about arr2D 10 | print(arr2D.size) # returns 6, the no. of items 11 | print(arr2D.ndim) # returns 2, the no. of dimensions 12 | print(arr2D.shape) # returns tuple(2,3) corresponding to 2 rows & 3 columns 13 | 14 | # create a 1D array 15 | arr1D = np.array([1,4,6]) 16 | 17 | # getting information about arr1D 18 | print(arr1D.size) # returns 3, the no. of items 19 | print(arr1D.ndim) # returns 1, the no. 
of dimensions 20 | print(arr1D.shape) # returns tuple(3,) corresponding to 3 items 21 | 22 | #%% creating numpy arrays 23 | # creating sequence of numbers 24 | arr1 = np.arange(3, 6) # same as Python range function; results in array([3,4,5]) 25 | arr2 = np.arange(3, 9, 2) # the 3rd argument defines the step size; results in array([3,5,7]) 26 | arr3 = np.linspace(1,7,3) # creates evenly spaced 3 values from 1 to 7; results in array([1,4,7]) 27 | 28 | # creating special arrays 29 | arr4 = np.ones((2,1)) # array of shape (2,1) with all items as 1 30 | arr5 = np.zeros((2,2)) # all items as zero; often used as placeholder array at beginning of script 31 | arr6 = np.eye(2) # diagonal items as 1 32 | 33 | # adding axis to existing arrays (e.g., converting 1D array to 2D array) 34 | print(arr1[:, np.newaxis]) 35 | arr7 = arr1[:, None] # same as above 36 | 37 | # combining / stacking arrays 38 | print(np.hstack((arr1, arr2))) # horizontally stacks passed arrays 39 | print(np.vstack((arr1, arr2))) # vertically stacks passed arrays 40 | print(np.hstack((arr5,arr4))) # array 4 added as a column into arr5 41 | print(np.vstack((arr5,arr6))) # rows of array 6 added onto arr5 42 | 43 | #%% basic numpy functions 44 | print(arr2D.sum(axis=0)) 45 | print(arr2D.sum(axis=1)) 46 | 47 | #%% indexing arrays 48 | # accessing individual items 49 | print(arr2D[1,2]) # returns 7 50 | 51 | # slicing 52 | arr8 = np.arange(10).reshape((2,5)) # rearrange the 1D array into shape (2,5) 53 | print((arr8[0:1,1:3])) 54 | print((arr8[0,1:3])) # note that a 1D array is returned here instead of the 2D array above 55 | 56 | # accessing entire row or column 57 | print(arr8[1]) # returns 2nd row as array([5,6,7,8,9]); same as arr8[1,:] 58 | print(arr8[:, 4]) # returns items of 5th column as a 1D array 59 | 60 | # extract a subarray from arr8 and modify it 61 | arr8_sub = arr8[:, :2] # columns 0 and 1 from all rows 62 | arr8_sub[1, 1] = 1000 63 | print(arr8) # arr8 gets modified as well!! 
64 | 65 | # use copy method for a separate copy 66 | arr8 = np.arange(10).reshape((2,5)) 67 | arr8_sub2 = arr8[:, :2].copy() 68 | arr8_sub2[1, 1] = 100 69 | print(arr8) 70 | 71 | # Fancy indexing 72 | # combination of simple and fancy indexing 73 | arr8_sub3 = arr8[:, [0, 1]] # note how columns are indexed via a list 74 | arr8_sub3[1, 1] = 100 # arr8_sub3 becomes same as arr8_sub2 but arr8 is not modified here 75 | print(arr8) 76 | 77 | # use boolean mask to select subarray 78 | arr8_sub4 = arr8[arr8 > 5] # returns array([6,7,8,9]), i.e., all values > 5 79 | arr8_sub4[0] = 0 # again, arr8 is not affected 80 | print(arr8) 81 | 82 | #%% vectorized operations 83 | vec1 = np.array([1,2,3,4]) 84 | vec2 = np.array([5,6,7,8]) 85 | vec_sum = vec1 + vec2 # returns array([6,8,10,12]); no need to loop through index 0 to 3 86 | 87 | # slightly more complex operation (computing distance between vectors) 88 | vec_distance = np.sqrt(np.sum((vec1 - vec2)**2)) # vec_distance = 8.0 89 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/PandasBasics.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Pandas Basics 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | # create a series (1D structure) 5 | import pandas as pd 6 | 7 | data = [10,8,6] 8 | s = pd.Series(data) # can pass numpy array as well 9 | print(s) 10 | 11 | # create a dataframe 12 | data = [[1,10],[1,8],[1,6]] 13 | df = pd.DataFrame(data, columns=['id', 'value']) 14 | print(df) 15 | 16 | # dataframe from series 17 | s2 = pd.Series([1,1,1]) 18 | df = pd.DataFrame({'id':s2, 'value':s}) 19 | print(df) 20 | 21 | #%% data access 22 | # column(s) selection 23 | print(df['id']) # returns column 'id' as a series 24 | print(df.id) # same as above 25 | print(df[['id']]) # returns specified columns in the list as a dataframe 26 | 27 | # row selection 28 | df.index = [100, 101, 102] # changing row indices from [0,1,2] to [100,101,102] 29 | print(df) 30 | print(df.loc[101]) # returns 2nd row as a series; can provide a list for multiple rows selection 31 | print(df.iloc[1]) # integer location-based selection; same result as above 32 | 33 | # individual item selection 34 | print(df.loc[101, 'value']) # returns 8 35 | print(df.iloc[1, 1]) # same as above 36 | 37 | #%% data aggregation example 38 | # create another dataframe using df 39 | df2 = df.copy() 40 | df2.id = 2 # make all items in column 'id' as 2 41 | df2.value *= 4 # multiply all items in column 'value' by 4 42 | print(df2) 43 | 44 | # combine df and df2 45 | df3 = df.append(df2) # a new object is returned unlike Python’s append function 46 | print(df3) 47 | 48 | # id-based mean values computation 49 | print(df3.groupby('id').mean()) # returns a dataframe 50 | 51 | #%% file I/O 52 | # reading from excel and csv files 53 | dataset1 = pd.read_excel('filename.xlsx') # several parameter options are available to customize what data is read 54 | dataset2 = pd.read_csv('filename.csv') 55 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/PythonBasics.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Python Basics 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 |
#%% basic data types 6 | i = 2 # integer; type(i) = int 7 | f = 1.2 # floating-point number; type(f) = float 8 | s = 'two' # string; type(s) = str 9 | b = True # boolean; type(b) = bool 10 | 11 | # basic operations 12 | print(i+2) # displays 4 13 | print(f*2) # displays 2.4 14 | print(not b) # displays False 15 | 16 | #%% ordered sequences 17 | # different ways of creating lists 18 | list1 = [2,4,6] 19 | list2 = ['air',3,1,5] 20 | list3 = list(range(4)) # equals [0,1,2,3]; range function returns a sequence of numbers starting from 0 (default) with increments of 1 (default) 21 | list3.append(8) # returns [0,1,2,3,8]; append function adds new items to existing list 22 | list4 = list1 + list2 # equals [2,4,6,'air',3,1,5] 23 | list5 = [list2, list3] # nested list [['air', 3, 1, 5], [0, 1, 2, 3,8]] 24 | 25 | # creating tuples 26 | tuple1 = (0,1,'two') 27 | tuple2 = (list1, list2) # equals ([2, 4, 6], ['air', 3, 1, 5]) 28 | 29 | #%% list comprehension 30 | # return powers of list items 31 | newList1 = [item**2 for item in list3] # equals [0,1,4,9, 64] 32 | # nested list comprehension 33 | newList2 = [item2**2 for item2 in [item**2 for item in list3]] # equals [0,1,16,81, 4096] 34 | 35 | #%% Indexing and slicing sequences 36 | # working with single item using positive or negative indexes 37 | print(list1[0]) # displays 2, the 1st item in list1 38 | list2[1] = 1 # list2 becomes ['air',1,1,5] 39 | print(list2[-2]) # displays 1, the 2nd last element in list2 40 | 41 | # accessing multiple items through slicing 42 | # Syntax: givenList[start:stop:step]; if unspecified, start=0, stop=list length, step=1 43 | print(list4[0:3]) # displays [2,4,6], the 1st, 2nd, 3rd items; note that index 3 item is excluded 44 | print(list4[:3]) # same as above 45 | print(list4[4:len(list4)]) # displays [3,1,5]; len() function returns the number of items in list 46 | print(list4[4:]) # same as above 47 | print(list4[::3]) # displays [2, 'air', 5] 48 | print(list4[::-1]) # displays list 4 backwards [5, 1, 3, 'air', 6, 4, 2] 49 | list4[2:4] = [0,0,0] # list 4 becomes [2, 4, 0, 0, 0, 3, 1, 5] 50 | 51 | #%% Execution control statements 52 | # conditional execution 53 | # selectively execute code based on condition 54 | if list1[0] > 0: 55 | list1[0] = 'positive' 56 | else: 57 | list1[0] = 'negative' 58 | 59 | # loop execution 60 | # code below computes sum of squares of numbers in list 3 61 | sum_of_squares = 0 62 | for i in range(len(list3)): 63 | sum_of_squares += list3[i]**2 64 | 65 | print(sum_of_squares) # displays 78 66 | 67 | #%% custom functions 68 | # define function instructions 69 | def sumSquares(givenList): 70 | sum_of_squares = 0 71 | for i in range(len(givenList)): 72 | sum_of_squares += givenList[i]**2 73 | 74 | return sum_of_squares 75 | 76 | # call/re-use the custom function multiple times 77 | print(sumSquares(list3)) # displays 78 78 | print(sumSquares(list4)) # displays 55 79 | 80 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/quadratic_raw_data.csv: -------------------------------------------------------------------------------- 1 | -5.000000000000000000e+00,1.900000000000000000e+01 2 | -4.400000000000000355e+00,1.800000000000000000e+01 3 | -4.349999999999999645e+00,1.780000000000000071e+01 4 | 
-4.099999999999999645e+00,1.600000000000000000e+01 5 | -4.049999999999999822e+00,1.219999999999999929e+01 6 | -4.000000000000000000e+00,1.200000000000000000e+01 7 | -3.799999999999999822e+00,7.000000000000000000e+00 8 | -3.700000000000000178e+00,6.000000000000000000e+00 9 | -3.500000000000000000e+00,5.500000000000000000e+00 10 | -3.000000000000000000e+00,6.500000000000000000e+00 11 | -2.899999999999999911e+00,6.480000000000000426e+00 12 | -2.799999999999999822e+00,6.450000000000000178e+00 13 | -2.000000000000000000e+00,2.500000000000000000e+00 14 | -1.949999999999999956e+00,2.450000000000000178e+00 15 | -1.350000000000000089e+00,0.000000000000000000e+00 16 | -1.300000000000000044e+00,-2.500000000000000000e-01 17 | -1.149999999999999911e+00,-3.000000000000000000e+00 18 | 0.000000000000000000e+00,-1.000000000000000000e+00 19 | 2.000000000000000111e-01,-1.000000000000000056e-01 20 | 2.999999999999999889e-01,-1.199999999999999956e-01 21 | 5.000000000000000000e-01,-1.100000000000000006e-01 22 | 1.199999999999999956e+00,-2.000000000000000000e+00 23 | 1.699999999999999956e+00,1.000000000000000000e+00 24 | 1.800000000000000044e+00,1.399999999999999911e+00 25 | 1.899999999999999911e+00,1.449999999999999956e+00 26 | -------------------------------------------------------------------------------- /Chapter_ScriptingEnvironment/typicalML_script.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Linear regression model 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import libraries 6 | import numpy as np 7 | from sklearn.preprocessing import PolynomialFeatures 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.metrics import r2_score 11 | import matplotlib.pyplot as plt 12 | 13 | #%% read data 14 | data = np.loadtxt('quadratic_raw_data.csv', delimiter=',') 15 | x = data[:,0:1]; y = data[:,1:] # equivalent to y = data[:,1,None] which returns 2D array 16 | 17 | #%% Pre-process / Feature engineering 18 | poly = PolynomialFeatures(degree=2, include_bias=False) 19 | X_poly = poly.fit_transform(x) # X_poly: 1st column is x, 2nd column is x^2 20 | 21 | #%% scale model input variables 22 | scaler = StandardScaler() 23 | X_scaled = scaler.fit_transform(X_poly) 24 | 25 | #%% fit linear model & predict 26 | model = LinearRegression() 27 | model.fit(X_poly, y) 28 | y_predicted = model.predict(X_poly) 29 | 30 | #%% Assess model accuracy 31 | print('Fit accuracy = ', r2_score(y, y_predicted)) 32 | 33 | #%% plot predictions 34 | plt.figure(figsize=(4, 2)) 35 | plt.plot(x, y, 'o', label='raw data') 36 | plt.plot(x, y_predicted, label='quadratic fit') 37 | plt.legend() 38 | plt.xlabel('x'), plt.ylabel('y') 39 | 40 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/Metal_etch_2DPCA_testData.csv: -------------------------------------------------------------------------------- 1 | 1.723544992180245927e+00,9.056845222188112388e+00 2 | -1.195362798389852976e+01,1.971340850279771573e+01 3 | -1.405180395370675228e+01,1.003786454500131242e+01 4 | -2.296861786600156208e+01,5.985086921969785578e+01 5 | -8.375268274972489380e+00,1.115643566056243685e+01 6 | -1.173779751252286374e+01,1.000032875202828109e+01 7 | -1.077657111425542347e+01,-3.574611061778244903e+01 8 | -1.546675594585076574e+01,2.381405848082144772e+01 9 | 
-1.152423531101815612e+01,8.593634005642147855e+00 10 | 6.834573733321335220e-01,-1.356739897215637569e+01 11 | -1.144209951547643556e+01,-1.816846025077951765e+01 12 | -9.969315306557533063e+00,-3.025019322744546102e+01 13 | -1.444956309528516236e+01,2.780732221277049376e+01 14 | -1.618541901915454417e+01,-1.541740007231091170e+01 15 | 1.236806249975188088e+01,5.187754758901448326e+00 16 | 1.774901749020017405e+01,1.940141924912499860e+01 17 | 1.876992176399419776e+01,8.031281916261185927e+00 18 | 2.659171665908793258e+01,-1.896470000938777289e+01 19 | 2.257578315399075564e+01,1.854098261967717676e+01 20 | 3.469430765643569714e+01,-1.916772089195522444e+00 21 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVDD_FaultDetection.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Process Fault Detection via SVDD in metal etch dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | 8 | X_train = np.loadtxt('Metal_etch_2DPCA_trainingData.csv', delimiter=',') 9 | 10 | #%% bandwidth via modified mean criteria 11 | import scipy.spatial 12 | 13 | N = X_train.shape[0] 14 | phi = 1/np.log(N-1) 15 | delta = -0.14818008*np.power(phi,4) + 0.2846623624*np.power(phi,3) - 0.252853808*np.power(phi,2) + 0.159059498*phi - 0.001381145 16 | D2 = np.sum(scipy.spatial.distance.pdist(X_train, 'sqeuclidean'))/(N*(N-1)/2) # pdist computes pairwise distances between observations 17 | sigma = np.sqrt(D2/np.log((N-1)/delta*delta)) 18 | gamma = 1/(2*sigma*sigma) 19 | 20 | #%% SVM fit 21 | from sklearn.svm import OneClassSVM 22 | 23 | model = OneClassSVM(nu=0.01, gamma=0.025).fit(X_train) # nu corresponds to f 24 | 25 | #%% predict for test data 26 | X_test = np.loadtxt('Metal_etch_2DPCA_testData.csv', delimiter=',') 27 | y_test = model.predict(X_test) # y=-1 for outliers 28 | 29 | print('Number of faults identified: ', np.sum(y_test == -1), ' out of ', len(y_test)) 30 | 31 | #%% plot SVDD boundaries 32 | import matplotlib.pyplot as plt 33 | 34 | plt.figure() 35 | plt.scatter(X_train[:, 0], X_train[:, 1], edgecolors='k', alpha=0.8) 36 | plt.xlabel('PC1 scores') 37 | plt.ylabel('PC2 scores') 38 | 39 | # get axis limits 40 | ax = plt.gca() 41 | xlim = ax.get_xlim() 42 | ylim = ax.get_ylim() 43 | 44 | # create grid to evaluate model 45 | xx = np.linspace(xlim[0], xlim[1], 100) 46 | yy = np.linspace(ylim[0], ylim[1], 100) 47 | YY, XX = np.meshgrid(yy, xx) 48 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 49 | Z = model.decision_function(xy).reshape(XX.shape) 50 | 51 | # plot decision boundary and supporting planes 52 | ax.contour(XX, YY, Z, levels=[0], alpha=0.9, linestyles=['-'], colors=['red']) 53 | 54 | #%% plot test data 55 | plt.scatter(X_test[y_test==-1, 0],X_test[y_test==-1,1], c='red', marker = '*', label='True Positive') 56 | plt.scatter(X_test[y_test==1, 0],X_test[y_test==1,1], c='magenta', marker = '*', label='False Negative') 57 | plt.legend() 58 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVDD_OneClassClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Nonlinear boundary generation via One Class SVM / SVDD 3 | ## 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% generate data 6 | import numpy as np 7 | 8 | X = np.loadtxt('SVDD_toyDataset.csv', delimiter=',') 9 | 10 | #%% compute bandwidth via modified mean criteria 11 | import scipy.spatial 12 | 13 | N = X.shape[0] 14 | phi = 1/np.log(N-1) 15 | delta = -0.14818008*np.power(phi,4) + 0.2846623624*np.power(phi,3) - 0.252853808*np.power(phi,2) + 0.159059498*phi - 0.001381145 16 | D2 = np.sum(scipy.spatial.distance.pdist(X, 'sqeuclidean'))/(N*(N-1)/2) # pdist computes pairwise distances between observations 17 | sigma = np.sqrt(D2/np.log((N-1)/delta*delta)) 18 | gamma = 1/(2*sigma*sigma) 19 | 20 | #%% SVM fit 21 | from sklearn.svm import OneClassSVM 22 | 23 | model = OneClassSVM(nu=0.01, gamma=5) 24 | model.fit(X) 25 | 26 | #%% plot SVM boundaries 27 | import matplotlib.pyplot as plt 28 | 29 | plt.figure() 30 | plt.scatter(X[:, 0], X[:, 1], edgecolors='k', alpha=0.8) 31 | plt.xlabel('x1') 32 | plt.ylabel('x2') 33 | 34 | # get axis limits 35 | ax = plt.gca() 36 | xlim = ax.get_xlim() 37 | ylim = ax.get_ylim() 38 | 39 | # create grid to evaluate model 40 | xx = np.linspace(xlim[0], xlim[1], 100) 41 | yy = np.linspace(ylim[0], ylim[1], 100) 42 | YY, XX = np.meshgrid(yy, xx) 43 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 44 | Z = model.decision_function(xy).reshape(XX.shape) 45 | 46 | # plot decision boundary and supporting planes 47 | ax.contour(XX, YY, Z, levels=[0], alpha=0.9, linestyles=['-'], colors=['red']) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVM_BinaryClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Binary classification via SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | 8 | data = np.loadtxt('toyDataset.csv', delimiter=',') 9 | X = data[:, [0, 1]]; y = data[:, 2] 10 | 11 | #%% scale model inputs 12 | from sklearn.preprocessing import StandardScaler 13 | 14 | scaler = StandardScaler() 15 | X_scaled = scaler.fit_transform(X) 16 | 17 | #%% fit SVM model 18 | from sklearn.svm import SVC # for large datasets LinearSVC class is preferable 19 | 20 | model = SVC(kernel='linear', C=100) 21 | model.fit(X_scaled, y) 22 | 23 | #%% get details of support vectors 24 | print('# of support vectors:', len(model.support_)) 25 | 26 | #%% plot SVM boundaries 27 | import matplotlib.pyplot as plt 28 | 29 | plt.figure() 30 | plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 31 | plt.xlabel('X1'), plt.ylabel('X2') 32 | 33 | # get axis limits 34 | ax = plt.gca() 35 | xlim = ax.get_xlim() 36 | ylim = ax.get_ylim() 37 | 38 | # create grid to evaluate model 39 | xx = np.linspace(xlim[0], xlim[1], 100) 40 | yy = np.linspace(ylim[0], ylim[1], 100) 41 | YY, XX = np.meshgrid(yy, xx) 42 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 43 | Z = model.decision_function(xy).reshape(XX.shape) 44 | 45 | # plot decision boundary and supporting planes 46 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) 47 | 48 | # highlight support vectors 49 | ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=200, linewidth=2, alpha=0.25) 50 | 51 | -------------------------------------------------------------------------------- 
/Chapter_SupportVectorMachines/SVM_Kernel_BinaryClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Nonlinear binary classification via kernel SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% generate data 6 | import matplotlib.pyplot as plt 7 | from sklearn.datasets import make_circles 8 | 9 | X, y = make_circles(500, factor=.08, noise=.1, random_state=1) 10 | # note that y = 0,1 here and need not be +-1; SVM does internal transformation accordingly 11 | 12 | # plot 13 | plt.figure() 14 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 15 | plt.xlabel('x1') 16 | plt.ylabel('x2') 17 | plt.title('raw data') 18 | 19 | #%% find optimal hyperparameter via GridSearchCV 20 | from sklearn.svm import SVC 21 | from sklearn.model_selection import GridSearchCV 22 | 23 | param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[0.01, 0.1, 1, 10, 100]} 24 | gs = GridSearchCV(SVC(), param_grid, cv=5).fit(X, y) # no scaling required as input variables are already scaled 25 | 26 | print('Optimal hyperparameter:', gs.best_params_) 27 | 28 | #%% plot model predictions 29 | y_predicted = gs.predict(X) 30 | 31 | # plot 32 | plt.figure() 33 | plt.scatter(X[:, 0], X[:, 1], c=y_predicted, cmap=plt.cm.Paired, edgecolors='k') 34 | plt.xlabel('x1') 35 | plt.ylabel('x2') 36 | plt.title('predictions') 37 | 38 | #%% plot SVM boundaries 39 | plt.figure() 40 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 41 | plt.xlabel('X1'), plt.ylabel('X2') 42 | 43 | # get axis limits 44 | ax = plt.gca() 45 | xlim = ax.get_xlim() 46 | ylim = ax.get_ylim() 47 | 48 | # create grid to evaluate model 49 | import numpy as np 50 | xx = np.linspace(xlim[0], xlim[1], 100) 51 | yy = np.linspace(ylim[0], ylim[1], 100) 52 | YY, XX = np.meshgrid(yy, xx) 53 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 54 | Z = gs.decision_function(xy).reshape(XX.shape) 55 | 56 | # plot decision boundary and supporting planes 57 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVM_Kernel_BinaryClassification_noGridSearch.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Nonlinear binary classification via kernel SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% generate data 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from sklearn.datasets import make_circles 9 | 10 | X, y = make_circles(500, factor=.08, noise=.1, random_state=1) 11 | # note that y = 0,1 here and need not be +-1; SVM does internal transformation accordingly 12 | 13 | # plot 14 | plt.figure() 15 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 16 | plt.xlabel('x1') 17 | plt.ylabel('x2') 18 | plt.title('raw data') 19 | 20 | #%% SVM fit 21 | from sklearn.svm import SVC 22 | 23 | model = SVC(C=100, gamma=1) 24 | model.fit(X, y) # no scaling required as input variables are already scaled 25 | 26 | #%% plot model predictions 27 | y_predicted = model.predict(X) 28 | 29 | # plot 30 | plt.figure() 31 | plt.scatter(X[:, 0], X[:, 1], c=y_predicted, 
cmap=plt.cm.Paired, edgecolors='k') 32 | plt.xlabel('x1') 33 | plt.ylabel('x2') 34 | plt.title('predictions') 35 | 36 | #%% plot SVM boundaries 37 | plt.figure() 38 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 39 | plt.xlabel('x1') 40 | plt.ylabel('x2') 41 | 42 | # get axis limits 43 | ax = plt.gca() 44 | xlim = ax.get_xlim() 45 | ylim = ax.get_ylim() 46 | 47 | # create grid to evaluate model 48 | xx = np.linspace(xlim[0], xlim[1], 100) 49 | yy = np.linspace(ylim[0], ylim[1], 100) 50 | YY, XX = np.meshgrid(yy, xx) 51 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 52 | Z = model.decision_function(xy).reshape(XX.shape) 53 | 54 | # plot decision boundary and supporting planes 55 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVM_SoftMarginClassification.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Binary classification via soft margin SVM on toy dataset 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% read data 6 | import numpy as np 7 | 8 | data = np.loadtxt('toyDataset2.csv', delimiter=',') 9 | X = data[:,0:2]; y = data[:,2] 10 | 11 | #%% scale model inputs 12 | from sklearn.preprocessing import StandardScaler 13 | 14 | scaler = StandardScaler() 15 | X_scaled = scaler.fit_transform(X) 16 | 17 | #%% SVM fit 18 | from sklearn.svm import SVC 19 | 20 | model = SVC(kernel='linear', C=100) 21 | model.fit(X_scaled, y) 22 | 23 | #%% get details of support vectors 24 | print('# of support vectors:', len(model.support_)) 25 | # The BAD sample lying on the wrong side of the support plane is also a support vector 26 | 27 | #%% plot SVM boundaries 28 | import matplotlib.pyplot as plt 29 | 30 | plt.figure() 31 | plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k') 32 | 33 | # get axis limits 34 | ax = plt.gca() 35 | xlim = ax.get_xlim() 36 | ylim = ax.get_ylim() 37 | 38 | # create grid to evaluate model 39 | xx = np.linspace(xlim[0], xlim[1], 100) 40 | yy = np.linspace(ylim[0], ylim[1], 100) 41 | YY, XX = np.meshgrid(yy, xx) 42 | xy = np.vstack([XX.ravel(), YY.ravel()]).T 43 | Z = model.decision_function(xy).reshape(XX.shape) 44 | 45 | # plot decision boundary and supporting planes 46 | ax.contour(XX, YY, Z, levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'], colors=['green', 'red', 'green']) 47 | 48 | # highlight support vectors 49 | ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=200, linewidth=2, alpha=0.25) 50 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/SVR_illustration.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## SVR quadratic fitting 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import 6 | import numpy as np 7 | np.random.seed(1) 8 | 9 | #%% generate data 10 | x = np.linspace(-1, 1, 50)[:, None] 11 | y = x*x + 0.25 12 | y = y + np.random.normal(0, 0.15, (50,1)) 13 | 14 | #%% plot 15 | import matplotlib.pyplot as plt 16 | plt.figure() 17 | plt.scatter(x,y,edgecolors='k', alpha=0.8) 18 | plt.xlabel('x'), 
plt.ylabel('y') 19 | 20 | #%% fit SVR model 21 | from sklearn.svm import SVR 22 | 23 | epsilon = 0.1 24 | model = SVR(gamma=0.5, C=10, epsilon=epsilon) 25 | model.fit(x, y) 26 | 27 | #%% predict 28 | xx = np.linspace(-1, 1, 200)[:, None] 29 | yy_predicted = model.predict(xx) 30 | yy_epsilon_tube_upper = yy_predicted + epsilon 31 | yy_epsilon_tube_lower = yy_predicted - epsilon 32 | 33 | #%% get support vectors 34 | x_SVs = model.support_vectors_ 35 | y_SVs = y[model.support_] 36 | 37 | #%% plot 38 | plt.figure() 39 | plt.scatter(x,y,edgecolors='k', alpha=0.8) 40 | plt.plot(xx, yy_predicted, 'r') 41 | plt.plot(xx, yy_epsilon_tube_upper, '--g') 42 | plt.plot(xx, yy_epsilon_tube_lower, '--g') 43 | plt.scatter(x_SVs, y_SVs, s=200, linewidth=2, edgecolors='m', alpha=0.15) 44 | plt.xlabel('x'), plt.ylabel('y') 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/debutanizer_Softsensing_PLS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% separate train and test data 13 | from sklearn.model_selection import train_test_split 14 | X = data[:,0:-1] 15 | y = data[:,-1][:,np.newaxis] 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 17 | 18 | #%% scale data 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | X_scaler = StandardScaler() 22 | X_train_normal = X_scaler.fit_transform(X_train) 23 | X_test_normal = X_scaler.transform(X_test) 24 | 25 | y_scaler = StandardScaler() 26 | y_train_normal = y_scaler.fit_transform(y_train) 27 | y_test_normal = y_scaler.transform(y_test) 28 | 29 | #%% Finding # latents using kFold cross validation 30 | from sklearn.model_selection import KFold 31 | from sklearn.metrics import mean_squared_error 32 | from sklearn.cross_decomposition import PLSRegression 33 | 34 | scaler = StandardScaler() 35 | 36 | fit_MSE = [] 37 | validate_MSE = [] 38 | for n_comp in range(1,8): 39 | local_fit_MSE = [] 40 | local_validate_MSE = [] 41 | 42 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 43 | for fit_index, validate_index in kfold.split(y_train): 44 | X_fit_normal = scaler.fit_transform(X_train[fit_index]) 45 | X_validate_normal = scaler.transform(X_train[validate_index]) 46 | 47 | y_fit_normal = scaler.fit_transform(y_train[fit_index]) 48 | y_validate_normal = scaler.transform(y_train[validate_index]) 49 | 50 | pls = PLSRegression(n_components = n_comp) 51 | pls.fit(X_fit_normal, y_fit_normal) 52 | 53 | local_fit_MSE.append(mean_squared_error(y_fit_normal, pls.predict(X_fit_normal))) 54 | local_validate_MSE.append(mean_squared_error(y_validate_normal, 55 | pls.predict(X_validate_normal))) 56 | 57 | fit_MSE.append(np.mean(local_fit_MSE)) 58 | validate_MSE.append(np.mean(local_validate_MSE)) 59 | 60 | 61 | # plot 62 | plt.figure() 63 | plt.plot(range(1,8), fit_MSE, 'b*', label = 'Training MSE') 64 | plt.plot(range(1,8), validate_MSE, 'r*', label = 'Validation MSE') 65 | plt.xticks(range(1,8)) 66 | plt.ylabel('Mean Squared Error (MSE)') 67 | plt.xlabel('# of latents') 68 | plt.legend() 69 | 70 | 
#%% build PLS model 71 | pls = PLSRegression(n_components = 5) 72 | pls.fit(X_train_normal, y_train_normal) 73 | 74 | #%% check training vs test accuracy 75 | print('Accuracy over training data: ', pls.score(X_train_normal, y_train_normal)) 76 | print('Accuracy over test data: ', pls.score(X_test_normal, y_test_normal)) 77 | 78 | #%% plots of raw and predicted data 79 | y_train_normal_predict = pls.predict(X_train_normal) 80 | y_test_normal_predict = pls.predict(X_test_normal) 81 | 82 | y_train_predict = y_scaler.inverse_transform(y_train_normal_predict) 83 | y_test_predict = y_scaler.inverse_transform(y_test_normal_predict) 84 | 85 | 86 | plt.figure() 87 | plt.plot(y_train, 'b', label = 'Raw data') 88 | plt.plot(y_train_predict, 'r', label = 'PLS prediction') 89 | plt.ylabel('C4 content (training data)') 90 | plt.xlabel('Sample #') 91 | plt.legend() 92 | 93 | 94 | plt.figure() 95 | plt.plot(y_test, 'b', label = 'Raw data') 96 | plt.plot(y_test_predict, 'r', label = 'PLS prediction') 97 | plt.ylabel('C4 content (test data)') 98 | plt.xlabel('Sample #') 99 | plt.legend() 100 | 101 | plt.figure() 102 | plt.plot(y_train, y_train_predict, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 103 | plt.plot(y_train, y_train, '-r', linewidth=0.5) 104 | plt.xlabel('C4 content (raw training data)') 105 | plt.ylabel('C4 content (prediction)') 106 | 107 | plt.figure() 108 | plt.plot(y_test, y_test_predict, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 109 | plt.plot(y_test, y_test, '-r', linewidth=0.5) 110 | plt.xlabel('C4 content (raw test data)') 111 | plt.ylabel('C4 content (prediction)') 112 | 113 | #%% residuals 114 | plt.figure() 115 | plt.plot(y_test, y_test-y_test_predict, '*') 116 | plt.xlabel('C4 content test data') 117 | plt.ylabel('residual (raw data- prediction)') 118 | plt.title('residual plot') -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/debutanizer_Softsensing_SVR.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## SVR model with debutanizer data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('debutanizer_data.txt', skiprows=5) 11 | 12 | #%% separate train and test data 13 | from sklearn.model_selection import train_test_split 14 | X = data[:,0:-1] 15 | y = data[:,-1] 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 100) 17 | 18 | #%% fit SVR model via grid-search 19 | from sklearn.svm import SVR 20 | from sklearn.model_selection import GridSearchCV 21 | 22 | model = SVR(epsilon=0.05) 23 | param_grid = [{'gamma': np.linspace(1,10,10), 'C': np.linspace(0.01,500,10)}] 24 | gs = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=10, verbose=2) 25 | 26 | gs.fit(X_train, y_train) 27 | print('Optimal hyperparameter:', gs.best_params_) 28 | 29 | #%% predict using the best model 30 | y_train_predicted = gs.predict(X_train) 31 | y_test_predicted = gs.predict(X_test) 32 | 33 | #%% plots of raw and predicted data 34 | plt.figure() 35 | plt.plot(y_train, 'b', label = 'Raw data') 36 | plt.plot(y_train_predicted, 'r', label = 'SVR prediction') 37 | plt.ylabel('C4 content (training data)') 38 | plt.xlabel('Sample #') 39 | plt.legend() 40 | 41 | 42 | plt.figure() 43 
| plt.plot(y_test, 'b', label = 'Raw data') 44 | plt.plot(y_test_predicted, 'r', label = 'SVR prediction') 45 | plt.ylabel('C4 content (test data)') 46 | plt.xlabel('Sample #') 47 | plt.legend() 48 | 49 | plt.figure() 50 | plt.plot(y_train, y_train_predicted, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 51 | plt.plot(y_train, y_train, '-r', linewidth=0.5) 52 | plt.xlabel('C4 content (raw training data)') 53 | plt.ylabel('C4 content (prediction)') 54 | 55 | plt.figure() 56 | plt.plot(y_test, y_test_predicted, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 57 | plt.plot(y_test, y_test, '-r', linewidth=0.5) 58 | plt.xlabel('C4 content (raw test data)') 59 | plt.ylabel('C4 content (prediction)') 60 | 61 | #%% residuals 62 | plt.figure() 63 | plt.plot(y_test, y_test-y_test_predicted, '*') 64 | plt.xlabel('C4 content test data') 65 | plt.ylabel('residual (raw data- prediction)') 66 | plt.title('residual plot') 67 | 68 | #%% check training vs test accuracy 69 | from sklearn.metrics import r2_score 70 | print('Accuracy over training data: ', r2_score(y_train, y_train_predicted)) 71 | print('Accuracy over test data: ', r2_score(y_test, y_test_predicted)) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/polymerPlantData_Softsensing_PLS.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## PLS model with polymer plant data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('polymer.dat') 11 | X = data[:,0:10] 12 | Y = data[:,10:] 13 | y = Y[:,3:] 14 | 15 | #%% scale data 16 | from sklearn.preprocessing import StandardScaler 17 | 18 | X_scaler = StandardScaler() 19 | X_scaled = X_scaler.fit_transform(X) 20 | 21 | y_scaler = StandardScaler() 22 | y_scaled = y_scaler.fit_transform(y) 23 | 24 | #%% Finding # latents using kFold cross validation 25 | from sklearn.model_selection import KFold 26 | from sklearn.metrics import mean_squared_error 27 | from sklearn.cross_decomposition import PLSRegression 28 | 29 | scaler = StandardScaler() 30 | 31 | fit_MSE = [] 32 | validate_MSE = [] 33 | for n_comp in range(1,10): 34 | local_fit_MSE = [] 35 | local_validate_MSE = [] 36 | 37 | kfold = KFold(n_splits = 10, shuffle = True, random_state = 100) 38 | for fit_index, validate_index in kfold.split(y): 39 | X_fit_scaled = scaler.fit_transform(X[fit_index]) 40 | X_validate_scaled = scaler.transform(X[validate_index]) 41 | 42 | y_fit_scaled = scaler.fit_transform(y[fit_index]) 43 | y_validate_scaled = scaler.transform(y[validate_index]) 44 | 45 | pls = PLSRegression(n_components = n_comp) 46 | pls.fit(X_fit_scaled, y_fit_scaled) 47 | 48 | local_fit_MSE.append(mean_squared_error(y_fit_scaled, pls.predict(X_fit_scaled))) 49 | local_validate_MSE.append(mean_squared_error(y_validate_scaled, 50 | pls.predict(X_validate_scaled))) 51 | 52 | fit_MSE.append(np.mean(local_fit_MSE)) 53 | validate_MSE.append(np.mean(local_validate_MSE)) 54 | 55 | 56 | # plot 57 | plt.figure() 58 | plt.plot(range(1,10), fit_MSE, 'b*', label = 'Training MSE') 59 | 
plt.plot(range(1,10), validate_MSE, 'r*', label = 'Validation MSE') 60 | plt.xticks(range(1,10)) 61 | plt.ylabel('Mean Squared Error (MSE)') 62 | plt.xlabel('# of latents') 63 | plt.legend() 64 | 65 | 66 | #%% build PLS model and predict 67 | from sklearn.cross_decomposition import PLSRegression 68 | 69 | pls = PLSRegression(n_components = 5) 70 | pls.fit(X_scaled, y_scaled) 71 | 72 | y_predicted_PLS = y_scaler.inverse_transform(pls.predict(X_scaled)) 73 | 74 | #%% plots of raw and predicted data 75 | plt.figure() 76 | plt.plot(y, y_predicted_PLS, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9, markerfacecolor = 'C4') 77 | plt.plot(y, y, '-r', linewidth=0.5) 78 | plt.xlabel('measured data') 79 | plt.ylabel('predicted data ') 80 | 81 | #%% metrics 82 | from sklearn.metrics import r2_score 83 | print('R2:', r2_score(y, y_predicted_PLS)) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/polymerPlantData_Softsensing_SVR.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## SVR model with polymer plant data 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | #%% read data 10 | data = np.loadtxt('polymer.dat') 11 | X = data[:,0:10] 12 | Y = data[:,10:] 13 | y = Y[:,2] 14 | 15 | #%% fit SVR model 16 | from sklearn.svm import SVR 17 | from sklearn.model_selection import GridSearchCV 18 | 19 | model = SVR(epsilon=0.01) # default epsilon = 0.1 20 | param_grid = [{'gamma': np.linspace(0.1e-05,5,100), 'C': np.linspace(0.01,5000,100)}] 21 | gs = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=10, verbose=2) 22 | 23 | gs.fit(X, y) 24 | print('Optimal hyperparameter:', gs.best_params_) 25 | 26 | #%% predict using the best model 27 | y_predicted_SVR = gs.predict(X) 28 | 29 | #%% plots of raw and predicted data 30 | plt.figure() 31 | plt.plot(y, y_predicted_SVR, '.', markeredgecolor='k', markeredgewidth=0.5, ms=9) 32 | plt.plot(y, y, '-r', linewidth=0.5) 33 | plt.xlabel('measured data'), plt.ylabel('predicted data ') 34 | 35 | #%% metrics 36 | from sklearn.metrics import r2_score 37 | print('R2:', r2_score(y, y_predicted_SVR)) -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/toyDataset.csv: -------------------------------------------------------------------------------- 1 | 1.178862847343031817e+00,1.043650985051199021e+00,-1.000000000000000000e+00 2 | 1.009649746807200765e+00,8.136507296635508979e-01,-1.000000000000000000e+00 3 | 9.722611797485600782e-01,9.645241020731013526e-01,-1.000000000000000000e+00 4 | 9.917258518517539922e-01,9.372999323176152142e-01,-1.000000000000000000e+00 5 | 9.956181831024071283e-01,9.522781969640496946e-01,-1.000000000000000000e+00 6 | 8.686135246637317620e-01,1.088462238049958453e+00,-1.000000000000000000e+00 7 | 1.088131804220753063e+00,1.170957306365294937e+00,-1.000000000000000000e+00 8 | 1.005003364217686102e+00,9.595322585399108650e-01,-1.000000000000000000e+00 9 | 9.454640052380469672e-01,8.453522684417031918e-01,-1.000000000000000000e+00 10 | 1.098236743425816009e+00,8.898932369888523652e-01,-1.000000000000000000e+00 11 | 8.814953472979827342e-01,9.794350100577459139e-01,-1.000000000000000000e+00 12 | 
1.148614835507459020e+00,1.023671626722691297e+00,-1.000000000000000000e+00 13 | 8.976214860073531421e-01,9.287006799887950192e-01,-1.000000000000000000e+00 14 | 1.062524496616283010e+00,9.839486636813076226e-01,-1.000000000000000000e+00 15 | 9.231163649680770300e-01,9.769969277722061474e-01,-1.000000000000000000e+00 16 | 1.789406751968644516e+00,1.487133293975156256e+00,1.000000000000000000e+00 17 | 1.550705200525287486e+00,1.174829970657395695e+00,1.000000000000000000e+00 18 | 1.603548068650810787e+00,9.597100192185596956e-01,1.000000000000000000e+00 19 | 1.589144957396505298e+00,1.127134908698859572e+00,1.000000000000000000e+00 20 | 1.834877355074896244e+00,1.234170292063891949e+00,1.000000000000000000e+00 21 | 1.505205746499770347e+00,1.327601054272420589e+00,1.000000000000000000e+00 22 | 1.657247508866390495e+00,1.040823075565584954e+00,1.000000000000000000e+00 23 | 1.628402042997896038e+00,1.179368674437410780e+00,1.000000000000000000e+00 24 | 1.595134124268523967e+00,1.253565657843322079e+00,1.000000000000000000e+00 25 | 1.430209067890807262e+00,1.217868576218473331e+00,1.000000000000000000e+00 26 | 1.821582013026379343e+00,1.352335740914497819e+00,1.000000000000000000e+00 27 | 1.832982499992199088e+00,1.384326878638266978e+00,1.000000000000000000e+00 28 | 1.878505175839104702e+00,1.115803917871956319e+00,1.000000000000000000e+00 29 | 1.801500008846861789e+00,1.026693256526946429e+00,1.000000000000000000e+00 30 | 1.627653787519133699e+00,1.020263354791302257e+00,1.000000000000000000e+00 31 | -------------------------------------------------------------------------------- /Chapter_SupportVectorMachines/toyDataset2.csv: -------------------------------------------------------------------------------- 1 | 1.178862847343031817e+00,1.043650985051199021e+00,-1.000000000000000000e+00 2 | 1.009649746807200765e+00,8.136507296635508979e-01,-1.000000000000000000e+00 3 | 9.722611797485600782e-01,9.645241020731013526e-01,-1.000000000000000000e+00 4 | 9.917258518517539922e-01,9.372999323176152142e-01,-1.000000000000000000e+00 5 | 9.956181831024071283e-01,9.522781969640496946e-01,-1.000000000000000000e+00 6 | 8.686135246637317620e-01,1.088462238049958453e+00,-1.000000000000000000e+00 7 | 1.088131804220753063e+00,1.170957306365294937e+00,-1.000000000000000000e+00 8 | 1.005003364217686102e+00,9.595322585399108650e-01,-1.000000000000000000e+00 9 | 9.454640052380469672e-01,8.453522684417031918e-01,-1.000000000000000000e+00 10 | 1.098236743425816009e+00,8.898932369888523652e-01,-1.000000000000000000e+00 11 | 8.814953472979827342e-01,9.794350100577459139e-01,-1.000000000000000000e+00 12 | 1.148614835507459020e+00,1.023671626722691297e+00,-1.000000000000000000e+00 13 | 8.976214860073531421e-01,9.287006799887950192e-01,-1.000000000000000000e+00 14 | 1.062524496616283010e+00,9.839486636813076226e-01,-1.000000000000000000e+00 15 | 9.231163649680770300e-01,9.769969277722061474e-01,-1.000000000000000000e+00 16 | 1.789406751968644516e+00,1.487133293975156256e+00,1.000000000000000000e+00 17 | 1.550705200525287486e+00,1.174829970657395695e+00,1.000000000000000000e+00 18 | 1.603548068650810787e+00,9.597100192185596956e-01,1.000000000000000000e+00 19 | 1.589144957396505298e+00,1.127134908698859572e+00,1.000000000000000000e+00 20 | 1.834877355074896244e+00,1.234170292063891949e+00,1.000000000000000000e+00 21 | 1.505205746499770347e+00,1.327601054272420589e+00,1.000000000000000000e+00 22 | 1.657247508866390495e+00,1.040823075565584954e+00,1.000000000000000000e+00 23 | 
1.628402042997896038e+00,1.179368674437410780e+00,1.000000000000000000e+00 24 | 1.595134124268523967e+00,1.253565657843322079e+00,1.000000000000000000e+00 25 | 1.430209067890807262e+00,1.217868576218473331e+00,1.000000000000000000e+00 26 | 1.821582013026379343e+00,1.352335740914497819e+00,1.000000000000000000e+00 27 | 1.832982499992199088e+00,1.384326878638266978e+00,1.000000000000000000e+00 28 | 1.878505175839104702e+00,1.115803917871956319e+00,1.000000000000000000e+00 29 | 1.801500008846861789e+00,1.026693256526946429e+00,1.000000000000000000e+00 30 | 1.627653787519133699e+00,1.020263354791302257e+00,1.000000000000000000e+00 31 | 1.699999999999999956e+00,1.100000000000000089e+00,-1.000000000000000000e+00 32 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/FDD.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Hello World Web App 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import packages 6 | import cherrypy 7 | 8 | #%% FDD tool Web application 9 | class FDDapp(object): 10 | @cherrypy.expose 11 | def getResults(self): 12 | processState = runPCAmodel() # returns 'All good' or 'Issue detected' 13 | return processState 14 | 15 | #%% execution settings 16 | cherrypy.config.update({'server.socket_host': '0.0.0.0'}) 17 | 18 | if __name__ == '__main__': 19 | cherrypy.quickstart(FDDapp()) # when this script is executed, host FDDapp app 20 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/PCAmetrics_history.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/PCAmetrics_history.pickle -------------------------------------------------------------------------------- /Chapter_WebDeployment/PCAmodelData.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/PCAmodelData.pickle -------------------------------------------------------------------------------- /Chapter_WebDeployment/ProcessMonitoring_PCA.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Train PCA model 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import required packages 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.decomposition import PCA 10 | import matplotlib.pyplot as plt 11 | 12 | #%% fetch data 13 | data = pd.read_excel('proc1a.xlsx', skiprows = 1,usecols = 'C:AI') 14 | 15 | #%% separate train data 16 | data_train = data.iloc[0:69,] 17 | 18 | #%% scale data 19 | scaler = StandardScaler() 20 | data_train_normal = scaler.fit_transform(data_train) 21 | 22 | #%% PCA 23 | pca = PCA() 24 | score_train = pca.fit_transform(data_train_normal) 25 | 26 | #%% decide # of PCs to retain and compute reduced data in PC space 27 | explained_variance = 100*pca.explained_variance_ratio_ # in percentage 28 | cum_explained_variance = np.cumsum(explained_variance) # cumulative % variance explained 29 | 
30 | n_comp = np.argmax(cum_explained_variance >= 90) + 1 31 | score_train_reduced = score_train[:,0:n_comp] 32 | 33 | print('Number of PCs cumulatively explaining atleast 90% variance: ', n_comp) 34 | 35 | #%% reconstruct original data 36 | V_matrix = pca.components_.T 37 | P_matrix = V_matrix[:,0:n_comp] 38 | 39 | data_train_normal_reconstruct = np.dot(score_train_reduced, P_matrix.T) 40 | 41 | #%% calculate T2 for training data 42 | lambda_k = np.diag(pca.explained_variance_[0:n_comp]) # eigenvalue = explained variance 43 | lambda_k_inv = np.linalg.inv(lambda_k) 44 | 45 | T2_train = np.zeros((data_train_normal.shape[0],)) 46 | 47 | for i in range(data_train_normal.shape[0]): 48 | T2_train[i] = np.dot(np.dot(score_train_reduced[i,:],lambda_k_inv),score_train_reduced[i,:].T) 49 | 50 | #%% calculate Q for training data 51 | error_train = data_train_normal - data_train_normal_reconstruct 52 | Q_train = np.sum(error_train*error_train, axis = 1) 53 | 54 | #%% T2_train control limit 55 | import scipy.stats 56 | 57 | N = data_train_normal.shape[0] 58 | k = n_comp 59 | 60 | alpha = 0.01# 99% control limit 61 | T2_CL = k*(N**2-1)*scipy.stats.f.ppf(1-alpha,k,N-k)/(N*(N-k)) 62 | 63 | #%% Q_train control limit 64 | eig_vals = pca.explained_variance_ 65 | m = data_train_normal.shape[1] 66 | 67 | theta1 = np.sum(eig_vals[k:]) 68 | theta2 = np.sum([eig_vals[j]**2 for j in range(k,m)]) 69 | theta3 = np.sum([eig_vals[j]**3 for j in range(k,m)]) 70 | h0 = 1-2*theta1*theta3/(3*theta2**2) 71 | 72 | z_alpha = scipy.stats.norm.ppf(1-alpha) 73 | Q_CL = theta1*(z_alpha*np.sqrt(2*theta2*h0**2)/theta1+ 1 + theta2*h0*(1-h0)/theta1**2)**2 74 | 75 | #%% Q_train plot with CL 76 | plt.figure() 77 | plt.plot(Q_train) 78 | plt.plot([1,len(Q_train)],[Q_CL,Q_CL], color='red') 79 | plt.xlabel('Sample #') 80 | plt.ylabel('Q for training data') 81 | plt.show() 82 | 83 | #%% T2_train plot with CL 84 | plt.figure() 85 | plt.plot(T2_train) 86 | plt.plot([1,len(T2_train)],[T2_CL,T2_CL], color='red') 87 | plt.xlabel('Sample #') 88 | plt.ylabel('T$^2$ for training data') 89 | plt.show() 90 | 91 | #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 92 | ## Save model for later use 93 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 94 | import pickle 95 | PCAmodelData = {"PCAmodel": pca, 96 | "scaler": scaler, 97 | "n_comp": n_comp, 98 | "P_matrix": P_matrix, 99 | "lambda_k_inv": lambda_k_inv, 100 | "Q_CL": Q_CL, 101 | "T2_CL": T2_CL} # dictionary data structure uses key-value pairs 102 | 103 | with open('PCAmodelData.pickle', 'wb') as f: 104 | pickle.dump(PCAmodelData, f, pickle.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /Chapter_WebDeployment/contributionPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/contributionPlot.png -------------------------------------------------------------------------------- /Chapter_WebDeployment/frontEndTemplate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 21 | 22 | 31 | 32 | 33 | 34 | 35 |

[frontEndTemplate.html: the HTML markup of this file was lost during text extraction and only text fragments survive. The recoverable content is a page header reading "Smart Process Monitoring Tool" and a Jinja-style conditional block: {% if state == 0 %} the page shows "All Good", {% else %} it shows "Issue Detected" {% endif %}. The remaining template elements (originally on template lines 42-47) could not be recovered.]
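For orientation, the sketch below shows one way such a template could be served from a CherryPy handler of the kind used in FDD.py and helloWorld.py. This is an editor's illustration under stated assumptions, not the repository's actual implementation: the use of jinja2 is inferred from the {% if %} tags above, the template is assumed to sit in the working directory, and the hard-coded state value stands in for the PCA-based fault-detection logic.

import cherrypy
from jinja2 import Environment, FileSystemLoader  # jinja2 assumed from the {% if %} tags in the template

env = Environment(loader=FileSystemLoader('.'))   # assumes frontEndTemplate.html is in the working directory

class MonitoringUI(object):
    @cherrypy.expose
    def index(self):
        state = 0  # placeholder; a real app would set this from the process monitoring results
        template = env.get_template('frontEndTemplate.html')
        return template.render(state=state)  # renders 'All Good' when state == 0, else 'Issue Detected'

if __name__ == '__main__':
    cherrypy.config.update({'server.socket_host': '0.0.0.0'})
    cherrypy.quickstart(MonitoringUI())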
48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/helloWorld.py: -------------------------------------------------------------------------------- 1 | ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | ## Hello World Web App 3 | ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | 5 | #%% import packages 6 | import cherrypy 7 | 8 | #%% Web application will be written as a Python class. 9 | # Methods of the class will be used to respond to client requests 10 | class HelloWorld(object): 11 | @cherrypy.expose 12 | def index(self): 13 | return "Hello world!" 14 | 15 | #%% execution settings 16 | cherrypy.config.update({'server.socket_host': '0.0.0.0'}) 17 | 18 | if __name__ == '__main__': 19 | cherrypy.quickstart(HelloWorld()) # when this script is executed, host HelloWorld app 20 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/info.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/metricPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/metricPlot.png -------------------------------------------------------------------------------- /Chapter_WebDeployment/proc1a.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Chapter_WebDeployment/proc1a.xlsx -------------------------------------------------------------------------------- /Chapter_WebDeployment/processLatestDatabase_local.csv: -------------------------------------------------------------------------------- 1 | 0.51,-0.04,0.13,-0.79,0.26,0.29,-2.74,-0.19,-0.17,0.21,-0.1,0.38,-0.04,0.07,0,-0.4,-0.25,-0.14,-0.05,-1.27,-0.97,-0.1,0.91,1.15,-0.11,-0.01,-0.65,-0.03,-0.19,-0.31,0.07,0.46,-0.74 2 | 0.1,0.05,-0.2,0.1,-0.25,0.4,-0.59,-0.92,-0.88,-4.76,-0.81,-0.03,-0.79,0.28,-0.35,-0.24,-0.53,-0.28,-0.48,-0.74,-0.65,-0.22,0.5,0.42,-0.14,-0.03,-1.1,-0.2,-0.24,-0.53,-0.1,-0.1,-1.09 3 | -0.51,0.6,-0.22,0,-0.24,0.3,0.29,-0.38,-0.42,0.31,-0.29,0.3,-0.22,0.12,0.13,-0.37,-0.66,-0.26,-0.04,-0.67,-0.49,-0.75,0.83,0.17,-0.07,-0.01,-0.73,-0.06,-0.11,0.19,0.07,-0.14,-0.99 4 | -0.5,0.67,-0.16,-0.01,-0.23,0.24,0.32,-0.34,-0.36,0.25,-0.23,0.34,-0.16,0.1,0.14,-0.39,6.57,-0.25,0.01,-0.7,-0.47,-0.77,0.88,0.22,-0.1,-0.02,-0.66,0.01,-0.31,-0.08,0.07,-0.22,-0.72 5 | -1.19,-0.81,-1.74,0.08,-0.17,0.23,0.34,-0.15,0.01,0.02,0.04,0.29,0.05,0.18,0.38,-0.37,-0.62,-0.16,-0.14,-0.89,-0.42,-0.79,0.34,0.58,-0.12,-0.06,-0.68,-0.13,-0.31,-0.15,0.04,-0.28,-0.98 6 | -0.88,-1.78,-1.42,0.06,-0.51,-0.37,-4.28,0.01,0.46,-0.28,0.49,0.72,0.47,-0.09,0.64,-0.54,-0.14,-0.05,0.39,-0.99,-0.52,-0.97,0.3,0.38,-0.5,-0.25,-0.49,-0.29,-0.21,0.26,0.17,-0.19,-0.36 7 | -0.31,-0.81,-1.22,0.62,-0.43,-0.04,-2.43,0.26,0.45,-0.27,0.58,-0.12,0.37,0,1.46,-0.49,-0.28,-0.12,0.24,-1.04,-1.11,-0.8,0.19,0.24,-0.39,-0.19,-0.61,-0.23,-0.52,0.04,0.1,-0.57,-0.58 8 | -0.61,0.14,-0.5,0.24,-0.33,0.33,0.38,-0.47,-0.12,0.23,0,0.46,-0.07,0.08,0.13,-0.43,-0.69,-0.28,-0.02,-0.79,-0.83,-0.62,0.51,0.12,-0.07,0.02,-0.45,0.03,-0.43,-0.07,0.06,-0.6,-1.08 9 | 
-1.14,-0.56,-0.71,0.96,-0.28,0.27,0.91,-0.46,-0.04,0.21,0.03,0.47,0.03,0.09,-0.45,-0.44,-1,-0.38,0,-0.77,-0.03,-0.74,0.05,-0.4,0.23,-5.06,0.09,0.32,-0.17,0.01,0.15,-0.59,-1.09 10 | -2.47,0.41,-0.38,0.47,-0.21,0.32,0.67,-0.9,-0.36,0.49,-0.24,0.51,-0.14,0.07,-0.78,-0.78,-0.98,-0.47,0.14,-0.43,0.18,-0.96,-0.39,-0.4,0.6,0.47,0.71,0.76,0,0.14,0.42,-0.81,-1.7 11 | -2.26,0.08,-1.54,-0.56,-0.11,0.26,0.73,-0.73,-0.27,0.34,-0.12,-0.16,0.02,0.04,-0.45,-0.8,-0.84,-0.43,0.19,0.23,0.96,-1.5,-0.8,-0.94,0.86,0.63,0.99,1.03,0.31,0.41,0.59,-0.88,-1.73 12 | -1.94,-0.95,-0.94,0.07,-0.07,0.33,0.73,-0.48,-0.06,0.23,0.01,-0.27,0.07,0.11,-0.58,-0.76,-0.59,-0.38,0.08,0.41,1.57,-1.86,-0.91,-1.1,0.81,0.57,0.83,0.9,0.39,0.44,0.49,-0.87,-1.67 13 | -1.22,0.53,0.29,-0.27,-0.01,0.27,0.79,-0.46,0.12,-0.04,0.23,-0.25,0.39,0.11,-0.83,-0.77,-0.54,-0.32,0.13,0.92,1.53,-1.7,-1.28,-0.96,0.78,0.6,1.02,0.98,0.61,0.45,-4.51,-0.71,-1.39 14 | -1.03,-0.25,-0.6,-0.24,0.05,0.27,0.46,-0.22,-0.02,0.11,0.04,0.22,0.24,0.25,-0.55,-0.99,-0.36,-0.22,-0.12,1.28,1.48,-1.43,-1.54,-0.95,1,0.73,1.22,1.1,0.98,0.18,0.48,-0.86,-0.64 15 | -0.77,-0.45,0.16,-0.54,-0.02,0.39,0.51,-0.24,0.16,0.01,0.24,-0.54,0.34,0.27,-0.56,-0.98,-0.51,-0.22,0,1.05,1.86,-1.14,-1.57,-1.08,0.94,0.69,1.13,1.04,0.94,0.14,0.44,-1.28,-0.92 16 | -0.6,-0.34,-1.56,-0.37,-0.14,0.29,0.42,0.02,0.53,-0.35,0.6,0.08,0.55,0.34,-0.66,-0.94,-0.66,-0.29,-0.12,1.03,1.73,-1.14,-1.67,-1.7,0.66,0.59,0.99,0.85,0.88,0.01,0.28,-1.26,-0.26 17 | -1.32,-0.21,-0.24,-0.9,-0.28,0.03,0.5,-0.04,0.88,-0.52,0.94,0.02,0.68,0.38,-1.22,-0.92,-0.8,-0.41,-0.28,0.84,1.46,-0.88,-1.73,-1.73,-8.36,0.72,1.16,1.03,0.96,-0.12,0.26,-1.46,-0.52 18 | -1.48,-0.57,-0.12,-1.08,-0.3,0.21,0.36,-0.3,0.55,-0.24,0.55,-0.01,0.35,0.4,-0.96,-0.91,-0.87,-0.63,-0.19,0.56,1.77,-1.06,-1.76,-1.83,0.82,0.77,1.35,1.12,1.17,-0.05,0.29,-1.49,-0.81 19 | -1.37,-0.77,-0.02,-1.36,-0.11,0.22,0.4,-0.26,0.69,-0.3,0.7,-0.68,0.5,0.38,-0.73,-1.34,-0.79,-0.57,-0.09,0.89,1.93,-1.32,-1.71,-1.86,0.66,0.69,1.37,1.01,1.18,-0.07,0.28,-1.69,-0.54 20 | -1.3,-0.63,-0.19,-1.09,-0.06,-5.7,0.37,-0.38,0.61,-0.23,0.6,-0.69,0.48,0.4,-0.87,-1.17,-0.78,-0.79,-0.14,1.54,1.92,-1.28,-1.72,-2.11,1.07,0.95,1.67,1.4,1.28,0.04,0.35,-1.82,-1.13 21 | -1.19,-0.03,0.11,-1.05,-0.09,0.09,0.66,-0.01,1.33,-0.97,1.31,-0.55,0.99,0.33,-1.11,-1.23,-0.69,-0.82,0.03,1.27,2.46,-1.58,-1.57,-1.97,0.92,0.83,1.62,1.27,1.1,-4.16,0.37,-1.54,-0.92 22 | -0.94,-0.06,0.71,-1.15,-0.06,-5.7,0.65,-0.29,0.79,-0.56,0.84,-0.72,0.59,0.44,-1.09,-2.06,-0.69,-7.34,-0.2,1.78,2.11,-1.19,-1.84,-2.07,0.83,0.78,1.28,1.12,1.13,-0.08,0.26,-1.85,-0.81 23 | -1.62,0.12,0.61,-0.97,0.13,0.25,0.46,-0.21,0.95,-0.58,0.95,-0.77,0.67,0.46,-1.08,-2.05,-0.89,-1.13,-0.27,1.7,1.96,-1.41,-2.36,-2.28,0.83,0.75,1.36,1.08,1.18,-0.16,0.22,-1.9,-0.79 24 | -------------------------------------------------------------------------------- /Chapter_WebDeployment/sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 18 | 19 | 20 |

[sample.html: the HTML markup of this file was likewise lost during extraction. The surviving fragments show a page headed "Smart Process Monitoring Tool" with the static status text "Issue detected".]

22 | 23 | 24 | -------------------------------------------------------------------------------- /Images/Book3_coverPage.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Images/Book3_coverPage.JPG -------------------------------------------------------------------------------- /Images/ML-for-PSE-2023Edition-CoverPage.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ML-PSE/Machine_Learning_for_PSE/7bb15eee2e1f00168dd03db8e67ccf194ea72675/Images/ML-for-PSE-2023Edition-CoverPage.JPG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine_Learning_for_PSE 2 | 3 | Chapter-wise code repository for the book 'Machine Learning in Python for Process Systems Engineering' 4 | 5 | ![](/Images/Book3_coverPage.JPG) 6 | 7 | ## Book Links: 8 | - *Google Play*: https://play.google.com/store/books/details?id=K_NjEAAAQBAJ 9 | - *LeanPub*: https://leanpub.com/machineLearningPSE 10 | 11 | ## Original data sources for datasets used in this book: 12 | [Weblinks mentioned below may change or may no longer exist in future. Relevant data files have been provided in the respective folders in this repository. If you plan to share or use any dataset, please abide by the license policy (and/or the citation requests, if any) for the dataset.] 13 | 14 | - *Polymer Manufacturing Process Data*: 15 | 16 | Originally obtained from https://landing.umetrics.com/downloads-other-downloads (unfortunately this link no longer seems to work; data file is provided in the respective folder in this repository). 17 | Dataset also referenced at https://www.academia.edu/38630159/Multivariate_data_analysis_wiki 18 | 19 | 20 | - *Pulp & Paper Manufacturing Process Data*: 21 | 22 | Obtained from https://openmv.net. 23 | 24 | Citation: Dayal et al. "Application of feedforward neural networks and partial least squares regression for modelling Kappa number in a continuous Kamyr digester", Pulp and Paper Canada, 95, 1994, p T7-T13. 25 | 26 | 27 | - *Low-Density Polyethylene (LDPE) Process Data*: 28 | 29 | Obtained from https://openmv.net. 30 | 31 | 32 | - *Tennessee Eastman Process Data*: 33 | 34 | Available at https://github.com/camaramm/tennessee-eastman-profBraatz. Bigger dataset available at https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/6C3JR1. 35 | 36 | Citation: Reith, C.A., B.D. Amsel, R. Tran., and B. Maia. Additional Tennessee Eastman process simulation data for anomaly detection evaluation. Harvard Dataverse, Version 1, 2017 37 | 38 | - *Semiconductor Manufacturing Process Data*: 39 | 40 | Obtained from http://www.eigenvector.com/data/Etch/. 41 | 42 | Citation: B.M. Wise, N.B. Gallagher, S.W. Butler, D.D. White, Jr. and G.G. Barna, "A Comparison of Principal Components Analysis, Multi-way Principal Components analysis, Tri-linear Decomposition and Parallel Factor Analysis for Fault Detection in a Semiconductor Etch Process", J. Chemometrics (1999). 43 | 44 | - *Polymer Pilot Plant Data*: 45 | 46 | Originally obtained from ftp://ftp.cis.upenn.edu/pub/ungar/chemdata/ 47 | 48 | - *Debutanizer Column Data from a Petroleum Refinery*: 49 | 50 | Available as supplementary material at https://link.springer.com/book/10.1007/978-1-84628-480-9. 51 | 52 | Citation: Fortuna et. 
al., Soft sensors for monitoring and control of industrial processes, Springer, 2007 53 | 54 | - *Concrete Compressive Strength Data*: 55 | 56 | Available at the UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength 57 | 58 | Copyright: Prof. I-Cheng Yeh 59 | Citation: I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998) 60 | 61 | - *Wastewater Treatment Plant Data*: 62 | 63 | Available at the UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/water+treatment+plant 64 | 65 | - *Combined Cycle Power Plant data*: 66 | 67 | Available at the UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/combined+cycle+power+plant 68 | 69 | Citation: Pınar Tüfekci, Prediction of full load electrical power output of a base load operated combined cycle power plant using machine learning methods, International Journal of Electrical Power & Energy Systems, Volume 60, September 2014, Pages 126-140, ISSN 0142-0615 70 | 71 | - *SISO Heater System Data*: 72 | 73 | Provided by Prof. John Hedengren at https://apmonitor.com/do/index.php/Main/LSTMNetwork. Direct links for the training and validation data: https://apmonitor.com/do/uploads/Main/tclab_dyn_data3.txt and https://apmonitor.com/pdc/uploads/Main/tclab_data4.txt. File names will need to be changed to match the ones used in the book. 74 | 75 | - *Gas Turbine Data*: 76 | 77 | Originally available at NASA prognostics data repository https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/. Data available at https://data.nasa.gov/Aerospace/CMAPSS-Jet-Engine-Simulated-Data/ff5v-kuh6/about_data. 78 | Training and validation data file names used in the text are different than the original file names. 79 | 80 | Citation: A. Saxena and K. Goebel (2008). "Turbofan Engine Degradation Simulation Data Set", NASA Ames Prognostics Data Repository (http://ti.arc.nasa.gov/project/prognostic-data-repository), NASA Ames Research Center, Moffett Field, CA 81 | 82 | License: CC0: Public Domain (https://creativecommons.org/publicdomain/zero/1.0/) 83 | 84 | --------------------------------------------------------------------------------
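For readers new to the repository, the snippet below sketches the data-loading pattern that most chapter scripts follow, illustrated here with the debutanizer soft-sensing data used in Chapter_SupportVectorMachines. It is a minimal sketch assuming the script is run from the folder containing debutanizer_data.txt, exactly as the chapter scripts themselves assume.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# debutanizer_data.txt has 5 header rows; the last column is the quality variable (C4 content)
data = np.loadtxt('debutanizer_data.txt', skiprows=5)
X, y = data[:, 0:-1], data[:, -1]

# the chapter scripts fix random_state so that the train/test split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=100)
print(X_train.shape, X_test.shape)
```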