├── 1.KNN ├── 1.KNN(distance).py ├── 10.WKNN(regression).py ├── 11.KNN(boston).py ├── 2.KNN(optimal_k).py ├── 3.maxmin_ratio.py ├── 4.KNN(sklearn).py ├── 5.WKNN.py ├── 6.WKNN(iris).py ├── 7.KNN(Jaccard).py ├── 8.KNN(IOF).py └── 9.KNN(regression).py ├── 10.GBM ├── 1.GBM(regression).py ├── 2.SGBM(regression).py ├── 3.GBM(classification).py ├── 4.SGBM(classification).py └── 5.GBM(multi-classification).py ├── 11.xGBoost ├── 1.XGBoost(regression).py ├── 2.XGBoost(classification).py ├── 3.appoximation(1).py ├── 4.appoximation(2).py ├── 5.santander.py ├── MyXGBoostClassifier.py ├── MyXGBoostRegressor.py └── data │ └── santander.zip ├── 12.LGBM ├── 1.histogram_based.py ├── 2.goss.py ├── 3.greedy_bundling.py ├── 4.merge_features.py ├── 5.efb_onehot.py ├── 6.santander.py └── data │ └── santander.zip ├── 2.DecisionTree ├── 1.ID3(titanic_part).py ├── 2.CART(classification).py ├── 3.CART(titanic_part).py ├── 4.CART_CCP(titanic).py ├── 5.CART(multiclass).py ├── 6.CART(regression).py ├── MyDTreeClassifier.py ├── MyDTreeRegressor.py └── data │ ├── titanic.csv │ └── titanic_clean.csv ├── 3.LinearRegression ├── 1.scipy_opt(ols).py ├── 10.ransac(2).py ├── 11.boston(ransac).py ├── 2.boston(ols).py ├── 3.boston(sklearn).py ├── 4.scipy_opt(tls).py ├── 5.boston(tls).py ├── 6.lwr(scipy).py ├── 7.lwr(sklearn).py ├── 8.boston(lwr).py ├── 9.ransac(1).py └── data │ ├── boston_house.pkl │ └── wls_sample_data.csv ├── 4.LogisticRegression ├── 1.bin_class(scipy).py ├── 10.lwlr(sklearn).py ├── 11.lwlr_2(sklearn).py ├── 2.bin_class(sklearn).py ├── 3.bin_class(scipy_cancer).py ├── 4.bin_class(sklearn_cancer).py ├── 5.multiclass(ovr_1).py ├── 6.multiclass(ovr_2).py ├── 7.multiclass(softmax_scipy).py ├── 8.multiclass(softmax_sklearn).py └── 9.lwlr(scipy).py ├── 5.Convex ├── 1.plot_convex.py ├── 2.EQP.py ├── 3.IQP_1.py ├── 4.IQP_2.py ├── 5.QP.py └── 6.LP.py ├── 6.SVM ├── 1.cvxopt(hard_margin).py ├── 10.multiclass(OvR).py ├── 11.cvxopt(svr_linear).py ├── 12.cvxopt(svr_nonlinear).py ├── 2.cvxopt(soft_margin).py ├── 3.SVC(soft_margin).py ├── 4.linearSVC(soft_margin).py ├── 5.check_kernel.py ├── 6.cvxopt(kernel_trick).py ├── 7.SVC(kernel_trick).py ├── 8.Kernel(titanic).py ├── 9.multiclass(OvO).py └── data │ └── titanic.csv ├── 7.KMeans ├── 1.kmeans(basic).py ├── 2.sklearn(kmeans).py ├── 3.kmeans(plus).py └── 4.sklearn(mnist).py ├── 8.RandomForest ├── 1.RF(titanic).py ├── 2.RF(sklearn).py ├── 3.RF_OOB.py ├── 4.RF_OOB(sklearn).py ├── 5.RF_proximity.py ├── 6.RF_outlier.py ├── 7.iForest_test.py ├── 8.iForest_outlier.py ├── MyDTreeClassifierRF.py └── data │ ├── titanic.csv │ ├── titanic_clean.csv │ └── titanic_clean1.csv ├── 9.AdaBoost ├── 1.AdaBoost(binary1).py ├── 2.AdaBoost(binary2).py ├── 3.AdaBoost(multiclass).py ├── 4.sklearn(AdaBoost).py └── 5.AdaBoost(regression).py └── README.md /1.KNN/1.KNN(distance).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-01] 1.KNN(distance).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/EVEzkS5It0I 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.datasets import make_blobs 14 | from sklearn.model_selection import train_test_split 15 | 16 | # create dataset 17 | x, y = make_blobs(n_samples=300, n_features=2, 18 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 19 | cluster_std=0.15, center_box=(-1., 1.)) 20 | 21 | 
# Visualize the dataset and class by color 22 | plt.figure(figsize=(5, 5)) 23 | for i, color in enumerate(['red', 'blue', 'green']): 24 | p = x[y==i] 25 | plt.scatter(p[:, 0], p[:, 1], s=50, c=color, 26 | label='y=' + str(i), alpha=0.5) 27 | plt.legend() 28 | plt.show() 29 | 30 | # split dataset into train and test data 31 | x_train, x_test, y_train, y_test = train_test_split(x, y) 32 | K = 10 # the number of nearest neighbors 33 | 34 | # 1. Calculate the distance between test and train data. 35 | d_train = x_train[np.newaxis, :, :] # expand D0 axis 36 | d_test = x_test[:, np.newaxis, :] # expand D1 axis 37 | distance = np.sqrt(np.sum((d_train - d_test) ** 2, axis=2)) 38 | 39 | # 2. Find K nearest neighbors 40 | i_near = np.argsort(distance, axis=1)[:, :K] 41 | y_near = y_train[i_near] 42 | 43 | # 3. majority voting 44 | y_pred = np.array([np.bincount(p).argmax() for p in y_near]) 45 | 46 | # Measure the accuracy for test data 47 | print('Accuracy = {:.4f}'.format((y_pred == y_test).mean())) 48 | -------------------------------------------------------------------------------- /1.KNN/10.WKNN(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-07] 10.WKNN(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/_ZxTTvbZOtc 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.neighbors import KNeighborsRegressor 14 | 15 | # Generate training and test data 16 | n_train = 1000 # the number of training data points 17 | n_test = 100 # the number of test data points 18 | x_train = np.random.random(n_train).reshape(-1, 1) 19 | y_train = 2.0 * np.sin(2.0 * np.pi * x_train)\ 20 | + np.random.normal(0.0, 0.5, size=(n_train,1))+3. 21 | y_train = y_train.reshape(-1) 22 | x_test = np.linspace(x_train.min(), x_train.max(), n_test)\ 23 | .reshape(-1, 1) 24 | 25 | # Generate the distance matrix between x_test and x_train 26 | d_train = x_train[np.newaxis, :, :] 27 | d_test = x_test[:, np.newaxis, :] 28 | dist= np.abs(d_train - d_test).reshape(n_test, n_train) + 1e-8 29 | 30 | # Find K nearest neighbors 31 | K = 200 32 | i_near = np.argsort(dist, axis=1)[:, :K] # (100, 200) 33 | y_near = y_train[i_near] # (100, 200) 34 | 35 | # Compute the weights to apply to the neighbors 36 | w_dist = np.array([dist[i, :][i_near[i, :]] \ 37 | for i in range(x_test.shape[0])]) 38 | w_inv = 1. / w_dist 39 | 40 | # Predict the y values ​​of the test data by weighted average method 41 | y_pred1 = (y_near * w_inv).sum(axis=1) / w_inv.sum(axis=1) 42 | 43 | # Plot the training and test data points with their predicted 44 | # y values ​​(y_pred) 45 | def plot_prediction(y_pred): 46 | plt.figure(figsize=(6,4)) 47 | plt.scatter(x_train, y_train, c='blue', s=20, alpha=0.5, label='train data') 48 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 49 | plt.xlim(0, 1) 50 | plt.ylim(0, 7) 51 | plt.legend() 52 | plt.show() 53 | 54 | plot_prediction(y_pred1) 55 | 56 | # Predict the y values ​​of the test data using KNeighborsRegressor 57 | # from scikit-learn. 
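#
# (Added check, a minimal sketch not in the original script) Re-derive the
# weighted-average prediction for one test point to make the formula explicit,
#   y_hat(x0) = sum_k(w_k * y_k) / sum_k(w_k),  with w_k = 1 / d(x0, x_k).
# The index i0 below is an arbitrary choice made only for this check.
i0 = 0
manual = np.sum(y_near[i0] * w_inv[i0]) / np.sum(w_inv[i0])
assert np.isclose(manual, y_pred1[i0])
# weights='distance' in the sklearn model below applies the same
# inverse-distance weighting.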
58 | knn = KNeighborsRegressor(n_neighbors=K, weights='distance') 59 | knn.fit(x_train, y_train) 60 | y_pred2 = knn.predict(x_test) 61 | plot_prediction(y_pred2) 62 | -------------------------------------------------------------------------------- /1.KNN/11.KNN(boston).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-07] 11.KNN(boston).py 2 | # Predict the house prices in Boston using KNN 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/_ZxTTvbZOtc 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.neighbors import KNeighborsRegressor 15 | from sklearn.model_selection import train_test_split 16 | import pickle 17 | 18 | # Read Boston house price dataset 19 | with open('data/boston_house.pkl', 'rb') as f: 20 | data = pickle.load(f) 21 | x = data['data'] # shape = (506, 13) 22 | y = data['target'] # shape = (506,) 23 | x_train, x_test, y_train, y_test = train_test_split(x, y) 24 | 25 | # Z-score Normalization 26 | x_mu = x_train.mean(axis=0) 27 | x_sd = x_train.std(axis=0) 28 | y_mu = y_train.mean() 29 | y_sd = y_train.std() 30 | zx_train = (x_train - x_mu) / x_sd 31 | zy_train = (y_train - y_mu) / y_sd 32 | zx_test = (x_test - x_mu) / x_sd 33 | zy_test = (y_test - y_mu) / y_sd 34 | 35 | # Visually check the actual and predicted prices 36 | def plot_predictions(y_true, y_pred): 37 | plt.figure(figsize=(5, 4)) 38 | plt.scatter(y_true, y_pred, s=20, c='r') 39 | plt.xlabel('y_true') 40 | plt.ylabel('y_pred') 41 | plt.show() 42 | 43 | # Simple average method 44 | model1 = KNeighborsRegressor(n_neighbors = 10) 45 | model1.fit(zx_train, zy_train) 46 | y_pred1 = model1.predict(zx_test) * y_sd + y_mu 47 | plot_predictions(y_test, y_pred1) 48 | print('KNN R2 = {:.3f}'.format(model1.score(zx_test, zy_test))) 49 | 50 | # Weighted average method 51 | model2 = KNeighborsRegressor(n_neighbors = 30, weights='distance') 52 | model2.fit(zx_train, zy_train) 53 | y_pred2 = model2.predict(zx_test) * y_sd + y_mu 54 | plot_predictions(y_test, y_pred2) 55 | print('WKNN R2 = {:.3f}'.format(model2.score(zx_test, zy_test))) 56 | 57 | a=np.array([.751, .671, .797, .802, .737, .789, .771, .735, .736, .668]) 58 | a=np.array([.741, .669, .757, .764, .657, .703, .718, .747, .682, .647]) 59 | a.mean() 60 | -------------------------------------------------------------------------------- /1.KNN/2.KNN(optimal_k).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-02] 2.KNN(optimal_k).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/tIKsjeyaVnc 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | import matplotlib.pyplot as plt 14 | from sklearn.model_selection import train_test_split 15 | 16 | # create dataset 17 | x, y = make_blobs(n_samples=900, n_features=2, 18 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 19 | cluster_std=0.2, center_box=(-1., 1.)) 20 | 21 | # Visualize the dataset and classes by color 22 | plt.figure(figsize=(5, 5)) 23 | for i, color in enumerate(['red', 'blue', 'green']): 24 | p = x[y==i] 25 | plt.scatter(p[:, 0], p[:, 1], s=20, c=color, 26 | 
label='y=' + str(i), alpha=0.5) 27 | plt.legend() 28 | plt.show() 29 | 30 | # Split the dataset into training and test data 31 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 32 | N = x_train.shape[0] 33 | 34 | # Z-score Normalization. 35 | # The values ​​in this data set have similar scales, 36 | # so there is no need to normalize them. But let's try this 37 | # just for practice. 38 | 39 | # Calculate the mean and standard deviation from the training data 40 | # and apply them to the test data. 41 | mean = x_train.mean(axis=0) 42 | std = x_train.std(axis=0) 43 | z_train = (x_train - mean) / std 44 | z_test = (x_test - mean) / std 45 | 46 | # A function for performing the KNN classification algorithm. 47 | def knn_predict(train, test, k): 48 | # 1. Create a distance matrix. 49 | d_train = train[np.newaxis, :, :] # Add a new axis at D0 50 | d_test = test[:, np.newaxis, :] # Add a new axis at D1 51 | 52 | p = 2 # Euclidean distance 53 | d = np.sum(np.abs(d_train - d_test) ** p, axis=-1) ** (1/p) 54 | 55 | # 2. Find K nearest neighbors 56 | i_nearest = np.argsort(d, axis=1)[:, :k] # index 57 | y_nearest = y_train[i_nearest] 58 | 59 | # 3. majority voting 60 | return np.array([np.bincount(i).argmax() for i in y_nearest]) 61 | 62 | # Measure the accuracy of the test data while changing K value. 63 | accuracy = [] 64 | k_vals = np.arange(1, 700, 10) 65 | for k in k_vals: 66 | # Estimate the classes of all test data points and measure the accuracy. 67 | y_pred = knn_predict(z_train, z_test, k) 68 | accuracy.append((y_pred == y_test).mean()) 69 | 70 | # Observe how the accuracy changes as K changes. 71 | plt.figure(figsize=(5, 3)) 72 | plt.plot(k_vals, accuracy, '-') 73 | plt.axvline(x=np.sqrt(N), c='r', ls='--') 74 | plt.ylim(0.5, 1) 75 | plt.show() 76 | 77 | # Generate a large number of test data points and roughly determine 78 | # the decision boundary. 79 | # x_many = np.random.uniform(-0.5, 1.5, (1000, 2)) 80 | x_many = np.random.uniform(-0.5, 1.5, (1000, 2)) 81 | z_many = (x_many - mean) / std 82 | y_many = knn_predict(z_train, z_many, k=int(np.sqrt(N))) 83 | 84 | # Check the decision boundary 85 | plt.figure(figsize=(5,5)) 86 | color = [['red', 'blue', 'green'][a] for a in y_many] 87 | plt.scatter(x_many[:, 0], x_many[:, 1], s=100, c=color, alpha=0.3) 88 | plt.scatter(x_train[:, 0], x_train[:, 1], s=80, c='black') 89 | plt.scatter(x_train[:, 0], x_train[:, 1], s=10, c='yellow') 90 | plt.xlim(-0.5, 1.0) 91 | plt.ylim(-0.5, 1.0) 92 | plt.show() 93 | 94 | -------------------------------------------------------------------------------- /1.KNN/3.maxmin_ratio.py: -------------------------------------------------------------------------------- 1 | # [MXML-01-03] 3.maxmin_ratio.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/qZ_6UAVnNMw 10 | # 11 | import numpy as np 12 | from sklearn.datasets import fetch_openml 13 | from sklearn.decomposition import PCA 14 | 15 | # Load the MNIST dataset 16 | mnist = fetch_openml('mnist_784', parser='auto') 17 | x = np.array(mnist['data']) / 255 18 | 19 | # Compute the distances between a single data point and all other 20 | # data points in a given data set. 21 | def distance(data): 22 | # Randomly choose a single data point from the dataset. 
23 | i = np.random.randint(0, data.shape[0]) 24 | tp = data[i] 25 | 26 | # Remove the chosen data point from the dataset. 27 | xp = np.delete(data, i, axis=0) 28 | 29 | # Compute the distances between tp and xp 30 | d = np.sqrt(np.sum((xp - tp) ** 2, axis=-1)) 31 | 32 | # Return the minimum distance and maximum distance 33 | return d.min(), d.max() 34 | 35 | # Compute the average ratio of minimum to maximum distances 36 | # in a 784-dimensional feature space 37 | r_maxmin = [] 38 | for i in range(10): 39 | dmin, dmax = distance(x) 40 | r_maxmin.append(dmax / dmin) 41 | print("max-min ratio (p=784): {0:.2f}".format(np.mean(r_maxmin))) 42 | 43 | # Compute the average ratio of minimum to maximum distances 44 | # in a 5-dimensional feature space 45 | pca = PCA(n_components=5) 46 | pca.fit(x) 47 | x_pca = pca.transform(x) 48 | 49 | r_maxmin = [] 50 | for i in range(10): 51 | dmin, dmax = distance(x_pca) 52 | r_maxmin.append(dmax / dmin) 53 | print("max-min ratio (p=5) : {0:.2f}".format(np.mean(r_maxmin))) 54 | -------------------------------------------------------------------------------- /1.KNN/4.KNN(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-01-03] 4.KNN(sklearn).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/qZ_6UAVnNMw 10 | # 11 | import numpy as np 12 | from sklearn.datasets import load_iris 13 | from sklearn.neighbors import KNeighborsClassifier 14 | from sklearn.model_selection import train_test_split 15 | import matplotlib.pyplot as plt 16 | 17 | # Load the Iris dataset. 18 | # x: data, the number of samples=150, the number of features=4 19 | # y: target data with class (0,1,2) 20 | x, y = load_iris(return_X_y=True) 21 | 22 | # Split the dataset to training, validation and test data 23 | x_train, x_test, y_train, y_test=train_test_split(x, y, \ 24 | test_size = 0.4) 25 | x_val, x_test, y_val, y_test=train_test_split(x_test, y_test,\ 26 | test_size = 0.5) 27 | # Z-score normalization 28 | mean = x_train.mean(axis=0) 29 | std = x_train.std(axis=0) 30 | 31 | x_train = (x_train - mean) / std # Z-score normalization 32 | x_val = (x_val - mean) / std # use mean and std from x_train 33 | x_test = (x_test - mean) / std # use mean and std from x_train 34 | 35 | # Set K to sqrt(N) 36 | sqr_k = int(np.sqrt(x_train.shape[0])) 37 | 38 | # Build a KNN classification model 39 | knn = KNeighborsClassifier(n_neighbors=sqr_k, metric='minkowski', p=2) 40 | 41 | # Model fitting. Since KNN is a lazy learner, no learning is performed 42 | # at this step. It simply stores the training data points and the 43 | # parameters. 44 | knn.fit(x_train, y_train) 45 | 46 | # Predict the class of validation data. 47 | # The actual learning takes place at this stage, when test or 48 | # validation data is provided. 49 | y_pred = knn.predict(x_val) 50 | 51 | # Measure the accuracy on the validation data 52 | accuracy = (y_val == y_pred).mean() 53 | print('\nK: sqr_K = {}, Accuracy on validation data = {:.3f}'\ 54 | .format(sqr_k, accuracy)) 55 | 56 | # Determine the optimal K. 57 | # Measure the accuracy on the validation data while changing K. 
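#
# (Added aside, a hedged sketch not in the original lesson) An equivalent way
# to search for K is scikit-learn's GridSearchCV with cross-validation; the
# cv=5 setting and the 2..19 range below are assumptions made for this sketch.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': list(range(2, 20))},
                    cv=5)
grid.fit(x_train, y_train)
print('GridSearchCV best K =', grid.best_params_['n_neighbors'])
#
# The manual loop below does the same search against a single validation set.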
58 | accuracy = [] 59 | for k in range(2, 20): 60 | knn = KNeighborsClassifier(n_neighbors = k) 61 | knn.fit(x_train, y_train) 62 | y_pred = knn.predict(x_val) 63 | accuracy.append((y_val == y_pred).mean()) 64 | 65 | # Find the optimal K value with the highest accuracy. 66 | opt_k = np.array(accuracy).argmax() + 2 67 | 68 | # Observe how the accuracy changes as K changes. 69 | plt.plot(np.arange(2, 20), accuracy, marker='o') 70 | plt.xticks(np.arange(2, 20)) 71 | plt.axvline(x = opt_k, c='blue', ls = '--') 72 | plt.axvline(x = sqr_k, c='red', ls = '--') 73 | plt.ylim(0.8, 1.1) 74 | plt.title('optimal K = ' + str(opt_k)) 75 | plt.show() 76 | 77 | # Finally, we use the test data to measure the final performance 78 | # of the model. 79 | knn = KNeighborsClassifier(n_neighbors = opt_k) 80 | knn.fit(x_train, y_train) 81 | y_pred = knn.predict(x_test) 82 | accuracy = (y_test == y_pred).mean() 83 | print('\nK: opt_k = {}, Accuracy on test data = {:.3f}' 84 | .format(opt_k, accuracy)) 85 | -------------------------------------------------------------------------------- /1.KNN/5.WKNN.py: -------------------------------------------------------------------------------- 1 | # [MXML-1-04] 5.WKNN.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Lu6GAc4FYz8 10 | # 11 | import numpy as np 12 | 13 | # Let's assume that the distance matrix between the test data and 14 | # the training data is given as follows. shape = (5, 10) 15 | dist = np.array( 16 | # train: 0 1 2 3 4 5 6 7 8 9 test 17 | [[5. , 3.5, 4.3, 3.4, 1.4, 6.5, 2.7, 5.1, 2.9, 2.8], # i=0 18 | [4.4, 1.9, 3.6, 3.3, 0.5, 5.5, 2.1, 4.4, 1.3, 2.3], # i=1 19 | [4.6, 1. , 3.9, 4.4, 3. , 4.7, 3.2, 4.4, 1.4, 3.5], # i=2 20 | [4.7, 0.6, 3.9, 4.1, 1.7, 5.3, 2.7, 4.6, 0.4, 3. ], # i=3 21 | [3. , 3.6, 2.4, 1.4, 2.4, 4.8, 1.2, 3.2, 3. , 1.1]]) # i=4 22 | 23 | # target class y (0 1 2 3 4 5 6 7 8 9) 24 | y_train = np.array([0, 1, 1, 0, 1, 0, 1, 1, 0, 0]) 25 | C = [0, 1] # the class y is either 0 or 1 26 | K = 7 # 7-nearest neighbors 27 | T = 5 # the number of test data points 28 | 29 | # Find K nearest neighbors 30 | i_near = np.argsort(dist, axis=1)[:, :K] 31 | y_near = y_train[i_near] 32 | 33 | # Compute the inverse distance 34 | w_dist = np.array([dist[i, :][i_near[i, :]] for i in range(T)]) 35 | w_inv = 1. 
/ w_dist 36 | 37 | # Predict the class of test data using the inverse weighted distance 38 | y_pred = [] 39 | for i in range(T): 40 | iw_dist = [w_inv[i][y_near[i] == j].sum() for j in C] 41 | y_pred.append(np.argmax(iw_dist / w_inv[i].sum())) 42 | 43 | print(y_pred) 44 | -------------------------------------------------------------------------------- /1.KNN/6.WKNN(iris).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-04] 6.WKNN(iris).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Lu6GAc4FYz8 10 | # 11 | import numpy as np 12 | from sklearn.datasets import load_iris 13 | from sklearn.model_selection import train_test_split 14 | 15 | # Load Iris dataset 16 | x, y = load_iris(return_X_y=True) 17 | 18 | # Split the dataset to training and test data 19 | x_train, x_test, y_train, y_test = train_test_split(x, y) 20 | N = x_train.shape[0] # the number of training data points 21 | T = x_test.shape[0] # the number of test data points 22 | C = np.unique(y) # categories of y: [0, 1, 2] 23 | K = int(np.sqrt(N)) # appropriate K value 24 | 25 | # Z-score Normalization. 26 | mean = x_train.mean(axis=0); std = x_train.std(axis=0) 27 | z_train = (x_train - mean) / std 28 | z_test = (x_test - mean) / std 29 | 30 | # Predict the class of test data. 31 | # 1. Compute the distance matrix between test and train data. 32 | d_train = z_train[np.newaxis, :, :] 33 | d_test = z_test[:, np.newaxis, :] 34 | dist = np.sqrt(np.sum((d_train - d_test) ** 2, axis=2)) 35 | dist += 1e-8 # To prevent the distance from becoming 0 36 | 37 | # 2. Find K nearest neighbors. 38 | i_near = np.argsort(dist, axis=1)[:, :K] 39 | y_near = y_train[i_near] 40 | 41 | # 3. Compute the inverse distance 42 | w_inv = 1. / np.array([dist[i, :][i_near[i, :]] for i in range(T)]) 43 | 44 | # 4. Predict the class of the test data using the weights of the 45 | # inverse distance 46 | y_pred1 = [] 47 | for i in range(T): 48 | iw_dist = [w_inv[i][y_near[i] == j].sum() for j in C] 49 | y_pred1.append(np.argmax(iw_dist / w_inv[i].sum())) 50 | y_pred1 = np.array(y_pred1) 51 | 52 | # Measure the accuracy on the test data. 53 | accuracy = (y_test == y_pred1).mean() 54 | print('\nAccuracy on test data = {:.3f}'.format(accuracy)) 55 | 56 | # Compare with the results of sklearn's KNeighborsClassifier. 57 | from sklearn.neighbors import KNeighborsClassifier 58 | 59 | # 'distance': weight points by the inverse of their distance. 60 | # in this case, closer neighbors of a query point will have 61 | # a greater influence than neighbors which are further away. 
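#
# (Added check, a minimal sketch not in the original script) The weighted vote
# above can be re-derived for the first test point: each class receives the
# sum of the inverse distances of its neighbors, and the largest total wins.
scores0 = [w_inv[0][y_near[0] == c].sum() for c in C]
assert np.argmax(scores0) == y_pred1[0]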
62 | knn = KNeighborsClassifier(n_neighbors=K, weights='distance') 63 | knn.fit(z_train, y_train) 64 | y_pred2 = knn.predict(z_test) 65 | accuracy = (y_test == y_pred2).mean() 66 | print('Accuracy on test data (sklearn) = {:.3f}'.format(accuracy)) 67 | 68 | print('from scratch: y_pred1\n', y_pred1) 69 | print('from sklearn: y_pred2\n', y_pred2) 70 | 71 | (y_pred1 != y_pred2).sum() 72 | -------------------------------------------------------------------------------- /1.KNN/7.KNN(Jaccard).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-05] 7.KNN(Jaccard).py 2 | # KNN classification on categorical data 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/dDJwm25-_l8 11 | # 12 | import numpy as np 13 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 14 | from sklearn.metrics import jaccard_score 15 | 16 | # Golf play dataset 17 | # data source: 18 | # https://www.kaggle.com/datasets/priy998/golf-play-dataset 19 | # columns = [outlook, temperature, humidity, windy, play] 20 | data = np.array( 21 | [['sunny', 'hot', 'high', False, 'no'], 22 | ['sunny', 'hot', 'high', True, 'no'], 23 | ['overcast', 'hot', 'high', False, 'yes'], 24 | ['rainy', 'mild', 'high', False, 'yes'], 25 | ['rainy', 'cool', 'normal', False, 'yes'], 26 | ['rainy', 'cool', 'normal', True, 'no'], 27 | ['overcast', 'cool', 'normal', True, 'yes'], 28 | ['sunny', 'mild', 'high', False, 'no'], 29 | ['sunny', 'cool', 'normal', False, 'yes'], 30 | ['rainy', 'mild', 'normal', False, 'yes'], 31 | ['sunny', 'mild', 'normal', True, 'yes'], 32 | ['overcast', 'mild', 'high', True, 'yes'], 33 | ['overcast', 'hot', 'normal', False, 'yes'], 34 | ['rainy', 'mild', 'high', True, 'no'], 35 | ['sunny', 'mild', 'high', True, 'no']]) 36 | 37 | # x: one-hot encoded or label encoded features 38 | # y: target, k: the number of nearest neighbors 39 | # average: 'binary' or 'macro' 40 | def predict(x, y, k, average): 41 | match = [] 42 | for t in range(x.shape[0]): 43 | x_test = x[t] 44 | y_test = y[t] 45 | x_train = np.delete(x, t, axis=0) 46 | y_train = np.delete(y, t, axis=0) 47 | 48 | # Compute the Jaccard similarity between a test data point 49 | # and all training data points. 50 | similarities = [] 51 | for i in range(x_train.shape[0]): 52 | J = jaccard_score(x_train[i], x_test, 53 | average=average, zero_division=0.0) 54 | similarities.append(J) 55 | 56 | # Find the k nearest neighbors of the test data point. 57 | j = np.argsort(similarities)[::-1][:k] 58 | 59 | # Predict the class of the test data point by majority vote 60 | y_pred = np.bincount(y_train[j]).argmax() 61 | 62 | # Store whether y_pred and y_test match or not. 
63 | match.append(y_pred == y_test) 64 | 65 | print("True class: {}, Predicted class: {}, is match: {}"\ 66 | .format(y_test, y_pred, match[-1])) 67 | return np.mean(match) # return the accuracy 68 | 69 | # One-hot encoding 70 | ohe = OneHotEncoder().fit_transform(data).toarray().astype('int') 71 | x = ohe[:, :-2] # one-hot encoded features 72 | y = ohe[:, -1] # target 73 | K = 5 # 5 nearest neighbors 74 | 75 | print("\n* One-hot encoding:") 76 | acc = predict(x, y, K, average='binary') 77 | print("Accuracy: {:.3f}".format(acc)) 78 | 79 | # Label encoding 80 | le = [] 81 | for i in range(data.shape[1]): 82 | le.append(LabelEncoder().fit_transform(data[:, i])) 83 | le = np.array(le).T 84 | 85 | x = le[:, :-1] # label encoded features 86 | y = le[:, -1] # target 87 | 88 | print("\n* Label encoding:") 89 | acc = predict(x, y, K, average='macro') 90 | print("Accuracy: {:.3f}".format(acc)) 91 | 92 | 93 | -------------------------------------------------------------------------------- /1.KNN/9.KNN(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-07] 9.KNN(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/_ZxTTvbZOtc 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.neighbors import KNeighborsRegressor 14 | 15 | # Generate training and test data 16 | n_train = 1000 # the number of training data points 17 | n_test = 100 # the number of test data points 18 | x_train = np.random.random(n_train).reshape(-1, 1) 19 | y_train = 2.0 * np.sin(2.0 * np.pi * x_train)\ 20 | + np.random.normal(0.0, 0.5, size=(n_train,1))+3. 21 | y_train = y_train.reshape(-1) 22 | x_test = np.linspace(x_train.min(), x_train.max(), n_test)\ 23 | .reshape(-1, 1) 24 | 25 | # Generate the distance matrix between x_test and x_train 26 | d_train = x_train[np.newaxis, :, :] 27 | d_test = x_test[:, np.newaxis, :] 28 | dist= np.abs(d_train - d_test).reshape(n_test, n_train) 29 | 30 | # Find K nearest neighbors 31 | K = 20 32 | i_near = np.argsort(dist, axis=1)[:, :K] # (100, 20) 33 | y_near = y_train[i_near] # (100, 20) 34 | 35 | # Predict the y values ​​of the test data by simple average method 36 | y_pred1 = y_near.mean(axis=1) 37 | 38 | # Plot the training and test data points with their predicted 39 | # y values ​​(y_pred1) 40 | def plot_prediction(y_pred): 41 | plt.figure(figsize=(6,4)) 42 | plt.scatter(x_train, y_train, c='blue', s=20, alpha=0.5, label='train data') 43 | plt.plot(x_test, y_pred, c='red', lw=3.0, label='prediction') 44 | plt.xlim(0, 1) 45 | plt.ylim(0, 7) 46 | plt.legend() 47 | plt.show() 48 | 49 | # Predict the y-values ​​of the test data using the simple 50 | # average method. 51 | plot_prediction(y_pred1) 52 | 53 | # Predict the y values ​​of the test data using scikit-learn's KNeighborsRegressor 54 | knn = KNeighborsRegressor(n_neighbors=K) 55 | knn.fit(x_train, y_train) 56 | y_pred2 = knn.predict(x_test) 57 | plot_prediction(y_pred2) 58 | 59 | -------------------------------------------------------------------------------- /10.GBM/1.GBM(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-10-03] 1.GBM(regression).py 2 | # Implementation of GBM algorithm using DecisionTreeRegressor. 
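#
# (Added summary, hedged) The training loop below follows the usual gradient
# boosting recipe for squared-error loss:
#   F_0(x)  = mean(y)
#   r_m     = y - F_{m-1}(x)                  (pseudo-residuals)
#   gamma_m = regression tree fitted to r_m
#   F_m(x)  = F_{m-1}(x) + alpha * gamma_m(x)
# so each new tree corrects the residual error left by the previous ensemble.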
3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/hF-1HHKPxq4 11 | # 12 | import numpy as np 13 | from sklearn.tree import DecisionTreeRegressor 14 | import matplotlib.pyplot as plt 15 | 16 | # Create training data for regression 17 | def nonlinear_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.random() 21 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | 25 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 26 | 27 | # Create training data 28 | x, y = nonlinear_data(n=500, s=0.5) 29 | 30 | n_depth = 3 # tree depth 31 | n_tree = 50 # the number of trees (M) 32 | alpha = 0.05 # learning rate 33 | 34 | # step-1: Initialize model with a constant value. 35 | F0 = y.mean() 36 | 37 | # Training 38 | Fm = F0 39 | models = [] 40 | loss = [] 41 | for m in range(n_tree): 42 | # step-2 (A): Compute so-called pseudo-residuals 43 | residual = y - Fm 44 | 45 | # step-2 (B): Fit a regression tree to the residual 46 | gb_model = DecisionTreeRegressor(max_depth=n_depth) 47 | gb_model.fit(x, residual) 48 | 49 | # step-2 (C): compute gamma (prediction) 50 | gamma = gb_model.predict(x) 51 | 52 | # step-2 (D): Update the model 53 | Fm = Fm + alpha * gamma 54 | 55 | # Store trained tree models 56 | models.append(gb_model) 57 | 58 | # Calculate loss. loss = mean squared error. 59 | loss.append(((y - Fm) ** 2).sum()) 60 | 61 | # step-3: Output Fm(x) – Prediction of test data 62 | y_pred = F0 63 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 64 | for model in models: 65 | y_pred += alpha * model.predict(x_test) 66 | 67 | # Check the loss history 68 | plt.figure(figsize=(6,4)) 69 | plt.plot(loss, c='red') 70 | plt.xlabel('m : iteration') 71 | plt.ylabel('loss: mean squared error') 72 | plt.title('loss history') 73 | plt.show() 74 | 75 | # Visualize the training data and prediction results 76 | def plot_prediction(x, y, x_test, y_pred, title): 77 | plt.figure(figsize=(6,4)) 78 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 79 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 80 | plt.xlim(0, 1) 81 | plt.ylim(0, 7) 82 | plt.legend() 83 | plt.title(title) 84 | plt.show() 85 | 86 | plot_prediction(x, y, x_test, y_pred, 'From scratch') 87 | 88 | # Compare with the results of sklearn’s GradientBoostingRegressor 89 | from sklearn.ensemble import GradientBoostingRegressor 90 | 91 | sk_model = GradientBoostingRegressor(n_estimators=n_tree, 92 | learning_rate=alpha, 93 | max_depth=n_depth) 94 | 95 | sk_model.fit(x, y) # training 96 | y_pred = sk_model.predict(x_test) # prediction 97 | 98 | # Visualize the training data and prediction results 99 | plot_prediction(x, y, x_test, y_pred, 'GradientBoostingRegressor') 100 | 101 | sk_model.estimators_ 102 | -------------------------------------------------------------------------------- /10.GBM/2.SGBM(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-10-03] 2.SGBM(regression).py 2 | # Stochastic Gradient Boosting Method (1999, Friedman) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # 
https://youtu.be/hF-1HHKPxq4 11 | # 12 | import numpy as np 13 | from sklearn.tree import DecisionTreeRegressor 14 | import matplotlib.pyplot as plt 15 | 16 | # Create training data for regression 17 | def nonlinear_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.random() 21 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | 25 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 26 | 27 | # Visualize the training data and prediction results 28 | def plot_prediction(x, y, x_test, y_pred): 29 | plt.figure(figsize=(6,4)) 30 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 31 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 32 | plt.xlim(0, 1) 33 | plt.ylim(0, 7) 34 | plt.legend() 35 | plt.show() 36 | 37 | # Create training data 38 | x, y = nonlinear_data(n=500, s=0.5) 39 | 40 | n_data = x.shape[0] 41 | n_depth = 3 # tree depth (weak learner) 42 | n_tree = 50 # the number of trees (M) 43 | f_rate = 0.5 # rate of sampling 44 | lr = 0.05 # learning rate 45 | 46 | # step-1: Initialize model with a constant value. 47 | F0 = y.mean() 48 | 49 | # Training 50 | Fm = np.repeat(F0, n_data) 51 | models = [] 52 | loss = [] 53 | for m in range(n_tree): 54 | # data sampling without replacement 55 | si = np.random.choice(range(n_data), int(n_data * f_rate), replace=False) 56 | 57 | # step-2 (A): Compute so-called pseudo-residuals 58 | residual = y[si] - Fm[si] 59 | 60 | # step-2 (B): Fit a regression tree to the residual 61 | gb_model = DecisionTreeRegressor(max_depth=n_depth) 62 | gb_model.fit(x[si], residual) 63 | 64 | # step-2 (C): compute gamma (prediction) 65 | gamma = gb_model.predict(x) 66 | 67 | # step-2 (D): Update the model 68 | Fm = Fm + lr * gamma 69 | 70 | # Store trained tree models 71 | models.append(gb_model) 72 | 73 | # Calculate loss. loss = mean squared error. 
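    # (Added note) Strictly, the expression below accumulates the sum of
    # squared errors; dividing by len(y) would give the mean squared error
    # named in the comment above. The shape of the loss curve is the same
    # either way.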
74 | loss.append(((y - Fm) ** 2).sum()) 75 | 76 | # Check the loss history 77 | plt.figure(figsize=(6,4)) 78 | plt.plot(loss, c='red') 79 | plt.xlabel('m : iteration') 80 | plt.ylabel('loss: mean squared error') 81 | plt.title('loss history') 82 | plt.show() 83 | 84 | # step-3: Output Fm(x) - Prediction 85 | y_pred = F0 86 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 87 | for model in models: 88 | y_pred += lr * model.predict(x_test) 89 | 90 | # Visualize the training data and prediction results 91 | plot_prediction(x, y, x_test, y_pred) 92 | 93 | # Compare with the results of sklearn’s GradientBoostingRegressor 94 | from sklearn.ensemble import GradientBoostingRegressor 95 | sk_model = GradientBoostingRegressor(n_estimators=n_tree, 96 | learning_rate=lr, 97 | max_depth=n_depth, 98 | subsample=f_rate) 99 | 100 | sk_model.fit(x, y) # Training 101 | y_pred = sk_model.predict(x_test) # Prediction 102 | 103 | plot_prediction(x, y, x_test, y_pred) -------------------------------------------------------------------------------- /11.xGBoost/1.XGBoost(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-03] 1.XGBoost(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Ms_xxQFrTWc 10 | # 11 | import numpy as np 12 | from MyXGBoostRegressor import MyXGBRegressor 13 | import matplotlib.pyplot as plt 14 | 15 | # Plot the training data and estimated curve 16 | def plot_prediction(x, y, x_test, y_pred): 17 | plt.figure(figsize=(7, 5)) 18 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 19 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 20 | plt.xlim(0, 1) 21 | plt.ylim(0, 7) 22 | plt.legend() 23 | plt.show() 24 | 25 | # Generate the training data 26 | def nonlinear_data(n, s): 27 | rtn_x, rtn_y = [], [] 28 | for i in range(n): 29 | x = np.random.random() 30 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 31 | rtn_x.append(x) 32 | rtn_y.append(y) 33 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 34 | x, y = nonlinear_data(n=500, s=0.5) 35 | 36 | y_mean = y.mean() # initial prediction 37 | n_depth = 3 # tree depth 38 | n_tree = 20 # the number of trees 39 | eta = 0.3 # learning rate 40 | reg_lambda = 1.0 # regularization constant 41 | prune_gamma = 2.0 # pruning constant 42 | 43 | my_model = MyXGBRegressor(n_estimators=n_tree, 44 | max_depth=n_depth, 45 | learning_rate=eta, 46 | prune_gamma=prune_gamma, 47 | reg_lambda=reg_lambda, 48 | base_score = y_mean) 49 | loss = my_model.fit(x, y) 50 | 51 | # Check the loss history 52 | plt.figure(figsize=(5,4)) 53 | plt.plot(loss, c='red') 54 | plt.xlabel('m : iteration') 55 | plt.ylabel('loss: mean squared error') 56 | plt.title('loss history') 57 | plt.show() 58 | 59 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 60 | y_pred = my_model.predict(x_test) 61 | 62 | # Plot the training data and estimated curve 63 | plot_prediction(x, y, x_test, y_pred) 64 | 65 | # XGBRegressor 결과와 비교한다. 
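# (Translation of the Korean comment above: "Compare with the results of
# XGBRegressor.") The block below fits xgboost's own XGBRegressor with the
# same hyperparameters (n_estimators, max_depth, learning_rate, gamma,
# reg_lambda, base_score) so the two prediction curves can be compared
# directly.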
66 | # https://xgboost.readthedocs.io/en/stable/python/python_api.html 67 | # #module-xgboost.sklearn 68 | # --------------------------------------------------------------- 69 | from xgboost import XGBRegressor 70 | 71 | xg_model = XGBRegressor(n_estimators=n_tree, 72 | max_depth=n_depth, 73 | learning_rate=eta, 74 | gamma=prune_gamma, 75 | reg_lambda=reg_lambda, 76 | base_score = y_mean) 77 | xg_model.fit(x, y) 78 | y_pred = xg_model.predict(x_test) # predict the test data 79 | 80 | # Plot the training data and estimated curve 81 | plot_prediction(x, y, x_test, y_pred) 82 | 83 | -------------------------------------------------------------------------------- /11.xGBoost/2.XGBoost(classification).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-06] 2.XGBoost(classification).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/oKLBon15bTc 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | from MyXGBoostClassifier import MyXGBClassifier 14 | import matplotlib.pyplot as plt 15 | 16 | # Plot the training and test data, and the prediction result 17 | def plot_prediction(x, y, x_test, y_pred): 18 | plt.figure(figsize=(5,5)) 19 | color = ['red' if a == 1 else 'blue' for a in y_pred] 20 | plt.scatter(x_test[:, 0], x_test[:, 1], s=100, c=color, 21 | alpha=0.3) 22 | plt.scatter(x[:, 0], x[:, 1], s=80, c='black') 23 | plt.scatter(x[:, 0], x[:, 1], s=10, c='yellow') 24 | plt.xlim(-0.5, 1.0) 25 | plt.ylim(-0.5, 1.0) 26 | plt.show() 27 | 28 | # Generate the training data 29 | x, y = make_blobs(n_samples=200, n_features=2, 30 | centers=[[0., 0.], [0.5, 0.5]], 31 | cluster_std=0.18, center_box=(-1., 1.)) 32 | 33 | # y_init = y.mean() # initial prediction 34 | y_init = np.repeat(y.mean(), y.shape[0]) 35 | n_depth = 3 # # tree depth 36 | n_tree = 20 # the number of trees 37 | eta = 0.3 # learning rate 38 | reg_lambda = 0.1 # regularization constant 39 | prune_gamma = 0.01 # pruning constant 40 | 41 | my_model = MyXGBClassifier(n_estimators=n_tree, 42 | max_depth=n_depth, 43 | learning_rate=eta, 44 | prune_gamma = prune_gamma, 45 | reg_lambda=reg_lambda, 46 | base_score = y_init) 47 | loss = my_model.fit(x, y) 48 | 49 | # Check the loss history 50 | plt.figure(figsize=(5,4)) 51 | plt.plot(loss, c='red') 52 | plt.xlabel('m : iteration') 53 | plt.ylabel('loss: binary cross entropy') 54 | plt.title('loss history') 55 | plt.show() 56 | 57 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 58 | y_pred = my_model.predict(x_test) 59 | 60 | # Plot the training and test data, and the prediction result 61 | plot_prediction(x, y, x_test, y_pred) 62 | 63 | # Compare with the results from XGBRegressor library. 
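# (Added clarification) For this classification example the library model used
# below is XGBClassifier with objective='binary:logistic', the classification
# counterpart of the XGBRegressor mentioned above.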
64 | # https://xgboost.readthedocs.io/en/stable/python/python_api.html 65 | # #module-xgboost.sklearn 66 | # --------------------------------------------------------------- 67 | from xgboost import XGBClassifier 68 | 69 | xg_model = XGBClassifier(objective='binary:logistic', 70 | tree_method = 'exact', 71 | n_estimators=n_tree, 72 | max_depth=n_depth, 73 | learning_rate=eta, 74 | gamma=prune_gamma, 75 | reg_lambda=reg_lambda, 76 | base_score=y_init) 77 | 78 | xg_model.fit(x, y) 79 | 80 | # Predict the target class of the test data and visualize the result 81 | y_pred = xg_model.predict(x_test) 82 | plot_prediction(x, y, x_test, y_pred) 83 | 84 | # plt.figure(figsize=(5,5)) 85 | # color = ['red' if a == 1 else 'blue' for a in y] 86 | # plt.scatter(x[:, 0], x[:, 1], s=80, alpha=0.5, c=color) 87 | # plt.xlim(-0.5, 1.0) 88 | # plt.ylim(-0.5, 1.0) 89 | # plt.show() -------------------------------------------------------------------------------- /11.xGBoost/3.appoximation(1).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-07] 3.approximation(1).py 2 | # 논문 [1] Tianqi Chen et, al., 2016, XGBoost: A Scalable Tree Boosting System 3 | # 3. SPLIT FINDING ALGORITHMS 4 | # 3.2 Approximate Algorithm 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/AQOPXlxXF_0 13 | # 14 | import numpy as np 15 | from MyXGBoostRegressor import MyXGBRegressor 16 | import time 17 | 18 | # Create training data 19 | def nonlinear_data(n, s): 20 | rtn_x, rtn_y = [], [] 21 | for i in range(n): 22 | x = np.random.random() 23 | y = 2.0 * np.sin(2.0 * np.pi * x) + \ 24 | np.random.normal(0.0, s) + 3.0 25 | rtn_x.append(x) 26 | rtn_y.append(y) 27 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 28 | x, y = nonlinear_data(n=50000, s=0.5) 29 | 30 | # 1. Exact Greedy Algorithm (EGA) 31 | # ------------------------------- 32 | start_time = time.time() 33 | my_model = MyXGBRegressor(n_estimators = 1, 34 | max_depth = 1, 35 | base_score = y.mean()) 36 | 37 | my_model.fit(x, y) 38 | e = my_model.models[0].estimator2 39 | 40 | print('\nExact greedy algorithm:') 41 | print('split point =', np.round(e['split_point'], 3)) 42 | print('gain =', np.round(e['gain'], 3)) 43 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 44 | 45 | # 2.Approximate Algorithm (AA). 46 | # ------------------------------- 47 | from multiprocessing.pool import Pool 48 | def find_split_point(x, y): 49 | # MyXGBRegressor is a class implemented with EGA. 50 | # To implement this properly, you need to implement the 51 | # Approximate Algorithm inside the MyXGBRegressor. 52 | my_model = MyXGBRegressor(n_estimators = 1, 53 | max_depth = 1, # root node만 확인함. 54 | base_score = y.mean()) 55 | 56 | my_model.fit(x, y) 57 | e = my_model.models[0].estimator2 58 | return [e['split_point'], e['gain']] 59 | 60 | # Divide the data into five parts and allocate 20% of the data to 61 | # each part. 
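#
# (Added note, hedged) np.percentile with [20, 40, 60, 80, 100] returns the
# right-hand edges of five equal-frequency buckets; since x is drawn from a
# uniform distribution on [0, 1), the cut points land roughly at
# [0.2, 0.4, 0.6, 0.8, 1.0]. Each bucket is then handed to a separate process
# to find a local best split, as a simplified illustration of the approximate
# algorithm.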
62 | c_point = np.percentile(x, [20, 40, 60, 80, 100]) 63 | 64 | # maps the data into buckets split by c_point 65 | l_bound = -np.inf 66 | x_block, y_block = [], [] 67 | for p in c_point: 68 | idx = np.where(np.logical_and(x > l_bound, x <= p))[0] 69 | x_block.append(x[idx]) 70 | y_block.append(y[idx]) 71 | l_bound = p 72 | 73 | start_time = time.time() 74 | mp = Pool(5) 75 | args = [[ax, ay] for ax, ay in zip(x_block, y_block)] 76 | ret = mp.starmap_async(find_split_point, args) 77 | mp.close() 78 | mp.join() 79 | 80 | print('\nApproximate Algorithm:') 81 | print('split_points =', np.array(ret.get())[:, 0].round(3)) 82 | print('gain =', np.array(ret.get())[:, 1].round(2)) 83 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 84 | print('number of data in blocks =', [len(a) for a in x_block]) -------------------------------------------------------------------------------- /11.xGBoost/4.appoximation(2).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-08] 4.approximation(2).py 2 | # Tianqi Chen et, al., 2016, XGBoost: A Scalable Tree Boosting System 3 | # 3. SPLIT FINDING ALGORITHMS 4 | # 3.3 Weighted Quantile Sketch 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/ejUvX1L-yzE 13 | # 14 | import numpy as np 15 | from sklearn.datasets import make_blobs 16 | from xgboost import XGBClassifier 17 | from sklearn.model_selection import train_test_split 18 | import time 19 | 20 | # Create a simple training dataset 21 | x, y = make_blobs(n_samples=500000, n_features=2, 22 | centers=[[0., 0.], [0.5, 0.5]], 23 | cluster_std=0.2, center_box=(-1., 1.)) 24 | 25 | x_train, x_test, y_train, y_test = train_test_split(x, y) 26 | 27 | TREES = 200 # the number of trees 28 | DEPTH = 5 # the depth of tree 29 | ETA = 0.1 # learning rate, eta 30 | LAMB = 1.0 # regularization constant 31 | GAMMA = 0.1 # pruning constant 32 | EPS = 0.03 # epsilon for approximate and weighted quantile sketch 33 | 34 | # 1. Exact Greedy Algorithm (EGA) 35 | # ------------------------------- 36 | start_time = time.time() 37 | model = XGBClassifier(n_estimators = TREES, 38 | max_depth = DEPTH, 39 | learning_rate = ETA, # η 40 | gamma = GAMMA, # γ for pruning 41 | reg_lambda = LAMB, # λ for regularization 42 | base_score = 0.5, # initial prediction value 43 | tree_method = 'exact') # exact greedy algorithm 44 | 45 | model.fit(x_train, y_train) 46 | acc = model.score(x_test, y_test) 47 | 48 | print('\nExact greedy algorithm:') 49 | print('Accuracy =', np.round(acc, 3)) 50 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 51 | 52 | # 2.Approximate Algorithm (AA). 
53 | # ------------------------------- 54 | start_time = time.time() 55 | model = XGBClassifier(n_estimators = TREES, 56 | max_depth = DEPTH, 57 | learning_rate = ETA, # η 58 | gamma = GAMMA, # γ for pruning 59 | reg_lambda = LAMB, # λ for regularization 60 | base_score = 0.5, # initial prediction value 61 | max_bin = int(1/EPS), # sketch_eps is replaced by max_bin 62 | tree_method = 'approx') # weighted quantile sketch 63 | 64 | model.fit(x_train, y_train) 65 | acc = model.score(x_test, y_test) 66 | 67 | print('\nWeighted Quantile Sketch:') 68 | print('Accuracy =', np.round(acc, 3)) 69 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 70 | 71 | # tree_method: 72 | # 73 | # https://xgboost.readthedocs.io/en/stable/parameter.html 74 | # auto: Same as the hist tree method. 75 | # exact: Exact greedy algorithm. Enumerates all split candidates. 76 | # approx: Approximate greedy algorithm using quantile sketch and gradient histogram. 77 | # hist: Faster histogram optimized approximate greedy algorithm. 78 | # 79 | # https://xgboost.readthedocs.io/en/latest/treemethod.html 80 | # approx tree method: An approximation tree method described in 81 | # reference paper. It runs sketching before building each tree using 82 | # all the rows (rows belonging to the root). Hessian is used as weights 83 | # during sketch. The algorithm can be accessed by setting tree_method 84 | # to approx. 85 | 86 | # max_bin: 87 | # 88 | # https://github.com/dmlc/xgboost/issues/8063 89 | # Also, the parameter sketch_eps is replaced by max_bin for aligning 90 | # with hist, the old default for max_bin translated from sketch_eps 91 | # was around 63 while the rewritten one is 256, which means the new 92 | # implementation builds larger histogram. 93 | 94 | # import matplotlib.pyplot as plt 95 | # x, y = make_blobs(n_samples=10000, n_features=2, 96 | # centers=[[0., 0.], [0.5, 0.5]], 97 | # cluster_std=0.2, center_box=(-1., 1.)) 98 | 99 | # plt.figure(figsize=(5,5)) 100 | # color = ['red' if a == 1 else 'blue' for a in y] 101 | # plt.scatter(x[:, 0], x[:, 1], s=1, alpha=0.8, c=color) 102 | # # plt.xlim(-0.5, 1.0) 103 | # # plt.ylim(-0.5, 1.0) 104 | # plt.show() -------------------------------------------------------------------------------- /11.xGBoost/5.santander.py: -------------------------------------------------------------------------------- 1 | # [MXML-11-09] 5.santander.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/fALcIVr6zjY 10 | # 11 | import pandas as pd 12 | import numpy as np 13 | from xgboost import XGBClassifier 14 | from sklearn.metrics import roc_auc_score 15 | from sklearn.model_selection import train_test_split 16 | import matplotlib.pyplot as plt 17 | 18 | # Read the Santander Customer Satisfaction Dataset. 19 | # df.shape = (76020, 371) 20 | df = pd.read_csv("data/santander.csv", encoding='latin-1') 21 | 22 | # Replace the values of the 'var3' feature containing -99999999 with 2 23 | # and drop the 'ID' feature. 24 | df['var3'].replace(-999999, 2, inplace=True) 25 | df.drop('ID', axis = 1, inplace=True) 26 | 27 | # Separate features and label from the dataset 28 | # and generate training and test data. 
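#
# (Added aside, a hedged sketch not in the original script) The TARGET label
# in this dataset is known to be highly imbalanced, which is one reason
# ROC-AUC rather than plain accuracy is used as the metric below; a quick way
# to inspect the class ratio:
print(df['TARGET'].value_counts(normalize=True))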
29 | x = df.drop('TARGET', axis=1) 30 | y = df['TARGET'] 31 | x_train, x_test, y_train, y_test = train_test_split(x, y) 32 | 33 | TREES = 200 # the number of trees 34 | DEPTH = 5 # the depth of tree 35 | ETA = 0.1 # learning rate, eta 36 | LAMB = 1.0 # regularization constant 37 | GAMMA = 0.1 # pruning constant 38 | EPS = 0.03 # epsilon for approximate and weighted quantile sketch 39 | 40 | # Create an XGBoost classification model and fit it to the training data 41 | model = XGBClassifier(n_estimators = TREES, 42 | max_depth = DEPTH, 43 | learning_rate = ETA, # η 44 | gamma = GAMMA, # γ for pruning 45 | reg_lambda = LAMB, # λ for regularization 46 | base_score = 0.5, # initial prediction value 47 | missing = 0.0, # for sparsity-aware 48 | subsample = 0.5, # Subsample ratio of the training instance 49 | colsample_bynode = 0.5, # Subsample ratio of columns for each split 50 | max_bin = int(1/EPS), # sketch_eps is replaced by max_bin 51 | tree_method = 'approx') # weighted quantile sketch 52 | 53 | model.fit(x_train, y_train) 54 | 55 | # Predict the test data and measure the performance with ROC-AUC. 56 | y_prob = model.predict_proba(x_test)[:, 1] 57 | auc = roc_auc_score(y_test, y_prob) 58 | print('\nROC-AUC = {:.4f}'.format(auc)) 59 | 60 | # colsample_bytree (Optional[float]) – Subsample ratio of columns when constructing 61 | # each tree. 62 | # colsample_bylevel (Optional[float]) – Subsample ratio of columns for each level. 63 | 64 | # colsample_bynode (Optional[float]) – Subsample ratio of columns for each split. 65 | 66 | -------------------------------------------------------------------------------- /11.xGBoost/data/santander.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meanxai/machine_learning/fba47e91cc7449eb5d7ea8b7ec1fb0fd616ebd71/11.xGBoost/data/santander.zip -------------------------------------------------------------------------------- /12.LGBM/1.histogram_based.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-01] 1.histogram-based.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/N39NE4Nj6vc 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | from multiprocessing.pool import Pool 14 | import matplotlib.pyplot as plt 15 | 16 | # Create a training data set. 17 | x, y = make_blobs(n_samples=300, n_features=2, 18 | centers=[[0., 0.], [0.5, 0.3]], 19 | cluster_std=0.15, center_box=(-1., 1.)) 20 | 21 | plt.figure(figsize=(4,4)) 22 | color = [['red', 'blue'][a] for a in y] 23 | plt.scatter(x[:,0], x[:,1], c=color, alpha=0.3) 24 | plt.show() 25 | 26 | def find_local_split_point(f, s_point): 27 | GL = HL = 0.0 28 | l_bound = -np.inf # lower left bound 29 | max_gain = -np.inf 30 | 31 | for j in s_point: 32 | # split the parent node into the left and right nodes. 33 | left = np.where(np.logical_and(f > l_bound, f <= j))[0] 34 | right = np.where(f > j)[0] 35 | 36 | # After splitting the parent node, calculate the scores of its children. 37 | GL += g[left].sum() 38 | HL += (h[left] * (1. - h[left])).sum() 39 | GR = G - GL 40 | HR = H - HL 41 | 42 | # Calculate the gain for this split 43 | gain = (GL ** 2)/(HL + r) + (GR ** 2)/(HR + r) - p_score 44 | 45 | # Find the maximum gain. 
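        # (Added note) The gain above is the standard second-order split gain,
        #   gain = GL^2/(HL+r) + GR^2/(HR+r) - G^2/(H+r),
        # evaluated only at the histogram bin edges in s_point rather than at
        # every distinct feature value, which is what makes the
        # histogram-based method fast.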
46 | if gain > max_gain: 47 | max_gain = gain 48 | b_point = j # best split point 49 | l_bound = j 50 | 51 | return b_point, max_gain 52 | 53 | y0 = np.ones(shape=y.shape) * 0.5 # initial prediction 54 | g = -(y - y0) # negative residual. 55 | h = y0 * (1. - y0) # Hessian. 56 | 57 | # Create a histogram of the parent node for each feature 58 | n_bin = 30 # the number of bins 59 | g0_parent, f0_bin = np.histogram(x[:, 0], n_bin, weights=g) # feature 0 60 | g1_parent, f1_bin = np.histogram(x[:, 1], n_bin, weights=g) # feature 1 61 | 62 | # Find the best split point of each feature 63 | G = g.sum() 64 | H = h.sum() 65 | r = 0.0 66 | gamma = 0.0 67 | p_score = (G ** 2) / (H + r) # parent's score before splitting the node 68 | 69 | # Find global best split point through parallel processing 70 | # vertical partitioning method is used. 71 | mp = Pool(2) 72 | args = [[x[:, 0], f0_bin], [x[:, 1], f1_bin]] 73 | ret = mp.starmap_async(find_local_split_point, args) 74 | mp.close() 75 | mp.join() 76 | 77 | results = ret.get() 78 | p1 = results[0][0]; p2 = results[1][0] 79 | gain1 = results[0][1]; gain2 = results[1][1] 80 | 81 | if gain1 > gain2: 82 | b_fid = 0 83 | b_point = p1 84 | else: 85 | b_fid = 1 86 | b_point = p2 87 | 88 | print('\nbest feature id =', b_fid) 89 | print('best split point =', b_point.round(3)) 90 | 91 | -------------------------------------------------------------------------------- /12.LGBM/2.goss.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-02] 2.goss.py 2 | # Implement GOSS algorithm presented in the paper. 3 | # Add GOSS feature to XGBoost. 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/APZyWo9hIj0 12 | # 13 | import numpy as np 14 | from sklearn.datasets import make_blobs 15 | import matplotlib.pyplot as plt 16 | from xgboost import XGBClassifier 17 | from lightgbm import LGBMClassifier 18 | 19 | # Create a training dataset 20 | x, y = make_blobs(n_samples=10000, n_features=2, 21 | centers=[[0., 0.], [0.5, 0.5]], 22 | cluster_std=0.25, center_box=(-1., 1.)) 23 | 24 | plt.figure(figsize=(4,4)) 25 | color = [['red', 'blue'][a] for a in y] 26 | plt.scatter(x[:,0], x[:,1], s=1, c=color, alpha=0.5) 27 | plt.show() 28 | 29 | n_boost = 50 # the number of boosting 30 | eta = 0.3 # learning rate 31 | max_depth = 2 # max_depth of a tree 32 | 33 | def base_model(x, y, weights, F0): 34 | model = XGBClassifier(n_estimators=1, # just 1 round 35 | learning_rate=eta, 36 | max_depth=max_depth, 37 | max_bin=20, tree_method='hist', 38 | base_score=None) 39 | 40 | # g and h are multiplied by their weights. 41 | model.fit(x, y, sample_weight = weights, base_margin=F0) 42 | return model 43 | 44 | # Algorithm 2: Gradient-based One-Side Sampling (GOSS) 45 | a = 0.3 # sampling ratio of large gradient data 46 | b = 0.2 # sampling ratio of small gradient data 47 | fact = (1. - a) / b 48 | topN = int(a * x.shape[0]) 49 | randN = int(b * x.shape[0]) 50 | models = [] 51 | Fm = np.zeros(y.shape) # initial prediction in log(odds) 52 | 53 | for i in range(n_boost): 54 | y_prev = 1. / (1. + np.exp(-Fm)) 55 | g = -(y - y_prev) # negative residual. first order gradients 56 | w = np.ones(shape=x.shape[0]) # initial sample weights. 
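    # (Added note) GOSS keeps the topN samples with the largest |gradient| and
    # a random randN of the remaining ones, then scales the weights of the
    # sampled small-gradient rows by fact = (1 - a) / b so their total
    # contribution to the gradient statistics stays approximately unbiased.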
57 | sorted_g = np.argsort(np.abs(g))[::-1] 58 | topSet = sorted_g[:topN] 59 | randSet = np.random.choice(sorted_g[topN:], size=randN, replace=False) 60 | usedSet = np.hstack([topSet, randSet]) 61 | w[randSet] *= fact # Assign weight f act to the small gradient data 62 | 63 | newModel = base_model(x[usedSet], y[usedSet], w[usedSet], F0=Fm[usedSet]) 64 | Fm += newModel.predict(x, output_margin=True) 65 | models.append(newModel) 66 | 67 | # Create a test dataset and predict the class of test data 68 | x_test = np.random.uniform(-1.0, 1.5, (1000, 2)) 69 | 70 | test_Fm = np.zeros(x_test.shape[0]) 71 | for model in models: 72 | test_Fm += model.predict(x_test, output_margin=True) 73 | 74 | y_prob = 1. / (1. + np.exp(-test_Fm)) # log(odds) --> probability 75 | y_pred = (y_prob > 0.5) * 1 76 | 77 | # Check the prediction results and the decision boundary. 78 | def check_result(x, y, x_test, y_pred, title): 79 | plt.figure(figsize=(4,4)) 80 | color2 = [['red', 'blue'][a] for a in y_pred] 81 | plt.scatter(x_test[:, 0], x_test[:, 1], s=50, c=color2, 82 | alpha=0.3) 83 | 84 | # Only part of the training data is drawn. 85 | plt.scatter(x[:300, 0], x[:300, 1], s=50, c='black') 86 | plt.scatter(x[:300, 0], x[:300, 1], s=5, c='yellow') 87 | plt.xlim(-1.0, 1.5) 88 | plt.ylim(-1.0, 1.5) 89 | plt.title(title) 90 | plt.show() 91 | 92 | check_result(x, y, x_test, y_pred, "Result of the code from scratch") 93 | 94 | # Use LGBMClassifier library and compare the result from above code 95 | model = LGBMClassifier(n_estimators = 20, 96 | max_depth=max_depth, 97 | learning_rate=eta, 98 | max_bins=20, 99 | boosting_type="goss", 100 | top_rate=0.3, 101 | other_rate=0.2) 102 | 103 | model.fit(x, y) 104 | y_pred = model.predict(x_test) 105 | check_result(x, y, x_test, y_pred, "Result of LGBMClassifier") 106 | -------------------------------------------------------------------------------- /12.LGBM/3.greedy_bundling.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-03] 3.greedy_bundling.py 2 | # Algorithm 3: Greedy Bundling 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/Y-IvfsjmqOQ 11 | # 12 | import numpy as np 13 | 14 | x = np.array([[1, 1, 0, 0, 1], 15 | [0, 0, 1, 1, 1], 16 | [1, 2, 0, 0, 2], 17 | [0, 0, 2, 3, 1], 18 | [2, 1, 0, 0, 3], 19 | [3, 3, 0, 0, 1], 20 | [0, 0, 3, 0, 2], 21 | [1, 2, 3, 4, 3], 22 | [1, 0, 1, 0, 0], 23 | [2, 3, 0, 0, 2]]) 24 | 25 | # Create a conflict count matrix 26 | n_row = x.shape[0] 27 | n_col = x.shape[1] 28 | conflictCnt = np.zeros((n_col, n_col)) 29 | 30 | for i in range(n_col): 31 | for j in range(i+1, n_col): 32 | # Count the number of conflicts. 
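        # (Added note) Two features "conflict" on a row when both are nonzero
        # there, so x[:, i] * x[:, j] > 0 marks exactly those rows; features
        # with few conflicts are nearly mutually exclusive and are candidates
        # for sharing a bundle.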
33 | conflictCnt[i, j] = len(np.where(x[:, i] * x[:, j] > 0)[0]) 34 | 35 | # Copy upper triangle to lower triangle 36 | # iu = (array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]), 37 | # array([1, 2, 3, 4, 2, 3, 4, 3, 4, 4])) 38 | iu = np.triu_indices(n_col, 1) 39 | il = (iu[1], iu[0]) 40 | conflictCnt[il] = conflictCnt[iu] 41 | 42 | # Create a search order matrix 43 | degree = conflictCnt.sum(axis=0) 44 | searchOrder = np.argsort(degree)[::-1] # descending order 45 | 46 | # ---------------------------- 47 | # Algorithm 3: Greedy Bundling 48 | # ---------------------------- 49 | K = 1 # max conflict count 50 | bundles = [] 51 | bundlesConflict = [] 52 | for i in searchOrder: # i = [4, 0, 1, 2, 3] 53 | needNew = True 54 | for j in range(len(bundles)): 55 | cnt = conflictCnt[bundles[j][-1], i] 56 | # Only edges less than or equal to K are considered. 57 | if cnt + bundlesConflict[j] <= K: 58 | # Add the feature number i to the j-th bundle. 59 | bundles[j].append(i) 60 | 61 | # Update the number of conflicts of features in the 62 | # j-th bundle. 63 | bundlesConflict[j] += cnt 64 | needNew = False 65 | break 66 | 67 | if needNew: 68 | bundles.append([i]) 69 | bundlesConflict.append(0.) 70 | 71 | print('\nconflictCnt:\n', conflictCnt) 72 | print('\nsearchOrder:\n', searchOrder) 73 | 74 | print('\nbundles:', bundles) 75 | print('bundlesConflict:', bundlesConflict) 76 | 77 | # conflictCnt: 78 | # 0 1 2 3 4 79 | # 0 [0., 6., 2., 1., 6.] 80 | # 1 [6., 0., 1., 1., 6.] 81 | # 2 [2., 1., 0., 3., 4.] 82 | # 3 [1., 1., 3., 0., 3.] 83 | # 4 [6., 6., 4., 3., 0.] 84 | 85 | # searchOrder 86 | # array([4, 0, 1, 2, 3]) 87 | # 88 | # bundles: 89 | # j=0 j=1 j=2 ← bundle number 90 | # +--↓-----↓-------↓-----+ 91 | # | [4] [0, 3] [1, 2] | 92 | # +----------------------+ 93 | # 94 | # bundlesConflict 95 | # +----------------------+ 96 | # | 0 1 1 | 97 | # +----------------------+ 98 | -------------------------------------------------------------------------------- /12.LGBM/4.merge_features.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-04] 4.merge_features.py 2 | # Implementation of Algorithm 4: Merge Exclusive Features 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/orSRRtWtPwE 11 | # 12 | import numpy as np 13 | 14 | x = np.array([[1, 1, 0, 0, 1], 15 | [0, 0, 1, 1, 1], 16 | [1, 2, 0, 0, 2], 17 | [0, 0, 2, 3, 1], 18 | [2, 1, 0, 0, 3], 19 | [3, 3, 0, 0, 1], 20 | [0, 0, 3, 0, 2], 21 | [1, 2, 3, 4, 3], # <-- conflict here 22 | [1, 0, 1, 0, 0], 23 | [2, 3, 0, 0, 2]]) 24 | 25 | # Algorithm 4: Merge Exclusive Features 26 | def merge_features(numData, F): 27 | binRanges = [0] 28 | totalBin = 0 29 | for f in F: 30 | totalBin += np.max(f) 31 | binRanges.append(totalBin) 32 | 33 | newBin = np.zeros(numData, dtype=int) 34 | for i in range(numData): 35 | newBin[i] = 0 36 | for j in range(len(F)): 37 | if F[j][i] != 0: 38 | newBin[i] = F[j][i] + binRanges[j] 39 | return newBin, binRanges 40 | 41 | # modified Algorithm 4 (skip-zero-version) 42 | def merge_features2(numData, F): 43 | binRanges = [0] 44 | totalBin = 0 45 | for f in F: 46 | totalBin += np.max(f) 47 | binRanges.append(totalBin) 48 | 49 | # initialize newBin with F[0] to skip zero in binRanges[0] 50 | newBin = F[0] 51 | for i in range(numData): 52 | for j in range(1, len(F)): 53 | if F[j][i] != 0: 54 | newBin[i] = F[j][i] + 
binRanges[j] 55 | return newBin, binRanges 56 | 57 | bundles = [[4], [0, 3], [1, 2]] # The result of Greedy Bundling 58 | 59 | F = [x[:, i] for i in bundles[1]] 60 | newBin, binRanges = merge_features(x.shape[0], F) 61 | print('\nnewBin:', newBin) 62 | print('binRanges:', binRanges) 63 | 64 | newBin, binRanges = merge_features2(x.shape[0], F) 65 | print('\nnewBin:', newBin) 66 | print('binRanges:', binRanges) 67 | -------------------------------------------------------------------------------- /12.LGBM/5.efb_onehot.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-05] 5.efb_onehot.py 2 | # Merge one-hot encoded features using EFB 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/NqpkYja5g2Y 11 | # 12 | import numpy as np 13 | from sklearn.preprocessing import OneHotEncoder 14 | 15 | # Algorithm 3: Greedy Bundling algorithm 16 | def greedy_bundling(x, K): 17 | # Create a conflict count matrix 18 | n_row = x.shape[0] 19 | n_col = x.shape[1] 20 | conflictCnt = np.zeros((n_col, n_col)) 21 | 22 | for i in range(n_col): 23 | for j in range(i+1, n_col): 24 | # Count the number of conflicts. 25 | conflictCnt[i, j] = len(np.where(x[:, i] * x[:, j] > 0)[0]) 26 | 27 | # Copy upper triangle to lower triangle 28 | iu = np.triu_indices(n_col, 1) 29 | il = (iu[1], iu[0]) 30 | conflictCnt[il] = conflictCnt[iu] 31 | 32 | # Create a search order matrix 33 | degree = conflictCnt.sum(axis=0) 34 | searchOrder = np.argsort(degree)[::-1] # descending order 35 | 36 | bundles = [] 37 | bundlesConflict = [] 38 | for i in searchOrder: 39 | needNew = True 40 | for j in range(len(bundles)): 41 | cnt = conflictCnt[bundles[j][-1], i] 42 | if cnt + bundlesConflict[j] <= K: 43 | bundles[j].append(i) 44 | bundlesConflict[j] += cnt 45 | needNew = False 46 | break 47 | 48 | if needNew: 49 | bundles.append([i]) 50 | bundlesConflict.append(0.) 51 | return bundles 52 | 53 | # Algorithm 4: Merge Exclusive Features (skip-zero-version) 54 | def merge_features(numData, F): 55 | binRanges = [0] 56 | totalBin = 0 57 | for f in F: 58 | totalBin += np.max(f) 59 | binRanges.append(totalBin) 60 | 61 | newBin = F[0] # initialize newBin to F[0] 62 | for i in range(numData): 63 | for j in range(1, len(F)): 64 | if F[j][i] != 0: 65 | newBin[i] = F[j][i] + binRanges[j] 66 | return newBin, binRanges 67 | 68 | # Generate random data and perform one-hot encoding. 69 | n_samples = 100 70 | n_features = 4 71 | x = np.random.randint(low=0, high=4, size=(n_samples, n_features)) 72 | enc = OneHotEncoder() 73 | x_ohe = enc.fit_transform(x).toarray() 74 | 75 | print('Original features [:5]:'); print(x[:5]) 76 | print('\nOne-hot encoding [:5]:'); print(x_ohe[:5]) 77 | 78 | # Find bundles 79 | bundles = greedy_bundling(x_ohe, K=1) 80 | 81 | # If we know the bundles exactly, like this, 82 | # bundles = [[0,1,2,3], [4,5,6,7], [8,9,10,11], [12,13,14,15]] 83 | # we can get the original features from the merged features. 
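# Side note: a minimal sketch of that inverse mapping, assuming each bundle came out in
# index order (e.g. [0, 1, 2, 3]). After the skip-zero merge, one-hot column j of a
# bundle maps to merged value j + 1, so the original category is the merged value minus
# one (the array below is illustrative only):
_merged = np.array([3, 1, 4, 2])
assert (_merged - 1 == np.array([2, 0, 3, 1])).all()   # recovered categories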
84 | 85 | print('\nbundles:', bundles) 86 | # [[14, 12, 15, 13], [10, 8, 11, 9], [5, 4, 6, 7], [3, 2, 1, 0]] 87 | 88 | # Merge one-hot encoded features 89 | x_efb = np.zeros(shape=x.shape).astype('int') 90 | for i, bundle in enumerate(bundles): 91 | F = [x_ohe[:, i] for i in bundle] 92 | newBin, binRanges = merge_features(x_ohe.shape[0], F) 93 | x_efb[:, i] = np.array(newBin) - 1 94 | 95 | print('\nOriginal features [:5]:'); print(x[:5]) 96 | print('\nMerged features [:5]:'); print(x_efb[:5]) 97 | 98 | -------------------------------------------------------------------------------- /12.LGBM/6.santander.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-05] 6.santander.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/NqpkYja5g2Y 10 | # 11 | import pandas as pd 12 | from lightgbm import LGBMClassifier 13 | from xgboost import XGBClassifier 14 | from sklearn.metrics import roc_auc_score, roc_curve 15 | from sklearn.model_selection import train_test_split 16 | import matplotlib.pyplot as plt 17 | import time 18 | 19 | # Read the Santander Customer Satisfaction Dataset. 20 | # df.shape = (76020, 371) 21 | df = pd.read_csv("data/santander.csv", encoding='latin-1') 22 | 23 | # Replace the values of the 'var3' feature containing -99999999 24 | # with 2 and drop the 'ID' feature. 25 | df['var3'].replace(-999999, 2, inplace=True) 26 | df.drop('ID', axis = 1, inplace=True) 27 | 28 | # Separate features and label from the dataset 29 | # and generate training and test data. 30 | x_feat = df.drop('TARGET', axis=1) 31 | y_target = df['TARGET'] 32 | x_train, x_test, y_train, y_test = train_test_split(x_feat, y_target) 33 | 34 | # 1. XGBoost 35 | # Create an XGBoost classification model and fit it to the training data 36 | start_time = time.time() 37 | model = XGBClassifier(n_estimators = 200, 38 | max_depth = 5, 39 | learning_rate = 0.1, # η 40 | gamma = 0.1, # γ for pruning 41 | reg_lambda = 1.0, # λ for regularization 42 | base_score = 0.5, # initial prediction value 43 | subsample = 0.5, # Subsample ratio of the training instance 44 | colsample_bynode = 0.5, # Subsample ratio of columns for each split 45 | max_bin = int(1/0.03), # sketch_eps is replaced by max_bin 46 | tree_method = 'approx') # weighted quantile sketch 47 | 48 | model.fit(x_train, y_train) 49 | 50 | # Predict the test data and measure the performance with ROC-AUC. 51 | y_prob = model.predict_proba(x_test)[:, 1] 52 | auc = roc_auc_score(y_test, y_prob) 53 | 54 | print('\nXGBoost results:') 55 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 56 | print('ROC-AUC = {:.4f}'.format(auc)) 57 | 58 | # 2. LightGBM 59 | # Create a LightGBM model 60 | start_time = time.time() 61 | model = LGBMClassifier(n_estimators = 200, 62 | max_depth = 5, 63 | learning_rate = 0.1, 64 | boosting_type="goss", # default: gbdt - traditional gradient based decision tree 65 | top_rate=0.3, 66 | other_rate=0.2, 67 | enable_bundle=True, # default: True. enable EFB 68 | is_unbalance = True) 69 | 70 | # training 71 | model.fit(x_train, y_train) 72 | 73 | # Predict the test data and measure the performance with AUC. 
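# Side note: the ROC-AUC reported here is the area under the ROC curve; a small
# self-contained sanity check with illustrative toy arrays:
import numpy as np
_yt = np.array([0, 0, 1, 1])
_ys = np.array([0.1, 0.4, 0.35, 0.8])
_fpr, _tpr, _ = roc_curve(_yt, _ys)
assert abs(roc_auc_score(_yt, _ys) - np.trapz(_tpr, _fpr)) < 1e-12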
74 | y_pred = model.predict_proba(x_test)[:, 1] 75 | auc = roc_auc_score(y_test, y_pred) 76 | 77 | print('\nLightGBM results:') 78 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 79 | print("ROC AUC = {0:.4f}".format(auc)) 80 | 81 | # Draw the ROC curve 82 | fprs, tprs, thresholds = roc_curve(y_test, y_pred) 83 | 84 | plt.plot(fprs, tprs, label = 'ROC') 85 | plt.plot([0,1], [0,1], '--', label = 'Random') 86 | plt.legend() 87 | plt.xlabel('FPR') 88 | plt.ylabel('TPR') 89 | plt.show() 90 | 91 | -------------------------------------------------------------------------------- /12.LGBM/data/santander.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meanxai/machine_learning/fba47e91cc7449eb5d7ea8b7ec1fb0fd616ebd71/12.LGBM/data/santander.zip -------------------------------------------------------------------------------- /2.DecisionTree/1.ID3(titanic_part).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-03] 1.ID3(titanic_part).py 2 | # ID3/C4.5 decision tree test code 3 | # CART is widely used than ID3/C4.5. Sklearn supports CART. 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/m3o0-K07gLI 12 | # 13 | # 14 | # I used the package below to test ID3/C4.5. 15 | # https://github.com/svaante/decision-tree-id3 16 | # pip install decision-tree-id3 17 | # pip install pydot 18 | # pip install graphviz 19 | # sudo apt install graphviz 20 | # ----------------------------------------------------------- 21 | 22 | # "from sklearn.externals import six" is used for id3, but "six" 23 | # is missing in the sklearn.externals, resulting in the following 24 | # error: cannot import name "six" from 'sklearn.externals' 25 | # Add following to prevent errors. 26 | import six 27 | import sys; sys.modules['sklearn.externals.six'] = six 28 | import pandas as pd 29 | from id3 import Id3Estimator 30 | from id3 import export_graphviz 31 | import pydot 32 | from sklearn.model_selection import train_test_split 33 | 34 | # Use just 3 features in the Titanic dataset: 35 | feat_names = ['Pclass', 'Sex', 'Age'] 36 | df = pd.read_csv('data/titanic.csv')[feat_names + ['Survived']] 37 | df = df.dropna().reset_index() 38 | df.info() 39 | 40 | # Separate the data into feature and target. 41 | x_data = df[feat_names].copy() 42 | y_data = df['Survived'] 43 | 44 | # Convert string (Sex) to number. female = 0, male = 1 45 | x_data['Sex'] = x_data['Sex'].map({'female':0, 'male':1}) 46 | 47 | # Convert real numbers (Age) to 4 categories. 48 | x_data['Age'] = pd.qcut(x_data['Age'], 4, labels=False) 49 | 50 | # Split the data into training and test data. 51 | x_train, x_test, y_train, y_test = train_test_split(x_data, y_data) 52 | 53 | # Build ID3/C4.5 decision tree. 54 | estimator = Id3Estimator(gain_ratio=True, prune=True) 55 | estimator = estimator.fit(x_train, y_train, check_input=False) 56 | 57 | # Evaluate performance with test data. 58 | y_pred = estimator.predict(x_test) 59 | acc = (y_pred == y_test).mean() 60 | print('\nAccuracy of test data = {:.4f}'.format(acc)) 61 | 62 | # Evaluate performance with training data. 
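# Side note: with gain_ratio=True the split criterion is C4.5's gain ratio rather than
# the raw information gain:
#   IG(S, A)        = H(S) - sum_v (|S_v| / |S|) * H(S_v)
#   SplitInfo(S, A) = -sum_v (|S_v| / |S|) * log2(|S_v| / |S|)
#   GainRatio(S, A) = IG(S, A) / SplitInfo(S, A)
# where H is the Shannon entropy. Dividing by SplitInfo penalizes attributes that
# fragment the data into many small partitions, such as the 4 Age buckets above.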
63 | y_pred = estimator.predict(x_train) 64 | acc = (y_pred == y_train).mean() 65 | print('Accuracy of train data = {:.4f}\n'.format(acc)) 66 | 67 | # Visualize the tree result 68 | tree = export_graphviz(estimator.tree_, 'id3_tree.dot', feat_names) 69 | (graph,) = pydot.graph_from_dot_file('id3_tree.dot') 70 | graph.write_png('id3_tree.png') 71 | !nomacs 'id3_tree.png' # Check the tree image with the image viewer. 72 | -------------------------------------------------------------------------------- /2.DecisionTree/2.CART(classification).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-07] 2.CART(classification).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/gct9gGOvPek 10 | # 11 | import numpy as np 12 | import pandas as pd 13 | from MyDTreeClassifier import MyDTreeClassifier 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn import tree 16 | from sklearn.datasets import load_iris 17 | from sklearn.model_selection import train_test_split 18 | import matplotlib.pyplot as plt 19 | import pprint 20 | 21 | # Read the Titanic dataset and perform simple preprocessing. 22 | df = pd.read_csv('data/titanic.csv') 23 | df['Age'].fillna(df['Age'].mean(), inplace = True) # Replace with average 24 | df['Embarked'].fillna('N', inplace = True) # Replace with 'N' 25 | df['Sex'] = df['Sex'].factorize()[0] # label encoding 26 | df['Embarked'] = df['Embarked'].factorize()[0] # label encoding 27 | df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True) 28 | 29 | # Survived Pclass Sex Age SibSp Parch Fare Embarked 30 | # 0 0 3 0 22.0 1 0 7.2500 0 31 | # 1 1 1 1 38.0 1 0 71.2833 1 32 | # 2 1 3 1 26.0 0 0 7.9250 0 33 | # 3 1 1 1 35.0 1 0 53.1000 0 34 | # 4 0 3 0 35.0 0 0 8.0500 0 35 | 36 | # split the data into train, validation and test data. 37 | y = np.array(df['Survived']) 38 | x = np.array(df.drop('Survived', axis=1)) 39 | x_train, x_test, y_train, y_test = train_test_split(x, y) 40 | 41 | depth = 3 42 | my_model = MyDTreeClassifier(max_depth = depth) 43 | my_model.fit(x_train, y_train) 44 | my_pred = my_model.predict(x_test) 45 | acc = (y_test == my_pred).mean() 46 | print('MyTreeClassifier: accuracy = {:.3f}'.format(acc)) 47 | 48 | # Compare the results with sklearn's DecisionTreeClassifier. 
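# Side note: sklearn's DecisionTreeClassifier uses the Gini impurity by default,
#   Gini(S) = 1 - sum_k p_k^2,
# and a binary split is scored by the size-weighted Gini of the two children.
# For a node that is half survived / half not (illustrative check):
_p = np.array([0.5, 0.5])
assert np.isclose(1. - np.sum(_p ** 2), 0.5)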
49 | # ---------------------------------------------------------- 50 | sk_model = DecisionTreeClassifier(max_depth=depth, 51 | random_state=1) 52 | sk_model.fit(x_train, y_train) 53 | sk_pred = sk_model.predict(x_test) 54 | acc = (y_test == sk_pred).mean() 55 | print('DecisionTreeClassifier: accuracy = {:.3f}'.format(acc)) 56 | 57 | print('\nMyTreeClassifier: estimator2:') 58 | pprint.pprint(my_model.estimator2, sort_dicts=False) 59 | 60 | plt.figure(figsize=(12, 6)) 61 | tree.plot_tree(sk_model) 62 | plt.show() 63 | -------------------------------------------------------------------------------- /2.DecisionTree/3.CART(titanic_part).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-08] 3.CART(sklearn).py 2 | # DecisionTreeClassifier in sklearn 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/XqNuY1RHlNU 11 | # 12 | # The characteristics of DecisionTreeClassifier: 13 | # 1. Use the CART algorithm (binary tree). 14 | # ID3/C4.5 (general tree) is not supported. 15 | # 2. Categorical feature is not directly supported. 16 | # All categorical features (e.g. 'female', 'male') must be 17 | # converted to numeric data (e.g. 0, 1). 18 | # All numeric features are treated as continuous features. 19 | # Split using inequality. (e.g. sex ≤ 0.5) 20 | # ----------------------------------------------------------- 21 | import numpy as np 22 | import pandas as pd 23 | from sklearn.tree import DecisionTreeClassifier 24 | from sklearn.model_selection import train_test_split 25 | from sklearn import tree 26 | import matplotlib.pyplot as plt 27 | 28 | # Of the Titanic dataset, only the following three features are used. 29 | feat_names = ['Pclass', 'Sex', 'Age'] 30 | df = pd.read_csv('data/titanic.csv')[feat_names + ['Survived']] 31 | df['Sex'] = df['Sex'].factorize()[0] # convert string to number 32 | df = df.dropna() # Delete all rows with missing values. 33 | col_names = list(df.columns) 34 | 35 | # Separate the Titanic data into features and target class. 36 | x_data = np.array(df[feat_names]) # features 37 | y_data = np.array(df['Survived']) # target class 38 | 39 | # Split the data into training, validation and test data. 40 | x_train, x_test, y_train, y_test = \ 41 | train_test_split(x_data, y_data, test_size = 0.3) 42 | 43 | x_test, x_eval, y_test, y_eval = \ 44 | train_test_split(x_test, y_test, test_size = 0.5) 45 | 46 | # Create decision tree models of various depths, 47 | # and measure the accuracy of validation data for each model. 48 | train_acc = [] 49 | eval_acc = [] 50 | max_depth = 8 51 | for d in range(1, max_depth+1): 52 | model = DecisionTreeClassifier(max_depth=d) 53 | model.fit(x_train, y_train) 54 | 55 | # Measure the accuracy of this model using the training data. 56 | y_pred = model.predict(x_train) 57 | train_acc.append((y_pred == y_train).mean()) 58 | 59 | # Measure the accuracy of this model using the validation data. 60 | y_pred = model.predict(x_eval) 61 | eval_acc.append((y_pred == y_eval).mean()) 62 | print('Depth = {}, train_acc = {:.4f}, eval_acc = {:.4f}'\ 63 | .format(d, train_acc[-1], eval_acc[-1])) 64 | 65 | # Find the optimal depth with the highest accuracy of validation data. 66 | opt_depth = np.argmax(eval_acc) + 1 67 | 68 | # Visualize accuracy changes as depth changes. 
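# Side note: the x-axis of the plot below is the list index, which is depth - 1; that
# is why the vertical line is drawn at opt_depth - 1. A quick consistency check:
assert np.arange(1, max_depth + 1)[np.argmax(eval_acc)] == opt_depth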
69 | plt.plot(train_acc, marker='o', label='train') 70 | plt.plot(eval_acc, marker='o', label='evaluation') 71 | plt.legend() 72 | plt.title('Accuracy') 73 | plt.xlabel('tree depth') 74 | plt.ylabel('accuracy') 75 | plt.xticks(np.arange(max_depth), np.arange(1, max_depth+1)) 76 | plt.axvline(x=opt_depth-1, ls='--') 77 | plt.ylim(0.5, 1.0) 78 | plt.show() 79 | 80 | # Regenerate the tree with optimal depth. 81 | # model = DecisionTreeClassifier(max_depth=opt_depth) 82 | 83 | # I set max_step=3 as a constant value for tree visualization. 84 | model = DecisionTreeClassifier(max_depth=3) 85 | model.fit(x_train, y_train) 86 | 87 | # Use test data to evaluate final performance. 88 | y_pred = model.predict(x_test) 89 | test_acc = (y_pred == y_test).mean() 90 | print('Optimal depth = {}, test_acc = {:.4f}'.format(opt_depth, test_acc)) 91 | 92 | # Visualize the tree 93 | # plt.figure(figsize=(20,10)) 94 | plt.figure(figsize=(14,6)) 95 | tree.plot_tree(model, feature_names = feat_names, fontsize=10) 96 | plt.show() 97 | 98 | # Analyze the importance of features. 99 | feature_importance = model.feature_importances_ 100 | n_feature = x_train.shape[1] 101 | idx = np.arange(n_feature) 102 | 103 | plt.barh(idx, feature_importance, align='center', color='green') 104 | plt.yticks(idx, col_names[:-1], size=12) 105 | plt.xlabel('importance', size=15) 106 | plt.ylabel('feature', size=15) 107 | plt.show() 108 | 109 | print('feature importance = {}'\ 110 | .format(feature_importance.round(3))) 111 | -------------------------------------------------------------------------------- /2.DecisionTree/5.CART(multiclass).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-10]: 5.CART(multiclass).py 2 | # Multiclass classification test code 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/o43mZv_Cmxw 11 | # 12 | import numpy as np 13 | from sklearn.datasets import load_iris 14 | from MyDTreeClassifier import MyDTreeClassifier 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.model_selection import train_test_split 17 | 18 | # Load iris dataset 19 | # x: data, the number of samples=150, the number of features=4 20 | # y: target data with class (0,1,2) 21 | x, y = load_iris(return_X_y=True) 22 | 23 | # Generate training and test data 24 | x_train, x_test, y_train, y_test = train_test_split(x, y) 25 | 26 | # Model-1: using our model - refer to [MXML-2-07] video 27 | model1 = MyDTreeClassifier(max_depth=3) 28 | model1.fit(x_train, y_train) 29 | 30 | # Estimate the class of validation date. 31 | y_pred1 = model1.predict(x_test) 32 | 33 | # Measure the accuracy for validation data 34 | accuracy1 = (y_test == y_pred1).mean() 35 | print('\nAccuracy of Model-1 = {:.3f}'.format(accuracy1)) 36 | 37 | # Model-2: using sklearn 38 | model2 = DecisionTreeClassifier(max_depth=3) 39 | model2.fit(x_train, y_train) 40 | 41 | # Estimate the class of validation date. 
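# Side note: no special handling is needed for the 3-class iris problem; the Gini
# impurity 1 - sum_k p_k^2 is defined for any number of classes. For a node with the
# three classes equally mixed (illustrative check):
_p = np.ones(3) / 3.
assert np.isclose(1. - np.sum(_p ** 2), 2. / 3.)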
42 | y_pred2 = model2.predict(x_test) 43 | 44 | # Measure the accuracy for validation data 45 | accuracy2 = (y_test == y_pred2).mean() 46 | print('Accuracy of Model-2 = {:.3f}'.format(accuracy2)) 47 | 48 | print("\nModel-1: y_pred1") 49 | print(y_pred1) 50 | print("\nModel-2: y_pred2") 51 | print(y_pred2) 52 | 53 | -------------------------------------------------------------------------------- /2.DecisionTree/6.CART(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-02-11] 6.CART(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Bc-k9Dv5SNg 10 | # 11 | import numpy as np 12 | from MyDTreeRegressor import MyDTreeRegressor 13 | from sklearn.tree import DecisionTreeRegressor 14 | import matplotlib.pyplot as plt 15 | from sklearn import tree 16 | import pprint 17 | 18 | # Plot the training data and draw the estimated curve. 19 | def plot_prediction(x, y, x_test, y_pred, title): 20 | plt.figure(figsize=(6,4)) 21 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 22 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 23 | plt.xlim(0, 1) 24 | plt.ylim(0, 7) 25 | plt.legend() 26 | plt.title(title) 27 | plt.show() 28 | 29 | # Generate nonlinear data for regression testing. 30 | def noisy_sine_data(n, s): 31 | rtn_x, rtn_y = [], [] 32 | for i in range(n): 33 | x= np.random.random() 34 | y= 2.0*np.sin(2.0*np.pi*x)+np.random.normal(0.0, s) + 3.0 35 | rtn_x.append(x) 36 | rtn_y.append(y) 37 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 38 | 39 | # Create training and test data 40 | x_train, y_train = noisy_sine_data(n=500, s=0.5) 41 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 42 | 43 | depth = 3 44 | my_model = MyDTreeRegressor(max_depth = depth) 45 | my_model.fit(x_train, y_train) 46 | my_pred = my_model.predict(x_test) 47 | 48 | # Plot the training data and draw the estimated curve. 49 | plot_prediction(x_train, y_train, x_test, my_pred, 'MyDTreeRegressor') 50 | 51 | # Compare with sklearn's DecisionTreeRegressor() results. 52 | # ------------------------------------------------------- 53 | sk_model = DecisionTreeRegressor(max_depth = depth) 54 | sk_model.fit(x_train, y_train) 55 | sk_pred = sk_model.predict(x_test) 56 | 57 | # Plot the training data and draw the estimated curve. 58 | plot_prediction(x_train, y_train, x_test, sk_pred, 'DecisionTreeRegressor') 59 | 60 | # Compare trees created by the two models. 
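# Side note: DecisionTreeRegressor splits on squared error by default: a split is
# scored by the size-weighted variance of y in the two children, and each leaf predicts
# the mean of its y values. Tiny illustration with a hypothetical leaf:
_y_leaf = np.array([2.0, 3.0, 4.0])
assert np.isclose(_y_leaf.mean(), 3.0)                                # leaf prediction
assert np.isclose(np.mean((_y_leaf - _y_leaf.mean()) ** 2), 2. / 3.)  # leaf MSE impurity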
61 | print('\nMyDTreeRegressor: estimator2:') 62 | pprint.pprint(my_model.estimator2, sort_dicts=False) 63 | 64 | plt.figure(figsize=(12,7)) 65 | tree.plot_tree(sk_model) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /3.LinearRegression/1.scipy_opt(ols).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-02] 1.scipy_opt(ols).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/YBk1FS1vmv4 10 | # 11 | from scipy import optimize 12 | from sklearn.metrics import r2_score 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | # y = ax + b + Gaussian noise 17 | def reg_data(a, b, n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.normal(0.0, 0.5) 21 | y = a * x + b + np.random.normal(0.0, s) 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Generate 1,000 data points drawn from y = ax + b + noise 27 | # s : standard deviation of the noise distribution 28 | x, y = reg_data(a=0.5, b=0.3, n=1000, s=0.2) 29 | 30 | # y = w0 + w1*x1 + w2*x2 + ... → w0*x0 + w1*x1 + w2*x2 + ... (x0 = 1) 31 | # y = [w0, w1, w2, ...] * [x0, x1, x2, ...].T (T : transpose) 32 | # y = W * X.T 33 | X = np.hstack([np.ones([x.shape[0], 1]), x]) 34 | REG_CONST = 0.01 # regularization constant 35 | 36 | # Regularized loss function : Mean Squared Error 37 | def ols_loss(W, args): 38 | e = np.dot(W, X.T) - y 39 | mse = np.mean(np.square(e)) # mean squared error 40 | 41 | # We typically do not penalize the intercept term. 42 | loss = mse + REG_CONST * np.sum(np.square(W[1:])) 43 | 44 | # save W and loss 45 | if args[0] == True: 46 | trace_W.append([W, loss]) 47 | return loss 48 | 49 | # Perform optimization process 50 | trace_W = [] 51 | result = optimize.minimize(ols_loss, [-4., 4], args=[True]) 52 | print(result) 53 | 54 | # Plot the training data and draw the regression line. 55 | y_hat = np.dot(result.x, X.T) 56 | plt.figure(figsize=(6, 6)) 57 | plt.scatter(x, y, s=5, c='r') 58 | plt.plot(x, y_hat, c='blue') 59 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 60 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 61 | plt.show() 62 | 63 | # Draw the loss function and the path to the optimal point. 64 | m = 5 65 | t = 0.1 66 | w0, w1 = np.meshgrid(np.arange(-m, m, t), np.arange(-m, m, t)) 67 | zs = np.array([ols_loss([a,b], [False]) for [a, b] in zip(np.ravel(w0), np.ravel(w1))]) 68 | z = zs.reshape(w0.shape) 69 | 70 | fig = plt.figure(figsize=(7, 7)) 71 | ax = fig.add_subplot(111, projection='3d') 72 | 73 | # Draw the surface of the loss function 74 | ax.plot_surface(w0, w1, z, alpha=0.7) 75 | 76 | # Dwaw the path to the optimal point. 
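# Side note: with REG_CONST = 0 the optimizer should land on the closed-form
# normal-equation solution; the small ridge penalty above only shrinks W[1] slightly.
_w_closed = np.linalg.solve(X.T @ X, X.T @ y)   # unregularized OLS solution
# result.x is expected to be close to _w_closed.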
77 | b = np.array([tw0 for [tw0, tw1], td in trace_W]) 78 | w = np.array([tw1 for [tw0, tw1], td in trace_W]) 79 | d = np.array([td for [tw0, tw1], td in trace_W]) 80 | ax.plot(b, w, d, marker='o', color="r") 81 | 82 | ax.set_xlabel('W0 (bias)') 83 | ax.set_ylabel('W1 (slope)') 84 | ax.set_zlabel('distance') 85 | ax.azim = -50 86 | ax.elev = 50 87 | plt.show() 88 | 89 | # Check the R2 score 90 | sst = np.sum(np.square(y - np.mean(y))) # total sum of squares 91 | sse = np.sum(np.square(y - y_hat)) # sum of squares of error 92 | r2 = 1 - sse / sst 93 | print('\nR2 score = {:.4f}'.format(r2)) 94 | print('R2 score = {:.4f}'.format(r2_score(y, y_hat))) 95 | -------------------------------------------------------------------------------- /3.LinearRegression/10.ransac(2).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-07] 10.ransac(2).py 2 | # Implementing RANSAC using sklearn's RANSACRegressor. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/A2QnStjnlVE 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.linear_model import LinearRegression, RANSACRegressor 15 | from sklearn.metrics import r2_score 16 | 17 | # Generate n data samples with outliers. 18 | def reg_data_outlier(a, b, n, s, outlier_rate=0.1): 19 | n1 = int(n * outlier_rate) # the number of outliers 20 | n2 = n - n1 # the number of inliers 21 | 22 | # Generate normal data points (inliers) 23 | x2 = np.random.normal(0.0, 0.5, size=n2) 24 | y2 = a * x2 + b + np.random.normal(0.0, s, size=n2) 25 | 26 | # Generate abnormal data points (outliers) 27 | x1 = np.random.normal(0.5, 0.1, size=n1) 28 | y1 = a * x1 + b * 3 + np.abs(np.random.normal(0.0, s, size=n1)) 29 | 30 | x = np.hstack([x2, x1]).reshape(-1,1) 31 | y = np.hstack([y2, y1]) 32 | 33 | return x, y 34 | 35 | x, y = reg_data_outlier(a=0.5, b=0.3, n=1000, s=0.2, outlier_rate=0.2) 36 | 37 | # min_samples: 38 | # min_samples is chosen as X.shape[1] + 1. 39 | # stop_probability: 40 | # RANSAC iteration stops if at least one outlier-free set of the training 41 | # data is sampled in RANSAC. This requires to generate at least N samples 42 | # (iterations): 43 | # residual_threshold: 44 | # By default the threshold is chosen as the MAD (median absolute deviation) 45 | # of the target values y. 
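# Side note: the standard RANSAC iteration bound behind stop_probability is
#   N >= log(1 - p) / log(1 - w**min_samples),
# where p is the desired probability of drawing at least one outlier-free sample and
# w is the (assumed) inlier ratio. For example, with p = 0.99, w = 0.8, min_samples = 10:
_n_trials = np.log(1. - 0.99) / np.log(1. - 0.8 ** 10)   # about 40.5 iterations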
46 | model = RANSACRegressor(LinearRegression(), 47 | stop_probability = 0.99, # default 48 | residual_threshold = None, # default 49 | min_samples = 10) 50 | 51 | model.fit(x, y) 52 | 53 | w = model.estimator_.coef_ 54 | b = model.estimator_.intercept_ 55 | 56 | # Visually check the data and final regression line 57 | y_pred = model.predict(x) 58 | plt.figure(figsize=(6,5)) 59 | plt.scatter(x, y, s=5, c='r') 60 | plt.plot(x, y_pred, c='blue') 61 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 62 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 63 | plt.show() 64 | 65 | print('\nRANSAC results:') 66 | print('Regression line: y = {:.3f}x + {:.3f}'.format(w[0], b)) 67 | print('R2 score = {:.3f}'.format(r2_score(y, y_pred))) 68 | 69 | 70 | -------------------------------------------------------------------------------- /3.LinearRegression/11.boston(ransac).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-07] 11.boston(ransac).py 2 | # Predict the Boston house prices using RANSAC 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/A2QnStjnlVE 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.linear_model import RANSACRegressor, Ridge 15 | from sklearn.model_selection import train_test_split 16 | import pickle 17 | 18 | # Read Boston house price dataset 19 | with open('data/boston_house.pkl', 'rb') as f: 20 | data = pickle.load(f) 21 | x = data['data'] # shape = (506, 13) 22 | y = data['target'] # shape = (506,) 23 | x_train, x_test, y_train, y_test = train_test_split(x, y) 24 | 25 | # min_samples: 26 | # min_samples is chosen as X.shape[1] + 1. 27 | # stop_probability: 28 | # RANSAC iteration stops if at least one outlier-free set of the training 29 | # data is sampled in RANSAC. This requires to generate at least N samples 30 | # (iterations): 31 | # residual_threshold: 32 | # By default the threshold is chosen as the MAD (median absolute deviation) 33 | # of the target values y. 
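# Side note: residual_threshold = None means the threshold defaults to the median
# absolute deviation of the targets, i.e.
_mad = np.median(np.abs(y - np.median(y)))
# samples whose absolute residual exceeds this value count as outliers in each trial.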
34 | model = RANSACRegressor(Ridge(alpha=0.01), 35 | stop_probability = 0.99, # default 36 | residual_threshold = None, # default 37 | min_samples = 50) 38 | 39 | model.fit(x_train, y_train) 40 | 41 | # Visually check the actual and predicted prices 42 | y_pred = model.predict(x_test) 43 | plt.figure(figsize=(6, 5)) 44 | plt.scatter(y_test, y_pred, s=20, c='r') 45 | plt.xlabel('y_test') 46 | plt.ylabel('y_pred') 47 | plt.show() 48 | 49 | print('RANSAC R2 = {:.3f}'.format(model.score(x_test, y_test))) 50 | 51 | -------------------------------------------------------------------------------- /3.LinearRegression/2.boston(ols).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-03] 2.boston(ols).py 2 | # prediction of Boston house price 3 | # Applying Mean centering, Normalization, Ridge Regularization 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/gLekbL_pI1A 12 | # 13 | from scipy import optimize 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | import pandas as pd 17 | from sklearn.metrics import r2_score 18 | from sklearn.model_selection import train_test_split 19 | import pickle 20 | 21 | # Read Boston house price dataset 22 | with open('data/boston_house.pkl', 'rb') as f: 23 | data = pickle.load(f) 24 | 25 | x = data['data'] # shape = (506, 13) 26 | y = data['target'] # shape = (506,) 27 | 28 | # Split the dataset into training and test data 29 | x_train, x_test, y_train, y_test = train_test_split(x, y) 30 | REG_CONST = 0.01 # regularization constant 31 | 32 | # Mean centering & Normalization are performed on training data. 33 | x_offset = x_train.mean(axis=0) 34 | x_scale = x_train.std(axis=0) 35 | y_offset = y_train.mean() 36 | 37 | xm_train = (x_train - x_offset) / x_scale 38 | ym_train = y_train - y_offset 39 | 40 | # Regularized mean squared error loss function 41 | def ols_loss(W): 42 | # Calculating MSE using the training data 43 | d_train = np.dot(W, xm_train.T) - ym_train 44 | mse = np.mean(np.square(d_train)) 45 | loss = mse + REG_CONST * np.sum(np.square(W)) 46 | 47 | # Save the loss history. 48 | trc_loss.append(loss) 49 | return loss 50 | 51 | # Perform optimization process 52 | trc_loss = [] 53 | W0 = np.ones(xm_train.shape[1]) * 0.1 # W의 초깃값. 54 | result = optimize.minimize(ols_loss, W0) 55 | 56 | # Check the results 57 | print(result.success) # check if success = True 58 | print(result.message) 59 | 60 | # Visually check the regularized MSE of the training data. 61 | plt.figure(figsize=(6, 4)) 62 | plt.plot(trc_loss, label = 'loss_train') 63 | plt.legend() 64 | plt.xlabel('epochs') 65 | plt.show() 66 | 67 | # Convert result.x to the coef and the intercept 68 | # y_hat = coef * x + intercept 69 | coef = result.x / x_scale 70 | intercept = y_offset - np.dot(x_offset, coef.T) 71 | 72 | # Predict y values of the test data. 73 | y_pred = np.dot(coef, x_test.T) + intercept 74 | 75 | # Visually check the predicted and actual y values ​​of the test data. 76 | plt.figure(figsize=(6, 5)) 77 | plt.scatter(y_test, y_pred, s=20, c='r') 78 | plt.xlabel('y_test') 79 | plt.ylabel('y_pred') 80 | plt.show() 81 | 82 | df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}) 83 | print('\n', df.head(10)) 84 | 85 | # Check R2 score of the test data. 
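# Side note: r2_score is 1 - SSE/SST; the same value can be computed directly:
_sst = np.sum(np.square(y_test - np.mean(y_test)))   # total sum of squares
_sse = np.sum(np.square(y_test - y_pred))            # sum of squared errors
_r2_manual = 1. - _sse / _sst                        # should match r2_score below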
86 | print('\nR2 score = {:.4f}'.format(r2_score(y_test, y_pred))) 87 | 88 | -------------------------------------------------------------------------------- /3.LinearRegression/3.boston(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-03] 3.boston(sklearn).py 2 | # prediction of Boston house price 3 | # using sklear’s LinearRegression, Ridge, Lasso 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/gLekbL_pI1A 12 | # 13 | import matplotlib.pyplot as plt 14 | from sklearn.linear_model import LinearRegression, Ridge, Lasso 15 | from sklearn.model_selection import train_test_split 16 | import pickle 17 | 18 | # Read Boston house price dataset 19 | with open('data/boston_house.pkl', 'rb') as f: 20 | data = pickle.load(f) 21 | 22 | x = data['data'] # features, shape = (506, 13) 23 | y = data['target'] # target, shape = (506,) 24 | 25 | # Split the dataset into training and test data 26 | x_train, x_test, y_train, y_test = train_test_split(x, y) 27 | 28 | # 1. LinearRegression() 29 | # --------------------- 30 | model = LinearRegression() 31 | model.fit(x_train, y_train) 32 | y_pred = model.predict(x_test) 33 | 34 | # Visually check the predicted and actual y values ​​of the test data. 35 | plt.figure(figsize=(6, 5)) 36 | plt.scatter(y_test, y_pred, s=20, c='r') 37 | plt.xlabel('y_test') 38 | plt.ylabel('y_pred') 39 | plt.show() 40 | 41 | # 평가용 데이터의 R2를 확인한다. 42 | r2 = model.score(x_test, y_test) 43 | print('\nR2 (LinearRegression) = {:.3f}'.format(r2)) 44 | 45 | # 2. Ridge regularization 46 | # ----------------------- 47 | model = Ridge(alpha=0.01) 48 | model.fit(x_train, y_train) 49 | r2 = model.score(x_test, y_test) 50 | print('R2 (Ridge) = {:.3f}'.format(r2)) 51 | 52 | # 3. Lasso regularization 53 | # ----------------------- 54 | model = Lasso(alpha=0.01) 55 | model.fit(x_train, y_train) 56 | r2 = model.score(x_test, y_test) 57 | print('R2 (Lasso) = {:.3f}'.format(r2)) 58 | -------------------------------------------------------------------------------- /3.LinearRegression/4.scipy_opt(tls).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-04] 4.scipy_opt(tls).py 2 | # Implementation of TLS using scipy.optimize. Apply Ridge. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yDdbC9BhdwM 11 | # 12 | from scipy import optimize 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | # y = ax + b + Gaussian noise 17 | def reg_data(a, b, n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.normal(0.0, 0.5) 21 | y = a * x + b + np.random.normal(0.0, s) 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Generate 1,000 data points drawn from y = ax + b + noise 27 | x, y = reg_data(a=0.5, b=0.3, n=1000, s=0.2) 28 | 29 | # y = w0 + w1*x1 + w2*x2 + ... → w0*x0 + w1*x1 + ... (x0 = 1) 30 | # y = [w0, w1, w2, ...] 
* [x0, x1, x2, ...].T (T : transpose) 31 | # y = W * X.T 32 | X = np.hstack([np.ones([x.shape[0], 1]), x]) 33 | REG_CONST = 0.01 # regularization constant 34 | 35 | # Cost function: square sum of the perpendicular distances 36 | # between data points and the regression line. 37 | def tls_loss(W, args): 38 | numerator = np.square(np.dot(W, X.T) - y) 39 | denominator = np.square(W[1]) + 1 40 | d2 = numerator / denominator 41 | msd = np.mean(d2) 42 | loss = msd + REG_CONST * np.sum(np.square(W)) 43 | 44 | # save W and loss history 45 | if args[0] == True: 46 | trace_W.append([W, loss]) 47 | 48 | return loss 49 | 50 | # Perform optimization process 51 | trace_W = [] 52 | result = optimize.minimize(tls_loss, [-4, 0.5], args=[True]) 53 | print(result) 54 | 55 | # Plot the training data and draw the regression line. 56 | y_hat = np.dot(result.x, X.T) 57 | plt.figure(figsize=(6,5)) 58 | plt.scatter(x, y, s=5, c='r') 59 | plt.plot(x, y_hat, c='blue') 60 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 61 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 62 | plt.show() 63 | 64 | # Draw the loss function and the path to the optimal point. 65 | m = 5 66 | t = 0.1 67 | w0, w1 = np.meshgrid(np.arange(-m, m, t), np.arange(-m, m, t)) 68 | zs = np.array([tls_loss([a,b], [False]) for [a, b] in zip(np.ravel(w0), np.ravel(w1))]) 69 | z = zs.reshape(w0.shape) 70 | 71 | fig = plt.figure(figsize=(10,10)) 72 | ax = fig.add_subplot(111, projection='3d') 73 | 74 | # Draw the surface of loss function 75 | ax.plot_surface(w0, w1, z, alpha=0.8) 76 | 77 | # Draw the path to the optimal point. 78 | b = np.array([tw0 for [tw0, tw1], td in trace_W[:50]]) 79 | w = np.array([tw1 for [tw0, tw1], td in trace_W[:50]]) 80 | d = np.array([td for [tw0, tw1], td in trace_W[:50]]) 81 | ax.plot(b, w, d, marker='o', color='red') 82 | 83 | ax.set_xlabel('W0 (bias)') 84 | ax.set_ylabel('W1 (slope)') 85 | ax.set_zlabel('distance') 86 | ax.azim = -50 87 | ax.elev = 50 88 | plt.show() 89 | 90 | # Check the R2 score 91 | sst = np.sum(np.square(y - np.mean(y))) # total sum of squares 92 | sse = np.sum(np.square(y - y_hat)) # sum of squares of residuals 93 | r2 = 1 - sse / sst 94 | print('\nR2 score = {:.4f}'.format(r2)) -------------------------------------------------------------------------------- /3.LinearRegression/5.boston(tls).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-04] 5.boston(tls).py 2 | # prediction of Boston house price by TLS 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yDdbC9BhdwM 11 | # 12 | from scipy import optimize 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.metrics import r2_score 17 | from sklearn.model_selection import train_test_split 18 | import pickle 19 | 20 | # Read Boston house price dataset 21 | with open('data/boston_house.pkl', 'rb') as f: 22 | data = pickle.load(f) 23 | x = data['data'] # shape = (506, 13) 24 | y = data['target'] # shape = (506,) 25 | x_train, x_test, y_train, y_test = train_test_split(x, y) 26 | 27 | # Apply mean-centering to the training data 28 | x_offset = x_train.mean(axis=0) 29 | y_offset = y_train.mean() 30 | 31 | xm_train = x_train - x_offset 32 | ym_train = y_train - y_offset 33 | 34 | # Apply Ridge regularization 35 | REG_CONST = 0.01 36 | 37 | # Cost function for OLS 
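# Side note: OLS measures the vertical error (W·x - y)^2, whereas TLS measures the
# squared perpendicular distance from each (mean-centered) point to the hyperplane,
#   d_i^2 = (W·x_i - y_i)^2 / (||W||^2 + 1),
# which is exactly the numerator / denominator used in tls_loss below.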
38 | def ols_loss(W): 39 | err = np.dot(W, xm_train.T) - ym_train 40 | mse = np.sqrt(np.mean(np.square(err))) 41 | loss = mse + REG_CONST * np.sum(np.square(W)) 42 | return loss 43 | 44 | # Cost function for TLS 45 | def tls_loss(W): 46 | numerator = np.square(np.dot(W, xm_train.T) - ym_train) 47 | denominator = np.sum(np.square(W)) + 1 48 | d2 = numerator / denominator 49 | msd = np.sqrt(np.mean(d2)) 50 | loss = msd + REG_CONST * np.sum(np.square(W)) 51 | 52 | # save loss history 53 | trc_loss_train.append(loss) 54 | return loss 55 | 56 | # Perform optimization process 57 | trc_loss_train = [] 58 | 59 | # Perform OLS 60 | W0 = np.array([1.0] * x_train.shape[1]) # W의 초깃값 61 | result = optimize.minimize(ols_loss, W0) 62 | 63 | # Perform TLS 64 | # The optimal W found by OLS is used as the initial value of TLS. 65 | W0 = result.x 66 | result = optimize.minimize(tls_loss, W0) 67 | print(result.success) # check if success = True 68 | print(result.message) 69 | 70 | # Check the loss history 71 | plt.figure(figsize=(6, 4)) 72 | plt.plot(trc_loss_train, label = 'loss_train') 73 | plt.legend() 74 | plt.xlabel('epochs') 75 | plt.show() 76 | 77 | # y_hat = coef * x + intercept 78 | coef = result.x 79 | intercept = y_offset - np.dot(x_offset, coef.T) 80 | 81 | # Predict the y values of the test data 82 | y_pred = np.dot(coef, x_test.T) + intercept 83 | 84 | # Visually check the actual and predicted y values ​​of the test data. 85 | plt.figure(figsize=(6, 5)) 86 | plt.scatter(y_test, y_pred, s=20, c='r') 87 | plt.xlabel('y_test') 88 | plt.ylabel('y_pred') 89 | plt.show() 90 | 91 | df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}) 92 | print('\n', df.head(10)) 93 | 94 | # Check the R2 score 95 | print('\nTLS R2 score = {:.4f}'.format(r2_score(y_test, y_pred))) 96 | 97 | # Check the R2 score from OLS 98 | ols_coef = W0 99 | ols_icept = y_offset - np.dot(x_offset, ols_coef.T) 100 | y_ols_pred = np.dot(ols_coef, x_test.T) + ols_icept 101 | print('OLS R2 score = {:.4f}'.format(r2_score(y_test, y_ols_pred))) 102 | 103 | -------------------------------------------------------------------------------- /3.LinearRegression/6.lwr(scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-5] 6.lwr(scipy).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/d1-QS4uTgj8 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from scipy import optimize 14 | from sklearn.model_selection import train_test_split 15 | 16 | # Generate sinusoidal data with Gaussian noise added. 17 | def noisy_sine_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x= np.random.random() 21 | y= 2.0*np.sin(2.0*np.pi*x)+np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Create 1,000 data points for LWR testing. 
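# Side note: locally weighted regression weights each training point by a Gaussian
# kernel of its distance to the query point x_q,
#   w_i = exp(-||x_i - x_q||^2 / (2 * tau^2)),
# so a point one bandwidth (tau) away gets weight exp(-0.5) ≈ 0.61, and points a few
# bandwidths away contribute almost nothing to the local fit.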
27 | x, y = noisy_sine_data(n=1000, s=0.7) 28 | x_train, x_test, y_train, y_test = train_test_split(x, y) 29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | 31 | # Visualize the training and test data 32 | plt.figure(figsize=(6,5)) 33 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 34 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 35 | label='test') 36 | plt.legend() 37 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 38 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 39 | plt.show() 40 | 41 | # Find the weight for each data point. 42 | # train: training data, test: test data point to be predicted 43 | def get_weight(train, test, tau): 44 | d2 = np.sum(np.square(train - test), axis=1) 45 | w = np.exp(-d2 / (2. * tau * tau)) 46 | return w 47 | 48 | # Weighted cost function 49 | def lwr_loss(W, weight): 50 | d = np.dot(W, x1_train.T) - y_train 51 | wmsd = np.mean(weight * np.square(d)) 52 | return wmsd 53 | 54 | y_pred = [] 55 | for tx in x_test: 56 | weight = get_weight(x_train, tx, 0.05) 57 | result = optimize.minimize(lwr_loss, [0.1, 0.1], args=weight) 58 | y_pred.append(np.dot(result.x[1], tx) + result.x[0]) 59 | y_pred = np.array(y_pred).reshape(-1,) 60 | 61 | # Visualize the predicted results 62 | plt.figure(figsize=(6,5)) 63 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 64 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 65 | label='test') 66 | plt.scatter(x_test, y_pred, s=5, c='red', label='prediction') 67 | plt.legend() 68 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 69 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 70 | plt.show() 71 | -------------------------------------------------------------------------------- /3.LinearRegression/7.lwr(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-5] 7.lwr(sklearn).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/d1-QS4uTgj8 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.linear_model import Ridge 14 | from sklearn.model_selection import train_test_split 15 | 16 | # Generate sinusoidal data with Gaussian noise added. 17 | def noisy_sine_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x= np.random.random() 21 | y= 2.0*np.sin(2.0*np.pi*x)+np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Create 1,000 data points for LWR testing. 27 | x, y = noisy_sine_data(n=1000, s=0.7) 28 | x_train, x_test, y_train, y_test = train_test_split(x, y) 29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | 31 | # Visualize the training and test data 32 | plt.figure(figsize=(6, 5)) 33 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 34 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 35 | label='test') 36 | plt.legend() 37 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 38 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 39 | plt.show() 40 | 41 | # Find the weight for each data point. 42 | # train: training data, test: test data point to be predicted 43 | def get_weight(train, test, tau): 44 | d2 = np.sum(np.square(train - test), axis=1) 45 | w = np.exp(-d2 / (2. 
* tau * tau)) 46 | return w 47 | 48 | # predict the target value of the test data 49 | y_pred = [] 50 | for tx in x_test: 51 | weight = get_weight(x_train, tx, 0.05) 52 | model = Ridge(alpha=0.01) 53 | model.fit(x_train, y_train, sample_weight = weight) 54 | y_pred.append(model.predict(tx.reshape(-1,1))[0]) 55 | y_pred = np.array(y_pred).reshape(-1,) 56 | 57 | # Visualize the predicted results 58 | plt.figure(figsize=(6, 5)) 59 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 60 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 61 | label='test') 62 | plt.scatter(x_test, y_pred, s=5, c='red', label='prediction') 63 | plt.legend() 64 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 65 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 66 | plt.show() 67 | -------------------------------------------------------------------------------- /3.LinearRegression/8.boston(lwr).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-5] 8.bostn(lwr).py 2 | # Predicting the Boston house price using LWR 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d1-QS4uTgj8 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | import matplotlib.pyplot as plt 15 | from sklearn.linear_model import Ridge 16 | from sklearn.metrics import r2_score 17 | from sklearn.model_selection import train_test_split 18 | import pickle 19 | 20 | # Read saved dataset 21 | with open('data/boston_house.pkl', 'rb') as f: 22 | data = pickle.load(f) 23 | x = data['data'] # shape = (506, 13) 24 | y = data['target'] # shape = (506,) 25 | x_train, x_test, y_train, y_test = train_test_split(x, y) 26 | 27 | # Find the weight for each data point. 28 | # train: training data, test: test data point to be predicted 29 | def get_weight(train, test, tau): 30 | d2 = np.sum(np.square(train - test), axis=1) 31 | w = np.exp(-d2 / (2. * tau * tau)) 32 | return w 33 | 34 | y_pred = [] 35 | for tx in x_test: 36 | weight = get_weight(x_train, tx, 50.0) 37 | model = Ridge(alpha=0.01) 38 | model.fit(x_train, y_train, sample_weight = weight) 39 | y_pred.append(model.predict(tx.reshape(1, -1))[0]) 40 | 41 | y_pred = np.array(y_pred).reshape(-1,) 42 | 43 | # Visually check the actual and predicted y values ​​of the test data. 44 | plt.figure(figsize=(6, 5)) 45 | plt.scatter(y_test, y_pred, s=10, c='r') 46 | plt.xlabel('y_test') 47 | plt.ylabel('y_pred') 48 | plt.show() 49 | 50 | print('\nR2 (LWR) = {:.3f}'.format(r2_score(y_test, y_pred))) 51 | -------------------------------------------------------------------------------- /3.LinearRegression/9.ransac(1).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-07] 9.ransac(1).py 2 | # Implementing RANSAC from scratch 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/A2QnStjnlVE 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.linear_model import LinearRegression 15 | from sklearn.metrics import r2_score 16 | 17 | # Generate n data samples with outliers. 
18 | def reg_data_outlier(a, b, n, s, outlier_rate=0.1): 19 | n1 = int(n * outlier_rate) # the number of outliers 20 | n2 = n - n1 # the number of inliers 21 | 22 | # Generate normal data points (inliers) 23 | x2 = np.random.normal(0.0, 0.5, size=n2) 24 | y2 = a * x2 + b + np.random.normal(0.0, s, size=n2) 25 | 26 | # Generate abnormal data points (outliers) 27 | x1 = np.random.normal(0.5, 0.1, size=n1) 28 | y1 = a * x1 + b * 3 + np.abs(np.random.normal(0.0, s, size=n1)) 29 | 30 | x = np.hstack([x2, x1]).reshape(-1,1) 31 | y = np.hstack([y2, y1]) 32 | 33 | return x, y 34 | 35 | x, y = reg_data_outlier(a=0.5, b=0.3, n=1000, s=0.2, outlier_rate=0.2) 36 | 37 | # 1. OLS 38 | model = LinearRegression() 39 | result = model.fit(x.reshape(-1,1), y) 40 | 41 | # Visualize the data and regression line 42 | w = result.coef_ 43 | b = result.intercept_ 44 | y_hat = np.dot(w, x.T) + b 45 | 46 | plt.figure(figsize=(6,5)) 47 | plt.scatter(x, y, s=5, c='r') 48 | plt.plot(x, y_hat, c='blue') 49 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 50 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 51 | plt.show() 52 | 53 | print('\nOLS results:') 54 | print('Regression line: y = {:.3f}x + {:.3f}'.format(w[0], b)) 55 | print('R2 score = {:.3f}'.format(r2_score(y, y_hat))) 56 | 57 | # RANSAC 58 | n_sample =10 # the number of samples chosen randomly from original data 59 | z_prob = 0.99 # the probability z 60 | w_prob = 0.8 # the probability w 61 | 62 | # The maximum number of attempts to find a consensus set 63 | k_maxiter = int(np.log(1.0 - z_prob) / np.log(1.0 - w_prob ** n_sample)) 64 | 65 | # RANSACRegressor/residual_threshold: 66 | # the threshold is chosen as the MAD (median absolute deviation) of the 67 | # target values y 68 | threshold = np.median(np.abs(y - np.median(y))) 69 | 70 | ransac_w = 0 # slope 71 | ransac_b = 0 # intercept 72 | ransac_c = 0 # count within the error tolerance 73 | for i in range(k_maxiter): 74 | # sampling without replacement 75 | idx = np.random.choice(np.arange(0, x.shape[0]-1), n_sample, replace=False) 76 | xs = x[idx] 77 | ys = y[idx] 78 | 79 | # OLS Regression 80 | model = LinearRegression() 81 | result = model.fit(xs, ys) 82 | 83 | # Calculate the absolute value of residuals. 84 | y_pred = np.dot(result.coef_, x.T) + result.intercept_ 85 | residual = np.abs(y - y_pred) 86 | 87 | # Count the number of times the residual is less than the threshold. 88 | count = (residual < threshold).sum() 89 | 90 | # Find the regression line where the count is largest. 
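# Side note: sklearn's RANSACRegressor additionally refits the final estimator on all
# inliers of the best trial, while this scratch version keeps the coefficients fitted
# on the winning 10-point sample. An illustrative final refit, which would go after the
# loop (shown commented out):
#   best_inliers = np.abs(y - (np.dot(ransac_w, x.T) + ransac_b)) < threshold
#   LinearRegression().fit(x[best_inliers], y[best_inliers])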
91 | if count > ransac_c: 92 | ransac_c = count 93 | ransac_w = result.coef_ 94 | ransac_b = result.intercept_ 95 | 96 | y_pred = np.dot(ransac_w, x.T) + ransac_b 97 | 98 | # Visually check the data and final regression line 99 | plt.figure(figsize=(6,5)) 100 | plt.scatter(x, y, s=5, c='r') 101 | plt.plot(x, y_pred, c='blue') 102 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 103 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 104 | plt.show() 105 | 106 | print('\nRANSAC results:') 107 | print('The maximum number of k = {}'.format(k_maxiter)) 108 | print('Threshold = {:.3f}'.format(threshold)) 109 | print('Regression line: y = {:.3f}x + {:.3f}'.format(ransac_w[0], ransac_b)) 110 | print('R2 score = {:.3f}'.format(r2_score(y, y_pred))) 111 | -------------------------------------------------------------------------------- /3.LinearRegression/data/boston_house.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meanxai/machine_learning/fba47e91cc7449eb5d7ea8b7ec1fb0fd616ebd71/3.LinearRegression/data/boston_house.pkl -------------------------------------------------------------------------------- /3.LinearRegression/data/wls_sample_data.csv: -------------------------------------------------------------------------------- 1 | state,y,x1,x2,x3,region 2 | ME,235,3944,325,508,1 3 | NH,231,4578,323,564,1 4 | VT,270,4011,328,322,1 5 | MA,261,5233,305,846,1 6 | RI,300,4780,303,871,1 7 | CT,317,5889,307,774,1 8 | NY,387,5663,301,856,1 9 | NJ,285,5759,310,889,1 10 | PA,300,4894,300,715,1 11 | OH,221,5012,324,753,2 12 | IN,264,4908,329,649,2 13 | IL,308,5753,320,830,2 14 | MI,379,5439,337,738,2 15 | WI,342,4634,328,659,2 16 | MN,378,4921,330,664,2 17 | IA,232,4869,318,572,2 18 | MO,231,4672,309,701,2 19 | ND,246,4782,333,443,2 20 | SD,230,4296,330,446,2 21 | NB,268,4827,318,615,2 22 | KS,337,5057,304,661,2 23 | DE,344,5540,328,722,3 24 | MD,330,5331,323,766,3 25 | VA,261,4715,317,631,3 26 | WV,214,3828,310,390,3 27 | NC,245,4120,321,450,3 28 | SC,233,3817,342,476,3 29 | GA,250,4243,339,603,3 30 | FL,243,4647,287,805,3 31 | KY,216,3967,325,523,3 32 | TN,212,3946,315,588,3 33 | AL,208,3724,332,584,3 34 | MS,215,3448,358,445,3 35 | AR,221,3680,320,500,3 36 | LA,244,3825,355,661,3 37 | OK,234,4189,306,680,3 38 | TX,269,4336,335,797,3 39 | MT,302,4418,335,534,4 40 | ID,268,4323,344,541,4 41 | WY,323,4813,331,605,4 42 | CO,304,5046,324,785,4 43 | NM,317,3764,366,698,4 44 | AZ,332,4504,340,796,4 45 | UT,315,4005,378,804,4 46 | NV,291,5560,330,809,4 47 | WA,312,4989,313,726,4 48 | OR,316,4697,305,671,4 49 | CA,332,5438,307,909,4 50 | AK,546,5613,386,484,4 51 | HI,311,5309,333,831,4 52 | -------------------------------------------------------------------------------- /4.LogisticRegression/1.bin_class(scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 1.bin_class(scipy).pyt 2 | # Logistic Regression : binary classification 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | from scipy import optimize 13 | import numpy as np 14 | from sklearn.model_selection import train_test_split 15 | import matplotlib.pyplot as plt 16 | 17 | # Create a simple dataset for binary classification 18 | def bin_class_data(n): 19 | n1 = int(n / 2) 20 | a = np.random.normal(-1.0, 1.0, n1) 21 
| b = np.random.normal(1.0, 1.0, n1) 22 | x = np.hstack([a, b]).reshape(-1, 1) 23 | y = np.hstack([np.zeros(n1), np.ones(n1)]) 24 | return x, y 25 | 26 | x, y = bin_class_data(n=1000) # create 1000 data points 27 | X = np.hstack([np.ones([x.shape[0], 1]), x]) 28 | y = y.astype('int8') 29 | 30 | # Visually check the data 31 | plt.scatter(x, y, c='r', s=10, alpha=0.5) 32 | plt.show() 33 | 34 | # Split the data into training and test data 35 | x_train, x_test, y_train, y_test = train_test_split(X, y) 36 | 37 | # Loss function : mean of binary cross entropy 38 | def bce_loss(W, args): 39 | tx = args[0] 40 | ty = args[1] 41 | trc = args[2] 42 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, tx.T))) 43 | bce = -ty * np.log(y_hat + 1e-8) - (1.0 - ty) * np.log(1.0 - y_hat + 1e-8) 44 | loss = bce.mean() 45 | 46 | # save the loss 47 | if trc == True: 48 | trace_W.append([W, loss]) 49 | return loss 50 | 51 | # Perform an optimization process 52 | trace_W = [] 53 | result = optimize.minimize(fun = bce_loss, 54 | x0 = [-5, 15], 55 | args=[x_train, y_train, True]) 56 | 57 | # print the result. result.x contains the optimal parameters 58 | print(result) 59 | 60 | # Visually check the data and the predicted regression curves 61 | y_hat = 1.0 / (1 + np.exp(-np.dot(result.x, x_train.T))) 62 | plt.figure(figsize=(5, 4)) 63 | plt.scatter(x, y, s=5, c='r', label = 'data') 64 | plt.scatter(x_train[:, 1], y_hat, c='blue', s=1, label = 'sigmoid') 65 | plt.legend() 66 | plt.axhline(y = 0.5, linestyle='--', linewidth=0.5) 67 | plt.show() 68 | 69 | # Measure the accuracy of test data 70 | y_prob = 1.0 / (1 + np.exp(-np.dot(result.x, x_test.T))) 71 | y_pred = (y_prob > 0.5).astype('int8') 72 | acc = (y_pred == y_test).mean() 73 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 74 | 75 | # Visually check the loss function and the path to the optimal point 76 | w0, w1 = np.meshgrid(np.arange(-20, 20, 1), np.arange(-5, 20, 1)) 77 | zs = np.array([bce_loss(np.array([a, b]), [x_train, y_train, False]) \ 78 | for [a, b] in zip(np.ravel(w0), np.ravel(w1))]) 79 | z = zs.reshape(w0.shape) 80 | 81 | fig = plt.figure(figsize=(10,10)) 82 | ax = fig.add_subplot(111, projection='3d') 83 | 84 | # Drawing the surface of the loss function 85 | ax.plot_surface(w0, w1, z, alpha=0.7) 86 | 87 | # Drawing the path to the optimal point 88 | b = np.array([tw0 for [tw0, tw1], td in trace_W]) 89 | w = np.array([tw1 for [tw0, tw1], td in trace_W]) 90 | d = np.array([td for [tw0, tw1], td in trace_W]) 91 | ax.plot(b[0], w[0], d[0], marker='x', markersize=15, color="r") 92 | ax.plot(b[-1], w[-1], d[-1], marker='*', markersize=20, color="r") 93 | ax.plot(b, w, d, marker='o', color="r") 94 | 95 | ax.set_xlabel('W0 (bias)') 96 | ax.set_ylabel('W1 (slope)') 97 | ax.set_zlabel('cross entropy') 98 | ax.azim = 50 99 | ax.elev = 50 # [50, 0] 100 | plt.show() 101 | 102 | # Visually see that the loss decreases as the iteration progresses 103 | plt.figure(figsize=(5, 4)) 104 | plt.plot([e for w, e in trace_W], color='red') 105 | plt.title('train loss') 106 | plt.xlabel('epoch') 107 | plt.ylabel('loss') 108 | plt.show() 109 | -------------------------------------------------------------------------------- /4.LogisticRegression/10.lwlr(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-05] 10.lwlr(sklearn).py 2 | # Use the sample_weight argument in sklearn's LogisticRegression model. 
3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d1-QS4uTgj8 11 | # 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.model_selection import train_test_split 16 | 17 | # Generate a simple dataset 18 | def lwlr_data1(n): 19 | n1 = int(n / 3) 20 | a = np.random.normal(-1.0, 0.5, n1) 21 | b = np.random.normal(1.0, 0.5, n1) 22 | c = np.random.normal(3.0, 0.5, n - n1 * 2) 23 | x = np.hstack([a, b, c]).reshape(-1, 1) 24 | y = np.hstack([np.zeros(n1), np.ones(n1), np.zeros(n - n1 * 2)]) 25 | return x, y 26 | 27 | # Generate training and test data 28 | x, y = lwlr_data1(n=2000) 29 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2) 30 | 31 | # Visualize the dataset 32 | plt.figure(figsize=(6, 3)) 33 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 34 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', label='test') 35 | plt.legend() 36 | plt.show() 37 | 38 | # Calculating the weights of training data points 39 | # xx : training data, tx : test data 40 | def get_weight(xx, tx, tau): 41 | distance = np.sum(np.square(xx - tx), axis=1) 42 | w = np.exp(-distance / (2 * tau * tau)) 43 | return w 44 | 45 | y_prob = [] 46 | for tx in x_test: 47 | weight = get_weight(x_train, tx, 0.6) 48 | model = LogisticRegression() 49 | model.fit(x_train, y_train, sample_weight = weight) 50 | y_prob.append(model.predict_proba(tx.reshape(-1, 1))[:, 1]) 51 | 52 | y_prob = np.array(y_prob).reshape(-1,) 53 | 54 | # Visually check the training and test data, and predicted probability.
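# How local each fit is depends on tau. A quick illustrative sketch with
# made-up distances: a training point at Euclidean distance d from the query
# point gets weight exp(-d^2 / (2 * tau^2)), so with tau = 0.6 the weight
# drops off sharply beyond roughly one tau.
for d in [0.0, 0.6, 1.2, 2.4]:
    print('d = {:.1f} -> weight = {:.3f}'.format(d, np.exp(-d**2 / (2 * 0.6**2))))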
55 | plt.figure(figsize=(6, 3)) 56 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 57 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', label='test') 58 | plt.scatter(x_test, y_prob, s=5, c='red', label='prediction') 59 | plt.legend() 60 | plt.axhline(y=0.5, ls='--', lw=0.5, c='black') 61 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 62 | plt.axvline(x=2, ls='--', lw=0.5, c='black') 63 | plt.show() 64 | 65 | # Measure the accuracy of the test data 66 | y_pred = (y_prob > 0.5).astype('int8') 67 | acc = (y_pred == y_test).mean() 68 | print('\nAccuracy of the test data = {:.3f}'.format(acc)) 69 | -------------------------------------------------------------------------------- /4.LogisticRegression/11.lwlr_2(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-05] 11.lwlr_2(sklearn).py 2 | # Check the non-linear decision boundary 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d1-QS4uTgj8 11 | # 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from matplotlib.colors import ListedColormap 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.model_selection import train_test_split 17 | 18 | # Generate a simple dataset 19 | def lwlr_data2(n, s): 20 | n1 = int(n / 3) 21 | x, y = [], [] 22 | for a, b, c, m in [(1, 1, 0, n1), 23 | (2, 2, 1, n-n1*2), (3, 1, 0, n1)]: 24 | x1 = np.random.normal(a, s, m).reshape(-1,1) 25 | x2 = np.random.normal(b, s, m).reshape(-1,1) 26 | x.extend(np.hstack([x1, x2])) 27 | y.extend(np.ones(m) * c) 28 | x = np.array(x).reshape(-1, 2) 29 | y = np.array(y).astype('int8').reshape(-1, 1) 30 | return x, y.reshape(-1,) 31 | x, y = lwlr_data2(n=1000, s=0.5) 32 | 33 | # Visually check the data distribution. 
34 | m = ['o', '^'] 35 | color = ['red', 'blue'] 36 | plt.figure(figsize=(5,5)) 37 | for i in [0, 1]: 38 | idx = np.where(y == i) 39 | plt.scatter(x[idx, 0], x[idx, 1], 40 | c=color[i], 41 | marker = m[i], 42 | s = 20, 43 | edgecolor = 'black', 44 | alpha = 0.5, 45 | label='class-'+str(i)) 46 | plt.legend() 47 | plt.show() 48 | 49 | # Split the data into the training and test data 50 | x_train, x_test, y_train, y_test = train_test_split(x, y) 51 | 52 | # Calculating the weights of training data points 53 | # xx : training data, xx : test data 54 | def get_weight(xx, tx, tau): 55 | distance = np.sum(np.square(xx - tx), axis=1) 56 | w = np.exp(-distance / (2 * tau * tau)) 57 | return w 58 | 59 | # Predict the classes of the test data 60 | y_prob = [] 61 | tau = 0.1 62 | for tx in x_test: 63 | weight = get_weight(x_train, tx, tau) 64 | model = LogisticRegression() 65 | model.fit(x_train, y_train, sample_weight = weight) 66 | y_prob.append(model.predict_proba(tx.reshape(-1, 2))[:, 1]) 67 | y_prob = np.array(y_prob).reshape(-1,) 68 | 69 | # Measure the accuracy of the test data 70 | y_pred = (y_prob > 0.5).astype('int8') 71 | acc = (y_pred == y_test).mean() 72 | print('\nAccuracy of the test data = {:.3f}'.format(acc)) 73 | 74 | # Visualize the non-linear decision boundary 75 | # reference : 76 | # https://psrivasin.medium.com/ 77 | # plotting-decision-boundaries-using-numpy-and-matplotlib-f5613d8acd19 78 | x_min, x_max = x_test[:, 0].min() - 0.1, x_test[:,0].max() + 0.1 79 | y_min, y_max = x_test[:, 1].min() - 0.1, x_test[:, 1].max() + 0.1 80 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50), 81 | np.linspace(y_min, y_max, 50)) 82 | x_in = np.c_[xx.ravel(), yy.ravel()] 83 | 84 | # Predict the classes of the data points in the x_in variable. 85 | y_prob = [] 86 | for tx in x_in: 87 | weight = get_weight(x_train, tx, tau) 88 | 89 | model = LogisticRegression() 90 | model.fit(x_train, y_train, sample_weight = weight) 91 | y_prob.append(model.predict_proba(tx.reshape(-1, 2))[:, 1]) 92 | y_prob = np.array(y_prob).reshape(-1,) 93 | y_pred = (y_prob > 0.5).astype('int8') 94 | 95 | # Draw the decision boundary 96 | y_pred = np.round(y_pred).reshape(xx.shape) 97 | 98 | plt.figure(figsize=(5, 5)) 99 | for i in [0, 1]: 100 | idx = np.where(y == i) 101 | plt.scatter(x[idx, 0], x[idx, 1], 102 | c=color[i], 103 | marker = m[i], 104 | s = 40, 105 | edgecolor = 'black', 106 | alpha = 0.5, 107 | label='class-' + str(i)) 108 | plt.contour(xx, yy, y_pred, cmap=ListedColormap(['red', 'blue']), alpha=0.5) 109 | plt.axis('tight') 110 | plt.xlim(xx.min(), xx.max()) 111 | plt.ylim(yy.min(), yy.max()) 112 | plt.xlabel('x1') 113 | plt.ylabel('x2') 114 | plt.legend() 115 | plt.show() 116 | -------------------------------------------------------------------------------- /4.LogisticRegression/2.bin_class(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 2.bin_class(sklearn).py 2 | # Logistic Regression : binary classification 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | import numpy as np 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.datasets import make_blobs 16 | import matplotlib.pyplot as plt 17 | from matplotlib.colors import ListedColormap 
18 | 19 | # Create simple training data 20 | x, y = make_blobs(n_samples=1000, n_features=2, 21 | centers=[[1., 1.], [2., 2.]], 22 | cluster_std=0.5) 23 | 24 | # Visually check the data 25 | color = ['red', 'blue'] 26 | for i in [0, 1]: 27 | idx = np.where(y == i) 28 | plt.scatter(x[idx, 0], x[idx, 1], c=color[i], s = 10, 29 | alpha = 0.5, label='class-'+str(i)) 30 | plt.legend() 31 | plt.show() 32 | 33 | # Split the data into training and test data 34 | x_train, x_test, y_train, y_test = train_test_split(x, y) 35 | 36 | # Create a model and fit it to training data. 37 | model = LogisticRegression() 38 | model.fit(x_train, y_train) 39 | 40 | # Predict the classes of test data, and measure the accuracy. 41 | y_pred = model.predict(x_test) 42 | acc = (y_pred == y_test).mean() 43 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 44 | 45 | # Visually check the decision boundary. 46 | # reference : 47 | # https://psrivasin.medium.com/ 48 | # plotting-decision-boundaries-using-numpy-and-matplotlib-f5613d8acd19 49 | x1_min, x1_max = x_test[:, 0].min() - 0.1, x_test[:,0].max() + 0.1 50 | y1_min, y1_max = x_test[:, 1].min() - 0.1, x_test[:, 1].max() + 0.1 51 | x1, x2 = np.meshgrid(np.linspace(x1_min, x1_max, 100), 52 | np.linspace(y1_min, y1_max, 100)) 53 | x_in = np.c_[x1.ravel(), x2.ravel()] # shape = (10000, 2) 54 | 55 | # Predict all the data points in the meshgrid area. 56 | y_pred = model.predict(x_in) 57 | 58 | # Drawing the data and decision boundary 59 | y_pred = y_pred.reshape(x1.shape) # shape = (100, 100) 60 | 61 | plt.figure(figsize=(5,5)) 62 | m = ['o', '^'] 63 | color = ['red', 'blue'] 64 | for i in [0, 1]: 65 | idx = np.where(y == i) 66 | plt.scatter(x[idx, 0], x[idx, 1], 67 | c=color[i], 68 | marker = m[i], 69 | s = 40, 70 | edgecolor = 'black', 71 | alpha = 0.5, 72 | label='class-'+str(i)) 73 | plt.contour(x1, x2, y_pred, cmap=ListedColormap(['red', 'blue']), alpha=0.5) 74 | 75 | plt.axis('tight') 76 | plt.xlim(x1.min(), x1.max()) 77 | plt.ylim(x2.min(), x2.max()) 78 | plt.xlabel('x1') 79 | plt.ylabel('x2') 80 | plt.legend() 81 | plt.show() 82 | -------------------------------------------------------------------------------- /4.LogisticRegression/3.bin_class(scipy_cancer).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 3.bin_class(scipy_cancer).py 2 | # Breast cancer dataset 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | from scipy import optimize 13 | import numpy as np 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.datasets import load_breast_cancer 16 | import matplotlib.pyplot as plt 17 | 18 | # Read breast cancer dataset 19 | x, y = load_breast_cancer(return_X_y=True) 20 | 21 | # Split the data into training and test data 22 | x_train, x_test, y_train, y_test = train_test_split(x, y) 23 | 24 | # Z-score normalization 25 | # When normalzing the test data, use the mean and standard deviation 26 | # from the training data. 27 | x_mean = x_train.mean(axis=0).reshape(1, -1) 28 | x_std = x_train.std(axis=0).reshape(1, -1) 29 | x_train = (x_train - x_mean) / x_std 30 | x_test = (x_test - x_mean) / x_std 31 | 32 | # Add a column vector with all 1 to the feature matrix. 33 | # [0.3, 0.4, ...] --> [1.0, 0.3, 0.4, ...] 34 | # [0.1, 0.5, ...] --> [1.0, 0.1, 0.5, ...] 
35 | # [ ...] 36 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 37 | x1_test = np.hstack([np.ones([x_test.shape[0], 1]), x_test]) 38 | 39 | REG_CONST = 0.01 # regularization constant 40 | 41 | # Loss function : mean of binary cross entropy 42 | def bce_loss(W, args): 43 | train_x = args[0] 44 | train_y = args[1] 45 | test_x = args[2] 46 | test_y = args[3] 47 | 48 | # Calculate the loss of training data 49 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, train_x.T))) 50 | train_bce = -train_y * np.log(y_hat + 1e-10) - (1.0 - train_y) * np.log(1.0 - y_hat + 1e-10) 51 | train_loss = train_bce.mean() + REG_CONST * np.mean(np.square(W)) 52 | 53 | # Calculate the loss of test data 54 | # It is independent of training and is measured later to observe changes in loss. 55 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, test_x.T))) 56 | test_bce = -test_y * np.log(y_hat + 1e-10) - (1.0 - test_y) * np.log(1.0 - y_hat + 1e-10) 57 | test_loss = test_bce.mean() + REG_CONST * np.mean(np.square(W)) 58 | 59 | # Save the loss 60 | trc_train_loss.append(train_loss) 61 | trc_test_loss.append(test_loss) 62 | 63 | return train_loss 64 | 65 | # Perform an optimization process 66 | trc_train_loss = [] 67 | trc_test_loss = [] 68 | init_w = np.ones(x1_train.shape[1]) * 0.1 69 | result = optimize.minimize(fun = bce_loss, 70 | x0 = init_w, 71 | args=[x1_train, y_train, x1_test, y_test]) 72 | 73 | # print the result. result.x contains the optimal parameters 74 | print(result) 75 | 76 | # Measure the accuracy of test data 77 | y_prob = 1.0 / (1 + np.exp(-np.dot(result.x, x1_test.T))) 78 | y_pred = (y_prob > 0.5).astype('int8') 79 | acc = (y_pred == y_test).mean() 80 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 81 | 82 | # Visually see that the loss decreases as the iteration progresses 83 | plt.figure(figsize=(5, 4)) 84 | plt.plot(trc_train_loss, color='blue', label='train loss') 85 | plt.plot(trc_test_loss, color='red', label='test loss') 86 | plt.legend() 87 | plt.title('Loss history') 88 | plt.xlabel('epoch') 89 | plt.ylabel('loss') 90 | plt.show() 91 | 92 | -------------------------------------------------------------------------------- /4.LogisticRegression/4.bin_class(sklearn_cancer).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 4.bin_class(sklearn_cancer).py 2 | # Using sklearn's LogisticRegression() 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | import numpy as np 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.datasets import load_breast_cancer 16 | 17 | # Read breast cancer dataset 18 | cancer = load_breast_cancer() 19 | x = cancer['data'] 20 | y = cancer['target'] 21 | 22 | # Split the data into training and test data 23 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 24 | 25 | # Z-score normalization 26 | # When normalzing the test data, use the mean and standard deviation 27 | # from the training data. 28 | x_mean = x_train.mean(axis=0).reshape(1, -1) 29 | x_std = x_train.std(axis=0).reshape(1, -1) 30 | x_train = (x_train - x_mean) / x_std 31 | x_test = (x_test - x_mean) / x_std 32 | 33 | # regularization constant (strenth) 34 | REG_CONST = 0.01 35 | 36 | # Create a model and fit it to the training data. 
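# A rough correspondence, not an exact equivalence: the scipy version in
# 3.bin_class(scipy_cancer).py penalizes the mean cross entropy with
# REG_CONST * mean(W^2), while sklearn scales the data-fit term by C and adds
# a fixed L2 penalty, so a comparable setting is roughly C = 1 / REG_CONST.
print('C passed to LogisticRegression below:', 1. / REG_CONST)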
37 | # C: inverse of regularization strength 38 | model = LogisticRegression(penalty='l2', C=1./REG_CONST, max_iter=300) 39 | model.fit(x_train, y_train) 40 | 41 | # Predict the classes of test data and measure the accuracy of test data 42 | y_pred = model.predict(x_test) 43 | acc = (y_pred == y_test).mean() 44 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 45 | -------------------------------------------------------------------------------- /4.LogisticRegression/5.multiclass(ovr_1).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-03] 5.multiclass(ovr_1).py 2 | # Multi-class classification (OvR : one vs rest) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d6FcGZp8AHc 11 | # 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.preprocessing import OneHotEncoder 14 | from sklearn.datasets import load_iris 15 | from sklearn.model_selection import train_test_split 16 | import numpy as np 17 | 18 | # Read iris dataset 19 | x, y = load_iris(return_X_y=True) 20 | 21 | # one-hot encoding of the y labels. 22 | y_ohe = OneHotEncoder().fit_transform(y.reshape(-1,1)).toarray() 23 | 24 | # Split the data into the training and test data 25 | x_train, x_test, y_train, y_test = train_test_split(x, y_ohe, test_size = 0.2) 26 | 27 | # Perform the OvR. Since there are three labels, three models are used. 28 | models = [] 29 | for m in range(y_train.shape[1]): 30 | y_sub = y_train[:, m] # y for binary classification 31 | models.append(LogisticRegression()) 32 | models[-1].fit(x_train, y_sub) 33 | 34 | # The labels of the test data are predicted using three trained models. 35 | y_prob = np.zeros(shape=y_test.shape) 36 | for m in range(y_test.shape[1]): 37 | y_prob[:, m] = models[m].predict_proba(x_test)[:, 1] 38 | 39 | # y is predicted as the label with the highest value in y_prob. 40 | y_pred = np.argmax(y_prob, axis=1) 41 | 42 | # Measure the accuracy of the test data 43 | y_true = np.argmax(y_test, axis=1) 44 | acc = (y_true == y_pred).mean() 45 | print('Accuracy of test data = {:.3f}'.format(acc)) 46 | 47 | # Check the estimated parameters. 
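# A quick sanity check before printing the parameters (illustrative): under
# OvR the three columns of y_prob come from independent binary models, so the
# rows generally do not sum to 1; taking the argmax over the columns is still
# the decision rule used above.
print('Row sums of y_prob (first 5):', y_prob[:5].sum(axis=1).round(3))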
48 | for m in range(y_test.shape[1]): 49 | w = models[m].coef_ 50 | b = models[m].intercept_ 51 | print("\nModel-{}:".format(m)) 52 | print("w:", w) 53 | print("b:", b) 54 | -------------------------------------------------------------------------------- /4.LogisticRegression/6.multiclass(ovr_2).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-03] 6.multiclass(ovr_2).py 2 | # Multiclass classification (OvR : One-vs-Rest) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d6FcGZp8AHc 11 | # 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.datasets import load_iris 14 | from sklearn.model_selection import train_test_split 15 | import numpy as np 16 | 17 | # Read iris dataset 18 | x, y = load_iris(return_X_y=True) 19 | 20 | # Split the data into the training and test data 21 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 22 | 23 | # Use the multi_class='ovr' option of sklearn's LogisticRegression. 24 | # Even if 'ovr' is not set, multiclass classification is performed 25 | # automatically based on the number of classes in y. 26 | # It is set explicitly here to make the OvR behavior easy to follow. 27 | model = LogisticRegression(multi_class='ovr', max_iter=300) 28 | model.fit(x_train, y_train) 29 | 30 | # Predict the classes of the test data 31 | y_pred = model.predict(x_test) 32 | 33 | # Measure the accuracy of the test data 34 | acc = (y_test == y_pred).mean() 35 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 36 | 37 | # Check the estimated parameters. 38 | print('\nmodel.coef_ =\n\n', model.coef_) 39 | print('\nmodel.intercept_ =\n\n', model.intercept_) 40 | 41 | -------------------------------------------------------------------------------- /4.LogisticRegression/7.multiclass(softmax_scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-04] 7.multiclass(softmax_scipy).py 2 | # Multiclass classification (Softmax regression) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/D_z48GLwAyM 11 | # 12 | from scipy import optimize 13 | from sklearn.datasets import load_iris 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.preprocessing import OneHotEncoder 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | 19 | # Read iris dataset 20 | x, y = load_iris(return_X_y=True) 21 | 22 | # one-hot encoding of the y labels. 23 | y_ohe = OneHotEncoder().fit_transform(y.reshape(-1,1)).toarray() 24 | 25 | # Split the data into the training and test data 26 | x_train, x_test, y_train, y_test = train_test_split(x, y_ohe) 27 | 28 | # Add a column vector with all 1 to the feature matrix.
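# A small illustration with made-up numbers of what the next two lines do:
# [[0.3, 0.4],        [[1.0, 0.3, 0.4],
#  [0.1, 0.5]]   -->   [1.0, 0.1, 0.5]]
demo = np.hstack([np.ones([2, 1]), np.array([[0.3, 0.4], [0.1, 0.5]])])
print('bias-augmented demo:\n', demo)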
29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | x1_test = np.hstack([np.ones([x_test.shape[0], 1]), x_test]) 31 | 32 | REG_CONST = 0.01 # Regularization constant 33 | n_feature = x_train.shape[1] # The number of features 34 | n_class = y_train.shape[1] # The number of classes 35 | 36 | def softmax(z): 37 | s = np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1,1) 38 | return s 39 | 40 | # Loss function: mean of cross entropy 41 | def ce_loss(W, args): 42 | train_x = args[0] # shape=(112,5) 43 | train_y = args[1] # shape=(112,3) 44 | test_x = args[2] 45 | test_y = args[3] 46 | W = W.reshape((n_class, n_feature + 1)) # shape=(3, 5) 47 | 48 | # Calculate the loss of training data 49 | z = np.dot(W, train_x.T).T # shape=(112, 3) 50 | y_hat = softmax(z) 51 | train_ce = np.sum(-train_y * np.log(y_hat + 1e-10), axis=1) 52 | train_loss = train_ce.mean() + REG_CONST * np.mean(np.square(W)) 53 | 54 | # Calculate the loss of test data 55 | # It is independent of training and is measured later to observe changes in loss. 56 | z = np.dot(W, test_x.T).T 57 | y_hat = softmax(z) 58 | test_ce = np.sum(-test_y * np.log(y_hat + 1e-10), axis=1) 59 | test_loss = test_ce.mean() + REG_CONST * np.mean(np.square(W)) 60 | 61 | # Save the loss 62 | trc_train_loss.append(train_loss) 63 | trc_test_loss.append(test_loss) 64 | 65 | return train_loss 66 | 67 | # Perform an optimization process 68 | trc_train_loss = [] 69 | trc_test_loss = [] 70 | init_w = np.ones(n_class * (n_feature + 1)) * 0.1 # shape=(3, 5) → 1D 71 | 72 | # constraints: w0 = 0, b0 = 0 73 | def b0_w0(w): 74 | n = np.arange(n_feature + 1) 75 | return w[n] 76 | 77 | cons = [{'type':'eq', 'fun': b0_w0}] 78 | result = optimize.minimize(ce_loss, init_w, 79 | constraints=cons, 80 | args=[x1_train, y_train, x1_test, y_test]) 81 | 82 | # print the result. 
result.x contains the optimal parameters 83 | print(result) 84 | 85 | # Measure the accuracy of test data 86 | W = result.x.reshape(n_class, n_feature + 1) 87 | z = np.dot(W, x1_test.T).T 88 | y_prob = softmax(z) 89 | y_pred = np.argmax(y_prob, axis=1) 90 | y_true = np.argmax(y_test, axis=1) 91 | acc = (y_pred == y_true).mean() 92 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 93 | 94 | # Visually see that the loss decreases as the iteration progresses 95 | plt.figure(figsize=(5, 4)) 96 | plt.plot(trc_train_loss, color='blue', label='train loss') 97 | plt.plot(trc_test_loss, color='red', label='test loss') 98 | plt.legend() 99 | plt.title('Loss history') 100 | plt.xlabel('epoch') 101 | plt.ylabel('loss') 102 | plt.show() 103 | 104 | # Check the parameters 105 | w = result.x.reshape((n_class, n_feature + 1)) 106 | print('\n', w) 107 | 108 | -------------------------------------------------------------------------------- /4.LogisticRegression/8.multiclass(softmax_sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-04] 8.multiclass(softmax_sklearn).py 2 | # Multi-class classification (Softmax regression) 3 | # Use LogisticRegression(multi_class='multinomial') 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/D_z48GLwAyM 12 | # 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.datasets import load_iris 15 | from sklearn.model_selection import train_test_split 16 | 17 | # Read iris dataset 18 | x, y = load_iris(return_X_y=True) 19 | 20 | # Split the data into the training and test data 21 | x_train, x_test, y_train, y_test = train_test_split(x, y) 22 | 23 | # Create a model and fit it to the training data. 24 | # Use multi_class = 'multinomial' 25 | model = LogisticRegression(multi_class='multinomial', max_iter=300) 26 | model.fit(x_train, y_train) 27 | 28 | # Predict the classes of the test data 29 | y_pred = model.predict(x_test) 30 | 31 | # Measure the accuracy 32 | acc = (y_test == y_pred).mean() 33 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 34 | 35 | # Check the estimated parameters.
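# A quick illustrative check: unlike the OvR models in 5.multiclass(ovr_1).py,
# the multinomial model pushes all classes through a single softmax, so each
# row of predict_proba sums to 1, and coef_ holds one weight row per class.
print('coef_ shape:', model.coef_.shape)   # expected (3, 4) for iris
print('probability row sums:', model.predict_proba(x_test[:3]).sum(axis=1).round(3))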
36 | print('\nmodel.coef_ =\n\n', model.coef_) 37 | print('\nmodel.intercept_ =\n\n', model.intercept_) 38 | 39 | -------------------------------------------------------------------------------- /4.LogisticRegression/9.lwlr(scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-05] 9.lwlr(scipy).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/d1-QS4uTgj8 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from scipy import optimize 14 | from sklearn.model_selection import train_test_split 15 | 16 | # Generate a simple dataset 17 | def lwlr_data1(n): 18 | n1 = int(n / 3) 19 | a = np.random.normal(-1.0, 0.5, n1) 20 | b = np.random.normal(1.0, 0.5, n1) 21 | c = np.random.normal(3.0, 0.5, n - n1 * 2) 22 | x = np.hstack([a, b, c]).reshape(-1, 1) 23 | y = np.hstack([np.zeros(n1), np.ones(n1), np.zeros(n - n1 * 2)]) 24 | return x, y 25 | 26 | # Generate training and test data 27 | x, y = lwlr_data1(n=2000) 28 | x_train, x_test, y_train, y_test = train_test_split(x, y) 29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | x1_test = np.hstack([np.ones([x_test.shape[0], 1]), x_test]) 31 | 32 | # Visualize the dataset 33 | plt.figure(figsize=(6, 3)) 34 | plt.scatter(x_train, y_train, s=5, c='orange', alpha=0.5, label='train') 35 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', alpha=0.5, label='test') 36 | plt.legend() 37 | plt.show() 38 | 39 | # Calculating the weights of training data points 40 | # xx : training data, tx : test data 41 | def get_weight(xx, tx, tau): 42 | distance = np.sum(np.square(xx - tx), axis=1) 43 | w = np.exp(-distance / (2 * tau * tau)) 44 | return w 45 | 46 | # the mean of weighted binary cross entropy 47 | def wbce_loss(W, weight): 48 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, x1_train.T))) 49 | bce = -y_train * np.log(y_hat + 1e-10) - (1.0 - y_train) * np.log(1.0 - y_hat + 1e-10) 50 | bce *= weight 51 | return bce.mean() 52 | 53 | y_prob = [] 54 | for tx in x1_test: 55 | weight = get_weight(x_train, tx, 0.6) 56 | result = optimize.minimize(wbce_loss, [0.1, 0.1], args=weight) 57 | y_prob.append(1.0 / (1 + np.exp(-np.dot(result.x, tx.T)))) 58 | y_prob = np.array(y_prob).reshape(-1,) 59 | 60 | # Visually check the training and test data, 61 | # and the predicted probability. 
62 | plt.figure(figsize=(6, 3)) 63 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 64 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', label='test') 65 | plt.scatter(x_test, y_prob, s=5, c='red', label='prediction') 66 | plt.legend() 67 | plt.axhline(y=0.5, ls='--', lw=0.5, c='black') 68 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 69 | plt.axvline(x=2, ls='--', lw=0.5, c='black') 70 | plt.show() 71 | 72 | # Measure the accuracy of the test data 73 | y_pred = (y_prob > 0.5).astype('int8') 74 | acc = (y_pred == y_test).mean() 75 | print('\nAccuracy of the test data = {:.3f}'.format(acc)) 76 | -------------------------------------------------------------------------------- /5.Convex/1.plot_convex.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-01] 1.plot_convex.py (Plot 3D convex function) 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/8BiHfVrdClU 10 | # 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | # f(x) 15 | def f_xy(x1, x2): 16 | return (x1 ** 2) + (x2 ** 2) 17 | # return 3 * x1 + x2 18 | # return (x1 ** 2) + x2 * (x1 - 1) 19 | # return 2 * (x1 ** 2) + (x2 ** 2) + x1 * x2 + x1 + x2 20 | # return -5 * x1 / 3 - x2 + 5 21 | 22 | t = 0.1 23 | x, y = np.meshgrid(np.arange(-10, 10, t), np.arange(-10, 10, t)) 24 | zs = np.array([f_xy(a, b) for [a, b] in zip(np.ravel(x), np.ravel(y))]) 25 | z = zs.reshape(x.shape) 26 | 27 | fig = plt.figure(figsize=(7,7)) 28 | ax = fig.add_subplot(111, projection='3d') 29 | 30 | # surface를 그린다. 31 | ax.plot_surface(x, y, z, alpha=0.7) 32 | 33 | ax.set_xlabel('x1') 34 | ax.set_ylabel('x2') 35 | ax.set_zlabel('f(x)') 36 | ax.azim = -50 37 | ax.elev = 30 38 | plt.show() 39 | 40 | 41 | -------------------------------------------------------------------------------- /5.Convex/2.EQP.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-03] 2.EQP.py 2 | # Equality constrained QP (EQP) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yn04TeRxKko 11 | # 12 | # Least squares problem: 13 | # minimize x1^2 + x2^2 14 | # subject to x1 + x2 = 1 15 | # 16 | # QP standard form: 17 | # minimize 1/2 * xT.P.x + qT.x 18 | # subject to G.x <= h 19 | # A.x = b 20 | # 21 | # min. 1/2 * [x1 x2][2 0][x1] + [0 0][x1] 22 | # [0 2][x2] [x2] 23 | # 24 | # s.t. [1 1][x1] = 1 25 | # [x2] 26 | # 27 | # x = [x1] P = [2 0] q = [0] A = [1 1] b = 1 28 | # [x2] [0 2] [0] 29 | from cvxopt import matrix, solvers 30 | import numpy as np 31 | 32 | P = matrix(np.array([[2, 0], [0, 2]]), tc='d') 33 | q = matrix(np.array([[0], [0]]), tc='d') 34 | A = matrix(np.array([[1, 1]]), tc='d') 35 | b = matrix(1, tc='d') 36 | 37 | sol = solvers.qp(P, q, A=A, b=b) 38 | 39 | p_star = sol['primal objective'] 40 | x1, x2 = sol['x'] 41 | y = sol['y'][0] # Lagrange multiplier for A.x = b 42 | gap = sol['gap'] # duality gap 43 | 44 | # z and y are Lagrange multipliers. z is not used here. 
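# KKT stationarity for this EQP reads P.x + q + A^T.y = 0, i.e. 2*x_i + y = 0,
# which together with x1 + x2 = 1 gives x1 = x2 = 0.5 and y = -1.
# A quick numeric sketch of that check (P and A rebuilt as plain arrays):
P_np = np.array([[2., 0.], [0., 2.]])
residual = P_np @ np.array([x1, x2]) + y * np.array([1., 1.])
print('stationarity residual:', residual.round(6))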
45 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 46 | # zT = z-transpose, yT = y-transpose 47 | print('\nx1 = {:.3f}'.format(x1)) 48 | print('x2 = {:.3f}'.format(x2)) 49 | print('y = {:.3f}'.format(y)) 50 | print('p* = {:.3f}'.format(p_star)) 51 | print('duality gap = {:.3f}'.format(gap)) 52 | 53 | 54 | -------------------------------------------------------------------------------- /5.Convex/3.IQP_1.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-03] 3.IQP_1.py 2 | # Inequality constrained QP (IQP-1) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yn04TeRxKko 11 | # 12 | # 13 | # Least squares problem: 14 | # minimize x1^2 + x2^2 15 | # subject to x1 + x2 <= 1 16 | # 17 | # QP standard form 18 | # minimize 1/2 * xT.P.x + qT.x 19 | # subject to G.x <= h 20 | # A.x = b 21 | # 22 | # min. 1/2 [x1 x2][2 0][x1] + [0 0][x1] 23 | # [0 2][x2] [x2] 24 | # 25 | # s.t. [1 1][x1] <= 1 26 | # [x2] 27 | # 28 | # x = [x1] P = [2 0] q = [0] G = [1 1] h = 1 29 | # [x2] [0 2] [0] 30 | from cvxopt import matrix, solvers 31 | import numpy as np 32 | 33 | P = matrix(np.array([[2, 0], [0, 2]]), tc='d') 34 | q = matrix(np.array([[0], [0]]), tc='d') 35 | G = matrix(np.array([[1, 1]]), tc='d') 36 | h = matrix(1, tc='d') 37 | 38 | sol = solvers.qp(P, q, G, h) 39 | 40 | p_star = sol['primal objective'] 41 | x1, x2 = sol['x'] 42 | z = sol['z'][0] # Lagrange multiplier for G.x <= h 43 | gap = sol['gap'] # duality gap 44 | s = sol['s'][0] # slack variable 45 | 46 | # z and y are Lagrange multipliers. y is not used here. 47 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 48 | # zT = z-transpose, yT = y-transpose 49 | print('\nx1 = {:.3f}'.format(x1)) 50 | print('x2 = {:.3f}'.format(x2)) 51 | print('z = {:.3f}'.format(z)) 52 | print('s = {:.3f}'.format(s)) 53 | print('p* = {:.3f}'.format(p_star)) 54 | print('duality gap = {:.3f}'.format(gap)) 55 | 56 | -------------------------------------------------------------------------------- /5.Convex/4.IQP_2.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-03] 4.IQP_2.py 2 | # Inequality constrained QP (IQP-2) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yn04TeRxKko 11 | # 12 | # 13 | # Least squares problem: 14 | # minimize x1^2 + x2^2 15 | # subject to x1 + x2 >= 1 --> -x1 - x2 <= -1로 변환. 16 | # 17 | # QP standard form 18 | # minimize (1/2) * xT.P.x + qT.x 19 | # subject to G.x <= h 20 | # A.x = b 21 | # 22 | # min. (1/2) * [x1 x2][2 0][x1] + [0 0][x1] 23 | # [0 2][x2] [x2] 24 | # 25 | # s.t. 
[-1 -1][x1] <= -1 26 | # [x2] 27 | # 28 | # x = [x1] P = [2 0] q = [0] G = [-1 -1] h = -1 29 | # [x2] [0 2] [0] 30 | from cvxopt import matrix, solvers 31 | import numpy as np 32 | 33 | P = matrix(np.array([[2, 0], [0, 2]]), tc='d') 34 | q = matrix(np.array([[0], [0]]), tc='d') 35 | G = matrix(np.array([[-1, -1]]), tc='d') 36 | h = matrix(-1, tc='d') 37 | 38 | sol = solvers.qp(P, q, G, h) 39 | 40 | p_star = sol['primal objective'] 41 | x1, x2 = sol['x'] 42 | z = sol['z'][0] # Lagrange multiplier for G.x <= h 43 | s = sol['s'][0] # slack variable 44 | gap = sol['gap'] # duality gap 45 | 46 | # z and y are Lagrange multipliers. y is not used here. 47 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 48 | # zT = z-transpose, yT = y-transpose 49 | print('\nx1 = {:.3f}'.format(x1)) 50 | print('x2 = {:.3f}'.format(x2)) 51 | print('z = {:.3f}'.format(z)) 52 | print('s = {:.3f}'.format(s)) 53 | print('p* = {:.3f}'.format(p_star)) 54 | print('duality gap = {:.3f}'.format(gap)) 55 | 56 | -------------------------------------------------------------------------------- /5.Convex/5.QP.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-04] 5.QP.py 2 | # QP problem with an equality and an inequality constraints. 3 | # https://cvxopt.org/examples/tutorial/qp.html 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/_5QuyiCI1rc 12 | # 13 | # min. 2 * x1^2 + x2^2 + x1 * x2 + x1 + x2 14 | # s.t. x1 >= 0 15 | # x2 >= 0 16 | # x1 + x2 = 1 17 | # 18 | # QP standard form 19 | # minimize 1/2 * xT.P.x + qT.x 20 | # subject to G.x <= h 21 | # A.x = b 22 | # 23 | # min. 1/2 [x1 x2][4 1][x1] + [1 1][x1] 24 | # [1 2][x2] [x2] 25 | # 26 | # s.t. [-1 0][x1] <= [0] 27 | # [ 0 -1][x2] [0] 28 | # 29 | # [1 1][x1] = 1 30 | # [x2] 31 | # 32 | # x = [x1] P = [4 1] q = [1] G = [-1 0] h = [0] A = [1 1] b = 1 33 | # [x2] [1 2] [1] [ 0 -1] [0] 34 | from cvxopt import matrix, solvers 35 | import numpy as np 36 | 37 | P = matrix(np.array([[4, 1], [1, 2]]), tc='d') 38 | q = matrix(np.array([[1], [1]]), tc='d') 39 | G = matrix(np.array([[-1, 0],[0, -1]]), tc='d') 40 | h = matrix(np.array([[0], [0]]), tc='d') 41 | A = matrix(np.array([[1, 1]]), tc='d') 42 | b = matrix(1, tc='d') 43 | 44 | sol = solvers.qp(P, q, G, h, A, b) 45 | 46 | p_star = sol['primal objective'] 47 | x1, x2 = sol['x'] 48 | y = sol['y'][0] # Lagrange multiplier for x1 + x2 = 1 49 | z1 = sol['z'][0] # Lagrange multiplier for -x1 <= 0 50 | z2 = sol['z'][1] # Lagrange multiplier for -x2 <= 0 51 | gap = sol['gap'] # duality gap 52 | 53 | # z and y are Lagrange multipliers. 
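# Complementary slackness, sketched numerically: each multiplier z_i times its
# slack (G_i.x - h_i) should vanish at the optimum. Here the optimum has
# x1, x2 > 0, so both inequality multipliers are expected to be ~0.
print('z1 * (-x1) =', round(z1 * (-x1), 6))
print('z2 * (-x2) =', round(z2 * (-x2), 6))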
54 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 55 | # zT = z-transpose, yT = y-transpose 56 | print('\nx1 = {:.3f}'.format(x1)) 57 | print('x2 = {:.3f}'.format(x2)) 58 | print('y = {:.3f}'.format(y)) 59 | print('z1 = {:.3f}'.format(z1)) 60 | print('z2 = {:.3f}'.format(z2)) 61 | print('p* = {:.3f}'.format(p_star)) 62 | print('duality gap = {:.3f}'.format(gap)) -------------------------------------------------------------------------------- /5.Convex/6.LP.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-04] 6.LP.py 2 | # https://cvxopt.org/examples/tutorial/lp.html 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/_5QuyiCI1rc 11 | # 12 | # min. 2 * x1 + x2 13 | # s.t. -x1 + x2 <= 1 14 | # x1 + x2 >= 2 --> -x1 - x2 <= -2 15 | # x2 >= 0 --> -x2 <= 0 16 | # x1 - 2 * x2 <= 4 17 | # x1 - 5 * x2 = 15 18 | # 19 | # LP standard form 20 | # minimize cT.x 21 | # subject to G.x <= h 22 | # A.x = b 23 | # 24 | # min. [2 1][x1] 25 | # [x2] 26 | # 27 | # s.t. G.x <= h [-1 1][x1] <= [ 1] 28 | # [-1 -1][x2] [-2] 29 | # [ 0 -1] [ 0] 30 | # [ 1 -2] [ 4] 31 | # 32 | # A.x = b [1 -5][x1] = 15 33 | # [x2] 34 | # 35 | # x = [x1] c = [2] G = [-1 1] h = [ 1] A = [1 1] b = 1 36 | # [x2] [1] [-1 -1] [-2] 37 | # [ 0 -1] [ 0] 38 | # [ 1 -2] [ 4] 39 | from cvxopt import matrix, solvers 40 | import numpy as np 41 | 42 | c = matrix(np.array([[2], [1]]), tc='d') 43 | G = matrix(np.array([[-1, 1],[-1, -1],[0, -1],[1, -2]]), tc='d') 44 | h = matrix(np.array([[1], [-2], [0], [4]]), tc='d') 45 | A = matrix(np.array([[1, -5]]), tc='d') 46 | b = matrix(1, tc='d') 47 | sol = solvers.lp(c, G, h, A, b) 48 | 49 | p_star = sol['primal objective'] 50 | x1, x2 = sol['x'] 51 | y = sol['y'][0] # Lagrange multiplier for A.x = b 52 | z1 = sol['z'][0] # Lagrange multiplier for G1.x <= h1 53 | z2 = sol['z'][1] # Lagrange multiplier for G2.x <= h2 54 | z3 = sol['z'][2] # Lagrange multiplier for G3.x <= h3 55 | z4 = sol['z'][3] # Lagrange multiplier for G4.x <= h4 56 | gap = sol['gap'] # duality gap 57 | 58 | # z and y are Lagrange multipliers. 59 | # L = cT.x + zT(G.x - h) + yT(A.x - b) 60 | # zT = z-transpose, yT = y-transpose 61 | print('\nx1 = {:.3f}'.format(x1)) 62 | print('x2 = {:.3f}'.format(x2)) 63 | print('y = {:.3f}'.format(y)) 64 | print('z1 = {:.3f}'.format(z1)) 65 | print('z2 = {:.3f}'.format(z2)) 66 | print('z3 = {:.3f}'.format(z3)) 67 | print('z4 = {:.3f}'.format(z4)) 68 | print('p* = {:.3f}'.format(p_star)) 69 | print('duality gap = {:.3f}'.format(gap)) 70 | -------------------------------------------------------------------------------- /6.SVM/1.cvxopt(hard_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-02] 1.cvxopt(hard_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/9oRPq9oa4uA 10 | # 11 | from cvxopt import matrix as matrix 12 | from cvxopt import solvers as solvers 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | 16 | # 3 data points. 
17 | x = np.array([[1., 3.], [2., 2.], [1., 1.]]) 18 | y = np.array([[1.], [1.], [-1.]]) 19 | 20 | # Calculate H matrix 21 | H = np.outer(y, y) * np.dot(x, x.T) 22 | 23 | # Construct the matrices required for QP in standard form. 24 | n = x.shape[0] 25 | P = matrix(H) 26 | q = matrix(-np.ones((n, 1))) 27 | G = matrix(-np.eye(n)) 28 | h = matrix(np.zeros(n)) 29 | A = matrix(y.reshape(1, -1)) 30 | b = matrix(np.zeros(1)) 31 | 32 | # solver parameters 33 | solvers.options['abstol'] = 1e-10 34 | solvers.options['reltol'] = 1e-10 35 | solvers.options['feastol'] = 1e-10 36 | 37 | # Perform QP 38 | sol = solvers.qp(P, q, G, h, A, b) 39 | 40 | # the solution of the QP, λ 41 | lamb = np.array(sol['x']) 42 | 43 | # Calculate w using the lambda, which is the solution to QP. 44 | w = np.sum(lamb * y * x, axis=0).reshape(1, -1) 45 | 46 | # Find support vectors 47 | sv_idx = np.where(lamb > 1e-5)[0] 48 | sv_lamb = lamb[sv_idx] 49 | sv_x = x[sv_idx] 50 | sv_y = y[sv_idx].reshape(1, -1) 51 | 52 | # Calculate b using the support vectors and calculate the average. 53 | # Reference: Bishop, Pattern Recognition and Machine Learning, p.330, 54 | # equation (7.18) 55 | b = sv_y - np.dot(w, sv_x.T) 56 | b = np.mean(b) 57 | 58 | print('\nlambda =', np.round(lamb.flatten(), 3)) 59 | print('w =', np.round(w, 3)) 60 | print('b =', np.round(b, 3)) 61 | 62 | # Visualize the data points 63 | plt.figure(figsize=(5,5)) 64 | color= ['red' if a == 1 else 'blue' for a in y] 65 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 66 | plt.xlim(0, 4) 67 | plt.ylim(0, 4) 68 | 69 | # Visualize the decision boundary 70 | x1_dec = np.linspace(0, 4, 50).reshape(-1, 1) 71 | x2_dec = -(w[0][0] / w[0][1]) * x1_dec - b / w[0][1] 72 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 73 | 74 | # Visualize the positive & negative boundary 75 | w_norm = np.sqrt(np.sum(w ** 2)) 76 | w_unit = w / w_norm 77 | half_margin = 1 / w_norm 78 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 79 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 80 | 81 | plt.plot(upper[:, 0], upper[:, 1], '--', lw=1.0, label='positive boundary') 82 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 83 | 84 | 85 | plt.scatter(sv_x[:, 0], sv_x[:, 1], s=50, marker='o', c='white') 86 | 87 | for s, (x1, x2) in zip(lamb, x): 88 | plt.annotate('λ=' + str(s[0].round(2)), (x1-0.05, x2 + 0.2)) 89 | 90 | plt.legend() 91 | plt.show() 92 | 93 | print("\nMargin = {:.4f}".format(half_margin * 2)) 94 | -------------------------------------------------------------------------------- /6.SVM/10.multiclass(OvR).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-08] 10.multiclass(OvR).py 2 | # Implement multiclass classification of SVM by One-Rest (OvR) 3 | # Since SVC operates as an OvO internally, we will use 4 | # OneVsRestClassifier. 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/ogFZchEqmTA 13 | # 14 | import numpy as np 15 | from sklearn.svm import SVC 16 | from sklearn.multiclass import OneVsRestClassifier 17 | import matplotlib.pyplot as plt 18 | from sklearn.datasets import make_blobs 19 | 20 | # Generate the data with 4 clusters. 
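# With 4 classes, the OneVsRestClassifier below trains 4 binary SVMs, whereas
# SVC's internal OvO scheme would train one SVM per class pair, C(4,2) = 6
# (a quick count for illustration):
from math import comb
print('OvR models:', 4, ' OvO models:', comb(4, 2))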
21 | x, y = make_blobs(n_samples=400, n_features=2, 22 | centers=[[0., 0.2], [0.5, 0.5], [1., -0.2], [0.3, -0.3]], 23 | cluster_std=0.15) 24 | 25 | # Linear SVM model 26 | C = 1.0 27 | model = OneVsRestClassifier(SVC(C=C, kernel='linear')) 28 | model.fit(x, y) 29 | 30 | print(model.estimators_) 31 | # [SVC(kernel='linear'), 32 | # SVC(kernel='linear'), 33 | # SVC(kernel='linear'), 34 | # SVC(kernel='linear')] 35 | 36 | w = np.array([m.coef_[0] for m in model.estimators_]) # (4,2) 37 | b = np.array([m.intercept_[0] for m in model.estimators_]) # (4,) 38 | 39 | # Visualize the data and 4 boundaries. 40 | plt.figure(figsize=(8,7)) 41 | colors = ['red', 'blue', 'green', 'black'] 42 | y_color= [colors[a] for a in y] 43 | for label in model.classes_: 44 | idx = np.where(y == label) 45 | plt.scatter(x[idx, 0], x[idx, 1], s=100, c=colors[label], 46 | alpha=0.5, label='class_' + str(label)) 47 | 48 | # Visualize 4 boundaries. 49 | x1_dec = np.linspace(-2.0, 2.0, 50).reshape(-1, 1) 50 | for i in range(w.shape[0]): 51 | x2_dec = -(w[i, 0] * x1_dec + b[i]) / w[i, 1] 52 | plt.plot(x1_dec, x2_dec, label=str(i)+'_rest') 53 | plt.xlim(-0.5, 1.5) 54 | plt.ylim(-0.7, 1.) 55 | plt.legend() 56 | plt.show() 57 | 58 | # Predict the classes of the test data. 59 | x_test = np.random.uniform(-1.5, 1.5, (2000, 2)) 60 | y_pred1 = model.predict(x_test) 61 | 62 | # To understand how OvR works, let's manually implement the 63 | # process of model.predict(x_test). df.shape = (2000, 4) 64 | df = np.dot(x_test, w.T) + b # decision function 65 | # df = model.decision_function(x_test) # same as above 66 | 67 | y_pred2 = df.argmax(axis=1) 68 | 69 | # Compare y_pred1 and y_pred2. 70 | if (y_pred1 != y_pred2).sum() == 0: 71 | print("# y_pred1 and y_pred2 are exactly the same.") 72 | else: 73 | print("# y_pred1 and y_pred2 are not the same.") 74 | 75 | # Visualize test data and y_pred1 76 | plt.figure(figsize=(8,7)) 77 | y_color= [colors[a] for a in y_pred1] 78 | for label in model.classes_: 79 | idx = np.where(y_pred1 == label) 80 | plt.scatter(x_test[idx, 0], x_test[idx, 1], 81 | s=100, 82 | c=colors[label], 83 | alpha=0.3, 84 | label='class_' + str(label)) 85 | 86 | plt.xlim(-1.5, 1.8) 87 | plt.ylim(-0.7, 1.) 88 | plt.show() 89 | 90 | # decision_function_shape = 'ovr' in SVC 91 | # model2 = SVC(C=C, kernel='linear', decision_function_shape='ovr') 92 | # model2.fit(x, y) 93 | 94 | # # w and b are generated by OvO method. 95 | # print("w:\n", model2.coef_) # (6,2) 96 | # print("b:\n", model2.intercept_) # (6,) 97 | 98 | # df2 = model2.decision_function(x_test) 99 | # y_pred3 = df2.argmax(axis=1) 100 | 101 | # # Visualize test data and y_pred3 102 | # plt.figure(figsize=(8,7)) 103 | # y_color= [colors[a] for a in y_pred3] 104 | # for label in model.classes_: 105 | # idx = np.where(y_pred3 == label) 106 | # plt.scatter(x_test[idx, 0], x_test[idx, 1], s=100, 107 | # c=colors[label], 108 | # alpha=0.3, label='class_' + str(label)) 109 | 110 | # plt.xlim(-1.5, 1.8) 111 | # plt.ylim(-0.7, 1.) 
112 | # plt.show() 113 | 114 | -------------------------------------------------------------------------------- /6.SVM/2.cvxopt(soft_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-04] 2.cvxopt(soft_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/LdOcJfJTcwU 10 | # 11 | import numpy as np 12 | from cvxopt import matrix as cvxopt_matrix 13 | from cvxopt import solvers as cvxopt_solvers 14 | import matplotlib.pyplot as plt 15 | 16 | # training data 17 | x = np.array([[0.2, 0.869], 18 | [0.687, 0.212], 19 | [0.822, 0.411], 20 | [0.738, 0.694], 21 | [0.176, 0.458], 22 | [0.306, 0.753], 23 | [0.936, 0.413], 24 | [0.215, 0.410], 25 | [0.612, 0.375], 26 | [0.784, 0.602], 27 | [0.612, 0.554], 28 | [0.357, 0.254], 29 | [0.204, 0.775], 30 | [0.512, 0.745], 31 | [0.498, 0.287], 32 | [0.251, 0.557], 33 | [0.502, 0.523], 34 | [0.119, 0.687], 35 | [0.495, 0.924], 36 | [0.612, 0.851]]) 37 | 38 | y = np.array([-1,1,1,1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,1,-1,1,1]) 39 | y = y.astype('float').reshape(-1, 1) 40 | 41 | C = 50.0 42 | N = x.shape[0] 43 | 44 | # Construct the matrices required for QP in standard form. 45 | H = np.outer(y, y) * np.dot(x, x.T) 46 | P = cvxopt_matrix(H) 47 | q = cvxopt_matrix(np.ones(N) * -1) 48 | A = cvxopt_matrix(y.reshape(1, -1)) 49 | b = cvxopt_matrix(np.zeros(1)) 50 | 51 | g = np.vstack([-np.eye(N), np.eye(N)]) 52 | G = cvxopt_matrix(g) 53 | 54 | h1 = np.hstack([np.zeros(N), np.ones(N) * C]) 55 | h = cvxopt_matrix(h1) 56 | 57 | # solver parameters 58 | cvxopt_solvers.options['abstol'] = 1e-10 59 | cvxopt_solvers.options['reltol'] = 1e-10 60 | cvxopt_solvers.options['feastol'] = 1e-10 61 | 62 | # Perform QP 63 | sol = cvxopt_solvers.qp(P, q, G, h, A, b) 64 | 65 | # the solution to the QP, λ 66 | lamb = np.array(sol['x']) 67 | 68 | # Calculate w using the lambda, which is the solution to QP. 69 | w = np.sum(lamb * y * x, axis=0) 70 | 71 | # Find support vectors 72 | sv_idx = np.where(lamb > 1e-5)[0] 73 | sv_lamb = lamb[sv_idx] 74 | sv_x = x[sv_idx] 75 | sv_y = y[sv_idx] 76 | 77 | sv_plus = sv_x[np.where(sv_y > 0)[0]] # '+1' samples 78 | sv_minus = sv_x[np.where(sv_y < 0)[0]] # '-1' samples 79 | 80 | # Calculate b using the support vectors and calculate the average. 
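# For comparison with the midpoint-style estimate below, a common textbook
# estimate (the averaging used in 1.cvxopt(hard_margin).py) takes the mean of
# y_s - w.x_s, here restricted to margin support vectors with 0 < lambda < C
# (a sketch, assuming at least one such support vector exists):
margin_idx = np.where((lamb > 1e-5) & (lamb < C - 1e-5))[0]
b_avg = np.mean(y[margin_idx].flatten() - np.dot(w, x[margin_idx].T))
print('b averaged over margin support vectors:', round(b_avg, 4))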
81 | b = -(np.max(np.dot(w, sv_plus.T)) + np.min(np.dot(w, sv_minus.T))) / 2.0 82 | 83 | # Visualize the data points 84 | plt.figure(figsize=(7,7)) 85 | color= ['red' if a == 1 else 'blue' for a in y] 86 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 87 | plt.xlim(0, 1) 88 | plt.ylim(0, 1) 89 | 90 | # Visualize the decision boundary 91 | x1_dec = np.linspace(0, 1, 50).reshape(-1, 1) 92 | x2_dec = -(w[0] / w[1]) * x1_dec - b / w[1] 93 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 94 | 95 | # display slack variables, slack variable = max(0, 1 - y(wx + b)) 96 | y_hat = np.dot(w, x.T) + b 97 | slack = np.maximum(0, 1 - y.flatten() * y_hat) 98 | for s, (x1, x2) in zip(slack, x): 99 | plt.annotate(str(s.round(2)), (x1-0.02, x2 + 0.03)) 100 | 101 | # Visualize the positive & negative boundary and support vectors 102 | w_norm = np.sqrt(np.sum(w ** 2)) 103 | w_unit = w / w_norm 104 | half_margin = 1 / w_norm 105 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 106 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 107 | 108 | plt.plot(upper[:, 0], upper[:, 1], '--', lw=1.0, label='positive boundary') 109 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 110 | 111 | plt.scatter(sv_x[:, 0], sv_x[:, 1], s=60, marker='o', c='white') 112 | plt.legend() 113 | plt.title('C = ' + str(C) + ', Σξ = ' + str(np.sum(slack).round(2))) 114 | plt.show() 115 | 116 | -------------------------------------------------------------------------------- /6.SVM/3.SVC(soft_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-04] 3.SVC(soft_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/LdOcJfJTcwU 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.svm import SVC 14 | 15 | # training data 16 | x = np.array([[0.2, 0.869], 17 | [0.687, 0.212], 18 | [0.822, 0.411], 19 | [0.738, 0.694], 20 | [0.176, 0.458], 21 | [0.306, 0.753], 22 | [0.936, 0.413], 23 | [0.215, 0.410], 24 | [0.612, 0.375], 25 | [0.784, 0.602], 26 | [0.612, 0.554], 27 | [0.357, 0.254], 28 | [0.204, 0.775], 29 | [0.512, 0.745], 30 | [0.498, 0.287], 31 | [0.251, 0.557], 32 | [0.502, 0.523], 33 | [0.119, 0.687], 34 | [0.495, 0.924], 35 | [0.612, 0.851]]) 36 | 37 | y = np.array([-1,1,1,1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,1,-1,1,1]) 38 | C = 50 39 | 40 | # Create SVC model and fit it the the training data 41 | model = SVC(C=C, kernel='linear') 42 | model.fit(x, y) 43 | 44 | # parameters 45 | w = model.coef_[0] 46 | b = model.intercept_[0] 47 | 48 | # Visualize the data points 49 | plt.figure(figsize=(7,7)) 50 | color= ['red' if a == 1 else 'blue' for a in y] 51 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 52 | plt.xlim(0, 1) 53 | plt.ylim(0, 1) 54 | 55 | # Visualize the decision boundary 56 | x1_dec = np.linspace(0, 1, 50).reshape(-1, 1) 57 | x2_dec = -(w[0] / w[1]) * x1_dec - b / w[1] 58 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 59 | 60 | # Visualize the positive & negative boundary 61 | w_norm = np.sqrt(np.sum(w ** 2)) 62 | w_unit = w / w_norm 63 | half_margin = 1 / w_norm 64 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 65 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 66 | 67 | plt.plot(upper[:, 0], upper[:, 
1], '--', lw=1.0, label='positive boundary') 68 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 69 | 70 | # display slack variables, slack variable = max(0, 1 - y(wx + b)) 71 | y_hat = np.dot(w, x.T) + b 72 | slack = np.maximum(0, 1 - y * y_hat) 73 | for s, (x1, x2) in zip(slack, x): 74 | plt.annotate(str(s.round(2)), (x1-0.02, x2 + 0.03)) 75 | 76 | # Visualize support vectors. 77 | sv = model.support_vectors_ 78 | plt.scatter(sv[:, 0], sv[:, 1], s=30, c='white') 79 | 80 | plt.title('C = ' + str(C) + ', Σξ = ' + str(np.sum(slack).round(2))) 81 | plt.legend() 82 | plt.show() 83 | 84 | -------------------------------------------------------------------------------- /6.SVM/4.linearSVC(soft_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-04] 4.linearSVC(soft_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/LdOcJfJTcwU 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.svm import LinearSVC 14 | 15 | # training data 16 | x = np.array([[0.2, 0.869], 17 | [0.687, 0.212], 18 | [0.822, 0.411], 19 | [0.738, 0.694], 20 | [0.176, 0.458], 21 | [0.306, 0.753], 22 | [0.936, 0.413], 23 | [0.215, 0.410], 24 | [0.612, 0.375], 25 | [0.784, 0.602], 26 | [0.612, 0.554], 27 | [0.357, 0.254], 28 | [0.204, 0.775], 29 | [0.512, 0.745], 30 | [0.498, 0.287], 31 | [0.251, 0.557], 32 | [0.502, 0.523], 33 | [0.119, 0.687], 34 | [0.495, 0.924], 35 | [0.612, 0.851]]) 36 | 37 | y = np.array([-1,1,1,1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,1,-1,1,1]) 38 | 39 | C = 50 40 | model = LinearSVC(penalty='l2', loss='hinge', C=C) 41 | # model = LinearSVC(penalty='l2', loss='squared_hinge', C=C) 42 | model.fit(x, y) 43 | 44 | # parameters 45 | w = model.coef_[0] 46 | b = model.intercept_[0] 47 | 48 | # Visualize the data points 49 | plt.figure(figsize=(7,7)) 50 | color= ['red' if a == 1 else 'blue' for a in y] 51 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 52 | plt.xlim(0, 1) 53 | plt.ylim(0, 1) 54 | 55 | # Visualize the decision boundary 56 | x1_dec = np.linspace(0, 1, 50).reshape(-1, 1) 57 | x2_dec = -(w[0] / w[1]) * x1_dec - b / w[1] 58 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 59 | 60 | # Visualize the positive & negative boundary 61 | w_norm = np.sqrt(np.sum(w ** 2)) 62 | w_unit = w / w_norm 63 | half_margin = 1 / w_norm 64 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 65 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 66 | 67 | plt.plot(upper[:, 0], upper[:, 1], '--', lw=1.0, label='positive boundary') 68 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 69 | 70 | # display slack variables, slack variable = max(0, 1 - y(wx + b)) 71 | y_hat = np.dot(w, x.T) + b 72 | slack = np.maximum(0, 1 - y * y_hat) 73 | for s, (x1, x2) in zip(slack, x): 74 | plt.annotate(str(s.round(2)), (x1-0.02, x2 + 0.03)) 75 | 76 | # Visualize support vectors. 
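# LinearSVC (liblinear) does not expose support_vectors_ the way SVC does, so
# the next line recovers the points lying on or between the two margin
# boundaries from |w.x + b| <= 1 (a quick count of them, for illustration):
print('points on or between the margin boundaries:', np.sum(np.abs(y_hat) <= 1.0))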
77 | sv = x[np.where(np.abs(y_hat) <= 1.0)[0]] 78 | plt.scatter(sv[:, 0], sv[:, 1], s=30, c='white') 79 | 80 | plt.title('C = ' + str(C) + ', Σξ = ' + str(np.sum(slack).round(2))) 81 | plt.legend() 82 | plt.show() 83 | 84 | # Hinge & squared hinge loss plot for [+] samples (y = +1) 85 | x_rand = np.random.rand(100, 2) 86 | y_rand = np.dot(w, x_rand.T) + b # y_hat for x_rand 87 | s_rand = np.maximum(0, 1 - y_rand) # slack variables for y_rand 88 | 89 | sort_idx = np.argsort(y_rand) 90 | y_rand = y_rand[sort_idx] 91 | s_rand = s_rand[sort_idx] 92 | 93 | plt.plot(y_rand, s_rand, c='blue', label='Hinge loss') 94 | plt.plot(y_rand, s_rand ** 2, c='red', label='Squared hinge loss') 95 | plt.legend() 96 | plt.axvline(x=0, lw=1) 97 | plt.axvline(x=1, lw=1) 98 | plt.xlabel('y_hat') 99 | plt.ylabel('ξ') 100 | plt.ylim(0, 4) 101 | plt.title('Hinge & squared hinge loss for (+) sample') 102 | plt.show() 103 | -------------------------------------------------------------------------------- /6.SVM/5.check_kernel.py: -------------------------------------------------------------------------------- 1 | # [MXML-6-05] 5.check_kernel.py 2 | # For arbitrary real data, if the eigenvalues of the kernel 3 | # matrix (K) are all non-negative, then K is positive semi-definite 4 | # (PSD) and is a valid kernel function. 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/NiuJihA05Ds 13 | # 14 | import numpy as np 15 | 16 | # random dataset (2-dims) 17 | x = np.random.rand(100, 2) 18 | n = x.shape[0] 19 | 20 | # kernel functions 21 | rbf_kernel = lambda a, b: np.exp(-np.linalg.norm(a - b)**2 / 2) 22 | pol_kernel = lambda a, b: (1 + np.dot(a, b)) ** 2 23 | sig_kernel = lambda a, b: np.tanh(3 * np.dot(a, b) + 5) 24 | cos_kernel = lambda a, b: np.cos(np.dot(a, b)) 25 | kernels = [rbf_kernel, pol_kernel, sig_kernel, cos_kernel] 26 | names = ['RBF', 'Polynomial', 'Sigmoid', 'Cos'] 27 | 28 | for kernel, name in zip(kernels, names): 29 | # Kernel matrix (Gram matrix). 30 | K = np.array([kernel(x[i], x[j]) 31 | for i in range(n) 32 | for j in range(n)]).reshape(n, n) 33 | 34 | # Find eigenvalues, eigenvectors 35 | w, v = np.linalg.eig(K) 36 | 37 | # The function defined above is a valid kernel if all 38 | # eigenvalues of K are non-negative. 39 | print('\nKernel : ' + name) 40 | print('max eigenvalue =', w.max().round(3)) 41 | print('min eigenvalue =', w.min().round(8)) 42 | 43 | if w.min().real > -1e-8: 44 | print('==> valid kernel') 45 | else: 46 | print('==> invalid kernel') 47 | 48 | -------------------------------------------------------------------------------- /6.SVM/6.cvxopt(kernel_trick).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-06] 6.cvxopt(kernel_trick).py 2 | # Implement nonlinear SVM using CVXOPT 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/-WVI6b19pag 11 | # 12 | import numpy as np 13 | from cvxopt import matrix as cvxopt_matrix 14 | from cvxopt import solvers as cvxopt_solvers 15 | import matplotlib.pyplot as plt 16 | 17 | # 4 data samples. 
2 ‘+’ samples, 2 ‘-’ samples 18 | x = np.array([[0., 1.], [1., 1.], [1., 0.], [0., 0.]]) 19 | y = np.array([[-1.], [1.], [-1.], [1.]]) 20 | 21 | # kernel function 22 | def kernel(a, b, p=3, r=0.5, type="rbf"): 23 | if type == "poly": 24 | return (1 + np.dot(a, b)) ** p 25 | else: 26 | return np.exp(-r * np.linalg.norm(a - b)**2) 27 | 28 | C = 1.0 # regularization constant 29 | N = x.shape[0] # the number of data points 30 | k_type = "poly" # kernel type: poly or rbf 31 | 32 | # Kernel matrix. k(xi, xj) = φ(xi)φ(xj). 33 | K = np.array([kernel(x[i], x[j], type=k_type) 34 | for i in range(N) 35 | for j in range(N)]).reshape(N, N) 36 | 37 | # Construct the matrices required for QP in standard form. 38 | H = np.outer(y, y) * K 39 | P = cvxopt_matrix(H) 40 | q = cvxopt_matrix(np.ones(N) * -1) 41 | A = cvxopt_matrix(y.reshape(1, -1)) 42 | b = cvxopt_matrix(np.zeros(1)) 43 | 44 | g = np.vstack([-np.eye(N), np.eye(N)]) 45 | G = cvxopt_matrix(g) 46 | 47 | h1 = np.hstack([np.zeros(N), np.ones(N) * C]) 48 | h = cvxopt_matrix(h1) 49 | 50 | # solver parameters 51 | cvxopt_solvers.options['abstol'] = 1e-10 52 | cvxopt_solvers.options['reltol'] = 1e-10 53 | cvxopt_solvers.options['feastol'] = 1e-10 54 | 55 | # Perform QP 56 | sol = cvxopt_solvers.qp(P, q, G, h, A, b) 57 | 58 | # the solution to the QP, λ 59 | lamb = np.array(sol['x']) 60 | 61 | # Find support vectors 62 | sv_i = np.where(lamb > 1e-5)[0] 63 | sv_m = lamb[sv_i] # lambda 64 | sv_x = x[sv_i] 65 | sv_y = y[sv_i] 66 | 67 | # Calculate b using the support vectors and calculate the average. 68 | def cal_wphi(cond): 69 | wphi = [] 70 | idx = np.where(cond)[0] 71 | for i in idx: 72 | wp = [sv_m[j] * sv_y[j] * kernel(sv_x[i], sv_x[j], type=k_type) \ 73 | for j in range(sv_x.shape[0])] 74 | wphi.append(np.sum(wp)) 75 | return wphi 76 | 77 | b = -(np.max(cal_wphi(sv_y > 0)) + np.min(cal_wphi(sv_y < 0))) / 2. 78 | 79 | # Predict the class of test data. 80 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 81 | n_test = x_test.shape[0] 82 | n_sv = sv_x.shape[0] 83 | ts_K = np.array([kernel(sv_x[i], x_test[j], type=k_type) 84 | for i in range(n_sv) 85 | for j in range(n_test)]).reshape(n_sv, n_test) 86 | 87 | # decision function 88 | y_hat = np.sum(sv_m * sv_y * ts_K, axis=0).reshape(-1, 1) + b 89 | y_pred = np.sign(y_hat) 90 | 91 | # Visualize test data and classes. 92 | plt.figure(figsize=(5,5)) 93 | test_c = ['red' if a == 1 else 'blue' for a in y_pred] 94 | sv_c = ['red' if a == 1 else 'blue' for a in sv_y] 95 | plt.scatter(x_test[:, 0], x_test[:, 1], s=30, c=test_c, alpha=0.3) 96 | plt.scatter(sv_x[:, 0], sv_x[:, 1], s=100, marker='D', c=sv_c, ec='black', lw=2) 97 | plt.axhline(y=0, lw=1) 98 | plt.axvline(x=0, lw=1) 99 | plt.show() 100 | 101 | -------------------------------------------------------------------------------- /6.SVM/7.SVC(kernel_trick).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-06] 7.SVC(kernel_trick).py 2 | # Implement nonlinear SVM using SVC. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/-WVI6b19pag 11 | # 12 | import numpy as np 13 | from sklearn.svm import SVC 14 | import matplotlib.pyplot as plt 15 | 16 | # 4 data samples. 
2 ‘+’ samples, 2 ‘-’ samples 17 | x = np.array([[0., 1.], [1., 1.], [1., 0.], [0., 0.]]) 18 | y = np.array([-1., 1., -1., 1.]) 19 | 20 | C = 1.0 21 | # model = SVC(C=C, kernel='rbf', gamma=0.5) 22 | model = SVC(C=C, kernel='poly', degree=3) 23 | model.fit(x, y) 24 | 25 | # Intercept (b) 26 | # w = model.coef_[0] 27 | # AttributeError: coef_ is only available when using a linear kernel 28 | b = model.intercept_[0] 29 | 30 | # Predict the class of test data. 31 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 32 | 33 | # decision function 34 | y_hat = model.decision_function(x_test) 35 | y_pred = np.sign(y_hat) 36 | # y_pred = model.predict(x_test) # It is the same as above. 37 | 38 | # Visualize test data and classes. 39 | plt.figure(figsize=(5,5)) 40 | test_c = ['red' if a == 1 else 'blue' for a in y_pred] 41 | plt.scatter(x_test[:, 0], x_test[:, 1], s=30, c=test_c, alpha=0.3) 42 | plt.scatter(x[:, 0], x[:, 1], s=100, marker='D', c='white', ec='black', lw=2) 43 | plt.axhline(y=0, lw=1) 44 | plt.axvline(x=0, lw=1) 45 | plt.show() 46 | 47 | -------------------------------------------------------------------------------- /6.SVM/9.multiclass(OvO).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-07] 9.multiclass(OvO).py 2 | # Implement multiclass classification of SVM by One-vs-One (OvO) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MAde_oEYB-g 11 | # 12 | import numpy as np 13 | from sklearn.svm import SVC 14 | import matplotlib.pyplot as plt 15 | from sklearn.datasets import make_blobs 16 | from itertools import combinations 17 | 18 | # Generate the data with 4 clusters. 19 | x, y = make_blobs(n_samples=400, n_features=2, 20 | centers=[[0., 0.], [0.5, 0.5], [1., 0.], [-0.8, 0.]], 21 | cluster_std=0.17) 22 | 23 | # Linear SVM 24 | C = 1.0 25 | model = SVC(C=C, kernel='linear', decision_function_shape='ovo') 26 | model.fit(x, y) 27 | 28 | w = model.coef_ 29 | b = model.intercept_ 30 | print("w:\n ", w.round(3)) # shape=(6,2) 31 | print("\nb:\n ", b.round(3)) # shape=(6,) 32 | 33 | # Visualize the data and six boundaries. 34 | plt.figure(figsize=(8,7)) 35 | colors = ['red', 'blue', 'green', 'black'] 36 | y_color= [colors[a] for a in y] 37 | for label in model.classes_: 38 | idx = np.where(y == label) 39 | plt.scatter(x[idx, 0], x[idx, 1], s=100, c=colors[label], 40 | alpha=0.5, label='class_' + str(label)) 41 | 42 | # Visualize six boundaries. 43 | comb = list(combinations(model.classes_, 2)) 44 | x1_dec = np.linspace(-2.0, 2.0, 50).reshape(-1, 1) 45 | for i in range(w.shape[0]): 46 | x2_dec = -(w[i, 0] * x1_dec + b[i]) / w[i, 1] 47 | plt.plot(x1_dec, x2_dec, label=str(comb[i])) 48 | plt.xlim(-1.5, 1.8) 49 | plt.ylim(-0.7, 1.) 50 | plt.legend() 51 | plt.show() 52 | 53 | # Predict the classes of the test data. 54 | x_test = np.random.uniform(-1.5, 1.5, (2000, 2)) 55 | y_pred1 = model.predict(x_test) 56 | 57 | # To understand how OvO works, let's manually implement the 58 | # process of model.predict(x_test). 
df.shape = (2000, 6) 59 | df = np.dot(x_test, w.T) + b # decision function 60 | # df = model.decision_function(x_test) # same as above 61 | 62 | classes = model.classes_ 63 | n_class = classes.shape[0] 64 | 65 | # Reference: https://stackoverflow.com/questions/20113206/scikit-learn-svc-decision-function-and-predict 66 | y_pred = [] 67 | for i in range(df.shape[0]): 68 | votes = np.zeros(n_class) 69 | for j in range(df.shape[1]): # the number of boundaries 70 | # if df(i, j) > 0, then class=i, else class=j 71 | if df[i][j] > 0: 72 | votes[comb[j][0]] += 1 73 | else: 74 | votes[comb[j][1]] += 1 75 | 76 | v = np.argmax(votes) # majority vote 77 | y_pred.append(classes[v]) 78 | y_pred2 = np.array(y_pred) 79 | 80 | # Compare the results of y_pred1 and y_pred2. 81 | if (y_pred1 != y_pred2).sum() == 0: 82 | print("# y_pred1 and y_pred2 are exactly the same.") 83 | else: 84 | print("# y_pred1 and y_pred2 are not the same.") 85 | 86 | # Visualize test data and y_pred1 87 | plt.figure(figsize=(8,7)) 88 | y_color= [colors[a] for a in y_pred1] 89 | for label in model.classes_: 90 | idx = np.where(y_pred1 == label) 91 | plt.scatter(x_test[idx, 0], x_test[idx, 1], s=100, c=colors[label], 92 | alpha=0.3, label='class_' + str(label)) 93 | 94 | plt.xlim(-1.5, 1.8) 95 | plt.ylim(-0.7, 1.) 96 | plt.show() 97 | 98 | -------------------------------------------------------------------------------- /7.KMeans/2.sklearn(kmeans).py: -------------------------------------------------------------------------------- 1 | # [MXML-7-02] 2.sklearn(kmeans).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/hToqjr5Kx4Q 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | from sklearn.cluster import KMeans 14 | import matplotlib.pyplot as plt 15 | 16 | # Generate training data 17 | x, y = make_blobs(n_samples=300, n_features=2, 18 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 19 | cluster_std=0.1, center_box=(-1., 1.)) 20 | 21 | K = 3 # the number of clusters 22 | M = 10 # the number of iterations 23 | L = 10 # the number of attempts to prevent local minimum problem. 24 | 25 | model = KMeans(n_clusters = K, # the number of clusters 26 | init='random', # randomly initialize centroids 27 | max_iter=M, # max iterations 28 | n_init = L) # Number of times the k-means algorithm 29 | # is run with different centroid seeds. 30 | 31 | model.fit(x) 32 | 33 | # Visualize training data and clusters color-coded. 34 | def plot_cluster(x, cluster, centroid): 35 | plt.figure(figsize=(5, 5)) 36 | color = [['red', 'blue', 'green'][a] for a in cluster] 37 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 38 | plt.scatter(centroid[:, 0], centroid[:, 1], s=500, c='white') 39 | plt.scatter(centroid[:, 0], centroid[:, 1], s=250, c='black') 40 | plt.scatter(centroid[:, 0], centroid[:, 1], s=80, c='yellow') 41 | plt.show() 42 | 43 | # Visualize the training result. 
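# model.labels_ holds the cluster index assigned to each training point,
# and model.cluster_centers_ holds the coordinates of the K centroids
# (an array of shape (K, 2) for this 2-feature dataset).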
44 | plot_cluster(x, model.labels_, model.cluster_centers_) 45 | 46 | # print the final error 47 | # Sum of squared distances of samples to their closest cluster center 48 | print('\nerror = {:.4f}'.format(model.inertia_)) 49 | 50 | -------------------------------------------------------------------------------- /7.KMeans/3.kmeans(plus).py: -------------------------------------------------------------------------------- 1 | # [MXML-7-03] 3.kmeans(plus).py 2 | import numpy as np 3 | import random as rd 4 | from sklearn.datasets import make_blobs 5 | import matplotlib.pyplot as plt 6 | 7 | # Generate training data points. 8 | x, y = make_blobs(n_samples=300, n_features=2, 9 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 10 | cluster_std=0.1, center_box=(-1., 1.)) 11 | 12 | N = x.shape[0] # the number of data points 13 | K = 3 # the number of data clusters 14 | M = 10 # the number of data iterations 15 | 16 | # Visualize the data points, x. 17 | def plot_data(x): 18 | plt.figure(figsize=(5, 5)) 19 | plt.scatter(x[:, 0], x[:, 1], s=30, c='black', alpha=0.5) 20 | plt.show() 21 | 22 | # Visualize training data points and clusters color-coded. 23 | def plot_cluster(x, cluster, centroid): 24 | plt.figure(figsize=(5, 5)) 25 | color = [['red', 'blue', 'green'][a] for a in cluster] 26 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 27 | plt.scatter(centroid[:, 0], centroid[:, 1], s=500, c='white') 28 | plt.scatter(centroid[:, 0], centroid[:, 1], s=250, c='black') 29 | plt.scatter(centroid[:, 0], centroid[:, 1], s=80, c='yellow') 30 | plt.show() 31 | 32 | plot_data(x) 33 | 34 | # Generate initial centroids using the K-Means++ algorithm. 35 | xp = x.copy() 36 | centroids = [] 37 | density = np.ones(xp.shape[0]) / N 38 | for c in range(K): 39 | # (1) Choose an initial centroid c(1) uniformly at random from X 40 | # (2) Choose the next centroid c(i), selecting c (i) = x' ∈ X with probability 41 | idx = rd.choices(np.arange(xp.shape[0]), weights=density, k=1)[0] 42 | centroids.append(xp[idx]) 43 | xp = np.delete(xp, idx, axis=0) 44 | 45 | # Create a distance matrix between data points xp and the centroids. 46 | # Please refer to the video [MXML-7-02] for how to create a distance matrix. 47 | x_exp = xp[np.newaxis, :, :] 48 | c_exp = np.array(centroids)[:, np.newaxis, :] 49 | dist = np.sqrt(np.sum(np.square(x_exp - c_exp), axis=2)) 50 | 51 | # Find the centroid closest to each data point. 52 | assign = np.argmin(dist, axis=0) 53 | 54 | # Calculate D(x) 55 | # let D(x) denote the shortest distance from a data point x to 56 | # the closest centroid we have already chosen 57 | Dx = np.sum(np.square(xp - np.array(centroids)[assign]), axis=1) 58 | 59 | # Create a probability density function to select the next centroid. 60 | density = Dx / np.sum(Dx) 61 | 62 | centroids = np.array(centroids) 63 | 64 | # Perform the K-Means algorithm using the centroids generated by K-Means++. 65 | error = [] 66 | for m in range(M): 67 | # Calculate the distances between the training data points and the centroids. 68 | x_exp = x[np.newaxis, :, :] 69 | c_exp = centroids[:, np.newaxis, :] 70 | dist = np.sqrt(np.sum(np.square(x_exp - c_exp), axis=2)) 71 | 72 | # Assign each data point to the nearest centroid. 73 | assign = np.argmin(dist, axis=0) # shape = (N,) 74 | 75 | # update centroids 76 | new_cent = [] 77 | err = 0 78 | for c in range(K): 79 | # Find the data points assigned to centroid c. 80 | idx = np.where(assign == c) 81 | x_idx = x[idx] 82 | 83 | # To measure clustering performance, calculate the error. 
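# The error accumulated below is the within-cluster sum of squared
# distances to the current centroids, i.e. the same quantity that
# scikit-learn reports as inertia_ in 2.sklearn(kmeans).py.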
84 | err += np.sum(np.sum(np.square(x_idx - centroids[c]), axis=1)) 85 | 86 | # Compute the average coordinates of the data points 87 | # assigned to this centroid. And use that as new centroid. 88 | new_cent.append(np.mean(x_idx, axis=0)) 89 | 90 | error.append(err) 91 | 92 | # Remove the if statement to see the centroids moving. 93 | if m == 0: 94 | plot_cluster(x, assign, centroids) 95 | 96 | # Update centroids 97 | centroids = np.array(new_cent) 98 | 99 | # Visualize the training result. 100 | plot_cluster(x, assign, centroids) 101 | 102 | # Visualize error history 103 | plt.plot(error, 'o-') 104 | plt.title('final error =' + str(np.round(error[-1], 2))) 105 | plt.show() 106 | 107 | # Check the cluster number for each data point. 108 | import pandas as pd 109 | df = pd.DataFrame({'x1': x[:,0], 'x2': x[:,1], 'cluster': assign}) 110 | print(df.head(10)) 111 | -------------------------------------------------------------------------------- /7.KMeans/4.sklearn(mnist).py: -------------------------------------------------------------------------------- 1 | # [MXML-7-03] 4.sklearn(mnist).py 2 | # MNIST clustering 3 | # This code can be found at github.com/meanxai/machine_learning. 4 | import numpy as np 5 | from sklearn.cluster import KMeans 6 | import matplotlib.pyplot as plt 7 | import pickle 8 | 9 | # from sklearn.datasets import fetch_openml 10 | # mnist = fetch_openml('mnist_784') 11 | # mnist.pkl is the saved mnist. 12 | with open('data/mnist.pkl', 'rb') as f: 13 | mnist = pickle.load(f) 14 | 15 | # Use only 10,000 data points and normalize them between 0 and 1 16 | x = np.array(mnist['data'][:10000]) / 255. 17 | 18 | # Cluster the data points into 10 groups using K-Means++. 19 | model = KMeans(n_clusters=10, 20 | init='k-means++', # default 21 | max_iter = 50, 22 | n_init = 5) 23 | 24 | model.fit(x) 25 | clust = model.predict(x) 26 | centroids = model.cluster_centers_ 27 | 28 | # Check out the images for each cluster. 29 | for k in np.unique(clust): 30 | # Find 10 images belonging to cluster k, and centroid image. 31 | idx = np.where(clust == k)[0] 32 | images = x[idx[:10]] 33 | centroid = centroids[k, :] 34 | 35 | # Find 10 images closest to each centroid image. 36 | # d = np.sqrt(np.sum((x[idx] - centroid)**2, axis=1)) 37 | # nearest = np.argsort(d)[:10] 38 | # images = x[idx[nearest]] 39 | 40 | 41 | # display the central image 42 | f = plt.figure(figsize=(8, 2)) 43 | image = centroid.reshape(28, 28) 44 | ax = f.add_subplot(1, 11, 1) 45 | ax.imshow(image, cmap=plt.cm.bone) 46 | ax.grid(False) 47 | ax.set_title("C") 48 | ax.xaxis.set_ticks([]) 49 | ax.yaxis.set_ticks([]) 50 | plt.tight_layout() 51 | 52 | # display 10 images belonging to the centroid 53 | for i in range(10): 54 | image = images[i].reshape(28,28) 55 | ax = f.add_subplot(1, 11, i + 2) 56 | ax.imshow(image, cmap=plt.cm.bone) 57 | ax.grid(False) 58 | ax.set_title(k) 59 | ax.xaxis.set_ticks([]) 60 | ax.yaxis.set_ticks([]) 61 | plt.tight_layout() 62 | -------------------------------------------------------------------------------- /8.RandomForest/1.RF(titanic).py: -------------------------------------------------------------------------------- 1 | # [MXML-8-02]: 1.RF(titanic).py 2 | # Implement Random Forest using MyDtreeClassifierRF. 
3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/9si5fELmtg0 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | from MyDTreeClassifierRF import MyDTreeClassifierRF 15 | from sklearn.model_selection import train_test_split 16 | 17 | # Read preprocessed Titanic data. 18 | df = pd.read_csv('data/titanic_clean.csv') 19 | 20 | # Survived Pclass Sex Age SibSp Parch Fare Embarked Title 21 | # 0 3 1 22.0 1 0 3.62 3 2 22 | # 1 1 0 38.0 1 0 35.64 0 3 23 | # 1 3 0 26.0 0 0 7.92 3 1 24 | # 1 1 0 35.0 1 0 26.55 3 3 25 | # 0 3 1 35.0 0 0 8.05 3 2 26 | 27 | y = np.array(df['Survived']) 28 | x = np.array(df.drop('Survived', axis=1)) 29 | x_train, x_test, y_train, y_test = train_test_split(x, y) 30 | 31 | n_estimators = 100 32 | n_features = round(np.sqrt(x.shape[1])) # the number of features for column sampling 33 | n_depth = 3 # max_depth of tree 34 | 35 | models = [] # base model list 36 | for i in range(n_estimators): 37 | # Create a tree for Random Forest 38 | model = MyDTreeClassifierRF(max_depth=n_depth, 39 | max_samples = x_train.shape[0], 40 | max_features=n_features) 41 | 42 | # train the tree. 43 | # subsampling by rows and columns is performed within the model 44 | model.fit(x_train, y_train) 45 | 46 | # save trained tree 47 | models.append(model) 48 | 49 | # prediction 50 | y_estimates = np.zeros(shape=(x_test.shape[0], n_estimators)) 51 | for i, model in enumerate(models): 52 | y_estimates[:, i] = model.predict(x_test) 53 | 54 | # synthesizing the estimation results 55 | y_prob = y_estimates.mean(axis=1) 56 | y_pred = (y_prob >= 0.5) * 1 57 | print('\nAccuracy = {:.4f}'.format((y_pred == y_test).mean())) 58 | 59 | models 60 | y_estimates.shape 61 | y_estimates 62 | y_estimates[0, :] 63 | (y_estimates[0, :] == 0.0).sum() 64 | (y_estimates[0, :] == 1.0).sum() 65 | y_prob[0] 66 | y_pred[0] 67 | -------------------------------------------------------------------------------- /8.RandomForest/2.RF(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-8-02]: 2.RF(sklearn).py 2 | # Implement Random Forest using scikit-learn. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/9si5fELmtg0 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.ensemble import RandomForestClassifier 16 | from sklearn.model_selection import train_test_split 17 | 18 | # Read preprocessed Titanic data. 
19 | df = pd.read_csv('data/titanic_clean.csv') 20 | y = np.array(df['Survived']) 21 | x = np.array(df.drop('Survived', axis=1)) 22 | x_train, x_test, y_train, y_test = train_test_split(x, y) 23 | 24 | n_estimators = 100 25 | n_depth = 3 # max_depth of tree 26 | 27 | # Implement Random Forest using DecisionTreeClassifier 28 | models = [] # base model list 29 | n = x_train.shape[0] # the number of train data points 30 | for i in range(n_estimators): 31 | # row subsampling 32 | i_row = np.random.choice(np.arange(0, n), n, replace=True) 33 | x_sample = x_train[i_row, :] 34 | y_sample = y_train[i_row] 35 | 36 | # Create a tree for Random Forest 37 | # Column subsampling for each split is performed within the model. 38 | model = DecisionTreeClassifier(max_depth=n_depth, 39 | max_features="sqrt") 40 | 41 | # train the tree 42 | model.fit(x_sample, y_sample) 43 | 44 | # save trained tree 45 | models.append(model) 46 | 47 | # prediction 48 | y_estimates = np.zeros(shape=(x_test.shape[0], n_estimators)) 49 | for i, model in enumerate(models): 50 | y_estimates[:, i] = model.predict(x_test) 51 | 52 | # synthesizing the estimation results 53 | y_prob = y_estimates.mean(axis=1) 54 | y_pred = (y_prob >= 0.5) * 1 55 | print('\nAccuracy1 = {:.4f}'.format((y_pred == y_test).mean())) 56 | 57 | # Implement Random Forest using RandomForestClassifier 58 | model = RandomForestClassifier(n_estimators=n_estimators, 59 | max_depth=n_depth, 60 | max_samples=n, # default 61 | max_features="sqrt") # default 62 | model.fit(x_train, y_train) 63 | y_pred = model.predict(x_test) 64 | print('\nAccuracy2 = {:.4f}'.format((y_pred == y_test).mean())) 65 | 66 | model.estimators_ 67 | # [DecisionTreeClassifier(max_depth=3, max_features='sqrt', 68 | # random_state=1090277217), 69 | # DecisionTreeClassifier(max_depth=3, max_features='sqrt', 70 | # random_state=1758239483), 71 | # DecisionTreeClassifier(max_depth=3, max_features='sqrt', 72 | # random_state=1420256802) 73 | # ... -------------------------------------------------------------------------------- /8.RandomForest/3.RF_OOB.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-03] 3.RF_OOB.py 2 | # Add Out-Of-Bag (OOB) score feature to 2.RF(titanic).py. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/DFh7BefJpfQ 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | from MyDTreeClassifierRF import MyDTreeClassifierRF 15 | from sklearn.model_selection import train_test_split 16 | import matplotlib.pyplot as plt 17 | 18 | # Read preprocessed Titanic data. 
19 | df = pd.read_csv('data/titanic_clean.csv') 20 | 21 | # Survived Pclass Sex Age SibSp Parch Fare Embarked Title 22 | # 0 3 1 22.0 1 0 3.62 3 2 23 | # 1 1 0 38.0 1 0 35.64 0 3 24 | # 1 3 0 26.0 0 0 7.92 3 1 25 | # 1 1 0 35.0 1 0 26.55 3 3 26 | # 0 3 1 35.0 0 0 8.05 3 2 27 | 28 | y = np.array(df['Survived']) 29 | x = np.array(df.drop('Survived', axis=1)) 30 | x_train, x_test, y_train, y_test = train_test_split(x, y) 31 | 32 | N = x_train.shape[0] # the number of train data points 33 | n_estimators = 50 34 | n_depth = 5 # max_depth of tree 35 | max_features = round(np.sqrt(x_train.shape[1])) # for column sub sampling 36 | 37 | # majority vote for iob_pred, or oob_pred 38 | # p = iob_pred or oob_pred 39 | def majority_vote(p): 40 | cnt_0 = (p == 0).sum(axis=1) 41 | cnt_1 = (p == 1).sum(axis=1) 42 | cnts = np.array([cnt_0, cnt_1]) # shape = (2, 668) 43 | return np.argmax(cnts, axis=0) 44 | 45 | models = [] # base model list 46 | iob_score = [] # Error rate measured with IOB 47 | oob_score = [] # Error rate measured with OOB 48 | 49 | # initialize IOB and OOB prediction map 50 | iob_pred = np.ones(shape=(N, n_estimators)) * -1 51 | oob_pred = np.ones(shape=(N, n_estimators)) * -1 52 | 53 | # Create n_estimators models 54 | for i in range(n_estimators): 55 | # Create a Decision Tree for Random Forest 56 | model = MyDTreeClassifierRF(max_depth=n_depth, 57 | max_samples = N, 58 | max_features = max_features) 59 | 60 | # train 61 | p1, p2 = model.fit(x_train, y_train) 62 | 63 | # save trained tree 64 | models.append(model) 65 | 66 | # Create IOB and OOB prediction map 67 | iob_pred[:, i] = p1 68 | oob_pred[:, i] = p2 69 | 70 | # Calculate IOB and OOB score 71 | y_trn = majority_vote(iob_pred) 72 | y_oob = majority_vote(oob_pred) 73 | 74 | iob_score.append((y_trn != y_train).mean()) 75 | oob_score.append((y_oob != y_train).mean()) 76 | 77 | # Visualize IOB and OOB score 78 | plt.figure(figsize=(6, 4)) 79 | plt.plot(iob_score, color='blue', lw=1.0, label='IOB error') 80 | plt.plot(oob_score, color='red', lw=1.0, label='OOB error') 81 | plt.legend() 82 | plt.xlabel('n_estimators') 83 | plt.ylabel('OOB error rate') 84 | plt.show() 85 | 86 | # prediction 87 | y_estimates = np.zeros(shape=(x_test.shape[0], n_estimators)) 88 | for i, model in enumerate(models): 89 | y_estimates[:, i] = model.predict(x_test) 90 | 91 | # synthesizing the estimation results 92 | y_prob = y_estimates.mean(axis=1) 93 | y_pred = (y_prob >= 0.5) * 1 94 | accuracy = (y_pred == y_test).mean() 95 | print('\nAccuracy of test data = {:.4f}'.format(accuracy)) 96 | print('Final OOB error rate = {:.4f}'.format(oob_score[-1])) 97 | 98 | # OOB probability 99 | # In theory, it would be 0.3679. 100 | # This means that x_train is selected with probability 0.6321 101 | # by row subsampling. 
(1.0 - 0.3679 = 0.6321) 102 | oob_percent = ((oob_pred >= 0).sum(axis=0) / N).mean() 103 | print('OOB probability = {:.4f}'.format(oob_percent)) 104 | -------------------------------------------------------------------------------- /8.RandomForest/6.RF_outlier.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-06] 6.RF_outlier.py 2 | # Outlier detection using Random Forest’s proximity matrix 3 | # Reference [2]: 4 | # https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#outliers 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/ps2QXPnPHVM 13 | # 14 | import numpy as np 15 | from sklearn.datasets import make_blobs 16 | from sklearn.ensemble import RandomForestClassifier 17 | import matplotlib.pyplot as plt 18 | 19 | # Generate training data 20 | x, y = make_blobs(n_samples=600, n_features=2, 21 | centers=[[0., 0.], [0.5, 0.5]], 22 | cluster_std=0.2, center_box=(-1., 1.)) 23 | 24 | # Create Proximity matrix 25 | # normalize = 0: pm / n_tree 26 | # normalize != 0: Normalize columns to sum to 1 27 | def proximity_matrix(model, x, normalize=0): 28 | n_tree = len(model.estimators_) 29 | 30 | # Apply trees in the forest to X, return leaf indices. 31 | leaf = model.apply(x) # shape = (x.shape[0], n_tree) 32 | 33 | pm = ( 34 | (leaf[:, None, :] == leaf[None, :, :]) 35 | .sum(axis=-1) 36 | ) 37 | # # the above is equivalent to: 38 | # pm = np.zeros(shape=(x.shape[0], x.shape[0])) 39 | # for i in range(n_tree): 40 | # t = leaf[:, i] 41 | # pm += np.equal.outer(t, t) * 1. 42 | 43 | np.fill_diagonal(pm, 0) 44 | if normalize == 0: 45 | return pm / n_tree 46 | else: 47 | return pm / pm.sum(axis=0, keepdims=True) 48 | 49 | n_estimators = 50 50 | n_depth = 5 51 | 52 | # Detect outliers using a proximity matrix 53 | model = RandomForestClassifier(n_estimators=n_estimators, 54 | max_depth=n_depth, 55 | max_features="sqrt", # default 56 | bootstrap=True, # default 57 | oob_score=True) 58 | model.fit(x, y) 59 | 60 | # Create a proximity matrix 61 | pm = proximity_matrix(model, x, normalize=0) 62 | 63 | i_y0 = np.where(y == 0)[0] 64 | i_y1 = np.where(y == 1)[0] 65 | i_y = [i_y0, i_y1] 66 | 67 | # 1) average proximity 68 | pi_bar = [] 69 | for i in range(pm.shape[0]): 70 | j_class = y[i] # the class of data instance i 71 | j_same = i_y[j_class] # Data point IDs with the same class as data point i 72 | pi_bar.append(np.sum(pm[i, j_same] ** 2)) 73 | 74 | # 2) raw outlier measure 75 | o_raw = x.shape[0] / np.array(pi_bar) 76 | 77 | # 3) final outlier measure 78 | # For convenience of coding, the mean value was used instead of the 79 | # median, and the standard deviation was used instead of the absolute 80 | # deviation. 81 | f_measure = [] 82 | for i in range(o_raw.shape[0]): 83 | j_class = y[i] # the class of the data instance i 84 | j_same = i_y[j_class] # Data point IDs with the same class as data point i 85 | f_measure.append((o_raw[i] - o_raw[j_same].mean()) / o_raw[j_same].std()) 86 | 87 | # Data in the upper top_rate percentage of f_measure are considered outliers. 88 | top_rate = 0.07 # top 7% 89 | top_idx = np.argsort(f_measure)[::-1][:int(top_rate * x.shape[0])] 90 | 91 | # Visualize normal data and outliers by color. 
92 | plt.figure(figsize=(7, 7)) 93 | color = [['blue', 'red'][i] for i in y] 94 | color_out = [['blue', 'red'][i] for i in y[top_idx]] 95 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 96 | plt.scatter(x[top_idx, 0], x[top_idx, 1], s=400, c='black', alpha=0.5) # outlier scatter 97 | plt.scatter(x[top_idx, 0], x[top_idx, 1], s=200, c='white') 98 | plt.scatter(x[top_idx, 0], x[top_idx, 1], s=30, c=color_out) 99 | plt.show() 100 | -------------------------------------------------------------------------------- /8.RandomForest/7.iForest_test.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-07] 7.iForest_test.py 2 | # Implementation of Isolation Forest using ExtraTreeRegressor 3 | # sklearn's IsolationForest library makes it easy to implement 4 | # Isolation Forest, but I used ExtraTreeRegressor to better understand 5 | # how it works. 6 | # 7 | # This code was used in the machine learning online 8 | # course provided by 9 | # www.youtube.com/@meanxai 10 | # www.github.com/meanxai/machine_learning 11 | # 12 | # A detailed description of this code can be found in 13 | # https://youtu.be/JpZJoOTjMWU 14 | # 15 | from sklearn.tree import ExtraTreeRegressor 16 | import numpy as np 17 | 18 | # simple dataset 19 | x = np.array([2, 2.5, 3.8, 4.1, 10.5, 15.4], dtype=np.float32).reshape(-1, 1) 20 | n = x.shape[0] # the number of data points 21 | n_trees = 10 # the number of trees in Isolation Forest 22 | 23 | # H(i) is the harmonic number and it can be estimated 24 | # by ln(i) + 0.5772156649 (Euler’s constant). 25 | def H(n): 26 | return np.log(n) + 0.5772156649 27 | 28 | # average path length of unsuccessful search in BST 29 | def C(n): 30 | return 2 * H(n-1) - (2 * (n-1) / n) 31 | 32 | hx = np.zeros(n) 33 | for t in range(n_trees): 34 | # Create a tree using random split points 35 | model = ExtraTreeRegressor(max_depth=3, max_features=1) 36 | 37 | # Fit the model to training data. 38 | # Since it is unsupervised learning and there is no target value, 39 | # a binary tree is created by randomly generating target values. 40 | model.fit(x, np.random.uniform(size=n)) 41 | 42 | leaf_id = model.apply(x) # indices of leaf nodes 43 | 44 | # depth of each node, internal and external nodes. 
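# compute_node_depths() (added in scikit-learn 1.3) returns the depth of
# every node with the root counted as depth 1, so subtracting 1.0 below
# gives h(x), the number of edges from the root to the leaf that
# isolates each data point.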
45 | node_depth = model.tree_.compute_node_depths() 46 | 47 | # h(x): accumulated path length of data points 48 | hx += node_depth[leaf_id] - 1.0 49 | 50 | print('Tree',t,':', (hx / (t+1)).round(1)) 51 | 52 | Ehx = hx / n_trees # Average of h(x) 53 | S = 2 ** (-(Ehx / C(n))) # Anomaly scores for each data point 54 | i_out = np.argsort(S)[-2:] # Top 2 anomaly scores 55 | outliers = x[i_out] # outliers 56 | 57 | print('\nAnomaly scores:') 58 | print(S.round(3)) 59 | print('\nOutliers:') 60 | print(outliers) 61 | 62 | # import matplotlib.pyplot as plt 63 | # from sklearn import tree 64 | 65 | # plt.figure(figsize=(12, 8)) 66 | # tree.plot_tree(model) 67 | # plt.show() 68 | -------------------------------------------------------------------------------- /8.RandomForest/8.iForest_outlier.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-07] 8.iForest_outlier.py 2 | # Outlier detection using Isolation Forest (iForest) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/JpZJoOTjMWU 11 | # 12 | import numpy as np 13 | from sklearn.datasets import make_blobs 14 | from sklearn.ensemble import IsolationForest 15 | import matplotlib.pyplot as plt 16 | 17 | # Create training dataset 18 | x, y = make_blobs(n_samples=600, n_features=2, 19 | centers=[[0., 0.], [0.5, 0.5]], 20 | cluster_std=0.2, center_box=(-1., 1.)) 21 | 22 | model = IsolationForest(n_estimators = 50, contamination=0.05) 23 | model.fit(x) 24 | outlier = model.predict(x) # Normal = 1, Outlier = -1 25 | 26 | # Extract outliers 27 | i_outlier = np.where(outlier == -1)[0] 28 | x_outlier = x[i_outlier, :] 29 | 30 | # Visualize normal data points and outliers by color. 
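# The ringed points drawn below are the samples flagged as -1 by predict();
# with contamination=0.05, the score threshold is chosen so that roughly 5%
# of the training points are labeled as outliers.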
31 | plt.figure(figsize=(7, 7)) 32 | color = [['blue', 'red'][i] for i in y] 33 | color_out = [['blue', 'red'][i] for i in y[i_outlier]] 34 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 35 | plt.scatter(x_outlier[:, 0], x_outlier[:, 1], s=400, c='black', alpha=0.5) # outlier scatter 36 | plt.scatter(x_outlier[:, 0], x_outlier[:, 1], s=200, c='white') 37 | plt.scatter(x_outlier[:, 0], x_outlier[:, 1], s=30, c=color_out) 38 | plt.show() 39 | 40 | # Check out the distribution of Anomaly score 41 | score = abs(model.score_samples(x)) 42 | score[i_outlier].min() 43 | plt.hist(score, bins = 50) 44 | plt.title('distribution of anomaly score') 45 | plt.xlabel('anomaly score') 46 | plt.ylabel('frequency') 47 | plt.axvline(x=score[i_outlier].min(), c='red') 48 | plt.show() 49 | 50 | 51 | -------------------------------------------------------------------------------- /9.AdaBoost/1.AdaBoost(binary1).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-01] 1.AdaBoost.py 2 | # [1] Yoav Freund et, al., 1999, A Short Introduction to Boosting 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/avs14cAFyHE 11 | # 12 | import numpy as np 13 | import random as rd 14 | from sklearn.datasets import make_blobs 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.model_selection import train_test_split 17 | import matplotlib.pyplot as plt 18 | 19 | # Create training data 20 | x, y = make_blobs(n_samples=200, n_features=2, 21 | centers=[[0., 0.], [0.5, 0.5]], 22 | cluster_std=0.2, center_box=(-1., 1.)) 23 | y = y * 2 - 1 # [0, 1] --> [-1, 1] 24 | 25 | m = x.shape[0] 26 | R = np.arange(m) 27 | T = 50 28 | 29 | # [1] Figure 1: The boosting algorithm AdaBoost 30 | # --------------------------------------------- 31 | # Given: (x1, y1), ..., (xm, ym) where xi ∈ X, yi ∈ Y = {-1, +1} 32 | # Initialize D1(i) = 1/m 33 | weights = [np.array(np.ones(shape=(m,)) / m)] 34 | eps = [] # epsilon history 35 | alphas = [] # alpha history 36 | models = [] # base learner models 37 | for t in range(T): 38 | # sampling according to the weights 39 | s_idx = np.array(rd.choices(R, weights=weights[-1], k=m)) 40 | sx = x[s_idx] # sample x 41 | sy = y[s_idx] # sample y 42 | 43 | # Train weak learner using distribution Dt. (Dt: weights) 44 | model = DecisionTreeClassifier(max_depth=2) 45 | model.fit(sx, sy) # fit the model to sample data 46 | 47 | # Get weak hypothesis ht : X -> {-1, +1} with error 48 | y_pred = model.predict(x) # predict entire training data 49 | i_not = np.array(y_pred != y).astype(int) # I(y_pred ≠ y) 50 | eps.append(np.sum(weights[-1] * i_not)) 51 | 52 | # Choose αt=(1/2)ln((1-εt)/εt). (α: alpha, ε: eps) 53 | # For αt to be positive, εt must be less than 0.5. 54 | # If εt is greater than 0.5, it means it is worse than a 55 | # random prediction. If so, initialize the weights to 1/m again. 56 | if eps[-1] > 0.5: 57 | weights.append(np.array(np.ones(shape=(m,)) / m)) 58 | alphas.append(0.0) 59 | print('weight re-initialized at t =', t) 60 | else: 61 | alpha = 0.5 * np.log((1 - eps[-1]) / (eps[-1] + 1e-8)) 62 | alphas.append(alpha) 63 | 64 | # Update Dt. 
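# D_(t+1)(i) = D_t(i) * exp(-alpha_t * y_i * h_t(x_i)) / Z_t, where Z_t is a
# normalization factor: the weight of a misclassified point (y_i * h_t(x_i) = -1)
# grows by a factor of exp(alpha_t), while the weight of a correctly
# classified point (y_i * h_t(x_i) = +1) shrinks by exp(-alpha_t).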
65 | new_weights = weights[-1] * np.exp(-alpha * y * y_pred) 66 | weights.append( new_weights / new_weights.sum()) # normalize 67 | 68 | models.append(model) 69 | 70 | # Output the final hypothesis: 71 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 72 | H = np.zeros(shape=x_test.shape[0]) 73 | for t in range(T): 74 | h = models[t].predict(x_test) 75 | H += alphas[t] * h 76 | 77 | y_pred = np.sign(H) 78 | 79 | # visualize training data and the sampling weights 80 | def plot_train(x, y, w): 81 | plt.figure(figsize=(5,5)) 82 | color = ['red' if a == 1 else 'blue' for a in y] 83 | plt.scatter(x[:, 0], x[:, 1], s=w*10000, c=color, alpha=0.5) 84 | plt.xlim(-0.5, 1.0) 85 | plt.ylim(-0.5, 1.0) 86 | plt.show() 87 | 88 | # visualize decision boundary 89 | def plot_boundary(x, y, x_test, y_pred): 90 | plt.figure(figsize=(5,5)) 91 | color = ['red' if a == 1 else 'blue' for a in y_pred] 92 | plt.scatter(x_test[:, 0], x_test[:, 1], s=100, c=color, alpha=0.3) 93 | plt.scatter(x[:, 0], x[:, 1], s=80, c='black') 94 | plt.scatter(x[:, 0], x[:, 1], s=10, c='yellow') 95 | plt.xlim(-0.5, 1.0) 96 | plt.ylim(-0.5, 1.0) 97 | plt.show() 98 | 99 | plot_train(x, y, w=np.array(np.ones(shape=(m,)) / m)) 100 | plot_train(x, y, w=weights[-1]) 101 | plot_boundary(x, y, x_test, y_pred) 102 | 103 | # Check the changes in α (alpha), ε (eps). 104 | # Check that ε are all less than 0.5 and that α and ε are inversely proportional. 105 | plt.plot(eps, marker='o', markersize=4, c='red', lw=1, label='epsilon') 106 | plt.plot(alphas, marker='o', markersize=4, c='blue', lw=1, label='alpha') 107 | plt.legend() 108 | plt.show() 109 | 110 | 111 | -------------------------------------------------------------------------------- /9.AdaBoost/2.AdaBoost(binary2).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-02] 2.AdaBoost(binary2).py 2 | # Using y = {0, 1} instead of y = {-1, +1} 3 | # 4 | # [1] Yoav Freund et, al., 1999, A Short Introduction to Boosting 5 | # [2] Ji Zhu, et, al., 2006, Multi-class AdaBoost 6 | # 7 | # This code was used in the machine learning online 8 | # course provided by 9 | # www.youtube.com/@meanxai 10 | # www.github.com/meanxai/machine_learning 11 | # 12 | # A detailed description of this code can be found in 13 | # https://youtu.be/LVStXzGpA7Y 14 | # 15 | import numpy as np 16 | import random as rd 17 | from sklearn.datasets import make_blobs 18 | from sklearn.tree import DecisionTreeClassifier 19 | from sklearn.model_selection import train_test_split 20 | import matplotlib.pyplot as plt 21 | 22 | # Create training data 23 | x, y = make_blobs(n_samples=200, n_features=2, 24 | centers=[[0., 0.], [0.5, 0.5]], 25 | cluster_std=0.2, center_box=(-1., 1.)) 26 | 27 | m = x.shape[0] 28 | R = np.arange(m) 29 | T = 50 30 | 31 | weights = [np.array(np.ones(shape=(m,)) / m)] 32 | eps = [] # epsilon history 33 | alphas = [] # alpha history 34 | models = [] # base learner models 35 | for t in range(T): 36 | s_idx = np.array(rd.choices(R, weights=weights[-1], k=m)) # weighted sample index 37 | sx = x[s_idx] # sample x 38 | sy = y[s_idx] # sample y 39 | 40 | model = DecisionTreeClassifier(max_depth=2) 41 | model.fit(sx, sy) # fit the model to sample data 42 | 43 | y_pred = model.predict(x) # predict entire training data 44 | i_not = np.array(y_pred != y).astype(int) # I(y_pred ≠ y) 45 | eps.append(np.sum(weights[-1] * i_not)) 46 | 47 | if eps[-1] > 0.5: 48 | weights.append(np.array(np.ones(shape=(m,)) / m)) 49 | alphas.append(0.0) 50 | print('weight re-initialized at t =', 
t) 51 | else: 52 | alpha = 0.5 * np.log((1 - eps[-1]) / (eps[-1] + 1e-8)) 53 | alphas.append(alpha) 54 | 55 | new_weights = weights[-1] * np.exp(alpha * i_not) 56 | weights.append(new_weights / new_weights.sum()) # normalize 57 | 58 | models.append(model) 59 | 60 | H = np.zeros(shape=(5, 2)) 61 | h = np.array([1,0,1,0,1]) 62 | 63 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 64 | H = np.zeros(shape=(x_test.shape[0], 2)) 65 | for t in range(T): 66 | h = models[t].predict(x_test) 67 | oh = np.eye(2)[h] # one-hot encoding 68 | H += alphas[t] * oh 69 | 70 | y_pred = np.argmax(H, axis=1) 71 | 72 | # visualize training data and the sampling weights 73 | def plot_train(x, y, w): 74 | plt.figure(figsize=(5,5)) 75 | color = ['red' if a == 1 else 'blue' for a in y] 76 | plt.scatter(x[:, 0], x[:, 1], s=w*10000, c=color, alpha=0.5) 77 | plt.xlim(-0.5, 1.0) 78 | plt.ylim(-0.5, 1.0) 79 | plt.show() 80 | 81 | # visualize decision boundary 82 | def plot_boundary(x, y, x_test, y_pred): 83 | plt.figure(figsize=(5,5)) 84 | color = ['red' if a == 1 else 'blue' for a in y_pred] 85 | plt.scatter(x_test[:, 0], x_test[:, 1], s=100, c=color, alpha=0.3) 86 | plt.scatter(x[:, 0], x[:, 1], s=80, c='black') 87 | plt.scatter(x[:, 0], x[:, 1], s=10, c='yellow') 88 | plt.xlim(-0.5, 1.0) 89 | plt.ylim(-0.5, 1.0) 90 | plt.show() 91 | 92 | plot_train(x, y, w=np.array(np.ones(shape=(m,)) / m)) 93 | plot_train(x, y, w=weights[-1]) 94 | plot_boundary(x, y, x_test, y_pred) 95 | 96 | # Check the changes in α (alpha), ε (eps). 97 | # Check that ε are all less than 0.5 and that α and ε are inversely proportional. 98 | plt.plot(eps, marker='o', markersize=4, c='red', lw=1, label='epsilon') 99 | plt.plot(alphas, marker='o', markersize=4, c='blue', lw=1, label='alpha') 100 | plt.legend() 101 | plt.show() 102 | 103 | 104 | -------------------------------------------------------------------------------- /9.AdaBoost/4.sklearn(AdaBoost).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-03] 4.sklearn(AdaBoost).py 2 | # Test sklearn's AdaBoostClassifier 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/tPeRalG7gYY 11 | # 12 | from sklearn.datasets import load_iris 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.ensemble import AdaBoostClassifier 16 | 17 | # Read iris dataset 18 | x, y = load_iris(return_X_y=True) 19 | 20 | # Create training and test data 21 | x_train, x_test, y_train, y_test = train_test_split(x, y) 22 | 23 | # Use the decision tree as the base weak learner. 24 | dt = DecisionTreeClassifier(max_depth = 1) 25 | 26 | # Generate a AdaBoost model with the SAMME algorithm. 
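# Note: in recent scikit-learn releases the 'SAMME.R' variant has been
# deprecated in favor of 'SAMME', and the 'algorithm' argument itself is
# being phased out, so the explicit algorithm='SAMME' below may emit a
# deprecation warning or become unnecessary depending on the installed version.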
27 | model = AdaBoostClassifier(estimator = dt, 28 | n_estimators = 100, 29 | algorithm = 'SAMME') # default = 'SAMME.R' 30 | 31 | # Fit the model to the training data 32 | model.fit(x_train, y_train) 33 | 34 | # Predict the class of test data and calculate the accuracy 35 | y_pred = model.predict(x_test) 36 | accuracy = (y_pred == y_test).mean() 37 | 38 | print('Accuracy = {:.4f}'.format(accuracy)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /9.AdaBoost/5.AdaBoost(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-04] 5.AdaBoost(regression).py 2 | # [1] Harris Drucker et, al., 1997, Improving Regressors using Boosting Techniques 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/nPzW-AmPSLs 11 | # 12 | import numpy as np 13 | import random as rd 14 | from sklearn.tree import DecisionTreeRegressor 15 | import matplotlib.pyplot as plt 16 | 17 | # Create training data 18 | def noisy_sine_data(n, s): 19 | rtn_x, rtn_y = [], [] 20 | for i in range(n): 21 | x = np.random.random() 22 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 23 | rtn_x.append(x) 24 | rtn_y.append(y) 25 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 26 | x, y = noisy_sine_data(n=500, s=0.5) 27 | 28 | N = x.shape[0] 29 | R = np.arange(N) 30 | T = 100 31 | 32 | weights = np.array(np.ones(shape=(N,)) / N) 33 | beta = [] # beta history 34 | models = [] # save base learners for prediction 35 | for t in range(T): 36 | s_idx = np.array(rd.choices(R, weights=weights, k=N)) 37 | sx = x[s_idx] # sample x 38 | sy = y[s_idx] # sample y 39 | 40 | # base learner 41 | model = DecisionTreeRegressor(max_depth=5) 42 | model.fit(sx, sy) # Fit the model to sample data 43 | 44 | # Calculate square loss 45 | y_pred = model.predict(x) # predict entire training data 46 | err = np.abs(y - y_pred) 47 | loss = (err / err.max()) ** 2 # squared loss 48 | 49 | loss_avg = np.sum(weights * loss) # average loss 50 | if loss_avg > 0.5: 51 | print('stopped at t={}, loss_avg={:.2f}'.format(t, loss_avg)) 52 | break 53 | 54 | # Calculate beta using average loss. 55 | beta.append(loss_avg / (1. - loss_avg)) 56 | 57 | # Update weights using beta. 58 | new_weights = weights * np.power(beta[-1], (1. - loss)) 59 | weights = new_weights / new_weights.sum() 60 | 61 | # save model 62 | models.append(model) 63 | 64 | # Visualize training data and estimated curve 65 | def plot_prediction(x, y, x_test, y_pred, title=""): 66 | plt.figure(figsize=(5, 3.5)) 67 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train') 68 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 69 | plt.xlim(0, 1) 70 | plt.ylim(0, 7) 71 | plt.legend() 72 | plt.title(title) 73 | plt.show() 74 | 75 | # prediction. 76 | n_test = 50 77 | x_test = np.linspace(0, 1, n_test).reshape(-1, 1) # test data 78 | log_beta = np.log(1. 
/ np.array(beta)) # log(1/beta) 79 | y_pred = np.array([m.predict(x_test) for m in models]).T 80 | 81 | # Method-1: Using weighted average 82 | w = log_beta/ log_beta.sum() # normalize 83 | wavg_pred = np.sum(y_pred * w, axis=1) 84 | plot_prediction(x, y, x_test, wavg_pred, 'weighted average') 85 | 86 | # weighted median: (sum of the lower w ≥ half of the total sum of w) 87 | i_pred = np.argsort(y_pred, axis=1) 88 | w_acc = np.cumsum(w[i_pred], axis=1) # accumulated w 89 | is_med = w_acc >= 0.5 * w_acc[:, -1][:, np.newaxis] 90 | i_med = is_med.argmax(axis=1) # 23 91 | y_med = i_pred[np.arange(n_test), i_med] # 34 92 | wmed_pred = np.array(y_pred[np.arange(n_test), y_med]) # final estimate 93 | plot_prediction(x, y, x_test, wmed_pred, 'weighted median') 94 | 95 | # Let’s compare the results with sklearn’s AdaBoostRegressor 96 | from sklearn.ensemble import AdaBoostRegressor 97 | 98 | dt = DecisionTreeRegressor(max_depth=5) 99 | model = AdaBoostRegressor(estimator=dt, n_estimators=T, loss='square') 100 | model.fit(x, y) 101 | sk_pred = model.predict(x_test) 102 | plot_prediction(x, y, x_test, sk_pred, 'AdaBoostRegressor') --------------------------------------------------------------------------------