├── 1.KNN ├── 1.KNN(distance).py ├── 10.WKNN(regression).py ├── 11.KNN(boston).py ├── 2.KNN(optimal_k).py ├── 3.maxmin_ratio.py ├── 4.KNN(sklearn).py ├── 5.WKNN.py ├── 6.WKNN(iris).py ├── 7.KNN(Jaccard).py ├── 8.KNN(IOF).py └── 9.KNN(regression).py ├── 10.GBM ├── 1.GBM(regression).py ├── 2.SGBM(regression).py ├── 3.GBM(classification).py ├── 4.SGBM(classification).py └── 5.GBM(multi-classification).py ├── 11.xGBoost ├── 1.XGBoost(regression).py ├── 2.XGBoost(classification).py ├── 3.appoximation(1).py ├── 4.appoximation(2).py ├── 5.santander.py ├── MyXGBoostClassifier.py ├── MyXGBoostRegressor.py └── data │ └── santander.zip ├── 12.LGBM ├── 1.histogram_based.py ├── 2.goss.py ├── 3.greedy_bundling.py ├── 4.merge_features.py ├── 5.efb_onehot.py ├── 6.santander.py └── data │ └── santander.zip ├── 2.DecisionTree ├── 1.ID3(titanic_part).py ├── 2.CART(classification).py ├── 3.CART(titanic_part).py ├── 4.CART_CCP(titanic).py ├── 5.CART(multiclass).py ├── 6.CART(regression).py ├── MyDTreeClassifier.py ├── MyDTreeRegressor.py └── data │ ├── titanic.csv │ └── titanic_clean.csv ├── 3.LinearRegression ├── 1.scipy_opt(ols).py ├── 10.ransac(2).py ├── 11.boston(ransac).py ├── 2.boston(ols).py ├── 3.boston(sklearn).py ├── 4.scipy_opt(tls).py ├── 5.boston(tls).py ├── 6.lwr(scipy).py ├── 7.lwr(sklearn).py ├── 8.boston(lwr).py ├── 9.ransac(1).py └── data │ ├── boston_house.pkl │ └── wls_sample_data.csv ├── 4.LogisticRegression ├── 1.bin_class(scipy).py ├── 10.lwlr(sklearn).py ├── 11.lwlr_2(sklearn).py ├── 2.bin_class(sklearn).py ├── 3.bin_class(scipy_cancer).py ├── 4.bin_class(sklearn_cancer).py ├── 5.multiclass(ovr_1).py ├── 6.multiclass(ovr_2).py ├── 7.multiclass(softmax_scipy).py ├── 8.multiclass(softmax_sklearn).py └── 9.lwlr(scipy).py ├── 5.Convex ├── 1.plot_convex.py ├── 2.EQP.py ├── 3.IQP_1.py ├── 4.IQP_2.py ├── 5.QP.py └── 6.LP.py ├── 6.SVM ├── 1.cvxopt(hard_margin).py ├── 10.multiclass(OvR).py ├── 11.cvxopt(svr_linear).py ├── 12.cvxopt(svr_nonlinear).py ├── 2.cvxopt(soft_margin).py ├── 3.SVC(soft_margin).py ├── 4.linearSVC(soft_margin).py ├── 5.check_kernel.py ├── 6.cvxopt(kernel_trick).py ├── 7.SVC(kernel_trick).py ├── 8.Kernel(titanic).py ├── 9.multiclass(OvO).py └── data │ └── titanic.csv ├── 7.KMeans ├── 1.kmeans(basic).py ├── 2.sklearn(kmeans).py ├── 3.kmeans(plus).py └── 4.sklearn(mnist).py ├── 8.RandomForest ├── 1.RF(titanic).py ├── 2.RF(sklearn).py ├── 3.RF_OOB.py ├── 4.RF_OOB(sklearn).py ├── 5.RF_proximity.py ├── 6.RF_outlier.py ├── 7.iForest_test.py ├── 8.iForest_outlier.py ├── MyDTreeClassifierRF.py └── data │ ├── titanic.csv │ ├── titanic_clean.csv │ └── titanic_clean1.csv ├── 9.AdaBoost ├── 1.AdaBoost(binary1).py ├── 2.AdaBoost(binary2).py ├── 3.AdaBoost(multiclass).py ├── 4.sklearn(AdaBoost).py └── 5.AdaBoost(regression).py └── README.md /1.KNN/1.KNN(distance).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-01] 1.KNN(distance).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/EVEzkS5It0I 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.datasets import make_blobs 14 | from sklearn.model_selection import train_test_split 15 | 16 | # create dataset 17 | x, y = make_blobs(n_samples=300, n_features=2, 18 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 19 | cluster_std=0.15, center_box=(-1., 1.)) 20 | 21 | 
# Visualize the dataset and class by color 22 | plt.figure(figsize=(5, 5)) 23 | for i, color in enumerate(['red', 'blue', 'green']): 24 | p = x[y==i] 25 | plt.scatter(p[:, 0], p[:, 1], s=50, c=color, 26 | label='y=' + str(i), alpha=0.5) 27 | plt.legend() 28 | plt.show() 29 | 30 | # split dataset into train and test data 31 | x_train, x_test, y_train, y_test = train_test_split(x, y) 32 | K = 10 # the number of nearest neighbors 33 | 34 | # 1. Calculate the distance between test and train data. 35 | d_train = x_train[np.newaxis, :, :] # expand D0 axis 36 | d_test = x_test[:, np.newaxis, :] # expand D1 axis 37 | distance = np.sqrt(np.sum((d_train - d_test) ** 2, axis=2)) 38 | 39 | # 2. Find K nearest neighbors 40 | i_near = np.argsort(distance, axis=1)[:, :K] 41 | y_near = y_train[i_near] 42 | 43 | # 3. majority voting 44 | y_pred = np.array([np.bincount(p).argmax() for p in y_near]) 45 | 46 | # Measure the accuracy for test data 47 | print('Accuracy = {:.4f}'.format((y_pred == y_test).mean())) 48 | -------------------------------------------------------------------------------- /1.KNN/10.WKNN(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-07] 10.WKNN(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/_ZxTTvbZOtc 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.neighbors import KNeighborsRegressor 14 | 15 | # Generate training and test data 16 | n_train = 1000 # the number of training data points 17 | n_test = 100 # the number of test data points 18 | x_train = np.random.random(n_train).reshape(-1, 1) 19 | y_train = 2.0 * np.sin(2.0 * np.pi * x_train)\ 20 | + np.random.normal(0.0, 0.5, size=(n_train,1))+3. 21 | y_train = y_train.reshape(-1) 22 | x_test = np.linspace(x_train.min(), x_train.max(), n_test)\ 23 | .reshape(-1, 1) 24 | 25 | # Generate the distance matrix between x_test and x_train 26 | d_train = x_train[np.newaxis, :, :] 27 | d_test = x_test[:, np.newaxis, :] 28 | dist= np.abs(d_train - d_test).reshape(n_test, n_train) + 1e-8 29 | 30 | # Find K nearest neighbors 31 | K = 200 32 | i_near = np.argsort(dist, axis=1)[:, :K] # (100, 200) 33 | y_near = y_train[i_near] # (100, 200) 34 | 35 | # Compute the weights to apply to the neighbors 36 | w_dist = np.array([dist[i, :][i_near[i, :]] \ 37 | for i in range(x_test.shape[0])]) 38 | w_inv = 1. / w_dist 39 | 40 | # Predict the y values ​​of the test data by weighted average method 41 | y_pred1 = (y_near * w_inv).sum(axis=1) / w_inv.sum(axis=1) 42 | 43 | # Plot the training and test data points with their predicted 44 | # y values ​​(y_pred) 45 | def plot_prediction(y_pred): 46 | plt.figure(figsize=(6,4)) 47 | plt.scatter(x_train, y_train, c='blue', s=20, alpha=0.5, label='train data') 48 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 49 | plt.xlim(0, 1) 50 | plt.ylim(0, 7) 51 | plt.legend() 52 | plt.show() 53 | 54 | plot_prediction(y_pred1) 55 | 56 | # Predict the y values ​​of the test data using KNeighborsRegressor 57 | # from scikit-learn. 
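#
# (Added check, a minimal sketch not in the original script) Re-derive the
# weighted-average prediction for one test point to make the formula explicit,
#   y_hat(x0) = sum_k(w_k * y_k) / sum_k(w_k),  with w_k = 1 / d(x0, x_k).
# The index i0 below is an arbitrary choice made only for this check.
i0 = 0
manual = np.sum(y_near[i0] * w_inv[i0]) / np.sum(w_inv[i0])
assert np.isclose(manual, y_pred1[i0])
# weights='distance' in the sklearn model below applies the same
# inverse-distance weighting.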
58 | knn = KNeighborsRegressor(n_neighbors=K, weights='distance') 59 | knn.fit(x_train, y_train) 60 | y_pred2 = knn.predict(x_test) 61 | plot_prediction(y_pred2) 62 | -------------------------------------------------------------------------------- /1.KNN/11.KNN(boston).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-07] 11.KNN(boston).py 2 | # Predict the house prices in Boston using KNN 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/_ZxTTvbZOtc 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.neighbors import KNeighborsRegressor 15 | from sklearn.model_selection import train_test_split 16 | import pickle 17 | 18 | # Read Boston house price dataset 19 | with open('data/boston_house.pkl', 'rb') as f: 20 | data = pickle.load(f) 21 | x = data['data'] # shape = (506, 13) 22 | y = data['target'] # shape = (506,) 23 | x_train, x_test, y_train, y_test = train_test_split(x, y) 24 | 25 | # Z-score Normalization 26 | x_mu = x_train.mean(axis=0) 27 | x_sd = x_train.std(axis=0) 28 | y_mu = y_train.mean() 29 | y_sd = y_train.std() 30 | zx_train = (x_train - x_mu) / x_sd 31 | zy_train = (y_train - y_mu) / y_sd 32 | zx_test = (x_test - x_mu) / x_sd 33 | zy_test = (y_test - y_mu) / y_sd 34 | 35 | # Visually check the actual and predicted prices 36 | def plot_predictions(y_true, y_pred): 37 | plt.figure(figsize=(5, 4)) 38 | plt.scatter(y_true, y_pred, s=20, c='r') 39 | plt.xlabel('y_true') 40 | plt.ylabel('y_pred') 41 | plt.show() 42 | 43 | # Simple average method 44 | model1 = KNeighborsRegressor(n_neighbors = 10) 45 | model1.fit(zx_train, zy_train) 46 | y_pred1 = model1.predict(zx_test) * y_sd + y_mu 47 | plot_predictions(y_test, y_pred1) 48 | print('KNN R2 = {:.3f}'.format(model1.score(zx_test, zy_test))) 49 | 50 | # Weighted average method 51 | model2 = KNeighborsRegressor(n_neighbors = 30, weights='distance') 52 | model2.fit(zx_train, zy_train) 53 | y_pred2 = model2.predict(zx_test) * y_sd + y_mu 54 | plot_predictions(y_test, y_pred2) 55 | print('WKNN R2 = {:.3f}'.format(model2.score(zx_test, zy_test))) 56 | 57 | a=np.array([.751, .671, .797, .802, .737, .789, .771, .735, .736, .668]) 58 | a=np.array([.741, .669, .757, .764, .657, .703, .718, .747, .682, .647]) 59 | a.mean() 60 | -------------------------------------------------------------------------------- /1.KNN/2.KNN(optimal_k).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-02] 2.KNN(optimal_k).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/tIKsjeyaVnc 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | import matplotlib.pyplot as plt 14 | from sklearn.model_selection import train_test_split 15 | 16 | # create dataset 17 | x, y = make_blobs(n_samples=900, n_features=2, 18 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 19 | cluster_std=0.2, center_box=(-1., 1.)) 20 | 21 | # Visualize the dataset and classes by color 22 | plt.figure(figsize=(5, 5)) 23 | for i, color in enumerate(['red', 'blue', 'green']): 24 | p = x[y==i] 25 | plt.scatter(p[:, 0], p[:, 1], s=20, c=color, 26 | 
label='y=' + str(i), alpha=0.5) 27 | plt.legend() 28 | plt.show() 29 | 30 | # Split the dataset into training and test data 31 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 32 | N = x_train.shape[0] 33 | 34 | # Z-score Normalization. 35 | # The values ​​in this data set have similar scales, 36 | # so there is no need to normalize them. But let's try this 37 | # just for practice. 38 | 39 | # Calculate the mean and standard deviation from the training data 40 | # and apply them to the test data. 41 | mean = x_train.mean(axis=0) 42 | std = x_train.std(axis=0) 43 | z_train = (x_train - mean) / std 44 | z_test = (x_test - mean) / std 45 | 46 | # A function for performing the KNN classification algorithm. 47 | def knn_predict(train, test, k): 48 | # 1. Create a distance matrix. 49 | d_train = train[np.newaxis, :, :] # Add a new axis at D0 50 | d_test = test[:, np.newaxis, :] # Add a new axis at D1 51 | 52 | p = 2 # Euclidean distance 53 | d = np.sum(np.abs(d_train - d_test) ** p, axis=-1) ** (1/p) 54 | 55 | # 2. Find K nearest neighbors 56 | i_nearest = np.argsort(d, axis=1)[:, :k] # index 57 | y_nearest = y_train[i_nearest] 58 | 59 | # 3. majority voting 60 | return np.array([np.bincount(i).argmax() for i in y_nearest]) 61 | 62 | # Measure the accuracy of the test data while changing K value. 63 | accuracy = [] 64 | k_vals = np.arange(1, 700, 10) 65 | for k in k_vals: 66 | # Estimate the classes of all test data points and measure the accuracy. 67 | y_pred = knn_predict(z_train, z_test, k) 68 | accuracy.append((y_pred == y_test).mean()) 69 | 70 | # Observe how the accuracy changes as K changes. 71 | plt.figure(figsize=(5, 3)) 72 | plt.plot(k_vals, accuracy, '-') 73 | plt.axvline(x=np.sqrt(N), c='r', ls='--') 74 | plt.ylim(0.5, 1) 75 | plt.show() 76 | 77 | # Generate a large number of test data points and roughly determine 78 | # the decision boundary. 79 | # x_many = np.random.uniform(-0.5, 1.5, (1000, 2)) 80 | x_many = np.random.uniform(-0.5, 1.5, (1000, 2)) 81 | z_many = (x_many - mean) / std 82 | y_many = knn_predict(z_train, z_many, k=int(np.sqrt(N))) 83 | 84 | # Check the decision boundary 85 | plt.figure(figsize=(5,5)) 86 | color = [['red', 'blue', 'green'][a] for a in y_many] 87 | plt.scatter(x_many[:, 0], x_many[:, 1], s=100, c=color, alpha=0.3) 88 | plt.scatter(x_train[:, 0], x_train[:, 1], s=80, c='black') 89 | plt.scatter(x_train[:, 0], x_train[:, 1], s=10, c='yellow') 90 | plt.xlim(-0.5, 1.0) 91 | plt.ylim(-0.5, 1.0) 92 | plt.show() 93 | 94 | -------------------------------------------------------------------------------- /1.KNN/3.maxmin_ratio.py: -------------------------------------------------------------------------------- 1 | # [MXML-01-03] 3.maxmin_ratio.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/qZ_6UAVnNMw 10 | # 11 | import numpy as np 12 | from sklearn.datasets import fetch_openml 13 | from sklearn.decomposition import PCA 14 | 15 | # Load the MNIST dataset 16 | mnist = fetch_openml('mnist_784', parser='auto') 17 | x = np.array(mnist['data']) / 255 18 | 19 | # Compute the distances between a single data point and all other 20 | # data points in a given data set. 21 | def distance(data): 22 | # Randomly choose a single data point from the dataset. 
23 | i = np.random.randint(0, data.shape[0]) 24 | tp = data[i] 25 | 26 | # Remove the chosen data point from the dataset. 27 | xp = np.delete(data, i, axis=0) 28 | 29 | # Compute the distances between tp and xp 30 | d = np.sqrt(np.sum((xp - tp) ** 2, axis=-1)) 31 | 32 | # Return the minimum distance and maximum distance 33 | return d.min(), d.max() 34 | 35 | # Compute the average ratio of minimum to maximum distances 36 | # in a 784-dimensional feature space 37 | r_maxmin = [] 38 | for i in range(10): 39 | dmin, dmax = distance(x) 40 | r_maxmin.append(dmax / dmin) 41 | print("max-min ratio (p=784): {0:.2f}".format(np.mean(r_maxmin))) 42 | 43 | # Compute the average ratio of minimum to maximum distances 44 | # in a 5-dimensional feature space 45 | pca = PCA(n_components=5) 46 | pca.fit(x) 47 | x_pca = pca.transform(x) 48 | 49 | r_maxmin = [] 50 | for i in range(10): 51 | dmin, dmax = distance(x_pca) 52 | r_maxmin.append(dmax / dmin) 53 | print("max-min ratio (p=5) : {0:.2f}".format(np.mean(r_maxmin))) 54 | -------------------------------------------------------------------------------- /1.KNN/4.KNN(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-01-03] 4.KNN(sklearn).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/qZ_6UAVnNMw 10 | # 11 | import numpy as np 12 | from sklearn.datasets import load_iris 13 | from sklearn.neighbors import KNeighborsClassifier 14 | from sklearn.model_selection import train_test_split 15 | import matplotlib.pyplot as plt 16 | 17 | # Load the Iris dataset. 18 | # x: data, the number of samples=150, the number of features=4 19 | # y: target data with class (0,1,2) 20 | x, y = load_iris(return_X_y=True) 21 | 22 | # Split the dataset to training, validation and test data 23 | x_train, x_test, y_train, y_test=train_test_split(x, y, \ 24 | test_size = 0.4) 25 | x_val, x_test, y_val, y_test=train_test_split(x_test, y_test,\ 26 | test_size = 0.5) 27 | # Z-score normalization 28 | mean = x_train.mean(axis=0) 29 | std = x_train.std(axis=0) 30 | 31 | x_train = (x_train - mean) / std # Z-score normalization 32 | x_val = (x_val - mean) / std # use mean and std from x_train 33 | x_test = (x_test - mean) / std # use mean and std from x_train 34 | 35 | # Set K to sqrt(N) 36 | sqr_k = int(np.sqrt(x_train.shape[0])) 37 | 38 | # Build a KNN classification model 39 | knn = KNeighborsClassifier(n_neighbors=sqr_k, metric='minkowski', p=2) 40 | 41 | # Model fitting. Since KNN is a lazy learner, no learning is performed 42 | # at this step. It simply stores the training data points and the 43 | # parameters. 44 | knn.fit(x_train, y_train) 45 | 46 | # Predict the class of validation data. 47 | # The actual learning takes place at this stage, when test or 48 | # validation data is provided. 49 | y_pred = knn.predict(x_val) 50 | 51 | # Measure the accuracy on the validation data 52 | accuracy = (y_val == y_pred).mean() 53 | print('\nK: sqr_K = {}, Accuracy on validation data = {:.3f}'\ 54 | .format(sqr_k, accuracy)) 55 | 56 | # Determine the optimal K. 57 | # Measure the accuracy on the validation data while changing K. 
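#
# (Added aside, a hedged sketch not in the original lesson) An equivalent way
# to search for K is scikit-learn's GridSearchCV with cross-validation; the
# cv=5 setting and the 2..19 range below are assumptions made for this sketch.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid={'n_neighbors': list(range(2, 20))},
                    cv=5)
grid.fit(x_train, y_train)
print('GridSearchCV best K =', grid.best_params_['n_neighbors'])
#
# The manual loop below does the same search against a single validation set.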
58 | accuracy = [] 59 | for k in range(2, 20): 60 | knn = KNeighborsClassifier(n_neighbors = k) 61 | knn.fit(x_train, y_train) 62 | y_pred = knn.predict(x_val) 63 | accuracy.append((y_val == y_pred).mean()) 64 | 65 | # Find the optimal K value with the highest accuracy. 66 | opt_k = np.array(accuracy).argmax() + 2 67 | 68 | # Observe how the accuracy changes as K changes. 69 | plt.plot(np.arange(2, 20), accuracy, marker='o') 70 | plt.xticks(np.arange(2, 20)) 71 | plt.axvline(x = opt_k, c='blue', ls = '--') 72 | plt.axvline(x = sqr_k, c='red', ls = '--') 73 | plt.ylim(0.8, 1.1) 74 | plt.title('optimal K = ' + str(opt_k)) 75 | plt.show() 76 | 77 | # Finally, we use the test data to measure the final performance 78 | # of the model. 79 | knn = KNeighborsClassifier(n_neighbors = opt_k) 80 | knn.fit(x_train, y_train) 81 | y_pred = knn.predict(x_test) 82 | accuracy = (y_test == y_pred).mean() 83 | print('\nK: opt_k = {}, Accuracy on test data = {:.3f}' 84 | .format(opt_k, accuracy)) 85 | -------------------------------------------------------------------------------- /1.KNN/5.WKNN.py: -------------------------------------------------------------------------------- 1 | # [MXML-1-04] 5.WKNN.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Lu6GAc4FYz8 10 | # 11 | import numpy as np 12 | 13 | # Let's assume that the distance matrix between the test data and 14 | # the training data is given as follows. shape = (5, 10) 15 | dist = np.array( 16 | # train: 0 1 2 3 4 5 6 7 8 9 test 17 | [[5. , 3.5, 4.3, 3.4, 1.4, 6.5, 2.7, 5.1, 2.9, 2.8], # i=0 18 | [4.4, 1.9, 3.6, 3.3, 0.5, 5.5, 2.1, 4.4, 1.3, 2.3], # i=1 19 | [4.6, 1. , 3.9, 4.4, 3. , 4.7, 3.2, 4.4, 1.4, 3.5], # i=2 20 | [4.7, 0.6, 3.9, 4.1, 1.7, 5.3, 2.7, 4.6, 0.4, 3. ], # i=3 21 | [3. , 3.6, 2.4, 1.4, 2.4, 4.8, 1.2, 3.2, 3. , 1.1]]) # i=4 22 | 23 | # target class y (0 1 2 3 4 5 6 7 8 9) 24 | y_train = np.array([0, 1, 1, 0, 1, 0, 1, 1, 0, 0]) 25 | C = [0, 1] # the class y is either 0 or 1 26 | K = 7 # 7-nearest neighbors 27 | T = 5 # the number of test data points 28 | 29 | # Find K nearest neighbors 30 | i_near = np.argsort(dist, axis=1)[:, :K] 31 | y_near = y_train[i_near] 32 | 33 | # Compute the inverse distance 34 | w_dist = np.array([dist[i, :][i_near[i, :]] for i in range(T)]) 35 | w_inv = 1. 
/ w_dist 36 | 37 | # Predict the class of test data using the inverse weighted distance 38 | y_pred = [] 39 | for i in range(T): 40 | iw_dist = [w_inv[i][y_near[i] == j].sum() for j in C] 41 | y_pred.append(np.argmax(iw_dist / w_inv[i].sum())) 42 | 43 | print(y_pred) 44 | -------------------------------------------------------------------------------- /1.KNN/6.WKNN(iris).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-04] 6.WKNN(iris).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Lu6GAc4FYz8 10 | # 11 | import numpy as np 12 | from sklearn.datasets import load_iris 13 | from sklearn.model_selection import train_test_split 14 | 15 | # Load Iris dataset 16 | x, y = load_iris(return_X_y=True) 17 | 18 | # Split the dataset to training and test data 19 | x_train, x_test, y_train, y_test = train_test_split(x, y) 20 | N = x_train.shape[0] # the number of training data points 21 | T = x_test.shape[0] # the number of test data points 22 | C = np.unique(y) # categories of y: [0, 1, 2] 23 | K = int(np.sqrt(N)) # appropriate K value 24 | 25 | # Z-score Normalization. 26 | mean = x_train.mean(axis=0); std = x_train.std(axis=0) 27 | z_train = (x_train - mean) / std 28 | z_test = (x_test - mean) / std 29 | 30 | # Predict the class of test data. 31 | # 1. Compute the distance matrix between test and train data. 32 | d_train = z_train[np.newaxis, :, :] 33 | d_test = z_test[:, np.newaxis, :] 34 | dist = np.sqrt(np.sum((d_train - d_test) ** 2, axis=2)) 35 | dist += 1e-8 # To prevent the distance from becoming 0 36 | 37 | # 2. Find K nearest neighbors. 38 | i_near = np.argsort(dist, axis=1)[:, :K] 39 | y_near = y_train[i_near] 40 | 41 | # 3. Compute the inverse distance 42 | w_inv = 1. / np.array([dist[i, :][i_near[i, :]] for i in range(T)]) 43 | 44 | # 4. Predict the class of the test data using the weights of the 45 | # inverse distance 46 | y_pred1 = [] 47 | for i in range(T): 48 | iw_dist = [w_inv[i][y_near[i] == j].sum() for j in C] 49 | y_pred1.append(np.argmax(iw_dist / w_inv[i].sum())) 50 | y_pred1 = np.array(y_pred1) 51 | 52 | # Measure the accuracy on the test data. 53 | accuracy = (y_test == y_pred1).mean() 54 | print('\nAccuracy on test data = {:.3f}'.format(accuracy)) 55 | 56 | # Compare with the results of sklearn's KNeighborsClassifier. 57 | from sklearn.neighbors import KNeighborsClassifier 58 | 59 | # 'distance': weight points by the inverse of their distance. 60 | # in this case, closer neighbors of a query point will have 61 | # a greater influence than neighbors which are further away. 
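#
# (Added check, a minimal sketch not in the original script) The weighted vote
# above can be re-derived for the first test point: each class receives the
# sum of the inverse distances of its neighbors, and the largest total wins.
scores0 = [w_inv[0][y_near[0] == c].sum() for c in C]
assert np.argmax(scores0) == y_pred1[0]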
62 | knn = KNeighborsClassifier(n_neighbors=K, weights='distance') 63 | knn.fit(z_train, y_train) 64 | y_pred2 = knn.predict(z_test) 65 | accuracy = (y_test == y_pred2).mean() 66 | print('Accuracy on test data (sklearn) = {:.3f}'.format(accuracy)) 67 | 68 | print('from scratch: y_pred1\n', y_pred1) 69 | print('from sklearn: y_pred2\n', y_pred2) 70 | 71 | (y_pred1 != y_pred2).sum() 72 | -------------------------------------------------------------------------------- /1.KNN/7.KNN(Jaccard).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-05] 7.KNN(Jaccard).py 2 | # KNN classification on categorical data 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/dDJwm25-_l8 11 | # 12 | import numpy as np 13 | from sklearn.preprocessing import OneHotEncoder, LabelEncoder 14 | from sklearn.metrics import jaccard_score 15 | 16 | # Golf play dataset 17 | # data source: 18 | # https://www.kaggle.com/datasets/priy998/golf-play-dataset 19 | # columns = [outlook, temperature, humidity, windy, play] 20 | data = np.array( 21 | [['sunny', 'hot', 'high', False, 'no'], 22 | ['sunny', 'hot', 'high', True, 'no'], 23 | ['overcast', 'hot', 'high', False, 'yes'], 24 | ['rainy', 'mild', 'high', False, 'yes'], 25 | ['rainy', 'cool', 'normal', False, 'yes'], 26 | ['rainy', 'cool', 'normal', True, 'no'], 27 | ['overcast', 'cool', 'normal', True, 'yes'], 28 | ['sunny', 'mild', 'high', False, 'no'], 29 | ['sunny', 'cool', 'normal', False, 'yes'], 30 | ['rainy', 'mild', 'normal', False, 'yes'], 31 | ['sunny', 'mild', 'normal', True, 'yes'], 32 | ['overcast', 'mild', 'high', True, 'yes'], 33 | ['overcast', 'hot', 'normal', False, 'yes'], 34 | ['rainy', 'mild', 'high', True, 'no'], 35 | ['sunny', 'mild', 'high', True, 'no']]) 36 | 37 | # x: one-hot encoded or label encoded features 38 | # y: target, k: the number of nearest neighbors 39 | # average: 'binary' or 'macro' 40 | def predict(x, y, k, average): 41 | match = [] 42 | for t in range(x.shape[0]): 43 | x_test = x[t] 44 | y_test = y[t] 45 | x_train = np.delete(x, t, axis=0) 46 | y_train = np.delete(y, t, axis=0) 47 | 48 | # Compute the Jaccard similarity between a test data point 49 | # and all training data points. 50 | similarities = [] 51 | for i in range(x_train.shape[0]): 52 | J = jaccard_score(x_train[i], x_test, 53 | average=average, zero_division=0.0) 54 | similarities.append(J) 55 | 56 | # Find the k nearest neighbors of the test data point. 57 | j = np.argsort(similarities)[::-1][:k] 58 | 59 | # Predict the class of the test data point by majority vote 60 | y_pred = np.bincount(y_train[j]).argmax() 61 | 62 | # Store whether y_pred and y_test match or not. 
63 | match.append(y_pred == y_test) 64 | 65 | print("True class: {}, Predicted class: {}, is match: {}"\ 66 | .format(y_test, y_pred, match[-1])) 67 | return np.mean(match) # return the accuracy 68 | 69 | # One-hot encoding 70 | ohe = OneHotEncoder().fit_transform(data).toarray().astype('int') 71 | x = ohe[:, :-2] # one-hot encoded features 72 | y = ohe[:, -1] # target 73 | K = 5 # 5 nearest neighbors 74 | 75 | print("\n* One-hot encoding:") 76 | acc = predict(x, y, K, average='binary') 77 | print("Accuracy: {:.3f}".format(acc)) 78 | 79 | # Label encoding 80 | le = [] 81 | for i in range(data.shape[1]): 82 | le.append(LabelEncoder().fit_transform(data[:, i])) 83 | le = np.array(le).T 84 | 85 | x = le[:, :-1] # label encoded features 86 | y = le[:, -1] # target 87 | 88 | print("\n* Label encoding:") 89 | acc = predict(x, y, K, average='macro') 90 | print("Accuracy: {:.3f}".format(acc)) 91 | 92 | 93 | -------------------------------------------------------------------------------- /1.KNN/9.KNN(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-1-07] 9.KNN(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/_ZxTTvbZOtc 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.neighbors import KNeighborsRegressor 14 | 15 | # Generate training and test data 16 | n_train = 1000 # the number of training data points 17 | n_test = 100 # the number of test data points 18 | x_train = np.random.random(n_train).reshape(-1, 1) 19 | y_train = 2.0 * np.sin(2.0 * np.pi * x_train)\ 20 | + np.random.normal(0.0, 0.5, size=(n_train,1))+3. 21 | y_train = y_train.reshape(-1) 22 | x_test = np.linspace(x_train.min(), x_train.max(), n_test)\ 23 | .reshape(-1, 1) 24 | 25 | # Generate the distance matrix between x_test and x_train 26 | d_train = x_train[np.newaxis, :, :] 27 | d_test = x_test[:, np.newaxis, :] 28 | dist= np.abs(d_train - d_test).reshape(n_test, n_train) 29 | 30 | # Find K nearest neighbors 31 | K = 20 32 | i_near = np.argsort(dist, axis=1)[:, :K] # (100, 20) 33 | y_near = y_train[i_near] # (100, 20) 34 | 35 | # Predict the y values ​​of the test data by simple average method 36 | y_pred1 = y_near.mean(axis=1) 37 | 38 | # Plot the training and test data points with their predicted 39 | # y values ​​(y_pred1) 40 | def plot_prediction(y_pred): 41 | plt.figure(figsize=(6,4)) 42 | plt.scatter(x_train, y_train, c='blue', s=20, alpha=0.5, label='train data') 43 | plt.plot(x_test, y_pred, c='red', lw=3.0, label='prediction') 44 | plt.xlim(0, 1) 45 | plt.ylim(0, 7) 46 | plt.legend() 47 | plt.show() 48 | 49 | # Predict the y-values ​​of the test data using the simple 50 | # average method. 51 | plot_prediction(y_pred1) 52 | 53 | # Predict the y values ​​of the test data using scikit-learn's KNeighborsRegressor 54 | knn = KNeighborsRegressor(n_neighbors=K) 55 | knn.fit(x_train, y_train) 56 | y_pred2 = knn.predict(x_test) 57 | plot_prediction(y_pred2) 58 | 59 | -------------------------------------------------------------------------------- /10.GBM/1.GBM(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-10-03] 1.GBM(regression).py 2 | # Implementation of GBM algorithm using DecisionTreeRegressor. 
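#
# (Added summary, hedged) The training loop below follows the usual gradient
# boosting recipe for squared-error loss:
#   F_0(x)  = mean(y)
#   r_m     = y - F_{m-1}(x)                  (pseudo-residuals)
#   gamma_m = regression tree fitted to r_m
#   F_m(x)  = F_{m-1}(x) + alpha * gamma_m(x)
# so each new tree corrects the residual error left by the previous ensemble.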
3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/hF-1HHKPxq4 11 | # 12 | import numpy as np 13 | from sklearn.tree import DecisionTreeRegressor 14 | import matplotlib.pyplot as plt 15 | 16 | # Create training data for regression 17 | def nonlinear_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.random() 21 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | 25 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 26 | 27 | # Create training data 28 | x, y = nonlinear_data(n=500, s=0.5) 29 | 30 | n_depth = 3 # tree depth 31 | n_tree = 50 # the number of trees (M) 32 | alpha = 0.05 # learning rate 33 | 34 | # step-1: Initialize model with a constant value. 35 | F0 = y.mean() 36 | 37 | # Training 38 | Fm = F0 39 | models = [] 40 | loss = [] 41 | for m in range(n_tree): 42 | # step-2 (A): Compute so-called pseudo-residuals 43 | residual = y - Fm 44 | 45 | # step-2 (B): Fit a regression tree to the residual 46 | gb_model = DecisionTreeRegressor(max_depth=n_depth) 47 | gb_model.fit(x, residual) 48 | 49 | # step-2 (C): compute gamma (prediction) 50 | gamma = gb_model.predict(x) 51 | 52 | # step-2 (D): Update the model 53 | Fm = Fm + alpha * gamma 54 | 55 | # Store trained tree models 56 | models.append(gb_model) 57 | 58 | # Calculate loss. loss = mean squared error. 59 | loss.append(((y - Fm) ** 2).sum()) 60 | 61 | # step-3: Output Fm(x) – Prediction of test data 62 | y_pred = F0 63 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 64 | for model in models: 65 | y_pred += alpha * model.predict(x_test) 66 | 67 | # Check the loss history 68 | plt.figure(figsize=(6,4)) 69 | plt.plot(loss, c='red') 70 | plt.xlabel('m : iteration') 71 | plt.ylabel('loss: mean squared error') 72 | plt.title('loss history') 73 | plt.show() 74 | 75 | # Visualize the training data and prediction results 76 | def plot_prediction(x, y, x_test, y_pred, title): 77 | plt.figure(figsize=(6,4)) 78 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 79 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 80 | plt.xlim(0, 1) 81 | plt.ylim(0, 7) 82 | plt.legend() 83 | plt.title(title) 84 | plt.show() 85 | 86 | plot_prediction(x, y, x_test, y_pred, 'From scratch') 87 | 88 | # Compare with the results of sklearn’s GradientBoostingRegressor 89 | from sklearn.ensemble import GradientBoostingRegressor 90 | 91 | sk_model = GradientBoostingRegressor(n_estimators=n_tree, 92 | learning_rate=alpha, 93 | max_depth=n_depth) 94 | 95 | sk_model.fit(x, y) # training 96 | y_pred = sk_model.predict(x_test) # prediction 97 | 98 | # Visualize the training data and prediction results 99 | plot_prediction(x, y, x_test, y_pred, 'GradientBoostingRegressor') 100 | 101 | sk_model.estimators_ 102 | -------------------------------------------------------------------------------- /10.GBM/2.SGBM(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-10-03] 2.SGBM(regression).py 2 | # Stochastic Gradient Boosting Method (1999, Friedman) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # 
https://youtu.be/hF-1HHKPxq4 11 | # 12 | import numpy as np 13 | from sklearn.tree import DecisionTreeRegressor 14 | import matplotlib.pyplot as plt 15 | 16 | # Create training data for regression 17 | def nonlinear_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.random() 21 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | 25 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 26 | 27 | # Visualize the training data and prediction results 28 | def plot_prediction(x, y, x_test, y_pred): 29 | plt.figure(figsize=(6,4)) 30 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 31 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 32 | plt.xlim(0, 1) 33 | plt.ylim(0, 7) 34 | plt.legend() 35 | plt.show() 36 | 37 | # Create training data 38 | x, y = nonlinear_data(n=500, s=0.5) 39 | 40 | n_data = x.shape[0] 41 | n_depth = 3 # tree depth (weak learner) 42 | n_tree = 50 # the number of trees (M) 43 | f_rate = 0.5 # rate of sampling 44 | lr = 0.05 # learning rate 45 | 46 | # step-1: Initialize model with a constant value. 47 | F0 = y.mean() 48 | 49 | # Training 50 | Fm = np.repeat(F0, n_data) 51 | models = [] 52 | loss = [] 53 | for m in range(n_tree): 54 | # data sampling without replacement 55 | si = np.random.choice(range(n_data), int(n_data * f_rate), replace=False) 56 | 57 | # step-2 (A): Compute so-called pseudo-residuals 58 | residual = y[si] - Fm[si] 59 | 60 | # step-2 (B): Fit a regression tree to the residual 61 | gb_model = DecisionTreeRegressor(max_depth=n_depth) 62 | gb_model.fit(x[si], residual) 63 | 64 | # step-2 (C): compute gamma (prediction) 65 | gamma = gb_model.predict(x) 66 | 67 | # step-2 (D): Update the model 68 | Fm = Fm + lr * gamma 69 | 70 | # Store trained tree models 71 | models.append(gb_model) 72 | 73 | # Calculate loss. loss = mean squared error. 
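    # (Added note) Strictly, the expression below accumulates the sum of
    # squared errors; dividing by len(y) would give the mean squared error
    # named in the comment above. The shape of the loss curve is the same
    # either way.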
74 | loss.append(((y - Fm) ** 2).sum()) 75 | 76 | # Check the loss history 77 | plt.figure(figsize=(6,4)) 78 | plt.plot(loss, c='red') 79 | plt.xlabel('m : iteration') 80 | plt.ylabel('loss: mean squared error') 81 | plt.title('loss history') 82 | plt.show() 83 | 84 | # step-3: Output Fm(x) - Prediction 85 | y_pred = F0 86 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 87 | for model in models: 88 | y_pred += lr * model.predict(x_test) 89 | 90 | # Visualize the training data and prediction results 91 | plot_prediction(x, y, x_test, y_pred) 92 | 93 | # Compare with the results of sklearn’s GradientBoostingRegressor 94 | from sklearn.ensemble import GradientBoostingRegressor 95 | sk_model = GradientBoostingRegressor(n_estimators=n_tree, 96 | learning_rate=lr, 97 | max_depth=n_depth, 98 | subsample=f_rate) 99 | 100 | sk_model.fit(x, y) # Training 101 | y_pred = sk_model.predict(x_test) # Prediction 102 | 103 | plot_prediction(x, y, x_test, y_pred) -------------------------------------------------------------------------------- /11.xGBoost/1.XGBoost(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-03] 1.XGBoost(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Ms_xxQFrTWc 10 | # 11 | import numpy as np 12 | from MyXGBoostRegressor import MyXGBRegressor 13 | import matplotlib.pyplot as plt 14 | 15 | # Plot the training data and estimated curve 16 | def plot_prediction(x, y, x_test, y_pred): 17 | plt.figure(figsize=(7, 5)) 18 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 19 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 20 | plt.xlim(0, 1) 21 | plt.ylim(0, 7) 22 | plt.legend() 23 | plt.show() 24 | 25 | # Generate the training data 26 | def nonlinear_data(n, s): 27 | rtn_x, rtn_y = [], [] 28 | for i in range(n): 29 | x = np.random.random() 30 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 31 | rtn_x.append(x) 32 | rtn_y.append(y) 33 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 34 | x, y = nonlinear_data(n=500, s=0.5) 35 | 36 | y_mean = y.mean() # initial prediction 37 | n_depth = 3 # tree depth 38 | n_tree = 20 # the number of trees 39 | eta = 0.3 # learning rate 40 | reg_lambda = 1.0 # regularization constant 41 | prune_gamma = 2.0 # pruning constant 42 | 43 | my_model = MyXGBRegressor(n_estimators=n_tree, 44 | max_depth=n_depth, 45 | learning_rate=eta, 46 | prune_gamma=prune_gamma, 47 | reg_lambda=reg_lambda, 48 | base_score = y_mean) 49 | loss = my_model.fit(x, y) 50 | 51 | # Check the loss history 52 | plt.figure(figsize=(5,4)) 53 | plt.plot(loss, c='red') 54 | plt.xlabel('m : iteration') 55 | plt.ylabel('loss: mean squared error') 56 | plt.title('loss history') 57 | plt.show() 58 | 59 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 60 | y_pred = my_model.predict(x_test) 61 | 62 | # Plot the training data and estimated curve 63 | plot_prediction(x, y, x_test, y_pred) 64 | 65 | # XGBRegressor 결과와 비교한다. 
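# (Translation of the Korean comment above: "Compare with the results of
# XGBRegressor.") The block below fits xgboost's own XGBRegressor with the
# same hyperparameters (n_estimators, max_depth, learning_rate, gamma,
# reg_lambda, base_score) so the two prediction curves can be compared
# directly.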
66 | # https://xgboost.readthedocs.io/en/stable/python/python_api.html 67 | # #module-xgboost.sklearn 68 | # --------------------------------------------------------------- 69 | from xgboost import XGBRegressor 70 | 71 | xg_model = XGBRegressor(n_estimators=n_tree, 72 | max_depth=n_depth, 73 | learning_rate=eta, 74 | gamma=prune_gamma, 75 | reg_lambda=reg_lambda, 76 | base_score = y_mean) 77 | xg_model.fit(x, y) 78 | y_pred = xg_model.predict(x_test) # predict the test data 79 | 80 | # Plot the training data and estimated curve 81 | plot_prediction(x, y, x_test, y_pred) 82 | 83 | -------------------------------------------------------------------------------- /11.xGBoost/2.XGBoost(classification).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-06] 2.XGBoost(classification).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/oKLBon15bTc 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | from MyXGBoostClassifier import MyXGBClassifier 14 | import matplotlib.pyplot as plt 15 | 16 | # Plot the training and test data, and the prediction result 17 | def plot_prediction(x, y, x_test, y_pred): 18 | plt.figure(figsize=(5,5)) 19 | color = ['red' if a == 1 else 'blue' for a in y_pred] 20 | plt.scatter(x_test[:, 0], x_test[:, 1], s=100, c=color, 21 | alpha=0.3) 22 | plt.scatter(x[:, 0], x[:, 1], s=80, c='black') 23 | plt.scatter(x[:, 0], x[:, 1], s=10, c='yellow') 24 | plt.xlim(-0.5, 1.0) 25 | plt.ylim(-0.5, 1.0) 26 | plt.show() 27 | 28 | # Generate the training data 29 | x, y = make_blobs(n_samples=200, n_features=2, 30 | centers=[[0., 0.], [0.5, 0.5]], 31 | cluster_std=0.18, center_box=(-1., 1.)) 32 | 33 | # y_init = y.mean() # initial prediction 34 | y_init = np.repeat(y.mean(), y.shape[0]) 35 | n_depth = 3 # # tree depth 36 | n_tree = 20 # the number of trees 37 | eta = 0.3 # learning rate 38 | reg_lambda = 0.1 # regularization constant 39 | prune_gamma = 0.01 # pruning constant 40 | 41 | my_model = MyXGBClassifier(n_estimators=n_tree, 42 | max_depth=n_depth, 43 | learning_rate=eta, 44 | prune_gamma = prune_gamma, 45 | reg_lambda=reg_lambda, 46 | base_score = y_init) 47 | loss = my_model.fit(x, y) 48 | 49 | # Check the loss history 50 | plt.figure(figsize=(5,4)) 51 | plt.plot(loss, c='red') 52 | plt.xlabel('m : iteration') 53 | plt.ylabel('loss: binary cross entropy') 54 | plt.title('loss history') 55 | plt.show() 56 | 57 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 58 | y_pred = my_model.predict(x_test) 59 | 60 | # Plot the training and test data, and the prediction result 61 | plot_prediction(x, y, x_test, y_pred) 62 | 63 | # Compare with the results from XGBRegressor library. 
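# (Added clarification) For this classification example the library model used
# below is XGBClassifier with objective='binary:logistic', the classification
# counterpart of the XGBRegressor mentioned above.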
64 | # https://xgboost.readthedocs.io/en/stable/python/python_api.html 65 | # #module-xgboost.sklearn 66 | # --------------------------------------------------------------- 67 | from xgboost import XGBClassifier 68 | 69 | xg_model = XGBClassifier(objective='binary:logistic', 70 | tree_method = 'exact', 71 | n_estimators=n_tree, 72 | max_depth=n_depth, 73 | learning_rate=eta, 74 | gamma=prune_gamma, 75 | reg_lambda=reg_lambda, 76 | base_score=y_init) 77 | 78 | xg_model.fit(x, y) 79 | 80 | # Predict the target class of the test data and visualize the result 81 | y_pred = xg_model.predict(x_test) 82 | plot_prediction(x, y, x_test, y_pred) 83 | 84 | # plt.figure(figsize=(5,5)) 85 | # color = ['red' if a == 1 else 'blue' for a in y] 86 | # plt.scatter(x[:, 0], x[:, 1], s=80, alpha=0.5, c=color) 87 | # plt.xlim(-0.5, 1.0) 88 | # plt.ylim(-0.5, 1.0) 89 | # plt.show() -------------------------------------------------------------------------------- /11.xGBoost/3.appoximation(1).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-07] 3.approximation(1).py 2 | # 논문 [1] Tianqi Chen et, al., 2016, XGBoost: A Scalable Tree Boosting System 3 | # 3. SPLIT FINDING ALGORITHMS 4 | # 3.2 Approximate Algorithm 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/AQOPXlxXF_0 13 | # 14 | import numpy as np 15 | from MyXGBoostRegressor import MyXGBRegressor 16 | import time 17 | 18 | # Create training data 19 | def nonlinear_data(n, s): 20 | rtn_x, rtn_y = [], [] 21 | for i in range(n): 22 | x = np.random.random() 23 | y = 2.0 * np.sin(2.0 * np.pi * x) + \ 24 | np.random.normal(0.0, s) + 3.0 25 | rtn_x.append(x) 26 | rtn_y.append(y) 27 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 28 | x, y = nonlinear_data(n=50000, s=0.5) 29 | 30 | # 1. Exact Greedy Algorithm (EGA) 31 | # ------------------------------- 32 | start_time = time.time() 33 | my_model = MyXGBRegressor(n_estimators = 1, 34 | max_depth = 1, 35 | base_score = y.mean()) 36 | 37 | my_model.fit(x, y) 38 | e = my_model.models[0].estimator2 39 | 40 | print('\nExact greedy algorithm:') 41 | print('split point =', np.round(e['split_point'], 3)) 42 | print('gain =', np.round(e['gain'], 3)) 43 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 44 | 45 | # 2.Approximate Algorithm (AA). 46 | # ------------------------------- 47 | from multiprocessing.pool import Pool 48 | def find_split_point(x, y): 49 | # MyXGBRegressor is a class implemented with EGA. 50 | # To implement this properly, you need to implement the 51 | # Approximate Algorithm inside the MyXGBRegressor. 52 | my_model = MyXGBRegressor(n_estimators = 1, 53 | max_depth = 1, # root node만 확인함. 54 | base_score = y.mean()) 55 | 56 | my_model.fit(x, y) 57 | e = my_model.models[0].estimator2 58 | return [e['split_point'], e['gain']] 59 | 60 | # Divide the data into five parts and allocate 20% of the data to 61 | # each part. 
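#
# (Added note, hedged) np.percentile with [20, 40, 60, 80, 100] returns the
# right-hand edges of five equal-frequency buckets; since x is drawn from a
# uniform distribution on [0, 1), the cut points land roughly at
# [0.2, 0.4, 0.6, 0.8, 1.0]. Each bucket is then handed to a separate process
# to find a local best split, as a simplified illustration of the approximate
# algorithm.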
62 | c_point = np.percentile(x, [20, 40, 60, 80, 100]) 63 | 64 | # maps the data into buckets split by c_point 65 | l_bound = -np.inf 66 | x_block, y_block = [], [] 67 | for p in c_point: 68 | idx = np.where(np.logical_and(x > l_bound, x <= p))[0] 69 | x_block.append(x[idx]) 70 | y_block.append(y[idx]) 71 | l_bound = p 72 | 73 | start_time = time.time() 74 | mp = Pool(5) 75 | args = [[ax, ay] for ax, ay in zip(x_block, y_block)] 76 | ret = mp.starmap_async(find_split_point, args) 77 | mp.close() 78 | mp.join() 79 | 80 | print('\nApproximate Algorithm:') 81 | print('split_points =', np.array(ret.get())[:, 0].round(3)) 82 | print('gain =', np.array(ret.get())[:, 1].round(2)) 83 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 84 | print('number of data in blocks =', [len(a) for a in x_block]) -------------------------------------------------------------------------------- /11.xGBoost/4.appoximation(2).py: -------------------------------------------------------------------------------- 1 | # [MXML-11-08] 4.approximation(2).py 2 | # Tianqi Chen et, al., 2016, XGBoost: A Scalable Tree Boosting System 3 | # 3. SPLIT FINDING ALGORITHMS 4 | # 3.3 Weighted Quantile Sketch 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/ejUvX1L-yzE 13 | # 14 | import numpy as np 15 | from sklearn.datasets import make_blobs 16 | from xgboost import XGBClassifier 17 | from sklearn.model_selection import train_test_split 18 | import time 19 | 20 | # Create a simple training dataset 21 | x, y = make_blobs(n_samples=500000, n_features=2, 22 | centers=[[0., 0.], [0.5, 0.5]], 23 | cluster_std=0.2, center_box=(-1., 1.)) 24 | 25 | x_train, x_test, y_train, y_test = train_test_split(x, y) 26 | 27 | TREES = 200 # the number of trees 28 | DEPTH = 5 # the depth of tree 29 | ETA = 0.1 # learning rate, eta 30 | LAMB = 1.0 # regularization constant 31 | GAMMA = 0.1 # pruning constant 32 | EPS = 0.03 # epsilon for approximate and weighted quantile sketch 33 | 34 | # 1. Exact Greedy Algorithm (EGA) 35 | # ------------------------------- 36 | start_time = time.time() 37 | model = XGBClassifier(n_estimators = TREES, 38 | max_depth = DEPTH, 39 | learning_rate = ETA, # η 40 | gamma = GAMMA, # γ for pruning 41 | reg_lambda = LAMB, # λ for regularization 42 | base_score = 0.5, # initial prediction value 43 | tree_method = 'exact') # exact greedy algorithm 44 | 45 | model.fit(x_train, y_train) 46 | acc = model.score(x_test, y_test) 47 | 48 | print('\nExact greedy algorithm:') 49 | print('Accuracy =', np.round(acc, 3)) 50 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 51 | 52 | # 2.Approximate Algorithm (AA). 
53 | # ------------------------------- 54 | start_time = time.time() 55 | model = XGBClassifier(n_estimators = TREES, 56 | max_depth = DEPTH, 57 | learning_rate = ETA, # η 58 | gamma = GAMMA, # γ for pruning 59 | reg_lambda = LAMB, # λ for regularization 60 | base_score = 0.5, # initial prediction value 61 | max_bin = int(1/EPS), # sketch_eps is replaced by max_bin 62 | tree_method = 'approx') # weighted quantile sketch 63 | 64 | model.fit(x_train, y_train) 65 | acc = model.score(x_test, y_test) 66 | 67 | print('\nWeighted Quantile Sketch:') 68 | print('Accuracy =', np.round(acc, 3)) 69 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 70 | 71 | # tree_method: 72 | # 73 | # https://xgboost.readthedocs.io/en/stable/parameter.html 74 | # auto: Same as the hist tree method. 75 | # exact: Exact greedy algorithm. Enumerates all split candidates. 76 | # approx: Approximate greedy algorithm using quantile sketch and gradient histogram. 77 | # hist: Faster histogram optimized approximate greedy algorithm. 78 | # 79 | # https://xgboost.readthedocs.io/en/latest/treemethod.html 80 | # approx tree method: An approximation tree method described in 81 | # reference paper. It runs sketching before building each tree using 82 | # all the rows (rows belonging to the root). Hessian is used as weights 83 | # during sketch. The algorithm can be accessed by setting tree_method 84 | # to approx. 85 | 86 | # max_bin: 87 | # 88 | # https://github.com/dmlc/xgboost/issues/8063 89 | # Also, the parameter sketch_eps is replaced by max_bin for aligning 90 | # with hist, the old default for max_bin translated from sketch_eps 91 | # was around 63 while the rewritten one is 256, which means the new 92 | # implementation builds larger histogram. 93 | 94 | # import matplotlib.pyplot as plt 95 | # x, y = make_blobs(n_samples=10000, n_features=2, 96 | # centers=[[0., 0.], [0.5, 0.5]], 97 | # cluster_std=0.2, center_box=(-1., 1.)) 98 | 99 | # plt.figure(figsize=(5,5)) 100 | # color = ['red' if a == 1 else 'blue' for a in y] 101 | # plt.scatter(x[:, 0], x[:, 1], s=1, alpha=0.8, c=color) 102 | # # plt.xlim(-0.5, 1.0) 103 | # # plt.ylim(-0.5, 1.0) 104 | # plt.show() -------------------------------------------------------------------------------- /11.xGBoost/5.santander.py: -------------------------------------------------------------------------------- 1 | # [MXML-11-09] 5.santander.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/fALcIVr6zjY 10 | # 11 | import pandas as pd 12 | import numpy as np 13 | from xgboost import XGBClassifier 14 | from sklearn.metrics import roc_auc_score 15 | from sklearn.model_selection import train_test_split 16 | import matplotlib.pyplot as plt 17 | 18 | # Read the Santander Customer Satisfaction Dataset. 19 | # df.shape = (76020, 371) 20 | df = pd.read_csv("data/santander.csv", encoding='latin-1') 21 | 22 | # Replace the values of the 'var3' feature containing -99999999 with 2 23 | # and drop the 'ID' feature. 24 | df['var3'].replace(-999999, 2, inplace=True) 25 | df.drop('ID', axis = 1, inplace=True) 26 | 27 | # Separate features and label from the dataset 28 | # and generate training and test data. 
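#
# (Added aside, a hedged sketch not in the original script) The TARGET label
# in this dataset is known to be highly imbalanced, which is one reason
# ROC-AUC rather than plain accuracy is used as the metric below; a quick way
# to inspect the class ratio:
print(df['TARGET'].value_counts(normalize=True))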
29 | x = df.drop('TARGET', axis=1) 30 | y = df['TARGET'] 31 | x_train, x_test, y_train, y_test = train_test_split(x, y) 32 | 33 | TREES = 200 # the number of trees 34 | DEPTH = 5 # the depth of tree 35 | ETA = 0.1 # learning rate, eta 36 | LAMB = 1.0 # regularization constant 37 | GAMMA = 0.1 # pruning constant 38 | EPS = 0.03 # epsilon for approximate and weighted quantile sketch 39 | 40 | # Create an XGBoost classification model and fit it to the training data 41 | model = XGBClassifier(n_estimators = TREES, 42 | max_depth = DEPTH, 43 | learning_rate = ETA, # η 44 | gamma = GAMMA, # γ for pruning 45 | reg_lambda = LAMB, # λ for regularization 46 | base_score = 0.5, # initial prediction value 47 | missing = 0.0, # for sparsity-aware 48 | subsample = 0.5, # Subsample ratio of the training instance 49 | colsample_bynode = 0.5, # Subsample ratio of columns for each split 50 | max_bin = int(1/EPS), # sketch_eps is replaced by max_bin 51 | tree_method = 'approx') # weighted quantile sketch 52 | 53 | model.fit(x_train, y_train) 54 | 55 | # Predict the test data and measure the performance with ROC-AUC. 56 | y_prob = model.predict_proba(x_test)[:, 1] 57 | auc = roc_auc_score(y_test, y_prob) 58 | print('\nROC-AUC = {:.4f}'.format(auc)) 59 | 60 | # colsample_bytree (Optional[float]) – Subsample ratio of columns when constructing 61 | # each tree. 62 | # colsample_bylevel (Optional[float]) – Subsample ratio of columns for each level. 63 | 64 | # colsample_bynode (Optional[float]) – Subsample ratio of columns for each split. 65 | 66 | -------------------------------------------------------------------------------- /11.xGBoost/data/santander.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meanxai/machine_learning/fba47e91cc7449eb5d7ea8b7ec1fb0fd616ebd71/11.xGBoost/data/santander.zip -------------------------------------------------------------------------------- /12.LGBM/1.histogram_based.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-01] 1.histogram-based.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/N39NE4Nj6vc 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | from multiprocessing.pool import Pool 14 | import matplotlib.pyplot as plt 15 | 16 | # Create a training data set. 17 | x, y = make_blobs(n_samples=300, n_features=2, 18 | centers=[[0., 0.], [0.5, 0.3]], 19 | cluster_std=0.15, center_box=(-1., 1.)) 20 | 21 | plt.figure(figsize=(4,4)) 22 | color = [['red', 'blue'][a] for a in y] 23 | plt.scatter(x[:,0], x[:,1], c=color, alpha=0.3) 24 | plt.show() 25 | 26 | def find_local_split_point(f, s_point): 27 | GL = HL = 0.0 28 | l_bound = -np.inf # lower left bound 29 | max_gain = -np.inf 30 | 31 | for j in s_point: 32 | # split the parent node into the left and right nodes. 33 | left = np.where(np.logical_and(f > l_bound, f <= j))[0] 34 | right = np.where(f > j)[0] 35 | 36 | # After splitting the parent node, calculate the scores of its children. 37 | GL += g[left].sum() 38 | HL += (h[left] * (1. - h[left])).sum() 39 | GR = G - GL 40 | HR = H - HL 41 | 42 | # Calculate the gain for this split 43 | gain = (GL ** 2)/(HL + r) + (GR ** 2)/(HR + r) - p_score 44 | 45 | # Find the maximum gain. 
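        # (Added note) The gain above is the standard second-order split gain,
        #   gain = GL^2/(HL+r) + GR^2/(HR+r) - G^2/(H+r),
        # evaluated only at the histogram bin edges in s_point rather than at
        # every distinct feature value, which is what makes the
        # histogram-based method fast.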
46 | if gain > max_gain: 47 | max_gain = gain 48 | b_point = j # best split point 49 | l_bound = j 50 | 51 | return b_point, max_gain 52 | 53 | y0 = np.ones(shape=y.shape) * 0.5 # initial prediction 54 | g = -(y - y0) # negative residual. 55 | h = y0 * (1. - y0) # Hessian. 56 | 57 | # Create a histogram of the parent node for each feature 58 | n_bin = 30 # the number of bins 59 | g0_parent, f0_bin = np.histogram(x[:, 0], n_bin, weights=g) # feature 0 60 | g1_parent, f1_bin = np.histogram(x[:, 1], n_bin, weights=g) # feature 1 61 | 62 | # Find the best split point of each feature 63 | G = g.sum() 64 | H = h.sum() 65 | r = 0.0 66 | gamma = 0.0 67 | p_score = (G ** 2) / (H + r) # parent's score before splitting the node 68 | 69 | # Find global best split point through parallel processing 70 | # vertical partitioning method is used. 71 | mp = Pool(2) 72 | args = [[x[:, 0], f0_bin], [x[:, 1], f1_bin]] 73 | ret = mp.starmap_async(find_local_split_point, args) 74 | mp.close() 75 | mp.join() 76 | 77 | results = ret.get() 78 | p1 = results[0][0]; p2 = results[1][0] 79 | gain1 = results[0][1]; gain2 = results[1][1] 80 | 81 | if gain1 > gain2: 82 | b_fid = 0 83 | b_point = p1 84 | else: 85 | b_fid = 1 86 | b_point = p2 87 | 88 | print('\nbest feature id =', b_fid) 89 | print('best split point =', b_point.round(3)) 90 | 91 | -------------------------------------------------------------------------------- /12.LGBM/2.goss.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-02] 2.goss.py 2 | # Implement GOSS algorithm presented in the paper. 3 | # Add GOSS feature to XGBoost. 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/APZyWo9hIj0 12 | # 13 | import numpy as np 14 | from sklearn.datasets import make_blobs 15 | import matplotlib.pyplot as plt 16 | from xgboost import XGBClassifier 17 | from lightgbm import LGBMClassifier 18 | 19 | # Create a training dataset 20 | x, y = make_blobs(n_samples=10000, n_features=2, 21 | centers=[[0., 0.], [0.5, 0.5]], 22 | cluster_std=0.25, center_box=(-1., 1.)) 23 | 24 | plt.figure(figsize=(4,4)) 25 | color = [['red', 'blue'][a] for a in y] 26 | plt.scatter(x[:,0], x[:,1], s=1, c=color, alpha=0.5) 27 | plt.show() 28 | 29 | n_boost = 50 # the number of boosting 30 | eta = 0.3 # learning rate 31 | max_depth = 2 # max_depth of a tree 32 | 33 | def base_model(x, y, weights, F0): 34 | model = XGBClassifier(n_estimators=1, # just 1 round 35 | learning_rate=eta, 36 | max_depth=max_depth, 37 | max_bin=20, tree_method='hist', 38 | base_score=None) 39 | 40 | # g and h are multiplied by their weights. 41 | model.fit(x, y, sample_weight = weights, base_margin=F0) 42 | return model 43 | 44 | # Algorithm 2: Gradient-based One-Side Sampling (GOSS) 45 | a = 0.3 # sampling ratio of large gradient data 46 | b = 0.2 # sampling ratio of small gradient data 47 | fact = (1. - a) / b 48 | topN = int(a * x.shape[0]) 49 | randN = int(b * x.shape[0]) 50 | models = [] 51 | Fm = np.zeros(y.shape) # initial prediction in log(odds) 52 | 53 | for i in range(n_boost): 54 | y_prev = 1. / (1. + np.exp(-Fm)) 55 | g = -(y - y_prev) # negative residual. first order gradients 56 | w = np.ones(shape=x.shape[0]) # initial sample weights. 
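    # (Added note) GOSS keeps the topN samples with the largest |gradient| and
    # a random randN of the remaining ones, then scales the weights of the
    # sampled small-gradient rows by fact = (1 - a) / b so their total
    # contribution to the gradient statistics stays approximately unbiased.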
57 | sorted_g = np.argsort(np.abs(g))[::-1] 58 | topSet = sorted_g[:topN] 59 | randSet = np.random.choice(sorted_g[topN:], size=randN, replace=False) 60 | usedSet = np.hstack([topSet, randSet]) 61 | w[randSet] *= fact # Assign weight f act to the small gradient data 62 | 63 | newModel = base_model(x[usedSet], y[usedSet], w[usedSet], F0=Fm[usedSet]) 64 | Fm += newModel.predict(x, output_margin=True) 65 | models.append(newModel) 66 | 67 | # Create a test dataset and predict the class of test data 68 | x_test = np.random.uniform(-1.0, 1.5, (1000, 2)) 69 | 70 | test_Fm = np.zeros(x_test.shape[0]) 71 | for model in models: 72 | test_Fm += model.predict(x_test, output_margin=True) 73 | 74 | y_prob = 1. / (1. + np.exp(-test_Fm)) # log(odds) --> probability 75 | y_pred = (y_prob > 0.5) * 1 76 | 77 | # Check the prediction results and the decision boundary. 78 | def check_result(x, y, x_test, y_pred, title): 79 | plt.figure(figsize=(4,4)) 80 | color2 = [['red', 'blue'][a] for a in y_pred] 81 | plt.scatter(x_test[:, 0], x_test[:, 1], s=50, c=color2, 82 | alpha=0.3) 83 | 84 | # Only part of the training data is drawn. 85 | plt.scatter(x[:300, 0], x[:300, 1], s=50, c='black') 86 | plt.scatter(x[:300, 0], x[:300, 1], s=5, c='yellow') 87 | plt.xlim(-1.0, 1.5) 88 | plt.ylim(-1.0, 1.5) 89 | plt.title(title) 90 | plt.show() 91 | 92 | check_result(x, y, x_test, y_pred, "Result of the code from scratch") 93 | 94 | # Use LGBMClassifier library and compare the result from above code 95 | model = LGBMClassifier(n_estimators = 20, 96 | max_depth=max_depth, 97 | learning_rate=eta, 98 | max_bins=20, 99 | boosting_type="goss", 100 | top_rate=0.3, 101 | other_rate=0.2) 102 | 103 | model.fit(x, y) 104 | y_pred = model.predict(x_test) 105 | check_result(x, y, x_test, y_pred, "Result of LGBMClassifier") 106 | -------------------------------------------------------------------------------- /12.LGBM/3.greedy_bundling.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-03] 3.greedy_bundling.py 2 | # Algorithm 3: Greedy Bundling 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/Y-IvfsjmqOQ 11 | # 12 | import numpy as np 13 | 14 | x = np.array([[1, 1, 0, 0, 1], 15 | [0, 0, 1, 1, 1], 16 | [1, 2, 0, 0, 2], 17 | [0, 0, 2, 3, 1], 18 | [2, 1, 0, 0, 3], 19 | [3, 3, 0, 0, 1], 20 | [0, 0, 3, 0, 2], 21 | [1, 2, 3, 4, 3], 22 | [1, 0, 1, 0, 0], 23 | [2, 3, 0, 0, 2]]) 24 | 25 | # Create a conflict count matrix 26 | n_row = x.shape[0] 27 | n_col = x.shape[1] 28 | conflictCnt = np.zeros((n_col, n_col)) 29 | 30 | for i in range(n_col): 31 | for j in range(i+1, n_col): 32 | # Count the number of conflicts. 
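        # (Added note) Two features "conflict" on a row when both are nonzero
        # there, so x[:, i] * x[:, j] > 0 marks exactly those rows; features
        # with few conflicts are nearly mutually exclusive and are candidates
        # for sharing a bundle.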
33 | conflictCnt[i, j] = len(np.where(x[:, i] * x[:, j] > 0)[0]) 34 | 35 | # Copy upper triangle to lower triangle 36 | # iu = (array([0, 0, 0, 0, 1, 1, 1, 2, 2, 3]), 37 | # array([1, 2, 3, 4, 2, 3, 4, 3, 4, 4])) 38 | iu = np.triu_indices(n_col, 1) 39 | il = (iu[1], iu[0]) 40 | conflictCnt[il] = conflictCnt[iu] 41 | 42 | # Create a search order matrix 43 | degree = conflictCnt.sum(axis=0) 44 | searchOrder = np.argsort(degree)[::-1] # descending order 45 | 46 | # ---------------------------- 47 | # Algorithm 3: Greedy Bundling 48 | # ---------------------------- 49 | K = 1 # max conflict count 50 | bundles = [] 51 | bundlesConflict = [] 52 | for i in searchOrder: # i = [4, 0, 1, 2, 3] 53 | needNew = True 54 | for j in range(len(bundles)): 55 | cnt = conflictCnt[bundles[j][-1], i] 56 | # Only edges less than or equal to K are considered. 57 | if cnt + bundlesConflict[j] <= K: 58 | # Add the feature number i to the j-th bundle. 59 | bundles[j].append(i) 60 | 61 | # Update the number of conflicts of features in the 62 | # j-th bundle. 63 | bundlesConflict[j] += cnt 64 | needNew = False 65 | break 66 | 67 | if needNew: 68 | bundles.append([i]) 69 | bundlesConflict.append(0.) 70 | 71 | print('\nconflictCnt:\n', conflictCnt) 72 | print('\nsearchOrder:\n', searchOrder) 73 | 74 | print('\nbundles:', bundles) 75 | print('bundlesConflict:', bundlesConflict) 76 | 77 | # conflictCnt: 78 | # 0 1 2 3 4 79 | # 0 [0., 6., 2., 1., 6.] 80 | # 1 [6., 0., 1., 1., 6.] 81 | # 2 [2., 1., 0., 3., 4.] 82 | # 3 [1., 1., 3., 0., 3.] 83 | # 4 [6., 6., 4., 3., 0.] 84 | 85 | # searchOrder 86 | # array([4, 0, 1, 2, 3]) 87 | # 88 | # bundles: 89 | # j=0 j=1 j=2 ← bundle number 90 | # +--↓-----↓-------↓-----+ 91 | # | [4] [0, 3] [1, 2] | 92 | # +----------------------+ 93 | # 94 | # bundlesConflict 95 | # +----------------------+ 96 | # | 0 1 1 | 97 | # +----------------------+ 98 | -------------------------------------------------------------------------------- /12.LGBM/4.merge_features.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-04] 4.merge_features.py 2 | # Implementation of Algorithm 4: Merge Exclusive Features 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/orSRRtWtPwE 11 | # 12 | import numpy as np 13 | 14 | x = np.array([[1, 1, 0, 0, 1], 15 | [0, 0, 1, 1, 1], 16 | [1, 2, 0, 0, 2], 17 | [0, 0, 2, 3, 1], 18 | [2, 1, 0, 0, 3], 19 | [3, 3, 0, 0, 1], 20 | [0, 0, 3, 0, 2], 21 | [1, 2, 3, 4, 3], # <-- conflict here 22 | [1, 0, 1, 0, 0], 23 | [2, 3, 0, 0, 2]]) 24 | 25 | # Algorithm 4: Merge Exclusive Features 26 | def merge_features(numData, F): 27 | binRanges = [0] 28 | totalBin = 0 29 | for f in F: 30 | totalBin += np.max(f) 31 | binRanges.append(totalBin) 32 | 33 | newBin = np.zeros(numData, dtype=int) 34 | for i in range(numData): 35 | newBin[i] = 0 36 | for j in range(len(F)): 37 | if F[j][i] != 0: 38 | newBin[i] = F[j][i] + binRanges[j] 39 | return newBin, binRanges 40 | 41 | # modified Algorithm 4 (skip-zero-version) 42 | def merge_features2(numData, F): 43 | binRanges = [0] 44 | totalBin = 0 45 | for f in F: 46 | totalBin += np.max(f) 47 | binRanges.append(totalBin) 48 | 49 | # initialize newBin with F[0] to skip zero in binRanges[0] 50 | newBin = F[0] 51 | for i in range(numData): 52 | for j in range(1, len(F)): 53 | if F[j][i] != 0: 54 | newBin[i] = F[j][i] + 
binRanges[j] 55 | return newBin, binRanges 56 | 57 | bundles = [[4], [0, 3], [1, 2]] # The result of Greedy Bundling 58 | 59 | F = [x[:, i] for i in bundles[1]] 60 | newBin, binRanges = merge_features(x.shape[0], F) 61 | print('\nnewBin:', newBin) 62 | print('binRanges:', binRanges) 63 | 64 | newBin, binRanges = merge_features2(x.shape[0], F) 65 | print('\nnewBin:', newBin) 66 | print('binRanges:', binRanges) 67 | -------------------------------------------------------------------------------- /12.LGBM/5.efb_onehot.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-05] 5.efb_onehot.py 2 | # Merge one-hot encoded features using EFB 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/NqpkYja5g2Y 11 | # 12 | import numpy as np 13 | from sklearn.preprocessing import OneHotEncoder 14 | 15 | # Algorithm 3: Greedy Bundling algorithm 16 | def greedy_bundling(x, K): 17 | # Create a conflict count matrix 18 | n_row = x.shape[0] 19 | n_col = x.shape[1] 20 | conflictCnt = np.zeros((n_col, n_col)) 21 | 22 | for i in range(n_col): 23 | for j in range(i+1, n_col): 24 | # Count the number of conflicts. 25 | conflictCnt[i, j] = len(np.where(x[:, i] * x[:, j] > 0)[0]) 26 | 27 | # Copy upper triangle to lower triangle 28 | iu = np.triu_indices(n_col, 1) 29 | il = (iu[1], iu[0]) 30 | conflictCnt[il] = conflictCnt[iu] 31 | 32 | # Create a search order matrix 33 | degree = conflictCnt.sum(axis=0) 34 | searchOrder = np.argsort(degree)[::-1] # descending order 35 | 36 | bundles = [] 37 | bundlesConflict = [] 38 | for i in searchOrder: 39 | needNew = True 40 | for j in range(len(bundles)): 41 | cnt = conflictCnt[bundles[j][-1], i] 42 | if cnt + bundlesConflict[j] <= K: 43 | bundles[j].append(i) 44 | bundlesConflict[j] += cnt 45 | needNew = False 46 | break 47 | 48 | if needNew: 49 | bundles.append([i]) 50 | bundlesConflict.append(0.) 51 | return bundles 52 | 53 | # Algorithm 4: Merge Exclusive Features (skip-zero-version) 54 | def merge_features(numData, F): 55 | binRanges = [0] 56 | totalBin = 0 57 | for f in F: 58 | totalBin += np.max(f) 59 | binRanges.append(totalBin) 60 | 61 | newBin = F[0] # initialize newBin to F[0] 62 | for i in range(numData): 63 | for j in range(1, len(F)): 64 | if F[j][i] != 0: 65 | newBin[i] = F[j][i] + binRanges[j] 66 | return newBin, binRanges 67 | 68 | # Generate random data and perform one-hot encoding. 69 | n_samples = 100 70 | n_features = 4 71 | x = np.random.randint(low=0, high=4, size=(n_samples, n_features)) 72 | enc = OneHotEncoder() 73 | x_ohe = enc.fit_transform(x).toarray() 74 | 75 | print('Original features [:5]:'); print(x[:5]) 76 | print('\nOne-hot encoding [:5]:'); print(x_ohe[:5]) 77 | 78 | # Find bundles 79 | bundles = greedy_bundling(x_ohe, K=1) 80 | 81 | # If we know the bundles exactly, like this, 82 | # bundles = [[0,1,2,3], [4,5,6,7], [8,9,10,11], [12,13,14,15]] 83 | # we can get the original features from the merged features. 
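# Side note: a minimal sketch of that inverse mapping, assuming each bundle came out in
# index order (e.g. [0, 1, 2, 3]). After the skip-zero merge, one-hot column j of a
# bundle maps to merged value j + 1, so the original category is the merged value minus
# one (the array below is illustrative only):
_merged = np.array([3, 1, 4, 2])
assert (_merged - 1 == np.array([2, 0, 3, 1])).all()   # recovered categories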
84 | 85 | print('\nbundles:', bundles) 86 | # [[14, 12, 15, 13], [10, 8, 11, 9], [5, 4, 6, 7], [3, 2, 1, 0]] 87 | 88 | # Merge one-hot encoded features 89 | x_efb = np.zeros(shape=x.shape).astype('int') 90 | for i, bundle in enumerate(bundles): 91 | F = [x_ohe[:, i] for i in bundle] 92 | newBin, binRanges = merge_features(x_ohe.shape[0], F) 93 | x_efb[:, i] = np.array(newBin) - 1 94 | 95 | print('\nOriginal features [:5]:'); print(x[:5]) 96 | print('\nMerged features [:5]:'); print(x_efb[:5]) 97 | 98 | -------------------------------------------------------------------------------- /12.LGBM/6.santander.py: -------------------------------------------------------------------------------- 1 | # [MXML-12-05] 6.santander.py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/NqpkYja5g2Y 10 | # 11 | import pandas as pd 12 | from lightgbm import LGBMClassifier 13 | from xgboost import XGBClassifier 14 | from sklearn.metrics import roc_auc_score, roc_curve 15 | from sklearn.model_selection import train_test_split 16 | import matplotlib.pyplot as plt 17 | import time 18 | 19 | # Read the Santander Customer Satisfaction Dataset. 20 | # df.shape = (76020, 371) 21 | df = pd.read_csv("data/santander.csv", encoding='latin-1') 22 | 23 | # Replace the values of the 'var3' feature containing -99999999 24 | # with 2 and drop the 'ID' feature. 25 | df['var3'].replace(-999999, 2, inplace=True) 26 | df.drop('ID', axis = 1, inplace=True) 27 | 28 | # Separate features and label from the dataset 29 | # and generate training and test data. 30 | x_feat = df.drop('TARGET', axis=1) 31 | y_target = df['TARGET'] 32 | x_train, x_test, y_train, y_test = train_test_split(x_feat, y_target) 33 | 34 | # 1. XGBoost 35 | # Create an XGBoost classification model and fit it to the training data 36 | start_time = time.time() 37 | model = XGBClassifier(n_estimators = 200, 38 | max_depth = 5, 39 | learning_rate = 0.1, # η 40 | gamma = 0.1, # γ for pruning 41 | reg_lambda = 1.0, # λ for regularization 42 | base_score = 0.5, # initial prediction value 43 | subsample = 0.5, # Subsample ratio of the training instance 44 | colsample_bynode = 0.5, # Subsample ratio of columns for each split 45 | max_bin = int(1/0.03), # sketch_eps is replaced by max_bin 46 | tree_method = 'approx') # weighted quantile sketch 47 | 48 | model.fit(x_train, y_train) 49 | 50 | # Predict the test data and measure the performance with ROC-AUC. 51 | y_prob = model.predict_proba(x_test)[:, 1] 52 | auc = roc_auc_score(y_test, y_prob) 53 | 54 | print('\nXGBoost results:') 55 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 56 | print('ROC-AUC = {:.4f}'.format(auc)) 57 | 58 | # 2. LightGBM 59 | # Create a LightGBM model 60 | start_time = time.time() 61 | model = LGBMClassifier(n_estimators = 200, 62 | max_depth = 5, 63 | learning_rate = 0.1, 64 | boosting_type="goss", # default: gbdt - traditional gradient based decision tree 65 | top_rate=0.3, 66 | other_rate=0.2, 67 | enable_bundle=True, # default: True. enable EFB 68 | is_unbalance = True) 69 | 70 | # training 71 | model.fit(x_train, y_train) 72 | 73 | # Predict the test data and measure the performance with AUC. 
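# Side note: the ROC-AUC reported here is the area under the ROC curve; a small
# self-contained sanity check with illustrative toy arrays:
import numpy as np
_yt = np.array([0, 0, 1, 1])
_ys = np.array([0.1, 0.4, 0.35, 0.8])
_fpr, _tpr, _ = roc_curve(_yt, _ys)
assert abs(roc_auc_score(_yt, _ys) - np.trapz(_tpr, _fpr)) < 1e-12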
74 | y_pred = model.predict_proba(x_test)[:, 1] 75 | auc = roc_auc_score(y_test, y_pred) 76 | 77 | print('\nLightGBM results:') 78 | print('running time = {:.2f} seconds'.format(time.time() - start_time)) 79 | print("ROC AUC = {0:.4f}".format(auc)) 80 | 81 | # Draw the ROC curve 82 | fprs, tprs, thresholds = roc_curve(y_test, y_pred) 83 | 84 | plt.plot(fprs, tprs, label = 'ROC') 85 | plt.plot([0,1], [0,1], '--', label = 'Random') 86 | plt.legend() 87 | plt.xlabel('FPR') 88 | plt.ylabel('TPR') 89 | plt.show() 90 | 91 | -------------------------------------------------------------------------------- /12.LGBM/data/santander.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meanxai/machine_learning/fba47e91cc7449eb5d7ea8b7ec1fb0fd616ebd71/12.LGBM/data/santander.zip -------------------------------------------------------------------------------- /2.DecisionTree/1.ID3(titanic_part).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-03] 1.ID3(titanic_part).py 2 | # ID3/C4.5 decision tree test code 3 | # CART is widely used than ID3/C4.5. Sklearn supports CART. 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/m3o0-K07gLI 12 | # 13 | # 14 | # I used the package below to test ID3/C4.5. 15 | # https://github.com/svaante/decision-tree-id3 16 | # pip install decision-tree-id3 17 | # pip install pydot 18 | # pip install graphviz 19 | # sudo apt install graphviz 20 | # ----------------------------------------------------------- 21 | 22 | # "from sklearn.externals import six" is used for id3, but "six" 23 | # is missing in the sklearn.externals, resulting in the following 24 | # error: cannot import name "six" from 'sklearn.externals' 25 | # Add following to prevent errors. 26 | import six 27 | import sys; sys.modules['sklearn.externals.six'] = six 28 | import pandas as pd 29 | from id3 import Id3Estimator 30 | from id3 import export_graphviz 31 | import pydot 32 | from sklearn.model_selection import train_test_split 33 | 34 | # Use just 3 features in the Titanic dataset: 35 | feat_names = ['Pclass', 'Sex', 'Age'] 36 | df = pd.read_csv('data/titanic.csv')[feat_names + ['Survived']] 37 | df = df.dropna().reset_index() 38 | df.info() 39 | 40 | # Separate the data into feature and target. 41 | x_data = df[feat_names].copy() 42 | y_data = df['Survived'] 43 | 44 | # Convert string (Sex) to number. female = 0, male = 1 45 | x_data['Sex'] = x_data['Sex'].map({'female':0, 'male':1}) 46 | 47 | # Convert real numbers (Age) to 4 categories. 48 | x_data['Age'] = pd.qcut(x_data['Age'], 4, labels=False) 49 | 50 | # Split the data into training and test data. 51 | x_train, x_test, y_train, y_test = train_test_split(x_data, y_data) 52 | 53 | # Build ID3/C4.5 decision tree. 54 | estimator = Id3Estimator(gain_ratio=True, prune=True) 55 | estimator = estimator.fit(x_train, y_train, check_input=False) 56 | 57 | # Evaluate performance with test data. 58 | y_pred = estimator.predict(x_test) 59 | acc = (y_pred == y_test).mean() 60 | print('\nAccuracy of test data = {:.4f}'.format(acc)) 61 | 62 | # Evaluate performance with training data. 
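# Side note: with gain_ratio=True the split criterion is C4.5's gain ratio rather than
# the raw information gain:
#   IG(S, A)        = H(S) - sum_v (|S_v| / |S|) * H(S_v)
#   SplitInfo(S, A) = -sum_v (|S_v| / |S|) * log2(|S_v| / |S|)
#   GainRatio(S, A) = IG(S, A) / SplitInfo(S, A)
# where H is the Shannon entropy. Dividing by SplitInfo penalizes attributes that
# fragment the data into many small partitions, such as the 4 Age buckets above.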
63 | y_pred = estimator.predict(x_train) 64 | acc = (y_pred == y_train).mean() 65 | print('Accuracy of train data = {:.4f}\n'.format(acc)) 66 | 67 | # Visualize the tree result 68 | tree = export_graphviz(estimator.tree_, 'id3_tree.dot', feat_names) 69 | (graph,) = pydot.graph_from_dot_file('id3_tree.dot') 70 | graph.write_png('id3_tree.png') 71 | !nomacs 'id3_tree.png' # Check the tree image with the image viewer. 72 | -------------------------------------------------------------------------------- /2.DecisionTree/2.CART(classification).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-07] 2.CART(classification).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/gct9gGOvPek 10 | # 11 | import numpy as np 12 | import pandas as pd 13 | from MyDTreeClassifier import MyDTreeClassifier 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn import tree 16 | from sklearn.datasets import load_iris 17 | from sklearn.model_selection import train_test_split 18 | import matplotlib.pyplot as plt 19 | import pprint 20 | 21 | # Read the Titanic dataset and perform simple preprocessing. 22 | df = pd.read_csv('data/titanic.csv') 23 | df['Age'].fillna(df['Age'].mean(), inplace = True) # Replace with average 24 | df['Embarked'].fillna('N', inplace = True) # Replace with 'N' 25 | df['Sex'] = df['Sex'].factorize()[0] # label encoding 26 | df['Embarked'] = df['Embarked'].factorize()[0] # label encoding 27 | df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True) 28 | 29 | # Survived Pclass Sex Age SibSp Parch Fare Embarked 30 | # 0 0 3 0 22.0 1 0 7.2500 0 31 | # 1 1 1 1 38.0 1 0 71.2833 1 32 | # 2 1 3 1 26.0 0 0 7.9250 0 33 | # 3 1 1 1 35.0 1 0 53.1000 0 34 | # 4 0 3 0 35.0 0 0 8.0500 0 35 | 36 | # split the data into train, validation and test data. 37 | y = np.array(df['Survived']) 38 | x = np.array(df.drop('Survived', axis=1)) 39 | x_train, x_test, y_train, y_test = train_test_split(x, y) 40 | 41 | depth = 3 42 | my_model = MyDTreeClassifier(max_depth = depth) 43 | my_model.fit(x_train, y_train) 44 | my_pred = my_model.predict(x_test) 45 | acc = (y_test == my_pred).mean() 46 | print('MyTreeClassifier: accuracy = {:.3f}'.format(acc)) 47 | 48 | # Compare the results with sklearn's DecisionTreeClassifier. 
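# Side note: sklearn's DecisionTreeClassifier uses the Gini impurity by default,
#   Gini(S) = 1 - sum_k p_k^2,
# and a binary split is scored by the size-weighted Gini of the two children.
# For a node that is half survived / half not (illustrative check):
_p = np.array([0.5, 0.5])
assert np.isclose(1. - np.sum(_p ** 2), 0.5)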
49 | # ---------------------------------------------------------- 50 | sk_model = DecisionTreeClassifier(max_depth=depth, 51 | random_state=1) 52 | sk_model.fit(x_train, y_train) 53 | sk_pred = sk_model.predict(x_test) 54 | acc = (y_test == sk_pred).mean() 55 | print('DecisionTreeClassifier: accuracy = {:.3f}'.format(acc)) 56 | 57 | print('\nMyTreeClassifier: estimator2:') 58 | pprint.pprint(my_model.estimator2, sort_dicts=False) 59 | 60 | plt.figure(figsize=(12, 6)) 61 | tree.plot_tree(sk_model) 62 | plt.show() 63 | -------------------------------------------------------------------------------- /2.DecisionTree/3.CART(titanic_part).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-08] 3.CART(sklearn).py 2 | # DecisionTreeClassifier in sklearn 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/XqNuY1RHlNU 11 | # 12 | # The characteristics of DecisionTreeClassifier: 13 | # 1. Use the CART algorithm (binary tree). 14 | # ID3/C4.5 (general tree) is not supported. 15 | # 2. Categorical feature is not directly supported. 16 | # All categorical features (e.g. 'female', 'male') must be 17 | # converted to numeric data (e.g. 0, 1). 18 | # All numeric features are treated as continuous features. 19 | # Split using inequality. (e.g. sex ≤ 0.5) 20 | # ----------------------------------------------------------- 21 | import numpy as np 22 | import pandas as pd 23 | from sklearn.tree import DecisionTreeClassifier 24 | from sklearn.model_selection import train_test_split 25 | from sklearn import tree 26 | import matplotlib.pyplot as plt 27 | 28 | # Of the Titanic dataset, only the following three features are used. 29 | feat_names = ['Pclass', 'Sex', 'Age'] 30 | df = pd.read_csv('data/titanic.csv')[feat_names + ['Survived']] 31 | df['Sex'] = df['Sex'].factorize()[0] # convert string to number 32 | df = df.dropna() # Delete all rows with missing values. 33 | col_names = list(df.columns) 34 | 35 | # Separate the Titanic data into features and target class. 36 | x_data = np.array(df[feat_names]) # features 37 | y_data = np.array(df['Survived']) # target class 38 | 39 | # Split the data into training, validation and test data. 40 | x_train, x_test, y_train, y_test = \ 41 | train_test_split(x_data, y_data, test_size = 0.3) 42 | 43 | x_test, x_eval, y_test, y_eval = \ 44 | train_test_split(x_test, y_test, test_size = 0.5) 45 | 46 | # Create decision tree models of various depths, 47 | # and measure the accuracy of validation data for each model. 48 | train_acc = [] 49 | eval_acc = [] 50 | max_depth = 8 51 | for d in range(1, max_depth+1): 52 | model = DecisionTreeClassifier(max_depth=d) 53 | model.fit(x_train, y_train) 54 | 55 | # Measure the accuracy of this model using the training data. 56 | y_pred = model.predict(x_train) 57 | train_acc.append((y_pred == y_train).mean()) 58 | 59 | # Measure the accuracy of this model using the validation data. 60 | y_pred = model.predict(x_eval) 61 | eval_acc.append((y_pred == y_eval).mean()) 62 | print('Depth = {}, train_acc = {:.4f}, eval_acc = {:.4f}'\ 63 | .format(d, train_acc[-1], eval_acc[-1])) 64 | 65 | # Find the optimal depth with the highest accuracy of validation data. 66 | opt_depth = np.argmax(eval_acc) + 1 67 | 68 | # Visualize accuracy changes as depth changes. 
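# Side note: the x-axis of the plot below is the list index, which is depth - 1; that
# is why the vertical line is drawn at opt_depth - 1. A quick consistency check:
assert np.arange(1, max_depth + 1)[np.argmax(eval_acc)] == opt_depth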
69 | plt.plot(train_acc, marker='o', label='train') 70 | plt.plot(eval_acc, marker='o', label='evaluation') 71 | plt.legend() 72 | plt.title('Accuracy') 73 | plt.xlabel('tree depth') 74 | plt.ylabel('accuracy') 75 | plt.xticks(np.arange(max_depth), np.arange(1, max_depth+1)) 76 | plt.axvline(x=opt_depth-1, ls='--') 77 | plt.ylim(0.5, 1.0) 78 | plt.show() 79 | 80 | # Regenerate the tree with optimal depth. 81 | # model = DecisionTreeClassifier(max_depth=opt_depth) 82 | 83 | # I set max_step=3 as a constant value for tree visualization. 84 | model = DecisionTreeClassifier(max_depth=3) 85 | model.fit(x_train, y_train) 86 | 87 | # Use test data to evaluate final performance. 88 | y_pred = model.predict(x_test) 89 | test_acc = (y_pred == y_test).mean() 90 | print('Optimal depth = {}, test_acc = {:.4f}'.format(opt_depth, test_acc)) 91 | 92 | # Visualize the tree 93 | # plt.figure(figsize=(20,10)) 94 | plt.figure(figsize=(14,6)) 95 | tree.plot_tree(model, feature_names = feat_names, fontsize=10) 96 | plt.show() 97 | 98 | # Analyze the importance of features. 99 | feature_importance = model.feature_importances_ 100 | n_feature = x_train.shape[1] 101 | idx = np.arange(n_feature) 102 | 103 | plt.barh(idx, feature_importance, align='center', color='green') 104 | plt.yticks(idx, col_names[:-1], size=12) 105 | plt.xlabel('importance', size=15) 106 | plt.ylabel('feature', size=15) 107 | plt.show() 108 | 109 | print('feature importance = {}'\ 110 | .format(feature_importance.round(3))) 111 | -------------------------------------------------------------------------------- /2.DecisionTree/5.CART(multiclass).py: -------------------------------------------------------------------------------- 1 | # [MXML-2-10]: 5.CART(multiclass).py 2 | # Multiclass classification test code 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/o43mZv_Cmxw 11 | # 12 | import numpy as np 13 | from sklearn.datasets import load_iris 14 | from MyDTreeClassifier import MyDTreeClassifier 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.model_selection import train_test_split 17 | 18 | # Load iris dataset 19 | # x: data, the number of samples=150, the number of features=4 20 | # y: target data with class (0,1,2) 21 | x, y = load_iris(return_X_y=True) 22 | 23 | # Generate training and test data 24 | x_train, x_test, y_train, y_test = train_test_split(x, y) 25 | 26 | # Model-1: using our model - refer to [MXML-2-07] video 27 | model1 = MyDTreeClassifier(max_depth=3) 28 | model1.fit(x_train, y_train) 29 | 30 | # Estimate the class of validation date. 31 | y_pred1 = model1.predict(x_test) 32 | 33 | # Measure the accuracy for validation data 34 | accuracy1 = (y_test == y_pred1).mean() 35 | print('\nAccuracy of Model-1 = {:.3f}'.format(accuracy1)) 36 | 37 | # Model-2: using sklearn 38 | model2 = DecisionTreeClassifier(max_depth=3) 39 | model2.fit(x_train, y_train) 40 | 41 | # Estimate the class of validation date. 
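# Side note: no special handling is needed for the 3-class iris problem; the Gini
# impurity 1 - sum_k p_k^2 is defined for any number of classes. For a node with the
# three classes equally mixed (illustrative check):
_p = np.ones(3) / 3.
assert np.isclose(1. - np.sum(_p ** 2), 2. / 3.)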
42 | y_pred2 = model2.predict(x_test) 43 | 44 | # Measure the accuracy for validation data 45 | accuracy2 = (y_test == y_pred2).mean() 46 | print('Accuracy of Model-2 = {:.3f}'.format(accuracy2)) 47 | 48 | print("\nModel-1: y_pred1") 49 | print(y_pred1) 50 | print("\nModel-2: y_pred2") 51 | print(y_pred2) 52 | 53 | -------------------------------------------------------------------------------- /2.DecisionTree/6.CART(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-02-11] 6.CART(regression).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/Bc-k9Dv5SNg 10 | # 11 | import numpy as np 12 | from MyDTreeRegressor import MyDTreeRegressor 13 | from sklearn.tree import DecisionTreeRegressor 14 | import matplotlib.pyplot as plt 15 | from sklearn import tree 16 | import pprint 17 | 18 | # Plot the training data and draw the estimated curve. 19 | def plot_prediction(x, y, x_test, y_pred, title): 20 | plt.figure(figsize=(6,4)) 21 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train data') 22 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 23 | plt.xlim(0, 1) 24 | plt.ylim(0, 7) 25 | plt.legend() 26 | plt.title(title) 27 | plt.show() 28 | 29 | # Generate nonlinear data for regression testing. 30 | def noisy_sine_data(n, s): 31 | rtn_x, rtn_y = [], [] 32 | for i in range(n): 33 | x= np.random.random() 34 | y= 2.0*np.sin(2.0*np.pi*x)+np.random.normal(0.0, s) + 3.0 35 | rtn_x.append(x) 36 | rtn_y.append(y) 37 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 38 | 39 | # Create training and test data 40 | x_train, y_train = noisy_sine_data(n=500, s=0.5) 41 | x_test = np.linspace(0, 1, 50).reshape(-1, 1) 42 | 43 | depth = 3 44 | my_model = MyDTreeRegressor(max_depth = depth) 45 | my_model.fit(x_train, y_train) 46 | my_pred = my_model.predict(x_test) 47 | 48 | # Plot the training data and draw the estimated curve. 49 | plot_prediction(x_train, y_train, x_test, my_pred, 'MyDTreeRegressor') 50 | 51 | # Compare with sklearn's DecisionTreeRegressor() results. 52 | # ------------------------------------------------------- 53 | sk_model = DecisionTreeRegressor(max_depth = depth) 54 | sk_model.fit(x_train, y_train) 55 | sk_pred = sk_model.predict(x_test) 56 | 57 | # Plot the training data and draw the estimated curve. 58 | plot_prediction(x_train, y_train, x_test, sk_pred, 'DecisionTreeRegressor') 59 | 60 | # Compare trees created by the two models. 
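# Side note: DecisionTreeRegressor splits on squared error by default: a split is
# scored by the size-weighted variance of y in the two children, and each leaf predicts
# the mean of its y values. Tiny illustration with a hypothetical leaf:
_y_leaf = np.array([2.0, 3.0, 4.0])
assert np.isclose(_y_leaf.mean(), 3.0)                                # leaf prediction
assert np.isclose(np.mean((_y_leaf - _y_leaf.mean()) ** 2), 2. / 3.)  # leaf MSE impurity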
61 | print('\nMyDTreeRegressor: estimator2:') 62 | pprint.pprint(my_model.estimator2, sort_dicts=False) 63 | 64 | plt.figure(figsize=(12,7)) 65 | tree.plot_tree(sk_model) 66 | plt.show() 67 | -------------------------------------------------------------------------------- /3.LinearRegression/1.scipy_opt(ols).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-02] 1.scipy_opt(ols).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/YBk1FS1vmv4 10 | # 11 | from scipy import optimize 12 | from sklearn.metrics import r2_score 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | # y = ax + b + Gaussian noise 17 | def reg_data(a, b, n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.normal(0.0, 0.5) 21 | y = a * x + b + np.random.normal(0.0, s) 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Generate 1,000 data points drawn from y = ax + b + noise 27 | # s : standard deviation of the noise distribution 28 | x, y = reg_data(a=0.5, b=0.3, n=1000, s=0.2) 29 | 30 | # y = w0 + w1*x1 + w2*x2 + ... → w0*x0 + w1*x1 + w2*x2 + ... (x0 = 1) 31 | # y = [w0, w1, w2, ...] * [x0, x1, x2, ...].T (T : transpose) 32 | # y = W * X.T 33 | X = np.hstack([np.ones([x.shape[0], 1]), x]) 34 | REG_CONST = 0.01 # regularization constant 35 | 36 | # Regularized loss function : Mean Squared Error 37 | def ols_loss(W, args): 38 | e = np.dot(W, X.T) - y 39 | mse = np.mean(np.square(e)) # mean squared error 40 | 41 | # We typically do not penalize the intercept term. 42 | loss = mse + REG_CONST * np.sum(np.square(W[1:])) 43 | 44 | # save W and loss 45 | if args[0] == True: 46 | trace_W.append([W, loss]) 47 | return loss 48 | 49 | # Perform optimization process 50 | trace_W = [] 51 | result = optimize.minimize(ols_loss, [-4., 4], args=[True]) 52 | print(result) 53 | 54 | # Plot the training data and draw the regression line. 55 | y_hat = np.dot(result.x, X.T) 56 | plt.figure(figsize=(6, 6)) 57 | plt.scatter(x, y, s=5, c='r') 58 | plt.plot(x, y_hat, c='blue') 59 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 60 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 61 | plt.show() 62 | 63 | # Draw the loss function and the path to the optimal point. 64 | m = 5 65 | t = 0.1 66 | w0, w1 = np.meshgrid(np.arange(-m, m, t), np.arange(-m, m, t)) 67 | zs = np.array([ols_loss([a,b], [False]) for [a, b] in zip(np.ravel(w0), np.ravel(w1))]) 68 | z = zs.reshape(w0.shape) 69 | 70 | fig = plt.figure(figsize=(7, 7)) 71 | ax = fig.add_subplot(111, projection='3d') 72 | 73 | # Draw the surface of the loss function 74 | ax.plot_surface(w0, w1, z, alpha=0.7) 75 | 76 | # Dwaw the path to the optimal point. 
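# Side note: with REG_CONST = 0 the optimizer should land on the closed-form
# normal-equation solution; the small ridge penalty above only shrinks W[1] slightly.
_w_closed = np.linalg.solve(X.T @ X, X.T @ y)   # unregularized OLS solution
# result.x is expected to be close to _w_closed.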
77 | b = np.array([tw0 for [tw0, tw1], td in trace_W]) 78 | w = np.array([tw1 for [tw0, tw1], td in trace_W]) 79 | d = np.array([td for [tw0, tw1], td in trace_W]) 80 | ax.plot(b, w, d, marker='o', color="r") 81 | 82 | ax.set_xlabel('W0 (bias)') 83 | ax.set_ylabel('W1 (slope)') 84 | ax.set_zlabel('distance') 85 | ax.azim = -50 86 | ax.elev = 50 87 | plt.show() 88 | 89 | # Check the R2 score 90 | sst = np.sum(np.square(y - np.mean(y))) # total sum of squares 91 | sse = np.sum(np.square(y - y_hat)) # sum of squares of error 92 | r2 = 1 - sse / sst 93 | print('\nR2 score = {:.4f}'.format(r2)) 94 | print('R2 score = {:.4f}'.format(r2_score(y, y_hat))) 95 | -------------------------------------------------------------------------------- /3.LinearRegression/10.ransac(2).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-07] 10.ransac(2).py 2 | # Implementing RANSAC using sklearn's RANSACRegressor. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/A2QnStjnlVE 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.linear_model import LinearRegression, RANSACRegressor 15 | from sklearn.metrics import r2_score 16 | 17 | # Generate n data samples with outliers. 18 | def reg_data_outlier(a, b, n, s, outlier_rate=0.1): 19 | n1 = int(n * outlier_rate) # the number of outliers 20 | n2 = n - n1 # the number of inliers 21 | 22 | # Generate normal data points (inliers) 23 | x2 = np.random.normal(0.0, 0.5, size=n2) 24 | y2 = a * x2 + b + np.random.normal(0.0, s, size=n2) 25 | 26 | # Generate abnormal data points (outliers) 27 | x1 = np.random.normal(0.5, 0.1, size=n1) 28 | y1 = a * x1 + b * 3 + np.abs(np.random.normal(0.0, s, size=n1)) 29 | 30 | x = np.hstack([x2, x1]).reshape(-1,1) 31 | y = np.hstack([y2, y1]) 32 | 33 | return x, y 34 | 35 | x, y = reg_data_outlier(a=0.5, b=0.3, n=1000, s=0.2, outlier_rate=0.2) 36 | 37 | # min_samples: 38 | # min_samples is chosen as X.shape[1] + 1. 39 | # stop_probability: 40 | # RANSAC iteration stops if at least one outlier-free set of the training 41 | # data is sampled in RANSAC. This requires to generate at least N samples 42 | # (iterations): 43 | # residual_threshold: 44 | # By default the threshold is chosen as the MAD (median absolute deviation) 45 | # of the target values y. 
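# Side note: the standard RANSAC iteration bound behind stop_probability is
#   N >= log(1 - p) / log(1 - w**min_samples),
# where p is the desired probability of drawing at least one outlier-free sample and
# w is the (assumed) inlier ratio. For example, with p = 0.99, w = 0.8, min_samples = 10:
_n_trials = np.log(1. - 0.99) / np.log(1. - 0.8 ** 10)   # about 40.5 iterations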
46 | model = RANSACRegressor(LinearRegression(), 47 | stop_probability = 0.99, # default 48 | residual_threshold = None, # default 49 | min_samples = 10) 50 | 51 | model.fit(x, y) 52 | 53 | w = model.estimator_.coef_ 54 | b = model.estimator_.intercept_ 55 | 56 | # Visually check the data and final regression line 57 | y_pred = model.predict(x) 58 | plt.figure(figsize=(6,5)) 59 | plt.scatter(x, y, s=5, c='r') 60 | plt.plot(x, y_pred, c='blue') 61 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 62 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 63 | plt.show() 64 | 65 | print('\nRANSAC results:') 66 | print('Regression line: y = {:.3f}x + {:.3f}'.format(w[0], b)) 67 | print('R2 score = {:.3f}'.format(r2_score(y, y_pred))) 68 | 69 | 70 | -------------------------------------------------------------------------------- /3.LinearRegression/11.boston(ransac).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-07] 11.boston(ransac).py 2 | # Predict the Boston house prices using RANSAC 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/A2QnStjnlVE 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.linear_model import RANSACRegressor, Ridge 15 | from sklearn.model_selection import train_test_split 16 | import pickle 17 | 18 | # Read Boston house price dataset 19 | with open('data/boston_house.pkl', 'rb') as f: 20 | data = pickle.load(f) 21 | x = data['data'] # shape = (506, 13) 22 | y = data['target'] # shape = (506,) 23 | x_train, x_test, y_train, y_test = train_test_split(x, y) 24 | 25 | # min_samples: 26 | # min_samples is chosen as X.shape[1] + 1. 27 | # stop_probability: 28 | # RANSAC iteration stops if at least one outlier-free set of the training 29 | # data is sampled in RANSAC. This requires to generate at least N samples 30 | # (iterations): 31 | # residual_threshold: 32 | # By default the threshold is chosen as the MAD (median absolute deviation) 33 | # of the target values y. 
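# Side note: residual_threshold = None means the threshold defaults to the median
# absolute deviation of the targets, i.e.
_mad = np.median(np.abs(y - np.median(y)))
# samples whose absolute residual exceeds this value count as outliers in each trial.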
34 | model = RANSACRegressor(Ridge(alpha=0.01), 35 | stop_probability = 0.99, # default 36 | residual_threshold = None, # default 37 | min_samples = 50) 38 | 39 | model.fit(x_train, y_train) 40 | 41 | # Visually check the actual and predicted prices 42 | y_pred = model.predict(x_test) 43 | plt.figure(figsize=(6, 5)) 44 | plt.scatter(y_test, y_pred, s=20, c='r') 45 | plt.xlabel('y_test') 46 | plt.ylabel('y_pred') 47 | plt.show() 48 | 49 | print('RANSAC R2 = {:.3f}'.format(model.score(x_test, y_test))) 50 | 51 | -------------------------------------------------------------------------------- /3.LinearRegression/2.boston(ols).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-03] 2.boston(ols).py 2 | # prediction of Boston house price 3 | # Applying Mean centering, Normalization, Ridge Regularization 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/gLekbL_pI1A 12 | # 13 | from scipy import optimize 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | import pandas as pd 17 | from sklearn.metrics import r2_score 18 | from sklearn.model_selection import train_test_split 19 | import pickle 20 | 21 | # Read Boston house price dataset 22 | with open('data/boston_house.pkl', 'rb') as f: 23 | data = pickle.load(f) 24 | 25 | x = data['data'] # shape = (506, 13) 26 | y = data['target'] # shape = (506,) 27 | 28 | # Split the dataset into training and test data 29 | x_train, x_test, y_train, y_test = train_test_split(x, y) 30 | REG_CONST = 0.01 # regularization constant 31 | 32 | # Mean centering & Normalization are performed on training data. 33 | x_offset = x_train.mean(axis=0) 34 | x_scale = x_train.std(axis=0) 35 | y_offset = y_train.mean() 36 | 37 | xm_train = (x_train - x_offset) / x_scale 38 | ym_train = y_train - y_offset 39 | 40 | # Regularized mean squared error loss function 41 | def ols_loss(W): 42 | # Calculating MSE using the training data 43 | d_train = np.dot(W, xm_train.T) - ym_train 44 | mse = np.mean(np.square(d_train)) 45 | loss = mse + REG_CONST * np.sum(np.square(W)) 46 | 47 | # Save the loss history. 48 | trc_loss.append(loss) 49 | return loss 50 | 51 | # Perform optimization process 52 | trc_loss = [] 53 | W0 = np.ones(xm_train.shape[1]) * 0.1 # W의 초깃값. 54 | result = optimize.minimize(ols_loss, W0) 55 | 56 | # Check the results 57 | print(result.success) # check if success = True 58 | print(result.message) 59 | 60 | # Visually check the regularized MSE of the training data. 61 | plt.figure(figsize=(6, 4)) 62 | plt.plot(trc_loss, label = 'loss_train') 63 | plt.legend() 64 | plt.xlabel('epochs') 65 | plt.show() 66 | 67 | # Convert result.x to the coef and the intercept 68 | # y_hat = coef * x + intercept 69 | coef = result.x / x_scale 70 | intercept = y_offset - np.dot(x_offset, coef.T) 71 | 72 | # Predict y values of the test data. 73 | y_pred = np.dot(coef, x_test.T) + intercept 74 | 75 | # Visually check the predicted and actual y values ​​of the test data. 76 | plt.figure(figsize=(6, 5)) 77 | plt.scatter(y_test, y_pred, s=20, c='r') 78 | plt.xlabel('y_test') 79 | plt.ylabel('y_pred') 80 | plt.show() 81 | 82 | df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}) 83 | print('\n', df.head(10)) 84 | 85 | # Check R2 score of the test data. 
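# Side note: r2_score is 1 - SSE/SST; the same value can be computed directly:
_sst = np.sum(np.square(y_test - np.mean(y_test)))   # total sum of squares
_sse = np.sum(np.square(y_test - y_pred))            # sum of squared errors
_r2_manual = 1. - _sse / _sst                        # should match r2_score below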
86 | print('\nR2 score = {:.4f}'.format(r2_score(y_test, y_pred))) 87 | 88 | -------------------------------------------------------------------------------- /3.LinearRegression/3.boston(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-03] 3.boston(sklearn).py 2 | # prediction of Boston house price 3 | # using sklear’s LinearRegression, Ridge, Lasso 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/gLekbL_pI1A 12 | # 13 | import matplotlib.pyplot as plt 14 | from sklearn.linear_model import LinearRegression, Ridge, Lasso 15 | from sklearn.model_selection import train_test_split 16 | import pickle 17 | 18 | # Read Boston house price dataset 19 | with open('data/boston_house.pkl', 'rb') as f: 20 | data = pickle.load(f) 21 | 22 | x = data['data'] # features, shape = (506, 13) 23 | y = data['target'] # target, shape = (506,) 24 | 25 | # Split the dataset into training and test data 26 | x_train, x_test, y_train, y_test = train_test_split(x, y) 27 | 28 | # 1. LinearRegression() 29 | # --------------------- 30 | model = LinearRegression() 31 | model.fit(x_train, y_train) 32 | y_pred = model.predict(x_test) 33 | 34 | # Visually check the predicted and actual y values ​​of the test data. 35 | plt.figure(figsize=(6, 5)) 36 | plt.scatter(y_test, y_pred, s=20, c='r') 37 | plt.xlabel('y_test') 38 | plt.ylabel('y_pred') 39 | plt.show() 40 | 41 | # 평가용 데이터의 R2를 확인한다. 42 | r2 = model.score(x_test, y_test) 43 | print('\nR2 (LinearRegression) = {:.3f}'.format(r2)) 44 | 45 | # 2. Ridge regularization 46 | # ----------------------- 47 | model = Ridge(alpha=0.01) 48 | model.fit(x_train, y_train) 49 | r2 = model.score(x_test, y_test) 50 | print('R2 (Ridge) = {:.3f}'.format(r2)) 51 | 52 | # 3. Lasso regularization 53 | # ----------------------- 54 | model = Lasso(alpha=0.01) 55 | model.fit(x_train, y_train) 56 | r2 = model.score(x_test, y_test) 57 | print('R2 (Lasso) = {:.3f}'.format(r2)) 58 | -------------------------------------------------------------------------------- /3.LinearRegression/4.scipy_opt(tls).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-04] 4.scipy_opt(tls).py 2 | # Implementation of TLS using scipy.optimize. Apply Ridge. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yDdbC9BhdwM 11 | # 12 | from scipy import optimize 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | # y = ax + b + Gaussian noise 17 | def reg_data(a, b, n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x = np.random.normal(0.0, 0.5) 21 | y = a * x + b + np.random.normal(0.0, s) 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Generate 1,000 data points drawn from y = ax + b + noise 27 | x, y = reg_data(a=0.5, b=0.3, n=1000, s=0.2) 28 | 29 | # y = w0 + w1*x1 + w2*x2 + ... → w0*x0 + w1*x1 + ... (x0 = 1) 30 | # y = [w0, w1, w2, ...] 
* [x0, x1, x2, ...].T (T : transpose) 31 | # y = W * X.T 32 | X = np.hstack([np.ones([x.shape[0], 1]), x]) 33 | REG_CONST = 0.01 # regularization constant 34 | 35 | # Cost function: square sum of the perpendicular distances 36 | # between data points and the regression line. 37 | def tls_loss(W, args): 38 | numerator = np.square(np.dot(W, X.T) - y) 39 | denominator = np.square(W[1]) + 1 40 | d2 = numerator / denominator 41 | msd = np.mean(d2) 42 | loss = msd + REG_CONST * np.sum(np.square(W)) 43 | 44 | # save W and loss history 45 | if args[0] == True: 46 | trace_W.append([W, loss]) 47 | 48 | return loss 49 | 50 | # Perform optimization process 51 | trace_W = [] 52 | result = optimize.minimize(tls_loss, [-4, 0.5], args=[True]) 53 | print(result) 54 | 55 | # Plot the training data and draw the regression line. 56 | y_hat = np.dot(result.x, X.T) 57 | plt.figure(figsize=(6,5)) 58 | plt.scatter(x, y, s=5, c='r') 59 | plt.plot(x, y_hat, c='blue') 60 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 61 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 62 | plt.show() 63 | 64 | # Draw the loss function and the path to the optimal point. 65 | m = 5 66 | t = 0.1 67 | w0, w1 = np.meshgrid(np.arange(-m, m, t), np.arange(-m, m, t)) 68 | zs = np.array([tls_loss([a,b], [False]) for [a, b] in zip(np.ravel(w0), np.ravel(w1))]) 69 | z = zs.reshape(w0.shape) 70 | 71 | fig = plt.figure(figsize=(10,10)) 72 | ax = fig.add_subplot(111, projection='3d') 73 | 74 | # Draw the surface of loss function 75 | ax.plot_surface(w0, w1, z, alpha=0.8) 76 | 77 | # Draw the path to the optimal point. 78 | b = np.array([tw0 for [tw0, tw1], td in trace_W[:50]]) 79 | w = np.array([tw1 for [tw0, tw1], td in trace_W[:50]]) 80 | d = np.array([td for [tw0, tw1], td in trace_W[:50]]) 81 | ax.plot(b, w, d, marker='o', color='red') 82 | 83 | ax.set_xlabel('W0 (bias)') 84 | ax.set_ylabel('W1 (slope)') 85 | ax.set_zlabel('distance') 86 | ax.azim = -50 87 | ax.elev = 50 88 | plt.show() 89 | 90 | # Check the R2 score 91 | sst = np.sum(np.square(y - np.mean(y))) # total sum of squares 92 | sse = np.sum(np.square(y - y_hat)) # sum of squares of residuals 93 | r2 = 1 - sse / sst 94 | print('\nR2 score = {:.4f}'.format(r2)) -------------------------------------------------------------------------------- /3.LinearRegression/5.boston(tls).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-04] 5.boston(tls).py 2 | # prediction of Boston house price by TLS 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yDdbC9BhdwM 11 | # 12 | from scipy import optimize 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.metrics import r2_score 17 | from sklearn.model_selection import train_test_split 18 | import pickle 19 | 20 | # Read Boston house price dataset 21 | with open('data/boston_house.pkl', 'rb') as f: 22 | data = pickle.load(f) 23 | x = data['data'] # shape = (506, 13) 24 | y = data['target'] # shape = (506,) 25 | x_train, x_test, y_train, y_test = train_test_split(x, y) 26 | 27 | # Apply mean-centering to the training data 28 | x_offset = x_train.mean(axis=0) 29 | y_offset = y_train.mean() 30 | 31 | xm_train = x_train - x_offset 32 | ym_train = y_train - y_offset 33 | 34 | # Apply Ridge regularization 35 | REG_CONST = 0.01 36 | 37 | # Cost function for OLS 
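# Side note: OLS measures the vertical error (W·x - y)^2, whereas TLS measures the
# squared perpendicular distance from each (mean-centered) point to the hyperplane,
#   d_i^2 = (W·x_i - y_i)^2 / (||W||^2 + 1),
# which is exactly the numerator / denominator used in tls_loss below.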
38 | def ols_loss(W): 39 | err = np.dot(W, xm_train.T) - ym_train 40 | mse = np.sqrt(np.mean(np.square(err))) 41 | loss = mse + REG_CONST * np.sum(np.square(W)) 42 | return loss 43 | 44 | # Cost function for TLS 45 | def tls_loss(W): 46 | numerator = np.square(np.dot(W, xm_train.T) - ym_train) 47 | denominator = np.sum(np.square(W)) + 1 48 | d2 = numerator / denominator 49 | msd = np.sqrt(np.mean(d2)) 50 | loss = msd + REG_CONST * np.sum(np.square(W)) 51 | 52 | # save loss history 53 | trc_loss_train.append(loss) 54 | return loss 55 | 56 | # Perform optimization process 57 | trc_loss_train = [] 58 | 59 | # Perform OLS 60 | W0 = np.array([1.0] * x_train.shape[1]) # W의 초깃값 61 | result = optimize.minimize(ols_loss, W0) 62 | 63 | # Perform TLS 64 | # The optimal W found by OLS is used as the initial value of TLS. 65 | W0 = result.x 66 | result = optimize.minimize(tls_loss, W0) 67 | print(result.success) # check if success = True 68 | print(result.message) 69 | 70 | # Check the loss history 71 | plt.figure(figsize=(6, 4)) 72 | plt.plot(trc_loss_train, label = 'loss_train') 73 | plt.legend() 74 | plt.xlabel('epochs') 75 | plt.show() 76 | 77 | # y_hat = coef * x + intercept 78 | coef = result.x 79 | intercept = y_offset - np.dot(x_offset, coef.T) 80 | 81 | # Predict the y values of the test data 82 | y_pred = np.dot(coef, x_test.T) + intercept 83 | 84 | # Visually check the actual and predicted y values ​​of the test data. 85 | plt.figure(figsize=(6, 5)) 86 | plt.scatter(y_test, y_pred, s=20, c='r') 87 | plt.xlabel('y_test') 88 | plt.ylabel('y_pred') 89 | plt.show() 90 | 91 | df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}) 92 | print('\n', df.head(10)) 93 | 94 | # Check the R2 score 95 | print('\nTLS R2 score = {:.4f}'.format(r2_score(y_test, y_pred))) 96 | 97 | # Check the R2 score from OLS 98 | ols_coef = W0 99 | ols_icept = y_offset - np.dot(x_offset, ols_coef.T) 100 | y_ols_pred = np.dot(ols_coef, x_test.T) + ols_icept 101 | print('OLS R2 score = {:.4f}'.format(r2_score(y_test, y_ols_pred))) 102 | 103 | -------------------------------------------------------------------------------- /3.LinearRegression/6.lwr(scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-5] 6.lwr(scipy).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/d1-QS4uTgj8 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from scipy import optimize 14 | from sklearn.model_selection import train_test_split 15 | 16 | # Generate sinusoidal data with Gaussian noise added. 17 | def noisy_sine_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x= np.random.random() 21 | y= 2.0*np.sin(2.0*np.pi*x)+np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Create 1,000 data points for LWR testing. 
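# Side note: locally weighted regression weights each training point by a Gaussian
# kernel of its distance to the query point x_q,
#   w_i = exp(-||x_i - x_q||^2 / (2 * tau^2)),
# so a point one bandwidth (tau) away gets weight exp(-0.5) ≈ 0.61, and points a few
# bandwidths away contribute almost nothing to the local fit.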
27 | x, y = noisy_sine_data(n=1000, s=0.7) 28 | x_train, x_test, y_train, y_test = train_test_split(x, y) 29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | 31 | # Visualize the training and test data 32 | plt.figure(figsize=(6,5)) 33 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 34 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 35 | label='test') 36 | plt.legend() 37 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 38 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 39 | plt.show() 40 | 41 | # Find the weight for each data point. 42 | # train: training data, test: test data point to be predicted 43 | def get_weight(train, test, tau): 44 | d2 = np.sum(np.square(train - test), axis=1) 45 | w = np.exp(-d2 / (2. * tau * tau)) 46 | return w 47 | 48 | # Weighted cost function 49 | def lwr_loss(W, weight): 50 | d = np.dot(W, x1_train.T) - y_train 51 | wmsd = np.mean(weight * np.square(d)) 52 | return wmsd 53 | 54 | y_pred = [] 55 | for tx in x_test: 56 | weight = get_weight(x_train, tx, 0.05) 57 | result = optimize.minimize(lwr_loss, [0.1, 0.1], args=weight) 58 | y_pred.append(np.dot(result.x[1], tx) + result.x[0]) 59 | y_pred = np.array(y_pred).reshape(-1,) 60 | 61 | # Visualize the predicted results 62 | plt.figure(figsize=(6,5)) 63 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 64 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 65 | label='test') 66 | plt.scatter(x_test, y_pred, s=5, c='red', label='prediction') 67 | plt.legend() 68 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 69 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 70 | plt.show() 71 | -------------------------------------------------------------------------------- /3.LinearRegression/7.lwr(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-5] 7.lwr(sklearn).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/d1-QS4uTgj8 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.linear_model import Ridge 14 | from sklearn.model_selection import train_test_split 15 | 16 | # Generate sinusoidal data with Gaussian noise added. 17 | def noisy_sine_data(n, s): 18 | rtn_x, rtn_y = [], [] 19 | for i in range(n): 20 | x= np.random.random() 21 | y= 2.0*np.sin(2.0*np.pi*x)+np.random.normal(0.0, s) + 3.0 22 | rtn_x.append(x) 23 | rtn_y.append(y) 24 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 25 | 26 | # Create 1,000 data points for LWR testing. 27 | x, y = noisy_sine_data(n=1000, s=0.7) 28 | x_train, x_test, y_train, y_test = train_test_split(x, y) 29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | 31 | # Visualize the training and test data 32 | plt.figure(figsize=(6, 5)) 33 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 34 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 35 | label='test') 36 | plt.legend() 37 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 38 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 39 | plt.show() 40 | 41 | # Find the weight for each data point. 42 | # train: training data, test: test data point to be predicted 43 | def get_weight(train, test, tau): 44 | d2 = np.sum(np.square(train - test), axis=1) 45 | w = np.exp(-d2 / (2. 
* tau * tau)) 46 | return w 47 | 48 | # predict the target value of the test data 49 | y_pred = [] 50 | for tx in x_test: 51 | weight = get_weight(x_train, tx, 0.05) 52 | model = Ridge(alpha=0.01) 53 | model.fit(x_train, y_train, sample_weight = weight) 54 | y_pred.append(model.predict(tx.reshape(-1,1))[0]) 55 | y_pred = np.array(y_pred).reshape(-1,) 56 | 57 | # Visualize the predicted results 58 | plt.figure(figsize=(6, 5)) 59 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 60 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', 61 | label='test') 62 | plt.scatter(x_test, y_pred, s=5, c='red', label='prediction') 63 | plt.legend() 64 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 65 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 66 | plt.show() 67 | -------------------------------------------------------------------------------- /3.LinearRegression/8.boston(lwr).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-5] 8.bostn(lwr).py 2 | # Predicting the Boston house price using LWR 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d1-QS4uTgj8 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | import matplotlib.pyplot as plt 15 | from sklearn.linear_model import Ridge 16 | from sklearn.metrics import r2_score 17 | from sklearn.model_selection import train_test_split 18 | import pickle 19 | 20 | # Read saved dataset 21 | with open('data/boston_house.pkl', 'rb') as f: 22 | data = pickle.load(f) 23 | x = data['data'] # shape = (506, 13) 24 | y = data['target'] # shape = (506,) 25 | x_train, x_test, y_train, y_test = train_test_split(x, y) 26 | 27 | # Find the weight for each data point. 28 | # train: training data, test: test data point to be predicted 29 | def get_weight(train, test, tau): 30 | d2 = np.sum(np.square(train - test), axis=1) 31 | w = np.exp(-d2 / (2. * tau * tau)) 32 | return w 33 | 34 | y_pred = [] 35 | for tx in x_test: 36 | weight = get_weight(x_train, tx, 50.0) 37 | model = Ridge(alpha=0.01) 38 | model.fit(x_train, y_train, sample_weight = weight) 39 | y_pred.append(model.predict(tx.reshape(1, -1))[0]) 40 | 41 | y_pred = np.array(y_pred).reshape(-1,) 42 | 43 | # Visually check the actual and predicted y values ​​of the test data. 44 | plt.figure(figsize=(6, 5)) 45 | plt.scatter(y_test, y_pred, s=10, c='r') 46 | plt.xlabel('y_test') 47 | plt.ylabel('y_pred') 48 | plt.show() 49 | 50 | print('\nR2 (LWR) = {:.3f}'.format(r2_score(y_test, y_pred))) 51 | -------------------------------------------------------------------------------- /3.LinearRegression/9.ransac(1).py: -------------------------------------------------------------------------------- 1 | # [MXML-3-07] 9.ransac(1).py 2 | # Implementing RANSAC from scratch 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/A2QnStjnlVE 11 | # 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from sklearn.linear_model import LinearRegression 15 | from sklearn.metrics import r2_score 16 | 17 | # Generate n data samples with outliers. 
18 | def reg_data_outlier(a, b, n, s, outlier_rate=0.1): 19 | n1 = int(n * outlier_rate) # the number of outliers 20 | n2 = n - n1 # the number of inliers 21 | 22 | # Generate normal data points (inliers) 23 | x2 = np.random.normal(0.0, 0.5, size=n2) 24 | y2 = a * x2 + b + np.random.normal(0.0, s, size=n2) 25 | 26 | # Generate abnormal data points (outliers) 27 | x1 = np.random.normal(0.5, 0.1, size=n1) 28 | y1 = a * x1 + b * 3 + np.abs(np.random.normal(0.0, s, size=n1)) 29 | 30 | x = np.hstack([x2, x1]).reshape(-1,1) 31 | y = np.hstack([y2, y1]) 32 | 33 | return x, y 34 | 35 | x, y = reg_data_outlier(a=0.5, b=0.3, n=1000, s=0.2, outlier_rate=0.2) 36 | 37 | # 1. OLS 38 | model = LinearRegression() 39 | result = model.fit(x.reshape(-1,1), y) 40 | 41 | # Visualize the data and regression line 42 | w = result.coef_ 43 | b = result.intercept_ 44 | y_hat = np.dot(w, x.T) + b 45 | 46 | plt.figure(figsize=(6,5)) 47 | plt.scatter(x, y, s=5, c='r') 48 | plt.plot(x, y_hat, c='blue') 49 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 50 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 51 | plt.show() 52 | 53 | print('\nOLS results:') 54 | print('Regression line: y = {:.3f}x + {:.3f}'.format(w[0], b)) 55 | print('R2 score = {:.3f}'.format(r2_score(y, y_hat))) 56 | 57 | # RANSAC 58 | n_sample =10 # the number of samples chosen randomly from original data 59 | z_prob = 0.99 # the probability z 60 | w_prob = 0.8 # the probability w 61 | 62 | # The maximum number of attempts to find a consensus set 63 | k_maxiter = int(np.log(1.0 - z_prob) / np.log(1.0 - w_prob ** n_sample)) 64 | 65 | # RANSACRegressor/residual_threshold: 66 | # the threshold is chosen as the MAD (median absolute deviation) of the 67 | # target values y 68 | threshold = np.median(np.abs(y - np.median(y))) 69 | 70 | ransac_w = 0 # slope 71 | ransac_b = 0 # intercept 72 | ransac_c = 0 # count within the error tolerance 73 | for i in range(k_maxiter): 74 | # sampling without replacement 75 | idx = np.random.choice(np.arange(0, x.shape[0]-1), n_sample, replace=False) 76 | xs = x[idx] 77 | ys = y[idx] 78 | 79 | # OLS Regression 80 | model = LinearRegression() 81 | result = model.fit(xs, ys) 82 | 83 | # Calculate the absolute value of residuals. 84 | y_pred = np.dot(result.coef_, x.T) + result.intercept_ 85 | residual = np.abs(y - y_pred) 86 | 87 | # Count the number of times the residual is less than the threshold. 88 | count = (residual < threshold).sum() 89 | 90 | # Find the regression line where the count is largest. 
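# Side note: sklearn's RANSACRegressor additionally refits the final estimator on all
# inliers of the best trial, while this scratch version keeps the coefficients fitted
# on the winning 10-point sample. An illustrative final refit, which would go after the
# loop (shown commented out):
#   best_inliers = np.abs(y - (np.dot(ransac_w, x.T) + ransac_b)) < threshold
#   LinearRegression().fit(x[best_inliers], y[best_inliers])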
91 | if count > ransac_c: 92 | ransac_c = count 93 | ransac_w = result.coef_ 94 | ransac_b = result.intercept_ 95 | 96 | y_pred = np.dot(ransac_w, x.T) + ransac_b 97 | 98 | # Visually check the data and final regression line 99 | plt.figure(figsize=(6,5)) 100 | plt.scatter(x, y, s=5, c='r') 101 | plt.plot(x, y_pred, c='blue') 102 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 103 | plt.axhline(y=0, ls='--', lw=0.5, c='black') 104 | plt.show() 105 | 106 | print('\nRANSAC results:') 107 | print('The maximum number of k = {}'.format(k_maxiter)) 108 | print('Threshold = {:.3f}'.format(threshold)) 109 | print('Regression line: y = {:.3f}x + {:.3f}'.format(ransac_w[0], ransac_b)) 110 | print('R2 score = {:.3f}'.format(r2_score(y, y_pred))) 111 | -------------------------------------------------------------------------------- /3.LinearRegression/data/boston_house.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/meanxai/machine_learning/fba47e91cc7449eb5d7ea8b7ec1fb0fd616ebd71/3.LinearRegression/data/boston_house.pkl -------------------------------------------------------------------------------- /3.LinearRegression/data/wls_sample_data.csv: -------------------------------------------------------------------------------- 1 | state,y,x1,x2,x3,region 2 | ME,235,3944,325,508,1 3 | NH,231,4578,323,564,1 4 | VT,270,4011,328,322,1 5 | MA,261,5233,305,846,1 6 | RI,300,4780,303,871,1 7 | CT,317,5889,307,774,1 8 | NY,387,5663,301,856,1 9 | NJ,285,5759,310,889,1 10 | PA,300,4894,300,715,1 11 | OH,221,5012,324,753,2 12 | IN,264,4908,329,649,2 13 | IL,308,5753,320,830,2 14 | MI,379,5439,337,738,2 15 | WI,342,4634,328,659,2 16 | MN,378,4921,330,664,2 17 | IA,232,4869,318,572,2 18 | MO,231,4672,309,701,2 19 | ND,246,4782,333,443,2 20 | SD,230,4296,330,446,2 21 | NB,268,4827,318,615,2 22 | KS,337,5057,304,661,2 23 | DE,344,5540,328,722,3 24 | MD,330,5331,323,766,3 25 | VA,261,4715,317,631,3 26 | WV,214,3828,310,390,3 27 | NC,245,4120,321,450,3 28 | SC,233,3817,342,476,3 29 | GA,250,4243,339,603,3 30 | FL,243,4647,287,805,3 31 | KY,216,3967,325,523,3 32 | TN,212,3946,315,588,3 33 | AL,208,3724,332,584,3 34 | MS,215,3448,358,445,3 35 | AR,221,3680,320,500,3 36 | LA,244,3825,355,661,3 37 | OK,234,4189,306,680,3 38 | TX,269,4336,335,797,3 39 | MT,302,4418,335,534,4 40 | ID,268,4323,344,541,4 41 | WY,323,4813,331,605,4 42 | CO,304,5046,324,785,4 43 | NM,317,3764,366,698,4 44 | AZ,332,4504,340,796,4 45 | UT,315,4005,378,804,4 46 | NV,291,5560,330,809,4 47 | WA,312,4989,313,726,4 48 | OR,316,4697,305,671,4 49 | CA,332,5438,307,909,4 50 | AK,546,5613,386,484,4 51 | HI,311,5309,333,831,4 52 | -------------------------------------------------------------------------------- /4.LogisticRegression/1.bin_class(scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 1.bin_class(scipy).pyt 2 | # Logistic Regression : binary classification 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | from scipy import optimize 13 | import numpy as np 14 | from sklearn.model_selection import train_test_split 15 | import matplotlib.pyplot as plt 16 | 17 | # Create a simple dataset for binary classification 18 | def bin_class_data(n): 19 | n1 = int(n / 2) 20 | a = np.random.normal(-1.0, 1.0, n1) 21 
| b = np.random.normal(1.0, 1.0, n1) 22 | x = np.hstack([a, b]).reshape(-1, 1) 23 | y = np.hstack([np.zeros(n1), np.ones(n1)]) 24 | return x, y 25 | 26 | x, y = bin_class_data(n=1000) # create 1000 data points 27 | X = np.hstack([np.ones([x.shape[0], 1]), x]) 28 | y = y.astype('int8') 29 | 30 | # Visually check the data 31 | plt.scatter(x, y, c='r', s=10, alpha=0.5) 32 | plt.show() 33 | 34 | # Split the data into training and test data 35 | x_train, x_test, y_train, y_test = train_test_split(X, y) 36 | 37 | # Loss function : mean of binary cross entropy 38 | def bce_loss(W, args): 39 | tx = args[0] 40 | ty = args[1] 41 | trc = args[2] 42 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, tx.T))) 43 | bce = -ty * np.log(y_hat + 1e-8) - (1.0 - ty) * np.log(1.0 - y_hat + 1e-8) 44 | loss = bce.mean() 45 | 46 | # save the loss 47 | if trc == True: 48 | trace_W.append([W, loss]) 49 | return loss 50 | 51 | # Perform an optimization process 52 | trace_W = [] 53 | result = optimize.minimize(fun = bce_loss, 54 | x0 = [-5, 15], 55 | args=[x_train, y_train, True]) 56 | 57 | # print the result. result.x contains the optimal parameters 58 | print(result) 59 | 60 | # Visually check the data and the predicted regression curves 61 | y_hat = 1.0 / (1 + np.exp(-np.dot(result.x, x_train.T))) 62 | plt.figure(figsize=(5, 4)) 63 | plt.scatter(x, y, s=5, c='r', label = 'data') 64 | plt.scatter(x_train[:, 1], y_hat, c='blue', s=1, label = 'sigmoid') 65 | plt.legend() 66 | plt.axhline(y = 0.5, linestyle='--', linewidth=0.5) 67 | plt.show() 68 | 69 | # Measure the accuracy of test data 70 | y_prob = 1.0 / (1 + np.exp(-np.dot(result.x, x_test.T))) 71 | y_pred = (y_prob > 0.5).astype('int8') 72 | acc = (y_pred == y_test).mean() 73 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 74 | 75 | # Visually check the loss function and the path to the optimal point 76 | w0, w1 = np.meshgrid(np.arange(-20, 20, 1), np.arange(-5, 20, 1)) 77 | zs = np.array([bce_loss(np.array([a, b]), [x_train, y_train, False]) \ 78 | for [a, b] in zip(np.ravel(w0), np.ravel(w1))]) 79 | z = zs.reshape(w0.shape) 80 | 81 | fig = plt.figure(figsize=(10,10)) 82 | ax = fig.add_subplot(111, projection='3d') 83 | 84 | # Drawing the surface of the loss function 85 | ax.plot_surface(w0, w1, z, alpha=0.7) 86 | 87 | # Drawing the path to the optimal point 88 | b = np.array([tw0 for [tw0, tw1], td in trace_W]) 89 | w = np.array([tw1 for [tw0, tw1], td in trace_W]) 90 | d = np.array([td for [tw0, tw1], td in trace_W]) 91 | ax.plot(b[0], w[0], d[0], marker='x', markersize=15, color="r") 92 | ax.plot(b[-1], w[-1], d[-1], marker='*', markersize=20, color="r") 93 | ax.plot(b, w, d, marker='o', color="r") 94 | 95 | ax.set_xlabel('W0 (bias)') 96 | ax.set_ylabel('W1 (slope)') 97 | ax.set_zlabel('cross entropy') 98 | ax.azim = 50 99 | ax.elev = 50 # [50, 0] 100 | plt.show() 101 | 102 | # Visually see that the loss decreases as the iteration progresses 103 | plt.figure(figsize=(5, 4)) 104 | plt.plot([e for w, e in trace_W], color='red') 105 | plt.title('train loss') 106 | plt.xlabel('epoch') 107 | plt.ylabel('loss') 108 | plt.show() 109 | -------------------------------------------------------------------------------- /4.LogisticRegression/10.lwlr(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-05] 10.lwlr(sklearn).py 2 | # Use the sample_weight argument in sklearn's LogisticRegression model. 
3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d1-QS4uTgj8 11 | # 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.model_selection import train_test_split 16 | 17 | # Generate a simple dataset 18 | def lwlr_data1(n): 19 | n1 = int(n / 3) 20 | a = np.random.normal(-1.0, 0.5, n1) 21 | b = np.random.normal(1.0, 0.5, n1) 22 | c = np.random.normal(3.0, 0.5, n - n1 * 2) 23 | x = np.hstack([a, b, c]).reshape(-1, 1) 24 | y = np.hstack([np.zeros(n1), np.ones(n1), np.zeros(n - n1 * 2)]) 25 | return x, y 26 | 27 | # Generate training and test data 28 | x, y = lwlr_data1(n=2000) 29 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2) 30 | 31 | # Visualize the dataset 32 | plt.figure(figsize=(6, 3)) 33 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 34 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', label='test') 35 | plt.legend() 36 | plt.show() 37 | 38 | # Calculating the weights of training data points 39 | # xx : training data, tx : test data 40 | def get_weight(xx, tx, tau): 41 | distance = np.sum(np.square(xx - tx), axis=1) 42 | w = np.exp(-distance / (2 * tau * tau)) 43 | return w 44 | 45 | y_prob = [] 46 | for tx in x_test: 47 | weight = get_weight(x_train, tx, 0.6) 48 | model = LogisticRegression() 49 | model.fit(x_train, y_train, sample_weight = weight) 50 | y_prob.append(model.predict_proba(tx.reshape(-1, 1))[:, 1]) 51 | 52 | y_prob = np.array(y_prob).reshape(-1,) 53 | 54 | # Visually check the training and test data, and predicted probability.
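# How local each fit is depends on tau. A quick illustrative sketch with
# made-up distances: a training point at Euclidean distance d from the query
# point gets weight exp(-d^2 / (2 * tau^2)), so with tau = 0.6 the weight
# drops off sharply beyond roughly one tau.
for d in [0.0, 0.6, 1.2, 2.4]:
    print('d = {:.1f} -> weight = {:.3f}'.format(d, np.exp(-d**2 / (2 * 0.6**2))))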
55 | plt.figure(figsize=(6, 3)) 56 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 57 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', label='test') 58 | plt.scatter(x_test, y_prob, s=5, c='red', label='prediction') 59 | plt.legend() 60 | plt.axhline(y=0.5, ls='--', lw=0.5, c='black') 61 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 62 | plt.axvline(x=2, ls='--', lw=0.5, c='black') 63 | plt.show() 64 | 65 | # Measure the accuracy of the test data 66 | y_pred = (y_prob > 0.5).astype('int8') 67 | acc = (y_pred == y_test).mean() 68 | print('\nAccuracy of the test data = {:.3f}'.format(acc)) 69 | -------------------------------------------------------------------------------- /4.LogisticRegression/11.lwlr_2(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-05] 11.lwlr_2(sklearn).py 2 | # Check the non-linear decision boundary 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d1-QS4uTgj8 11 | # 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from matplotlib.colors import ListedColormap 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.model_selection import train_test_split 17 | 18 | # Generate a simple dataset 19 | def lwlr_data2(n, s): 20 | n1 = int(n / 3) 21 | x, y = [], [] 22 | for a, b, c, m in [(1, 1, 0, n1), 23 | (2, 2, 1, n-n1*2), (3, 1, 0, n1)]: 24 | x1 = np.random.normal(a, s, m).reshape(-1,1) 25 | x2 = np.random.normal(b, s, m).reshape(-1,1) 26 | x.extend(np.hstack([x1, x2])) 27 | y.extend(np.ones(m) * c) 28 | x = np.array(x).reshape(-1, 2) 29 | y = np.array(y).astype('int8').reshape(-1, 1) 30 | return x, y.reshape(-1,) 31 | x, y = lwlr_data2(n=1000, s=0.5) 32 | 33 | # Visually check the data distribution. 
34 | m = ['o', '^'] 35 | color = ['red', 'blue'] 36 | plt.figure(figsize=(5,5)) 37 | for i in [0, 1]: 38 | idx = np.where(y == i) 39 | plt.scatter(x[idx, 0], x[idx, 1], 40 | c=color[i], 41 | marker = m[i], 42 | s = 20, 43 | edgecolor = 'black', 44 | alpha = 0.5, 45 | label='class-'+str(i)) 46 | plt.legend() 47 | plt.show() 48 | 49 | # Split the data into the training and test data 50 | x_train, x_test, y_train, y_test = train_test_split(x, y) 51 | 52 | # Calculating the weights of training data points 53 | # xx : training data, xx : test data 54 | def get_weight(xx, tx, tau): 55 | distance = np.sum(np.square(xx - tx), axis=1) 56 | w = np.exp(-distance / (2 * tau * tau)) 57 | return w 58 | 59 | # Predict the classes of the test data 60 | y_prob = [] 61 | tau = 0.1 62 | for tx in x_test: 63 | weight = get_weight(x_train, tx, tau) 64 | model = LogisticRegression() 65 | model.fit(x_train, y_train, sample_weight = weight) 66 | y_prob.append(model.predict_proba(tx.reshape(-1, 2))[:, 1]) 67 | y_prob = np.array(y_prob).reshape(-1,) 68 | 69 | # Measure the accuracy of the test data 70 | y_pred = (y_prob > 0.5).astype('int8') 71 | acc = (y_pred == y_test).mean() 72 | print('\nAccuracy of the test data = {:.3f}'.format(acc)) 73 | 74 | # Visualize the non-linear decision boundary 75 | # reference : 76 | # https://psrivasin.medium.com/ 77 | # plotting-decision-boundaries-using-numpy-and-matplotlib-f5613d8acd19 78 | x_min, x_max = x_test[:, 0].min() - 0.1, x_test[:,0].max() + 0.1 79 | y_min, y_max = x_test[:, 1].min() - 0.1, x_test[:, 1].max() + 0.1 80 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50), 81 | np.linspace(y_min, y_max, 50)) 82 | x_in = np.c_[xx.ravel(), yy.ravel()] 83 | 84 | # Predict the classes of the data points in the x_in variable. 85 | y_prob = [] 86 | for tx in x_in: 87 | weight = get_weight(x_train, tx, tau) 88 | 89 | model = LogisticRegression() 90 | model.fit(x_train, y_train, sample_weight = weight) 91 | y_prob.append(model.predict_proba(tx.reshape(-1, 2))[:, 1]) 92 | y_prob = np.array(y_prob).reshape(-1,) 93 | y_pred = (y_prob > 0.5).astype('int8') 94 | 95 | # Draw the decision boundary 96 | y_pred = np.round(y_pred).reshape(xx.shape) 97 | 98 | plt.figure(figsize=(5, 5)) 99 | for i in [0, 1]: 100 | idx = np.where(y == i) 101 | plt.scatter(x[idx, 0], x[idx, 1], 102 | c=color[i], 103 | marker = m[i], 104 | s = 40, 105 | edgecolor = 'black', 106 | alpha = 0.5, 107 | label='class-' + str(i)) 108 | plt.contour(xx, yy, y_pred, cmap=ListedColormap(['red', 'blue']), alpha=0.5) 109 | plt.axis('tight') 110 | plt.xlim(xx.min(), xx.max()) 111 | plt.ylim(yy.min(), yy.max()) 112 | plt.xlabel('x1') 113 | plt.ylabel('x2') 114 | plt.legend() 115 | plt.show() 116 | -------------------------------------------------------------------------------- /4.LogisticRegression/2.bin_class(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 2.bin_class(sklearn).py 2 | # Logistic Regression : binary classification 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | import numpy as np 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.datasets import make_blobs 16 | import matplotlib.pyplot as plt 17 | from matplotlib.colors import ListedColormap 
18 | 19 | # Create simple training data 20 | x, y = make_blobs(n_samples=1000, n_features=2, 21 | centers=[[1., 1.], [2., 2.]], 22 | cluster_std=0.5) 23 | 24 | # Visually check the data 25 | color = ['red', 'blue'] 26 | for i in [0, 1]: 27 | idx = np.where(y == i) 28 | plt.scatter(x[idx, 0], x[idx, 1], c=color[i], s = 10, 29 | alpha = 0.5, label='class-'+str(i)) 30 | plt.legend() 31 | plt.show() 32 | 33 | # Split the data into training and test data 34 | x_train, x_test, y_train, y_test = train_test_split(x, y) 35 | 36 | # Create a model and fit it to training data. 37 | model = LogisticRegression() 38 | model.fit(x_train, y_train) 39 | 40 | # Predict the classes of test data, and measure the accuracy. 41 | y_pred = model.predict(x_test) 42 | acc = (y_pred == y_test).mean() 43 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 44 | 45 | # Visually check the decision boundary. 46 | # reference : 47 | # https://psrivasin.medium.com/ 48 | # plotting-decision-boundaries-using-numpy-and-matplotlib-f5613d8acd19 49 | x1_min, x1_max = x_test[:, 0].min() - 0.1, x_test[:,0].max() + 0.1 50 | y1_min, y1_max = x_test[:, 1].min() - 0.1, x_test[:, 1].max() + 0.1 51 | x1, x2 = np.meshgrid(np.linspace(x1_min, x1_max, 100), 52 | np.linspace(y1_min, y1_max, 100)) 53 | x_in = np.c_[x1.ravel(), x2.ravel()] # shape = (10000, 2) 54 | 55 | # Predict all the data points in the meshgrid area. 56 | y_pred = model.predict(x_in) 57 | 58 | # Drawing the data and decision boundary 59 | y_pred = y_pred.reshape(x1.shape) # shape = (100, 100) 60 | 61 | plt.figure(figsize=(5,5)) 62 | m = ['o', '^'] 63 | color = ['red', 'blue'] 64 | for i in [0, 1]: 65 | idx = np.where(y == i) 66 | plt.scatter(x[idx, 0], x[idx, 1], 67 | c=color[i], 68 | marker = m[i], 69 | s = 40, 70 | edgecolor = 'black', 71 | alpha = 0.5, 72 | label='class-'+str(i)) 73 | plt.contour(x1, x2, y_pred, cmap=ListedColormap(['red', 'blue']), alpha=0.5) 74 | 75 | plt.axis('tight') 76 | plt.xlim(x1.min(), x1.max()) 77 | plt.ylim(x2.min(), x2.max()) 78 | plt.xlabel('x1') 79 | plt.ylabel('x2') 80 | plt.legend() 81 | plt.show() 82 | -------------------------------------------------------------------------------- /4.LogisticRegression/3.bin_class(scipy_cancer).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 3.bin_class(scipy_cancer).py 2 | # Breast cancer dataset 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | from scipy import optimize 13 | import numpy as np 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.datasets import load_breast_cancer 16 | import matplotlib.pyplot as plt 17 | 18 | # Read breast cancer dataset 19 | x, y = load_breast_cancer(return_X_y=True) 20 | 21 | # Split the data into training and test data 22 | x_train, x_test, y_train, y_test = train_test_split(x, y) 23 | 24 | # Z-score normalization 25 | # When normalzing the test data, use the mean and standard deviation 26 | # from the training data. 27 | x_mean = x_train.mean(axis=0).reshape(1, -1) 28 | x_std = x_train.std(axis=0).reshape(1, -1) 29 | x_train = (x_train - x_mean) / x_std 30 | x_test = (x_test - x_mean) / x_std 31 | 32 | # Add a column vector with all 1 to the feature matrix. 33 | # [0.3, 0.4, ...] --> [1.0, 0.3, 0.4, ...] 34 | # [0.1, 0.5, ...] --> [1.0, 0.1, 0.5, ...] 
35 | # [ ...] 36 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 37 | x1_test = np.hstack([np.ones([x_test.shape[0], 1]), x_test]) 38 | 39 | REG_CONST = 0.01 # regularization constant 40 | 41 | # Loss function : mean of binary cross entropy 42 | def bce_loss(W, args): 43 | train_x = args[0] 44 | train_y = args[1] 45 | test_x = args[2] 46 | test_y = args[3] 47 | 48 | # Calculate the loss of training data 49 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, train_x.T))) 50 | train_bce = -train_y * np.log(y_hat + 1e-10) - (1.0 - train_y) * np.log(1.0 - y_hat + 1e-10) 51 | train_loss = train_bce.mean() + REG_CONST * np.mean(np.square(W)) 52 | 53 | # Calculate the loss of test data 54 | # It is independent of training and is measured later to observe changes in loss. 55 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, test_x.T))) 56 | test_bce = -test_y * np.log(y_hat + 1e-10) - (1.0 - test_y) * np.log(1.0 - y_hat + 1e-10) 57 | test_loss = test_bce.mean() + REG_CONST * np.mean(np.square(W)) 58 | 59 | # Save the loss 60 | trc_train_loss.append(train_loss) 61 | trc_test_loss.append(test_loss) 62 | 63 | return train_loss 64 | 65 | # Perform an optimization process 66 | trc_train_loss = [] 67 | trc_test_loss = [] 68 | init_w = np.ones(x1_train.shape[1]) * 0.1 69 | result = optimize.minimize(fun = bce_loss, 70 | x0 = init_w, 71 | args=[x1_train, y_train, x1_test, y_test]) 72 | 73 | # print the result. result.x contains the optimal parameters 74 | print(result) 75 | 76 | # Measure the accuracy of test data 77 | y_prob = 1.0 / (1 + np.exp(-np.dot(result.x, x1_test.T))) 78 | y_pred = (y_prob > 0.5).astype('int8') 79 | acc = (y_pred == y_test).mean() 80 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 81 | 82 | # Visually see that the loss decreases as the iteration progresses 83 | plt.figure(figsize=(5, 4)) 84 | plt.plot(trc_train_loss, color='blue', label='train loss') 85 | plt.plot(trc_test_loss, color='red', label='test loss') 86 | plt.legend() 87 | plt.title('Loss history') 88 | plt.xlabel('epoch') 89 | plt.ylabel('loss') 90 | plt.show() 91 | 92 | -------------------------------------------------------------------------------- /4.LogisticRegression/4.bin_class(sklearn_cancer).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-02] 4.bin_class(sklearn_cancer).py 2 | # Using sklearn's LogisticRegression() 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MifHxwJYOyU 11 | # 12 | import numpy as np 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.datasets import load_breast_cancer 16 | 17 | # Read breast cancer dataset 18 | cancer = load_breast_cancer() 19 | x = cancer['data'] 20 | y = cancer['target'] 21 | 22 | # Split the data into training and test data 23 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 24 | 25 | # Z-score normalization 26 | # When normalzing the test data, use the mean and standard deviation 27 | # from the training data. 28 | x_mean = x_train.mean(axis=0).reshape(1, -1) 29 | x_std = x_train.std(axis=0).reshape(1, -1) 30 | x_train = (x_train - x_mean) / x_std 31 | x_test = (x_test - x_mean) / x_std 32 | 33 | # regularization constant (strenth) 34 | REG_CONST = 0.01 35 | 36 | # Create a model and fit it to the training data. 
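# A rough correspondence, not an exact equivalence: the scipy version in
# 3.bin_class(scipy_cancer).py penalizes the mean cross entropy with
# REG_CONST * mean(W^2), while sklearn scales the data-fit term by C and adds
# a fixed L2 penalty, so a comparable setting is roughly C = 1 / REG_CONST.
print('C passed to LogisticRegression below:', 1. / REG_CONST)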
37 | # C: inverse of regularization strength 38 | model = LogisticRegression(penalty='l2', C=1./REG_CONST, max_iter=300) 39 | model.fit(x_train, y_train) 40 | 41 | # Predict the classes of test data and measure the accuracy of test data 42 | y_pred = model.predict(x_test) 43 | acc = (y_pred == y_test).mean() 44 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 45 | -------------------------------------------------------------------------------- /4.LogisticRegression/5.multiclass(ovr_1).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-03] 5.multiclass(ovr_1).py 2 | # Multi-class classification (OvR : one vs rest) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d6FcGZp8AHc 11 | # 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.preprocessing import OneHotEncoder 14 | from sklearn.datasets import load_iris 15 | from sklearn.model_selection import train_test_split 16 | import numpy as np 17 | 18 | # Read iris dataset 19 | x, y = load_iris(return_X_y=True) 20 | 21 | # one-hot encoding of the y labels. 22 | y_ohe = OneHotEncoder().fit_transform(y.reshape(-1,1)).toarray() 23 | 24 | # Split the data into the training and test data 25 | x_train, x_test, y_train, y_test = train_test_split(x, y_ohe, test_size = 0.2) 26 | 27 | # Perform the OvR. Since there are three labels, three models are used. 28 | models = [] 29 | for m in range(y_train.shape[1]): 30 | y_sub = y_train[:, m] # y for binary classification 31 | models.append(LogisticRegression()) 32 | models[-1].fit(x_train, y_sub) 33 | 34 | # The labels of the test data are predicted using three trained models. 35 | y_prob = np.zeros(shape=y_test.shape) 36 | for m in range(y_test.shape[1]): 37 | y_prob[:, m] = models[m].predict_proba(x_test)[:, 1] 38 | 39 | # y is predicted as the label with the highest value in y_prob. 40 | y_pred = np.argmax(y_prob, axis=1) 41 | 42 | # Measure the accuracy of the test data 43 | y_true = np.argmax(y_test, axis=1) 44 | acc = (y_true == y_pred).mean() 45 | print('Accuracy of test data = {:.3f}'.format(acc)) 46 | 47 | # Check the estimated parameters. 
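# A quick sanity check before printing the parameters (illustrative): under
# OvR the three columns of y_prob come from independent binary models, so the
# rows generally do not sum to 1; taking the argmax over the columns is still
# the decision rule used above.
print('Row sums of y_prob (first 5):', y_prob[:5].sum(axis=1).round(3))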
48 | for m in range(y_test.shape[1]): 49 | w = models[m].coef_ 50 | b = models[m].intercept_ 51 | print("\nModel-{}:".format(m)) 52 | print("w:", w) 53 | print("b:", b) 54 | -------------------------------------------------------------------------------- /4.LogisticRegression/6.multiclass(ovr_2).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-03] 6.multiclass(ovr_2).py 2 | # Multiclass classification (OvR : One-vs-Rest) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/d6FcGZp8AHc 11 | # 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.datasets import load_iris 14 | from sklearn.model_selection import train_test_split 15 | import numpy as np 16 | 17 | # Read iris dataset 18 | x, y = load_iris(return_X_y=True) 19 | 20 | # Split the data into the training and test data 21 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) 22 | 23 | # Use the multi_class='ovr' option of sklearn's LogisticRegression. 24 | # Even if 'ovr' is not set, multiclass classification is performed 25 | # automatically based on the number of classes in y. 26 | # It is set explicitly here to make the OvR behavior easy to follow. 27 | model = LogisticRegression(multi_class='ovr', max_iter=300) 28 | model.fit(x_train, y_train) 29 | 30 | # Predict the classes of the test data 31 | y_pred = model.predict(x_test) 32 | 33 | # Measure the accuracy of the test data 34 | acc = (y_test == y_pred).mean() 35 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 36 | 37 | # Check the estimated parameters. 38 | print('\nmodel.coef_ =\n\n', model.coef_) 39 | print('\nmodel.intercept_ =\n\n', model.intercept_) 40 | 41 | -------------------------------------------------------------------------------- /4.LogisticRegression/7.multiclass(softmax_scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-04] 7.multiclass(softmax_scipy).py 2 | # Multiclass classification (Softmax regression) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/D_z48GLwAyM 11 | # 12 | from scipy import optimize 13 | from sklearn.datasets import load_iris 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.preprocessing import OneHotEncoder 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | 19 | # Read iris dataset 20 | x, y = load_iris(return_X_y=True) 21 | 22 | # one-hot encoding of the y labels. 23 | y_ohe = OneHotEncoder().fit_transform(y.reshape(-1,1)).toarray() 24 | 25 | # Split the data into the training and test data 26 | x_train, x_test, y_train, y_test = train_test_split(x, y_ohe) 27 | 28 | # Add a column vector with all 1 to the feature matrix.
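# A small illustration with made-up numbers of what the next two lines do:
# [[0.3, 0.4],        [[1.0, 0.3, 0.4],
#  [0.1, 0.5]]   -->   [1.0, 0.1, 0.5]]
demo = np.hstack([np.ones([2, 1]), np.array([[0.3, 0.4], [0.1, 0.5]])])
print('bias-augmented demo:\n', demo)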
29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | x1_test = np.hstack([np.ones([x_test.shape[0], 1]), x_test]) 31 | 32 | REG_CONST = 0.01 # Regularization constant 33 | n_feature = x_train.shape[1] # The number of features 34 | n_class = y_train.shape[1] # The number of classes 35 | 36 | def softmax(z): 37 | s = np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1,1) 38 | return s 39 | 40 | # Loss function: mean of cross entropy 41 | def ce_loss(W, args): 42 | train_x = args[0] # shape=(112,5) 43 | train_y = args[1] # shape=(112,3) 44 | test_x = args[2] 45 | test_y = args[3] 46 | W = W.reshape((n_class, n_feature + 1)) # shape=(3, 5) 47 | 48 | # Calculate the loss of training data 49 | z = np.dot(W, train_x.T).T # shape=(112, 3) 50 | y_hat = softmax(z) 51 | train_ce = np.sum(-train_y * np.log(y_hat + 1e-10), axis=1) 52 | train_loss = train_ce.mean() + REG_CONST * np.mean(np.square(W)) 53 | 54 | # Calculate the loss of test data 55 | # It is independent of training and is measured later to observe changes in loss. 56 | z = np.dot(W, test_x.T).T 57 | y_hat = softmax(z) 58 | test_ce = np.sum(-test_y * np.log(y_hat + 1e-10), axis=1) 59 | test_loss = test_ce.mean() + REG_CONST * np.mean(np.square(W)) 60 | 61 | # Save the loss 62 | trc_train_loss.append(train_loss) 63 | trc_test_loss.append(test_loss) 64 | 65 | return train_loss 66 | 67 | # Perform an optimization process 68 | trc_train_loss = [] 69 | trc_test_loss = [] 70 | init_w = np.ones(n_class * (n_feature + 1)) * 0.1 # shape=(3, 5) → 1D 71 | 72 | # constraints: w0 = 0, b0 = 0 73 | def b0_w0(w): 74 | n = np.arange(n_feature + 1) 75 | return w[n] 76 | 77 | cons = [{'type':'eq', 'fun': b0_w0}] 78 | result = optimize.minimize(ce_loss, init_w, 79 | constraints=cons, 80 | args=[x1_train, y_train, x1_test, y_test]) 81 | 82 | # print the result. 
result.x contains the optimal parameters 83 | print(result) 84 | 85 | # Measure the accuracy of test data 86 | W = result.x.reshape(n_class, n_feature + 1) 87 | z = np.dot(W, x1_test.T).T 88 | y_prob = softmax(z) 89 | y_pred = np.argmax(y_prob, axis=1) 90 | y_true = np.argmax(y_test, axis=1) 91 | acc = (y_pred == y_true).mean() 92 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 93 | 94 | # Visually see that the loss decreases as the iteration progresses 95 | plt.figure(figsize=(5, 4)) 96 | plt.plot(trc_train_loss, color='blue', label='train loss') 97 | plt.plot(trc_test_loss, color='red', label='test loss') 98 | plt.legend() 99 | plt.title('Loss history') 100 | plt.xlabel('epoch') 101 | plt.ylabel('loss') 102 | plt.show() 103 | 104 | # Check the parameters 105 | w = result.x.reshape((n_class, n_feature + 1)) 106 | print('\n', w) 107 | 108 | -------------------------------------------------------------------------------- /4.LogisticRegression/8.multiclass(softmax_sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-04] 8.multiclass(softmax_sklearn).py 2 | # Multi-class classification (Softmax regression) 3 | # Use LogisticRegression(multi_class='multinomial') 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/D_z48GLwAyM 12 | # 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.datasets import load_iris 15 | from sklearn.model_selection import train_test_split 16 | 17 | # Read iris dataset 18 | x, y = load_iris(return_X_y=True) 19 | 20 | # Split the data into the training and test data 21 | x_train, x_test, y_train, y_test = train_test_split(x, y) 22 | 23 | # Create a model and fit it to the training data. 24 | # Use multi_class = 'multinomial' 25 | model = LogisticRegression(multi_class='multinomial', max_iter=300) 26 | model.fit(x_train, y_train) 27 | 28 | # Predict the classes of the test data 29 | y_pred = model.predict(x_test) 30 | 31 | # Measure the accuracy 32 | acc = (y_test == y_pred).mean() 33 | print('\nAccuracy of test data = {:.3f}'.format(acc)) 34 | 35 | # Check the estimated parameters.
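# A quick illustrative check: unlike the OvR models in 5.multiclass(ovr_1).py,
# the multinomial model pushes all classes through a single softmax, so each
# row of predict_proba sums to 1, and coef_ holds one weight row per class.
print('coef_ shape:', model.coef_.shape)   # expected (3, 4) for iris
print('probability row sums:', model.predict_proba(x_test[:3]).sum(axis=1).round(3))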
36 | print('\nmodel.coef_ =\n\n', model.coef_) 37 | print('\nmodel.intercept_ =\n\n', model.intercept_) 38 | 39 | -------------------------------------------------------------------------------- /4.LogisticRegression/9.lwlr(scipy).py: -------------------------------------------------------------------------------- 1 | # [MXML-4-05] 9.lwlr(scipy).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/d1-QS4uTgj8 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from scipy import optimize 14 | from sklearn.model_selection import train_test_split 15 | 16 | # Generate a simple dataset 17 | def lwlr_data1(n): 18 | n1 = int(n / 3) 19 | a = np.random.normal(-1.0, 0.5, n1) 20 | b = np.random.normal(1.0, 0.5, n1) 21 | c = np.random.normal(3.0, 0.5, n - n1 * 2) 22 | x = np.hstack([a, b, c]).reshape(-1, 1) 23 | y = np.hstack([np.zeros(n1), np.ones(n1), np.zeros(n - n1 * 2)]) 24 | return x, y 25 | 26 | # Generate training and test data 27 | x, y = lwlr_data1(n=2000) 28 | x_train, x_test, y_train, y_test = train_test_split(x, y) 29 | x1_train = np.hstack([np.ones([x_train.shape[0], 1]), x_train]) 30 | x1_test = np.hstack([np.ones([x_test.shape[0], 1]), x_test]) 31 | 32 | # Visualize the dataset 33 | plt.figure(figsize=(6, 3)) 34 | plt.scatter(x_train, y_train, s=5, c='orange', alpha=0.5, label='train') 35 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', alpha=0.5, label='test') 36 | plt.legend() 37 | plt.show() 38 | 39 | # Calculating the weights of training data points 40 | # xx : training data, tx : test data 41 | def get_weight(xx, tx, tau): 42 | distance = np.sum(np.square(xx - tx), axis=1) 43 | w = np.exp(-distance / (2 * tau * tau)) 44 | return w 45 | 46 | # the mean of weighted binary cross entropy 47 | def wbce_loss(W, weight): 48 | y_hat = 1.0 / (1 + np.exp(-np.dot(W, x1_train.T))) 49 | bce = -y_train * np.log(y_hat + 1e-10) - (1.0 - y_train) * np.log(1.0 - y_hat + 1e-10) 50 | bce *= weight 51 | return bce.mean() 52 | 53 | y_prob = [] 54 | for tx in x1_test: 55 | weight = get_weight(x_train, tx, 0.6) 56 | result = optimize.minimize(wbce_loss, [0.1, 0.1], args=weight) 57 | y_prob.append(1.0 / (1 + np.exp(-np.dot(result.x, tx.T)))) 58 | y_prob = np.array(y_prob).reshape(-1,) 59 | 60 | # Visually check the training and test data, 61 | # and the predicted probability. 
62 | plt.figure(figsize=(6, 3)) 63 | plt.scatter(x_train, y_train, s=5, c='orange', label='train') 64 | plt.scatter(x_test, y_test, marker='+', s=30, c='blue', label='test') 65 | plt.scatter(x_test, y_prob, s=5, c='red', label='prediction') 66 | plt.legend() 67 | plt.axhline(y=0.5, ls='--', lw=0.5, c='black') 68 | plt.axvline(x=0, ls='--', lw=0.5, c='black') 69 | plt.axvline(x=2, ls='--', lw=0.5, c='black') 70 | plt.show() 71 | 72 | # Measure the accuracy of the test data 73 | y_pred = (y_prob > 0.5).astype('int8') 74 | acc = (y_pred == y_test).mean() 75 | print('\nAccuracy of the test data = {:.3f}'.format(acc)) 76 | -------------------------------------------------------------------------------- /5.Convex/1.plot_convex.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-01] 1.plot_convex.py (Plot 3D convex function) 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/8BiHfVrdClU 10 | # 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | # f(x) 15 | def f_xy(x1, x2): 16 | return (x1 ** 2) + (x2 ** 2) 17 | # return 3 * x1 + x2 18 | # return (x1 ** 2) + x2 * (x1 - 1) 19 | # return 2 * (x1 ** 2) + (x2 ** 2) + x1 * x2 + x1 + x2 20 | # return -5 * x1 / 3 - x2 + 5 21 | 22 | t = 0.1 23 | x, y = np.meshgrid(np.arange(-10, 10, t), np.arange(-10, 10, t)) 24 | zs = np.array([f_xy(a, b) for [a, b] in zip(np.ravel(x), np.ravel(y))]) 25 | z = zs.reshape(x.shape) 26 | 27 | fig = plt.figure(figsize=(7,7)) 28 | ax = fig.add_subplot(111, projection='3d') 29 | 30 | # surface를 그린다. 31 | ax.plot_surface(x, y, z, alpha=0.7) 32 | 33 | ax.set_xlabel('x1') 34 | ax.set_ylabel('x2') 35 | ax.set_zlabel('f(x)') 36 | ax.azim = -50 37 | ax.elev = 30 38 | plt.show() 39 | 40 | 41 | -------------------------------------------------------------------------------- /5.Convex/2.EQP.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-03] 2.EQP.py 2 | # Equality constrained QP (EQP) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yn04TeRxKko 11 | # 12 | # Least squares problem: 13 | # minimize x1^2 + x2^2 14 | # subject to x1 + x2 = 1 15 | # 16 | # QP standard form: 17 | # minimize 1/2 * xT.P.x + qT.x 18 | # subject to G.x <= h 19 | # A.x = b 20 | # 21 | # min. 1/2 * [x1 x2][2 0][x1] + [0 0][x1] 22 | # [0 2][x2] [x2] 23 | # 24 | # s.t. [1 1][x1] = 1 25 | # [x2] 26 | # 27 | # x = [x1] P = [2 0] q = [0] A = [1 1] b = 1 28 | # [x2] [0 2] [0] 29 | from cvxopt import matrix, solvers 30 | import numpy as np 31 | 32 | P = matrix(np.array([[2, 0], [0, 2]]), tc='d') 33 | q = matrix(np.array([[0], [0]]), tc='d') 34 | A = matrix(np.array([[1, 1]]), tc='d') 35 | b = matrix(1, tc='d') 36 | 37 | sol = solvers.qp(P, q, A=A, b=b) 38 | 39 | p_star = sol['primal objective'] 40 | x1, x2 = sol['x'] 41 | y = sol['y'][0] # Lagrange multiplier for A.x = b 42 | gap = sol['gap'] # duality gap 43 | 44 | # z and y are Lagrange multipliers. z is not used here. 
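# KKT stationarity for this EQP reads P.x + q + A^T.y = 0, i.e. 2*x_i + y = 0,
# which together with x1 + x2 = 1 gives x1 = x2 = 0.5 and y = -1.
# A quick numeric sketch of that check (P and A rebuilt as plain arrays):
P_np = np.array([[2., 0.], [0., 2.]])
residual = P_np @ np.array([x1, x2]) + y * np.array([1., 1.])
print('stationarity residual:', residual.round(6))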
45 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 46 | # zT = z-transpose, yT = y-transpose 47 | print('\nx1 = {:.3f}'.format(x1)) 48 | print('x2 = {:.3f}'.format(x2)) 49 | print('y = {:.3f}'.format(y)) 50 | print('p* = {:.3f}'.format(p_star)) 51 | print('duality gap = {:.3f}'.format(gap)) 52 | 53 | 54 | -------------------------------------------------------------------------------- /5.Convex/3.IQP_1.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-03] 3.IQP_1.py 2 | # Inequality constrained QP (IQP-1) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yn04TeRxKko 11 | # 12 | # 13 | # Least squares problem: 14 | # minimize x1^2 + x2^2 15 | # subject to x1 + x2 <= 1 16 | # 17 | # QP standard form 18 | # minimize 1/2 * xT.P.x + qT.x 19 | # subject to G.x <= h 20 | # A.x = b 21 | # 22 | # min. 1/2 [x1 x2][2 0][x1] + [0 0][x1] 23 | # [0 2][x2] [x2] 24 | # 25 | # s.t. [1 1][x1] <= 1 26 | # [x2] 27 | # 28 | # x = [x1] P = [2 0] q = [0] G = [1 1] h = 1 29 | # [x2] [0 2] [0] 30 | from cvxopt import matrix, solvers 31 | import numpy as np 32 | 33 | P = matrix(np.array([[2, 0], [0, 2]]), tc='d') 34 | q = matrix(np.array([[0], [0]]), tc='d') 35 | G = matrix(np.array([[1, 1]]), tc='d') 36 | h = matrix(1, tc='d') 37 | 38 | sol = solvers.qp(P, q, G, h) 39 | 40 | p_star = sol['primal objective'] 41 | x1, x2 = sol['x'] 42 | z = sol['z'][0] # Lagrange multiplier for G.x <= h 43 | gap = sol['gap'] # duality gap 44 | s = sol['s'][0] # slack variable 45 | 46 | # z and y are Lagrange multipliers. y is not used here. 47 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 48 | # zT = z-transpose, yT = y-transpose 49 | print('\nx1 = {:.3f}'.format(x1)) 50 | print('x2 = {:.3f}'.format(x2)) 51 | print('z = {:.3f}'.format(z)) 52 | print('s = {:.3f}'.format(s)) 53 | print('p* = {:.3f}'.format(p_star)) 54 | print('duality gap = {:.3f}'.format(gap)) 55 | 56 | -------------------------------------------------------------------------------- /5.Convex/4.IQP_2.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-03] 4.IQP_2.py 2 | # Inequality constrained QP (IQP-2) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/yn04TeRxKko 11 | # 12 | # 13 | # Least squares problem: 14 | # minimize x1^2 + x2^2 15 | # subject to x1 + x2 >= 1 --> -x1 - x2 <= -1로 변환. 16 | # 17 | # QP standard form 18 | # minimize (1/2) * xT.P.x + qT.x 19 | # subject to G.x <= h 20 | # A.x = b 21 | # 22 | # min. (1/2) * [x1 x2][2 0][x1] + [0 0][x1] 23 | # [0 2][x2] [x2] 24 | # 25 | # s.t. 
[-1 -1][x1] <= -1 26 | # [x2] 27 | # 28 | # x = [x1] P = [2 0] q = [0] G = [-1 -1] h = -1 29 | # [x2] [0 2] [0] 30 | from cvxopt import matrix, solvers 31 | import numpy as np 32 | 33 | P = matrix(np.array([[2, 0], [0, 2]]), tc='d') 34 | q = matrix(np.array([[0], [0]]), tc='d') 35 | G = matrix(np.array([[-1, -1]]), tc='d') 36 | h = matrix(-1, tc='d') 37 | 38 | sol = solvers.qp(P, q, G, h) 39 | 40 | p_star = sol['primal objective'] 41 | x1, x2 = sol['x'] 42 | z = sol['z'][0] # Lagrange multiplier for G.x <= h 43 | s = sol['s'][0] # slack variable 44 | gap = sol['gap'] # duality gap 45 | 46 | # z and y are Lagrange multipliers. y is not used here. 47 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 48 | # zT = z-transpose, yT = y-transpose 49 | print('\nx1 = {:.3f}'.format(x1)) 50 | print('x2 = {:.3f}'.format(x2)) 51 | print('z = {:.3f}'.format(z)) 52 | print('s = {:.3f}'.format(s)) 53 | print('p* = {:.3f}'.format(p_star)) 54 | print('duality gap = {:.3f}'.format(gap)) 55 | 56 | -------------------------------------------------------------------------------- /5.Convex/5.QP.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-04] 5.QP.py 2 | # QP problem with an equality and an inequality constraints. 3 | # https://cvxopt.org/examples/tutorial/qp.html 4 | # 5 | # This code was used in the machine learning online 6 | # course provided by 7 | # www.youtube.com/@meanxai 8 | # www.github.com/meanxai/machine_learning 9 | # 10 | # A detailed description of this code can be found in 11 | # https://youtu.be/_5QuyiCI1rc 12 | # 13 | # min. 2 * x1^2 + x2^2 + x1 * x2 + x1 + x2 14 | # s.t. x1 >= 0 15 | # x2 >= 0 16 | # x1 + x2 = 1 17 | # 18 | # QP standard form 19 | # minimize 1/2 * xT.P.x + qT.x 20 | # subject to G.x <= h 21 | # A.x = b 22 | # 23 | # min. 1/2 [x1 x2][4 1][x1] + [1 1][x1] 24 | # [1 2][x2] [x2] 25 | # 26 | # s.t. [-1 0][x1] <= [0] 27 | # [ 0 -1][x2] [0] 28 | # 29 | # [1 1][x1] = 1 30 | # [x2] 31 | # 32 | # x = [x1] P = [4 1] q = [1] G = [-1 0] h = [0] A = [1 1] b = 1 33 | # [x2] [1 2] [1] [ 0 -1] [0] 34 | from cvxopt import matrix, solvers 35 | import numpy as np 36 | 37 | P = matrix(np.array([[4, 1], [1, 2]]), tc='d') 38 | q = matrix(np.array([[1], [1]]), tc='d') 39 | G = matrix(np.array([[-1, 0],[0, -1]]), tc='d') 40 | h = matrix(np.array([[0], [0]]), tc='d') 41 | A = matrix(np.array([[1, 1]]), tc='d') 42 | b = matrix(1, tc='d') 43 | 44 | sol = solvers.qp(P, q, G, h, A, b) 45 | 46 | p_star = sol['primal objective'] 47 | x1, x2 = sol['x'] 48 | y = sol['y'][0] # Lagrange multiplier for x1 + x2 = 1 49 | z1 = sol['z'][0] # Lagrange multiplier for -x1 <= 0 50 | z2 = sol['z'][1] # Lagrange multiplier for -x2 <= 0 51 | gap = sol['gap'] # duality gap 52 | 53 | # z and y are Lagrange multipliers. 
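# Complementary slackness, sketched numerically: each multiplier z_i times its
# slack (G_i.x - h_i) should vanish at the optimum. Here the optimum has
# x1, x2 > 0, so both inequality multipliers are expected to be ~0.
print('z1 * (-x1) =', round(z1 * (-x1), 6))
print('z2 * (-x2) =', round(z2 * (-x2), 6))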
54 | # L = (1/2) * xT.P.x + qT.x + zT(G.x - h) + yT(A.x - b) 55 | # zT = z-transpose, yT = y-transpose 56 | print('\nx1 = {:.3f}'.format(x1)) 57 | print('x2 = {:.3f}'.format(x2)) 58 | print('y = {:.3f}'.format(y)) 59 | print('z1 = {:.3f}'.format(z1)) 60 | print('z2 = {:.3f}'.format(z2)) 61 | print('p* = {:.3f}'.format(p_star)) 62 | print('duality gap = {:.3f}'.format(gap)) -------------------------------------------------------------------------------- /5.Convex/6.LP.py: -------------------------------------------------------------------------------- 1 | # [MXML-5-04] 6.LP.py 2 | # https://cvxopt.org/examples/tutorial/lp.html 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/_5QuyiCI1rc 11 | # 12 | # min. 2 * x1 + x2 13 | # s.t. -x1 + x2 <= 1 14 | # x1 + x2 >= 2 --> -x1 - x2 <= -2 15 | # x2 >= 0 --> -x2 <= 0 16 | # x1 - 2 * x2 <= 4 17 | # x1 - 5 * x2 = 15 18 | # 19 | # LP standard form 20 | # minimize cT.x 21 | # subject to G.x <= h 22 | # A.x = b 23 | # 24 | # min. [2 1][x1] 25 | # [x2] 26 | # 27 | # s.t. G.x <= h [-1 1][x1] <= [ 1] 28 | # [-1 -1][x2] [-2] 29 | # [ 0 -1] [ 0] 30 | # [ 1 -2] [ 4] 31 | # 32 | # A.x = b [1 -5][x1] = 15 33 | # [x2] 34 | # 35 | # x = [x1] c = [2] G = [-1 1] h = [ 1] A = [1 1] b = 1 36 | # [x2] [1] [-1 -1] [-2] 37 | # [ 0 -1] [ 0] 38 | # [ 1 -2] [ 4] 39 | from cvxopt import matrix, solvers 40 | import numpy as np 41 | 42 | c = matrix(np.array([[2], [1]]), tc='d') 43 | G = matrix(np.array([[-1, 1],[-1, -1],[0, -1],[1, -2]]), tc='d') 44 | h = matrix(np.array([[1], [-2], [0], [4]]), tc='d') 45 | A = matrix(np.array([[1, -5]]), tc='d') 46 | b = matrix(1, tc='d') 47 | sol = solvers.lp(c, G, h, A, b) 48 | 49 | p_star = sol['primal objective'] 50 | x1, x2 = sol['x'] 51 | y = sol['y'][0] # Lagrange multiplier for A.x = b 52 | z1 = sol['z'][0] # Lagrange multiplier for G1.x <= h1 53 | z2 = sol['z'][1] # Lagrange multiplier for G2.x <= h2 54 | z3 = sol['z'][2] # Lagrange multiplier for G3.x <= h3 55 | z4 = sol['z'][3] # Lagrange multiplier for G4.x <= h4 56 | gap = sol['gap'] # duality gap 57 | 58 | # z and y are Lagrange multipliers. 59 | # L = cT.x + zT(G.x - h) + yT(A.x - b) 60 | # zT = z-transpose, yT = y-transpose 61 | print('\nx1 = {:.3f}'.format(x1)) 62 | print('x2 = {:.3f}'.format(x2)) 63 | print('y = {:.3f}'.format(y)) 64 | print('z1 = {:.3f}'.format(z1)) 65 | print('z2 = {:.3f}'.format(z2)) 66 | print('z3 = {:.3f}'.format(z3)) 67 | print('z4 = {:.3f}'.format(z4)) 68 | print('p* = {:.3f}'.format(p_star)) 69 | print('duality gap = {:.3f}'.format(gap)) 70 | -------------------------------------------------------------------------------- /6.SVM/1.cvxopt(hard_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-02] 1.cvxopt(hard_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/9oRPq9oa4uA 10 | # 11 | from cvxopt import matrix as matrix 12 | from cvxopt import solvers as solvers 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | 16 | # 3 data points. 
17 | x = np.array([[1., 3.], [2., 2.], [1., 1.]]) 18 | y = np.array([[1.], [1.], [-1.]]) 19 | 20 | # Calculate H matrix 21 | H = np.outer(y, y) * np.dot(x, x.T) 22 | 23 | # Construct the matrices required for QP in standard form. 24 | n = x.shape[0] 25 | P = matrix(H) 26 | q = matrix(-np.ones((n, 1))) 27 | G = matrix(-np.eye(n)) 28 | h = matrix(np.zeros(n)) 29 | A = matrix(y.reshape(1, -1)) 30 | b = matrix(np.zeros(1)) 31 | 32 | # solver parameters 33 | solvers.options['abstol'] = 1e-10 34 | solvers.options['reltol'] = 1e-10 35 | solvers.options['feastol'] = 1e-10 36 | 37 | # Perform QP 38 | sol = solvers.qp(P, q, G, h, A, b) 39 | 40 | # the solution of the QP, λ 41 | lamb = np.array(sol['x']) 42 | 43 | # Calculate w using the lambda, which is the solution to QP. 44 | w = np.sum(lamb * y * x, axis=0).reshape(1, -1) 45 | 46 | # Find support vectors 47 | sv_idx = np.where(lamb > 1e-5)[0] 48 | sv_lamb = lamb[sv_idx] 49 | sv_x = x[sv_idx] 50 | sv_y = y[sv_idx].reshape(1, -1) 51 | 52 | # Calculate b using the support vectors and calculate the average. 53 | # Reference: Bishop, Pattern Recognition and Machine Learning, p.330, 54 | # equation (7.18) 55 | b = sv_y - np.dot(w, sv_x.T) 56 | b = np.mean(b) 57 | 58 | print('\nlambda =', np.round(lamb.flatten(), 3)) 59 | print('w =', np.round(w, 3)) 60 | print('b =', np.round(b, 3)) 61 | 62 | # Visualize the data points 63 | plt.figure(figsize=(5,5)) 64 | color= ['red' if a == 1 else 'blue' for a in y] 65 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 66 | plt.xlim(0, 4) 67 | plt.ylim(0, 4) 68 | 69 | # Visualize the decision boundary 70 | x1_dec = np.linspace(0, 4, 50).reshape(-1, 1) 71 | x2_dec = -(w[0][0] / w[0][1]) * x1_dec - b / w[0][1] 72 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 73 | 74 | # Visualize the positive & negative boundary 75 | w_norm = np.sqrt(np.sum(w ** 2)) 76 | w_unit = w / w_norm 77 | half_margin = 1 / w_norm 78 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 79 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 80 | 81 | plt.plot(upper[:, 0], upper[:, 1], '--', lw=1.0, label='positive boundary') 82 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 83 | 84 | 85 | plt.scatter(sv_x[:, 0], sv_x[:, 1], s=50, marker='o', c='white') 86 | 87 | for s, (x1, x2) in zip(lamb, x): 88 | plt.annotate('λ=' + str(s[0].round(2)), (x1-0.05, x2 + 0.2)) 89 | 90 | plt.legend() 91 | plt.show() 92 | 93 | print("\nMargin = {:.4f}".format(half_margin * 2)) 94 | -------------------------------------------------------------------------------- /6.SVM/10.multiclass(OvR).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-08] 10.multiclass(OvR).py 2 | # Implement multiclass classification of SVM by One-Rest (OvR) 3 | # Since SVC operates as an OvO internally, we will use 4 | # OneVsRestClassifier. 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/ogFZchEqmTA 13 | # 14 | import numpy as np 15 | from sklearn.svm import SVC 16 | from sklearn.multiclass import OneVsRestClassifier 17 | import matplotlib.pyplot as plt 18 | from sklearn.datasets import make_blobs 19 | 20 | # Generate the data with 4 clusters. 
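# With 4 classes, the OneVsRestClassifier below trains 4 binary SVMs, whereas
# SVC's internal OvO scheme would train one SVM per class pair, C(4,2) = 6
# (a quick count for illustration):
from math import comb
print('OvR models:', 4, ' OvO models:', comb(4, 2))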
21 | x, y = make_blobs(n_samples=400, n_features=2, 22 | centers=[[0., 0.2], [0.5, 0.5], [1., -0.2], [0.3, -0.3]], 23 | cluster_std=0.15) 24 | 25 | # Linear SVM model 26 | C = 1.0 27 | model = OneVsRestClassifier(SVC(C=C, kernel='linear')) 28 | model.fit(x, y) 29 | 30 | print(model.estimators_) 31 | # [SVC(kernel='linear'), 32 | # SVC(kernel='linear'), 33 | # SVC(kernel='linear'), 34 | # SVC(kernel='linear')] 35 | 36 | w = np.array([m.coef_[0] for m in model.estimators_]) # (4,2) 37 | b = np.array([m.intercept_[0] for m in model.estimators_]) # (4,) 38 | 39 | # Visualize the data and 4 boundaries. 40 | plt.figure(figsize=(8,7)) 41 | colors = ['red', 'blue', 'green', 'black'] 42 | y_color= [colors[a] for a in y] 43 | for label in model.classes_: 44 | idx = np.where(y == label) 45 | plt.scatter(x[idx, 0], x[idx, 1], s=100, c=colors[label], 46 | alpha=0.5, label='class_' + str(label)) 47 | 48 | # Visualize 4 boundaries. 49 | x1_dec = np.linspace(-2.0, 2.0, 50).reshape(-1, 1) 50 | for i in range(w.shape[0]): 51 | x2_dec = -(w[i, 0] * x1_dec + b[i]) / w[i, 1] 52 | plt.plot(x1_dec, x2_dec, label=str(i)+'_rest') 53 | plt.xlim(-0.5, 1.5) 54 | plt.ylim(-0.7, 1.) 55 | plt.legend() 56 | plt.show() 57 | 58 | # Predict the classes of the test data. 59 | x_test = np.random.uniform(-1.5, 1.5, (2000, 2)) 60 | y_pred1 = model.predict(x_test) 61 | 62 | # To understand how OvR works, let's manually implement the 63 | # process of model.predict(x_test). df.shape = (2000, 4) 64 | df = np.dot(x_test, w.T) + b # decision function 65 | # df = model.decision_function(x_test) # same as above 66 | 67 | y_pred2 = df.argmax(axis=1) 68 | 69 | # Compare y_pred1 and y_pred2. 70 | if (y_pred1 != y_pred2).sum() == 0: 71 | print("# y_pred1 and y_pred2 are exactly the same.") 72 | else: 73 | print("# y_pred1 and y_pred2 are not the same.") 74 | 75 | # Visualize test data and y_pred1 76 | plt.figure(figsize=(8,7)) 77 | y_color= [colors[a] for a in y_pred1] 78 | for label in model.classes_: 79 | idx = np.where(y_pred1 == label) 80 | plt.scatter(x_test[idx, 0], x_test[idx, 1], 81 | s=100, 82 | c=colors[label], 83 | alpha=0.3, 84 | label='class_' + str(label)) 85 | 86 | plt.xlim(-1.5, 1.8) 87 | plt.ylim(-0.7, 1.) 88 | plt.show() 89 | 90 | # decision_function_shape = 'ovr' in SVC 91 | # model2 = SVC(C=C, kernel='linear', decision_function_shape='ovr') 92 | # model2.fit(x, y) 93 | 94 | # # w and b are generated by OvO method. 95 | # print("w:\n", model2.coef_) # (6,2) 96 | # print("b:\n", model2.intercept_) # (6,) 97 | 98 | # df2 = model2.decision_function(x_test) 99 | # y_pred3 = df2.argmax(axis=1) 100 | 101 | # # Visualize test data and y_pred3 102 | # plt.figure(figsize=(8,7)) 103 | # y_color= [colors[a] for a in y_pred3] 104 | # for label in model.classes_: 105 | # idx = np.where(y_pred3 == label) 106 | # plt.scatter(x_test[idx, 0], x_test[idx, 1], s=100, 107 | # c=colors[label], 108 | # alpha=0.3, label='class_' + str(label)) 109 | 110 | # plt.xlim(-1.5, 1.8) 111 | # plt.ylim(-0.7, 1.) 
112 | # plt.show() 113 | 114 | -------------------------------------------------------------------------------- /6.SVM/2.cvxopt(soft_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-04] 2.cvxopt(soft_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/LdOcJfJTcwU 10 | # 11 | import numpy as np 12 | from cvxopt import matrix as cvxopt_matrix 13 | from cvxopt import solvers as cvxopt_solvers 14 | import matplotlib.pyplot as plt 15 | 16 | # training data 17 | x = np.array([[0.2, 0.869], 18 | [0.687, 0.212], 19 | [0.822, 0.411], 20 | [0.738, 0.694], 21 | [0.176, 0.458], 22 | [0.306, 0.753], 23 | [0.936, 0.413], 24 | [0.215, 0.410], 25 | [0.612, 0.375], 26 | [0.784, 0.602], 27 | [0.612, 0.554], 28 | [0.357, 0.254], 29 | [0.204, 0.775], 30 | [0.512, 0.745], 31 | [0.498, 0.287], 32 | [0.251, 0.557], 33 | [0.502, 0.523], 34 | [0.119, 0.687], 35 | [0.495, 0.924], 36 | [0.612, 0.851]]) 37 | 38 | y = np.array([-1,1,1,1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,1,-1,1,1]) 39 | y = y.astype('float').reshape(-1, 1) 40 | 41 | C = 50.0 42 | N = x.shape[0] 43 | 44 | # Construct the matrices required for QP in standard form. 45 | H = np.outer(y, y) * np.dot(x, x.T) 46 | P = cvxopt_matrix(H) 47 | q = cvxopt_matrix(np.ones(N) * -1) 48 | A = cvxopt_matrix(y.reshape(1, -1)) 49 | b = cvxopt_matrix(np.zeros(1)) 50 | 51 | g = np.vstack([-np.eye(N), np.eye(N)]) 52 | G = cvxopt_matrix(g) 53 | 54 | h1 = np.hstack([np.zeros(N), np.ones(N) * C]) 55 | h = cvxopt_matrix(h1) 56 | 57 | # solver parameters 58 | cvxopt_solvers.options['abstol'] = 1e-10 59 | cvxopt_solvers.options['reltol'] = 1e-10 60 | cvxopt_solvers.options['feastol'] = 1e-10 61 | 62 | # Perform QP 63 | sol = cvxopt_solvers.qp(P, q, G, h, A, b) 64 | 65 | # the solution to the QP, λ 66 | lamb = np.array(sol['x']) 67 | 68 | # Calculate w using the lambda, which is the solution to QP. 69 | w = np.sum(lamb * y * x, axis=0) 70 | 71 | # Find support vectors 72 | sv_idx = np.where(lamb > 1e-5)[0] 73 | sv_lamb = lamb[sv_idx] 74 | sv_x = x[sv_idx] 75 | sv_y = y[sv_idx] 76 | 77 | sv_plus = sv_x[np.where(sv_y > 0)[0]] # '+1' samples 78 | sv_minus = sv_x[np.where(sv_y < 0)[0]] # '-1' samples 79 | 80 | # Calculate b using the support vectors and calculate the average. 
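# For comparison with the midpoint-style estimate below, a common textbook
# estimate (the averaging used in 1.cvxopt(hard_margin).py) takes the mean of
# y_s - w.x_s, here restricted to margin support vectors with 0 < lambda < C
# (a sketch, assuming at least one such support vector exists):
margin_idx = np.where((lamb > 1e-5) & (lamb < C - 1e-5))[0]
b_avg = np.mean(y[margin_idx].flatten() - np.dot(w, x[margin_idx].T))
print('b averaged over margin support vectors:', round(b_avg, 4))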
81 | b = -(np.max(np.dot(w, sv_plus.T)) + np.min(np.dot(w, sv_minus.T))) / 2.0 82 | 83 | # Visualize the data points 84 | plt.figure(figsize=(7,7)) 85 | color= ['red' if a == 1 else 'blue' for a in y] 86 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 87 | plt.xlim(0, 1) 88 | plt.ylim(0, 1) 89 | 90 | # Visualize the decision boundary 91 | x1_dec = np.linspace(0, 1, 50).reshape(-1, 1) 92 | x2_dec = -(w[0] / w[1]) * x1_dec - b / w[1] 93 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 94 | 95 | # display slack variables, slack variable = max(0, 1 - y(wx + b)) 96 | y_hat = np.dot(w, x.T) + b 97 | slack = np.maximum(0, 1 - y.flatten() * y_hat) 98 | for s, (x1, x2) in zip(slack, x): 99 | plt.annotate(str(s.round(2)), (x1-0.02, x2 + 0.03)) 100 | 101 | # Visualize the positive & negative boundary and support vectors 102 | w_norm = np.sqrt(np.sum(w ** 2)) 103 | w_unit = w / w_norm 104 | half_margin = 1 / w_norm 105 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 106 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 107 | 108 | plt.plot(upper[:, 0], upper[:, 1], '--', lw=1.0, label='positive boundary') 109 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 110 | 111 | plt.scatter(sv_x[:, 0], sv_x[:, 1], s=60, marker='o', c='white') 112 | plt.legend() 113 | plt.title('C = ' + str(C) + ', Σξ = ' + str(np.sum(slack).round(2))) 114 | plt.show() 115 | 116 | -------------------------------------------------------------------------------- /6.SVM/3.SVC(soft_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-04] 3.SVC(soft_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/LdOcJfJTcwU 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.svm import SVC 14 | 15 | # training data 16 | x = np.array([[0.2, 0.869], 17 | [0.687, 0.212], 18 | [0.822, 0.411], 19 | [0.738, 0.694], 20 | [0.176, 0.458], 21 | [0.306, 0.753], 22 | [0.936, 0.413], 23 | [0.215, 0.410], 24 | [0.612, 0.375], 25 | [0.784, 0.602], 26 | [0.612, 0.554], 27 | [0.357, 0.254], 28 | [0.204, 0.775], 29 | [0.512, 0.745], 30 | [0.498, 0.287], 31 | [0.251, 0.557], 32 | [0.502, 0.523], 33 | [0.119, 0.687], 34 | [0.495, 0.924], 35 | [0.612, 0.851]]) 36 | 37 | y = np.array([-1,1,1,1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,1,-1,1,1]) 38 | C = 50 39 | 40 | # Create SVC model and fit it the the training data 41 | model = SVC(C=C, kernel='linear') 42 | model.fit(x, y) 43 | 44 | # parameters 45 | w = model.coef_[0] 46 | b = model.intercept_[0] 47 | 48 | # Visualize the data points 49 | plt.figure(figsize=(7,7)) 50 | color= ['red' if a == 1 else 'blue' for a in y] 51 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 52 | plt.xlim(0, 1) 53 | plt.ylim(0, 1) 54 | 55 | # Visualize the decision boundary 56 | x1_dec = np.linspace(0, 1, 50).reshape(-1, 1) 57 | x2_dec = -(w[0] / w[1]) * x1_dec - b / w[1] 58 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 59 | 60 | # Visualize the positive & negative boundary 61 | w_norm = np.sqrt(np.sum(w ** 2)) 62 | w_unit = w / w_norm 63 | half_margin = 1 / w_norm 64 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 65 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 66 | 67 | plt.plot(upper[:, 0], upper[:, 
1], '--', lw=1.0, label='positive boundary') 68 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 69 | 70 | # display slack variables, slack variable = max(0, 1 - y(wx + b)) 71 | y_hat = np.dot(w, x.T) + b 72 | slack = np.maximum(0, 1 - y * y_hat) 73 | for s, (x1, x2) in zip(slack, x): 74 | plt.annotate(str(s.round(2)), (x1-0.02, x2 + 0.03)) 75 | 76 | # Visualize support vectors. 77 | sv = model.support_vectors_ 78 | plt.scatter(sv[:, 0], sv[:, 1], s=30, c='white') 79 | 80 | plt.title('C = ' + str(C) + ', Σξ = ' + str(np.sum(slack).round(2))) 81 | plt.legend() 82 | plt.show() 83 | 84 | -------------------------------------------------------------------------------- /6.SVM/4.linearSVC(soft_margin).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-04] 4.linearSVC(soft_margin).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/LdOcJfJTcwU 10 | # 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from sklearn.svm import LinearSVC 14 | 15 | # training data 16 | x = np.array([[0.2, 0.869], 17 | [0.687, 0.212], 18 | [0.822, 0.411], 19 | [0.738, 0.694], 20 | [0.176, 0.458], 21 | [0.306, 0.753], 22 | [0.936, 0.413], 23 | [0.215, 0.410], 24 | [0.612, 0.375], 25 | [0.784, 0.602], 26 | [0.612, 0.554], 27 | [0.357, 0.254], 28 | [0.204, 0.775], 29 | [0.512, 0.745], 30 | [0.498, 0.287], 31 | [0.251, 0.557], 32 | [0.502, 0.523], 33 | [0.119, 0.687], 34 | [0.495, 0.924], 35 | [0.612, 0.851]]) 36 | 37 | y = np.array([-1,1,1,1,-1,-1,1,-1,1,1,-1,1,-1,1,-1,-1,1,-1,1,1]) 38 | 39 | C = 50 40 | model = LinearSVC(penalty='l2', loss='hinge', C=C) 41 | # model = LinearSVC(penalty='l2', loss='squared_hinge', C=C) 42 | model.fit(x, y) 43 | 44 | # parameters 45 | w = model.coef_[0] 46 | b = model.intercept_[0] 47 | 48 | # Visualize the data points 49 | plt.figure(figsize=(7,7)) 50 | color= ['red' if a == 1 else 'blue' for a in y] 51 | plt.scatter(x[:, 0], x[:, 1], s=200, c=color, alpha=0.7) 52 | plt.xlim(0, 1) 53 | plt.ylim(0, 1) 54 | 55 | # Visualize the decision boundary 56 | x1_dec = np.linspace(0, 1, 50).reshape(-1, 1) 57 | x2_dec = -(w[0] / w[1]) * x1_dec - b / w[1] 58 | plt.plot(x1_dec, x2_dec, c='black', lw=1.0, label='decision boundary') 59 | 60 | # Visualize the positive & negative boundary 61 | w_norm = np.sqrt(np.sum(w ** 2)) 62 | w_unit = w / w_norm 63 | half_margin = 1 / w_norm 64 | upper = np.hstack([x1_dec, x2_dec]) + half_margin * w_unit 65 | lower = np.hstack([x1_dec, x2_dec]) - half_margin * w_unit 66 | 67 | plt.plot(upper[:, 0], upper[:, 1], '--', lw=1.0, label='positive boundary') 68 | plt.plot(lower[:, 0], lower[:, 1], '--', lw=1.0, label='negative boundary') 69 | 70 | # display slack variables, slack variable = max(0, 1 - y(wx + b)) 71 | y_hat = np.dot(w, x.T) + b 72 | slack = np.maximum(0, 1 - y * y_hat) 73 | for s, (x1, x2) in zip(slack, x): 74 | plt.annotate(str(s.round(2)), (x1-0.02, x2 + 0.03)) 75 | 76 | # Visualize support vectors. 
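# LinearSVC (liblinear) does not expose support_vectors_ the way SVC does, so
# the next line recovers the points lying on or between the two margin
# boundaries from |w.x + b| <= 1 (a quick count of them, for illustration):
print('points on or between the margin boundaries:', np.sum(np.abs(y_hat) <= 1.0))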
77 | sv = x[np.where(np.abs(y_hat) <= 1.0)[0]] 78 | plt.scatter(sv[:, 0], sv[:, 1], s=30, c='white') 79 | 80 | plt.title('C = ' + str(C) + ', Σξ = ' + str(np.sum(slack).round(2))) 81 | plt.legend() 82 | plt.show() 83 | 84 | # Hinge & squared hinge loss plot for [+] samples (y = +1) 85 | x_rand = np.random.rand(100, 2) 86 | y_rand = np.dot(w, x_rand.T) + b # y_hat for x_rand 87 | s_rand = np.maximum(0, 1 - y_rand) # slack variables for y_rand 88 | 89 | sort_idx = np.argsort(y_rand) 90 | y_rand = y_rand[sort_idx] 91 | s_rand = s_rand[sort_idx] 92 | 93 | plt.plot(y_rand, s_rand, c='blue', label='Hinge loss') 94 | plt.plot(y_rand, s_rand ** 2, c='red', label='Squared hinge loss') 95 | plt.legend() 96 | plt.axvline(x=0, lw=1) 97 | plt.axvline(x=1, lw=1) 98 | plt.xlabel('y_hat') 99 | plt.ylabel('ξ') 100 | plt.ylim(0, 4) 101 | plt.title('Hinge & squared hinge loss for (+) sample') 102 | plt.show() 103 | -------------------------------------------------------------------------------- /6.SVM/5.check_kernel.py: -------------------------------------------------------------------------------- 1 | # [MXML-6-05] 5.check_kernel.py 2 | # For arbitrary real data, if the eigenvalues of the kernel 3 | # matrix (K) are all non-negative, then K is positive semi-definite 4 | # (PSD) and is a valid kernel function. 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/NiuJihA05Ds 13 | # 14 | import numpy as np 15 | 16 | # random dataset (2-dims) 17 | x = np.random.rand(100, 2) 18 | n = x.shape[0] 19 | 20 | # kernel functions 21 | rbf_kernel = lambda a, b: np.exp(-np.linalg.norm(a - b)**2 / 2) 22 | pol_kernel = lambda a, b: (1 + np.dot(a, b)) ** 2 23 | sig_kernel = lambda a, b: np.tanh(3 * np.dot(a, b) + 5) 24 | cos_kernel = lambda a, b: np.cos(np.dot(a, b)) 25 | kernels = [rbf_kernel, pol_kernel, sig_kernel, cos_kernel] 26 | names = ['RBF', 'Polynomial', 'Sigmoid', 'Cos'] 27 | 28 | for kernel, name in zip(kernels, names): 29 | # Kernel matrix (Gram matrix). 30 | K = np.array([kernel(x[i], x[j]) 31 | for i in range(n) 32 | for j in range(n)]).reshape(n, n) 33 | 34 | # Find eigenvalues, eigenvectors 35 | w, v = np.linalg.eig(K) 36 | 37 | # The function defined above is a valid kernel if all 38 | # eigenvalues of K are non-negative. 39 | print('\nKernel : ' + name) 40 | print('max eigenvalue =', w.max().round(3)) 41 | print('min eigenvalue =', w.min().round(8)) 42 | 43 | if w.min().real > -1e-8: 44 | print('==> valid kernel') 45 | else: 46 | print('==> invalid kernel') 47 | 48 | -------------------------------------------------------------------------------- /6.SVM/6.cvxopt(kernel_trick).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-06] 6.cvxopt(kernel_trick).py 2 | # Implement nonlinear SVM using CVXOPT 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/-WVI6b19pag 11 | # 12 | import numpy as np 13 | from cvxopt import matrix as cvxopt_matrix 14 | from cvxopt import solvers as cvxopt_solvers 15 | import matplotlib.pyplot as plt 16 | 17 | # 4 data samples. 
2 ‘+’ samples, 2 ‘-’ samples 18 | x = np.array([[0., 1.], [1., 1.], [1., 0.], [0., 0.]]) 19 | y = np.array([[-1.], [1.], [-1.], [1.]]) 20 | 21 | # kernel function 22 | def kernel(a, b, p=3, r=0.5, type="rbf"): 23 | if type == "poly": 24 | return (1 + np.dot(a, b)) ** p 25 | else: 26 | return np.exp(-r * np.linalg.norm(a - b)**2) 27 | 28 | C = 1.0 # regularization constant 29 | N = x.shape[0] # the number of data points 30 | k_type = "poly" # kernel type: poly or rbf 31 | 32 | # Kernel matrix. k(xi, xj) = φ(xi)φ(xj). 33 | K = np.array([kernel(x[i], x[j], type=k_type) 34 | for i in range(N) 35 | for j in range(N)]).reshape(N, N) 36 | 37 | # Construct the matrices required for QP in standard form. 38 | H = np.outer(y, y) * K 39 | P = cvxopt_matrix(H) 40 | q = cvxopt_matrix(np.ones(N) * -1) 41 | A = cvxopt_matrix(y.reshape(1, -1)) 42 | b = cvxopt_matrix(np.zeros(1)) 43 | 44 | g = np.vstack([-np.eye(N), np.eye(N)]) 45 | G = cvxopt_matrix(g) 46 | 47 | h1 = np.hstack([np.zeros(N), np.ones(N) * C]) 48 | h = cvxopt_matrix(h1) 49 | 50 | # solver parameters 51 | cvxopt_solvers.options['abstol'] = 1e-10 52 | cvxopt_solvers.options['reltol'] = 1e-10 53 | cvxopt_solvers.options['feastol'] = 1e-10 54 | 55 | # Perform QP 56 | sol = cvxopt_solvers.qp(P, q, G, h, A, b) 57 | 58 | # the solution to the QP, λ 59 | lamb = np.array(sol['x']) 60 | 61 | # Find support vectors 62 | sv_i = np.where(lamb > 1e-5)[0] 63 | sv_m = lamb[sv_i] # lambda 64 | sv_x = x[sv_i] 65 | sv_y = y[sv_i] 66 | 67 | # Calculate b using the support vectors and calculate the average. 68 | def cal_wphi(cond): 69 | wphi = [] 70 | idx = np.where(cond)[0] 71 | for i in idx: 72 | wp = [sv_m[j] * sv_y[j] * kernel(sv_x[i], sv_x[j], type=k_type) \ 73 | for j in range(sv_x.shape[0])] 74 | wphi.append(np.sum(wp)) 75 | return wphi 76 | 77 | b = -(np.max(cal_wphi(sv_y > 0)) + np.min(cal_wphi(sv_y < 0))) / 2. 78 | 79 | # Predict the class of test data. 80 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 81 | n_test = x_test.shape[0] 82 | n_sv = sv_x.shape[0] 83 | ts_K = np.array([kernel(sv_x[i], x_test[j], type=k_type) 84 | for i in range(n_sv) 85 | for j in range(n_test)]).reshape(n_sv, n_test) 86 | 87 | # decision function 88 | y_hat = np.sum(sv_m * sv_y * ts_K, axis=0).reshape(-1, 1) + b 89 | y_pred = np.sign(y_hat) 90 | 91 | # Visualize test data and classes. 92 | plt.figure(figsize=(5,5)) 93 | test_c = ['red' if a == 1 else 'blue' for a in y_pred] 94 | sv_c = ['red' if a == 1 else 'blue' for a in sv_y] 95 | plt.scatter(x_test[:, 0], x_test[:, 1], s=30, c=test_c, alpha=0.3) 96 | plt.scatter(sv_x[:, 0], sv_x[:, 1], s=100, marker='D', c=sv_c, ec='black', lw=2) 97 | plt.axhline(y=0, lw=1) 98 | plt.axvline(x=0, lw=1) 99 | plt.show() 100 | 101 | -------------------------------------------------------------------------------- /6.SVM/7.SVC(kernel_trick).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-06] 7.SVC(kernel_trick).py 2 | # Implement nonlinear SVM using SVC. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/-WVI6b19pag 11 | # 12 | import numpy as np 13 | from sklearn.svm import SVC 14 | import matplotlib.pyplot as plt 15 | 16 | # 4 data samples. 
2 ‘+’ samples, 2 ‘-’ samples 17 | x = np.array([[0., 1.], [1., 1.], [1., 0.], [0., 0.]]) 18 | y = np.array([-1., 1., -1., 1.]) 19 | 20 | C = 1.0 21 | # model = SVC(C=C, kernel='rbf', gamma=0.5) 22 | model = SVC(C=C, kernel='poly', degree=3) 23 | model.fit(x, y) 24 | 25 | # Intercept (b) 26 | # w = model.coef_[0] 27 | # AttributeError: coef_ is only available when using a linear kernel 28 | b = model.intercept_[0] 29 | 30 | # Predict the class of test data. 31 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 32 | 33 | # decision function 34 | y_hat = model.decision_function(x_test) 35 | y_pred = np.sign(y_hat) 36 | # y_pred = model.predict(x_test) # It is the same as above. 37 | 38 | # Visualize test data and classes. 39 | plt.figure(figsize=(5,5)) 40 | test_c = ['red' if a == 1 else 'blue' for a in y_pred] 41 | plt.scatter(x_test[:, 0], x_test[:, 1], s=30, c=test_c, alpha=0.3) 42 | plt.scatter(x[:, 0], x[:, 1], s=100, marker='D', c='white', ec='black', lw=2) 43 | plt.axhline(y=0, lw=1) 44 | plt.axvline(x=0, lw=1) 45 | plt.show() 46 | 47 | -------------------------------------------------------------------------------- /6.SVM/9.multiclass(OvO).py: -------------------------------------------------------------------------------- 1 | # [MXML-6-07] 9.multiclass(OvO).py 2 | # Implement multiclass classification of SVM by One-vs-One (OvO) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/MAde_oEYB-g 11 | # 12 | import numpy as np 13 | from sklearn.svm import SVC 14 | import matplotlib.pyplot as plt 15 | from sklearn.datasets import make_blobs 16 | from itertools import combinations 17 | 18 | # Generate the data with 4 clusters. 19 | x, y = make_blobs(n_samples=400, n_features=2, 20 | centers=[[0., 0.], [0.5, 0.5], [1., 0.], [-0.8, 0.]], 21 | cluster_std=0.17) 22 | 23 | # Linear SVM 24 | C = 1.0 25 | model = SVC(C=C, kernel='linear', decision_function_shape='ovo') 26 | model.fit(x, y) 27 | 28 | w = model.coef_ 29 | b = model.intercept_ 30 | print("w:\n ", w.round(3)) # shape=(6,2) 31 | print("\nb:\n ", b.round(3)) # shape=(6,) 32 | 33 | # Visualize the data and six boundaries. 34 | plt.figure(figsize=(8,7)) 35 | colors = ['red', 'blue', 'green', 'black'] 36 | y_color= [colors[a] for a in y] 37 | for label in model.classes_: 38 | idx = np.where(y == label) 39 | plt.scatter(x[idx, 0], x[idx, 1], s=100, c=colors[label], 40 | alpha=0.5, label='class_' + str(label)) 41 | 42 | # Visualize six boundaries. 43 | comb = list(combinations(model.classes_, 2)) 44 | x1_dec = np.linspace(-2.0, 2.0, 50).reshape(-1, 1) 45 | for i in range(w.shape[0]): 46 | x2_dec = -(w[i, 0] * x1_dec + b[i]) / w[i, 1] 47 | plt.plot(x1_dec, x2_dec, label=str(comb[i])) 48 | plt.xlim(-1.5, 1.8) 49 | plt.ylim(-0.7, 1.) 50 | plt.legend() 51 | plt.show() 52 | 53 | # Predict the classes of the test data. 54 | x_test = np.random.uniform(-1.5, 1.5, (2000, 2)) 55 | y_pred1 = model.predict(x_test) 56 | 57 | # To understand how OvO works, let's manually implement the 58 | # process of model.predict(x_test). 
df.shape = (2000, 6) 59 | df = np.dot(x_test, w.T) + b # decision function 60 | # df = model.decision_function(x_test) # same as above 61 | 62 | classes = model.classes_ 63 | n_class = classes.shape[0] 64 | 65 | # Reference: https://stackoverflow.com/questions/20113206/scikit-learn-svc-decision-function-and-predict 66 | y_pred = [] 67 | for i in range(df.shape[0]): 68 | votes = np.zeros(n_class) 69 | for j in range(df.shape[1]): # the number of boundaries 70 | # if df(i, j) > 0, then class=i, else class=j 71 | if df[i][j] > 0: 72 | votes[comb[j][0]] += 1 73 | else: 74 | votes[comb[j][1]] += 1 75 | 76 | v = np.argmax(votes) # majority vote 77 | y_pred.append(classes[v]) 78 | y_pred2 = np.array(y_pred) 79 | 80 | # Compare the results of y_pred1 and y_pred2. 81 | if (y_pred1 != y_pred2).sum() == 0: 82 | print("# y_pred1 and y_pred2 are exactly the same.") 83 | else: 84 | print("# y_pred1 and y_pred2 are not the same.") 85 | 86 | # Visualize test data and y_pred1 87 | plt.figure(figsize=(8,7)) 88 | y_color= [colors[a] for a in y_pred1] 89 | for label in model.classes_: 90 | idx = np.where(y_pred1 == label) 91 | plt.scatter(x_test[idx, 0], x_test[idx, 1], s=100, c=colors[label], 92 | alpha=0.3, label='class_' + str(label)) 93 | 94 | plt.xlim(-1.5, 1.8) 95 | plt.ylim(-0.7, 1.) 96 | plt.show() 97 | 98 | -------------------------------------------------------------------------------- /7.KMeans/2.sklearn(kmeans).py: -------------------------------------------------------------------------------- 1 | # [MXML-7-02] 2.sklearn(kmeans).py 2 | # 3 | # This code was used in the machine learning online 4 | # course provided by 5 | # www.youtube.com/@meanxai 6 | # www.github.com/meanxai/machine_learning 7 | # 8 | # A detailed description of this code can be found in 9 | # https://youtu.be/hToqjr5Kx4Q 10 | # 11 | import numpy as np 12 | from sklearn.datasets import make_blobs 13 | from sklearn.cluster import KMeans 14 | import matplotlib.pyplot as plt 15 | 16 | # Generate training data 17 | x, y = make_blobs(n_samples=300, n_features=2, 18 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 19 | cluster_std=0.1, center_box=(-1., 1.)) 20 | 21 | K = 3 # the number of clusters 22 | M = 10 # the number of iterations 23 | L = 10 # the number of attempts to prevent local minimum problem. 24 | 25 | model = KMeans(n_clusters = K, # the number of clusters 26 | init='random', # randomly initialize centroids 27 | max_iter=M, # max iterations 28 | n_init = L) # Number of times the k-means algorithm 29 | # is run with different centroid seeds. 30 | 31 | model.fit(x) 32 | 33 | # Visualize training data and clusters color-coded. 34 | def plot_cluster(x, cluster, centroid): 35 | plt.figure(figsize=(5, 5)) 36 | color = [['red', 'blue', 'green'][a] for a in cluster] 37 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 38 | plt.scatter(centroid[:, 0], centroid[:, 1], s=500, c='white') 39 | plt.scatter(centroid[:, 0], centroid[:, 1], s=250, c='black') 40 | plt.scatter(centroid[:, 0], centroid[:, 1], s=80, c='yellow') 41 | plt.show() 42 | 43 | # Visualize the training result. 
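# model.labels_ holds the cluster index assigned to each training point,
# and model.cluster_centers_ holds the coordinates of the K centroids
# (an array of shape (K, 2) for this 2-feature dataset).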
44 | plot_cluster(x, model.labels_, model.cluster_centers_) 45 | 46 | # print the final error 47 | # Sum of squared distances of samples to their closest cluster center 48 | print('\nerror = {:.4f}'.format(model.inertia_)) 49 | 50 | -------------------------------------------------------------------------------- /7.KMeans/3.kmeans(plus).py: -------------------------------------------------------------------------------- 1 | # [MXML-7-03] 3.kmeans(plus).py 2 | import numpy as np 3 | import random as rd 4 | from sklearn.datasets import make_blobs 5 | import matplotlib.pyplot as plt 6 | 7 | # Generate training data points. 8 | x, y = make_blobs(n_samples=300, n_features=2, 9 | centers=[[0., 0.], [0.25, 0.5], [0.5, 0.]], 10 | cluster_std=0.1, center_box=(-1., 1.)) 11 | 12 | N = x.shape[0] # the number of data points 13 | K = 3 # the number of data clusters 14 | M = 10 # the number of data iterations 15 | 16 | # Visualize the data points, x. 17 | def plot_data(x): 18 | plt.figure(figsize=(5, 5)) 19 | plt.scatter(x[:, 0], x[:, 1], s=30, c='black', alpha=0.5) 20 | plt.show() 21 | 22 | # Visualize training data points and clusters color-coded. 23 | def plot_cluster(x, cluster, centroid): 24 | plt.figure(figsize=(5, 5)) 25 | color = [['red', 'blue', 'green'][a] for a in cluster] 26 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 27 | plt.scatter(centroid[:, 0], centroid[:, 1], s=500, c='white') 28 | plt.scatter(centroid[:, 0], centroid[:, 1], s=250, c='black') 29 | plt.scatter(centroid[:, 0], centroid[:, 1], s=80, c='yellow') 30 | plt.show() 31 | 32 | plot_data(x) 33 | 34 | # Generate initial centroids using the K-Means++ algorithm. 35 | xp = x.copy() 36 | centroids = [] 37 | density = np.ones(xp.shape[0]) / N 38 | for c in range(K): 39 | # (1) Choose an initial centroid c(1) uniformly at random from X 40 | # (2) Choose the next centroid c(i), selecting c (i) = x' ∈ X with probability 41 | idx = rd.choices(np.arange(xp.shape[0]), weights=density, k=1)[0] 42 | centroids.append(xp[idx]) 43 | xp = np.delete(xp, idx, axis=0) 44 | 45 | # Create a distance matrix between data points xp and the centroids. 46 | # Please refer to the video [MXML-7-02] for how to create a distance matrix. 47 | x_exp = xp[np.newaxis, :, :] 48 | c_exp = np.array(centroids)[:, np.newaxis, :] 49 | dist = np.sqrt(np.sum(np.square(x_exp - c_exp), axis=2)) 50 | 51 | # Find the centroid closest to each data point. 52 | assign = np.argmin(dist, axis=0) 53 | 54 | # Calculate D(x) 55 | # let D(x) denote the shortest distance from a data point x to 56 | # the closest centroid we have already chosen 57 | Dx = np.sum(np.square(xp - np.array(centroids)[assign]), axis=1) 58 | 59 | # Create a probability density function to select the next centroid. 60 | density = Dx / np.sum(Dx) 61 | 62 | centroids = np.array(centroids) 63 | 64 | # Perform the K-Means algorithm using the centroids generated by K-Means++. 65 | error = [] 66 | for m in range(M): 67 | # Calculate the distances between the training data points and the centroids. 68 | x_exp = x[np.newaxis, :, :] 69 | c_exp = centroids[:, np.newaxis, :] 70 | dist = np.sqrt(np.sum(np.square(x_exp - c_exp), axis=2)) 71 | 72 | # Assign each data point to the nearest centroid. 73 | assign = np.argmin(dist, axis=0) # shape = (N,) 74 | 75 | # update centroids 76 | new_cent = [] 77 | err = 0 78 | for c in range(K): 79 | # Find the data points assigned to centroid c. 80 | idx = np.where(assign == c) 81 | x_idx = x[idx] 82 | 83 | # To measure clustering performance, calculate the error. 
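# The error accumulated below is the within-cluster sum of squared
# distances to the current centroids, i.e. the same quantity that
# scikit-learn reports as inertia_ in 2.sklearn(kmeans).py.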
84 | err += np.sum(np.sum(np.square(x_idx - centroids[c]), axis=1)) 85 | 86 | # Compute the average coordinates of the data points 87 | # assigned to this centroid. And use that as new centroid. 88 | new_cent.append(np.mean(x_idx, axis=0)) 89 | 90 | error.append(err) 91 | 92 | # Remove the if statement to see the centroids moving. 93 | if m == 0: 94 | plot_cluster(x, assign, centroids) 95 | 96 | # Update centroids 97 | centroids = np.array(new_cent) 98 | 99 | # Visualize the training result. 100 | plot_cluster(x, assign, centroids) 101 | 102 | # Visualize error history 103 | plt.plot(error, 'o-') 104 | plt.title('final error =' + str(np.round(error[-1], 2))) 105 | plt.show() 106 | 107 | # Check the cluster number for each data point. 108 | import pandas as pd 109 | df = pd.DataFrame({'x1': x[:,0], 'x2': x[:,1], 'cluster': assign}) 110 | print(df.head(10)) 111 | -------------------------------------------------------------------------------- /7.KMeans/4.sklearn(mnist).py: -------------------------------------------------------------------------------- 1 | # [MXML-7-03] 4.sklearn(mnist).py 2 | # MNIST clustering 3 | # This code can be found at github.com/meanxai/machine_learning. 4 | import numpy as np 5 | from sklearn.cluster import KMeans 6 | import matplotlib.pyplot as plt 7 | import pickle 8 | 9 | # from sklearn.datasets import fetch_openml 10 | # mnist = fetch_openml('mnist_784') 11 | # mnist.pkl is the saved mnist. 12 | with open('data/mnist.pkl', 'rb') as f: 13 | mnist = pickle.load(f) 14 | 15 | # Use only 10,000 data points and normalize them between 0 and 1 16 | x = np.array(mnist['data'][:10000]) / 255. 17 | 18 | # Cluster the data points into 10 groups using K-Means++. 19 | model = KMeans(n_clusters=10, 20 | init='k-means++', # default 21 | max_iter = 50, 22 | n_init = 5) 23 | 24 | model.fit(x) 25 | clust = model.predict(x) 26 | centroids = model.cluster_centers_ 27 | 28 | # Check out the images for each cluster. 29 | for k in np.unique(clust): 30 | # Find 10 images belonging to cluster k, and centroid image. 31 | idx = np.where(clust == k)[0] 32 | images = x[idx[:10]] 33 | centroid = centroids[k, :] 34 | 35 | # Find 10 images closest to each centroid image. 36 | # d = np.sqrt(np.sum((x[idx] - centroid)**2, axis=1)) 37 | # nearest = np.argsort(d)[:10] 38 | # images = x[idx[nearest]] 39 | 40 | 41 | # display the central image 42 | f = plt.figure(figsize=(8, 2)) 43 | image = centroid.reshape(28, 28) 44 | ax = f.add_subplot(1, 11, 1) 45 | ax.imshow(image, cmap=plt.cm.bone) 46 | ax.grid(False) 47 | ax.set_title("C") 48 | ax.xaxis.set_ticks([]) 49 | ax.yaxis.set_ticks([]) 50 | plt.tight_layout() 51 | 52 | # display 10 images belonging to the centroid 53 | for i in range(10): 54 | image = images[i].reshape(28,28) 55 | ax = f.add_subplot(1, 11, i + 2) 56 | ax.imshow(image, cmap=plt.cm.bone) 57 | ax.grid(False) 58 | ax.set_title(k) 59 | ax.xaxis.set_ticks([]) 60 | ax.yaxis.set_ticks([]) 61 | plt.tight_layout() 62 | -------------------------------------------------------------------------------- /8.RandomForest/1.RF(titanic).py: -------------------------------------------------------------------------------- 1 | # [MXML-8-02]: 1.RF(titanic).py 2 | # Implement Random Forest using MyDtreeClassifierRF. 
3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/9si5fELmtg0 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | from MyDTreeClassifierRF import MyDTreeClassifierRF 15 | from sklearn.model_selection import train_test_split 16 | 17 | # Read preprocessed Titanic data. 18 | df = pd.read_csv('data/titanic_clean.csv') 19 | 20 | # Survived Pclass Sex Age SibSp Parch Fare Embarked Title 21 | # 0 3 1 22.0 1 0 3.62 3 2 22 | # 1 1 0 38.0 1 0 35.64 0 3 23 | # 1 3 0 26.0 0 0 7.92 3 1 24 | # 1 1 0 35.0 1 0 26.55 3 3 25 | # 0 3 1 35.0 0 0 8.05 3 2 26 | 27 | y = np.array(df['Survived']) 28 | x = np.array(df.drop('Survived', axis=1)) 29 | x_train, x_test, y_train, y_test = train_test_split(x, y) 30 | 31 | n_estimators = 100 32 | n_features = round(np.sqrt(x.shape[1])) # the number of features for column sampling 33 | n_depth = 3 # max_depth of tree 34 | 35 | models = [] # base model list 36 | for i in range(n_estimators): 37 | # Create a tree for Random Forest 38 | model = MyDTreeClassifierRF(max_depth=n_depth, 39 | max_samples = x_train.shape[0], 40 | max_features=n_features) 41 | 42 | # train the tree. 43 | # subsampling by rows and columns is performed within the model 44 | model.fit(x_train, y_train) 45 | 46 | # save trained tree 47 | models.append(model) 48 | 49 | # prediction 50 | y_estimates = np.zeros(shape=(x_test.shape[0], n_estimators)) 51 | for i, model in enumerate(models): 52 | y_estimates[:, i] = model.predict(x_test) 53 | 54 | # synthesizing the estimation results 55 | y_prob = y_estimates.mean(axis=1) 56 | y_pred = (y_prob >= 0.5) * 1 57 | print('\nAccuracy = {:.4f}'.format((y_pred == y_test).mean())) 58 | 59 | models 60 | y_estimates.shape 61 | y_estimates 62 | y_estimates[0, :] 63 | (y_estimates[0, :] == 0.0).sum() 64 | (y_estimates[0, :] == 1.0).sum() 65 | y_prob[0] 66 | y_pred[0] 67 | -------------------------------------------------------------------------------- /8.RandomForest/2.RF(sklearn).py: -------------------------------------------------------------------------------- 1 | # [MXML-8-02]: 2.RF(sklearn).py 2 | # Implement Random Forest using scikit-learn. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/9si5fELmtg0 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.ensemble import RandomForestClassifier 16 | from sklearn.model_selection import train_test_split 17 | 18 | # Read preprocessed Titanic data. 
19 | df = pd.read_csv('data/titanic_clean.csv') 20 | y = np.array(df['Survived']) 21 | x = np.array(df.drop('Survived', axis=1)) 22 | x_train, x_test, y_train, y_test = train_test_split(x, y) 23 | 24 | n_estimators = 100 25 | n_depth = 3 # max_depth of tree 26 | 27 | # Implement Random Forest using DecisionTreeClassifier 28 | models = [] # base model list 29 | n = x_train.shape[0] # the number of train data points 30 | for i in range(n_estimators): 31 | # row subsampling 32 | i_row = np.random.choice(np.arange(0, n), n, replace=True) 33 | x_sample = x_train[i_row, :] 34 | y_sample = y_train[i_row] 35 | 36 | # Create a tree for Random Forest 37 | # Column subsampling for each split is performed within the model. 38 | model = DecisionTreeClassifier(max_depth=n_depth, 39 | max_features="sqrt") 40 | 41 | # train the tree 42 | model.fit(x_sample, y_sample) 43 | 44 | # save trained tree 45 | models.append(model) 46 | 47 | # prediction 48 | y_estimates = np.zeros(shape=(x_test.shape[0], n_estimators)) 49 | for i, model in enumerate(models): 50 | y_estimates[:, i] = model.predict(x_test) 51 | 52 | # synthesizing the estimation results 53 | y_prob = y_estimates.mean(axis=1) 54 | y_pred = (y_prob >= 0.5) * 1 55 | print('\nAccuracy1 = {:.4f}'.format((y_pred == y_test).mean())) 56 | 57 | # Implement Random Forest using RandomForestClassifier 58 | model = RandomForestClassifier(n_estimators=n_estimators, 59 | max_depth=n_depth, 60 | max_samples=n, # default 61 | max_features="sqrt") # default 62 | model.fit(x_train, y_train) 63 | y_pred = model.predict(x_test) 64 | print('\nAccuracy2 = {:.4f}'.format((y_pred == y_test).mean())) 65 | 66 | model.estimators_ 67 | # [DecisionTreeClassifier(max_depth=3, max_features='sqrt', 68 | # random_state=1090277217), 69 | # DecisionTreeClassifier(max_depth=3, max_features='sqrt', 70 | # random_state=1758239483), 71 | # DecisionTreeClassifier(max_depth=3, max_features='sqrt', 72 | # random_state=1420256802) 73 | # ... -------------------------------------------------------------------------------- /8.RandomForest/3.RF_OOB.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-03] 3.RF_OOB.py 2 | # Add Out-Of-Bag (OOB) score feature to 2.RF(titanic).py. 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/DFh7BefJpfQ 11 | # 12 | import numpy as np 13 | import pandas as pd 14 | from MyDTreeClassifierRF import MyDTreeClassifierRF 15 | from sklearn.model_selection import train_test_split 16 | import matplotlib.pyplot as plt 17 | 18 | # Read preprocessed Titanic data. 
19 | df = pd.read_csv('data/titanic_clean.csv') 20 | 21 | # Survived Pclass Sex Age SibSp Parch Fare Embarked Title 22 | # 0 3 1 22.0 1 0 3.62 3 2 23 | # 1 1 0 38.0 1 0 35.64 0 3 24 | # 1 3 0 26.0 0 0 7.92 3 1 25 | # 1 1 0 35.0 1 0 26.55 3 3 26 | # 0 3 1 35.0 0 0 8.05 3 2 27 | 28 | y = np.array(df['Survived']) 29 | x = np.array(df.drop('Survived', axis=1)) 30 | x_train, x_test, y_train, y_test = train_test_split(x, y) 31 | 32 | N = x_train.shape[0] # the number of train data points 33 | n_estimators = 50 34 | n_depth = 5 # max_depth of tree 35 | max_features = round(np.sqrt(x_train.shape[1])) # for column sub sampling 36 | 37 | # majority vote for iob_pred, or oob_pred 38 | # p = iob_pred or oob_pred 39 | def majority_vote(p): 40 | cnt_0 = (p == 0).sum(axis=1) 41 | cnt_1 = (p == 1).sum(axis=1) 42 | cnts = np.array([cnt_0, cnt_1]) # shape = (2, 668) 43 | return np.argmax(cnts, axis=0) 44 | 45 | models = [] # base model list 46 | iob_score = [] # Error rate measured with IOB 47 | oob_score = [] # Error rate measured with OOB 48 | 49 | # initialize IOB and OOB prediction map 50 | iob_pred = np.ones(shape=(N, n_estimators)) * -1 51 | oob_pred = np.ones(shape=(N, n_estimators)) * -1 52 | 53 | # Create n_estimators models 54 | for i in range(n_estimators): 55 | # Create a Decision Tree for Random Forest 56 | model = MyDTreeClassifierRF(max_depth=n_depth, 57 | max_samples = N, 58 | max_features = max_features) 59 | 60 | # train 61 | p1, p2 = model.fit(x_train, y_train) 62 | 63 | # save trained tree 64 | models.append(model) 65 | 66 | # Create IOB and OOB prediction map 67 | iob_pred[:, i] = p1 68 | oob_pred[:, i] = p2 69 | 70 | # Calculate IOB and OOB score 71 | y_trn = majority_vote(iob_pred) 72 | y_oob = majority_vote(oob_pred) 73 | 74 | iob_score.append((y_trn != y_train).mean()) 75 | oob_score.append((y_oob != y_train).mean()) 76 | 77 | # Visualize IOB and OOB score 78 | plt.figure(figsize=(6, 4)) 79 | plt.plot(iob_score, color='blue', lw=1.0, label='IOB error') 80 | plt.plot(oob_score, color='red', lw=1.0, label='OOB error') 81 | plt.legend() 82 | plt.xlabel('n_estimators') 83 | plt.ylabel('OOB error rate') 84 | plt.show() 85 | 86 | # prediction 87 | y_estimates = np.zeros(shape=(x_test.shape[0], n_estimators)) 88 | for i, model in enumerate(models): 89 | y_estimates[:, i] = model.predict(x_test) 90 | 91 | # synthesizing the estimation results 92 | y_prob = y_estimates.mean(axis=1) 93 | y_pred = (y_prob >= 0.5) * 1 94 | accuracy = (y_pred == y_test).mean() 95 | print('\nAccuracy of test data = {:.4f}'.format(accuracy)) 96 | print('Final OOB error rate = {:.4f}'.format(oob_score[-1])) 97 | 98 | # OOB probability 99 | # In theory, it would be 0.3679. 100 | # This means that x_train is selected with probability 0.6321 101 | # by row subsampling. 
(1.0 - 0.3679 = 0.6321) 102 | oob_percent = ((oob_pred >= 0).sum(axis=0) / N).mean() 103 | print('OOB probability = {:.4f}'.format(oob_percent)) 104 | -------------------------------------------------------------------------------- /8.RandomForest/6.RF_outlier.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-06] 6.RF_outlier.py 2 | # Outlier detection using Random Forest’s proximity matrix 3 | # Reference [2]: 4 | # https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#outliers 5 | # 6 | # This code was used in the machine learning online 7 | # course provided by 8 | # www.youtube.com/@meanxai 9 | # www.github.com/meanxai/machine_learning 10 | # 11 | # A detailed description of this code can be found in 12 | # https://youtu.be/ps2QXPnPHVM 13 | # 14 | import numpy as np 15 | from sklearn.datasets import make_blobs 16 | from sklearn.ensemble import RandomForestClassifier 17 | import matplotlib.pyplot as plt 18 | 19 | # Generate training data 20 | x, y = make_blobs(n_samples=600, n_features=2, 21 | centers=[[0., 0.], [0.5, 0.5]], 22 | cluster_std=0.2, center_box=(-1., 1.)) 23 | 24 | # Create Proximity matrix 25 | # normalize = 0: pm / n_tree 26 | # normalize != 0: Normalize columns to sum to 1 27 | def proximity_matrix(model, x, normalize=0): 28 | n_tree = len(model.estimators_) 29 | 30 | # Apply trees in the forest to X, return leaf indices. 31 | leaf = model.apply(x) # shape = (x.shape[0], n_tree) 32 | 33 | pm = ( 34 | (leaf[:, None, :] == leaf[None, :, :]) 35 | .sum(axis=-1) 36 | ) 37 | # # the above is equivalent to: 38 | # pm = np.zeros(shape=(x.shape[0], x.shape[0])) 39 | # for i in range(n_tree): 40 | # t = leaf[:, i] 41 | # pm += np.equal.outer(t, t) * 1. 42 | 43 | np.fill_diagonal(pm, 0) 44 | if normalize == 0: 45 | return pm / n_tree 46 | else: 47 | return pm / pm.sum(axis=0, keepdims=True) 48 | 49 | n_estimators = 50 50 | n_depth = 5 51 | 52 | # Detect outliers using a proximity matrix 53 | model = RandomForestClassifier(n_estimators=n_estimators, 54 | max_depth=n_depth, 55 | max_features="sqrt", # default 56 | bootstrap=True, # default 57 | oob_score=True) 58 | model.fit(x, y) 59 | 60 | # Create a proximity matrix 61 | pm = proximity_matrix(model, x, normalize=0) 62 | 63 | i_y0 = np.where(y == 0)[0] 64 | i_y1 = np.where(y == 1)[0] 65 | i_y = [i_y0, i_y1] 66 | 67 | # 1) average proximity 68 | pi_bar = [] 69 | for i in range(pm.shape[0]): 70 | j_class = y[i] # the class of data instance i 71 | j_same = i_y[j_class] # Data point IDs with the same class as data point i 72 | pi_bar.append(np.sum(pm[i, j_same] ** 2)) 73 | 74 | # 2) raw outlier measure 75 | o_raw = x.shape[0] / np.array(pi_bar) 76 | 77 | # 3) final outlier measure 78 | # For convenience of coding, the mean value was used instead of the 79 | # median, and the standard deviation was used instead of the absolute 80 | # deviation. 81 | f_measure = [] 82 | for i in range(o_raw.shape[0]): 83 | j_class = y[i] # the class of the data instance i 84 | j_same = i_y[j_class] # Data point IDs with the same class as data point i 85 | f_measure.append((o_raw[i] - o_raw[j_same].mean()) / o_raw[j_same].std()) 86 | 87 | # Data in the upper top_rate percentage of f_measure are considered outliers. 88 | top_rate = 0.07 # top 7% 89 | top_idx = np.argsort(f_measure)[::-1][:int(top_rate * x.shape[0])] 90 | 91 | # Visualize normal data and outliers by color. 
92 | plt.figure(figsize=(7, 7)) 93 | color = [['blue', 'red'][i] for i in y] 94 | color_out = [['blue', 'red'][i] for i in y[top_idx]] 95 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 96 | plt.scatter(x[top_idx, 0], x[top_idx, 1], s=400, c='black', alpha=0.5) # outlier scatter 97 | plt.scatter(x[top_idx, 0], x[top_idx, 1], s=200, c='white') 98 | plt.scatter(x[top_idx, 0], x[top_idx, 1], s=30, c=color_out) 99 | plt.show() 100 | -------------------------------------------------------------------------------- /8.RandomForest/7.iForest_test.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-07] 7.iForest_test.py 2 | # Implementation of Isolation Forest using ExtraTreeRegressor 3 | # sklearn's IsolationForest library makes it easy to implement 4 | # Isolation Forest, but I used ExtraTreeRegressor to better understand 5 | # how it works. 6 | # 7 | # This code was used in the machine learning online 8 | # course provided by 9 | # www.youtube.com/@meanxai 10 | # www.github.com/meanxai/machine_learning 11 | # 12 | # A detailed description of this code can be found in 13 | # https://youtu.be/JpZJoOTjMWU 14 | # 15 | from sklearn.tree import ExtraTreeRegressor 16 | import numpy as np 17 | 18 | # simple dataset 19 | x = np.array([2, 2.5, 3.8, 4.1, 10.5, 15.4], dtype=np.float32).reshape(-1, 1) 20 | n = x.shape[0] # the number of data points 21 | n_trees = 10 # the number of trees in Isolation Forest 22 | 23 | # H(i) is the harmonic number and it can be estimated 24 | # by ln(i) + 0.5772156649 (Euler’s constant). 25 | def H(n): 26 | return np.log(n) + 0.5772156649 27 | 28 | # average path length of unsuccessful search in BST 29 | def C(n): 30 | return 2 * H(n-1) - (2 * (n-1) / n) 31 | 32 | hx = np.zeros(n) 33 | for t in range(n_trees): 34 | # Create a tree using random split points 35 | model = ExtraTreeRegressor(max_depth=3, max_features=1) 36 | 37 | # Fit the model to training data. 38 | # Since it is unsupervised learning and there is no target value, 39 | # a binary tree is created by randomly generating target values. 40 | model.fit(x, np.random.uniform(size=n)) 41 | 42 | leaf_id = model.apply(x) # indices of leaf nodes 43 | 44 | # depth of each node, internal and external nodes. 
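# compute_node_depths() (added in scikit-learn 1.3) returns the depth of
# every node with the root counted as depth 1, so subtracting 1.0 below
# gives h(x), the number of edges from the root to the leaf that
# isolates each data point.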
45 | node_depth = model.tree_.compute_node_depths() 46 | 47 | # h(x): accumulated path length of data points 48 | hx += node_depth[leaf_id] - 1.0 49 | 50 | print('Tree',t,':', (hx / (t+1)).round(1)) 51 | 52 | Ehx = hx / n_trees # Average of h(x) 53 | S = 2 ** (-(Ehx / C(n))) # Anomaly scores for each data point 54 | i_out = np.argsort(S)[-2:] # Top 2 anomaly scores 55 | outliers = x[i_out] # outliers 56 | 57 | print('\nAnomaly scores:') 58 | print(S.round(3)) 59 | print('\nOutliers:') 60 | print(outliers) 61 | 62 | # import matplotlib.pyplot as plt 63 | # from sklearn import tree 64 | 65 | # plt.figure(figsize=(12, 8)) 66 | # tree.plot_tree(model) 67 | # plt.show() 68 | -------------------------------------------------------------------------------- /8.RandomForest/8.iForest_outlier.py: -------------------------------------------------------------------------------- 1 | # [MXML-8-07] 8.iForest_outlier.py 2 | # Outlier detection using Isolation Forest (iForest) 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/JpZJoOTjMWU 11 | # 12 | import numpy as np 13 | from sklearn.datasets import make_blobs 14 | from sklearn.ensemble import IsolationForest 15 | import matplotlib.pyplot as plt 16 | 17 | # Create training dataset 18 | x, y = make_blobs(n_samples=600, n_features=2, 19 | centers=[[0., 0.], [0.5, 0.5]], 20 | cluster_std=0.2, center_box=(-1., 1.)) 21 | 22 | model = IsolationForest(n_estimators = 50, contamination=0.05) 23 | model.fit(x) 24 | outlier = model.predict(x) # Normal = 1, Outlier = -1 25 | 26 | # Extract outliers 27 | i_outlier = np.where(outlier == -1)[0] 28 | x_outlier = x[i_outlier, :] 29 | 30 | # Visualize normal data points and outliers by color. 
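# The ringed points drawn below are the samples flagged as -1 by predict();
# with contamination=0.05, the score threshold is chosen so that roughly 5%
# of the training points are labeled as outliers.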
31 | plt.figure(figsize=(7, 7)) 32 | color = [['blue', 'red'][i] for i in y] 33 | color_out = [['blue', 'red'][i] for i in y[i_outlier]] 34 | plt.scatter(x[:, 0], x[:, 1], s=30, c=color, alpha=0.5) 35 | plt.scatter(x_outlier[:, 0], x_outlier[:, 1], s=400, c='black', alpha=0.5) # outlier scatter 36 | plt.scatter(x_outlier[:, 0], x_outlier[:, 1], s=200, c='white') 37 | plt.scatter(x_outlier[:, 0], x_outlier[:, 1], s=30, c=color_out) 38 | plt.show() 39 | 40 | # Check out the distribution of Anomaly score 41 | score = abs(model.score_samples(x)) 42 | score[i_outlier].min() 43 | plt.hist(score, bins = 50) 44 | plt.title('distribution of anomaly score') 45 | plt.xlabel('anomaly score') 46 | plt.ylabel('frequency') 47 | plt.axvline(x=score[i_outlier].min(), c='red') 48 | plt.show() 49 | 50 | 51 | -------------------------------------------------------------------------------- /9.AdaBoost/1.AdaBoost(binary1).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-01] 1.AdaBoost.py 2 | # [1] Yoav Freund et, al., 1999, A Short Introduction to Boosting 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/avs14cAFyHE 11 | # 12 | import numpy as np 13 | import random as rd 14 | from sklearn.datasets import make_blobs 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.model_selection import train_test_split 17 | import matplotlib.pyplot as plt 18 | 19 | # Create training data 20 | x, y = make_blobs(n_samples=200, n_features=2, 21 | centers=[[0., 0.], [0.5, 0.5]], 22 | cluster_std=0.2, center_box=(-1., 1.)) 23 | y = y * 2 - 1 # [0, 1] --> [-1, 1] 24 | 25 | m = x.shape[0] 26 | R = np.arange(m) 27 | T = 50 28 | 29 | # [1] Figure 1: The boosting algorithm AdaBoost 30 | # --------------------------------------------- 31 | # Given: (x1, y1), ..., (xm, ym) where xi ∈ X, yi ∈ Y = {-1, +1} 32 | # Initialize D1(i) = 1/m 33 | weights = [np.array(np.ones(shape=(m,)) / m)] 34 | eps = [] # epsilon history 35 | alphas = [] # alpha history 36 | models = [] # base learner models 37 | for t in range(T): 38 | # sampling according to the weights 39 | s_idx = np.array(rd.choices(R, weights=weights[-1], k=m)) 40 | sx = x[s_idx] # sample x 41 | sy = y[s_idx] # sample y 42 | 43 | # Train weak learner using distribution Dt. (Dt: weights) 44 | model = DecisionTreeClassifier(max_depth=2) 45 | model.fit(sx, sy) # fit the model to sample data 46 | 47 | # Get weak hypothesis ht : X -> {-1, +1} with error 48 | y_pred = model.predict(x) # predict entire training data 49 | i_not = np.array(y_pred != y).astype(int) # I(y_pred ≠ y) 50 | eps.append(np.sum(weights[-1] * i_not)) 51 | 52 | # Choose αt=(1/2)ln((1-εt)/εt). (α: alpha, ε: eps) 53 | # For αt to be positive, εt must be less than 0.5. 54 | # If εt is greater than 0.5, it means it is worse than a 55 | # random prediction. If so, initialize the weights to 1/m again. 56 | if eps[-1] > 0.5: 57 | weights.append(np.array(np.ones(shape=(m,)) / m)) 58 | alphas.append(0.0) 59 | print('weight re-initialized at t =', t) 60 | else: 61 | alpha = 0.5 * np.log((1 - eps[-1]) / (eps[-1] + 1e-8)) 62 | alphas.append(alpha) 63 | 64 | # Update Dt. 
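# D_(t+1)(i) = D_t(i) * exp(-alpha_t * y_i * h_t(x_i)) / Z_t, where Z_t is a
# normalization factor: the weight of a misclassified point (y_i * h_t(x_i) = -1)
# grows by a factor of exp(alpha_t), while the weight of a correctly
# classified point (y_i * h_t(x_i) = +1) shrinks by exp(-alpha_t).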
65 | new_weights = weights[-1] * np.exp(-alpha * y * y_pred) 66 | weights.append( new_weights / new_weights.sum()) # normalize 67 | 68 | models.append(model) 69 | 70 | # Output the final hypothesis: 71 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 72 | H = np.zeros(shape=x_test.shape[0]) 73 | for t in range(T): 74 | h = models[t].predict(x_test) 75 | H += alphas[t] * h 76 | 77 | y_pred = np.sign(H) 78 | 79 | # visualize training data and the sampling weights 80 | def plot_train(x, y, w): 81 | plt.figure(figsize=(5,5)) 82 | color = ['red' if a == 1 else 'blue' for a in y] 83 | plt.scatter(x[:, 0], x[:, 1], s=w*10000, c=color, alpha=0.5) 84 | plt.xlim(-0.5, 1.0) 85 | plt.ylim(-0.5, 1.0) 86 | plt.show() 87 | 88 | # visualize decision boundary 89 | def plot_boundary(x, y, x_test, y_pred): 90 | plt.figure(figsize=(5,5)) 91 | color = ['red' if a == 1 else 'blue' for a in y_pred] 92 | plt.scatter(x_test[:, 0], x_test[:, 1], s=100, c=color, alpha=0.3) 93 | plt.scatter(x[:, 0], x[:, 1], s=80, c='black') 94 | plt.scatter(x[:, 0], x[:, 1], s=10, c='yellow') 95 | plt.xlim(-0.5, 1.0) 96 | plt.ylim(-0.5, 1.0) 97 | plt.show() 98 | 99 | plot_train(x, y, w=np.array(np.ones(shape=(m,)) / m)) 100 | plot_train(x, y, w=weights[-1]) 101 | plot_boundary(x, y, x_test, y_pred) 102 | 103 | # Check the changes in α (alpha), ε (eps). 104 | # Check that ε are all less than 0.5 and that α and ε are inversely proportional. 105 | plt.plot(eps, marker='o', markersize=4, c='red', lw=1, label='epsilon') 106 | plt.plot(alphas, marker='o', markersize=4, c='blue', lw=1, label='alpha') 107 | plt.legend() 108 | plt.show() 109 | 110 | 111 | -------------------------------------------------------------------------------- /9.AdaBoost/2.AdaBoost(binary2).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-02] 2.AdaBoost(binary2).py 2 | # Using y = {0, 1} instead of y = {-1, +1} 3 | # 4 | # [1] Yoav Freund et, al., 1999, A Short Introduction to Boosting 5 | # [2] Ji Zhu, et, al., 2006, Multi-class AdaBoost 6 | # 7 | # This code was used in the machine learning online 8 | # course provided by 9 | # www.youtube.com/@meanxai 10 | # www.github.com/meanxai/machine_learning 11 | # 12 | # A detailed description of this code can be found in 13 | # https://youtu.be/LVStXzGpA7Y 14 | # 15 | import numpy as np 16 | import random as rd 17 | from sklearn.datasets import make_blobs 18 | from sklearn.tree import DecisionTreeClassifier 19 | from sklearn.model_selection import train_test_split 20 | import matplotlib.pyplot as plt 21 | 22 | # Create training data 23 | x, y = make_blobs(n_samples=200, n_features=2, 24 | centers=[[0., 0.], [0.5, 0.5]], 25 | cluster_std=0.2, center_box=(-1., 1.)) 26 | 27 | m = x.shape[0] 28 | R = np.arange(m) 29 | T = 50 30 | 31 | weights = [np.array(np.ones(shape=(m,)) / m)] 32 | eps = [] # epsilon history 33 | alphas = [] # alpha history 34 | models = [] # base learner models 35 | for t in range(T): 36 | s_idx = np.array(rd.choices(R, weights=weights[-1], k=m)) # weighted sample index 37 | sx = x[s_idx] # sample x 38 | sy = y[s_idx] # sample y 39 | 40 | model = DecisionTreeClassifier(max_depth=2) 41 | model.fit(sx, sy) # fit the model to sample data 42 | 43 | y_pred = model.predict(x) # predict entire training data 44 | i_not = np.array(y_pred != y).astype(int) # I(y_pred ≠ y) 45 | eps.append(np.sum(weights[-1] * i_not)) 46 | 47 | if eps[-1] > 0.5: 48 | weights.append(np.array(np.ones(shape=(m,)) / m)) 49 | alphas.append(0.0) 50 | print('weight re-initialized at t =', 
t) 51 | else: 52 | alpha = 0.5 * np.log((1 - eps[-1]) / (eps[-1] + 1e-8)) 53 | alphas.append(alpha) 54 | 55 | new_weights = weights[-1] * np.exp(alpha * i_not) 56 | weights.append(new_weights / new_weights.sum()) # normalize 57 | 58 | models.append(model) 59 | 60 | H = np.zeros(shape=(5, 2)) 61 | h = np.array([1,0,1,0,1]) 62 | 63 | x_test = np.random.uniform(-0.5, 1.5, (1000, 2)) 64 | H = np.zeros(shape=(x_test.shape[0], 2)) 65 | for t in range(T): 66 | h = models[t].predict(x_test) 67 | oh = np.eye(2)[h] # one-hot encoding 68 | H += alphas[t] * oh 69 | 70 | y_pred = np.argmax(H, axis=1) 71 | 72 | # visualize training data and the sampling weights 73 | def plot_train(x, y, w): 74 | plt.figure(figsize=(5,5)) 75 | color = ['red' if a == 1 else 'blue' for a in y] 76 | plt.scatter(x[:, 0], x[:, 1], s=w*10000, c=color, alpha=0.5) 77 | plt.xlim(-0.5, 1.0) 78 | plt.ylim(-0.5, 1.0) 79 | plt.show() 80 | 81 | # visualize decision boundary 82 | def plot_boundary(x, y, x_test, y_pred): 83 | plt.figure(figsize=(5,5)) 84 | color = ['red' if a == 1 else 'blue' for a in y_pred] 85 | plt.scatter(x_test[:, 0], x_test[:, 1], s=100, c=color, alpha=0.3) 86 | plt.scatter(x[:, 0], x[:, 1], s=80, c='black') 87 | plt.scatter(x[:, 0], x[:, 1], s=10, c='yellow') 88 | plt.xlim(-0.5, 1.0) 89 | plt.ylim(-0.5, 1.0) 90 | plt.show() 91 | 92 | plot_train(x, y, w=np.array(np.ones(shape=(m,)) / m)) 93 | plot_train(x, y, w=weights[-1]) 94 | plot_boundary(x, y, x_test, y_pred) 95 | 96 | # Check the changes in α (alpha), ε (eps). 97 | # Check that ε are all less than 0.5 and that α and ε are inversely proportional. 98 | plt.plot(eps, marker='o', markersize=4, c='red', lw=1, label='epsilon') 99 | plt.plot(alphas, marker='o', markersize=4, c='blue', lw=1, label='alpha') 100 | plt.legend() 101 | plt.show() 102 | 103 | 104 | -------------------------------------------------------------------------------- /9.AdaBoost/4.sklearn(AdaBoost).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-03] 4.sklearn(AdaBoost).py 2 | # Test sklearn's AdaBoostClassifier 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/tPeRalG7gYY 11 | # 12 | from sklearn.datasets import load_iris 13 | from sklearn.model_selection import train_test_split 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.ensemble import AdaBoostClassifier 16 | 17 | # Read iris dataset 18 | x, y = load_iris(return_X_y=True) 19 | 20 | # Create training and test data 21 | x_train, x_test, y_train, y_test = train_test_split(x, y) 22 | 23 | # Use the decision tree as the base weak learner. 24 | dt = DecisionTreeClassifier(max_depth = 1) 25 | 26 | # Generate a AdaBoost model with the SAMME algorithm. 
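# Note: in recent scikit-learn releases the 'SAMME.R' variant has been
# deprecated in favor of 'SAMME', and the 'algorithm' argument itself is
# being phased out, so the explicit algorithm='SAMME' below may emit a
# deprecation warning or become unnecessary depending on the installed version.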
27 | model = AdaBoostClassifier(estimator = dt, 28 | n_estimators = 100, 29 | algorithm = 'SAMME') # default = 'SAMME.R' 30 | 31 | # Fit the model to the training data 32 | model.fit(x_train, y_train) 33 | 34 | # Predict the class of test data and calculate the accuracy 35 | y_pred = model.predict(x_test) 36 | accuracy = (y_pred == y_test).mean() 37 | 38 | print('Accuracy = {:.4f}'.format(accuracy)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /9.AdaBoost/5.AdaBoost(regression).py: -------------------------------------------------------------------------------- 1 | # [MXML-9-04] 5.AdaBoost(regression).py 2 | # [1] Harris Drucker et, al., 1997, Improving Regressors using Boosting Techniques 3 | # 4 | # This code was used in the machine learning online 5 | # course provided by 6 | # www.youtube.com/@meanxai 7 | # www.github.com/meanxai/machine_learning 8 | # 9 | # A detailed description of this code can be found in 10 | # https://youtu.be/nPzW-AmPSLs 11 | # 12 | import numpy as np 13 | import random as rd 14 | from sklearn.tree import DecisionTreeRegressor 15 | import matplotlib.pyplot as plt 16 | 17 | # Create training data 18 | def noisy_sine_data(n, s): 19 | rtn_x, rtn_y = [], [] 20 | for i in range(n): 21 | x = np.random.random() 22 | y = 2.0 * np.sin(2.0 * np.pi * x) + np.random.normal(0.0, s) + 3.0 23 | rtn_x.append(x) 24 | rtn_y.append(y) 25 | return np.array(rtn_x).reshape(-1,1), np.array(rtn_y) 26 | x, y = noisy_sine_data(n=500, s=0.5) 27 | 28 | N = x.shape[0] 29 | R = np.arange(N) 30 | T = 100 31 | 32 | weights = np.array(np.ones(shape=(N,)) / N) 33 | beta = [] # beta history 34 | models = [] # save base learners for prediction 35 | for t in range(T): 36 | s_idx = np.array(rd.choices(R, weights=weights, k=N)) 37 | sx = x[s_idx] # sample x 38 | sy = y[s_idx] # sample y 39 | 40 | # base learner 41 | model = DecisionTreeRegressor(max_depth=5) 42 | model.fit(sx, sy) # Fit the model to sample data 43 | 44 | # Calculate square loss 45 | y_pred = model.predict(x) # predict entire training data 46 | err = np.abs(y - y_pred) 47 | loss = (err / err.max()) ** 2 # squared loss 48 | 49 | loss_avg = np.sum(weights * loss) # average loss 50 | if loss_avg > 0.5: 51 | print('stopped at t={}, loss_avg={:.2f}'.format(t, loss_avg)) 52 | break 53 | 54 | # Calculate beta using average loss. 55 | beta.append(loss_avg / (1. - loss_avg)) 56 | 57 | # Update weights using beta. 58 | new_weights = weights * np.power(beta[-1], (1. - loss)) 59 | weights = new_weights / new_weights.sum() 60 | 61 | # save model 62 | models.append(model) 63 | 64 | # Visualize training data and estimated curve 65 | def plot_prediction(x, y, x_test, y_pred, title=""): 66 | plt.figure(figsize=(5, 3.5)) 67 | plt.scatter(x, y, c='blue', s=20, alpha=0.5, label='train') 68 | plt.plot(x_test, y_pred, c='red', lw=2.0, label='prediction') 69 | plt.xlim(0, 1) 70 | plt.ylim(0, 7) 71 | plt.legend() 72 | plt.title(title) 73 | plt.show() 74 | 75 | # prediction. 76 | n_test = 50 77 | x_test = np.linspace(0, 1, n_test).reshape(-1, 1) # test data 78 | log_beta = np.log(1. 
/ np.array(beta)) # log(1/beta) 79 | y_pred = np.array([m.predict(x_test) for m in models]).T 80 | 81 | # Method-1: Using weighted average 82 | w = log_beta/ log_beta.sum() # normalize 83 | wavg_pred = np.sum(y_pred * w, axis=1) 84 | plot_prediction(x, y, x_test, wavg_pred, 'weighted average') 85 | 86 | # weighted median: (sum of the lower w ≥ half of the total sum of w) 87 | i_pred = np.argsort(y_pred, axis=1) 88 | w_acc = np.cumsum(w[i_pred], axis=1) # accumulated w 89 | is_med = w_acc >= 0.5 * w_acc[:, -1][:, np.newaxis] 90 | i_med = is_med.argmax(axis=1) # 23 91 | y_med = i_pred[np.arange(n_test), i_med] # 34 92 | wmed_pred = np.array(y_pred[np.arange(n_test), y_med]) # final estimate 93 | plot_prediction(x, y, x_test, wmed_pred, 'weighted median') 94 | 95 | # Let’s compare the results with sklearn’s AdaBoostRegressor 96 | from sklearn.ensemble import AdaBoostRegressor 97 | 98 | dt = DecisionTreeRegressor(max_depth=5) 99 | model = AdaBoostRegressor(estimator=dt, n_estimators=T, loss='square') 100 | model.fit(x, y) 101 | sk_pred = model.predict(x_test) 102 | plot_prediction(x, y, x_test, sk_pred, 'AdaBoostRegressor') --------------------------------------------------------------------------------