├── LICENSE
├── README.md
├── ch10
│   ├── ch10_part1.ipynb
│   ├── ch10_part1.py
│   ├── ch10_part2.ipynb
│   └── ch10_part2.py
├── ch11
│   ├── ch11_part1.ipynb
│   └── ch11_part1.py
├── ch12
│   ├── ch12_part1.ipynb
│   ├── ch12_part1.py
│   ├── ch12_part2.ipynb
│   ├── ch12_part2.py
│   ├── ch12_part3.ipynb
│   ├── ch12_part3.py
│   └── warpeace_input.txt
├── ch13
│   ├── ch13_part1.ipynb
│   ├── ch13_part1.py
│   ├── ch13_part2.ipynb
│   ├── ch13_part2.py
│   └── warpeace_input.txt
├── ch14
│   ├── ch14_part1.ipynb
│   ├── ch14_part1.py
│   ├── ch14_part2.ipynb
│   └── ch14_part2.py
├── ch15
│   ├── ch15_part1.ipynb
│   ├── ch15_part1.py
│   ├── ch15_part2.ipynb
│   └── ch15_part2.py
├── ch2
│   ├── ch2_part1.ipynb
│   ├── ch2_part1.py
│   ├── ch2_part2.ipynb
│   └── ch2_part2.py
├── ch3
│   ├── ch3_part1.ipynb
│   ├── ch3_part1.py
│   ├── ch3_part2.ipynb
│   └── ch3_part2.py
├── ch4
│   ├── ch4_part1.ipynb
│   ├── ch4_part1.py
│   ├── ch4_part2.ipynb
│   └── ch4_part2.py
├── ch5
│   ├── 19900101_20230630.csv
│   ├── 20051201_20051210.csv
│   ├── ch5_part1.ipynb
│   ├── ch5_part1.py
│   ├── ch5_part2.ipynb
│   └── ch5_part2.py
├── ch6
│   ├── ch6_part1.ipynb
│   ├── ch6_part1.py
│   ├── ch6_part2.ipynb
│   └── ch6_part2.py
├── ch7
│   ├── ch7_part1.ipynb
│   └── ch7_part1.py
├── ch8
│   ├── ch8_part1.ipynb
│   ├── ch8_part1.py
│   ├── ch8_part2.ipynb
│   └── ch8_part2.py
└── ch9
    ├── ch9_part1.ipynb
    ├── ch9_part1.py
    ├── ch9_part2.ipynb
    └── ch9_part2.py

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 packtjaniceg
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Machine-Learning-by-Example-Fourth-Edition 2 | Python Machine Learning by Example, Fourth Edition 3 | -------------------------------------------------------------------------------- /ch10/ch10_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 10 Machine Learning Best Practices 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Best practices in the data preparation stage 11 | 12 | # ## Best practice 4 – Dealing with missing data 13 | 14 | import numpy as np 15 | from sklearn.impute import SimpleImputer 16 | 17 | 18 | data_origin = [[30, 100], 19 | [20, 50], 20 | [35, np.nan], 21 | [25, 80], 22 | [30, 70], 23 | [40, 60]] 24 | 25 | 26 | imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') 27 | imp_mean.fit(data_origin) 28 | 29 | 30 | data_mean_imp = imp_mean.transform(data_origin) 31 | print(data_mean_imp) 32 | 33 | 34 | imp_median = SimpleImputer(missing_values=np.nan, strategy='median') 35 | imp_median.fit(data_origin) 36 | data_median_imp = imp_median.transform(data_origin) 37 | print(data_median_imp) 38 | 39 | 40 | # New samples 41 | new = [[20, np.nan], 42 | [30, np.nan], 43 | [np.nan, 70], 44 | [np.nan, np.nan]] 45 | new_mean_imp = imp_mean.transform(new) 46 | print(new_mean_imp) 47 | 48 | 49 | # Effects of discarding missing values and imputation 50 | from sklearn import datasets 51 | dataset = datasets.load_diabetes() 52 | X_full, y = dataset.data, dataset.target 53 | 54 | 55 | m, n = X_full.shape 56 | m_missing = int(m * 0.25) 57 | print(m, m_missing) 58 | 59 | 60 | np.random.seed(42) 61 | missing_samples = np.array([True] * m_missing + [False] * (m - m_missing)) 62 | np.random.shuffle(missing_samples) 63 | 64 | 65 | missing_features = np.random.randint(low=0, high=n, size=m_missing) 66 | 67 | 68 | X_missing = X_full.copy() 69 | X_missing[np.where(missing_samples)[0], missing_features] = np.nan 70 | 71 | 72 | # Discard samples containing missing values 73 | X_rm_missing = X_missing[~missing_samples, :] 74 | y_rm_missing = y[~missing_samples] 75 | 76 | 77 | # Estimate R^2 on the data set with missing samples removed 78 | from sklearn.ensemble import RandomForestRegressor 79 | from sklearn.model_selection import cross_val_score 80 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 81 | score_rm_missing = cross_val_score(regressor, X_rm_missing, y_rm_missing).mean() 82 | print(f'Score with the data set with missing samples removed: {score_rm_missing:.2f}') 83 | 84 | 85 | # Imputation with mean value 86 | imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') 87 | X_mean_imp = imp_mean.fit_transform(X_missing) 88 | 89 | 90 | # Estimate R^2 on the data set with missing samples removed 91 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 92 | score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean() 93 | print(f'Score with the data set with missing values replaced by mean: {score_mean_imp:.2f}') 94 | 95 | 96 | # Estimate R^2 on the full data set 97 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=500) 98 | score_full = cross_val_score(regressor, 
X_full, y).mean() 99 | print(f'Score with the full data set: {score_full:.2f}') 100 | 101 | 102 | # # Best practices in the training sets generation stage 103 | 104 | # ## Best practice 8 – Deciding whether to select features, and if so, how to do so 105 | 106 | from sklearn.datasets import load_digits 107 | dataset = load_digits() 108 | X, y = dataset.data, dataset.target 109 | print(X.shape) 110 | 111 | 112 | # Estimate accuracy on the original data set 113 | from sklearn.svm import SVC 114 | classifier = SVC(gamma=0.005, random_state=42) 115 | score = cross_val_score(classifier, X, y).mean() 116 | print(f'Score with the original data set: {score:.2f}') 117 | 118 | 119 | # Feature selection with random forest 120 | from sklearn.ensemble import RandomForestClassifier 121 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1, random_state=42) 122 | random_forest.fit(X, y) 123 | 124 | # Sort features based on their importancies 125 | feature_sorted = np.argsort(random_forest.feature_importances_) 126 | 127 | 128 | # Select different number of top features 129 | K = [10, 15, 25, 35, 45] 130 | for k in K: 131 | top_K_features = feature_sorted[-k:] 132 | X_k_selected = X[:, top_K_features] 133 | # Estimate accuracy on the data set with k selected features 134 | classifier = SVC(gamma=0.005) 135 | score_k_features = cross_val_score(classifier, X_k_selected, y).mean() 136 | print(f'Score with the dataset of top {k} features: {score_k_features:.2f}') 137 | 138 | 139 | # ## Best practice 9 – Deciding whether to reduce dimensionality, and if so, how to do so! 140 | 141 | from sklearn.decomposition import PCA 142 | 143 | # Keep different number of top components 144 | N = [10, 15, 25, 35, 45] 145 | for n in N: 146 | pca = PCA(n_components=n) 147 | X_n_kept = pca.fit_transform(X) 148 | # Estimate accuracy on the data set with top n components 149 | classifier = SVC(gamma=0.005) 150 | score_n_components = cross_val_score(classifier, X_n_kept, y).mean() 151 | print(f'Score with the dataset of top {n} components: {score_n_components:.2f}') 152 | 153 | 154 | # ## Best practice 12 – Performing feature engineering without domain expertise 155 | 156 | # ### Binarization and discretization 157 | 158 | from sklearn.preprocessing import Binarizer 159 | X = [[4], [1], [3], [0]] 160 | binarizer = Binarizer(threshold=2.9) 161 | X_new = binarizer.fit_transform(X) 162 | print(X_new) 163 | 164 | 165 | # ### Polynomial transformation 166 | 167 | from sklearn.preprocessing import PolynomialFeatures 168 | X = [[2, 4], 169 | [1, 3], 170 | [3, 2], 171 | [0, 3]] 172 | poly = PolynomialFeatures(degree=2) 173 | X_new = poly.fit_transform(X) 174 | print(X_new) 175 | 176 | 177 | # --- 178 | 179 | # Readers may ignore the next cell. 
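
# A minimal optional sketch: inspecting which terms the degree-2 polynomial
# transformation above actually generates. It assumes scikit-learn >= 1.0,
# where PolynomialFeatures exposes get_feature_names_out(); the *_demo names
# are illustrative only.

from sklearn.preprocessing import PolynomialFeatures

X_demo = [[2, 4],
          [1, 3]]
poly_demo = PolynomialFeatures(degree=2)
X_demo_new = poly_demo.fit_transform(X_demo)

# For two input features x0 and x1, degree=2 produces the columns:
# bias (1), x0, x1, x0^2, x0*x1, x1^2
print(poly_demo.get_feature_names_out())
print(X_demo_new)
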
180 | 181 | get_ipython().system('jupyter nbconvert --to python ch10_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 182 | 183 | -------------------------------------------------------------------------------- /ch10/ch10_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 10 Machine Learning Best Practices 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # ## Best practice 14 – Extracting features from text data 11 | 12 | # ### Word embedding 13 | 14 | from gensim.models import Word2Vec 15 | 16 | 17 | # Sample sentences for training 18 | sentences = [ 19 | ["i", "love", "machine", "learning", "by", "example"], 20 | ["machine", "learning", "and", "deep", "learning", "are", "fascinating"], 21 | ["word", "embedding", "is", "essential", "for", "many", "nlp", "tasks"], 22 | ["word2vec", "produces", "word", "embeddings"] 23 | ] 24 | 25 | # Create and train Word2Vec model 26 | model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=0) 27 | 28 | # Access word vectors 29 | vector = model.wv["machine"] 30 | print("Vector for 'machine':", vector) 31 | 32 | 33 | import torch 34 | import torch.nn as nn 35 | 36 | # Sample data 37 | input_data = torch.LongTensor([[1, 2, 3, 4], [5, 1, 6, 3]]) 38 | 39 | # Define the embedding layer 40 | vocab_size = 10 # Total number of unique words 41 | embedding_dim = 3 # Dimensionality of the embeddings 42 | embedding_layer = nn.Embedding(vocab_size, embedding_dim) 43 | 44 | # Pass input data through the embedding layer 45 | embedded_data = embedding_layer(input_data) 46 | 47 | # Print the embedded data 48 | print("Embedded Data:\n", embedded_data) 49 | 50 | 51 | # # Best practices in the deployment and monitoring stage 52 | 53 | # # Best practice 19 – Saving, loading, and reusing models 54 | 55 | # ### Saving and restoring models using pickle 56 | 57 | from sklearn import datasets 58 | dataset = datasets.load_diabetes() 59 | X, y = dataset.data, dataset.target 60 | 61 | num_new = 30 # the last 30 samples as new data set 62 | X_train = X[:-num_new, :] 63 | y_train = y[:-num_new] 64 | X_new = X[-num_new:, :] 65 | y_new = y[-num_new:] 66 | 67 | 68 | # Data pre-processing 69 | from sklearn.preprocessing import StandardScaler 70 | scaler = StandardScaler() 71 | scaler.fit(X_train) 72 | 73 | 74 | import pickle 75 | # Save the scaler 76 | pickle.dump(scaler, open("scaler.p", "wb" )) 77 | 78 | 79 | X_scaled_train = scaler.transform(X_train) 80 | 81 | 82 | # Regression model training 83 | from sklearn.svm import SVR 84 | regressor = SVR(C=20) 85 | regressor.fit(X_scaled_train, y_train) 86 | 87 | 88 | # Save the regressor 89 | pickle.dump(regressor, open("regressor.p", "wb")) 90 | 91 | 92 | # Deployment 93 | my_scaler = pickle.load(open("scaler.p", "rb" )) 94 | my_regressor = pickle.load(open("regressor.p", "rb")) 95 | 96 | 97 | X_scaled_new = my_scaler.transform(X_new) 98 | predictions = my_regressor.predict(X_scaled_new) 99 | 100 | 101 | # Monitor 102 | from sklearn.metrics import r2_score 103 | print(f'Health check on the model, R^2: {r2_score(y_new, predictions):.3f}') 104 | 105 | 106 | # ### Saving and restoring models in TensorFlow 107 | 108 | import tensorflow as tf 109 | from tensorflow import keras 110 | 111 | cancer_data = datasets.load_breast_cancer() 112 | X = cancer_data.data 113 | X = scaler.fit_transform(X) 114 
| y = cancer_data.target 115 | 116 | 117 | learning_rate = 0.005 118 | n_iter = 10 119 | 120 | tf.random.set_seed(42) 121 | 122 | model = keras.Sequential([ 123 | keras.layers.Dense(units=1, activation='sigmoid') 124 | ]) 125 | 126 | model.compile(loss='binary_crossentropy', 127 | optimizer=tf.keras.optimizers.Adam(learning_rate)) 128 | 129 | 130 | model.fit(X, y, epochs=n_iter) 131 | 132 | 133 | model.summary() 134 | 135 | 136 | path = './model_tf' 137 | model.save(path) 138 | 139 | 140 | new_model = tf.keras.models.load_model(path) 141 | 142 | new_model.summary() 143 | 144 | 145 | # ### Saving and restoring models in PyTorch 146 | 147 | X_torch = torch.FloatTensor(X) 148 | y_torch = torch.FloatTensor(y.reshape(y.shape[0], 1)) 149 | 150 | 151 | torch.manual_seed(42) 152 | 153 | model = nn.Sequential(nn.Linear(X.shape[1], 1), 154 | nn.Sigmoid()) 155 | 156 | loss_function = nn.BCELoss() 157 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 158 | 159 | 160 | def train_step(model, X_train, y_train, loss_function, optimizer): 161 | pred_train = model(X_train) 162 | loss = loss_function(pred_train, y_train) 163 | model.zero_grad() 164 | loss.backward() 165 | optimizer.step() 166 | return loss.item() 167 | 168 | 169 | for epoch in range(n_iter): 170 | loss = train_step(model, X_torch, y_torch, loss_function, optimizer) 171 | print(f"Epoch {epoch} - loss: {loss}") 172 | 173 | 174 | print(model) 175 | 176 | 177 | path = './model.pth' 178 | torch.save(model, path) 179 | 180 | 181 | new_model = torch.load(path) 182 | print(new_model) 183 | 184 | 185 | # --- 186 | 187 | # Readers may ignore the next cell. 188 | 189 | get_ipython().system('jupyter nbconvert --to python ch10_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 190 | 191 | -------------------------------------------------------------------------------- /ch11/ch11_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 11 Categorizing Images of Clothing with Convolutional Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Exploring the clothing image dataset 11 | 12 | import torch, torchvision 13 | from torchvision import transforms 14 | 15 | image_path = './' 16 | transform = transforms.Compose([transforms.ToTensor(), 17 | # transforms.Normalize((0.5,), (0.5,)) 18 | ]) 19 | 20 | train_dataset = torchvision.datasets.FashionMNIST(root=image_path, 21 | train=True, 22 | transform=transform, 23 | download=True) 24 | 25 | test_dataset = torchvision.datasets.FashionMNIST(root=image_path, 26 | train=False, 27 | transform=transform, 28 | download=False) 29 | 30 | 31 | print(train_dataset) 32 | 33 | 34 | print(test_dataset) 35 | 36 | 37 | from torch.utils.data import DataLoader 38 | 39 | batch_size = 64 40 | torch.manual_seed(42) 41 | train_dl = DataLoader(train_dataset, batch_size, shuffle=True) 42 | 43 | 44 | data_iter = iter(train_dl) 45 | images, labels = next(data_iter) 46 | 47 | 48 | print(labels) 49 | 50 | 51 | # constant for classes 52 | class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 53 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot'] 54 | 55 | 56 | print(images[0].shape) 57 | 58 | 59 | print(torch.max(images), torch.min(images)) 60 | 61 | 62 | import numpy as np 63 | import matplotlib.pyplot as plt 64 | 65 | npimg = images[1].numpy() 66 | 
plt.imshow(np.transpose(npimg, (1, 2, 0))) 67 | plt.colorbar() 68 | plt.title(class_names[labels[1]]) 69 | plt.show() 70 | 71 | 72 | plt.figure(figsize=(10, 10)) 73 | 74 | for i in range(16): 75 | plt.subplot(4, 4, i + 1) 76 | plt.subplots_adjust(hspace=.3) 77 | plt.xticks([]) 78 | plt.yticks([]) 79 | npimg = images[i].numpy() 80 | plt.imshow(np.transpose(npimg, (1, 2, 0)), cmap="Greys") 81 | plt.title(class_names[labels[i]]) 82 | plt.show() 83 | 84 | 85 | # # Classifying clothing images with CNNs 86 | 87 | # ## Architecting the CNN model 88 | 89 | import torch.nn as nn 90 | model = nn.Sequential() 91 | 92 | 93 | model.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)) 94 | model.add_module('relu1', nn.ReLU()) 95 | 96 | 97 | model.add_module('pool1', nn.MaxPool2d(kernel_size=2)) 98 | 99 | 100 | model.add_module('conv2', nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)) 101 | model.add_module('relu2', nn.ReLU()) 102 | 103 | 104 | model.add_module('pool2', nn.MaxPool2d(kernel_size=2)) 105 | 106 | 107 | model.add_module('conv3', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)) 108 | model.add_module('relu3', nn.ReLU()) 109 | 110 | 111 | x = torch.rand((64, 1, 28, 28)) 112 | print(model(x).shape) 113 | 114 | 115 | model.add_module('flatten', nn.Flatten()) 116 | 117 | 118 | print(model(x).shape) 119 | 120 | 121 | model.add_module('fc1', nn.Linear(1152, 64)) 122 | model.add_module('relu4', nn.ReLU()) 123 | 124 | 125 | model.add_module('fc2', nn.Linear(64, 10)) 126 | model.add_module('output', nn.Softmax(dim = 1)) 127 | 128 | 129 | print(model) 130 | 131 | 132 | from torchsummary import summary 133 | 134 | 135 | summary(model, input_size=(1, 28, 28), batch_size=-1, device="cpu") 136 | 137 | 138 | # ## Fitting the CNN model 139 | 140 | device = torch.device("cuda:0") 141 | # device = torch.device("cpu") 142 | model = model.to(device) 143 | 144 | loss_fn = nn.CrossEntropyLoss() 145 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 146 | 147 | 148 | def train(model, optimizer, num_epochs, train_dl): 149 | for epoch in range(num_epochs): 150 | loss_train = 0 151 | accuracy_train = 0 152 | for x_batch, y_batch in train_dl: 153 | x_batch = x_batch.to(device) 154 | y_batch = y_batch.to(device) 155 | pred = model(x_batch) 156 | loss = loss_fn(pred, y_batch) 157 | loss.backward() 158 | optimizer.step() 159 | optimizer.zero_grad() 160 | loss_train += loss.item() * y_batch.size(0) 161 | is_correct = (torch.argmax(pred, dim=1) == y_batch).float() 162 | accuracy_train += is_correct.sum().cpu() 163 | 164 | loss_train /= len(train_dl.dataset) 165 | accuracy_train /= len(train_dl.dataset) 166 | 167 | print(f'Epoch {epoch+1} - loss: {loss_train:.4f} - accuracy: {accuracy_train:.4f}') 168 | 169 | 170 | num_epochs = 30 171 | train(model, optimizer, num_epochs, train_dl) 172 | 173 | 174 | test_dl = DataLoader(test_dataset, batch_size, shuffle=False) 175 | 176 | def evaluate_model(model, test_dl): 177 | accuracy_test = 0 178 | with torch.no_grad(): 179 | for x_batch, y_batch in test_dl: 180 | pred = model.cpu()(x_batch) 181 | is_correct = torch.argmax(pred, dim=1) == y_batch 182 | accuracy_test += is_correct.float().sum().item() 183 | 184 | print(f'Accuracy on test set: {100 * accuracy_test / 10000} %') 185 | 186 | evaluate_model(model, test_dl) 187 | 188 | 189 | # ## Visualizing the convolutional filters 190 | 191 | conv3_weight = model.conv3.weight.data 192 | print(conv3_weight.shape) 193 | 194 | 195 | plt.figure(figsize=(10, 10)) 196 | 197 | n_filters = 16 
198 | for i in range(n_filters): 199 | weight = conv3_weight[i].cpu().numpy() 200 | plt.subplot(4, 4, i+1) 201 | plt.xticks([]) 202 | plt.yticks([]) 203 | plt.imshow(weight[0], cmap='gray') 204 | 205 | plt.show() 206 | 207 | 208 | # # Boosting the CNN classifier with data augmentation 209 | 210 | # ## Flipping for data augmentation 211 | 212 | def display_image_greys(image): 213 | npimg = image.numpy() 214 | plt.imshow(np.transpose(npimg, (1, 2, 0)), cmap="Greys") 215 | plt.xticks([]) 216 | plt.yticks([]) 217 | 218 | 219 | image = images[1] 220 | plt.figure(figsize=(8, 8)) 221 | plt.subplot(1, 2, 1) 222 | display_image_greys(image) 223 | 224 | ## flipping (horizontally) 225 | img_flipped = transforms.functional.hflip(image) 226 | plt.subplot(1, 2, 2) 227 | display_image_greys(img_flipped) 228 | 229 | plt.show() 230 | 231 | 232 | 233 | torch.manual_seed(42) 234 | flip_transform = transforms.Compose([transforms.RandomHorizontalFlip()]) 235 | 236 | plt.figure(figsize=(10, 10)) 237 | plt.subplot(1, 4, 1) 238 | display_image_greys(image) 239 | 240 | for i in range(3): 241 | plt.subplot(1, 4, i+2) 242 | img_flip = flip_transform(image) 243 | display_image_greys(img_flip) 244 | 245 | 246 | # ## Rotation for data augmentation 247 | 248 | # rotate 249 | 250 | torch.manual_seed(42) 251 | rotate_transform = transforms.Compose([transforms.RandomRotation(20)]) 252 | 253 | plt.figure(figsize=(10, 10)) 254 | plt.subplot(1, 4, 1) 255 | display_image_greys(image) 256 | 257 | for i in range(3): 258 | plt.subplot(1, 4, i+2) 259 | img_rotate = rotate_transform(image) 260 | display_image_greys(img_rotate) 261 | 262 | 263 | # ## Cropping for data augmentation 264 | 265 | torch.manual_seed(42) 266 | crop_transform = transforms.Compose([ 267 | transforms.RandomResizedCrop(size=(28, 28), scale=(0.7, 1))]) 268 | 269 | plt.figure(figsize=(10, 10)) 270 | plt.subplot(1, 4, 1) 271 | display_image_greys(image) 272 | 273 | for i in range(3): 274 | plt.subplot(1, 4, i+2) 275 | img_crop = crop_transform(image) 276 | display_image_greys(img_crop) 277 | 278 | 279 | 280 | # # Improving the clothing image classifier with data augmentation 281 | 282 | torch.manual_seed(42) 283 | transform_train = transforms.Compose([ 284 | transforms.RandomHorizontalFlip(), 285 | transforms.RandomRotation(10), 286 | transforms.RandomResizedCrop(size=(28, 28), scale=(0.9, 1)), 287 | transforms.ToTensor(), 288 | ]) 289 | 290 | 291 | train_dataset_aug = torchvision.datasets.FashionMNIST(root=image_path, 292 | train=True, 293 | transform=transform_train, 294 | download=False) 295 | 296 | 297 | from torch.utils.data import Subset 298 | train_dataset_aug_small = Subset(train_dataset_aug, torch.arange(500)) 299 | 300 | 301 | train_dl_aug_small = DataLoader(train_dataset_aug_small, batch_size, shuffle=True) 302 | 303 | 304 | model = nn.Sequential() 305 | model.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)) 306 | model.add_module('relu1', nn.ReLU()) 307 | model.add_module('pool1', nn.MaxPool2d(kernel_size=2)) 308 | 309 | model.add_module('conv2', nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)) 310 | model.add_module('relu2', nn.ReLU()) 311 | model.add_module('pool2', nn.MaxPool2d(kernel_size=2)) 312 | 313 | model.add_module('conv3', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)) 314 | model.add_module('relu3', nn.ReLU()) 315 | 316 | model.add_module('flatten', nn.Flatten()) 317 | model.add_module('fc1', nn.Linear(1152, 64)) 318 | model.add_module('relu4', nn.ReLU()) 319 | 320 | 
model.add_module('fc2', nn.Linear(64, 10)) 321 | model.add_module('output', nn.Softmax(dim = 1)) 322 | 323 | model = model.to(device) 324 | 325 | 326 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 327 | train(model, optimizer, 1000, train_dl_aug_small) 328 | 329 | 330 | evaluate_model(model, test_dl) 331 | 332 | 333 | # # Advancing the CNN classifier with transfer learning 334 | 335 | from torchvision.models import resnet18 336 | my_resnet = resnet18(weights='IMAGENET1K_V1') 337 | 338 | 339 | my_resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) 340 | 341 | num_ftrs = my_resnet.fc.in_features 342 | my_resnet.fc = nn.Linear(num_ftrs, 10) 343 | 344 | 345 | my_resnet = my_resnet.to(device) 346 | optimizer = torch.optim.Adam(my_resnet.parameters(), lr=0.001) 347 | train(my_resnet, optimizer, 10, train_dl) 348 | 349 | 350 | evaluate_model(my_resnet, test_dl) 351 | 352 | 353 | # --- 354 | 355 | # Readers may ignore the next cell. 356 | 357 | get_ipython().system('jupyter nbconvert --to python ch11_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 358 | 359 | -------------------------------------------------------------------------------- /ch12/ch12_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Analyzing movie review sentiment with RNNs 11 | 12 | # ## Analyzing and preprocessing the data 13 | 14 | from torchtext.datasets import IMDB 15 | 16 | train_dataset = list(IMDB(split='train')) 17 | test_dataset = list(IMDB(split='test')) 18 | 19 | print(len(train_dataset), len(test_dataset)) 20 | 21 | 22 | # !conda install -c pytorch torchtext -y 23 | 24 | 25 | # !conda install -c conda-forge portalocker -y 26 | 27 | 28 | import re 29 | from collections import Counter, OrderedDict 30 | 31 | def tokenizer(text): 32 | text = re.sub('<[^>]*>', '', text) 33 | emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()) 34 | text = re.sub('[\W]+', ' ', text.lower()) +\ 35 | ' '.join(emoticons).replace('-', '') 36 | tokenized = text.split() 37 | return tokenized 38 | 39 | token_counts = Counter() 40 | train_labels = [] 41 | for label, line in train_dataset: 42 | train_labels.append(label) 43 | tokens = tokenizer(line) 44 | token_counts.update(tokens) 45 | 46 | 47 | print('Vocab-size:', len(token_counts)) 48 | print(Counter(train_labels)) 49 | 50 | 51 | from torchtext.vocab import vocab 52 | 53 | sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True) 54 | ordered_dict = OrderedDict(sorted_by_freq_tuples) 55 | 56 | vocab_mapping = vocab(ordered_dict) 57 | 58 | vocab_mapping.insert_token("", 0) 59 | vocab_mapping.insert_token("", 1) 60 | vocab_mapping.set_default_index(1) 61 | 62 | 63 | print([vocab_mapping[token] for token in ['this', 'is', 'an', 'example']]) 64 | print([vocab_mapping[token] for token in ['this', 'is', 'example2']]) 65 | 66 | 67 | import torch 68 | import torch.nn as nn 69 | 70 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 71 | 72 | text_transform = lambda x: [vocab[token] for token in tokenizer(x)] 73 | 74 | def collate_batch(batch): 75 | label_list, text_list, lengths = [], [], [] 76 | for _label, _text in batch: 77 | 
label_list.append(1. if _label == 2 else 0.) 78 | processed_text = [vocab_mapping[token] for token in tokenizer(_text)] 79 | text_list.append(torch.tensor(processed_text, dtype=torch.int64)) 80 | lengths.append(len(processed_text)) 81 | label_list = torch.tensor(label_list) 82 | lengths = torch.tensor(lengths) 83 | padded_text_list = nn.utils.rnn.pad_sequence( 84 | text_list, batch_first=True) 85 | return padded_text_list.to(device), label_list.to(device), lengths.to(device) 86 | 87 | 88 | # from torch.nn.utils.rnn import pad_sequence 89 | # a = [torch.tensor([11, 7, 35, 462], dtype=torch.int64), torch.tensor([11, 7, 35, 462, 11], dtype=torch.int64)] 90 | # b = [torch.tensor([11, 7, 35], dtype=torch.int64), torch.tensor([11, 7, 35, 462, 11, 12], dtype=torch.int64)] 91 | # # c = torch.ones(1, 15, 300) 92 | # pad_sequence(a, True).size() 93 | 94 | 95 | from torch.utils.data import DataLoader 96 | torch.manual_seed(0) 97 | dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_batch) 98 | text_batch, label_batch, length_batch = next(iter(dataloader)) 99 | print(text_batch) 100 | print(label_batch) 101 | print(length_batch) 102 | print(text_batch.shape) 103 | 104 | 105 | batch_size = 32 106 | 107 | train_dl = DataLoader(train_dataset, batch_size=batch_size, 108 | shuffle=True, collate_fn=collate_batch) 109 | 110 | test_dl = DataLoader(test_dataset, batch_size=batch_size, 111 | shuffle=False, collate_fn=collate_batch) 112 | 113 | 114 | # ## Building a simple LSTM network 115 | 116 | vocab_size = len(vocab_mapping) 117 | embed_dim = 32 118 | rnn_hidden_dim = 50 119 | fc_hidden_dim = 32 120 | 121 | 122 | class RNN(nn.Module): 123 | def __init__(self, vocab_size, embed_dim, rnn_hidden_dim, fc_hidden_dim): 124 | super().__init__() 125 | self.embedding = nn.Embedding(vocab_size, 126 | embed_dim, 127 | padding_idx=0) 128 | self.rnn = nn.LSTM(embed_dim, rnn_hidden_dim, 129 | batch_first=True) 130 | self.fc1 = nn.Linear(rnn_hidden_dim, fc_hidden_dim) 131 | self.relu = nn.ReLU() 132 | self.fc2 = nn.Linear(fc_hidden_dim, 1) 133 | self.sigmoid = nn.Sigmoid() 134 | 135 | def forward(self, text, lengths): 136 | out = self.embedding(text) 137 | out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True) 138 | out, (hidden, cell) = self.rnn(out) 139 | out = hidden[-1, :, :] 140 | out = self.fc1(out) 141 | out = self.relu(out) 142 | out = self.fc2(out) 143 | out = self.sigmoid(out) 144 | return out 145 | 146 | 147 | 148 | model = RNN(vocab_size, embed_dim, rnn_hidden_dim, fc_hidden_dim) 149 | model = model.to(device) 150 | 151 | 152 | loss_fn = nn.BCELoss() 153 | optimizer = torch.optim.Adam(model.parameters(), lr=0.003) 154 | 155 | 156 | def train(model, dataloader, optimizer): 157 | model.train() 158 | total_acc, total_loss = 0, 0 159 | for text_batch, label_batch, length_batch in dataloader: 160 | optimizer.zero_grad() 161 | pred = model(text_batch, length_batch)[:, 0] 162 | loss = loss_fn(pred, label_batch) 163 | loss.backward() 164 | optimizer.step() 165 | total_acc += ((pred>=0.5).float() == label_batch).float().sum().item() 166 | total_loss += loss.item()*label_batch.size(0) 167 | 168 | total_loss /= len(dataloader.dataset) 169 | total_acc /= len(train_dl.dataset) 170 | print(f'Epoch {epoch+1} - loss: {total_loss:.4f} - accuracy: {total_acc:.4f}') 171 | 172 | 173 | 174 | torch.manual_seed(0) 175 | num_epochs = 10 176 | for epoch in range(num_epochs): 177 | train(model, train_dl, optimizer) 178 | 179 | 180 | def 
evaluate(model, dataloader): 181 | model.eval() 182 | total_acc = 0 183 | with torch.no_grad(): 184 | for text_batch, label_batch, lengths in dataloader: 185 | pred = model(text_batch, lengths)[:, 0] 186 | total_acc += ((pred>=0.5).float() == label_batch).float().sum().item() 187 | print(f'Accuracy on test set: {100 * total_acc/len(dataloader.dataset)} %') 188 | 189 | evaluate(model, test_dl) 190 | 191 | 192 | # ## Stacking multiple LSTM layers 193 | 194 | nn.LSTM(embed_dim, rnn_hidden_dim, num_layers=2, batch_first=True) 195 | 196 | 197 | # --- 198 | 199 | # Readers may ignore the next cell. 200 | 201 | get_ipython().system('jupyter nbconvert --to python ch12_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 202 | 203 | -------------------------------------------------------------------------------- /ch12/ch12_part2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d1d4890d-a136-4be7-88ad-cca747947f4e", 6 | "metadata": {}, 7 | "source": [ 8 | "Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)\n", 9 | "\n", 10 | "Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks\n", 11 | "\n", 12 | "Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "851bb88a-6660-4390-9282-bf43e461ac88", 18 | "metadata": {}, 19 | "source": [ 20 | "# Revisiting stock price forecasting with LSTM" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "22828a4c-8789-4f71-9854-987a0629687e", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import torch\n", 32 | "import torch.nn as nn" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "8cfc60ba-9224-4a2c-9bcf-bcee140bae3e", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Reusing the feature generation function we developed\n", 43 | "def generate_features(df):\n", 44 | " \"\"\"\n", 45 | " Generate features for a stock/index based on historical price and performance\n", 46 | " @param df: dataframe with columns \"Open\", \"Close\", \"High\", \"Low\", \"Volume\", \"Adj Close\"\n", 47 | " @return: dataframe, data set with new features\n", 48 | " \"\"\"\n", 49 | " df_new = pd.DataFrame()\n", 50 | " # 6 original features\n", 51 | " df_new['open'] = df['Open']\n", 52 | " df_new['open_1'] = df['Open'].shift(1)\n", 53 | " df_new['close_1'] = df['Close'].shift(1)\n", 54 | " df_new['high_1'] = df['High'].shift(1)\n", 55 | " df_new['low_1'] = df['Low'].shift(1)\n", 56 | " df_new['volume_1'] = df['Volume'].shift(1)\n", 57 | " # # 31 generated features\n", 58 | " # # average price\n", 59 | " df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1)\n", 60 | " df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1)\n", 61 | " df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1)\n", 62 | " df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30']\n", 63 | " df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365']\n", 64 | " df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365']\n", 65 | " # # average volume\n", 66 | " df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1)\n", 67 | " df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1)\n", 68 | " df_new['avg_volume_365'] = 
df['Volume'].rolling(252).mean().shift(1)\n", 69 | " df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30']\n", 70 | " df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365']\n", 71 | " df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365']\n", 72 | " # # standard deviation of prices\n", 73 | " df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1)\n", 74 | " df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1)\n", 75 | " df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1)\n", 76 | " df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30']\n", 77 | " df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365']\n", 78 | " df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365']\n", 79 | " # # standard deviation of volumes\n", 80 | " df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1)\n", 81 | " df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1)\n", 82 | " df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1)\n", 83 | " df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30']\n", 84 | " df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365']\n", 85 | " df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365']\n", 86 | " # # # return\n", 87 | " df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1)\n", 88 | " df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1)\n", 89 | " df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1)\n", 90 | " df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1)\n", 91 | " df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1)\n", 92 | " df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1)\n", 93 | " df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1)\n", 94 | " # the target\n", 95 | " df_new['close'] = df['Close']\n", 96 | " df_new = df_new.dropna(axis=0)\n", 97 | " return df_new" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 3, 103 | "id": "ec9b69f9-82ff-4ac6-a193-29aa84cd4797", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date')\n", 108 | "data = generate_features(data_raw)\n", 109 | "\n", 110 | "start_train = '1990-01-01'\n", 111 | "end_train = '2022-12-31'\n", 112 | "\n", 113 | "start_test = '2023-01-01'\n", 114 | "end_test = '2023-06-30'\n", 115 | "\n", 116 | "data_train = data.loc[start_train:end_train]\n", 117 | "X_train = data_train.drop('close', axis=1).values\n", 118 | "y_train = data_train['close'].values\n", 119 | "\n", 120 | "data_test = data.loc[start_test:end_test]\n", 121 | "X_test = data_test.drop('close', axis=1).values\n", 122 | "y_test = data_test['close'].values" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "id": "433d14fa-5f4c-4036-8314-541d65cce4a6", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from sklearn.preprocessing import StandardScaler\n", 133 | "scaler = StandardScaler()\n", 134 | "\n", 135 | "X_scaled_train = torch.FloatTensor(scaler.fit_transform(X_train))\n", 136 | "X_scaled_test = torch.FloatTensor(scaler.transform(X_test))\n", 137 
| "\n", 138 | "y_train_torch = torch.FloatTensor(y_train)\n", 139 | "y_test_torch = torch.FloatTensor(y_test)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "id": "20dce8e1-2445-44bb-85e0-dbebf16fcbf3", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# Define a function to create sequences\n", 150 | "def create_sequences(data, labels, seq_length):\n", 151 | " sequences = []\n", 152 | " for i in range(len(data) - seq_length):\n", 153 | " seq = data[i:i+seq_length]\n", 154 | " label = labels[i+seq_length-1]\n", 155 | " sequences.append((seq, label))\n", 156 | " return sequences\n", 157 | "\n", 158 | " \n", 159 | "# Create sequences with a sequence length of 5\n", 160 | "seq_length = 5\n", 161 | "sequence_train = create_sequences(X_scaled_train, y_train_torch, seq_length)\n", 162 | "sequence_test = create_sequences(X_scaled_test, y_test_torch, seq_length)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "id": "619613a7-c1c0-4791-b3de-62ef15fb9089", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from torch.utils.data import DataLoader\n", 173 | "torch.manual_seed(0)\n", 174 | "\n", 175 | "batch_size = 128 \n", 176 | "train_dl = DataLoader(sequence_train, batch_size=batch_size,\n", 177 | " shuffle=True)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 7, 183 | "id": "924de558-f9b4-44f3-b3cf-d3248c546954", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "class RNN(nn.Module):\n", 188 | " def __init__(self, input_dim, rnn_hidden_dim, fc_hidden_dim):\n", 189 | " super().__init__()\n", 190 | " self.rnn = nn.LSTM(input_dim, rnn_hidden_dim, 2,\n", 191 | " batch_first=True)\n", 192 | " self.fc1 = nn.Linear(rnn_hidden_dim, fc_hidden_dim)\n", 193 | " self.relu = nn.ReLU()\n", 194 | " self.fc2 = nn.Linear(fc_hidden_dim, 1)\n", 195 | "\n", 196 | " def forward(self, x):\n", 197 | " out, (hidden, cell) = self.rnn(x)\n", 198 | " out = hidden[-1, :, :]\n", 199 | " out = self.fc1(out)\n", 200 | " out = self.relu(out)\n", 201 | " out = self.fc2(out)\n", 202 | " return out\n", 203 | "\n", 204 | "\n", 205 | "torch.manual_seed(42)\n", 206 | "rnn_hidden_dim = 16\n", 207 | "fc_hidden_dim = 16\n", 208 | "model = RNN(X_train.shape[1], rnn_hidden_dim, fc_hidden_dim) \n", 209 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 210 | "model = model.to(device)\n", 211 | "\n", 212 | "\n", 213 | "loss_fn = nn.MSELoss()\n", 214 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.003)\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "id": "ae0b5bf5-53cc-4c0f-8989-a615d3031571", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "def train(model, dataloader, optimizer):\n", 225 | " model.train()\n", 226 | " total_loss = 0\n", 227 | " for seq, label in dataloader:\n", 228 | " optimizer.zero_grad()\n", 229 | " pred = model(seq.to(device))[:, 0]\n", 230 | " loss = loss_fn(pred, label.to(device))\n", 231 | " loss.backward()\n", 232 | " optimizer.step()\n", 233 | " total_loss += loss.item()*label.size(0)\n", 234 | " return total_loss/len(dataloader.dataset)\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "id": "a55cecc0-7081-4615-a783-08ba9d5d09de", 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Epoch 1 - loss: 24797427.9047\n", 248 | "Epoch 101 - loss: 
10503.0117\n", 249 | "Epoch 201 - loss: 3234.3346\n", 250 | "Epoch 301 - loss: 2735.4141\n", 251 | "Epoch 401 - loss: 2297.7157\n", 252 | "Epoch 501 - loss: 2108.5702\n", 253 | "Epoch 601 - loss: 1741.5264\n", 254 | "Epoch 701 - loss: 2798.3159\n", 255 | "Epoch 801 - loss: 1635.2345\n", 256 | "Epoch 901 - loss: 1459.4806\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "\n", 262 | "num_epochs = 1000 \n", 263 | "for epoch in range(num_epochs):\n", 264 | " loss = train(model, train_dl, optimizer)\n", 265 | " if epoch % 100 == 0:\n", 266 | " print(f'Epoch {epoch+1} - loss: {loss:.4f}')" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 10, 272 | "id": "c25679b2-1f3a-45f8-a945-ac02926a922e", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "predictions, y = [], []\n", 277 | " \n", 278 | "for seq, label in sequence_test:\n", 279 | " with torch.no_grad():\n", 280 | " pred = model.cpu()(seq.view(1, seq_length, X_test.shape[1]))[:, 0]\n", 281 | " predictions.append(pred)\n", 282 | " y.append(label)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 11, 288 | "id": "cea9280b-5675-497b-9da0-9d3efb0e13e9", 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "R^2: 0.897\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "from sklearn.metrics import r2_score\n", 301 | "print(f'R^2: {r2_score(y, predictions):.3f}')\n" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "id": "a221aaed-fed8-415d-b291-34ff46884e0b", 307 | "metadata": {}, 308 | "source": [ 309 | "---" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "id": "f86bbbd2-7c6c-419e-898f-8f683f2600d0", 315 | "metadata": {}, 316 | "source": [ 317 | "Readers may ignore the next cell." 
318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 12, 323 | "id": "ad91961b-758c-4cdd-8f90-8126bf40ed5c", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stderr", 328 | "output_type": "stream", 329 | "text": [ 330 | "[NbConvertApp] Converting notebook ch12_part2.ipynb to python\n", 331 | "[NbConvertApp] Writing 7105 bytes to ch12_part2.py\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "!jupyter nbconvert --to python ch12_part2.ipynb --TemplateExporter.exclude_input_prompt=True" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "609dece8-49c6-489d-8085-29d839097116", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [] 346 | } 347 | ], 348 | "metadata": { 349 | "kernelspec": { 350 | "display_name": "Python 3 (ipykernel)", 351 | "language": "python", 352 | "name": "python3" 353 | }, 354 | "language_info": { 355 | "codemirror_mode": { 356 | "name": "ipython", 357 | "version": 3 358 | }, 359 | "file_extension": ".py", 360 | "mimetype": "text/x-python", 361 | "name": "python", 362 | "nbconvert_exporter": "python", 363 | "pygments_lexer": "ipython3", 364 | "version": "3.11.4" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 5 369 | } 370 | -------------------------------------------------------------------------------- /ch12/ch12_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Revisiting stock price forecasting with LSTM 11 | 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | 16 | 17 | # Reusing the feature generation function we developed 18 | def generate_features(df): 19 | """ 20 | Generate features for a stock/index based on historical price and performance 21 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adj Close" 22 | @return: dataframe, data set with new features 23 | """ 24 | df_new = pd.DataFrame() 25 | # 6 original features 26 | df_new['open'] = df['Open'] 27 | df_new['open_1'] = df['Open'].shift(1) 28 | df_new['close_1'] = df['Close'].shift(1) 29 | df_new['high_1'] = df['High'].shift(1) 30 | df_new['low_1'] = df['Low'].shift(1) 31 | df_new['volume_1'] = df['Volume'].shift(1) 32 | # # 31 generated features 33 | # # average price 34 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 35 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 36 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 37 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 38 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 39 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 40 | # # average volume 41 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 42 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 43 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 44 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 45 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 46 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / 
df_new['avg_volume_365'] 47 | # # standard deviation of prices 48 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 49 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 50 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 51 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 52 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 53 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 54 | # # standard deviation of volumes 55 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 56 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 57 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 58 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 59 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 60 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 61 | # # # return 62 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 63 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 64 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 65 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 66 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 67 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 68 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 69 | # the target 70 | df_new['close'] = df['Close'] 71 | df_new = df_new.dropna(axis=0) 72 | return df_new 73 | 74 | 75 | data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date') 76 | data = generate_features(data_raw) 77 | 78 | start_train = '1990-01-01' 79 | end_train = '2022-12-31' 80 | 81 | start_test = '2023-01-01' 82 | end_test = '2023-06-30' 83 | 84 | data_train = data.loc[start_train:end_train] 85 | X_train = data_train.drop('close', axis=1).values 86 | y_train = data_train['close'].values 87 | 88 | data_test = data.loc[start_test:end_test] 89 | X_test = data_test.drop('close', axis=1).values 90 | y_test = data_test['close'].values 91 | 92 | 93 | from sklearn.preprocessing import StandardScaler 94 | scaler = StandardScaler() 95 | 96 | X_scaled_train = torch.FloatTensor(scaler.fit_transform(X_train)) 97 | X_scaled_test = torch.FloatTensor(scaler.transform(X_test)) 98 | 99 | y_train_torch = torch.FloatTensor(y_train) 100 | y_test_torch = torch.FloatTensor(y_test) 101 | 102 | 103 | # Define a function to create sequences 104 | def create_sequences(data, labels, seq_length): 105 | sequences = [] 106 | for i in range(len(data) - seq_length): 107 | seq = data[i:i+seq_length] 108 | label = labels[i+seq_length-1] 109 | sequences.append((seq, label)) 110 | return sequences 111 | 112 | 113 | # Create sequences with a sequence length of 5 114 | seq_length = 5 115 | sequence_train = create_sequences(X_scaled_train, y_train_torch, seq_length) 116 | sequence_test = create_sequences(X_scaled_test, y_test_torch, seq_length) 117 | 118 | 119 | from torch.utils.data import DataLoader 120 | torch.manual_seed(0) 121 | 122 | batch_size = 128 123 | train_dl = DataLoader(sequence_train, batch_size=batch_size, 124 | shuffle=True) 125 | 126 | 127 | class RNN(nn.Module): 128 | def __init__(self, input_dim, rnn_hidden_dim, 
fc_hidden_dim): 129 | super().__init__() 130 | self.rnn = nn.LSTM(input_dim, rnn_hidden_dim, 2, 131 | batch_first=True) 132 | self.fc1 = nn.Linear(rnn_hidden_dim, fc_hidden_dim) 133 | self.relu = nn.ReLU() 134 | self.fc2 = nn.Linear(fc_hidden_dim, 1) 135 | 136 | def forward(self, x): 137 | out, (hidden, cell) = self.rnn(x) 138 | out = hidden[-1, :, :] 139 | out = self.fc1(out) 140 | out = self.relu(out) 141 | out = self.fc2(out) 142 | return out 143 | 144 | 145 | torch.manual_seed(42) 146 | rnn_hidden_dim = 16 147 | fc_hidden_dim = 16 148 | model = RNN(X_train.shape[1], rnn_hidden_dim, fc_hidden_dim) 149 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 150 | model = model.to(device) 151 | 152 | 153 | loss_fn = nn.MSELoss() 154 | optimizer = torch.optim.Adam(model.parameters(), lr=0.003) 155 | 156 | 157 | def train(model, dataloader, optimizer): 158 | model.train() 159 | total_loss = 0 160 | for seq, label in dataloader: 161 | optimizer.zero_grad() 162 | pred = model(seq.to(device))[:, 0] 163 | loss = loss_fn(pred, label.to(device)) 164 | loss.backward() 165 | optimizer.step() 166 | total_loss += loss.item()*label.size(0) 167 | return total_loss/len(dataloader.dataset) 168 | 169 | 170 | num_epochs = 1000 171 | for epoch in range(num_epochs): 172 | loss = train(model, train_dl, optimizer) 173 | if epoch % 100 == 0: 174 | print(f'Epoch {epoch+1} - loss: {loss:.4f}') 175 | 176 | 177 | predictions, y = [], [] 178 | 179 | for seq, label in sequence_test: 180 | with torch.no_grad(): 181 | pred = model.cpu()(seq.view(1, seq_length, X_test.shape[1]))[:, 0] 182 | predictions.append(pred) 183 | y.append(label) 184 | 185 | 186 | from sklearn.metrics import r2_score 187 | print(f'R^2: {r2_score(y, predictions):.3f}') 188 | 189 | 190 | # --- 191 | 192 | # Readers may ignore the next cell. 
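
# A minimal optional sketch: visualizing the LSTM forecasts against the actual
# closing prices for the test period, as a complement to the R^2 score above.
# It assumes the `predictions` and `y` lists built above are available and that
# matplotlib is installed.

import matplotlib.pyplot as plt

pred_values = [p.item() for p in predictions]
true_values = [t.item() for t in y]

plt.plot(true_values, label='Actual close')
plt.plot(pred_values, label='Predicted close')
plt.xlabel('Test day (after the first seq_length days)')
plt.ylabel('Price')
plt.legend()
plt.show()
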
193 | 194 | get_ipython().system('jupyter nbconvert --to python ch12_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /ch12/ch12_part3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Writing your own War and Peace with RNNs 11 | 12 | # ## Acquiring and analyzing the training data 13 | 14 | with open('warpeace_input.txt', 'r', encoding="utf8") as fp: 15 | raw_text = fp.read() 16 | raw_text = raw_text.lower() 17 | 18 | 19 | print(raw_text[:200]) 20 | 21 | 22 | all_words = raw_text.split() 23 | unique_words = list(set(all_words)) 24 | print(f'Number of unique words: {len(unique_words)}') 25 | 26 | 27 | n_chars = len(raw_text) 28 | print(f'Total characters: {n_chars}') 29 | 30 | 31 | chars = sorted(list(set(raw_text))) 32 | vocab_size = len(chars) 33 | print(f'Total vocabulary (unique characters): {vocab_size}') 34 | print(chars) 35 | 36 | 37 | # ## Constructing the training set for the RNN text generator 38 | 39 | index_to_char = dict((i, c) for i, c in enumerate(chars)) 40 | char_to_index = dict((c, i) for i, c in enumerate(chars)) 41 | print(char_to_index) 42 | 43 | 44 | import numpy as np 45 | text_encoded = np.array( 46 | [char_to_index[ch] for ch in raw_text], 47 | dtype=np.int32) 48 | 49 | 50 | seq_length = 40 51 | chunk_size = seq_length + 1 52 | 53 | text_chunks = np.array([text_encoded[i:i+chunk_size] 54 | for i in range(len(text_encoded)-chunk_size+1)]) 55 | 56 | 57 | import torch 58 | from torch.utils.data import Dataset 59 | 60 | class SeqDataset(Dataset): 61 | def __init__(self, text_chunks): 62 | self.text_chunks = text_chunks 63 | 64 | def __len__(self): 65 | return len(self.text_chunks) 66 | 67 | def __getitem__(self, idx): 68 | text_chunk = self.text_chunks[idx] 69 | return text_chunk[:-1].long(), text_chunk[1:].long() 70 | 71 | seq_dataset = SeqDataset(torch.from_numpy(text_chunks)) 72 | 73 | 74 | from torch.utils.data import DataLoader 75 | 76 | batch_size = 64 77 | 78 | torch.manual_seed(0) 79 | seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 80 | 81 | 82 | # ## Building and Training an RNN text generator 83 | 84 | import torch.nn as nn 85 | 86 | class RNN(nn.Module): 87 | def __init__(self, vocab_size, embed_dim, rnn_hidden_dim): 88 | super().__init__() 89 | self.embedding = nn.Embedding(vocab_size, embed_dim) 90 | self.rnn_hidden_dim = rnn_hidden_dim 91 | self.rnn = nn.LSTM(embed_dim, rnn_hidden_dim, 92 | batch_first=True) 93 | self.fc = nn.Linear(rnn_hidden_dim, vocab_size) 94 | 95 | def forward(self, x, hidden, cell): 96 | out = self.embedding(x).unsqueeze(1) 97 | out, (hidden, cell) = self.rnn(out, (hidden, cell)) 98 | out = self.fc(out).reshape(out.size(0), -1) 99 | return out, hidden, cell 100 | 101 | def init_hidden(self, batch_size): 102 | hidden = torch.zeros(1, batch_size, self.rnn_hidden_dim) 103 | cell = torch.zeros(1, batch_size, self.rnn_hidden_dim) 104 | return hidden, cell 105 | 106 | 107 | embed_dim = 256 108 | rnn_hidden_dim = 512 109 | 110 | torch.manual_seed(0) 111 | model = RNN(vocab_size, embed_dim, rnn_hidden_dim) 112 | 113 | device = torch.device("cuda" if 
torch.cuda.is_available() else "cpu") 114 | model = model.to(device) 115 | model 116 | 117 | 118 | loss_fn = nn.CrossEntropyLoss() 119 | optimizer = torch.optim.Adam(model.parameters(), lr=0.003) 120 | 121 | 122 | num_epochs = 10000 123 | 124 | torch.manual_seed(0) 125 | 126 | for epoch in range(num_epochs): 127 | hidden, cell = model.init_hidden(batch_size) 128 | seq_batch, target_batch = next(iter(seq_dl)) 129 | seq_batch = seq_batch.to(device) 130 | target_batch = target_batch.to(device) 131 | optimizer.zero_grad() 132 | loss = 0 133 | for c in range(seq_length): 134 | pred, hidden, cell = model(seq_batch[:, c], hidden.to(device), cell.to(device)) 135 | loss += loss_fn(pred, target_batch[:, c]) 136 | loss.backward() 137 | optimizer.step() 138 | loss = loss.item()/seq_length 139 | if epoch % 500 == 0: 140 | print(f'Epoch {epoch} - loss: {loss:.4f}') 141 | 142 | 143 | from torch.distributions.categorical import Categorical 144 | 145 | def generate_text(model, starting_str, len_generated_text=500): 146 | encoded_input = torch.tensor([char_to_index[s] for s in starting_str]) 147 | encoded_input = torch.reshape(encoded_input, (1, -1)) 148 | 149 | generated_str = starting_str 150 | 151 | model.eval() 152 | 153 | hidden, cell = model.init_hidden(1) 154 | for c in range(len(starting_str)-1): 155 | _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 156 | 157 | last_char = encoded_input[:, -1] 158 | for _ in range(len_generated_text): 159 | logits, hidden, cell = model(last_char.view(1), hidden, cell) 160 | logits = torch.squeeze(logits, 0) 161 | last_char = Categorical(logits=logits).sample() 162 | generated_str += str(index_to_char[last_char.item()]) 163 | 164 | return generated_str 165 | 166 | 167 | model.to('cpu') 168 | torch.manual_seed(0) 169 | print(generate_text(model, 'the emperor', 500)) 170 | 171 | 172 | # --- 173 | 174 | # Readers may ignore the next cell. 
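
# A minimal optional sketch: controlling the randomness of the character
# sampling above with a temperature factor. Dividing the logits by a value
# below 1 makes the Categorical distribution more peaked (more conservative
# text), while values above 1 make it more random. It reuses the trained
# `model`, `char_to_index`, and `index_to_char` defined above; `temperature`
# is an illustrative parameter, not something defined in the original script.

def generate_text_with_temperature(model, starting_str, temperature=0.7,
                                   len_generated_text=200):
    encoded_input = torch.tensor([char_to_index[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    # Feed the prompt through the network one character at a time
    for c in range(len(starting_str) - 1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    last_char = encoded_input[:, -1]
    for _ in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        # Scale the logits before sampling the next character
        logits = torch.squeeze(logits, 0) / temperature
        last_char = Categorical(logits=logits).sample()
        generated_str += str(index_to_char[last_char.item()])

    return generated_str


print(generate_text_with_temperature(model, 'the emperor', temperature=0.5))
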
175 | 176 | get_ipython().system('jupyter nbconvert --to python ch12_part3.ipynb --TemplateExporter.exclude_input_prompt=True') 177 | 178 | -------------------------------------------------------------------------------- /ch13/ch13_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 13 Advancing language understanding and Generation with the Transformer models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Understanding self-attention 11 | 12 | import torch 13 | 14 | sentence = torch.tensor( 15 | [0, # python 16 | 8, # machine 17 | 1, # learning 18 | 6, # by 19 | 2] # example 20 | ) 21 | 22 | sentence 23 | 24 | 25 | torch.manual_seed(0) 26 | embed = torch.nn.Embedding(10, 16) 27 | sentence_embed = embed(sentence).detach() 28 | 29 | 30 | sentence_embed 31 | 32 | 33 | d = sentence_embed.shape[1] 34 | w_key = torch.rand(d, d) 35 | w_query = torch.rand(d, d) 36 | w_value = torch.rand(d, d) 37 | 38 | 39 | token1_embed = sentence_embed[0] 40 | key_1 = w_key.matmul(token1_embed) 41 | query_1 = w_query.matmul(token1_embed) 42 | value_1 = w_value.matmul(token1_embed) 43 | 44 | 45 | key_1 46 | 47 | 48 | keys = sentence_embed.matmul(w_key.T) 49 | 50 | 51 | keys[0] 52 | 53 | 54 | values = sentence_embed.matmul(w_value.T) 55 | 56 | 57 | import torch.nn.functional as F 58 | a1 = F.softmax(query_1.matmul(keys.T) / d ** 0.5, dim=0) 59 | 60 | 61 | a1 62 | 63 | 64 | z1 = a1.matmul(values) 65 | z1 66 | 67 | 68 | # # Improving sentiment analysis with BERT and Transformers 69 | 70 | # ## Fine-tuning a pre-trained BERT model for sentiment Analysis 71 | 72 | from torchtext.datasets import IMDB 73 | 74 | train_dataset = list(IMDB(split='train')) 75 | test_dataset = list(IMDB(split='test')) 76 | 77 | print(len(train_dataset), len(test_dataset)) 78 | 79 | 80 | train_texts = [train_sample[1] for train_sample in train_dataset] 81 | train_labels = [train_sample[0] for train_sample in train_dataset] 82 | 83 | test_texts = [test_sample[1] for test_sample in test_dataset] 84 | test_labels = [test_sample[0] for test_sample in test_dataset] 85 | 86 | 87 | import transformers 88 | from transformers import DistilBertTokenizerFast 89 | 90 | # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') 91 | tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', local_files_only=True) 92 | 93 | 94 | train_encodings = tokenizer(train_texts, truncation=True, padding=True) 95 | test_encodings = tokenizer(test_texts, truncation=True, padding=True) 96 | 97 | 98 | train_encodings[0] 99 | 100 | 101 | class IMDbDataset(torch.utils.data.Dataset): 102 | def __init__(self, encodings, labels): 103 | self.encodings = encodings 104 | self.labels = labels 105 | 106 | def __getitem__(self, idx): 107 | item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} 108 | item['labels'] = torch.tensor([0., 1.] 
if self.labels[idx] == 2 else [1., 0.]) 109 | return item 110 | 111 | def __len__(self): 112 | return len(self.labels) 113 | 114 | 115 | train_encoded_dataset = IMDbDataset(train_encodings, train_labels) 116 | test_encoded_dataset = IMDbDataset(test_encodings, test_labels) 117 | 118 | 119 | batch_size = 32 120 | train_dl = torch.utils.data.DataLoader(train_encoded_dataset, batch_size=batch_size, shuffle=True) 121 | test_dl = torch.utils.data.DataLoader(test_encoded_dataset, batch_size=batch_size, shuffle=False) 122 | 123 | 124 | from transformers import DistilBertForSequenceClassification 125 | 126 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 127 | 128 | model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', local_files_only=True) 129 | model.to(device) 130 | 131 | 132 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 133 | 134 | 135 | def train(model, dataloader, optimizer): 136 | model.train() 137 | total_loss = 0 138 | for batch in dataloader: 139 | optimizer.zero_grad() 140 | 141 | input_ids = batch['input_ids'].to(device) 142 | attention_mask = batch['attention_mask'].to(device) 143 | labels = batch['labels'].to(device) 144 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 145 | loss = outputs['loss'] 146 | 147 | optimizer.zero_grad() 148 | loss.backward() 149 | optimizer.step() 150 | 151 | total_loss += loss.item()*len(batch) 152 | 153 | return total_loss/len(dataloader.dataset) 154 | 155 | 156 | 157 | def evaluate(model, dataloader): 158 | model.eval() 159 | total_acc = 0 160 | with torch.no_grad(): 161 | for batch in dataloader: 162 | 163 | input_ids = batch['input_ids'].to(device) 164 | attention_mask = batch['attention_mask'].to(device) 165 | labels = batch['labels'].to(device) 166 | outputs = model(input_ids, attention_mask=attention_mask) 167 | logits = outputs['logits'] 168 | 169 | pred = torch.argmax(logits, 1) 170 | total_acc += (pred == torch.argmax(labels, 1)).float().sum().item() 171 | 172 | return total_acc/len(dataloader.dataset) 173 | 174 | 175 | 176 | torch.manual_seed(0) 177 | num_epochs = 1 178 | for epoch in range(num_epochs): 179 | train_loss = train(model, train_dl, optimizer) 180 | train_acc = evaluate(model, train_dl) 181 | print(f'Epoch {epoch+1} - loss: {train_loss:.4f} - accuracy: {train_acc:.4f}') 182 | 183 | 184 | test_acc = evaluate(model, test_dl) 185 | print(f'Accuracy on test set: {100 * test_acc:.2f} %') 186 | 187 | 188 | # torch.cuda.mem_get_info() 189 | 190 | 191 | # torch.cuda.empty_cache() 192 | 193 | 194 | # free up memory 195 | del model 196 | 197 | 198 | # ## Using the Trainer API to train Transformer models 199 | 200 | model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', local_files_only=True) 201 | model.to(device) 202 | 203 | optim = torch.optim.Adam(model.parameters(), lr=5e-5) 204 | 205 | 206 | # !conda install -c conda-forge accelerate -y 207 | 208 | 209 | from transformers import Trainer, TrainingArguments 210 | 211 | training_args = TrainingArguments( 212 | output_dir='./results', 213 | num_train_epochs=1, 214 | per_device_train_batch_size=32, 215 | logging_dir='./logs', 216 | logging_steps=50, 217 | ) 218 | 219 | 220 | # trainer = Trainer( 221 | # model=model, 222 | # args=training_args, 223 | # train_dataset=train_encoded_dataset, 224 | # optimizers=(optim, None) 225 | # ) 226 | 227 | 228 | from datasets import load_metric 229 | import numpy as np 230 | 231 | metric = load_metric("accuracy") 232 | 233 | def 
compute_metrics(eval_pred): 234 | logits, labels = eval_pred 235 | pred = np.argmax(logits, axis=-1) 236 | return metric.compute(predictions=pred, references=np.argmax(labels, 1)) 237 | 238 | 239 | trainer = Trainer( 240 | model=model, 241 | compute_metrics=compute_metrics, 242 | args=training_args, 243 | train_dataset=train_encoded_dataset, 244 | eval_dataset=test_encoded_dataset, 245 | optimizers=(optim, None) 246 | ) 247 | 248 | 249 | trainer.train() 250 | 251 | 252 | print(trainer.evaluate()) 253 | 254 | 255 | # --- 256 | 257 | # Readers may ignore the next cell. 258 | 259 | get_ipython().system('jupyter nbconvert --to python ch13_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 260 | 261 | -------------------------------------------------------------------------------- /ch13/ch13_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 13 Advancing language understanding and Generation with the Transformer models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Generating text using GPT 11 | 12 | # ## Writing your own War and Peace with GPT 13 | 14 | from transformers import pipeline, set_seed 15 | 16 | generator = pipeline('text-generation', model='gpt2') 17 | set_seed(0) 18 | generator("I love machine learning", 19 | max_length=20, 20 | num_return_sequences=3) 21 | 22 | 23 | from transformers import TextDataset, GPT2Tokenizer 24 | 25 | # tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 26 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2', local_files_only=True) 27 | 28 | 29 | text_dataset = TextDataset(tokenizer=tokenizer, file_path='warpeace_input.txt', block_size=128) 30 | 31 | 32 | len(text_dataset) 33 | 34 | 35 | from transformers import DataCollatorForLanguageModeling 36 | data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) 37 | 38 | 39 | import torch 40 | from transformers import GPT2LMHeadModel 41 | model = GPT2LMHeadModel.from_pretrained('gpt2') 42 | 43 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 44 | model.to(device) 45 | 46 | 47 | optim = torch.optim.Adam(model.parameters(), lr=5e-5) 48 | 49 | 50 | from transformers import Trainer, TrainingArguments 51 | 52 | training_args = TrainingArguments( 53 | output_dir='./gpt_results', 54 | num_train_epochs=20, 55 | per_device_train_batch_size=16, 56 | logging_dir='./gpt_logs', 57 | save_total_limit=1, 58 | logging_steps=500, 59 | ) 60 | 61 | 62 | trainer = Trainer( 63 | model=model, 64 | args=training_args, 65 | data_collator=data_collator, 66 | train_dataset=text_dataset, 67 | optimizers=(optim, None) 68 | ) 69 | 70 | 71 | trainer.train() 72 | 73 | 74 | def generate_text(prompt_text, model, tokenizer, max_length): 75 | input_ids = tokenizer.encode(prompt_text, return_tensors="pt").to(device) 76 | 77 | # Generate response 78 | output_sequences = model.generate( 79 | input_ids=input_ids, 80 | max_length=max_length, 81 | num_return_sequences=1, 82 | no_repeat_ngram_size=2, 83 | top_p=0.9, 84 | ) 85 | 86 | # Decode the generated responses 87 | responses = [] 88 | for response_id in output_sequences: 89 | response = tokenizer.decode(response_id, skip_special_tokens=True) 90 | responses.append(response) 91 | 92 | return responses 93 | 94 | 95 | prompt_text = "the emperor" 96 | responses = generate_text(prompt_text, model, tokenizer, 100) 97 | 98 | for 
response in responses: 99 | print(response) 100 | 101 | 102 | # --- 103 | 104 | # Readers may ignore the next cell. 105 | 106 | get_ipython().system('jupyter nbconvert --to python ch13_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 107 | 108 | -------------------------------------------------------------------------------- /ch14/ch14_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 14 Building an Image Search Engine Using Multimodal Models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Getting started with the dataset 11 | 12 | # ## Loading the Flickr8k dataset 13 | 14 | import os 15 | from PIL import Image 16 | import torch 17 | from torch.utils.data import Dataset, DataLoader 18 | import torchvision.transforms as transforms 19 | 20 | 21 | image_dir = "flickr8k/Flicker8k_Dataset" 22 | caption_file = "flickr8k/captions.txt" 23 | 24 | 25 | from transformers import DistilBertTokenizer 26 | tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 27 | 28 | 29 | class Flickr8kDataset(Dataset): 30 | def __init__(self, image_dir, caption_file): 31 | self.image_dir = image_dir 32 | self.transform = transforms.Compose([ 33 | transforms.Resize((224, 224)), 34 | transforms.ToTensor(), 35 | ]) 36 | self.image_paths, self.captions = self.read_caption_file(caption_file) 37 | 38 | def read_caption_file(self, caption_file): 39 | image_paths = [] 40 | captions = [] 41 | 42 | with open(caption_file, "r") as file: 43 | lines = file.readlines() 44 | for line in lines[1:]: 45 | parts = line.strip().split(",") 46 | image_paths.append(os.path.join(self.image_dir, parts[0])) 47 | captions.append(parts[1]) 48 | 49 | self.caption_encodings = tokenizer(captions, truncation=True, padding=True, max_length=200) 50 | 51 | return image_paths, captions 52 | 53 | def __len__(self): 54 | return len(self.image_paths) 55 | 56 | def __getitem__(self, idx): 57 | item = {key: torch.tensor(val[idx]) for key, val in self.caption_encodings.items()} 58 | 59 | caption = self.captions[idx] 60 | item["caption"] = caption 61 | 62 | img_path = self.image_paths[idx] 63 | img = Image.open(img_path).convert("RGB") 64 | img = self.transform(img) 65 | item['image'] = img 66 | 67 | return item 68 | 69 | 70 | flickr8k_dataset = Flickr8kDataset(image_dir=image_dir, caption_file=caption_file) 71 | 72 | 73 | item_sample = next(iter(flickr8k_dataset)) 74 | 75 | 76 | item_sample 77 | 78 | 79 | import matplotlib.pyplot as plt 80 | import numpy as np 81 | 82 | npimg = item_sample['image'].numpy() 83 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 84 | 85 | 86 | 87 | torch.manual_seed(0) 88 | batch_size = 32 89 | data_loader = DataLoader(flickr8k_dataset, batch_size=batch_size, shuffle=True) 90 | 91 | 92 | # # Architecting the CLIP model 93 | 94 | # ## Vision encoder 95 | 96 | import torch.nn as nn 97 | from torchvision.models import resnet50 98 | 99 | class VisionEncoder(nn.Module): 100 | def __init__(self): 101 | super().__init__() 102 | pretrained_resnet50 = resnet50(pretrained=True) 103 | self.model = nn.Sequential(*list(pretrained_resnet50.children())[:-1]) 104 | 105 | for param in self.model.parameters(): 106 | param.requires_grad = False 107 | 108 | def forward(self, x): 109 | x= self.model(x) 110 | x = x.view(x.size(0), -1) 111 | return x 112 | 113 | 114 | 115 | # ## Text encoder 116 | 117 | 
from transformers import DistilBertModel 118 | 119 | 120 | class TextEncoder(nn.Module): 121 | def __init__(self): 122 | super().__init__() 123 | self.model = DistilBertModel.from_pretrained('distilbert-base-uncased') 124 | for param in self.model.parameters(): 125 | param.requires_grad = False 126 | 127 | def forward(self, input_ids, attention_mask=None): 128 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 129 | return outputs.last_hidden_state[:, 0, :] 130 | 131 | 132 | # ## Projection head for contractive learning 133 | 134 | class ProjectionHead(nn.Module): 135 | def __init__(self, embedding_dim, projection_dim=256, dropout=0.1): 136 | super().__init__() 137 | self.projection = nn.Linear(embedding_dim, projection_dim) 138 | self.gelu = nn.GELU() 139 | self.fc = nn.Linear(projection_dim, projection_dim) 140 | self.dropout = nn.Dropout(dropout) 141 | self.layer_norm = nn.LayerNorm(projection_dim) 142 | 143 | def forward(self, x): 144 | projection = self.projection(x) 145 | x = self.gelu(projection) 146 | x = self.fc(x) 147 | x = self.dropout(x) 148 | x = projection + x 149 | x = self.layer_norm(x) 150 | return x 151 | 152 | 153 | 154 | # ## CLIP model 155 | 156 | import torch.nn.functional as F 157 | 158 | class CLIPModel(nn.Module): 159 | def __init__(self, image_embedding=2048, text_embedding=768): 160 | super().__init__() 161 | self.vision_encoder = VisionEncoder() 162 | self.text_encoder = TextEncoder() 163 | self.image_projection = ProjectionHead(embedding_dim=image_embedding) 164 | self.text_projection = ProjectionHead(embedding_dim=text_embedding) 165 | 166 | def forward(self, batch): 167 | image_features = self.vision_encoder(batch["image"]) 168 | text_features = self.text_encoder( 169 | input_ids=batch["input_ids"], attention_mask=batch["attention_mask"] 170 | ) 171 | image_embeddings = self.image_projection(image_features) 172 | text_embeddings = self.text_projection(text_features) 173 | 174 | logits = text_embeddings @ image_embeddings.T 175 | images_similarity = image_embeddings @ image_embeddings.T 176 | texts_similarity = text_embeddings @ text_embeddings.T 177 | targets = F.softmax((images_similarity + texts_similarity)/2 , dim=-1) 178 | texts_loss = F.cross_entropy(logits, targets) 179 | images_loss = F.cross_entropy(logits.T, targets.T) 180 | loss = (images_loss + texts_loss) / 2 181 | return loss.mean() 182 | 183 | 184 | 185 | # # Finding images with words 186 | 187 | # ## Training the CLIP model 188 | 189 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 190 | 191 | 192 | model = CLIPModel().to(device) 193 | 194 | 195 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 196 | 197 | 198 | def train(model, dataloader, optimizer): 199 | model.train() 200 | total_loss = 0 201 | b = 0 202 | for batch in dataloader: 203 | optimizer.zero_grad() 204 | batch = {k: v.to(device) for k, v in batch.items() if k != "caption"} 205 | loss = model(batch) 206 | optimizer.zero_grad() 207 | loss.backward() 208 | optimizer.step() 209 | 210 | total_loss += loss.item()*len(batch) 211 | 212 | return total_loss/len(dataloader.dataset) 213 | 214 | 215 | num_epochs = 3 216 | for epoch in range(num_epochs): 217 | train_loss = train(model, data_loader, optimizer) 218 | print(f'Epoch {epoch+1} - loss: {train_loss:.4f}') 219 | 220 | 221 | # ## Obtaining embeddings for images and text to identify matches 222 | 223 | torch.manual_seed(0) 224 | data_loader = DataLoader(flickr8k_dataset, batch_size=batch_size, shuffle=True) 225 | sample_batch = 
next(iter(data_loader)) 226 | 227 | 228 | batch_image_features = model.vision_encoder(sample_batch["image"].to(device)) 229 | batch_image_embeddings = model.image_projection(batch_image_features) 230 | 231 | 232 | 233 | def search_top_images(model, image_embeddings, query, n=1): 234 | encoded_query = tokenizer([query]) 235 | batch = { 236 | key: torch.tensor(values).to(device) 237 | for key, values in encoded_query.items() 238 | } 239 | model.eval() 240 | with torch.no_grad(): 241 | text_features = model.text_encoder( 242 | input_ids=batch["input_ids"], 243 | attention_mask=batch["attention_mask"]) 244 | text_embeddings = model.text_projection(text_features) 245 | 246 | dot_similarity = text_embeddings @ image_embeddings.T 247 | values, indices = torch.topk(dot_similarity.squeeze(0), n) 248 | return indices 249 | 250 | 251 | 252 | query = "a running dog" 253 | top_image_ids = search_top_images(model, batch_image_embeddings, query, 2) 254 | for id in top_image_ids: 255 | image = sample_batch["image"][id] 256 | npimg = image.numpy() 257 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 258 | plt.title(f"Query: {query}") 259 | plt.show() 260 | 261 | 262 | query = "kids jumping into a pool" 263 | top_image_ids = search_top_images(model, batch_image_embeddings, query, 1) 264 | for id in top_image_ids: 265 | image = sample_batch["image"][id] 266 | npimg = image.numpy() 267 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 268 | plt.title(f"Query: {query}") 269 | plt.show() 270 | 271 | 272 | 273 | # --- 274 | 275 | # Readers may ignore the next cell. 276 | 277 | get_ipython().system('jupyter nbconvert --to python ch14_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 278 | 279 | -------------------------------------------------------------------------------- /ch14/ch14_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 14 Building an Image Search Engine Using Multimodal Models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Finding images with words 11 | 12 | # ## Image search using the pre-trained CLIP model 13 | 14 | from sentence_transformers import SentenceTransformer, util 15 | model = SentenceTransformer('clip-ViT-B-32') 16 | 17 | 18 | import os 19 | import glob 20 | from PIL import Image 21 | import torch 22 | 23 | 24 | image_paths = list(glob.glob('flickr8k/Flicker8k_Dataset/*.jpg')) 25 | 26 | all_image_embeddings = [] 27 | for img_path in image_paths: 28 | img = Image.open(img_path) 29 | all_image_embeddings.append(model.encode(img, convert_to_tensor=True)) 30 | 31 | 32 | 33 | import matplotlib.pyplot as plt 34 | 35 | 36 | def search_top_images(model, image_embeddings, query, top_k=1): 37 | query_embeddings = model.encode([query], convert_to_tensor=True, show_progress_bar=False) 38 | hits = util.semantic_search(query_embeddings, image_embeddings, top_k=top_k)[0] 39 | return hits 40 | 41 | 42 | query = "a swimming dog" 43 | hits = search_top_images(model, all_image_embeddings, query) 44 | 45 | for hit in hits: 46 | img_path = image_paths[hit['corpus_id']] 47 | image = Image.open(img_path) 48 | plt.imshow(image) 49 | plt.title(f"Query: {query}") 50 | plt.show() 51 | 52 | 53 | image_query = Image.open("flickr8k/Flicker8k_Dataset/240696675_7d05193aa0.jpg") 54 | hits = search_top_images(model, all_image_embeddings, image_query, 3)[1:] 55 | 56 | plt.imshow(image_query) 57 
| plt.title(f"Query image") 58 | plt.show() 59 | 60 | for hit in hits: 61 | img_path = image_paths[hit['corpus_id']] 62 | image = Image.open(img_path) 63 | plt.imshow(image) 64 | plt.title(f"Similar image") 65 | plt.show() 66 | 67 | 68 | # ## Zero-shot classification 69 | 70 | from torchvision.datasets import CIFAR100 71 | cifar100 = CIFAR100(root="CIFAR100", download=True, train=False) 72 | 73 | 74 | print(cifar100.classes) 75 | print("Number of classes in CIFAR100 dataset:", len(cifar100.classes)) 76 | 77 | 78 | sample_index = 0 79 | img, class_id = cifar100[sample_index] 80 | print(f"Class of the sample image: {class_id} - {cifar100.classes[class_id]}") 81 | 82 | 83 | sample_image_embeddings = model.encode(img, convert_to_tensor=True) 84 | 85 | 86 | class_text = model.encode(cifar100.classes, convert_to_tensor=True) 87 | 88 | 89 | hits = util.semantic_search(sample_image_embeddings, class_text, top_k=1)[0] 90 | pred = hits[0]['corpus_id'] 91 | print(f"Predicted class of the sample image: {pred}") 92 | 93 | 94 | all_image_embeddings = [] 95 | class_true = [] 96 | for img, class_id in cifar100: 97 | class_true.append(class_id) 98 | all_image_embeddings.append(model.encode(img, convert_to_tensor=True)) 99 | 100 | 101 | class_pred = [] 102 | for hit in util.semantic_search(all_image_embeddings, class_text, top_k=1): 103 | class_pred.append(hit[0]['corpus_id']) 104 | 105 | 106 | from sklearn.metrics import accuracy_score 107 | acc = accuracy_score(class_true, class_pred) 108 | print(f"Accuracy of zero-shot classification: {acc * 100}%") 109 | 110 | 111 | # --- 112 | 113 | # Readers may ignore the next cell. 114 | 115 | get_ipython().system('jupyter nbconvert --to python ch14_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /ch15/ch15_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 15 Making Decisions in Complex Environments with Reinforcement Learning 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Setting up the working environment 11 | 12 | # ## Installing OpenAI Gym 13 | 14 | import gymnasium as gym 15 | print(gym.envs.registry.keys()) 16 | 17 | 18 | # # Solving the FrozenLake environment with dynamic programming 19 | 20 | # ## Simulating the FrozenLake environment 21 | 22 | env = gym.make("FrozenLake-v1", render_mode="rgb_array") 23 | 24 | n_state = env.observation_space.n 25 | print(n_state) 26 | n_action = env.action_space.n 27 | print(n_action) 28 | 29 | 30 | env.reset(seed=0) 31 | 32 | 33 | import matplotlib.pyplot as plt 34 | plt.imshow(env.render()) 35 | 36 | 37 | new_state, reward, terminated, truncated, info = env.step(2) 38 | is_done = terminated or truncated 39 | 40 | env.render() 41 | print(new_state) 42 | print(reward) 43 | print(is_done) 44 | print(info) 45 | 46 | 47 | plt.imshow(env.render()) 48 | 49 | 50 | def run_episode(env, policy): 51 | state, _ = env.reset() 52 | total_reward = 0 53 | is_done = False 54 | while not is_done: 55 | action = policy[state].item() 56 | state, reward, terminated, truncated, info = env.step(action) 57 | is_done = terminated or truncated 58 | total_reward += reward 59 | if is_done: 60 | break 61 | return total_reward 62 | 63 | 64 | import torch 65 | 66 | n_episode = 1000 67 | 68 | total_rewards = [] 
69 | for episode in range(n_episode): 70 | random_policy = torch.randint(high=n_action, size=(n_state,)) 71 | total_reward = run_episode(env, random_policy) 72 | total_rewards.append(total_reward) 73 | 74 | print(f'Average total reward under random policy: {sum(total_rewards)/n_episode}') 75 | 76 | 77 | print(env.env.P[6]) 78 | 79 | 80 | # ## Solving FrozenLake with the value iteration algorithm 81 | 82 | def value_iteration(env, gamma, threshold): 83 | """ 84 | Solve a given environment with value iteration algorithm 85 | @param env: Gymnasium environment 86 | @param gamma: discount factor 87 | @param threshold: the evaluation will stop once values for all states are less than the threshold 88 | @return: values of the optimal policy for the given environment 89 | """ 90 | n_state = env.observation_space.n 91 | n_action = env.action_space.n 92 | V = torch.zeros(n_state) 93 | while True: 94 | V_temp = torch.empty(n_state) 95 | for state in range(n_state): 96 | v_actions = torch.zeros(n_action) 97 | for action in range(n_action): 98 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 99 | v_actions[action] += trans_prob * (reward + gamma * V[new_state]) 100 | V_temp[state] = torch.max(v_actions) 101 | max_delta = torch.max(torch.abs(V - V_temp)) 102 | V = V_temp.clone() 103 | if max_delta <= threshold: 104 | break 105 | return V 106 | 107 | 108 | gamma = 0.99 109 | threshold = 0.0001 110 | 111 | V_optimal = value_iteration(env, gamma, threshold) 112 | print('Optimal values:\n', V_optimal) 113 | 114 | 115 | 116 | def extract_optimal_policy(env, V_optimal, gamma): 117 | """ 118 | Obtain the optimal policy based on the optimal values 119 | @param env: Gymnasium environment 120 | @param V_optimal: optimal values 121 | @param gamma: discount factor 122 | @return: optimal policy 123 | """ 124 | n_state = env.observation_space.n 125 | n_action = env.action_space.n 126 | optimal_policy = torch.zeros(n_state) 127 | for state in range(n_state): 128 | v_actions = torch.zeros(n_action) 129 | for action in range(n_action): 130 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 131 | v_actions[action] += trans_prob * (reward + gamma * V_optimal[new_state]) 132 | optimal_policy[state] = torch.argmax(v_actions) 133 | return optimal_policy 134 | 135 | 136 | optimal_policy = extract_optimal_policy(env, V_optimal, gamma) 137 | print('Optimal policy:\n', optimal_policy) 138 | 139 | 140 | def run_episode(env, policy): 141 | state, _ = env.reset() 142 | total_reward = 0 143 | is_done = False 144 | while not is_done: 145 | action = policy[state].item() 146 | state, reward, terminated, truncated, info = env.step(action) 147 | is_done = terminated or truncated 148 | total_reward += reward 149 | if is_done: 150 | break 151 | return total_reward 152 | 153 | 154 | n_episode = 1000 155 | total_rewards = [] 156 | for episode in range(n_episode): 157 | total_reward = run_episode(env, optimal_policy) 158 | total_rewards.append(total_reward) 159 | 160 | print('Average total reward under the optimal policy:', sum(total_rewards) / n_episode) 161 | 162 | 163 | # ## Solving FrozenLake with the policy iteration algorithm 164 | 165 | def policy_evaluation(env, policy, gamma, threshold): 166 | """ 167 | Perform policy evaluation 168 | @param env: Gymnasium environment 169 | @param policy: policy matrix containing actions and their probability in each state 170 | @param gamma: discount factor 171 | @param threshold: the evaluation will stop once values for all states are less than the threshold 172 
| @return: values of the given policy 173 | """ 174 | n_state = policy.shape[0] 175 | V = torch.zeros(n_state) 176 | while True: 177 | V_temp = torch.zeros(n_state) 178 | for state in range(n_state): 179 | action = policy[state].item() 180 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 181 | V_temp[state] += trans_prob * (reward + gamma * V[new_state]) 182 | max_delta = torch.max(torch.abs(V - V_temp)) 183 | V = V_temp.clone() 184 | if max_delta <= threshold: 185 | break 186 | return V 187 | 188 | 189 | def policy_improvement(env, V, gamma): 190 | """ 191 | Obtain an improved policy based on the values 192 | @param env: Gymnasium environment 193 | @param V: policy values 194 | @param gamma: discount factor 195 | @return: the policy 196 | """ 197 | n_state = env.observation_space.n 198 | n_action = env.action_space.n 199 | policy = torch.zeros(n_state) 200 | for state in range(n_state): 201 | v_actions = torch.zeros(n_action) 202 | for action in range(n_action): 203 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 204 | v_actions[action] += trans_prob * (reward + gamma * V[new_state]) 205 | policy[state] = torch.argmax(v_actions) 206 | return policy 207 | 208 | 209 | def policy_iteration(env, gamma, threshold): 210 | """ 211 | Solve a given environment with policy iteration algorithm 212 | @param env: Gymnasium environment 213 | @param gamma: discount factor 214 | @param threshold: the evaluation will stop once values for all states are less than the threshold 215 | @return: optimal values and the optimal policy for the given environment 216 | """ 217 | n_state = env.observation_space.n 218 | n_action = env.action_space.n 219 | policy = torch.randint(high=n_action, size=(n_state,)).float() 220 | while True: 221 | V = policy_evaluation(env, policy, gamma, threshold) 222 | policy_improved = policy_improvement(env, V, gamma) 223 | if torch.equal(policy_improved, policy): 224 | return V, policy_improved 225 | policy = policy_improved 226 | 227 | 228 | gamma = 0.99 229 | threshold = 0.0001 230 | 231 | 232 | V_optimal, optimal_policy = policy_iteration(env, gamma, threshold) 233 | print('Optimal values:\n', V_optimal) 234 | print('Optimal policy:\n', optimal_policy) 235 | 236 | 237 | # --- 238 | 239 | # Readers may ignore the next cell. 
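# A minimal optional sketch: the optimal policy found above can be printed as
# arrows on the default 4x4 FrozenLake grid, assuming the standard action
# encoding of FrozenLake-v1 (0 = left, 1 = down, 2 = right, 3 = up). It reuses
# `optimal_policy` and `n_state` from the cells above.

action_symbols = ['<', 'v', '>', '^']
grid_side = int(n_state ** 0.5)
for row in range(grid_side):
    line = ''
    for col in range(grid_side):
        action = int(optimal_policy[row * grid_side + col].item())
        line += action_symbols[action] + ' '
    print(line)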
240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /ch15/ch15_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 15 Making Decisions in Complex Environments with Reinforcement Learning 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Performing Monte Carlo learning 11 | 12 | # ## Simulating the Blackjack environment 13 | 14 | import gymnasium as gym 15 | 16 | env = gym.make('Blackjack-v1') 17 | 18 | env.reset(seed=0) 19 | 20 | 21 | env.step(1) 22 | 23 | 24 | env.step(1) 25 | 26 | 27 | env.step(0) 28 | 29 | 30 | # ## Performing Monte Carlo policy evaluation 31 | 32 | def run_episode(env, hold_score): 33 | state, _ = env.reset() 34 | rewards = [] 35 | states = [state] 36 | while True: 37 | action = 1 if state[0] < hold_score else 0 38 | state, reward, terminated, truncated, info = env.step(action) 39 | is_done = terminated or truncated 40 | states.append(state) 41 | rewards.append(reward) 42 | if is_done: 43 | break 44 | return states, rewards 45 | 46 | 47 | from collections import defaultdict 48 | 49 | def mc_prediction_first_visit(env, hold_score, gamma, n_episode): 50 | V = defaultdict(float) 51 | N = defaultdict(int) 52 | for episode in range(n_episode): 53 | states_t, rewards_t = run_episode(env, hold_score) 54 | return_t = 0 55 | G = {} 56 | for state_t, reward_t in zip(states_t[1::-1], rewards_t[::-1]): 57 | return_t = gamma * return_t + reward_t 58 | G[state_t] = return_t 59 | for state, return_t in G.items(): 60 | if state[0] <= 21: 61 | V[state] += return_t 62 | N[state] += 1 63 | for state in V: 64 | V[state] = V[state] / N[state] 65 | return V 66 | 67 | 68 | gamma = 1 69 | hold_score = 18 70 | n_episode = 500000 71 | 72 | value = mc_prediction_first_visit(env, hold_score, gamma, n_episode) 73 | 74 | print(value) 75 | 76 | print('Number of states:', len(value)) 77 | 78 | 79 | # ## Performing on-policy Monte Carlo control 80 | 81 | import torch 82 | 83 | def run_episode(env, Q, n_action): 84 | state, _ = env.reset() 85 | rewards = [] 86 | actions = [] 87 | states = [] 88 | action = torch.randint(0, n_action, [1]).item() 89 | while True: 90 | actions.append(action) 91 | states.append(state) 92 | state, reward, terminated, truncated, info = env.step(action) 93 | is_done = terminated or truncated 94 | rewards.append(reward) 95 | if is_done: 96 | break 97 | action = torch.argmax(Q[state]).item() 98 | return states, actions, rewards 99 | 100 | 101 | def mc_control_on_policy(env, gamma, n_episode): 102 | n_action = env.action_space.n 103 | G_sum = defaultdict(float) 104 | N = defaultdict(int) 105 | Q = defaultdict(lambda: torch.empty(n_action)) 106 | for episode in range(n_episode): 107 | states_t, actions_t, rewards_t = run_episode(env, Q, n_action) 108 | return_t = 0 109 | G = {} 110 | for state_t, action_t, reward_t in zip(states_t[::-1], actions_t[::-1], rewards_t[::-1]): 111 | return_t = gamma * return_t + reward_t 112 | G[(state_t, action_t)] = return_t 113 | for state_action, return_t in G.items(): 114 | state, action = state_action 115 | if state[0] <= 21: 116 | G_sum[state_action] += return_t 117 | N[state_action] += 1 118 | Q[state][action] = G_sum[state_action] / N[state_action] 119 | policy = {} 120 | for state, actions in Q.items(): 121 | policy[state] = torch.argmax(actions).item() 122 | return Q, 
policy 123 | 124 | 125 | gamma = 1 126 | n_episode = 500000 127 | 128 | optimal_Q, optimal_policy = mc_control_on_policy(env, gamma, n_episode) 129 | 130 | print(optimal_policy) 131 | 132 | 133 | def simulate_hold_episode(env, hold_score): 134 | state, _ = env.reset() 135 | while True: 136 | action = 1 if state[0] < hold_score else 0 137 | state, reward, terminated, truncated, info = env.step(action) 138 | is_done = terminated or truncated 139 | if is_done: 140 | return reward 141 | 142 | 143 | 144 | def simulate_episode(env, policy): 145 | state, _ = env.reset() 146 | while True: 147 | action = policy[state] 148 | state, reward, terminated, truncated, info = env.step(action) 149 | is_done = terminated or truncated 150 | if is_done: 151 | return reward 152 | 153 | 154 | 155 | n_episode = 100000 156 | hold_score = 18 157 | n_win_opt = 0 158 | n_win_hold = 0 159 | 160 | for _ in range(n_episode): 161 | reward = simulate_hold_episode(env, hold_score) 162 | if reward == 1: 163 | n_win_hold += 1 164 | reward = simulate_episode(env, optimal_policy) 165 | if reward == 1: 166 | n_win_opt += 1 167 | 168 | 169 | print(f'Winning probability under the simple policy: {n_win_hold/n_episode}') 170 | print(f'Winning probability under the optimal policy: {n_win_opt/n_episode}') 171 | 172 | 173 | # # Solving the Blackjack problem with the Q-learning algorithm 174 | 175 | # ## Developing the Q-learning algorithm 176 | 177 | def epsilon_greedy_policy(n_action, epsilon, state, Q): 178 | probs = torch.ones(n_action) * epsilon / n_action 179 | best_action = torch.argmax(Q[state]).item() 180 | probs[best_action] += 1.0 - epsilon 181 | action = torch.multinomial(probs, 1).item() 182 | return action 183 | 184 | 185 | def q_learning(env, gamma, n_episode, alpha, epsilon, final_epsilon): 186 | n_action = env.action_space.n 187 | Q = defaultdict(lambda: torch.zeros(n_action)) 188 | epsilon_decay = epsilon / (n_episode / 2) 189 | for episode in range(n_episode): 190 | state, _ = env.reset() 191 | is_done = False 192 | epsilon = max(final_epsilon, epsilon - epsilon_decay) 193 | 194 | while not is_done: 195 | action = epsilon_greedy_policy(n_action, epsilon, state, Q) 196 | next_state, reward, terminated, truncated, info = env.step(action) 197 | is_done = terminated or truncated 198 | delta = reward + gamma * torch.max(Q[next_state]) - Q[state][action] 199 | Q[state][action] += alpha * delta 200 | total_reward_episode[episode] += reward 201 | if is_done: 202 | break 203 | state = next_state 204 | policy = {} 205 | for state, actions in Q.items(): 206 | policy[state] = torch.argmax(actions).item() 207 | return Q, policy 208 | 209 | 210 | n_episode = 10000 211 | epsilon = 1.0 212 | final_epsilon = 0.1 213 | 214 | gamma = 1 215 | alpha = 0.003 216 | 217 | total_reward_episode = [0] * n_episode 218 | 219 | optimal_Q, optimal_policy = q_learning(env, gamma, n_episode, alpha, epsilon, final_epsilon) 220 | 221 | 222 | rolling_avg_reward = [total_reward_episode[0]] 223 | for i, reward in enumerate(total_reward_episode[1:], 1): 224 | rolling_avg_reward.append((rolling_avg_reward[-1]*i + reward)/(i+1)) 225 | 226 | 227 | import matplotlib.pyplot as plt 228 | plt.plot(rolling_avg_reward) 229 | plt.title('Average reward over time') 230 | plt.xlabel('Episode') 231 | plt.ylabel('Average reward') 232 | plt.ylim([-1, 1]) 233 | plt.show() 234 | 235 | 236 | n_episode = 100000 237 | n_win_opt = 0 238 | 239 | for _ in range(n_episode): 240 | reward = simulate_episode(env, optimal_policy) 241 | if reward == 1: 242 | n_win_opt += 1 243 | 244 | 
245 | print(f'Winning probability under the optimal policy: {n_win_opt/n_episode}') 246 | 247 | 248 | # --- 249 | 250 | # Readers may ignore the next cell. 251 | 252 | get_ipython().system('jupyter nbconvert --to python ch15_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 253 | 254 | -------------------------------------------------------------------------------- /ch2/ch2_part1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "38c139cd", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)\n", 10 | "\n", 11 | "Chapter 2 Building A Movie Recommendation Engine with Naive Bayes\n", 12 | "\n", 13 | "Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)\n", 14 | "\n", 15 | "\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "6740371d", 21 | "metadata": {}, 22 | "source": [ 23 | "# Implementing Naïve Bayes " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "33286ee5", 29 | "metadata": {}, 30 | "source": [ 31 | "## Implementing Naïve Bayes from scratch" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "id": "88605dbc", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "e6695e41", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X_train = np.array([\n", 52 | " [0, 1, 1],\n", 53 | " [0, 0, 1],\n", 54 | " [0, 0, 0],\n", 55 | " [1, 1, 0]])\n", 56 | "\n", 57 | "Y_train = ['Y', 'N', 'Y', 'Y']\n", 58 | "\n", 59 | "X_test = np.array([[1, 1, 0]])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "38c76d45", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def get_label_indices(labels):\n", 70 | " \"\"\"\n", 71 | " Group samples based on their labels and return indices\n", 72 | " @param labels: list of labels\n", 73 | " @return: dict, {class1: [indices], class2: [indices]}\n", 74 | " \"\"\"\n", 75 | " from collections import defaultdict\n", 76 | " label_indices = defaultdict(list)\n", 77 | " for index, label in enumerate(labels):\n", 78 | " label_indices[label].append(index)\n", 79 | " return label_indices" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "4141a162", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "label_indices:\n", 93 | " defaultdict(, {'Y': [0, 2, 3], 'N': [1]})\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "label_indices = get_label_indices(Y_train)\n", 99 | "print('label_indices:\\n', label_indices)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "id": "c6b94f65", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def get_prior(label_indices):\n", 110 | " \"\"\"\n", 111 | " Compute prior based on training samples\n", 112 | " @param label_indices: grouped sample indices by class\n", 113 | " @return: dictionary, with class label as key, corresponding prior as the value\n", 114 | " \"\"\"\n", 115 | " prior = {label: len(indices) for label, indices in label_indices.items()}\n", 116 | " total_count = sum(prior.values())\n", 117 | " for label in prior:\n", 118 | " prior[label] /= total_count\n", 119 | " return prior" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 
6, 125 | "id": "428ecf6d", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Prior: {'Y': 0.75, 'N': 0.25}\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "prior = get_prior(label_indices)\n", 138 | "print('Prior:', prior)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "id": "ed3ba747", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def get_likelihood(features, label_indices, smoothing=0):\n", 149 | " \"\"\"\n", 150 | " Compute likelihood based on training samples\n", 151 | " @param features: matrix of features\n", 152 | " @param label_indices: grouped sample indices by class\n", 153 | " @param smoothing: integer, additive smoothing parameter\n", 154 | " @return: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value\n", 155 | " \"\"\"\n", 156 | " likelihood = {}\n", 157 | " for label, indices in label_indices.items():\n", 158 | " likelihood[label] = features[indices, :].sum(axis=0) + smoothing\n", 159 | " total_count = len(indices)\n", 160 | " likelihood[label] = likelihood[label] / (total_count + 2 * smoothing)\n", 161 | " return likelihood" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 8, 167 | "id": "e2b56969", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "Likelihood:\n", 175 | " {'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "smoothing = 1\n", 181 | "likelihood = get_likelihood(X_train, label_indices, smoothing)\n", 182 | "print('Likelihood:\\n', likelihood)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "id": "4559a3a2", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "def get_posterior(X, prior, likelihood):\n", 193 | " \"\"\"\n", 194 | " Compute posterior of testing samples, based on prior and likelihood\n", 195 | " @param X: testing samples\n", 196 | " @param prior: dictionary, with class label as key, corresponding prior as the value\n", 197 | " @param likelihood: dictionary, with class label as key, corresponding conditional probability vector as value\n", 198 | " @return: dictionary, with class label as key, corresponding posterior as value\n", 199 | " \"\"\"\n", 200 | " posteriors = []\n", 201 | " for x in X:\n", 202 | " # posterior is proportional to prior * likelihood\n", 203 | " posterior = prior.copy()\n", 204 | " for label, likelihood_label in likelihood.items():\n", 205 | " for index, bool_value in enumerate(x):\n", 206 | " posterior[label] *= likelihood_label[index] if bool_value else (1 - likelihood_label[index])\n", 207 | " # normalize so that all sums up to 1\n", 208 | " sum_posterior = sum(posterior.values())\n", 209 | " for label in posterior:\n", 210 | " if posterior[label] == float('inf'):\n", 211 | " posterior[label] = 1.0\n", 212 | " else:\n", 213 | " posterior[label] /= sum_posterior\n", 214 | " posteriors.append(posterior.copy())\n", 215 | " return posteriors\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "id": "6c559bfa", 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "Posterior:\n", 229 | " [{'Y': 0.9210360075805433, 'N': 0.07896399241945673}]\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | 
"posterior = get_posterior(X_test, prior, likelihood)\n", 235 | "print('Posterior:\\n', posterior)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "7e846661", 241 | "metadata": {}, 242 | "source": [ 243 | "## Implementing Naïve Bayes with scikit-learn " 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 11, 249 | "id": "8a54b509", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "from sklearn.naive_bayes import BernoulliNB\n" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 12, 259 | "id": "5e33349c", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "BernoulliNB()" 266 | ] 267 | }, 268 | "execution_count": 12, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "clf = BernoulliNB(alpha=1.0, fit_prior=True)\n", 275 | "clf.fit(X_train, Y_train)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 13, 281 | "id": "e8c0835e", 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "[scikit-learn] Predicted probabilities:\n", 289 | " [[0.07896399 0.92103601]]\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "pred_prob = clf.predict_proba(X_test)\n", 295 | "print('[scikit-learn] Predicted probabilities:\\n', pred_prob)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 14, 301 | "id": "22e775be", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "[scikit-learn] Prediction: ['Y']\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "pred = clf.predict(X_test)\n", 314 | "print('[scikit-learn] Prediction:', pred)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "id": "1fe54862", 320 | "metadata": {}, 321 | "source": [ 322 | "---" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "7e26d248", 328 | "metadata": {}, 329 | "source": [ 330 | "Readers may ignore the next cell." 
331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 15, 336 | "id": "6629bb2c", 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "[NbConvertApp] Converting notebook ch2_part1.ipynb to python\n", 344 | "[NbConvertApp] Writing 3985 bytes to ch2_part1.py\n" 345 | ] 346 | } 347 | ], 348 | "source": [ 349 | "!jupyter nbconvert --to python ch2_part1.ipynb --TemplateExporter.exclude_input_prompt=True" 350 | ] 351 | } 352 | ], 353 | "metadata": { 354 | "kernelspec": { 355 | "display_name": "Python 3 (ipykernel)", 356 | "language": "python", 357 | "name": "python3" 358 | }, 359 | "language_info": { 360 | "codemirror_mode": { 361 | "name": "ipython", 362 | "version": 3 363 | }, 364 | "file_extension": ".py", 365 | "mimetype": "text/x-python", 366 | "name": "python", 367 | "nbconvert_exporter": "python", 368 | "pygments_lexer": "ipython3", 369 | "version": "3.9.13" 370 | } 371 | }, 372 | "nbformat": 4, 373 | "nbformat_minor": 5 374 | } 375 | -------------------------------------------------------------------------------- /ch2/ch2_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 2 Building A Movie Recommendation Engine with Naive Bayes 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | # 12 | # 13 | 14 | # # Implementing Naïve Bayes 15 | 16 | # ## Implementing Naïve Bayes from scratch 17 | 18 | import numpy as np 19 | 20 | 21 | X_train = np.array([ 22 | [0, 1, 1], 23 | [0, 0, 1], 24 | [0, 0, 0], 25 | [1, 1, 0]]) 26 | 27 | Y_train = ['Y', 'N', 'Y', 'Y'] 28 | 29 | X_test = np.array([[1, 1, 0]]) 30 | 31 | 32 | def get_label_indices(labels): 33 | """ 34 | Group samples based on their labels and return indices 35 | @param labels: list of labels 36 | @return: dict, {class1: [indices], class2: [indices]} 37 | """ 38 | from collections import defaultdict 39 | label_indices = defaultdict(list) 40 | for index, label in enumerate(labels): 41 | label_indices[label].append(index) 42 | return label_indices 43 | 44 | 45 | label_indices = get_label_indices(Y_train) 46 | print('label_indices:\n', label_indices) 47 | 48 | 49 | def get_prior(label_indices): 50 | """ 51 | Compute prior based on training samples 52 | @param label_indices: grouped sample indices by class 53 | @return: dictionary, with class label as key, corresponding prior as the value 54 | """ 55 | prior = {label: len(indices) for label, indices in label_indices.items()} 56 | total_count = sum(prior.values()) 57 | for label in prior: 58 | prior[label] /= total_count 59 | return prior 60 | 61 | 62 | prior = get_prior(label_indices) 63 | print('Prior:', prior) 64 | 65 | 66 | def get_likelihood(features, label_indices, smoothing=0): 67 | """ 68 | Compute likelihood based on training samples 69 | @param features: matrix of features 70 | @param label_indices: grouped sample indices by class 71 | @param smoothing: integer, additive smoothing parameter 72 | @return: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value 73 | """ 74 | likelihood = {} 75 | for label, indices in label_indices.items(): 76 | likelihood[label] = features[indices, :].sum(axis=0) + smoothing 77 | total_count = len(indices) 78 | likelihood[label] = likelihood[label] / (total_count + 2 * smoothing) 79 | return 
likelihood 80 | 81 | 82 | smoothing = 1 83 | likelihood = get_likelihood(X_train, label_indices, smoothing) 84 | print('Likelihood:\n', likelihood) 85 | 86 | 87 | def get_posterior(X, prior, likelihood): 88 | """ 89 | Compute posterior of testing samples, based on prior and likelihood 90 | @param X: testing samples 91 | @param prior: dictionary, with class label as key, corresponding prior as the value 92 | @param likelihood: dictionary, with class label as key, corresponding conditional probability vector as value 93 | @return: dictionary, with class label as key, corresponding posterior as value 94 | """ 95 | posteriors = [] 96 | for x in X: 97 | # posterior is proportional to prior * likelihood 98 | posterior = prior.copy() 99 | for label, likelihood_label in likelihood.items(): 100 | for index, bool_value in enumerate(x): 101 | posterior[label] *= likelihood_label[index] if bool_value else (1 - likelihood_label[index]) 102 | # normalize so that all sums up to 1 103 | sum_posterior = sum(posterior.values()) 104 | for label in posterior: 105 | if posterior[label] == float('inf'): 106 | posterior[label] = 1.0 107 | else: 108 | posterior[label] /= sum_posterior 109 | posteriors.append(posterior.copy()) 110 | return posteriors 111 | 112 | 113 | posterior = get_posterior(X_test, prior, likelihood) 114 | print('Posterior:\n', posterior) 115 | 116 | 117 | # ## Implementing Naïve Bayes with scikit-learn 118 | 119 | from sklearn.naive_bayes import BernoulliNB 120 | 121 | 122 | clf = BernoulliNB(alpha=1.0, fit_prior=True) 123 | clf.fit(X_train, Y_train) 124 | 125 | 126 | pred_prob = clf.predict_proba(X_test) 127 | print('[scikit-learn] Predicted probabilities:\n', pred_prob) 128 | 129 | 130 | pred = clf.predict(X_test) 131 | print('[scikit-learn] Prediction:', pred) 132 | 133 | 134 | # --- 135 | 136 | # Readers may ignore the next cell. 
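# A minimal optional sketch: the same posterior computation carried out in log
# space. With many features, multiplying long chains of probabilities can
# underflow to 0.0; summing log-probabilities and normalizing with the
# log-sum-exp trick avoids that. The function name `get_posterior_log` is
# introduced here only for illustration; it reuses `prior`, `likelihood`, and
# `X_test` defined above.

def get_posterior_log(X, prior, likelihood):
    posteriors = []
    for x in X:
        # log-posterior is proportional to log(prior) + sum of log-likelihoods
        log_posterior = {label: np.log(p) for label, p in prior.items()}
        for label, likelihood_label in likelihood.items():
            for index, bool_value in enumerate(x):
                prob = likelihood_label[index] if bool_value else (1 - likelihood_label[index])
                log_posterior[label] += np.log(prob)
        # normalize with the log-sum-exp trick so the posteriors sum up to 1
        max_log = max(log_posterior.values())
        exp_shifted = {label: np.exp(lp - max_log) for label, lp in log_posterior.items()}
        total = sum(exp_shifted.values())
        posteriors.append({label: value / total for label, value in exp_shifted.items()})
    return posteriors

# This should reproduce the posterior computed earlier for X_test.
print('Posterior (log space):\n', get_posterior_log(X_test, prior, likelihood))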
137 | 138 | get_ipython().system('jupyter nbconvert --to python ch2_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 139 | 140 | -------------------------------------------------------------------------------- /ch2/ch2_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 2 Building A Movie Recommendation Engine with Naive Bayes 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Building a movie recommender with Naïve Bayes 13 | 14 | import numpy as np 15 | import pandas as pd 16 | 17 | 18 | data_path = 'ml-1m/ratings.dat' 19 | df = pd.read_csv(data_path, header=None, sep='::', engine='python') 20 | df.columns = ['user_id', 'movie_id', 'rating', 'timestamp'] 21 | print(df) 22 | 23 | 24 | n_users = df['user_id'].nunique() 25 | n_movies = df['movie_id'].nunique() 26 | print(f"Number of users: {n_users}") 27 | print(f"Number of movies: {n_movies}") 28 | 29 | 30 | def load_user_rating_data(df, n_users, n_movies): 31 | """ 32 | Load rating data from the raw dataframe and also return movieId index mapping 33 | @param df: raw dataframe read from ratings.csv 34 | @param n_users: number of users 35 | @param n_movies: number of movies that have ratings 36 | @return: rating data in the numpy array of [user, movie]; 37 | movie_id_mapping, {movie_id: column index in rating data} 38 | """ 39 | data = np.zeros([n_users, n_movies], dtype=np.intc) 40 | movie_id_mapping = {} 41 | for user_id, movie_id, rating in zip(df['user_id'], df['movie_id'], df['rating']): 42 | user_id = int(user_id) - 1 43 | if movie_id not in movie_id_mapping: 44 | movie_id_mapping[movie_id] = len(movie_id_mapping) 45 | data[user_id, movie_id_mapping[movie_id]] = rating 46 | return data, movie_id_mapping 47 | 48 | data, movie_id_mapping = load_user_rating_data(df, n_users, n_movies) 49 | 50 | 51 | values, counts = np.unique(data, return_counts=True) 52 | for value, count in zip(values, counts): 53 | print(f'Number of rating {value}: {count}') 54 | 55 | 56 | print(df['movie_id'].value_counts()) 57 | 58 | 59 | target_movie_id = 2858 60 | X_raw = np.delete(data, movie_id_mapping[target_movie_id], axis=1) 61 | Y_raw = data[:, movie_id_mapping[target_movie_id]] 62 | 63 | X = X_raw[Y_raw > 0] 64 | Y = Y_raw[Y_raw > 0] 65 | 66 | print('Shape of X:', X.shape) 67 | print('Shape of Y:', Y.shape) 68 | 69 | 70 | recommend = 3 71 | Y[Y <= recommend] = 0 72 | Y[Y > recommend] = 1 73 | 74 | n_pos = (Y == 1).sum() 75 | n_neg = (Y == 0).sum() 76 | print(f'{n_pos} positive samples and {n_neg} negative samples.') 77 | 78 | 79 | from sklearn.model_selection import train_test_split 80 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 81 | print(len(Y_train), len(Y_test)) 82 | 83 | 84 | from sklearn.naive_bayes import MultinomialNB 85 | clf = MultinomialNB(alpha=1.0, fit_prior=True) 86 | clf.fit(X_train, Y_train) 87 | 88 | 89 | prediction_prob = clf.predict_proba(X_test) 90 | print(prediction_prob[0:10]) 91 | 92 | prediction = clf.predict(X_test) 93 | print(prediction[:10]) 94 | 95 | accuracy = clf.score(X_test, Y_test) 96 | print(f'The accuracy is: {accuracy*100:.1f}%') 97 | 98 | 99 | # # Evaluating classification performance 100 | 101 | from sklearn.metrics import confusion_matrix 102 | print(confusion_matrix(Y_test, prediction, labels=[0, 1])) 103 | 104 | 105 | from 
sklearn.metrics import precision_score, recall_score, f1_score 106 | 107 | precision_score(Y_test, prediction, pos_label=1) 108 | 109 | 110 | recall_score(Y_test, prediction, pos_label=1) 111 | 112 | 113 | f1_score(Y_test, prediction, pos_label=1) 114 | 115 | 116 | f1_score(Y_test, prediction, pos_label=0) 117 | 118 | 119 | from sklearn.metrics import classification_report 120 | report = classification_report(Y_test, prediction) 121 | print(report) 122 | 123 | 124 | pos_prob = prediction_prob[:, 1] 125 | 126 | thresholds = np.arange(0.0, 1.1, 0.05) 127 | true_pos, false_pos = [0]*len(thresholds), [0]*len(thresholds) 128 | for pred, y in zip(pos_prob, Y_test): 129 | for i, threshold in enumerate(thresholds): 130 | if pred >= threshold: 131 | if y == 1: 132 | true_pos[i] += 1 133 | else: 134 | false_pos[i] += 1 135 | else: 136 | break 137 | 138 | n_pos_test = (Y_test == 1).sum() 139 | n_neg_test = (Y_test == 0).sum() 140 | true_pos_rate = [tp / n_pos_test for tp in true_pos] 141 | false_pos_rate = [fp / n_neg_test for fp in false_pos] 142 | 143 | 144 | import matplotlib.pyplot as plt 145 | plt.figure() 146 | lw = 2 147 | plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=lw) 148 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 149 | plt.xlim([0.0, 1.0]) 150 | plt.ylim([0.0, 1.05]) 151 | plt.xlabel('False Positive Rate') 152 | plt.ylabel('True Positive Rate') 153 | plt.title('Receiver Operating Characteristic') 154 | plt.legend(loc="lower right") 155 | plt.show() 156 | 157 | 158 | from sklearn.metrics import roc_auc_score 159 | print(roc_auc_score(Y_test, pos_prob)) 160 | 161 | 162 | # # Tuning models with cross-validation 163 | 164 | from sklearn.model_selection import StratifiedKFold 165 | k = 5 166 | k_fold = StratifiedKFold(n_splits=k, random_state=42) 167 | 168 | smoothing_factor_option = [1, 2, 3, 4, 5, 6] 169 | fit_prior_option = [True, False] 170 | auc_record = {} 171 | 172 | for train_indices, test_indices in k_fold.split(X, Y): 173 | X_train_k, X_test_k = X[train_indices], X[test_indices] 174 | Y_train_k, Y_test_k = Y[train_indices], Y[test_indices] 175 | for alpha in smoothing_factor_option: 176 | if alpha not in auc_record: 177 | auc_record[alpha] = {} 178 | for fit_prior in fit_prior_option: 179 | clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior) 180 | clf.fit(X_train_k, Y_train_k) 181 | prediction_prob = clf.predict_proba(X_test_k) 182 | pos_prob = prediction_prob[:, 1] 183 | auc = roc_auc_score(Y_test_k, pos_prob) 184 | auc_record[alpha][fit_prior] = auc + auc_record[alpha].get(fit_prior, 0.0) 185 | 186 | 187 | print('smoothing fit prior auc') 188 | for smoothing, smoothing_record in auc_record.items(): 189 | for fit_prior, auc in smoothing_record.items(): 190 | print(f' {smoothing} {fit_prior} {auc/k:.5f}') 191 | 192 | 193 | clf = MultinomialNB(alpha=2.0, fit_prior=False) 194 | clf.fit(X_train, Y_train) 195 | 196 | pos_prob = clf.predict_proba(X_test)[:, 1] 197 | print('AUC with the best model:', roc_auc_score(Y_test, pos_prob)) 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /ch3/ch3_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 3 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | 11 | # # The 
metrics for measuring a split 12 | 13 | # ## Gini Impurity 14 | 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | 18 | 19 | # Plot Gini Impurity in binary case 20 | pos_fraction = np.linspace(0.00, 1.00, 1000) 21 | gini = 1 - pos_fraction**2 - (1-pos_fraction)**2 22 | plt.plot(pos_fraction, gini) 23 | plt.xlabel('Positive fraction') 24 | plt.ylabel('Gini Impurity') 25 | plt.ylim(0, 1) 26 | plt.show() 27 | 28 | 29 | # Given labels of a data set, the Gini Impurity calculation function 30 | def gini_impurity(labels): 31 | # When the set is empty, it is also pure 32 | if len(labels) == 0: 33 | return 0 34 | # Count the occurrences of each label 35 | counts = np.unique(labels, return_counts=True)[1] 36 | fractions = counts / float(len(labels)) 37 | return 1 - np.sum(fractions ** 2) 38 | 39 | 40 | print(f'{gini_impurity([1, 1, 0, 1, 0]):.4f}') 41 | print(f'{gini_impurity([1, 1, 0, 1, 0, 0]):.4f}') 42 | print(f'{gini_impurity([1, 1, 1, 1]):.4f}') 43 | 44 | 45 | # ## Information Gain 46 | 47 | # Plot entropy in binary case 48 | pos_fraction = np.linspace(0.001, 0.999, 1000) 49 | ent = - (pos_fraction * np.log2(pos_fraction) + (1 - pos_fraction) * np.log2(1 - pos_fraction)) 50 | plt.plot(pos_fraction, ent) 51 | plt.xlabel('Positive fraction') 52 | plt.ylabel('Entropy') 53 | plt.ylim(0, 1) 54 | plt.show() 55 | 56 | 57 | # Given labels of a data set, the entropy calculation function 58 | def entropy(labels): 59 | if len(labels) == 0: 60 | return 0 61 | counts = np.unique(labels, return_counts=True)[1] 62 | fractions = counts / float(len(labels)) 63 | return - np.sum(fractions * np.log2(fractions)) 64 | 65 | print(f'{entropy([1, 1, 0, 1, 0]):.4f}') 66 | print(f'{entropy([1, 1, 0, 1, 0, 0]):.4f}') 67 | print(f'{entropy([1, 1, 1, 1]):.4f}') 68 | 69 | 70 | # def information_gain(y, mask, func=entropy): 71 | # s1 = np.sum(mask) 72 | # s2 = mask.size - s1 73 | # if (s1 == 0 | s2 == 0): return 0 74 | # return func(y) - s1 / float(s1 + s2) * func(y[mask]) - s2 / float(s1 + s2) * func(y[np.logical_not(mask)]) 75 | 76 | 77 | criterion_function = {'gini': gini_impurity, 'entropy': entropy} 78 | def weighted_impurity(groups, criterion='gini'): 79 | """ 80 | Calculate weighted impurity of children after a split 81 | @param groups: list of children, and a child consists a list of class labels 82 | @param criterion: metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain 83 | @return: float, weighted impurity 84 | """ 85 | total = sum(len(group) for group in groups) 86 | weighted_sum = 0.0 87 | for group in groups: 88 | weighted_sum += len(group) / float(total) * criterion_function[criterion](group) 89 | return weighted_sum 90 | 91 | 92 | children_1 = [[1, 0, 1], [0, 1]] 93 | children_2 = [[1, 1], [0, 0, 1]] 94 | print(f"Entropy of #1 split: {weighted_impurity(children_1, 'entropy'):.4f}") 95 | print(f"Entropy of #2 split: {weighted_impurity(children_2, 'entropy'):.4f}") 96 | 97 | 98 | # # Implementing a decision tree from scratch 99 | 100 | def split_node(X, y, index, value): 101 | """ 102 | Split data set X, y based on a feature and a value 103 | @param X: numpy.ndarray, dataset feature 104 | @param y: numpy.ndarray, dataset target 105 | @param index: int, index of the feature used for splitting 106 | @param value: value of the feature used for splitting 107 | @return: list, list: left and right child, a child is in the format of [X, y] 108 | """ 109 | x_index = X[:, index] 110 | # if this feature is numerical 111 | if X[0, index].dtype.kind in ['i', 
'f']: 112 | mask = x_index >= value 113 | # if this feature is categorical 114 | else: 115 | mask = x_index == value 116 | # split into left and right child 117 | left = [X[~mask, :], y[~mask]] 118 | right = [X[mask, :], y[mask]] 119 | return left, right 120 | 121 | 122 | def get_best_split(X, y, criterion): 123 | """ 124 | Obtain the best splitting point and resulting children for the data set X, y 125 | @param X: numpy.ndarray, dataset feature 126 | @param y: numpy.ndarray, dataset target 127 | @param criterion: gini or entropy 128 | @return: dict {index: index of the feature, value: feature value, children: left and right children} 129 | """ 130 | best_index, best_value, best_score, children = None, None, 1, None 131 | for index in range(len(X[0])): 132 | for value in np.sort(np.unique(X[:, index])): 133 | groups = split_node(X, y, index, value) 134 | impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion) 135 | if impurity < best_score: 136 | best_index, best_value, best_score, children = index, value, impurity, groups 137 | return {'index': best_index, 'value': best_value, 'children': children} 138 | 139 | 140 | def get_leaf(labels): 141 | # Obtain the leaf as the majority of the labels 142 | return np.bincount(labels).argmax() 143 | 144 | 145 | def split(node, max_depth, min_size, depth, criterion): 146 | """ 147 | Split children of a node to construct new nodes or assign them terminals 148 | @param node: dict, with children info 149 | @param max_depth: int, maximal depth of the tree 150 | @param min_size: int, minimal samples required to further split a child 151 | @param depth: int, current depth of the node 152 | @param criterion: gini or entropy 153 | """ 154 | left, right = node['children'] 155 | del (node['children']) 156 | if left[1].size == 0: 157 | node['right'] = get_leaf(right[1]) 158 | return 159 | if right[1].size == 0: 160 | node['left'] = get_leaf(left[1]) 161 | return 162 | # Check if the current depth exceeds the maximal depth 163 | if depth >= max_depth: 164 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 165 | return 166 | # Check if the left child has enough samples 167 | if left[1].size <= min_size: 168 | node['left'] = get_leaf(left[1]) 169 | else: 170 | # It has enough samples, we further split it 171 | result = get_best_split(left[0], left[1], criterion) 172 | result_left, result_right = result['children'] 173 | if result_left[1].size == 0: 174 | node['left'] = get_leaf(result_right[1]) 175 | elif result_right[1].size == 0: 176 | node['left'] = get_leaf(result_left[1]) 177 | else: 178 | node['left'] = result 179 | split(node['left'], max_depth, min_size, depth + 1, criterion) 180 | # Check if the right child has enough samples 181 | if right[1].size <= min_size: 182 | node['right'] = get_leaf(right[1]) 183 | else: 184 | # It has enough samples, we further split it 185 | result = get_best_split(right[0], right[1], criterion) 186 | result_left, result_right = result['children'] 187 | if result_left[1].size == 0: 188 | node['right'] = get_leaf(result_right[1]) 189 | elif result_right[1].size == 0: 190 | node['right'] = get_leaf(result_left[1]) 191 | else: 192 | node['right'] = result 193 | split(node['right'], max_depth, min_size, depth + 1, criterion) 194 | 195 | 196 | def train_tree(X_train, y_train, max_depth, min_size, criterion='gini'): 197 | """ 198 | Construction of a tree starts here 199 | @param X_train: list of training samples (feature) 200 | @param y_train: list of training samples (target) 201 | @param max_depth:
int, maximal depth of the tree 202 | @param min_size: int, minimal samples required to further split a child 203 | @param criterion: gini or entropy 204 | """ 205 | X = np.array(X_train) 206 | y = np.array(y_train) 207 | root = get_best_split(X, y, criterion) 208 | split(root, max_depth, min_size, 1, criterion) 209 | return root 210 | 211 | 212 | X_train = [['tech', 'professional'], 213 | ['fashion', 'student'], 214 | ['fashion', 'professional'], 215 | ['sports', 'student'], 216 | ['tech', 'student'], 217 | ['tech', 'retired'], 218 | ['sports', 'professional']] 219 | 220 | y_train = [1, 221 | 0, 222 | 0, 223 | 0, 224 | 1, 225 | 0, 226 | 1] 227 | 228 | tree = train_tree(X_train, y_train, 2, 2) 229 | 230 | 231 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 232 | 'categorical': {'yes': 'is', 'no': 'is not'}} 233 | def visualize_tree(node, depth=0): 234 | if isinstance(node, dict): 235 | if node['value'].dtype.kind in ['i', 'f']: 236 | condition = CONDITION['numerical'] 237 | else: 238 | condition = CONDITION['categorical'] 239 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 240 | if 'left' in node: 241 | visualize_tree(node['left'], depth + 1) 242 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 243 | if 'right' in node: 244 | visualize_tree(node['right'], depth + 1) 245 | else: 246 | print(f"{depth * ' '}[{node}]") 247 | 248 | 249 | visualize_tree(tree) 250 | 251 | 252 | X_train_n = [[6, 7], 253 | [2, 4], 254 | [7, 2], 255 | [3, 6], 256 | [4, 7], 257 | [5, 2], 258 | [1, 6], 259 | [2, 0], 260 | [6, 3], 261 | [4, 1]] 262 | 263 | y_train_n = [0, 264 | 0, 265 | 0, 266 | 0, 267 | 0, 268 | 1, 269 | 1, 270 | 1, 271 | 1, 272 | 1] 273 | 274 | tree = train_tree(X_train_n, y_train_n, 2, 2) 275 | visualize_tree(tree) 276 | 277 | 278 | # # Implementing a decision tree with scikit-learn 279 | 280 | from sklearn.tree import DecisionTreeClassifier 281 | tree_sk = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=2) 282 | tree_sk.fit(X_train_n, y_train_n) 283 | 284 | from sklearn.tree import export_graphviz 285 | export_graphviz(tree_sk, out_file='tree.dot', feature_names=['X1', 'X2'], impurity=False, filled=True, class_names=['0', '1']) 286 | 287 | 288 | # --- 289 | 290 | # Readers may ignore the next cell. 
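# --- An appended illustrative sketch (not part of the original chapter code) ---
# The helper below shows how a new sample could be classified by walking the nested node
# dictionaries produced by train_tree/split above. The name traverse_tree is hypothetical;
# it assumes the {'index', 'value', 'left'/'right'} node format and the same numerical vs.
# categorical test used in split_node.
def traverse_tree(node, x):
    if not isinstance(node, dict):
        return node  # a leaf stores the majority class label
    if node['value'].dtype.kind in ['i', 'f']:
        branch = 'right' if x[node['index']] >= node['value'] else 'left'
    else:
        branch = 'right' if x[node['index']] == node['value'] else 'left'
    if branch not in node:
        # degenerate split: only one side was kept, so follow the branch that exists
        branch = 'left' if branch == 'right' else 'right'
    return traverse_tree(node[branch], x)

print(traverse_tree(tree, np.array([3, 5])))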
291 | 292 | get_ipython().system('jupyter nbconvert --to python ch3_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 293 | 294 | -------------------------------------------------------------------------------- /ch3/ch3_part2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "05b0d160", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)\n", 10 | "\n", 11 | "Chapter 3 Predicting Online Ad Click-Through with Tree-Based Algorithms \n", 12 | "\n", 13 | "Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "96cab12c", 19 | "metadata": {}, 20 | "source": [ 21 | "# Predicting ad click-through with a decision tree" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "d12f1e67", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | " id click hour C1 banner_pos site_id site_domain \\\n", 35 | "0 1.000009e+18 0 14102100 1005 0 1fbe01fe f3845767 \n", 36 | "1 1.000017e+19 0 14102100 1005 0 1fbe01fe f3845767 \n", 37 | "2 1.000037e+19 0 14102100 1005 0 1fbe01fe f3845767 \n", 38 | "3 1.000064e+19 0 14102100 1005 0 1fbe01fe f3845767 \n", 39 | "4 1.000068e+19 0 14102100 1005 1 fe8cc448 9166c161 \n", 40 | "\n", 41 | " site_category app_id app_domain ... device_type device_conn_type C14 \\\n", 42 | "0 28905ebd ecad2386 7801e8d9 ... 1 2 15706 \n", 43 | "1 28905ebd ecad2386 7801e8d9 ... 1 0 15704 \n", 44 | "2 28905ebd ecad2386 7801e8d9 ... 1 0 15704 \n", 45 | "3 28905ebd ecad2386 7801e8d9 ... 1 0 15706 \n", 46 | "4 0569f928 ecad2386 7801e8d9 ... 
1 0 18993 \n", 47 | "\n", 48 | " C15 C16 C17 C18 C19 C20 C21 \n", 49 | "0 320 50 1722 0 35 -1 79 \n", 50 | "1 320 50 1722 0 35 100084 79 \n", 51 | "2 320 50 1722 0 35 100084 79 \n", 52 | "3 320 50 1722 0 35 100084 79 \n", 53 | "4 320 50 2161 0 35 -1 157 \n", 54 | "\n", 55 | "[5 rows x 24 columns]\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "n_rows = 300000\n", 62 | "df = pd.read_csv(\"train.csv\", nrows=n_rows)\n", 63 | "print(df.head(5))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "id": "2f3bed2c", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "(300000, 19)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values\n", 82 | "Y = df['click'].values\n", 83 | "\n", 84 | "print(X.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "id": "53b199ee", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | " (0, 2)\t1.0\n", 98 | " (0, 6)\t1.0\n", 99 | " (0, 188)\t1.0\n", 100 | " (0, 2608)\t1.0\n", 101 | " (0, 2679)\t1.0\n", 102 | " (0, 3771)\t1.0\n", 103 | " (0, 3885)\t1.0\n", 104 | " (0, 3929)\t1.0\n", 105 | " (0, 4879)\t1.0\n", 106 | " (0, 7315)\t1.0\n", 107 | " (0, 7319)\t1.0\n", 108 | " (0, 7475)\t1.0\n", 109 | " (0, 7824)\t1.0\n", 110 | " (0, 7828)\t1.0\n", 111 | " (0, 7869)\t1.0\n", 112 | " (0, 7977)\t1.0\n", 113 | " (0, 7982)\t1.0\n", 114 | " (0, 8021)\t1.0\n", 115 | " (0, 8189)\t1.0\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "n_train = int(n_rows * 0.9)\n", 121 | "X_train = X[:n_train]\n", 122 | "Y_train = Y[:n_train]\n", 123 | "X_test = X[n_train:]\n", 124 | "Y_test = Y[n_train:]\n", 125 | "\n", 126 | "from sklearn.preprocessing import OneHotEncoder\n", 127 | "enc = OneHotEncoder(handle_unknown='ignore')\n", 128 | "X_train_enc = enc.fit_transform(X_train)\n", 129 | "\n", 130 | "X_train_enc[0]\n", 131 | "print(X_train_enc[0])" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "id": "7bbea72b", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "X_test_enc = enc.transform(X_test)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "id": "76447984", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "{'max_depth': 10}\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "from sklearn.tree import DecisionTreeClassifier\n", 160 | "parameters = {'max_depth': [3, 10, None]}\n", 161 | "decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30)\n", 162 | "\n", 163 | "from sklearn.model_selection import GridSearchCV\n", 164 | "grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc')\n", 165 | "\n", 166 | "grid_search.fit(X_train_enc, Y_train)\n", 167 | "print(grid_search.best_params_)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "id": "d8b9655b", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "The ROC AUC on testing set is: 0.719\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "decision_tree_best = grid_search.best_estimator_\n", 186 | "pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1]\n", 187 | "\n", 188 | "from sklearn.metrics import 
roc_auc_score\n", 189 | "print(f'The ROC AUC on testing set is: {roc_auc_score(Y_test, pos_prob):.3f}')" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 7, 195 | "id": "f52d826e", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "The ROC AUC on testing set using random selection is: 0.499\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "import numpy as np\n", 208 | "pos_prob = np.zeros(len(Y_test))\n", 209 | "click_index = np.random.choice(len(Y_test), int(len(Y_test) * 51211.0/300000), replace=False)\n", 210 | "pos_prob[click_index] = 1\n", 211 | "\n", 212 | "print(f'The ROC AUC on testing set using random selection is: {roc_auc_score(Y_test, pos_prob):.3f}')" 213 | ] 214 | }, 215 | { 216 | "attachments": {}, 217 | "cell_type": "markdown", 218 | "id": "f325f60f", 219 | "metadata": {}, 220 | "source": [ 221 | "# Ensembling decision trees – random forest " 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 8, 227 | "id": "ade5a566", 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "{'max_depth': None}\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "from sklearn.ensemble import RandomForestClassifier\n", 240 | "\n", 241 | "random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1)\n", 242 | "grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc')\n", 243 | "grid_search.fit(X_train_enc, Y_train)\n", 244 | "print(grid_search.best_params_)\n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 9, 250 | "id": "370afc6f", 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "The ROC AUC on testing set using random forest is: 0.759\n" 258 | ] 259 | } 260 | ], 261 | "source": [ 262 | "random_forest_best = grid_search.best_estimator_\n", 263 | "pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1]\n", 264 | "print(f'The ROC AUC on testing set using random forest is: {roc_auc_score(Y_test, pos_prob):.3f}')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "a5674331", 270 | "metadata": {}, 271 | "source": [ 272 | "# Ensembling decision trees – gradient boosted trees" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 10, 278 | "id": "81ced70c", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "The ROC AUC on testing set using GBT is: 0.771\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "import xgboost as xgb\n", 291 | "model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=1000)\n", 292 | "\n", 293 | "model.fit(X_train_enc, Y_train)\n", 294 | "pos_prob = model.predict_proba(X_test_enc)[:, 1]\n", 295 | "\n", 296 | "print(f'The ROC AUC on testing set using GBT is: {roc_auc_score(Y_test, pos_prob):.3f}')\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "4528421b", 302 | "metadata": {}, 303 | "source": [ 304 | "---" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "8902f0ae", 310 | "metadata": {}, 311 | "source": [ 312 | "Readers may ignore the next cell." 
313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 11, 318 | "id": "d5eda2fe", 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stderr", 323 | "output_type": "stream", 324 | "text": [ 325 | "[NbConvertApp] Converting notebook ch3_part2.ipynb to python\n", 326 | "[NbConvertApp] Writing 2830 bytes to ch3_part2.py\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "!jupyter nbconvert --to python ch3_part2.ipynb --TemplateExporter.exclude_input_prompt=True" 332 | ] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python 3 (ipykernel)", 338 | "language": "python", 339 | "name": "python3" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.9.16" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 5 356 | } 357 | -------------------------------------------------------------------------------- /ch3/ch3_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 3 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | 11 | # # Predicting ad click-through with a decision tree 12 | 13 | import pandas as pd 14 | n_rows = 300000 15 | df = pd.read_csv("train.csv", nrows=n_rows) 16 | print(df.head(5)) 17 | 18 | 19 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 20 | Y = df['click'].values 21 | 22 | print(X.shape) 23 | 24 | 25 | n_train = int(n_rows * 0.9) 26 | X_train = X[:n_train] 27 | Y_train = Y[:n_train] 28 | X_test = X[n_train:] 29 | Y_test = Y[n_train:] 30 | 31 | from sklearn.preprocessing import OneHotEncoder 32 | enc = OneHotEncoder(handle_unknown='ignore') 33 | X_train_enc = enc.fit_transform(X_train) 34 | 35 | X_train_enc[0] 36 | print(X_train_enc[0]) 37 | 38 | 39 | X_test_enc = enc.transform(X_test) 40 | 41 | 42 | from sklearn.tree import DecisionTreeClassifier 43 | parameters = {'max_depth': [3, 10, None]} 44 | decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30) 45 | 46 | from sklearn.model_selection import GridSearchCV 47 | grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 48 | 49 | grid_search.fit(X_train_enc, Y_train) 50 | print(grid_search.best_params_) 51 | 52 | 53 | decision_tree_best = grid_search.best_estimator_ 54 | pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1] 55 | 56 | from sklearn.metrics import roc_auc_score 57 | print(f'The ROC AUC on testing set is: {roc_auc_score(Y_test, pos_prob):.3f}') 58 | 59 | 60 | import numpy as np 61 | pos_prob = np.zeros(len(Y_test)) 62 | click_index = np.random.choice(len(Y_test), int(len(Y_test) * 51211.0/300000), replace=False) 63 | pos_prob[click_index] = 1 64 | 65 | print(f'The ROC AUC on testing set using random selection is: {roc_auc_score(Y_test, pos_prob):.3f}') 66 | 67 | 68 | # # Ensembling decision trees – random forest 69 | 70 | from sklearn.ensemble import RandomForestClassifier 71 | 72 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 73 | grid_search = 
GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 74 | grid_search.fit(X_train_enc, Y_train) 75 | print(grid_search.best_params_) 76 | 77 | 78 | random_forest_best = grid_search.best_estimator_ 79 | pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1] 80 | print(f'The ROC AUC on testing set using random forest is: {roc_auc_score(Y_test, pos_prob):.3f}') 81 | 82 | 83 | # # Ensembling decision trees – gradient boosted trees 84 | 85 | import xgboost as xgb 86 | model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=1000) 87 | 88 | model.fit(X_train_enc, Y_train) 89 | pos_prob = model.predict_proba(X_test_enc)[:, 1] 90 | 91 | print(f'The ROC AUC on testing set using GBT is: {roc_auc_score(Y_test, pos_prob):.3f}') 92 | 93 | 94 | # --- 95 | 96 | # Readers may ignore the next cell. 97 | 98 | get_ipython().system('jupyter nbconvert --to python ch3_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 99 | 100 | -------------------------------------------------------------------------------- /ch4/ch4_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 4 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Converting categorical features to numerical – one-hot encoding and ordinal encoding 13 | 14 | from sklearn.feature_extraction import DictVectorizer 15 | 16 | 17 | X_dict = [{'interest': 'tech', 'occupation': 'professional'}, 18 | {'interest': 'fashion', 'occupation': 'student'}, 19 | {'interest': 'fashion', 'occupation': 'professional'}, 20 | {'interest': 'sports', 'occupation': 'student'}, 21 | {'interest': 'tech', 'occupation': 'student'}, 22 | {'interest': 'tech', 'occupation': 'retired'}, 23 | {'interest': 'sports', 'occupation': 'professional'}] 24 | 25 | dict_one_hot_encoder = DictVectorizer(sparse=False) 26 | X_encoded = dict_one_hot_encoder.fit_transform(X_dict) 27 | print(X_encoded) 28 | 29 | 30 | print(dict_one_hot_encoder.vocabulary_) 31 | 32 | 33 | new_dict = [{'interest': 'sports', 'occupation': 'retired'}] 34 | new_encoded = dict_one_hot_encoder.transform(new_dict) 35 | print(new_encoded) 36 | 37 | 38 | print(dict_one_hot_encoder.inverse_transform(new_encoded)) 39 | 40 | 41 | # new category not encountered before 42 | new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'}, 43 | {'interest': 'tech', 'occupation': 'unseen_occupation'}] 44 | new_encoded = dict_one_hot_encoder.transform(new_dict) 45 | print(new_encoded) 46 | 47 | 48 | import pandas as pd 49 | df = pd.DataFrame({'score': ['low', 50 | 'high', 51 | 'medium', 52 | 'medium', 53 | 'low']}) 54 | print(df) 55 | 56 | mapping = {'low':1, 'medium':2, 'high':3} 57 | df['score'] = df['score'].replace(mapping) 58 | 59 | print(df) 60 | 61 | 62 | # # Classifying data with logistic regression 63 | 64 | # ## Getting started with the logistic function 65 | 66 | import numpy as np 67 | import matplotlib.pyplot as plt 68 | 69 | 70 | def sigmoid(input): 71 | return 1.0 / (1 + np.exp(-input)) 72 | 73 | 74 | z = np.linspace(-8, 8, 1000) 75 | y = sigmoid(z) 76 | plt.plot(z, y) 77 | plt.axhline(y=0, ls='dotted', color='k') 78 | plt.axhline(y=0.5, ls='dotted', color='k') 79 | plt.axhline(y=1, ls='dotted', color='k') 80 | plt.yticks([0.0, 0.25, 0.5, 0.75, 1.0]) 81 | plt.xlabel('z') 82 
| plt.ylabel('y(z)') 83 | plt.show() 84 | 85 | 86 | # ## Jumping from the logistic function to logistic regression 87 | 88 | # plot sample cost vs y_hat (prediction), for y (truth) = 1 89 | y_hat = np.linspace(0.001, 0.999, 1000) 90 | cost = -np.log(y_hat) 91 | plt.plot(y_hat, cost) 92 | plt.xlabel('Prediction') 93 | plt.ylabel('Cost') 94 | plt.xlim(0, 1) 95 | plt.ylim(0, 7) 96 | plt.show() 97 | 98 | 99 | # plot sample cost vs y_hat (prediction), for y (truth) = 0 100 | y_hat = np.linspace(0.001, 0.999, 1000) 101 | cost = -np.log(1 - y_hat) 102 | plt.plot(y_hat, cost) 103 | plt.xlabel('Prediction') 104 | plt.ylabel('Cost') 105 | plt.xlim(0, 1) 106 | plt.ylim(0, 7) 107 | plt.show() 108 | 109 | 110 | # # Training a logistic regression model 111 | 112 | # ## Training a logistic regression model using gradient descent 113 | 114 | # Gradient descent based logistic regression from scratch 115 | def compute_prediction(X, weights): 116 | """ 117 | Compute the prediction y_hat based on current weights 118 | """ 119 | z = np.dot(X, weights) 120 | return sigmoid(z) 121 | 122 | 123 | def update_weights_gd(X_train, y_train, weights, learning_rate): 124 | """ 125 | Update weights by one step 126 | """ 127 | predictions = compute_prediction(X_train, weights) 128 | weights_delta = np.dot(X_train.T, y_train - predictions) 129 | m = y_train.shape[0] 130 | weights += learning_rate / float(m) * weights_delta 131 | return weights 132 | 133 | 134 | def compute_cost(X, y, weights): 135 | """ 136 | Compute the cost J(w) 137 | """ 138 | predictions = compute_prediction(X, weights) 139 | cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions)) 140 | return cost 141 | 142 | 143 | def train_logistic_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 144 | """ Train a logistic regression model 145 | Args: 146 | X_train, y_train (numpy.ndarray, training data set) 147 | max_iter (int, number of iterations) 148 | learning_rate (float) 149 | fit_intercept (bool, with an intercept w0 or not) 150 | Returns: 151 | numpy.ndarray, learned weights 152 | """ 153 | if fit_intercept: 154 | intercept = np.ones((X_train.shape[0], 1)) 155 | X_train = np.hstack((intercept, X_train)) 156 | weights = np.zeros(X_train.shape[1]) 157 | for iteration in range(max_iter): 158 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 159 | # Check the cost for every 100 (for example) iterations 160 | if iteration % 100 == 0: 161 | print(compute_cost(X_train, y_train, weights)) 162 | return weights 163 | 164 | 165 | def predict(X, weights): 166 | if X.shape[1] == weights.shape[0] - 1: 167 | intercept = np.ones((X.shape[0], 1)) 168 | X = np.hstack((intercept, X)) 169 | return compute_prediction(X, weights) 170 | 171 | 172 | # A example 173 | X_train = np.array([[6, 7], 174 | [2, 4], 175 | [3, 6], 176 | [4, 7], 177 | [1, 6], 178 | [5, 2], 179 | [2, 0], 180 | [6, 3], 181 | [4, 1], 182 | [7, 2]]) 183 | 184 | y_train = np.array([0, 185 | 0, 186 | 0, 187 | 0, 188 | 0, 189 | 1, 190 | 1, 191 | 1, 192 | 1, 193 | 1]) 194 | 195 | 196 | weights = train_logistic_regression(X_train, y_train, max_iter=1000, learning_rate=0.1, fit_intercept=True) 197 | 198 | 199 | 200 | X_test = np.array([[6, 1], 201 | [1, 3], 202 | [3, 1], 203 | [4, 5]]) 204 | 205 | predictions = predict(X_test, weights) 206 | print(predictions) 207 | 208 | 209 | plt.scatter(X_train[:5,0], X_train[:5,1], c='b', marker='x') 210 | plt.scatter(X_train[5:,0], X_train[5:,1], c='k', marker='.') 211 | for i, prediction in 
enumerate(predictions): 212 | marker = 'X' if prediction < 0.5 else 'o' 213 | c = 'b' if prediction < 0.5 else 'k' 214 | plt.scatter(X_test[i,0], X_test[i,1], c=c, marker=marker) 215 | plt.show() 216 | 217 | 218 | # ## Predicting ad click-through with logistic regression using gradient descent 219 | 220 | import pandas as pd 221 | n_rows = 300000 222 | df = pd.read_csv("train.csv", nrows=n_rows) 223 | 224 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 225 | Y = df['click'].values 226 | 227 | n_train = 10000 228 | X_train = X[:n_train] 229 | Y_train = Y[:n_train] 230 | X_test = X[n_train:] 231 | Y_test = Y[n_train:] 232 | 233 | from sklearn.preprocessing import OneHotEncoder 234 | enc = OneHotEncoder(handle_unknown='ignore') 235 | X_train_enc = enc.fit_transform(X_train) 236 | 237 | X_test_enc = enc.transform(X_test) 238 | 239 | 240 | import timeit 241 | start_time = timeit.default_timer() 242 | weights = train_logistic_regression(X_train_enc.toarray(), Y_train, max_iter=10000, learning_rate=0.01, 243 | fit_intercept=True) 244 | print(f"--- {(timeit.default_timer() - start_time):.3f} seconds ---") 245 | 246 | 247 | pred = predict(X_test_enc.toarray(), weights) 248 | from sklearn.metrics import roc_auc_score 249 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 250 | 251 | 252 | # ## Training a logistic regression model using stochastic gradient descent 253 | 254 | def update_weights_sgd(X_train, y_train, weights, learning_rate): 255 | """ One weight update iteration: moving weights by one step based on each individual sample 256 | Args: 257 | X_train, y_train (numpy.ndarray, training data set) 258 | weights (numpy.ndarray) 259 | learning_rate (float) 260 | Returns: 261 | numpy.ndarray, updated weights 262 | """ 263 | for X_each, y_each in zip(X_train, y_train): 264 | prediction = compute_prediction(X_each, weights) 265 | weights_delta = X_each.T * (y_each - prediction) 266 | weights += learning_rate * weights_delta 267 | return weights 268 | 269 | 270 | def train_logistic_regression_sgd(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 271 | """ Train a logistic regression model via SGD 272 | Args: 273 | X_train, y_train (numpy.ndarray, training data set) 274 | max_iter (int, number of iterations) 275 | learning_rate (float) 276 | fit_intercept (bool, with an intercept w0 or not) 277 | Returns: 278 | numpy.ndarray, learned weights 279 | """ 280 | if fit_intercept: 281 | intercept = np.ones((X_train.shape[0], 1)) 282 | X_train = np.hstack((intercept, X_train)) 283 | weights = np.zeros(X_train.shape[1]) 284 | for iteration in range(max_iter): 285 | weights = update_weights_sgd(X_train, y_train, weights, learning_rate) 286 | # Check the cost for every 2 (for example) iterations 287 | if iteration % 2 == 0: 288 | print(compute_cost(X_train, y_train, weights)) 289 | return weights 290 | 291 | 292 | # Train the SGD model based on 100000 samples 293 | n_train = 100000 294 | X_train = X[:n_train] 295 | Y_train = Y[:n_train] 296 | X_test = X[n_train:] 297 | Y_test = Y[n_train:] 298 | 299 | from sklearn.preprocessing import OneHotEncoder 300 | enc = OneHotEncoder(handle_unknown='ignore') 301 | X_train_enc = enc.fit_transform(X_train) 302 | 303 | X_test_enc = enc.transform(X_test) 304 | 305 | start_time = timeit.default_timer() 306 | weights = train_logistic_regression_sgd(X_train_enc.toarray(), Y_train, max_iter=10, learning_rate=0.01, 307 | fit_intercept=True) 308 | print(f"--- {(timeit.default_timer() - 
start_time):.3f} seconds ---") 309 | pred = predict(X_test_enc.toarray(), weights) 310 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 311 | 312 | 313 | # # Use scikit-learn package 314 | from sklearn.linear_model import SGDClassifier 315 | sgd_lr = SGDClassifier(loss='log_loss', penalty=None, fit_intercept=True, max_iter=20, learning_rate='constant', eta0=0.01) 316 | 317 | 318 | sgd_lr.fit(X_train_enc.toarray(), Y_train) 319 | 320 | pred = sgd_lr.predict_proba(X_test_enc.toarray())[:, 1] 321 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 322 | 323 | 324 | # ## Feature selection using L1 regularization 325 | 326 | sgd_lr_l1 = SGDClassifier(loss='log_loss', 327 | penalty='l1', 328 | alpha=0.0001, 329 | fit_intercept=True, 330 | max_iter=10, 331 | learning_rate='constant', 332 | eta0=0.01, 333 | random_state=42) 334 | sgd_lr_l1.fit(X_train_enc.toarray(), Y_train) 335 | 336 | 337 | coef_abs = np.abs(sgd_lr_l1.coef_) 338 | print(coef_abs) 339 | 340 | 341 | # bottom 10 weights and the corresponding 10 least important features 342 | print(np.sort(coef_abs)[0][:10]) 343 | 344 | 345 | feature_names = enc.get_feature_names_out() 346 | bottom_10 = np.argsort(coef_abs)[0][:10] 347 | print('10 least important features are:\n', feature_names[bottom_10]) 348 | 349 | 350 | # top 10 weights and the corresponding 10 most important features 351 | print(np.sort(coef_abs)[0][-10:]) 352 | top_10 = np.argsort(coef_abs)[0][-10:] 353 | print('10 most important features are:\n', feature_names[top_10]) 354 | 355 | 356 | # --- 357 | 358 | # Readers may ignore the next cell. 359 | 360 | get_ipython().system('jupyter nbconvert --to python ch4_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 361 | 362 | -------------------------------------------------------------------------------- /ch4/ch4_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 4 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Training on large datasets with online learning 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import timeit 17 | from sklearn.linear_model import SGDClassifier 18 | from sklearn.metrics import roc_auc_score 19 | from sklearn.preprocessing import OneHotEncoder 20 | 21 | 22 | n_rows = 100000 * 11 23 | df = pd.read_csv("train.csv", nrows=n_rows) 24 | 25 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 26 | Y = df['click'].values 27 | 28 | n_train = 100000 * 10 29 | X_train = X[:n_train] 30 | Y_train = Y[:n_train] 31 | X_test = X[n_train:] 32 | Y_test = Y[n_train:] 33 | 34 | 35 | enc = OneHotEncoder(handle_unknown='ignore') 36 | enc.fit(X_train) 37 | 38 | 39 | # The number of iterations is set to 1 if using partial_fit. 
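# Note (added): SGDClassifier.partial_fit also accepts the scipy sparse matrix returned by
# OneHotEncoder, so each batch in the loop below could be fed without .toarray(), e.g.
# sgd_lr_online.partial_fit(x_train_enc, y_train, classes=[0, 1]),
# which keeps the memory footprint of the wide one-hot feature matrix much lower.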
40 | sgd_lr_online = SGDClassifier(loss='log_loss', 41 | penalty=None, 42 | fit_intercept=True, 43 | max_iter=1, 44 | learning_rate='constant', 45 | eta0=0.01, 46 | random_state=42) 47 | 48 | 49 | start_time = timeit.default_timer() 50 | 51 | # Use the first 1,000,000 samples for training, and the next 100,000 for testing 52 | for i in range(10): 53 | x_train = X_train[i*100000:(i+1)*100000] 54 | y_train = Y_train[i*100000:(i+1)*100000] 55 | x_train_enc = enc.transform(x_train) 56 | sgd_lr_online.partial_fit(x_train_enc.toarray(), y_train, classes=[0, 1]) 57 | 58 | print(f"--- {(timeit.default_timer() - start_time):.3f} seconds ---") 59 | 60 | 61 | x_test_enc = enc.transform(X_test) 62 | 63 | pred = sgd_lr_online.predict_proba(x_test_enc.toarray())[:, 1] 64 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 65 | 66 | 67 | # # Handling multiclass classification 68 | 69 | from sklearn import datasets 70 | digits = datasets.load_digits() 71 | n_samples = len(digits.images) 72 | 73 | 74 | X = digits.images.reshape((n_samples, -1)) 75 | Y = digits.target 76 | 77 | 78 | from sklearn.model_selection import train_test_split 79 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 80 | 81 | 82 | from sklearn.model_selection import GridSearchCV 83 | parameters = {'penalty': ['l2', None], 84 | 'alpha': [1e-07, 1e-06, 1e-05, 1e-04], 85 | 'eta0': [0.01, 0.1, 1, 10]} 86 | 87 | sgd_lr = SGDClassifier(loss='log_loss', 88 | learning_rate='constant', 89 | fit_intercept=True, 90 | max_iter=50, 91 | random_state=42) 92 | 93 | grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=5) 94 | 95 | grid_search.fit(X_train, Y_train) 96 | print(grid_search.best_params_) 97 | 98 | 99 | sgd_lr_best = grid_search.best_estimator_ 100 | accuracy = sgd_lr_best.score(X_test, Y_test) 101 | print(f'The accuracy on testing set is: {accuracy*100:.1f}%') 102 | 103 | 104 | # # Implementing logistic regression using TensorFlow 105 | 106 | import tensorflow as tf 107 | 108 | 109 | n_rows = 100000 110 | df = pd.read_csv("train.csv", nrows=n_rows) 111 | 112 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 113 | Y = df['click'].values 114 | 115 | n_train = int(n_rows * 0.9) 116 | X_train = X[:n_train] 117 | Y_train = Y[:n_train] 118 | X_test = X[n_train:] 119 | Y_test = Y[n_train:] 120 | 121 | 122 | enc = OneHotEncoder(handle_unknown='ignore') 123 | X_train_enc = enc.fit_transform(X_train).toarray().astype('float32') 124 | X_test_enc = enc.transform(X_test).toarray().astype('float32') 125 | Y_train = Y_train.astype('float32') 126 | Y_test = Y_test.astype('float32') 127 | 128 | 129 | batch_size = 1000 130 | train_data = tf.data.Dataset.from_tensor_slices((X_train_enc, Y_train)) 131 | train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1) 132 | 133 | 134 | n_features = X_train_enc.shape[1] 135 | W = tf.Variable(tf.zeros([n_features, 1])) 136 | b = tf.Variable(tf.zeros([1])) 137 | 138 | 139 | learning_rate = 0.001 140 | optimizer = tf.optimizers.Adam(learning_rate) 141 | 142 | 143 | def run_optimization(x, y): 144 | with tf.GradientTape() as tape: 145 | logits = tf.add(tf.matmul(x, W), b)[:, 0] 146 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 147 | # Update the parameters with respect to the gradient calculations 148 | gradients = tape.gradient(loss, [W, b]) 149 | optimizer.apply_gradients(zip(gradients, [W, b])) 150 | 151 | 152 | 153
training_steps = 5000 154 | for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1): 155 | run_optimization(batch_x, batch_y) 156 | if step % 500 == 0: 157 | logits = tf.add(tf.matmul(batch_x, W), b)[:, 0] 158 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=batch_y, logits=logits)) 159 | print("step: %i, loss: %f" % (step, loss)) 160 | 161 | 162 | logits = tf.add(tf.matmul(X_test_enc, W), b)[:, 0] 163 | pred = tf.nn.sigmoid(logits) 164 | auc_metric = tf.keras.metrics.AUC() 165 | auc_metric.update_state(Y_test, pred) 166 | 167 | print(f'AUC on testing set: {auc_metric.result().numpy():.3f}') 168 | 169 | 170 | # # Feature selection using random forest 171 | 172 | X_train = X 173 | Y_train = Y 174 | 175 | enc = OneHotEncoder(handle_unknown='ignore') 176 | X_train_enc = enc.fit_transform(X_train) 177 | 178 | 179 | # Feature selection with random forest 180 | 181 | from sklearn.ensemble import RandomForestClassifier 182 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1, random_state=42) 183 | random_forest.fit(X_train_enc.toarray(), Y_train) 184 | 185 | 186 | feature_imp = random_forest.feature_importances_ 187 | print(feature_imp) 188 | 189 | 190 | # bottom 10 weights and the corresponding 10 least important features 191 | feature_names = enc.get_feature_names_out() 192 | print(np.sort(feature_imp)[:10]) 193 | bottom_10 = np.argsort(feature_imp)[:10] 194 | print('10 least important features are:\n', feature_names[bottom_10]) 195 | 196 | 197 | # top 10 weights and the corresponding 10 most important features 198 | print(np.sort(feature_imp)[-10:]) 199 | top_10 = np.argsort(feature_imp)[-10:] 200 | print('10 most important features are:\n', feature_names[top_10]) 201 | 202 | 203 | # --- 204 | 205 | # Readers may ignore the next cell. 
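# --- An appended illustrative sketch (not part of the original chapter code) ---
# Beyond inspecting the most and least important features above, the fitted random forest
# could be reused to shrink the one-hot feature matrix with SelectFromModel; the cap of
# 500 features is an arbitrary example value.
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(random_forest, prefit=True, max_features=500, threshold=-np.inf)
X_train_top = selector.transform(X_train_enc)
print(X_train_top.shape)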
206 | 207 | get_ipython().system('jupyter nbconvert --to python ch4_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 208 | 209 | -------------------------------------------------------------------------------- /ch5/20051201_20051210.csv: -------------------------------------------------------------------------------- 1 | Date,Open,High,Low,Close,Adj Close,Volume 2 | 2005-12-01,2244.850098,2269.389893,2244.709961,2267.169922,2267.169922,2010420000 3 | 2005-12-02,2266.169922,2273.610107,2261.129883,2273.370117,2273.370117,1758510000 4 | 2005-12-05,2269.070068,2269.479980,2250.840088,2257.639893,2257.639893,1659920000 5 | 2005-12-06,2267.760010,2278.159912,2259.370117,2260.760010,2260.760010,1788200000 6 | 2005-12-07,2263.290039,2264.909912,2244.620117,2252.010010,2252.010010,1733530000 7 | 2005-12-08,2254.800049,2261.610107,2233.739990,2246.459961,2246.459961,1908360000 8 | 2005-12-09,2247.280029,2258.669922,2241.030029,2256.729980,2256.729980,1658570000 -------------------------------------------------------------------------------- /ch5/ch5_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 5 Predicting Stock Price with Regression Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Mining stock price data 13 | 14 | # ## Acquiring data and generating features 15 | 16 | import pandas as pd 17 | 18 | 19 | mydata = pd.read_csv('20051201_20051210.csv', index_col='Date') 20 | mydata 21 | 22 | 23 | def add_original_feature(df, df_new): 24 | df_new['open'] = df['Open'] 25 | df_new['open_1'] = df['Open'].shift(1) 26 | df_new['close_1'] = df['Close'].shift(1) 27 | df_new['high_1'] = df['High'].shift(1) 28 | df_new['low_1'] = df['Low'].shift(1) 29 | df_new['volume_1'] = df['Volume'].shift(1) 30 | 31 | 32 | 33 | def add_avg_price(df, df_new): 34 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 35 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 36 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 37 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 38 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 39 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 40 | 41 | 42 | 43 | def add_avg_volume(df, df_new): 44 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 45 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 46 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 47 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 48 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 49 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 50 | 51 | 52 | 53 | def add_std_price(df, df_new): 54 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 55 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 56 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 57 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 58 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 59 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 60 | 61 
| 62 | 63 | def add_std_volume(df, df_new): 64 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 65 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 66 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 67 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 68 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 69 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 70 | 71 | 72 | 73 | def add_return_feature(df, df_new): 74 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 75 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 76 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 77 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 78 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 79 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 80 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 81 | 82 | 83 | 84 | def generate_features(df): 85 | """ 86 | Generate features for a stock/index based on historical price and performance 87 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adj Close" 88 | @return: dataframe, data set with new features 89 | """ 90 | df_new = pd.DataFrame() 91 | # 6 original features 92 | add_original_feature(df, df_new) 93 | # 31 generated features 94 | # average price 95 | add_avg_price(df, df_new) 96 | # average volume 97 | add_avg_volume(df, df_new) 98 | # standard deviation of prices 99 | add_std_price(df, df_new) 100 | # standard deviation of volumes 101 | add_std_volume(df, df_new) 102 | # # return 103 | add_return_feature(df, df_new) 104 | # the target 105 | df_new['close'] = df['Close'] 106 | df_new = df_new.dropna(axis=0) 107 | return df_new 108 | 109 | 110 | data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date') 111 | data = generate_features(data_raw) 112 | print(data.round(decimals=3).head(5)) 113 | 114 | 115 | # # Estimating with linear regression 116 | 117 | # ## Implementing linear regression from scratch 118 | 119 | import numpy as np 120 | 121 | 122 | def compute_prediction(X, weights): 123 | """ 124 | Compute the prediction y_hat based on current weights 125 | """ 126 | return np.dot(X, weights) 127 | 128 | 129 | def update_weights_gd(X_train, y_train, weights, learning_rate): 130 | """ 131 | Update weights by one step and return updated wights 132 | """ 133 | predictions = compute_prediction(X_train, weights) 134 | weights_delta = np.dot(X_train.T, y_train - predictions) 135 | m = y_train.shape[0] 136 | weights += learning_rate / float(m) * weights_delta 137 | return weights 138 | 139 | 140 | def compute_loss(X, y, weights): 141 | """ 142 | Compute the loss J(w) 143 | """ 144 | predictions = compute_prediction(X, weights) 145 | return np.mean((predictions - y) ** 2 / 2.0) 146 | 147 | 148 | def train_linear_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False, display_loss=500): 149 | """ 150 | Train a linear regression model with gradient descent, and return trained model 151 | """ 152 | if fit_intercept: 153 | intercept = np.ones((X_train.shape[0], 1)) 154 | X_train = np.hstack((intercept, X_train)) 155 | weights = np.zeros(X_train.shape[1]) 156 | for iteration in range(max_iter): 157 
| weights = update_weights_gd(X_train, y_train, weights, learning_rate) 158 | # Check the cost for every 500 (by default) iterations 159 | if iteration % display_loss == 0: 160 | print(compute_loss(X_train, y_train, weights)) 161 | return weights 162 | 163 | 164 | def predict(X, weights): 165 | if X.shape[1] == weights.shape[0] - 1: 166 | intercept = np.ones((X.shape[0], 1)) 167 | X = np.hstack((intercept, X)) 168 | return compute_prediction(X, weights) 169 | 170 | 171 | # A small example 172 | X_train = np.array([[6], [2], [3], [4], [1], [5], [2], [6], [4], [7]]) 173 | y_train = np.array([5.5, 1.6, 2.2, 3.7, 0.8, 5.2, 1.5, 5.3, 4.4, 6.8]) 174 | 175 | 176 | weights = train_linear_regression(X_train, y_train, max_iter=100, learning_rate=0.01, fit_intercept=True) 177 | 178 | 179 | X_test = np.array([[1.3], [3.5], [5.2], [2.8]]) 180 | 181 | predictions = predict(X_test, weights) 182 | 183 | import matplotlib.pyplot as plt 184 | plt.scatter(X_train[:, 0], y_train, marker='o', c='b') 185 | plt.scatter(X_test[:, 0], predictions, marker='*', c='k') 186 | plt.xlabel('x') 187 | plt.ylabel('y') 188 | plt.show() 189 | 190 | 191 | # The diabetes example 192 | from sklearn import datasets 193 | diabetes = datasets.load_diabetes() 194 | print(diabetes.data.shape) 195 | 196 | num_test = 30 197 | X_train = diabetes.data[:-num_test, :] 198 | y_train = diabetes.target[:-num_test] 199 | 200 | 201 | weights = train_linear_regression(X_train, y_train, max_iter=5000, learning_rate=1, fit_intercept=True) 202 | 203 | X_test = diabetes.data[-num_test:, :] 204 | y_test = diabetes.target[-num_test:] 205 | 206 | predictions = predict(X_test, weights) 207 | 208 | print(predictions) 209 | print(y_test) 210 | 211 | 212 | # ## Implementing linear regression with scikit-learn 213 | 214 | # Directly use SGDRegressor from scikit-learn 215 | from sklearn.linear_model import SGDRegressor 216 | regressor = SGDRegressor(loss='squared_error', 217 | penalty='l2', 218 | alpha=0.0001, 219 | learning_rate='constant', 220 | eta0=0.2, 221 | max_iter=100, 222 | random_state=42) 223 | 224 | 225 | regressor.fit(X_train, y_train) 226 | predictions = regressor.predict(X_test) 227 | print(predictions) 228 | 229 | 230 | # ## Implementing linear regression with TensorFlow 231 | 232 | import tensorflow as tf 233 | 234 | 235 | layer0 = tf.keras.layers.Dense(units=1, input_shape=[X_train.shape[1]]) 236 | model = tf.keras.Sequential(layer0) 237 | 238 | 239 | model.compile(loss='mean_squared_error', 240 | optimizer=tf.keras.optimizers.Adam(1)) 241 | 242 | 243 | model.fit(X_train, y_train, epochs=100, verbose=True) 244 | 245 | 246 | predictions = model.predict(X_test)[:, 0] 247 | print(predictions) 248 | 249 | 250 | # --- 251 | 252 | # Readers may ignore the next cell. 
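# --- An appended illustrative sketch (not part of the original chapter code) ---
# Scoring the diabetes predictions above with standard regression metrics, instead of
# eyeballing the printed arrays.
from sklearn.metrics import mean_squared_error, r2_score
print(f'MSE: {mean_squared_error(y_test, predictions):.2f}, R^2: {r2_score(y_test, predictions):.2f}')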
253 | 254 | get_ipython().system('jupyter nbconvert --to python ch5_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 255 | 256 | -------------------------------------------------------------------------------- /ch6/ch6_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 6 Predicting Stock Prices with Artificial Neural Networks 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Demystifying neural networks 13 | 14 | # ## Starting with a single-layer neural network 15 | 16 | # ### Layers in neural networks 17 | 18 | import numpy as np 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | def sigmoid(z): 23 | return 1.0 / (1 + np.exp(-z)) 24 | 25 | z = np.linspace(-8, 8, 1000) 26 | y = sigmoid(z) 27 | plt.plot(z, y) 28 | plt.xlabel('z') 29 | plt.ylabel('y(z)') 30 | plt.title('logistic') 31 | plt.grid() 32 | plt.show() 33 | 34 | 35 | def tanh(z): 36 | return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z)) 37 | 38 | z = np.linspace(-8, 8, 1000) 39 | y = tanh(z) 40 | plt.plot(z, y) 41 | plt.xlabel('z') 42 | plt.ylabel('y(z)') 43 | plt.title('tanh') 44 | plt.grid() 45 | plt.show() 46 | 47 | 48 | def relu(z): 49 | return np.maximum(np.zeros_like(z), z) 50 | 51 | 52 | z = np.linspace(-8, 8, 1000) 53 | y = relu(z) 54 | plt.plot(z, y) 55 | plt.xlabel('z') 56 | plt.ylabel('y(z)') 57 | plt.title('relu') 58 | plt.grid() 59 | plt.show() 60 | 61 | 62 | # # Building neural networks 63 | 64 | # ## Implementing neural networks from scratch 65 | 66 | def sigmoid_derivative(z): 67 | return sigmoid(z) * (1.0 - sigmoid(z)) 68 | 69 | 70 | def train(X, y, n_hidden, learning_rate, n_iter): 71 | m, n_input = X.shape 72 | W1 = np.random.randn(n_input, n_hidden) 73 | b1 = np.zeros((1, n_hidden)) 74 | W2 = np.random.randn(n_hidden, 1) 75 | b2 = np.zeros((1, 1)) 76 | for i in range(1, n_iter+1): 77 | Z2 = np.matmul(X, W1) + b1 78 | A2 = sigmoid(Z2) 79 | Z3 = np.matmul(A2, W2) + b2 80 | A3 = Z3 81 | 82 | dZ3 = A3 - y 83 | dW2 = np.matmul(A2.T, dZ3) 84 | db2 = np.sum(dZ3, axis=0, keepdims=True) 85 | 86 | dZ2 = np.matmul(dZ3, W2.T) * sigmoid_derivative(Z2) 87 | dW1 = np.matmul(X.T, dZ2) 88 | db1 = np.sum(dZ2, axis=0) 89 | 90 | W2 = W2 - learning_rate * dW2 / m 91 | b2 = b2 - learning_rate * db2 / m 92 | W1 = W1 - learning_rate * dW1 / m 93 | b1 = b1 - learning_rate * db1 / m 94 | 95 | if i % 100 == 0: 96 | cost = np.mean((y - A3) ** 2) 97 | print('Iteration %i, training loss: %f' % (i, cost)) 98 | 99 | model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2} 100 | return model 101 | 102 | 103 | from sklearn import datasets 104 | housing = datasets.fetch_california_housing() 105 | 106 | num_test = 10 # the last 10 samples as testing set 107 | 108 | from sklearn import preprocessing 109 | scaler = preprocessing.StandardScaler() 110 | 111 | X_train = housing.data[:-num_test, :] 112 | X_train = scaler.fit_transform(X_train) 113 | y_train = housing.target[:-num_test].reshape(-1, 1) 114 | X_test = housing.data[-num_test:, :] 115 | X_test = scaler.transform(X_test) 116 | y_test = housing.target[-num_test:] 117 | 118 | 119 | n_hidden = 20 120 | learning_rate = 0.1 121 | n_iter = 2000 122 | 123 | model = train(X_train, y_train, n_hidden, learning_rate, n_iter) 124 | 125 | 126 | def predict(x, model): 127 | W1 = model['W1'] 128 | b1 = model['b1'] 129 | W2 = model['W2'] 130 | b2 = model['b2'] 131 
| A2 = sigmoid(np.matmul(x, W1) + b1) 132 | A3 = np.matmul(A2, W2) + b2 133 | return A3 134 | 135 | 136 | predictions = predict(X_test, model) 137 | print(predictions[:, 0]) 138 | print(y_test) 139 | 140 | 141 | # ## Implementing neural networks with scikit-learn 142 | 143 | from sklearn.neural_network import MLPRegressor 144 | nn_scikit = MLPRegressor(hidden_layer_sizes=(16, 8), 145 | activation='relu', 146 | solver='adam', 147 | learning_rate_init=0.001, 148 | random_state=42, 149 | max_iter=2000) 150 | 151 | 152 | nn_scikit.fit(X_train, y_train.ravel()) 153 | predictions = nn_scikit.predict(X_test) 154 | print(predictions) 155 | 156 | 157 | from sklearn.metrics import mean_squared_error 158 | print(mean_squared_error(y_test, predictions)) 159 | 160 | 161 | # ## Implementing neural networks with TensorFlow 162 | 163 | import tensorflow as tf 164 | from tensorflow import keras 165 | 166 | tf.random.set_seed(42) 167 | 168 | 169 | model = keras.Sequential([ 170 | keras.layers.Dense(units=16, activation='relu'), 171 | keras.layers.Dense(units=8, activation='relu'), 172 | keras.layers.Dense(units=1) 173 | ]) 174 | 175 | 176 | model.compile(loss='mean_squared_error', 177 | optimizer=tf.keras.optimizers.Adam(0.01)) 178 | 179 | 180 | model.fit(X_train, y_train, epochs=300) 181 | 182 | 183 | predictions = model.predict(X_test)[:, 0] 184 | print(predictions) 185 | 186 | print(mean_squared_error(y_test, predictions)) 187 | 188 | 189 | # ## Implementing neural networks with PyTorch 190 | 191 | import torch 192 | import torch.nn as nn 193 | 194 | 195 | torch.manual_seed(42) 196 | model = nn.Sequential(nn.Linear(X_train.shape[1], 16), 197 | nn.ReLU(), 198 | nn.Linear(16, 8), 199 | nn.ReLU(), 200 | nn.Linear(8, 1)) 201 | 202 | 203 | loss_function = nn.MSELoss() 204 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 205 | 206 | 207 | X_train_torch = torch.from_numpy(X_train.astype(np.float32)) 208 | y_train_torch = torch.from_numpy(y_train.astype(np.float32)) 209 | 210 | 211 | def train_step(model, X_train, y_train, loss_function, optimizer): 212 | pred_train = model(X_train) 213 | loss = loss_function(pred_train, y_train) 214 | 215 | model.zero_grad() 216 | loss.backward() 217 | 218 | optimizer.step() 219 | 220 | return loss.item() 221 | 222 | 223 | for epoch in range(500): 224 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 225 | 226 | if epoch % 100 == 0: 227 | print(f"Epoch {epoch} - loss: {loss}") 228 | 229 | 230 | 231 | X_test_torch = torch.from_numpy(X_test.astype(np.float32)) 232 | predictions = model(X_test_torch).detach().numpy()[:, 0] 233 | print(predictions) 234 | 235 | print(mean_squared_error(y_test, predictions)) 236 | 237 | 238 | # # Preventing overfitting in neural networks 239 | 240 | # ## Dropout 241 | 242 | torch.manual_seed(42) 243 | model_with_dropout = nn.Sequential(nn.Linear(X_train.shape[1], 16), 244 | nn.ReLU(), 245 | nn.Dropout(0.1), 246 | nn.Linear(16, 8), 247 | nn.ReLU(), 248 | nn.Linear(8, 1)) 249 | 250 | 251 | optimizer = torch.optim.Adam(model_with_dropout.parameters(), lr=0.01) 252 | 253 | 254 | for epoch in range(1000): 255 | loss = train_step(model_with_dropout, X_train_torch, y_train_torch, loss_function, optimizer) 256 | 257 | if epoch % 100 == 0: 258 | print(f"Epoch {epoch} - loss: {loss}") 259 | 260 | 261 | 262 | model_with_dropout.eval() 263 | predictions = model_with_dropout(X_test_torch).detach().numpy()[:, 0] 264 | 265 | print(mean_squared_error(y_test, predictions)) 266 | 267 | 268 | # ## Early stopping 269 | 270 | 
torch.manual_seed(42) 271 | model = nn.Sequential(nn.Linear(X_train.shape[1], 16), 272 | nn.ReLU(), 273 | nn.Linear(16, 8), 274 | nn.ReLU(), 275 | nn.Linear(8, 1)) 276 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 277 | 278 | 279 | patience = 100 280 | epochs_no_improve = 0 281 | best_test_loss = float('inf') 282 | 283 | 284 | import copy 285 | 286 | best_model = model 287 | 288 | for epoch in range(500): 289 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 290 | 291 | predictions = model(X_test_torch).detach().numpy()[:, 0] 292 | test_loss = mean_squared_error(y_test, predictions) 293 | if test_loss > best_test_loss: 294 | epochs_no_improve += 1 295 | if epochs_no_improve > patience: 296 | print(f"Early stopped at epoch {epoch}") 297 | break 298 | else: 299 | epochs_no_improve = 0 300 | best_test_loss = test_loss 301 | best_model = copy.deepcopy(model) 302 | 303 | 304 | 305 | predictions = best_model(X_test_torch).detach().numpy()[:, 0] 306 | 307 | print(mean_squared_error(y_test, predictions)) 308 | 309 | 310 | # --- 311 | 312 | # Readers may ignore the next cell. 313 | 314 | get_ipython().system('jupyter nbconvert --to python ch6_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 315 | 316 | -------------------------------------------------------------------------------- /ch6/ch6_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 6 Predicting Stock Prices with Artificial Neural Networks 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Predicting stock prices with neural networks 13 | 14 | # ## Training a simple neural network 15 | 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.preprocessing import StandardScaler 19 | import torch 20 | import torch.nn as nn 21 | 22 | 23 | # Reusing the feature generation function we developed 24 | def generate_features(df): 25 | """ 26 | Generate features for a stock/index based on historical price and performance 27 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adj Close" 28 | @return: dataframe, data set with new features 29 | """ 30 | df_new = pd.DataFrame() 31 | # 6 original features 32 | df_new['open'] = df['Open'] 33 | df_new['open_1'] = df['Open'].shift(1) 34 | df_new['close_1'] = df['Close'].shift(1) 35 | df_new['high_1'] = df['High'].shift(1) 36 | df_new['low_1'] = df['Low'].shift(1) 37 | df_new['volume_1'] = df['Volume'].shift(1) 38 | # 31 generated features 39 | # average price 40 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 41 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 42 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 43 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 44 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 45 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 46 | # average volume 47 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 48 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 49 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 50 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 51 | df_new['ratio_avg_volume_5_365'] = 
df_new['avg_volume_5'] / df_new['avg_volume_365'] 52 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 53 | # standard deviation of prices 54 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 55 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 56 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 57 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 58 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 59 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 60 | # standard deviation of volumes 61 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 62 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 63 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 64 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 65 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 66 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 67 | # # return 68 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 69 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 70 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 71 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 72 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 73 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 74 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 75 | # the target 76 | df_new['close'] = df['Close'] 77 | df_new = df_new.dropna(axis=0) 78 | return df_new 79 | 80 | 81 | data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date') 82 | data = generate_features(data_raw) 83 | 84 | start_train = '1990-01-01' 85 | end_train = '2022-12-31' 86 | 87 | start_test = '2023-01-01' 88 | end_test = '2023-06-30' 89 | 90 | data_train = data.loc[start_train:end_train] 91 | X_train = data_train.drop('close', axis=1).values 92 | y_train = data_train['close'].values 93 | 94 | data_test = data.loc[start_test:end_test] 95 | X_test = data_test.drop('close', axis=1).values 96 | y_test = data_test['close'].values 97 | 98 | 99 | scaler = StandardScaler() 100 | X_scaled_train = scaler.fit_transform(X_train) 101 | X_scaled_test = scaler.transform(X_test) 102 | 103 | 104 | X_train_torch = torch.from_numpy(X_scaled_train.astype(np.float32)) 105 | X_test_torch = torch.from_numpy(X_scaled_test.astype(np.float32)) 106 | y_train = y_train.reshape(y_train.shape[0], 1) 107 | y_train_torch = torch.from_numpy(y_train.astype(np.float32)) 108 | 109 | 110 | torch.manual_seed(42) 111 | model = nn.Sequential(nn.Linear(X_train.shape[1], 32), 112 | nn.ReLU(), 113 | nn.Linear(32, 1)) 114 | 115 | 116 | loss_function = nn.MSELoss() 117 | optimizer = torch.optim.Adam(model.parameters(), lr=0.3) 118 | 119 | 120 | def train_step(model, X_train, y_train, loss_function, optimizer): 121 | pred_train = model(X_train) 122 | loss = loss_function(pred_train, y_train) 123 | 124 | model.zero_grad() 125 | loss.backward() 126 | 127 | optimizer.step() 128 | 129 | return loss.item() 130 | 131 | 132 | for epoch in range(1000): 133 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 
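    # Report the running training loss every 100 epochs to monitor convergence.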
134 | 135 | if epoch % 100 == 0: 136 | print(f"Epoch {epoch} - loss: {loss}") 137 | 138 | 139 | 140 | predictions = model(X_test_torch).detach().numpy()[:, 0] 141 | 142 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 143 | print(f'MSE: {mean_squared_error(y_test, predictions):.3f}') 144 | print(f'MAE: {mean_absolute_error(y_test, predictions):.3f}') 145 | print(f'R^2: {r2_score(y_test, predictions):.3f}') 146 | 147 | 148 | # ## Fine-tuning the neural network 149 | 150 | from torch.utils.tensorboard import SummaryWriter 151 | 152 | 153 | hparams_config = { 154 | "hidden_size": [16, 32], 155 | "epochs": [1000, 3000], 156 | "lr": [0.1, 0.3], 157 | } 158 | 159 | 160 | def train_validate_model(hidden_size, epochs, lr): 161 | model = nn.Sequential(nn.Linear(X_train.shape[1], hidden_size), 162 | nn.ReLU(), 163 | nn.Linear(hidden_size, 1)) 164 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 165 | 166 | # Create the TensorBoard writer 167 | writer_path = f"runs/{experiment_num}/{hidden_size}/{epochs}/{lr}" 168 | writer = SummaryWriter(log_dir=writer_path) 169 | 170 | for epoch in range(epochs): 171 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 172 | 173 | predictions = model(X_test_torch).detach().numpy()[:, 0] 174 | test_mse = mean_squared_error(y_test, predictions) 175 | 176 | writer.add_scalar( 177 | tag="train loss", 178 | scalar_value=loss, 179 | global_step=epoch, 180 | ) 181 | writer.add_scalar( 182 | tag="test loss", 183 | scalar_value=test_mse, 184 | global_step=epoch, 185 | ) 186 | 187 | test_r2 = r2_score(y_test, predictions) 188 | print(f'R^2: {test_r2:.3f}\n') 189 | 190 | # Add the hyperparameters and metrics to TensorBoard 191 | writer.add_hparams( 192 | { 193 | "hidden_size": hidden_size, 194 | "epochs": epochs, 195 | "lr": lr, 196 | }, 197 | { 198 | "test MSE": test_mse, 199 | "test R^2": test_r2, 200 | }, 201 | ) 202 | 203 | 204 | experiment_num = 0 205 | 206 | torch.manual_seed(42) 207 | for hidden_size in hparams_config["hidden_size"]: 208 | for epochs in hparams_config["epochs"]: 209 | for lr in hparams_config["lr"]: 210 | experiment_num += 1 211 | print(f"Experiment {experiment_num}: hidden_size = {hidden_size}, epochs = {epochs}, lr = {lr}") 212 | train_validate_model(hidden_size, epochs, lr) 213 | 214 | 215 | 216 | hidden_size = 16 217 | epochs = 3000 218 | lr = 0.3 219 | best_model = nn.Sequential(nn.Linear(X_train.shape[1], hidden_size), 220 | nn.ReLU(), 221 | nn.Linear(hidden_size, 1)) 222 | optimizer = torch.optim.Adam(best_model.parameters(), lr=lr) 223 | for epoch in range(epochs): 224 | train_step(best_model, X_train_torch, y_train_torch, loss_function, optimizer) 225 | 226 | predictions = best_model(X_test_torch).detach().numpy()[:, 0] 227 | 228 | 229 | import matplotlib.pyplot as plt 230 | plt.rc('xtick', labelsize=10) 231 | plt.rc('ytick', labelsize=10) 232 | plt.plot(data_test.index, y_test, c='k') 233 | plt.plot(data_test.index, predictions, c='b') 234 | plt.xticks(range(0, 130, 10), rotation=60) 235 | plt.xlabel('Date', fontsize=10) 236 | plt.ylabel('Close price', fontsize=10) 237 | plt.legend(['Truth', 'Neural network'], fontsize=10) 238 | plt.show() 239 | 240 | 241 | # --- 242 | 243 | # Readers may ignore the next cell. 
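# Optional: a minimal sketch (not in the original notebook) of the same
# hyperparameter sweep written with itertools.product instead of three nested
# loops; it assumes hparams_config, train_validate_model, and the global
# experiment_num defined in the cells above.
from itertools import product

experiment_num = 0
torch.manual_seed(42)
for hidden_size, epochs, lr in product(hparams_config["hidden_size"],
                                       hparams_config["epochs"],
                                       hparams_config["lr"]):
    experiment_num += 1
    print(f"Experiment {experiment_num}: hidden_size = {hidden_size}, "
          f"epochs = {epochs}, lr = {lr}")
    train_validate_model(hidden_size, epochs, lr)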
244 | 245 | get_ipython().system('jupyter nbconvert --to python ch6_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 246 | 247 | -------------------------------------------------------------------------------- /ch7/ch7_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 7 Mining the 20 Newsgroups Dataset with Text Analysis Techniques 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Touring popular NLP libraries and picking up NLP basics 13 | 14 | # ## Corpora 15 | 16 | import nltk 17 | # nltk.download() 18 | 19 | 20 | from nltk.corpus import names 21 | print(names.words()[:10]) 22 | 23 | print(len(names.words())) 24 | 25 | 26 | # ## Tokenization 27 | 28 | from nltk.tokenize import word_tokenize 29 | sent = '''I am reading a book. 30 | It is Python Machine Learning By Example, 31 | 4th edition.''' 32 | 33 | print(word_tokenize(sent)) 34 | 35 | 36 | sent2 = 'I have been to U.K. and U.S.A.' 37 | print(word_tokenize(sent2)) 38 | 39 | 40 | import spacy 41 | 42 | nlp = spacy.load('en_core_web_sm') 43 | tokens2 = nlp(sent2) 44 | 45 | print([token.text for token in tokens2]) 46 | 47 | 48 | from nltk.tokenize import sent_tokenize 49 | print(sent_tokenize(sent)) 50 | 51 | 52 | # ## PoS tagging 53 | 54 | import nltk 55 | tokens = word_tokenize(sent) 56 | print(nltk.pos_tag(tokens)) 57 | 58 | 59 | nltk.help.upenn_tagset('PRP') 60 | nltk.help.upenn_tagset('VBP') 61 | 62 | 63 | print([(token.text, token.pos_) for token in tokens2]) 64 | 65 | 66 | # ## NER 67 | 68 | tokens3 = nlp('The book written by Hayden Liu in 2024 was sold at $30 in America') 69 | print([(token_ent.text, token_ent.label_) for token_ent in tokens3.ents]) 70 | 71 | 72 | # ## Stemming and lemmatization 73 | 74 | from nltk.stem.porter import PorterStemmer 75 | porter_stemmer = PorterStemmer() 76 | 77 | 78 | porter_stemmer.stem('machines') 79 | 80 | 81 | porter_stemmer.stem('learning') 82 | 83 | 84 | from nltk.stem import WordNetLemmatizer 85 | lemmatizer = WordNetLemmatizer() 86 | 87 | 88 | lemmatizer.lemmatize('machines') 89 | 90 | 91 | lemmatizer.lemmatize('learning') 92 | 93 | 94 | # # Getting the newsgroups data 95 | 96 | from sklearn.datasets import fetch_20newsgroups 97 | 98 | 99 | groups = fetch_20newsgroups() 100 | 101 | 102 | groups.keys() 103 | 104 | 105 | groups['target_names'] 106 | 107 | 108 | groups['target'] 109 | 110 | 111 | import numpy as np 112 | np.unique(groups.target) 113 | 114 | 115 | import seaborn as sns 116 | import matplotlib.pyplot as plt 117 | sns.histplot(groups.target, bins=20) 118 | plt.xticks(range(0, 20, 1)) 119 | plt.show() 120 | 121 | 122 | groups.data[0] 123 | 124 | 125 | groups.target[0] 126 | 127 | 128 | groups.target_names[groups.target[0]] 129 | 130 | 131 | # # Thinking about features for text data 132 | 133 | # ## Counting the occurrence of each word token 134 | 135 | from sklearn.feature_extraction.text import CountVectorizer 136 | 137 | 138 | count_vector = CountVectorizer(max_features=500) 139 | data_count = count_vector.fit_transform(groups.data) 140 | 141 | 142 | data_count 143 | 144 | 145 | data_count[0] 146 | 147 | 148 | data_count.toarray()[0] 149 | 150 | 151 | print(count_vector.get_feature_names_out()) 152 | 153 | 154 | # ## Text preprocessing 155 | 156 | data_cleaned = [] 157 | for doc in groups.data: 158 | doc_cleaned = ' '.join(word for 
word in doc.split() if word.isalpha()) 159 | data_cleaned.append(doc_cleaned) 160 | 161 | 162 | # ## Dropping stop words 163 | 164 | from sklearn.feature_extraction import _stop_words 165 | print(_stop_words.ENGLISH_STOP_WORDS) 166 | 167 | 168 | count_vector = CountVectorizer(stop_words="english",max_features=500) 169 | 170 | 171 | # ## Reducing inflectional and derivational forms of words 172 | 173 | all_names = set(names.words()) 174 | 175 | 176 | def get_cleaned_data(groups, lemmatizer, remove_words): 177 | data_cleaned = [] 178 | 179 | for doc in groups.data: 180 | doc = doc.lower() 181 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in remove_words) 182 | data_cleaned.append(doc_cleaned) 183 | 184 | return data_cleaned 185 | 186 | 187 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 188 | 189 | data_cleaned = get_cleaned_data(groups, lemmatizer, all_names) 190 | 191 | data_cleaned_count = count_vector_sw.fit_transform(data_cleaned) 192 | 193 | 194 | sum(len(set(doc.split())) for doc in data_cleaned) 195 | 196 | 197 | print(count_vector_sw.get_feature_names_out()) 198 | 199 | 200 | # # Visualizing the newsgroups data with t-SNE 201 | 202 | # ## t-SNE for dimensionality reduction 203 | 204 | from sklearn.manifold import TSNE 205 | 206 | 207 | categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space'] 208 | 209 | groups_3 = fetch_20newsgroups(categories=categories_3) 210 | 211 | 212 | data_cleaned = get_cleaned_data(groups_3, lemmatizer, all_names) 213 | 214 | data_cleaned_count_3 = count_vector_sw.fit_transform(data_cleaned) 215 | 216 | 217 | tsne_model = TSNE(n_components=2, perplexity=40, random_state=42, learning_rate=500) 218 | 219 | data_tsne = tsne_model.fit_transform(data_cleaned_count_3.toarray()) 220 | 221 | 222 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_3.target) 223 | plt.show() 224 | 225 | 226 | categories_5 = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 227 | 'comp.windows.x'] 228 | groups_5 = fetch_20newsgroups(categories=categories_5) 229 | 230 | data_cleaned = get_cleaned_data(groups_5, lemmatizer, all_names) 231 | 232 | data_cleaned_count_5 = count_vector_sw.fit_transform(data_cleaned) 233 | 234 | data_tsne = tsne_model.fit_transform(data_cleaned_count_5.toarray()) 235 | 236 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_5.target) 237 | 238 | plt.show() 239 | 240 | 241 | # # Building embedding models using shadow neural networks 242 | 243 | # ## Utilizing pre-trained embedding models 244 | 245 | import gensim.downloader as api 246 | model = api.load("glove-twitter-25") 247 | 248 | 249 | vector = model['computer'] 250 | print('Word computer is embedded into:\n', vector) 251 | 252 | 253 | similar_words = model.most_similar("computer") 254 | print('Top ten words most contextually relevant to computer:\n', 255 | similar_words) 256 | 257 | 258 | doc_sample = ['i', 'love', 'reading', 'python', 'machine', 259 | 'learning', 'by', 'example'] 260 | doc_vector = np.mean([model[word] for word in doc_sample], axis=0) 261 | print('The document sample is embedded into:\n', doc_vector) 262 | 263 | 264 | # --- 265 | 266 | # Readers may ignore the next cell. 
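# Optional: a small helper (an addition, not from the book) that averages GloVe
# vectors while skipping out-of-vocabulary tokens, so a document containing
# unknown words does not raise a KeyError; it assumes the 25-dimensional
# glove-twitter-25 `model` loaded above.
def embed_document(tokens, embedding_model, dim=25):
    vectors = [embedding_model[word] for word in tokens if word in embedding_model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

print(embed_document(['i', 'love', 'python', 'zzzunknowntoken'], model))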
267 | 268 | get_ipython().system('jupyter nbconvert --to python ch7_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 269 | 270 | -------------------------------------------------------------------------------- /ch8/ch8_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 8 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Getting started with k-means clustering 11 | 12 | # ## Implementing k-means from scratch 13 | 14 | from sklearn import datasets 15 | iris = datasets.load_iris() 16 | X = iris.data[:, 2:4] 17 | y = iris.target 18 | 19 | 20 | import numpy as np 21 | from matplotlib import pyplot as plt 22 | plt.scatter(X[:,0], X[:,1], c=y) 23 | plt.show() 24 | 25 | 26 | k = 3 27 | np.random.seed(0) 28 | random_index = np.random.choice(range(len(X)), k) 29 | centroids = X[random_index] 30 | 31 | 32 | def visualize_centroids(X, centroids): 33 | plt.scatter(X[:, 0], X[:, 1]) 34 | plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505') 35 | plt.show() 36 | 37 | visualize_centroids(X, centroids) 38 | 39 | 40 | def dist(a, b): 41 | return np.linalg.norm(a - b, axis=1) 42 | 43 | 44 | def assign_cluster(x, centroids): 45 | distances = dist(x, centroids) 46 | cluster = np.argmin(distances) 47 | return cluster 48 | 49 | 50 | def update_centroids(X, centroids, clusters): 51 | for i in range(k): 52 | cluster_i = np.where(clusters == i) 53 | centroids[i] = np.mean(X[cluster_i], axis=0) 54 | 55 | 56 | tol = 0.0001 57 | max_iter = 100 58 | 59 | iter = 0 60 | centroids_diff = 100000 61 | clusters = np.zeros(len(X)) 62 | 63 | 64 | from copy import deepcopy 65 | while iter < max_iter and centroids_diff > tol: 66 | for i in range(len(X)): 67 | clusters[i] = assign_cluster(X[i], centroids) 68 | centroids_prev = deepcopy(centroids) 69 | update_centroids(X, centroids, clusters) 70 | iter += 1 71 | centroids_diff = np.linalg.norm(centroids - centroids_prev) 72 | print('Iteration:', str(iter)) 73 | print('Centroids:\n', centroids) 74 | print(f'Centroids move: {centroids_diff:5.4f}') 75 | visualize_centroids(X, centroids) 76 | 77 | 78 | plt.scatter(X[:, 0], X[:, 1], c=clusters) 79 | plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='r') 80 | plt.show() 81 | 82 | 83 | # ## Implementing k-means with scikit-learn 84 | 85 | from sklearn.cluster import KMeans 86 | kmeans_sk = KMeans(n_clusters=3, n_init='auto', random_state=42) 87 | 88 | 89 | kmeans_sk.fit(X) 90 | 91 | 92 | clusters_sk = kmeans_sk.labels_ 93 | centroids_sk = kmeans_sk.cluster_centers_ 94 | 95 | 96 | plt.scatter(X[:, 0], X[:, 1], c=clusters_sk) 97 | plt.scatter(centroids_sk[:, 0], centroids_sk[:, 1], marker='*', s=200, c='r') 98 | plt.show() 99 | 100 | 101 | # ## Choosing the value of k 102 | 103 | X = iris.data 104 | y = iris.target 105 | k_list = list(range(1, 7)) 106 | sse_list = [0] * len(k_list) 107 | 108 | 109 | for k_ind, k in enumerate(k_list): 110 | kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42) 111 | kmeans.fit(X) 112 | clusters = kmeans.labels_ 113 | centroids = kmeans.cluster_centers_ 114 | 115 | sse = 0 116 | for i in range(k): 117 | cluster_i = np.where(clusters == i) 118 | 119 | sse += np.linalg.norm(X[cluster_i] - centroids[i]) 120 | 121 | print(f'k={k}, SSE={sse}') 
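    # Keep the SSE for this k; the list is plotted below to locate the elbow.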
122 | sse_list[k_ind] = sse 123 | 124 | 125 | plt.plot(k_list, sse_list) 126 | plt.show() 127 | 128 | 129 | # --- 130 | 131 | # Readers may ignore the next cell. 132 | 133 | get_ipython().system('jupyter nbconvert --to python ch8_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 134 | 135 | -------------------------------------------------------------------------------- /ch8/ch8_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 8 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Clustering newsgroups dataset 11 | 12 | # ## Clustering newsgroups data using k-means 13 | 14 | from sklearn.datasets import fetch_20newsgroups 15 | 16 | categories = [ 17 | 'alt.atheism', 18 | 'talk.religion.misc', 19 | 'comp.graphics', 20 | 'sci.space', 21 | ] 22 | 23 | groups = fetch_20newsgroups(subset='all', categories=categories) 24 | 25 | labels = groups.target 26 | label_names = groups.target_names 27 | 28 | 29 | from nltk.stem import WordNetLemmatizer 30 | from nltk.corpus import names 31 | all_names = set(names.words()) 32 | lemmatizer = WordNetLemmatizer() 33 | 34 | def get_cleaned_data(groups, lemmatizer, remove_words): 35 | data_cleaned = [] 36 | 37 | for doc in groups.data: 38 | doc = doc.lower() 39 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in remove_words) 40 | data_cleaned.append(doc_cleaned) 41 | 42 | return data_cleaned 43 | 44 | data_cleaned = get_cleaned_data(groups, lemmatizer, all_names) 45 | 46 | 47 | from sklearn.feature_extraction.text import CountVectorizer 48 | count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 49 | data_cv = count_vector.fit_transform(data_cleaned) 50 | 51 | 52 | from sklearn.cluster import KMeans 53 | k = 4 54 | kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42) 55 | 56 | kmeans.fit(data_cv) 57 | 58 | 59 | clusters = kmeans.labels_ 60 | 61 | from collections import Counter 62 | print(Counter(clusters)) 63 | 64 | 65 | from sklearn.feature_extraction.text import TfidfVectorizer 66 | tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2) 67 | 68 | 69 | data_tv = tfidf_vector.fit_transform(data_cleaned) 70 | kmeans.fit(data_tv) 71 | clusters = kmeans.labels_ 72 | print(Counter(clusters)) 73 | 74 | 75 | import numpy as np 76 | cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)} 77 | 78 | terms = tfidf_vector.get_feature_names_out() 79 | centroids = kmeans.cluster_centers_ 80 | for cluster, index_list in cluster_label.items(): 81 | counter = Counter(cluster_label[cluster]) 82 | print(f'cluster_{cluster}: {len(index_list)} samples') 83 | for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True): 84 | print(f'- {label_names[label_index]}: {count} samples') 85 | print('Top 10 terms:') 86 | for ind in centroids[cluster].argsort()[-10:]: 87 | print('%s ' % terms[ind], end="") 88 | print('\n') 89 | 90 | 91 | # ## Describing the clusters using GPT 92 | 93 | keywords = ' '.join(terms[ind] for ind in centroids[0].argsort()[-100:]) 94 | 95 | 96 | print(keywords) 97 | 98 | 99 | import openai 100 | 101 | 102 | # openai.api_key = '' 103 | 104 | 105 | def 
get_completion(prompt, model="text-davinci-003"): 106 | messages = [{"role": "user", "content": prompt}] 107 | response = openai.ChatCompletion.create( 108 | model=model, 109 | messages=messages, 110 | temperature=0 111 | ) 112 | return response.choices[0].message["content"] 113 | 114 | 115 | # response = get_completion(f"Describe a common topic based on the following keywords: {keywords}") 116 | # print(response) 117 | 118 | 119 | # # Discovering underlying topics in newsgroups 120 | 121 | # ## Topic modeling using NMF 122 | 123 | from sklearn.decomposition import NMF 124 | 125 | t = 20 126 | nmf = NMF(n_components=t, random_state=42) 127 | 128 | 129 | nmf.fit(data_cv) 130 | 131 | print(nmf.components_) 132 | 133 | 134 | terms_cv = count_vector.get_feature_names_out() 135 | for topic_idx, topic in enumerate(nmf.components_): 136 | print("Topic {}:" .format(topic_idx)) 137 | print(" ".join([terms_cv[i] for i in topic.argsort()[-10:]])) 138 | 139 | 140 | # ## Topic modeling using LDA 141 | 142 | from sklearn.decomposition import LatentDirichletAllocation 143 | 144 | t = 20 145 | lda = LatentDirichletAllocation(n_components=t, learning_method='batch',random_state=42) 146 | 147 | 148 | lda.fit(data_cv) 149 | 150 | print(lda.components_) 151 | 152 | 153 | for topic_idx, topic in enumerate(lda.components_): 154 | print("Topic {}:" .format(topic_idx)) 155 | print(" ".join([terms_cv[i] for i in topic.argsort()[-10:]])) 156 | 157 | 158 | data_cleaned = get_cleaned_data(groups_3, lemmatizer, all_names) 159 | 160 | 161 | data_embedding = [] 162 | 163 | for doc in data_cleaned: 164 | # print(doc) 165 | doc_vector = np.mean([model[word] for word in doc.split() if word in model], axis=0) 166 | data_embedding.append(doc_vector) 167 | 168 | 169 | data_tsne = tsne_model.fit_transform(np.array(data_embedding)) 170 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_3.target) 171 | 172 | plt.show() 173 | 174 | 175 | # --- 176 | 177 | # Readers may ignore the next cell. 
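# Note: get_completion() above pairs openai.ChatCompletion with
# "text-davinci-003", which is a completions-only model; the chat endpoint
# expects a chat model such as "gpt-3.5-turbo".
#
# The embedding/t-SNE cells above also reuse names defined in ch7_part1.py
# (groups_3, model, tsne_model, plt). A minimal set of definitions, mirroring
# that notebook, to run those cells standalone:
import matplotlib.pyplot as plt
import gensim.downloader as api
from sklearn.manifold import TSNE

categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space']
groups_3 = fetch_20newsgroups(categories=categories_3)
model = api.load("glove-twitter-25")
tsne_model = TSNE(n_components=2, perplexity=40, random_state=42, learning_rate=500)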
178 | 179 | get_ipython().system('jupyter nbconvert --to python ch8_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 180 | 181 | -------------------------------------------------------------------------------- /ch9/ch9_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 9 Recognizing Faces with Support Vector Machine 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Finding the separating boundary with SVM 11 | 12 | # ## Implementing SVM 13 | 14 | from sklearn.datasets import load_breast_cancer 15 | cancer_data = load_breast_cancer() 16 | 17 | X = cancer_data.data 18 | Y = cancer_data.target 19 | 20 | print('Input data size :', X.shape) 21 | print('Output data size :', Y.shape) 22 | print('Label names:', cancer_data.target_names) 23 | n_pos = (Y == 1).sum() 24 | n_neg = (Y == 0).sum() 25 | print(f'{n_pos} positive samples and {n_neg} negative samples.') 26 | 27 | 28 | from sklearn.model_selection import train_test_split 29 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) 30 | 31 | 32 | from sklearn.svm import SVC 33 | clf = SVC(kernel='linear', C=1.0, random_state=42) 34 | 35 | 36 | clf.fit(X_train, Y_train) 37 | 38 | 39 | accuracy = clf.score(X_test, Y_test) 40 | print(f'The accuracy is: {accuracy*100:.1f}%') 41 | 42 | 43 | # ## Scenario 4 – dealing with more than two classes 44 | 45 | from sklearn.datasets import load_wine 46 | wine_data = load_wine() 47 | X = wine_data.data 48 | Y = wine_data.target 49 | 50 | print('Input data size :', X.shape) 51 | print('Output data size :', Y.shape) 52 | print('Label names:', wine_data.target_names) 53 | n_class0 = (Y == 0).sum() 54 | n_class1 = (Y == 1).sum() 55 | n_class2 = (Y == 2).sum() 56 | print(f'{n_class0} class0 samples,\n{n_class1} class1 samples,\n{n_class2} class2 samples.') 57 | 58 | 59 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) 60 | 61 | 62 | clf = SVC(kernel='linear', C=1.0, random_state=42) 63 | clf.fit(X_train, Y_train) 64 | 65 | 66 | accuracy = clf.score(X_test, Y_test) 67 | print(f'The accuracy is: {accuracy*100:.1f}%') 68 | 69 | 70 | from sklearn.metrics import classification_report 71 | pred = clf.predict(X_test) 72 | print(classification_report(Y_test, pred)) 73 | 74 | 75 | # ## Scenario 5 – solving linearly non-separable problems with kernels 76 | 77 | import numpy as np 78 | import matplotlib.pyplot as plt 79 | 80 | 81 | X = np.c_[# negative class 82 | (.3, -.8), 83 | (-1.5, -1), 84 | (-1.3, -.8), 85 | (-1.1, -1.3), 86 | (-1.2, -.3), 87 | (-1.3, -.5), 88 | (-.6, 1.1), 89 | (-1.4, 2.2), 90 | (1, 1), 91 | # positive class 92 | (1.3, .8), 93 | (1.2, .5), 94 | (.2, -2), 95 | (.5, -2.4), 96 | (.2, -2.3), 97 | (0, -2.7), 98 | (1.3, 2.1)].T 99 | Y = [-1] * 8 + [1] * 8 100 | 101 | 102 | gamma_option = [1, 2, 4] 103 | 104 | 105 | for i, gamma in enumerate(gamma_option, 1): 106 | svm = SVC(kernel='rbf', gamma=gamma) 107 | svm.fit(X, Y) 108 | plt.scatter(X[:, 0], X[:, 1], c=['b']*8+['r']*8, zorder=10) 109 | plt.axis('tight') 110 | XX, YY = np.mgrid[-3:3:200j, -3:3:200j] 111 | Z = svm.decision_function(np.c_[XX.ravel(), YY.ravel()]) 112 | Z = Z.reshape(XX.shape) 113 | plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) 114 | plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) 115 | plt.title('gamma = %d' 
% gamma) 116 | plt.show() 117 | 118 | 119 | # --- 120 | 121 | # Readers may ignore the next cell. 122 | 123 | get_ipython().system('jupyter nbconvert --to python ch9_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 124 | 125 | -------------------------------------------------------------------------------- /ch9/ch9_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 9 Recognizing Faces with Support Vector Machine 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Classifying face images with SVM 11 | 12 | # ## Exploring the face image dataset 13 | 14 | from sklearn.datasets import fetch_lfw_people 15 | 16 | # face_data = fetch_lfw_people(min_faces_per_person=80) 17 | face_data = fetch_lfw_people(data_home='./', min_faces_per_person=80, download_if_missing=False) 18 | 19 | 20 | X = face_data.data 21 | Y = face_data.target 22 | 23 | print('Input data size :', X.shape) 24 | print('Output data size :', Y.shape) 25 | print('Label names:', face_data.target_names) 26 | 27 | 28 | for i in range(5): 29 | print(f'Class {i} has {(Y == i).sum()} samples.') 30 | 31 | 32 | import matplotlib.pyplot as plt 33 | 34 | fig, ax = plt.subplots(3, 4) 35 | for i, axi in enumerate(ax.flat): 36 | axi.imshow(face_data.images[i], cmap='bone') 37 | axi.set(xticks=[], yticks=[], 38 | xlabel=face_data.target_names[face_data.target[i]]) 39 | 40 | plt.show() 41 | 42 | 43 | # ## Building an SVM-based image classifier 44 | 45 | from sklearn.model_selection import train_test_split 46 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) 47 | 48 | 49 | from sklearn.svm import SVC 50 | clf = SVC(class_weight='balanced', random_state=42) 51 | 52 | 53 | from sklearn.model_selection import GridSearchCV 54 | parameters = {'C': [10, 100, 300], 55 | 'gamma': [0.0001, 0.0003, 0.001], 56 | 'kernel' : ['rbf', 'linear'] } 57 | 58 | grid_search = GridSearchCV(clf, parameters, n_jobs=-1, cv=5) 59 | 60 | 61 | grid_search.fit(X_train, Y_train) 62 | 63 | 64 | print('The best model:\n', grid_search.best_params_) 65 | 66 | 67 | print('The best averaged performance:', grid_search.best_score_) 68 | 69 | 70 | clf_best = grid_search.best_estimator_ 71 | 72 | print(f'The accuracy is: {clf_best.score(X_test, Y_test)*100:.1f}%') 73 | 74 | 75 | pred = clf_best.predict(X_test) 76 | 77 | from sklearn.metrics import classification_report 78 | print(classification_report(Y_test, pred, target_names=face_data.target_names)) 79 | 80 | 81 | # ## Boosting image classification performance with PCA 82 | 83 | from sklearn.decomposition import PCA 84 | pca = PCA(n_components=100, whiten=True, random_state=42) 85 | svc = SVC(class_weight='balanced', kernel='rbf', random_state=42) 86 | 87 | from sklearn.pipeline import Pipeline 88 | model = Pipeline([('pca', pca), 89 | ('svc', svc)]) 90 | 91 | 92 | parameters_pipeline = {'svc__C': [1, 3, 10], 93 | 'svc__gamma': [0.01, 0.03, 0.003]} 94 | grid_search = GridSearchCV(model, parameters_pipeline, n_jobs=-1, cv=5) 95 | 96 | grid_search.fit(X_train, Y_train) 97 | 98 | 99 | print('The best model:\n', grid_search.best_params_) 100 | print('The best averaged performance:', grid_search.best_score_) 101 | 102 | model_best = grid_search.best_estimator_ 103 | print(f'The accuracy is: {model_best.score(X_test, Y_test)*100:.1f}%') 104 | pred = model_best.predict(X_test) 105 | 
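# Per-class precision, recall, and F1 for the PCA + SVC pipeline on the held-out faces: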
print(classification_report(Y_test, pred, target_names=face_data.target_names)) 106 | 107 | 108 | # # Estimating with support vector regression 109 | 110 | # ## Implementing SVR 111 | 112 | from sklearn import datasets 113 | diabetes = datasets.load_diabetes() 114 | 115 | X = diabetes.data 116 | Y = diabetes.target 117 | 118 | print('Input data size :', X.shape) 119 | print('Output data size :', Y.shape) 120 | 121 | 122 | 123 | num_test = 30 # the last 30 samples as testing set 124 | X_train = diabetes.data[:-num_test, :] 125 | y_train = diabetes.target[:-num_test] 126 | X_test = diabetes.data[-num_test:, :] 127 | y_test = diabetes.target[-num_test:] 128 | 129 | 130 | from sklearn.svm import SVR 131 | regressor = SVR(C=100, kernel='linear') 132 | regressor.fit(X_train, y_train) 133 | 134 | 135 | from sklearn.metrics import r2_score 136 | predictions = regressor.predict(X_test) 137 | print(r2_score(y_test, predictions)) 138 | 139 | 140 | parameters = {'C': [300, 500, 700], 141 | 'gamma': [0.3, 0.6, 1], 142 | 'kernel' : ['rbf', 'linear']} 143 | 144 | regressor = SVR() 145 | grid_search = GridSearchCV(regressor, parameters, n_jobs=-1, cv=5) 146 | 147 | 148 | grid_search.fit(X_train, y_train) 149 | 150 | 151 | print('The best model:\n', grid_search.best_params_) 152 | 153 | 154 | model_best = grid_search.best_estimator_ 155 | predictions = model_best.predict(X_test) 156 | 157 | print(r2_score(y_test, predictions)) 158 | 159 | 160 | # --- 161 | 162 | # Readers may ignore the next cell. 163 | 164 | get_ipython().system('jupyter nbconvert --to python ch9_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 165 | 166 | --------------------------------------------------------------------------------