├── LICENSE
├── README.md
├── ch10
│   ├── ch10_part1.ipynb
│   ├── ch10_part1.py
│   ├── ch10_part2.ipynb
│   └── ch10_part2.py
├── ch11
│   ├── ch11_part1.ipynb
│   └── ch11_part1.py
├── ch12
│   ├── ch12_part1.ipynb
│   ├── ch12_part1.py
│   ├── ch12_part2.ipynb
│   ├── ch12_part2.py
│   ├── ch12_part3.ipynb
│   ├── ch12_part3.py
│   └── warpeace_input.txt
├── ch13
│   ├── ch13_part1.ipynb
│   ├── ch13_part1.py
│   ├── ch13_part2.ipynb
│   ├── ch13_part2.py
│   └── warpeace_input.txt
├── ch14
│   ├── ch14_part1.ipynb
│   ├── ch14_part1.py
│   ├── ch14_part2.ipynb
│   └── ch14_part2.py
├── ch15
│   ├── ch15_part1.ipynb
│   ├── ch15_part1.py
│   ├── ch15_part2.ipynb
│   └── ch15_part2.py
├── ch2
│   ├── ch2_part1.ipynb
│   ├── ch2_part1.py
│   ├── ch2_part2.ipynb
│   └── ch2_part2.py
├── ch3
│   ├── ch3_part1.ipynb
│   ├── ch3_part1.py
│   ├── ch3_part2.ipynb
│   └── ch3_part2.py
├── ch4
│   ├── ch4_part1.ipynb
│   ├── ch4_part1.py
│   ├── ch4_part2.ipynb
│   └── ch4_part2.py
├── ch5
│   ├── 19900101_20230630.csv
│   ├── 20051201_20051210.csv
│   ├── ch5_part1.ipynb
│   ├── ch5_part1.py
│   ├── ch5_part2.ipynb
│   └── ch5_part2.py
├── ch6
│   ├── ch6_part1.ipynb
│   ├── ch6_part1.py
│   ├── ch6_part2.ipynb
│   └── ch6_part2.py
├── ch7
│   ├── ch7_part1.ipynb
│   └── ch7_part1.py
├── ch8
│   ├── ch8_part1.ipynb
│   ├── ch8_part1.py
│   ├── ch8_part2.ipynb
│   └── ch8_part2.py
└── ch9
    ├── ch9_part1.ipynb
    ├── ch9_part1.py
    ├── ch9_part2.ipynb
    └── ch9_part2.py

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 packtjaniceg
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Machine-Learning-by-Example-Fourth-Edition 2 | Python Machine Learning by Example, Fourth Edition 3 | -------------------------------------------------------------------------------- /ch10/ch10_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 10 Machine Learning Best Practices 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Best practices in the data preparation stage 11 | 12 | # ## Best practice 4 – Dealing with missing data 13 | 14 | import numpy as np 15 | from sklearn.impute import SimpleImputer 16 | 17 | 18 | data_origin = [[30, 100], 19 | [20, 50], 20 | [35, np.nan], 21 | [25, 80], 22 | [30, 70], 23 | [40, 60]] 24 | 25 | 26 | imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') 27 | imp_mean.fit(data_origin) 28 | 29 | 30 | data_mean_imp = imp_mean.transform(data_origin) 31 | print(data_mean_imp) 32 | 33 | 34 | imp_median = SimpleImputer(missing_values=np.nan, strategy='median') 35 | imp_median.fit(data_origin) 36 | data_median_imp = imp_median.transform(data_origin) 37 | print(data_median_imp) 38 | 39 | 40 | # New samples 41 | new = [[20, np.nan], 42 | [30, np.nan], 43 | [np.nan, 70], 44 | [np.nan, np.nan]] 45 | new_mean_imp = imp_mean.transform(new) 46 | print(new_mean_imp) 47 | 48 | 49 | # Effects of discarding missing values and imputation 50 | from sklearn import datasets 51 | dataset = datasets.load_diabetes() 52 | X_full, y = dataset.data, dataset.target 53 | 54 | 55 | m, n = X_full.shape 56 | m_missing = int(m * 0.25) 57 | print(m, m_missing) 58 | 59 | 60 | np.random.seed(42) 61 | missing_samples = np.array([True] * m_missing + [False] * (m - m_missing)) 62 | np.random.shuffle(missing_samples) 63 | 64 | 65 | missing_features = np.random.randint(low=0, high=n, size=m_missing) 66 | 67 | 68 | X_missing = X_full.copy() 69 | X_missing[np.where(missing_samples)[0], missing_features] = np.nan 70 | 71 | 72 | # Discard samples containing missing values 73 | X_rm_missing = X_missing[~missing_samples, :] 74 | y_rm_missing = y[~missing_samples] 75 | 76 | 77 | # Estimate R^2 on the data set with missing samples removed 78 | from sklearn.ensemble import RandomForestRegressor 79 | from sklearn.model_selection import cross_val_score 80 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 81 | score_rm_missing = cross_val_score(regressor, X_rm_missing, y_rm_missing).mean() 82 | print(f'Score with the data set with missing samples removed: {score_rm_missing:.2f}') 83 | 84 | 85 | # Imputation with mean value 86 | imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') 87 | X_mean_imp = imp_mean.fit_transform(X_missing) 88 | 89 | 90 | # Estimate R^2 on the data set with missing samples removed 91 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 92 | score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean() 93 | print(f'Score with the data set with missing values replaced by mean: {score_mean_imp:.2f}') 94 | 95 | 96 | # Estimate R^2 on the full data set 97 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=500) 98 | score_full = cross_val_score(regressor, 
X_full, y).mean() 99 | print(f'Score with the full data set: {score_full:.2f}') 100 | 101 | 102 | # # Best practices in the training sets generation stage 103 | 104 | # ## Best practice 8 – Deciding whether to select features, and if so, how to do so 105 | 106 | from sklearn.datasets import load_digits 107 | dataset = load_digits() 108 | X, y = dataset.data, dataset.target 109 | print(X.shape) 110 | 111 | 112 | # Estimate accuracy on the original data set 113 | from sklearn.svm import SVC 114 | classifier = SVC(gamma=0.005, random_state=42) 115 | score = cross_val_score(classifier, X, y).mean() 116 | print(f'Score with the original data set: {score:.2f}') 117 | 118 | 119 | # Feature selection with random forest 120 | from sklearn.ensemble import RandomForestClassifier 121 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1, random_state=42) 122 | random_forest.fit(X, y) 123 | 124 | # Sort features based on their importancies 125 | feature_sorted = np.argsort(random_forest.feature_importances_) 126 | 127 | 128 | # Select different number of top features 129 | K = [10, 15, 25, 35, 45] 130 | for k in K: 131 | top_K_features = feature_sorted[-k:] 132 | X_k_selected = X[:, top_K_features] 133 | # Estimate accuracy on the data set with k selected features 134 | classifier = SVC(gamma=0.005) 135 | score_k_features = cross_val_score(classifier, X_k_selected, y).mean() 136 | print(f'Score with the dataset of top {k} features: {score_k_features:.2f}') 137 | 138 | 139 | # ## Best practice 9 – Deciding whether to reduce dimensionality, and if so, how to do so! 140 | 141 | from sklearn.decomposition import PCA 142 | 143 | # Keep different number of top components 144 | N = [10, 15, 25, 35, 45] 145 | for n in N: 146 | pca = PCA(n_components=n) 147 | X_n_kept = pca.fit_transform(X) 148 | # Estimate accuracy on the data set with top n components 149 | classifier = SVC(gamma=0.005) 150 | score_n_components = cross_val_score(classifier, X_n_kept, y).mean() 151 | print(f'Score with the dataset of top {n} components: {score_n_components:.2f}') 152 | 153 | 154 | # ## Best practice 12 – Performing feature engineering without domain expertise 155 | 156 | # ### Binarization and discretization 157 | 158 | from sklearn.preprocessing import Binarizer 159 | X = [[4], [1], [3], [0]] 160 | binarizer = Binarizer(threshold=2.9) 161 | X_new = binarizer.fit_transform(X) 162 | print(X_new) 163 | 164 | 165 | # ### Polynomial transformation 166 | 167 | from sklearn.preprocessing import PolynomialFeatures 168 | X = [[2, 4], 169 | [1, 3], 170 | [3, 2], 171 | [0, 3]] 172 | poly = PolynomialFeatures(degree=2) 173 | X_new = poly.fit_transform(X) 174 | print(X_new) 175 | 176 | 177 | # --- 178 | 179 | # Readers may ignore the next cell. 
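
# A minimal optional sketch: inspecting which terms the degree-2 polynomial
# transformation above actually generates. It assumes scikit-learn >= 1.0,
# where PolynomialFeatures exposes get_feature_names_out(); the *_demo names
# are illustrative only.

from sklearn.preprocessing import PolynomialFeatures

X_demo = [[2, 4],
          [1, 3]]
poly_demo = PolynomialFeatures(degree=2)
X_demo_new = poly_demo.fit_transform(X_demo)

# For two input features x0 and x1, degree=2 produces the columns:
# bias (1), x0, x1, x0^2, x0*x1, x1^2
print(poly_demo.get_feature_names_out())
print(X_demo_new)
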
180 | 181 | get_ipython().system('jupyter nbconvert --to python ch10_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 182 | 183 | -------------------------------------------------------------------------------- /ch10/ch10_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 10 Machine Learning Best Practices 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # ## Best practice 14 – Extracting features from text data 11 | 12 | # ### Word embedding 13 | 14 | from gensim.models import Word2Vec 15 | 16 | 17 | # Sample sentences for training 18 | sentences = [ 19 | ["i", "love", "machine", "learning", "by", "example"], 20 | ["machine", "learning", "and", "deep", "learning", "are", "fascinating"], 21 | ["word", "embedding", "is", "essential", "for", "many", "nlp", "tasks"], 22 | ["word2vec", "produces", "word", "embeddings"] 23 | ] 24 | 25 | # Create and train Word2Vec model 26 | model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=0) 27 | 28 | # Access word vectors 29 | vector = model.wv["machine"] 30 | print("Vector for 'machine':", vector) 31 | 32 | 33 | import torch 34 | import torch.nn as nn 35 | 36 | # Sample data 37 | input_data = torch.LongTensor([[1, 2, 3, 4], [5, 1, 6, 3]]) 38 | 39 | # Define the embedding layer 40 | vocab_size = 10 # Total number of unique words 41 | embedding_dim = 3 # Dimensionality of the embeddings 42 | embedding_layer = nn.Embedding(vocab_size, embedding_dim) 43 | 44 | # Pass input data through the embedding layer 45 | embedded_data = embedding_layer(input_data) 46 | 47 | # Print the embedded data 48 | print("Embedded Data:\n", embedded_data) 49 | 50 | 51 | # # Best practices in the deployment and monitoring stage 52 | 53 | # # Best practice 19 – Saving, loading, and reusing models 54 | 55 | # ### Saving and restoring models using pickle 56 | 57 | from sklearn import datasets 58 | dataset = datasets.load_diabetes() 59 | X, y = dataset.data, dataset.target 60 | 61 | num_new = 30 # the last 30 samples as new data set 62 | X_train = X[:-num_new, :] 63 | y_train = y[:-num_new] 64 | X_new = X[-num_new:, :] 65 | y_new = y[-num_new:] 66 | 67 | 68 | # Data pre-processing 69 | from sklearn.preprocessing import StandardScaler 70 | scaler = StandardScaler() 71 | scaler.fit(X_train) 72 | 73 | 74 | import pickle 75 | # Save the scaler 76 | pickle.dump(scaler, open("scaler.p", "wb" )) 77 | 78 | 79 | X_scaled_train = scaler.transform(X_train) 80 | 81 | 82 | # Regression model training 83 | from sklearn.svm import SVR 84 | regressor = SVR(C=20) 85 | regressor.fit(X_scaled_train, y_train) 86 | 87 | 88 | # Save the regressor 89 | pickle.dump(regressor, open("regressor.p", "wb")) 90 | 91 | 92 | # Deployment 93 | my_scaler = pickle.load(open("scaler.p", "rb" )) 94 | my_regressor = pickle.load(open("regressor.p", "rb")) 95 | 96 | 97 | X_scaled_new = my_scaler.transform(X_new) 98 | predictions = my_regressor.predict(X_scaled_new) 99 | 100 | 101 | # Monitor 102 | from sklearn.metrics import r2_score 103 | print(f'Health check on the model, R^2: {r2_score(y_new, predictions):.3f}') 104 | 105 | 106 | # ### Saving and restoring models in TensorFlow 107 | 108 | import tensorflow as tf 109 | from tensorflow import keras 110 | 111 | cancer_data = datasets.load_breast_cancer() 112 | X = cancer_data.data 113 | X = scaler.fit_transform(X) 114 
| y = cancer_data.target 115 | 116 | 117 | learning_rate = 0.005 118 | n_iter = 10 119 | 120 | tf.random.set_seed(42) 121 | 122 | model = keras.Sequential([ 123 | keras.layers.Dense(units=1, activation='sigmoid') 124 | ]) 125 | 126 | model.compile(loss='binary_crossentropy', 127 | optimizer=tf.keras.optimizers.Adam(learning_rate)) 128 | 129 | 130 | model.fit(X, y, epochs=n_iter) 131 | 132 | 133 | model.summary() 134 | 135 | 136 | path = './model_tf' 137 | model.save(path) 138 | 139 | 140 | new_model = tf.keras.models.load_model(path) 141 | 142 | new_model.summary() 143 | 144 | 145 | # ### Saving and restoring models in PyTorch 146 | 147 | X_torch = torch.FloatTensor(X) 148 | y_torch = torch.FloatTensor(y.reshape(y.shape[0], 1)) 149 | 150 | 151 | torch.manual_seed(42) 152 | 153 | model = nn.Sequential(nn.Linear(X.shape[1], 1), 154 | nn.Sigmoid()) 155 | 156 | loss_function = nn.BCELoss() 157 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 158 | 159 | 160 | def train_step(model, X_train, y_train, loss_function, optimizer): 161 | pred_train = model(X_train) 162 | loss = loss_function(pred_train, y_train) 163 | model.zero_grad() 164 | loss.backward() 165 | optimizer.step() 166 | return loss.item() 167 | 168 | 169 | for epoch in range(n_iter): 170 | loss = train_step(model, X_torch, y_torch, loss_function, optimizer) 171 | print(f"Epoch {epoch} - loss: {loss}") 172 | 173 | 174 | print(model) 175 | 176 | 177 | path = './model.pth' 178 | torch.save(model, path) 179 | 180 | 181 | new_model = torch.load(path) 182 | print(new_model) 183 | 184 | 185 | # --- 186 | 187 | # Readers may ignore the next cell. 188 | 189 | get_ipython().system('jupyter nbconvert --to python ch10_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 190 | 191 | -------------------------------------------------------------------------------- /ch11/ch11_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 11 Categorizing Images of Clothing with Convolutional Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Exploring the clothing image dataset 11 | 12 | import torch, torchvision 13 | from torchvision import transforms 14 | 15 | image_path = './' 16 | transform = transforms.Compose([transforms.ToTensor(), 17 | # transforms.Normalize((0.5,), (0.5,)) 18 | ]) 19 | 20 | train_dataset = torchvision.datasets.FashionMNIST(root=image_path, 21 | train=True, 22 | transform=transform, 23 | download=True) 24 | 25 | test_dataset = torchvision.datasets.FashionMNIST(root=image_path, 26 | train=False, 27 | transform=transform, 28 | download=False) 29 | 30 | 31 | print(train_dataset) 32 | 33 | 34 | print(test_dataset) 35 | 36 | 37 | from torch.utils.data import DataLoader 38 | 39 | batch_size = 64 40 | torch.manual_seed(42) 41 | train_dl = DataLoader(train_dataset, batch_size, shuffle=True) 42 | 43 | 44 | data_iter = iter(train_dl) 45 | images, labels = next(data_iter) 46 | 47 | 48 | print(labels) 49 | 50 | 51 | # constant for classes 52 | class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 53 | 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot'] 54 | 55 | 56 | print(images[0].shape) 57 | 58 | 59 | print(torch.max(images), torch.min(images)) 60 | 61 | 62 | import numpy as np 63 | import matplotlib.pyplot as plt 64 | 65 | npimg = images[1].numpy() 66 | 
plt.imshow(np.transpose(npimg, (1, 2, 0))) 67 | plt.colorbar() 68 | plt.title(class_names[labels[1]]) 69 | plt.show() 70 | 71 | 72 | plt.figure(figsize=(10, 10)) 73 | 74 | for i in range(16): 75 | plt.subplot(4, 4, i + 1) 76 | plt.subplots_adjust(hspace=.3) 77 | plt.xticks([]) 78 | plt.yticks([]) 79 | npimg = images[i].numpy() 80 | plt.imshow(np.transpose(npimg, (1, 2, 0)), cmap="Greys") 81 | plt.title(class_names[labels[i]]) 82 | plt.show() 83 | 84 | 85 | # # Classifying clothing images with CNNs 86 | 87 | # ## Architecting the CNN model 88 | 89 | import torch.nn as nn 90 | model = nn.Sequential() 91 | 92 | 93 | model.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)) 94 | model.add_module('relu1', nn.ReLU()) 95 | 96 | 97 | model.add_module('pool1', nn.MaxPool2d(kernel_size=2)) 98 | 99 | 100 | model.add_module('conv2', nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)) 101 | model.add_module('relu2', nn.ReLU()) 102 | 103 | 104 | model.add_module('pool2', nn.MaxPool2d(kernel_size=2)) 105 | 106 | 107 | model.add_module('conv3', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)) 108 | model.add_module('relu3', nn.ReLU()) 109 | 110 | 111 | x = torch.rand((64, 1, 28, 28)) 112 | print(model(x).shape) 113 | 114 | 115 | model.add_module('flatten', nn.Flatten()) 116 | 117 | 118 | print(model(x).shape) 119 | 120 | 121 | model.add_module('fc1', nn.Linear(1152, 64)) 122 | model.add_module('relu4', nn.ReLU()) 123 | 124 | 125 | model.add_module('fc2', nn.Linear(64, 10)) 126 | model.add_module('output', nn.Softmax(dim = 1)) 127 | 128 | 129 | print(model) 130 | 131 | 132 | from torchsummary import summary 133 | 134 | 135 | summary(model, input_size=(1, 28, 28), batch_size=-1, device="cpu") 136 | 137 | 138 | # ## Fitting the CNN model 139 | 140 | device = torch.device("cuda:0") 141 | # device = torch.device("cpu") 142 | model = model.to(device) 143 | 144 | loss_fn = nn.CrossEntropyLoss() 145 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 146 | 147 | 148 | def train(model, optimizer, num_epochs, train_dl): 149 | for epoch in range(num_epochs): 150 | loss_train = 0 151 | accuracy_train = 0 152 | for x_batch, y_batch in train_dl: 153 | x_batch = x_batch.to(device) 154 | y_batch = y_batch.to(device) 155 | pred = model(x_batch) 156 | loss = loss_fn(pred, y_batch) 157 | loss.backward() 158 | optimizer.step() 159 | optimizer.zero_grad() 160 | loss_train += loss.item() * y_batch.size(0) 161 | is_correct = (torch.argmax(pred, dim=1) == y_batch).float() 162 | accuracy_train += is_correct.sum().cpu() 163 | 164 | loss_train /= len(train_dl.dataset) 165 | accuracy_train /= len(train_dl.dataset) 166 | 167 | print(f'Epoch {epoch+1} - loss: {loss_train:.4f} - accuracy: {accuracy_train:.4f}') 168 | 169 | 170 | num_epochs = 30 171 | train(model, optimizer, num_epochs, train_dl) 172 | 173 | 174 | test_dl = DataLoader(test_dataset, batch_size, shuffle=False) 175 | 176 | def evaluate_model(model, test_dl): 177 | accuracy_test = 0 178 | with torch.no_grad(): 179 | for x_batch, y_batch in test_dl: 180 | pred = model.cpu()(x_batch) 181 | is_correct = torch.argmax(pred, dim=1) == y_batch 182 | accuracy_test += is_correct.float().sum().item() 183 | 184 | print(f'Accuracy on test set: {100 * accuracy_test / 10000} %') 185 | 186 | evaluate_model(model, test_dl) 187 | 188 | 189 | # ## Visualizing the convolutional filters 190 | 191 | conv3_weight = model.conv3.weight.data 192 | print(conv3_weight.shape) 193 | 194 | 195 | plt.figure(figsize=(10, 10)) 196 | 197 | n_filters = 16 
198 | for i in range(n_filters): 199 | weight = conv3_weight[i].cpu().numpy() 200 | plt.subplot(4, 4, i+1) 201 | plt.xticks([]) 202 | plt.yticks([]) 203 | plt.imshow(weight[0], cmap='gray') 204 | 205 | plt.show() 206 | 207 | 208 | # # Boosting the CNN classifier with data augmentation 209 | 210 | # ## Flipping for data augmentation 211 | 212 | def display_image_greys(image): 213 | npimg = image.numpy() 214 | plt.imshow(np.transpose(npimg, (1, 2, 0)), cmap="Greys") 215 | plt.xticks([]) 216 | plt.yticks([]) 217 | 218 | 219 | image = images[1] 220 | plt.figure(figsize=(8, 8)) 221 | plt.subplot(1, 2, 1) 222 | display_image_greys(image) 223 | 224 | ## flipping (horizontally) 225 | img_flipped = transforms.functional.hflip(image) 226 | plt.subplot(1, 2, 2) 227 | display_image_greys(img_flipped) 228 | 229 | plt.show() 230 | 231 | 232 | 233 | torch.manual_seed(42) 234 | flip_transform = transforms.Compose([transforms.RandomHorizontalFlip()]) 235 | 236 | plt.figure(figsize=(10, 10)) 237 | plt.subplot(1, 4, 1) 238 | display_image_greys(image) 239 | 240 | for i in range(3): 241 | plt.subplot(1, 4, i+2) 242 | img_flip = flip_transform(image) 243 | display_image_greys(img_flip) 244 | 245 | 246 | # ## Rotation for data augmentation 247 | 248 | # rotate 249 | 250 | torch.manual_seed(42) 251 | rotate_transform = transforms.Compose([transforms.RandomRotation(20)]) 252 | 253 | plt.figure(figsize=(10, 10)) 254 | plt.subplot(1, 4, 1) 255 | display_image_greys(image) 256 | 257 | for i in range(3): 258 | plt.subplot(1, 4, i+2) 259 | img_rotate = rotate_transform(image) 260 | display_image_greys(img_rotate) 261 | 262 | 263 | # ## Cropping for data augmentation 264 | 265 | torch.manual_seed(42) 266 | crop_transform = transforms.Compose([ 267 | transforms.RandomResizedCrop(size=(28, 28), scale=(0.7, 1))]) 268 | 269 | plt.figure(figsize=(10, 10)) 270 | plt.subplot(1, 4, 1) 271 | display_image_greys(image) 272 | 273 | for i in range(3): 274 | plt.subplot(1, 4, i+2) 275 | img_crop = crop_transform(image) 276 | display_image_greys(img_crop) 277 | 278 | 279 | 280 | # # Improving the clothing image classifier with data augmentation 281 | 282 | torch.manual_seed(42) 283 | transform_train = transforms.Compose([ 284 | transforms.RandomHorizontalFlip(), 285 | transforms.RandomRotation(10), 286 | transforms.RandomResizedCrop(size=(28, 28), scale=(0.9, 1)), 287 | transforms.ToTensor(), 288 | ]) 289 | 290 | 291 | train_dataset_aug = torchvision.datasets.FashionMNIST(root=image_path, 292 | train=True, 293 | transform=transform_train, 294 | download=False) 295 | 296 | 297 | from torch.utils.data import Subset 298 | train_dataset_aug_small = Subset(train_dataset_aug, torch.arange(500)) 299 | 300 | 301 | train_dl_aug_small = DataLoader(train_dataset_aug_small, batch_size, shuffle=True) 302 | 303 | 304 | model = nn.Sequential() 305 | model.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)) 306 | model.add_module('relu1', nn.ReLU()) 307 | model.add_module('pool1', nn.MaxPool2d(kernel_size=2)) 308 | 309 | model.add_module('conv2', nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)) 310 | model.add_module('relu2', nn.ReLU()) 311 | model.add_module('pool2', nn.MaxPool2d(kernel_size=2)) 312 | 313 | model.add_module('conv3', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)) 314 | model.add_module('relu3', nn.ReLU()) 315 | 316 | model.add_module('flatten', nn.Flatten()) 317 | model.add_module('fc1', nn.Linear(1152, 64)) 318 | model.add_module('relu4', nn.ReLU()) 319 | 320 | 
model.add_module('fc2', nn.Linear(64, 10)) 321 | model.add_module('output', nn.Softmax(dim = 1)) 322 | 323 | model = model.to(device) 324 | 325 | 326 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 327 | train(model, optimizer, 1000, train_dl_aug_small) 328 | 329 | 330 | evaluate_model(model, test_dl) 331 | 332 | 333 | # # Advancing the CNN classifier with transfer learning 334 | 335 | from torchvision.models import resnet18 336 | my_resnet = resnet18(weights='IMAGENET1K_V1') 337 | 338 | 339 | my_resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) 340 | 341 | num_ftrs = my_resnet.fc.in_features 342 | my_resnet.fc = nn.Linear(num_ftrs, 10) 343 | 344 | 345 | my_resnet = my_resnet.to(device) 346 | optimizer = torch.optim.Adam(my_resnet.parameters(), lr=0.001) 347 | train(my_resnet, optimizer, 10, train_dl) 348 | 349 | 350 | evaluate_model(my_resnet, test_dl) 351 | 352 | 353 | # --- 354 | 355 | # Readers may ignore the next cell. 356 | 357 | get_ipython().system('jupyter nbconvert --to python ch11_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 358 | 359 | -------------------------------------------------------------------------------- /ch12/ch12_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Analyzing movie review sentiment with RNNs 11 | 12 | # ## Analyzing and preprocessing the data 13 | 14 | from torchtext.datasets import IMDB 15 | 16 | train_dataset = list(IMDB(split='train')) 17 | test_dataset = list(IMDB(split='test')) 18 | 19 | print(len(train_dataset), len(test_dataset)) 20 | 21 | 22 | # !conda install -c pytorch torchtext -y 23 | 24 | 25 | # !conda install -c conda-forge portalocker -y 26 | 27 | 28 | import re 29 | from collections import Counter, OrderedDict 30 | 31 | def tokenizer(text): 32 | text = re.sub('<[^>]*>', '', text) 33 | emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()) 34 | text = re.sub('[\W]+', ' ', text.lower()) +\ 35 | ' '.join(emoticons).replace('-', '') 36 | tokenized = text.split() 37 | return tokenized 38 | 39 | token_counts = Counter() 40 | train_labels = [] 41 | for label, line in train_dataset: 42 | train_labels.append(label) 43 | tokens = tokenizer(line) 44 | token_counts.update(tokens) 45 | 46 | 47 | print('Vocab-size:', len(token_counts)) 48 | print(Counter(train_labels)) 49 | 50 | 51 | from torchtext.vocab import vocab 52 | 53 | sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True) 54 | ordered_dict = OrderedDict(sorted_by_freq_tuples) 55 | 56 | vocab_mapping = vocab(ordered_dict) 57 | 58 | vocab_mapping.insert_token("", 0) 59 | vocab_mapping.insert_token("", 1) 60 | vocab_mapping.set_default_index(1) 61 | 62 | 63 | print([vocab_mapping[token] for token in ['this', 'is', 'an', 'example']]) 64 | print([vocab_mapping[token] for token in ['this', 'is', 'example2']]) 65 | 66 | 67 | import torch 68 | import torch.nn as nn 69 | 70 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 71 | 72 | text_transform = lambda x: [vocab[token] for token in tokenizer(x)] 73 | 74 | def collate_batch(batch): 75 | label_list, text_list, lengths = [], [], [] 76 | for _label, _text in batch: 77 | 
label_list.append(1. if _label == 2 else 0.) 78 | processed_text = [vocab_mapping[token] for token in tokenizer(_text)] 79 | text_list.append(torch.tensor(processed_text, dtype=torch.int64)) 80 | lengths.append(len(processed_text)) 81 | label_list = torch.tensor(label_list) 82 | lengths = torch.tensor(lengths) 83 | padded_text_list = nn.utils.rnn.pad_sequence( 84 | text_list, batch_first=True) 85 | return padded_text_list.to(device), label_list.to(device), lengths.to(device) 86 | 87 | 88 | # from torch.nn.utils.rnn import pad_sequence 89 | # a = [torch.tensor([11, 7, 35, 462], dtype=torch.int64), torch.tensor([11, 7, 35, 462, 11], dtype=torch.int64)] 90 | # b = [torch.tensor([11, 7, 35], dtype=torch.int64), torch.tensor([11, 7, 35, 462, 11, 12], dtype=torch.int64)] 91 | # # c = torch.ones(1, 15, 300) 92 | # pad_sequence(a, True).size() 93 | 94 | 95 | from torch.utils.data import DataLoader 96 | torch.manual_seed(0) 97 | dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_batch) 98 | text_batch, label_batch, length_batch = next(iter(dataloader)) 99 | print(text_batch) 100 | print(label_batch) 101 | print(length_batch) 102 | print(text_batch.shape) 103 | 104 | 105 | batch_size = 32 106 | 107 | train_dl = DataLoader(train_dataset, batch_size=batch_size, 108 | shuffle=True, collate_fn=collate_batch) 109 | 110 | test_dl = DataLoader(test_dataset, batch_size=batch_size, 111 | shuffle=False, collate_fn=collate_batch) 112 | 113 | 114 | # ## Building a simple LSTM network 115 | 116 | vocab_size = len(vocab_mapping) 117 | embed_dim = 32 118 | rnn_hidden_dim = 50 119 | fc_hidden_dim = 32 120 | 121 | 122 | class RNN(nn.Module): 123 | def __init__(self, vocab_size, embed_dim, rnn_hidden_dim, fc_hidden_dim): 124 | super().__init__() 125 | self.embedding = nn.Embedding(vocab_size, 126 | embed_dim, 127 | padding_idx=0) 128 | self.rnn = nn.LSTM(embed_dim, rnn_hidden_dim, 129 | batch_first=True) 130 | self.fc1 = nn.Linear(rnn_hidden_dim, fc_hidden_dim) 131 | self.relu = nn.ReLU() 132 | self.fc2 = nn.Linear(fc_hidden_dim, 1) 133 | self.sigmoid = nn.Sigmoid() 134 | 135 | def forward(self, text, lengths): 136 | out = self.embedding(text) 137 | out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True) 138 | out, (hidden, cell) = self.rnn(out) 139 | out = hidden[-1, :, :] 140 | out = self.fc1(out) 141 | out = self.relu(out) 142 | out = self.fc2(out) 143 | out = self.sigmoid(out) 144 | return out 145 | 146 | 147 | 148 | model = RNN(vocab_size, embed_dim, rnn_hidden_dim, fc_hidden_dim) 149 | model = model.to(device) 150 | 151 | 152 | loss_fn = nn.BCELoss() 153 | optimizer = torch.optim.Adam(model.parameters(), lr=0.003) 154 | 155 | 156 | def train(model, dataloader, optimizer): 157 | model.train() 158 | total_acc, total_loss = 0, 0 159 | for text_batch, label_batch, length_batch in dataloader: 160 | optimizer.zero_grad() 161 | pred = model(text_batch, length_batch)[:, 0] 162 | loss = loss_fn(pred, label_batch) 163 | loss.backward() 164 | optimizer.step() 165 | total_acc += ((pred>=0.5).float() == label_batch).float().sum().item() 166 | total_loss += loss.item()*label_batch.size(0) 167 | 168 | total_loss /= len(dataloader.dataset) 169 | total_acc /= len(train_dl.dataset) 170 | print(f'Epoch {epoch+1} - loss: {total_loss:.4f} - accuracy: {total_acc:.4f}') 171 | 172 | 173 | 174 | torch.manual_seed(0) 175 | num_epochs = 10 176 | for epoch in range(num_epochs): 177 | train(model, train_dl, optimizer) 178 | 179 | 180 | def 
evaluate(model, dataloader): 181 | model.eval() 182 | total_acc = 0 183 | with torch.no_grad(): 184 | for text_batch, label_batch, lengths in dataloader: 185 | pred = model(text_batch, lengths)[:, 0] 186 | total_acc += ((pred>=0.5).float() == label_batch).float().sum().item() 187 | print(f'Accuracy on test set: {100 * total_acc/len(dataloader.dataset)} %') 188 | 189 | evaluate(model, test_dl) 190 | 191 | 192 | # ## Stacking multiple LSTM layers 193 | 194 | nn.LSTM(embed_dim, rnn_hidden_dim, num_layers=2, batch_first=True) 195 | 196 | 197 | # --- 198 | 199 | # Readers may ignore the next cell. 200 | 201 | get_ipython().system('jupyter nbconvert --to python ch12_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 202 | 203 | -------------------------------------------------------------------------------- /ch12/ch12_part2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d1d4890d-a136-4be7-88ad-cca747947f4e", 6 | "metadata": {}, 7 | "source": [ 8 | "Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)\n", 9 | "\n", 10 | "Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks\n", 11 | "\n", 12 | "Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "851bb88a-6660-4390-9282-bf43e461ac88", 18 | "metadata": {}, 19 | "source": [ 20 | "# Revisiting stock price forecasting with LSTM" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "22828a4c-8789-4f71-9854-987a0629687e", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import torch\n", 32 | "import torch.nn as nn" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "8cfc60ba-9224-4a2c-9bcf-bcee140bae3e", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Reusing the feature generation function we developed\n", 43 | "def generate_features(df):\n", 44 | " \"\"\"\n", 45 | " Generate features for a stock/index based on historical price and performance\n", 46 | " @param df: dataframe with columns \"Open\", \"Close\", \"High\", \"Low\", \"Volume\", \"Adj Close\"\n", 47 | " @return: dataframe, data set with new features\n", 48 | " \"\"\"\n", 49 | " df_new = pd.DataFrame()\n", 50 | " # 6 original features\n", 51 | " df_new['open'] = df['Open']\n", 52 | " df_new['open_1'] = df['Open'].shift(1)\n", 53 | " df_new['close_1'] = df['Close'].shift(1)\n", 54 | " df_new['high_1'] = df['High'].shift(1)\n", 55 | " df_new['low_1'] = df['Low'].shift(1)\n", 56 | " df_new['volume_1'] = df['Volume'].shift(1)\n", 57 | " # # 31 generated features\n", 58 | " # # average price\n", 59 | " df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1)\n", 60 | " df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1)\n", 61 | " df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1)\n", 62 | " df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30']\n", 63 | " df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365']\n", 64 | " df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365']\n", 65 | " # # average volume\n", 66 | " df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1)\n", 67 | " df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1)\n", 68 | " df_new['avg_volume_365'] = 
df['Volume'].rolling(252).mean().shift(1)\n", 69 | " df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30']\n", 70 | " df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365']\n", 71 | " df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365']\n", 72 | " # # standard deviation of prices\n", 73 | " df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1)\n", 74 | " df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1)\n", 75 | " df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1)\n", 76 | " df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30']\n", 77 | " df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365']\n", 78 | " df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365']\n", 79 | " # # standard deviation of volumes\n", 80 | " df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1)\n", 81 | " df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1)\n", 82 | " df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1)\n", 83 | " df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30']\n", 84 | " df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365']\n", 85 | " df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365']\n", 86 | " # # # return\n", 87 | " df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1)\n", 88 | " df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1)\n", 89 | " df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1)\n", 90 | " df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1)\n", 91 | " df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1)\n", 92 | " df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1)\n", 93 | " df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1)\n", 94 | " # the target\n", 95 | " df_new['close'] = df['Close']\n", 96 | " df_new = df_new.dropna(axis=0)\n", 97 | " return df_new" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 3, 103 | "id": "ec9b69f9-82ff-4ac6-a193-29aa84cd4797", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date')\n", 108 | "data = generate_features(data_raw)\n", 109 | "\n", 110 | "start_train = '1990-01-01'\n", 111 | "end_train = '2022-12-31'\n", 112 | "\n", 113 | "start_test = '2023-01-01'\n", 114 | "end_test = '2023-06-30'\n", 115 | "\n", 116 | "data_train = data.loc[start_train:end_train]\n", 117 | "X_train = data_train.drop('close', axis=1).values\n", 118 | "y_train = data_train['close'].values\n", 119 | "\n", 120 | "data_test = data.loc[start_test:end_test]\n", 121 | "X_test = data_test.drop('close', axis=1).values\n", 122 | "y_test = data_test['close'].values" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "id": "433d14fa-5f4c-4036-8314-541d65cce4a6", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from sklearn.preprocessing import StandardScaler\n", 133 | "scaler = StandardScaler()\n", 134 | "\n", 135 | "X_scaled_train = torch.FloatTensor(scaler.fit_transform(X_train))\n", 136 | "X_scaled_test = torch.FloatTensor(scaler.transform(X_test))\n", 137 
| "\n", 138 | "y_train_torch = torch.FloatTensor(y_train)\n", 139 | "y_test_torch = torch.FloatTensor(y_test)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "id": "20dce8e1-2445-44bb-85e0-dbebf16fcbf3", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# Define a function to create sequences\n", 150 | "def create_sequences(data, labels, seq_length):\n", 151 | " sequences = []\n", 152 | " for i in range(len(data) - seq_length):\n", 153 | " seq = data[i:i+seq_length]\n", 154 | " label = labels[i+seq_length-1]\n", 155 | " sequences.append((seq, label))\n", 156 | " return sequences\n", 157 | "\n", 158 | " \n", 159 | "# Create sequences with a sequence length of 5\n", 160 | "seq_length = 5\n", 161 | "sequence_train = create_sequences(X_scaled_train, y_train_torch, seq_length)\n", 162 | "sequence_test = create_sequences(X_scaled_test, y_test_torch, seq_length)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "id": "619613a7-c1c0-4791-b3de-62ef15fb9089", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from torch.utils.data import DataLoader\n", 173 | "torch.manual_seed(0)\n", 174 | "\n", 175 | "batch_size = 128 \n", 176 | "train_dl = DataLoader(sequence_train, batch_size=batch_size,\n", 177 | " shuffle=True)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 7, 183 | "id": "924de558-f9b4-44f3-b3cf-d3248c546954", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "class RNN(nn.Module):\n", 188 | " def __init__(self, input_dim, rnn_hidden_dim, fc_hidden_dim):\n", 189 | " super().__init__()\n", 190 | " self.rnn = nn.LSTM(input_dim, rnn_hidden_dim, 2,\n", 191 | " batch_first=True)\n", 192 | " self.fc1 = nn.Linear(rnn_hidden_dim, fc_hidden_dim)\n", 193 | " self.relu = nn.ReLU()\n", 194 | " self.fc2 = nn.Linear(fc_hidden_dim, 1)\n", 195 | "\n", 196 | " def forward(self, x):\n", 197 | " out, (hidden, cell) = self.rnn(x)\n", 198 | " out = hidden[-1, :, :]\n", 199 | " out = self.fc1(out)\n", 200 | " out = self.relu(out)\n", 201 | " out = self.fc2(out)\n", 202 | " return out\n", 203 | "\n", 204 | "\n", 205 | "torch.manual_seed(42)\n", 206 | "rnn_hidden_dim = 16\n", 207 | "fc_hidden_dim = 16\n", 208 | "model = RNN(X_train.shape[1], rnn_hidden_dim, fc_hidden_dim) \n", 209 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 210 | "model = model.to(device)\n", 211 | "\n", 212 | "\n", 213 | "loss_fn = nn.MSELoss()\n", 214 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.003)\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "id": "ae0b5bf5-53cc-4c0f-8989-a615d3031571", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "def train(model, dataloader, optimizer):\n", 225 | " model.train()\n", 226 | " total_loss = 0\n", 227 | " for seq, label in dataloader:\n", 228 | " optimizer.zero_grad()\n", 229 | " pred = model(seq.to(device))[:, 0]\n", 230 | " loss = loss_fn(pred, label.to(device))\n", 231 | " loss.backward()\n", 232 | " optimizer.step()\n", 233 | " total_loss += loss.item()*label.size(0)\n", 234 | " return total_loss/len(dataloader.dataset)\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "id": "a55cecc0-7081-4615-a783-08ba9d5d09de", 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Epoch 1 - loss: 24797427.9047\n", 248 | "Epoch 101 - loss: 
10503.0117\n", 249 | "Epoch 201 - loss: 3234.3346\n", 250 | "Epoch 301 - loss: 2735.4141\n", 251 | "Epoch 401 - loss: 2297.7157\n", 252 | "Epoch 501 - loss: 2108.5702\n", 253 | "Epoch 601 - loss: 1741.5264\n", 254 | "Epoch 701 - loss: 2798.3159\n", 255 | "Epoch 801 - loss: 1635.2345\n", 256 | "Epoch 901 - loss: 1459.4806\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "\n", 262 | "num_epochs = 1000 \n", 263 | "for epoch in range(num_epochs):\n", 264 | " loss = train(model, train_dl, optimizer)\n", 265 | " if epoch % 100 == 0:\n", 266 | " print(f'Epoch {epoch+1} - loss: {loss:.4f}')" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 10, 272 | "id": "c25679b2-1f3a-45f8-a945-ac02926a922e", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "predictions, y = [], []\n", 277 | " \n", 278 | "for seq, label in sequence_test:\n", 279 | " with torch.no_grad():\n", 280 | " pred = model.cpu()(seq.view(1, seq_length, X_test.shape[1]))[:, 0]\n", 281 | " predictions.append(pred)\n", 282 | " y.append(label)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 11, 288 | "id": "cea9280b-5675-497b-9da0-9d3efb0e13e9", 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "R^2: 0.897\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "from sklearn.metrics import r2_score\n", 301 | "print(f'R^2: {r2_score(y, predictions):.3f}')\n" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "id": "a221aaed-fed8-415d-b291-34ff46884e0b", 307 | "metadata": {}, 308 | "source": [ 309 | "---" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "id": "f86bbbd2-7c6c-419e-898f-8f683f2600d0", 315 | "metadata": {}, 316 | "source": [ 317 | "Readers may ignore the next cell." 
318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 12, 323 | "id": "ad91961b-758c-4cdd-8f90-8126bf40ed5c", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stderr", 328 | "output_type": "stream", 329 | "text": [ 330 | "[NbConvertApp] Converting notebook ch12_part2.ipynb to python\n", 331 | "[NbConvertApp] Writing 7105 bytes to ch12_part2.py\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "!jupyter nbconvert --to python ch12_part2.ipynb --TemplateExporter.exclude_input_prompt=True" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "609dece8-49c6-489d-8085-29d839097116", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [] 346 | } 347 | ], 348 | "metadata": { 349 | "kernelspec": { 350 | "display_name": "Python 3 (ipykernel)", 351 | "language": "python", 352 | "name": "python3" 353 | }, 354 | "language_info": { 355 | "codemirror_mode": { 356 | "name": "ipython", 357 | "version": 3 358 | }, 359 | "file_extension": ".py", 360 | "mimetype": "text/x-python", 361 | "name": "python", 362 | "nbconvert_exporter": "python", 363 | "pygments_lexer": "ipython3", 364 | "version": "3.11.4" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 5 369 | } 370 | -------------------------------------------------------------------------------- /ch12/ch12_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Revisiting stock price forecasting with LSTM 11 | 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | 16 | 17 | # Reusing the feature generation function we developed 18 | def generate_features(df): 19 | """ 20 | Generate features for a stock/index based on historical price and performance 21 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adj Close" 22 | @return: dataframe, data set with new features 23 | """ 24 | df_new = pd.DataFrame() 25 | # 6 original features 26 | df_new['open'] = df['Open'] 27 | df_new['open_1'] = df['Open'].shift(1) 28 | df_new['close_1'] = df['Close'].shift(1) 29 | df_new['high_1'] = df['High'].shift(1) 30 | df_new['low_1'] = df['Low'].shift(1) 31 | df_new['volume_1'] = df['Volume'].shift(1) 32 | # # 31 generated features 33 | # # average price 34 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 35 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 36 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 37 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 38 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 39 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 40 | # # average volume 41 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 42 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 43 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 44 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 45 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 46 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / 
df_new['avg_volume_365'] 47 | # # standard deviation of prices 48 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 49 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 50 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 51 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 52 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 53 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 54 | # # standard deviation of volumes 55 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 56 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 57 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 58 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 59 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 60 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 61 | # # # return 62 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 63 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 64 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 65 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 66 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 67 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 68 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 69 | # the target 70 | df_new['close'] = df['Close'] 71 | df_new = df_new.dropna(axis=0) 72 | return df_new 73 | 74 | 75 | data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date') 76 | data = generate_features(data_raw) 77 | 78 | start_train = '1990-01-01' 79 | end_train = '2022-12-31' 80 | 81 | start_test = '2023-01-01' 82 | end_test = '2023-06-30' 83 | 84 | data_train = data.loc[start_train:end_train] 85 | X_train = data_train.drop('close', axis=1).values 86 | y_train = data_train['close'].values 87 | 88 | data_test = data.loc[start_test:end_test] 89 | X_test = data_test.drop('close', axis=1).values 90 | y_test = data_test['close'].values 91 | 92 | 93 | from sklearn.preprocessing import StandardScaler 94 | scaler = StandardScaler() 95 | 96 | X_scaled_train = torch.FloatTensor(scaler.fit_transform(X_train)) 97 | X_scaled_test = torch.FloatTensor(scaler.transform(X_test)) 98 | 99 | y_train_torch = torch.FloatTensor(y_train) 100 | y_test_torch = torch.FloatTensor(y_test) 101 | 102 | 103 | # Define a function to create sequences 104 | def create_sequences(data, labels, seq_length): 105 | sequences = [] 106 | for i in range(len(data) - seq_length): 107 | seq = data[i:i+seq_length] 108 | label = labels[i+seq_length-1] 109 | sequences.append((seq, label)) 110 | return sequences 111 | 112 | 113 | # Create sequences with a sequence length of 5 114 | seq_length = 5 115 | sequence_train = create_sequences(X_scaled_train, y_train_torch, seq_length) 116 | sequence_test = create_sequences(X_scaled_test, y_test_torch, seq_length) 117 | 118 | 119 | from torch.utils.data import DataLoader 120 | torch.manual_seed(0) 121 | 122 | batch_size = 128 123 | train_dl = DataLoader(sequence_train, batch_size=batch_size, 124 | shuffle=True) 125 | 126 | 127 | class RNN(nn.Module): 128 | def __init__(self, input_dim, rnn_hidden_dim, 
fc_hidden_dim): 129 | super().__init__() 130 | self.rnn = nn.LSTM(input_dim, rnn_hidden_dim, 2, 131 | batch_first=True) 132 | self.fc1 = nn.Linear(rnn_hidden_dim, fc_hidden_dim) 133 | self.relu = nn.ReLU() 134 | self.fc2 = nn.Linear(fc_hidden_dim, 1) 135 | 136 | def forward(self, x): 137 | out, (hidden, cell) = self.rnn(x) 138 | out = hidden[-1, :, :] 139 | out = self.fc1(out) 140 | out = self.relu(out) 141 | out = self.fc2(out) 142 | return out 143 | 144 | 145 | torch.manual_seed(42) 146 | rnn_hidden_dim = 16 147 | fc_hidden_dim = 16 148 | model = RNN(X_train.shape[1], rnn_hidden_dim, fc_hidden_dim) 149 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 150 | model = model.to(device) 151 | 152 | 153 | loss_fn = nn.MSELoss() 154 | optimizer = torch.optim.Adam(model.parameters(), lr=0.003) 155 | 156 | 157 | def train(model, dataloader, optimizer): 158 | model.train() 159 | total_loss = 0 160 | for seq, label in dataloader: 161 | optimizer.zero_grad() 162 | pred = model(seq.to(device))[:, 0] 163 | loss = loss_fn(pred, label.to(device)) 164 | loss.backward() 165 | optimizer.step() 166 | total_loss += loss.item()*label.size(0) 167 | return total_loss/len(dataloader.dataset) 168 | 169 | 170 | num_epochs = 1000 171 | for epoch in range(num_epochs): 172 | loss = train(model, train_dl, optimizer) 173 | if epoch % 100 == 0: 174 | print(f'Epoch {epoch+1} - loss: {loss:.4f}') 175 | 176 | 177 | predictions, y = [], [] 178 | 179 | for seq, label in sequence_test: 180 | with torch.no_grad(): 181 | pred = model.cpu()(seq.view(1, seq_length, X_test.shape[1]))[:, 0] 182 | predictions.append(pred) 183 | y.append(label) 184 | 185 | 186 | from sklearn.metrics import r2_score 187 | print(f'R^2: {r2_score(y, predictions):.3f}') 188 | 189 | 190 | # --- 191 | 192 | # Readers may ignore the next cell. 
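
# A minimal optional sketch: visualizing the LSTM forecasts against the actual
# closing prices for the test period, as a complement to the R^2 score above.
# It assumes the `predictions` and `y` lists built above are available and that
# matplotlib is installed.

import matplotlib.pyplot as plt

pred_values = [p.item() for p in predictions]
true_values = [t.item() for t in y]

plt.plot(true_values, label='Actual close')
plt.plot(pred_values, label='Predicted close')
plt.xlabel('Test day (after the first seq_length days)')
plt.ylabel('Price')
plt.legend()
plt.show()
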
193 | 194 | get_ipython().system('jupyter nbconvert --to python ch12_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /ch12/ch12_part3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 12 Making Predictions with Sequences Using Recurrent Neural Networks 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Writing your own War and Peace with RNNs 11 | 12 | # ## Acquiring and analyzing the training data 13 | 14 | with open('warpeace_input.txt', 'r', encoding="utf8") as fp: 15 | raw_text = fp.read() 16 | raw_text = raw_text.lower() 17 | 18 | 19 | print(raw_text[:200]) 20 | 21 | 22 | all_words = raw_text.split() 23 | unique_words = list(set(all_words)) 24 | print(f'Number of unique words: {len(unique_words)}') 25 | 26 | 27 | n_chars = len(raw_text) 28 | print(f'Total characters: {n_chars}') 29 | 30 | 31 | chars = sorted(list(set(raw_text))) 32 | vocab_size = len(chars) 33 | print(f'Total vocabulary (unique characters): {vocab_size}') 34 | print(chars) 35 | 36 | 37 | # ## Constructing the training set for the RNN text generator 38 | 39 | index_to_char = dict((i, c) for i, c in enumerate(chars)) 40 | char_to_index = dict((c, i) for i, c in enumerate(chars)) 41 | print(char_to_index) 42 | 43 | 44 | import numpy as np 45 | text_encoded = np.array( 46 | [char_to_index[ch] for ch in raw_text], 47 | dtype=np.int32) 48 | 49 | 50 | seq_length = 40 51 | chunk_size = seq_length + 1 52 | 53 | text_chunks = np.array([text_encoded[i:i+chunk_size] 54 | for i in range(len(text_encoded)-chunk_size+1)]) 55 | 56 | 57 | import torch 58 | from torch.utils.data import Dataset 59 | 60 | class SeqDataset(Dataset): 61 | def __init__(self, text_chunks): 62 | self.text_chunks = text_chunks 63 | 64 | def __len__(self): 65 | return len(self.text_chunks) 66 | 67 | def __getitem__(self, idx): 68 | text_chunk = self.text_chunks[idx] 69 | return text_chunk[:-1].long(), text_chunk[1:].long() 70 | 71 | seq_dataset = SeqDataset(torch.from_numpy(text_chunks)) 72 | 73 | 74 | from torch.utils.data import DataLoader 75 | 76 | batch_size = 64 77 | 78 | torch.manual_seed(0) 79 | seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True) 80 | 81 | 82 | # ## Building and Training an RNN text generator 83 | 84 | import torch.nn as nn 85 | 86 | class RNN(nn.Module): 87 | def __init__(self, vocab_size, embed_dim, rnn_hidden_dim): 88 | super().__init__() 89 | self.embedding = nn.Embedding(vocab_size, embed_dim) 90 | self.rnn_hidden_dim = rnn_hidden_dim 91 | self.rnn = nn.LSTM(embed_dim, rnn_hidden_dim, 92 | batch_first=True) 93 | self.fc = nn.Linear(rnn_hidden_dim, vocab_size) 94 | 95 | def forward(self, x, hidden, cell): 96 | out = self.embedding(x).unsqueeze(1) 97 | out, (hidden, cell) = self.rnn(out, (hidden, cell)) 98 | out = self.fc(out).reshape(out.size(0), -1) 99 | return out, hidden, cell 100 | 101 | def init_hidden(self, batch_size): 102 | hidden = torch.zeros(1, batch_size, self.rnn_hidden_dim) 103 | cell = torch.zeros(1, batch_size, self.rnn_hidden_dim) 104 | return hidden, cell 105 | 106 | 107 | embed_dim = 256 108 | rnn_hidden_dim = 512 109 | 110 | torch.manual_seed(0) 111 | model = RNN(vocab_size, embed_dim, rnn_hidden_dim) 112 | 113 | device = torch.device("cuda" if 
torch.cuda.is_available() else "cpu") 114 | model = model.to(device) 115 | model 116 | 117 | 118 | loss_fn = nn.CrossEntropyLoss() 119 | optimizer = torch.optim.Adam(model.parameters(), lr=0.003) 120 | 121 | 122 | num_epochs = 10000 123 | 124 | torch.manual_seed(0) 125 | 126 | for epoch in range(num_epochs): 127 | hidden, cell = model.init_hidden(batch_size) 128 | seq_batch, target_batch = next(iter(seq_dl)) 129 | seq_batch = seq_batch.to(device) 130 | target_batch = target_batch.to(device) 131 | optimizer.zero_grad() 132 | loss = 0 133 | for c in range(seq_length): 134 | pred, hidden, cell = model(seq_batch[:, c], hidden.to(device), cell.to(device)) 135 | loss += loss_fn(pred, target_batch[:, c]) 136 | loss.backward() 137 | optimizer.step() 138 | loss = loss.item()/seq_length 139 | if epoch % 500 == 0: 140 | print(f'Epoch {epoch} - loss: {loss:.4f}') 141 | 142 | 143 | from torch.distributions.categorical import Categorical 144 | 145 | def generate_text(model, starting_str, len_generated_text=500): 146 | encoded_input = torch.tensor([char_to_index[s] for s in starting_str]) 147 | encoded_input = torch.reshape(encoded_input, (1, -1)) 148 | 149 | generated_str = starting_str 150 | 151 | model.eval() 152 | 153 | hidden, cell = model.init_hidden(1) 154 | for c in range(len(starting_str)-1): 155 | _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 156 | 157 | last_char = encoded_input[:, -1] 158 | for _ in range(len_generated_text): 159 | logits, hidden, cell = model(last_char.view(1), hidden, cell) 160 | logits = torch.squeeze(logits, 0) 161 | last_char = Categorical(logits=logits).sample() 162 | generated_str += str(index_to_char[last_char.item()]) 163 | 164 | return generated_str 165 | 166 | 167 | model.to('cpu') 168 | torch.manual_seed(0) 169 | print(generate_text(model, 'the emperor', 500)) 170 | 171 | 172 | # --- 173 | 174 | # Readers may ignore the next cell. 
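
# A minimal optional sketch: controlling the randomness of the character
# sampling above with a temperature factor. Dividing the logits by a value
# below 1 makes the Categorical distribution more peaked (more conservative
# text), while values above 1 make it more random. It reuses the trained
# `model`, `char_to_index`, and `index_to_char` defined above; `temperature`
# is an illustrative parameter, not something defined in the original script.

def generate_text_with_temperature(model, starting_str, temperature=0.7,
                                   len_generated_text=200):
    encoded_input = torch.tensor([char_to_index[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    # Feed the prompt through the network one character at a time
    for c in range(len(starting_str) - 1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    last_char = encoded_input[:, -1]
    for _ in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        # Scale the logits before sampling the next character
        logits = torch.squeeze(logits, 0) / temperature
        last_char = Categorical(logits=logits).sample()
        generated_str += str(index_to_char[last_char.item()])

    return generated_str


print(generate_text_with_temperature(model, 'the emperor', temperature=0.5))
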
175 | 176 | get_ipython().system('jupyter nbconvert --to python ch12_part3.ipynb --TemplateExporter.exclude_input_prompt=True') 177 | 178 | -------------------------------------------------------------------------------- /ch13/ch13_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 13 Advancing language understanding and Generation with the Transformer models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Understanding self-attention 11 | 12 | import torch 13 | 14 | sentence = torch.tensor( 15 | [0, # python 16 | 8, # machine 17 | 1, # learning 18 | 6, # by 19 | 2] # example 20 | ) 21 | 22 | sentence 23 | 24 | 25 | torch.manual_seed(0) 26 | embed = torch.nn.Embedding(10, 16) 27 | sentence_embed = embed(sentence).detach() 28 | 29 | 30 | sentence_embed 31 | 32 | 33 | d = sentence_embed.shape[1] 34 | w_key = torch.rand(d, d) 35 | w_query = torch.rand(d, d) 36 | w_value = torch.rand(d, d) 37 | 38 | 39 | token1_embed = sentence_embed[0] 40 | key_1 = w_key.matmul(token1_embed) 41 | query_1 = w_query.matmul(token1_embed) 42 | value_1 = w_value.matmul(token1_embed) 43 | 44 | 45 | key_1 46 | 47 | 48 | keys = sentence_embed.matmul(w_key.T) 49 | 50 | 51 | keys[0] 52 | 53 | 54 | values = sentence_embed.matmul(w_value.T) 55 | 56 | 57 | import torch.nn.functional as F 58 | a1 = F.softmax(query_1.matmul(keys.T) / d ** 0.5, dim=0) 59 | 60 | 61 | a1 62 | 63 | 64 | z1 = a1.matmul(values) 65 | z1 66 | 67 | 68 | # # Improving sentiment analysis with BERT and Transformers 69 | 70 | # ## Fine-tuning a pre-trained BERT model for sentiment Analysis 71 | 72 | from torchtext.datasets import IMDB 73 | 74 | train_dataset = list(IMDB(split='train')) 75 | test_dataset = list(IMDB(split='test')) 76 | 77 | print(len(train_dataset), len(test_dataset)) 78 | 79 | 80 | train_texts = [train_sample[1] for train_sample in train_dataset] 81 | train_labels = [train_sample[0] for train_sample in train_dataset] 82 | 83 | test_texts = [test_sample[1] for test_sample in test_dataset] 84 | test_labels = [test_sample[0] for test_sample in test_dataset] 85 | 86 | 87 | import transformers 88 | from transformers import DistilBertTokenizerFast 89 | 90 | # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') 91 | tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', local_files_only=True) 92 | 93 | 94 | train_encodings = tokenizer(train_texts, truncation=True, padding=True) 95 | test_encodings = tokenizer(test_texts, truncation=True, padding=True) 96 | 97 | 98 | train_encodings[0] 99 | 100 | 101 | class IMDbDataset(torch.utils.data.Dataset): 102 | def __init__(self, encodings, labels): 103 | self.encodings = encodings 104 | self.labels = labels 105 | 106 | def __getitem__(self, idx): 107 | item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} 108 | item['labels'] = torch.tensor([0., 1.] 
if self.labels[idx] == 2 else [1., 0.]) 109 | return item 110 | 111 | def __len__(self): 112 | return len(self.labels) 113 | 114 | 115 | train_encoded_dataset = IMDbDataset(train_encodings, train_labels) 116 | test_encoded_dataset = IMDbDataset(test_encodings, test_labels) 117 | 118 | 119 | batch_size = 32 120 | train_dl = torch.utils.data.DataLoader(train_encoded_dataset, batch_size=batch_size, shuffle=True) 121 | test_dl = torch.utils.data.DataLoader(test_encoded_dataset, batch_size=batch_size, shuffle=False) 122 | 123 | 124 | from transformers import DistilBertForSequenceClassification 125 | 126 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 127 | 128 | model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', local_files_only=True) 129 | model.to(device) 130 | 131 | 132 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 133 | 134 | 135 | def train(model, dataloader, optimizer): 136 | model.train() 137 | total_loss = 0 138 | for batch in dataloader: 139 | optimizer.zero_grad() 140 | 141 | input_ids = batch['input_ids'].to(device) 142 | attention_mask = batch['attention_mask'].to(device) 143 | labels = batch['labels'].to(device) 144 | outputs = model(input_ids, attention_mask=attention_mask, labels=labels) 145 | loss = outputs['loss'] 146 | 147 | optimizer.zero_grad() 148 | loss.backward() 149 | optimizer.step() 150 | 151 | total_loss += loss.item()*len(batch) 152 | 153 | return total_loss/len(dataloader.dataset) 154 | 155 | 156 | 157 | def evaluate(model, dataloader): 158 | model.eval() 159 | total_acc = 0 160 | with torch.no_grad(): 161 | for batch in dataloader: 162 | 163 | input_ids = batch['input_ids'].to(device) 164 | attention_mask = batch['attention_mask'].to(device) 165 | labels = batch['labels'].to(device) 166 | outputs = model(input_ids, attention_mask=attention_mask) 167 | logits = outputs['logits'] 168 | 169 | pred = torch.argmax(logits, 1) 170 | total_acc += (pred == torch.argmax(labels, 1)).float().sum().item() 171 | 172 | return total_acc/len(dataloader.dataset) 173 | 174 | 175 | 176 | torch.manual_seed(0) 177 | num_epochs = 1 178 | for epoch in range(num_epochs): 179 | train_loss = train(model, train_dl, optimizer) 180 | train_acc = evaluate(model, train_dl) 181 | print(f'Epoch {epoch+1} - loss: {train_loss:.4f} - accuracy: {train_acc:.4f}') 182 | 183 | 184 | test_acc = evaluate(model, test_dl) 185 | print(f'Accuracy on test set: {100 * test_acc:.2f} %') 186 | 187 | 188 | # torch.cuda.mem_get_info() 189 | 190 | 191 | # torch.cuda.empty_cache() 192 | 193 | 194 | # free up memory 195 | del model 196 | 197 | 198 | # ## Using the Trainer API to train Transformer models 199 | 200 | model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', local_files_only=True) 201 | model.to(device) 202 | 203 | optim = torch.optim.Adam(model.parameters(), lr=5e-5) 204 | 205 | 206 | # !conda install -c conda-forge accelerate -y 207 | 208 | 209 | from transformers import Trainer, TrainingArguments 210 | 211 | training_args = TrainingArguments( 212 | output_dir='./results', 213 | num_train_epochs=1, 214 | per_device_train_batch_size=32, 215 | logging_dir='./logs', 216 | logging_steps=50, 217 | ) 218 | 219 | 220 | # trainer = Trainer( 221 | # model=model, 222 | # args=training_args, 223 | # train_dataset=train_encoded_dataset, 224 | # optimizers=(optim, None) 225 | # ) 226 | 227 | 228 | from datasets import load_metric 229 | import numpy as np 230 | 231 | metric = load_metric("accuracy") 232 | 233 | def 
compute_metrics(eval_pred): 234 | logits, labels = eval_pred 235 | pred = np.argmax(logits, axis=-1) 236 | return metric.compute(predictions=pred, references=np.argmax(labels, 1)) 237 | 238 | 239 | trainer = Trainer( 240 | model=model, 241 | compute_metrics=compute_metrics, 242 | args=training_args, 243 | train_dataset=train_encoded_dataset, 244 | eval_dataset=test_encoded_dataset, 245 | optimizers=(optim, None) 246 | ) 247 | 248 | 249 | trainer.train() 250 | 251 | 252 | print(trainer.evaluate()) 253 | 254 | 255 | # --- 256 | 257 | # Readers may ignore the next cell. 258 | 259 | get_ipython().system('jupyter nbconvert --to python ch13_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 260 | 261 | -------------------------------------------------------------------------------- /ch13/ch13_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 13 Advancing language understanding and Generation with the Transformer models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Generating text using GPT 11 | 12 | # ## Writing your own War and Peace with GPT 13 | 14 | from transformers import pipeline, set_seed 15 | 16 | generator = pipeline('text-generation', model='gpt2') 17 | set_seed(0) 18 | generator("I love machine learning", 19 | max_length=20, 20 | num_return_sequences=3) 21 | 22 | 23 | from transformers import TextDataset, GPT2Tokenizer 24 | 25 | # tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 26 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2', local_files_only=True) 27 | 28 | 29 | text_dataset = TextDataset(tokenizer=tokenizer, file_path='warpeace_input.txt', block_size=128) 30 | 31 | 32 | len(text_dataset) 33 | 34 | 35 | from transformers import DataCollatorForLanguageModeling 36 | data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) 37 | 38 | 39 | import torch 40 | from transformers import GPT2LMHeadModel 41 | model = GPT2LMHeadModel.from_pretrained('gpt2') 42 | 43 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 44 | model.to(device) 45 | 46 | 47 | optim = torch.optim.Adam(model.parameters(), lr=5e-5) 48 | 49 | 50 | from transformers import Trainer, TrainingArguments 51 | 52 | training_args = TrainingArguments( 53 | output_dir='./gpt_results', 54 | num_train_epochs=20, 55 | per_device_train_batch_size=16, 56 | logging_dir='./gpt_logs', 57 | save_total_limit=1, 58 | logging_steps=500, 59 | ) 60 | 61 | 62 | trainer = Trainer( 63 | model=model, 64 | args=training_args, 65 | data_collator=data_collator, 66 | train_dataset=text_dataset, 67 | optimizers=(optim, None) 68 | ) 69 | 70 | 71 | trainer.train() 72 | 73 | 74 | def generate_text(prompt_text, model, tokenizer, max_length): 75 | input_ids = tokenizer.encode(prompt_text, return_tensors="pt").to(device) 76 | 77 | # Generate response 78 | output_sequences = model.generate( 79 | input_ids=input_ids, 80 | max_length=max_length, 81 | num_return_sequences=1, 82 | no_repeat_ngram_size=2, 83 | top_p=0.9, 84 | ) 85 | 86 | # Decode the generated responses 87 | responses = [] 88 | for response_id in output_sequences: 89 | response = tokenizer.decode(response_id, skip_special_tokens=True) 90 | responses.append(response) 91 | 92 | return responses 93 | 94 | 95 | prompt_text = "the emperor" 96 | responses = generate_text(prompt_text, model, tokenizer, 100) 97 | 98 | for 
response in responses: 99 | print(response) 100 | 101 | 102 | # --- 103 | 104 | # Readers may ignore the next cell. 105 | 106 | get_ipython().system('jupyter nbconvert --to python ch13_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 107 | 108 | -------------------------------------------------------------------------------- /ch14/ch14_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 14 Building an Image Search Engine Using Multimodal Models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Getting started with the dataset 11 | 12 | # ## Loading the Flickr8k dataset 13 | 14 | import os 15 | from PIL import Image 16 | import torch 17 | from torch.utils.data import Dataset, DataLoader 18 | import torchvision.transforms as transforms 19 | 20 | 21 | image_dir = "flickr8k/Flicker8k_Dataset" 22 | caption_file = "flickr8k/captions.txt" 23 | 24 | 25 | from transformers import DistilBertTokenizer 26 | tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 27 | 28 | 29 | class Flickr8kDataset(Dataset): 30 | def __init__(self, image_dir, caption_file): 31 | self.image_dir = image_dir 32 | self.transform = transforms.Compose([ 33 | transforms.Resize((224, 224)), 34 | transforms.ToTensor(), 35 | ]) 36 | self.image_paths, self.captions = self.read_caption_file(caption_file) 37 | 38 | def read_caption_file(self, caption_file): 39 | image_paths = [] 40 | captions = [] 41 | 42 | with open(caption_file, "r") as file: 43 | lines = file.readlines() 44 | for line in lines[1:]: 45 | parts = line.strip().split(",") 46 | image_paths.append(os.path.join(self.image_dir, parts[0])) 47 | captions.append(parts[1]) 48 | 49 | self.caption_encodings = tokenizer(captions, truncation=True, padding=True, max_length=200) 50 | 51 | return image_paths, captions 52 | 53 | def __len__(self): 54 | return len(self.image_paths) 55 | 56 | def __getitem__(self, idx): 57 | item = {key: torch.tensor(val[idx]) for key, val in self.caption_encodings.items()} 58 | 59 | caption = self.captions[idx] 60 | item["caption"] = caption 61 | 62 | img_path = self.image_paths[idx] 63 | img = Image.open(img_path).convert("RGB") 64 | img = self.transform(img) 65 | item['image'] = img 66 | 67 | return item 68 | 69 | 70 | flickr8k_dataset = Flickr8kDataset(image_dir=image_dir, caption_file=caption_file) 71 | 72 | 73 | item_sample = next(iter(flickr8k_dataset)) 74 | 75 | 76 | item_sample 77 | 78 | 79 | import matplotlib.pyplot as plt 80 | import numpy as np 81 | 82 | npimg = item_sample['image'].numpy() 83 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 84 | 85 | 86 | 87 | torch.manual_seed(0) 88 | batch_size = 32 89 | data_loader = DataLoader(flickr8k_dataset, batch_size=batch_size, shuffle=True) 90 | 91 | 92 | # # Architecting the CLIP model 93 | 94 | # ## Vision encoder 95 | 96 | import torch.nn as nn 97 | from torchvision.models import resnet50 98 | 99 | class VisionEncoder(nn.Module): 100 | def __init__(self): 101 | super().__init__() 102 | pretrained_resnet50 = resnet50(pretrained=True) 103 | self.model = nn.Sequential(*list(pretrained_resnet50.children())[:-1]) 104 | 105 | for param in self.model.parameters(): 106 | param.requires_grad = False 107 | 108 | def forward(self, x): 109 | x= self.model(x) 110 | x = x.view(x.size(0), -1) 111 | return x 112 | 113 | 114 | 115 | # ## Text encoder 116 | 117 | 
from transformers import DistilBertModel 118 | 119 | 120 | class TextEncoder(nn.Module): 121 | def __init__(self): 122 | super().__init__() 123 | self.model = DistilBertModel.from_pretrained('distilbert-base-uncased') 124 | for param in self.model.parameters(): 125 | param.requires_grad = False 126 | 127 | def forward(self, input_ids, attention_mask=None): 128 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 129 | return outputs.last_hidden_state[:, 0, :] 130 | 131 | 132 | # ## Projection head for contractive learning 133 | 134 | class ProjectionHead(nn.Module): 135 | def __init__(self, embedding_dim, projection_dim=256, dropout=0.1): 136 | super().__init__() 137 | self.projection = nn.Linear(embedding_dim, projection_dim) 138 | self.gelu = nn.GELU() 139 | self.fc = nn.Linear(projection_dim, projection_dim) 140 | self.dropout = nn.Dropout(dropout) 141 | self.layer_norm = nn.LayerNorm(projection_dim) 142 | 143 | def forward(self, x): 144 | projection = self.projection(x) 145 | x = self.gelu(projection) 146 | x = self.fc(x) 147 | x = self.dropout(x) 148 | x = projection + x 149 | x = self.layer_norm(x) 150 | return x 151 | 152 | 153 | 154 | # ## CLIP model 155 | 156 | import torch.nn.functional as F 157 | 158 | class CLIPModel(nn.Module): 159 | def __init__(self, image_embedding=2048, text_embedding=768): 160 | super().__init__() 161 | self.vision_encoder = VisionEncoder() 162 | self.text_encoder = TextEncoder() 163 | self.image_projection = ProjectionHead(embedding_dim=image_embedding) 164 | self.text_projection = ProjectionHead(embedding_dim=text_embedding) 165 | 166 | def forward(self, batch): 167 | image_features = self.vision_encoder(batch["image"]) 168 | text_features = self.text_encoder( 169 | input_ids=batch["input_ids"], attention_mask=batch["attention_mask"] 170 | ) 171 | image_embeddings = self.image_projection(image_features) 172 | text_embeddings = self.text_projection(text_features) 173 | 174 | logits = text_embeddings @ image_embeddings.T 175 | images_similarity = image_embeddings @ image_embeddings.T 176 | texts_similarity = text_embeddings @ text_embeddings.T 177 | targets = F.softmax((images_similarity + texts_similarity)/2 , dim=-1) 178 | texts_loss = F.cross_entropy(logits, targets) 179 | images_loss = F.cross_entropy(logits.T, targets.T) 180 | loss = (images_loss + texts_loss) / 2 181 | return loss.mean() 182 | 183 | 184 | 185 | # # Finding images with words 186 | 187 | # ## Training the CLIP model 188 | 189 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 190 | 191 | 192 | model = CLIPModel().to(device) 193 | 194 | 195 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 196 | 197 | 198 | def train(model, dataloader, optimizer): 199 | model.train() 200 | total_loss = 0 201 | b = 0 202 | for batch in dataloader: 203 | optimizer.zero_grad() 204 | batch = {k: v.to(device) for k, v in batch.items() if k != "caption"} 205 | loss = model(batch) 206 | optimizer.zero_grad() 207 | loss.backward() 208 | optimizer.step() 209 | 210 | total_loss += loss.item()*len(batch) 211 | 212 | return total_loss/len(dataloader.dataset) 213 | 214 | 215 | num_epochs = 3 216 | for epoch in range(num_epochs): 217 | train_loss = train(model, data_loader, optimizer) 218 | print(f'Epoch {epoch+1} - loss: {train_loss:.4f}') 219 | 220 | 221 | # ## Obtaining embeddings for images and text to identify matches 222 | 223 | torch.manual_seed(0) 224 | data_loader = DataLoader(flickr8k_dataset, batch_size=batch_size, shuffle=True) 225 | sample_batch = 
next(iter(data_loader)) 226 | 227 | 228 | batch_image_features = model.vision_encoder(sample_batch["image"].to(device)) 229 | batch_image_embeddings = model.image_projection(batch_image_features) 230 | 231 | 232 | 233 | def search_top_images(model, image_embeddings, query, n=1): 234 | encoded_query = tokenizer([query]) 235 | batch = { 236 | key: torch.tensor(values).to(device) 237 | for key, values in encoded_query.items() 238 | } 239 | model.eval() 240 | with torch.no_grad(): 241 | text_features = model.text_encoder( 242 | input_ids=batch["input_ids"], 243 | attention_mask=batch["attention_mask"]) 244 | text_embeddings = model.text_projection(text_features) 245 | 246 | dot_similarity = text_embeddings @ image_embeddings.T 247 | values, indices = torch.topk(dot_similarity.squeeze(0), n) 248 | return indices 249 | 250 | 251 | 252 | query = "a running dog" 253 | top_image_ids = search_top_images(model, batch_image_embeddings, query, 2) 254 | for id in top_image_ids: 255 | image = sample_batch["image"][id] 256 | npimg = image.numpy() 257 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 258 | plt.title(f"Query: {query}") 259 | plt.show() 260 | 261 | 262 | query = "kids jumping into a pool" 263 | top_image_ids = search_top_images(model, batch_image_embeddings, query, 1) 264 | for id in top_image_ids: 265 | image = sample_batch["image"][id] 266 | npimg = image.numpy() 267 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 268 | plt.title(f"Query: {query}") 269 | plt.show() 270 | 271 | 272 | 273 | # --- 274 | 275 | # Readers may ignore the next cell. 276 | 277 | get_ipython().system('jupyter nbconvert --to python ch14_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 278 | 279 | -------------------------------------------------------------------------------- /ch14/ch14_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 14 Building an Image Search Engine Using Multimodal Models 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Finding images with words 11 | 12 | # ## Image search using the pre-trained CLIP model 13 | 14 | from sentence_transformers import SentenceTransformer, util 15 | model = SentenceTransformer('clip-ViT-B-32') 16 | 17 | 18 | import os 19 | import glob 20 | from PIL import Image 21 | import torch 22 | 23 | 24 | image_paths = list(glob.glob('flickr8k/Flicker8k_Dataset/*.jpg')) 25 | 26 | all_image_embeddings = [] 27 | for img_path in image_paths: 28 | img = Image.open(img_path) 29 | all_image_embeddings.append(model.encode(img, convert_to_tensor=True)) 30 | 31 | 32 | 33 | import matplotlib.pyplot as plt 34 | 35 | 36 | def search_top_images(model, image_embeddings, query, top_k=1): 37 | query_embeddings = model.encode([query], convert_to_tensor=True, show_progress_bar=False) 38 | hits = util.semantic_search(query_embeddings, image_embeddings, top_k=top_k)[0] 39 | return hits 40 | 41 | 42 | query = "a swimming dog" 43 | hits = search_top_images(model, all_image_embeddings, query) 44 | 45 | for hit in hits: 46 | img_path = image_paths[hit['corpus_id']] 47 | image = Image.open(img_path) 48 | plt.imshow(image) 49 | plt.title(f"Query: {query}") 50 | plt.show() 51 | 52 | 53 | image_query = Image.open("flickr8k/Flicker8k_Dataset/240696675_7d05193aa0.jpg") 54 | hits = search_top_images(model, all_image_embeddings, image_query, 3)[1:] 55 | 56 | plt.imshow(image_query) 57 
| plt.title(f"Query image") 58 | plt.show() 59 | 60 | for hit in hits: 61 | img_path = image_paths[hit['corpus_id']] 62 | image = Image.open(img_path) 63 | plt.imshow(image) 64 | plt.title(f"Similar image") 65 | plt.show() 66 | 67 | 68 | # ## Zero-shot classification 69 | 70 | from torchvision.datasets import CIFAR100 71 | cifar100 = CIFAR100(root="CIFAR100", download=True, train=False) 72 | 73 | 74 | print(cifar100.classes) 75 | print("Number of classes in CIFAR100 dataset:", len(cifar100.classes)) 76 | 77 | 78 | sample_index = 0 79 | img, class_id = cifar100[sample_index] 80 | print(f"Class of the sample image: {class_id} - {cifar100.classes[class_id]}") 81 | 82 | 83 | sample_image_embeddings = model.encode(img, convert_to_tensor=True) 84 | 85 | 86 | class_text = model.encode(cifar100.classes, convert_to_tensor=True) 87 | 88 | 89 | hits = util.semantic_search(sample_image_embeddings, class_text, top_k=1)[0] 90 | pred = hits[0]['corpus_id'] 91 | print(f"Predicted class of the sample image: {pred}") 92 | 93 | 94 | all_image_embeddings = [] 95 | class_true = [] 96 | for img, class_id in cifar100: 97 | class_true.append(class_id) 98 | all_image_embeddings.append(model.encode(img, convert_to_tensor=True)) 99 | 100 | 101 | class_pred = [] 102 | for hit in util.semantic_search(all_image_embeddings, class_text, top_k=1): 103 | class_pred.append(hit[0]['corpus_id']) 104 | 105 | 106 | from sklearn.metrics import accuracy_score 107 | acc = accuracy_score(class_true, class_pred) 108 | print(f"Accuracy of zero-shot classification: {acc * 100}%") 109 | 110 | 111 | # --- 112 | 113 | # Readers may ignore the next cell. 114 | 115 | get_ipython().system('jupyter nbconvert --to python ch14_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /ch15/ch15_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 15 Making Decisions in Complex Environments with Reinforcement Learning 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Setting up the working environment 11 | 12 | # ## Installing OpenAI Gym 13 | 14 | import gymnasium as gym 15 | print(gym.envs.registry.keys()) 16 | 17 | 18 | # # Solving the FrozenLake environment with dynamic programming 19 | 20 | # ## Simulating the FrozenLake environment 21 | 22 | env = gym.make("FrozenLake-v1", render_mode="rgb_array") 23 | 24 | n_state = env.observation_space.n 25 | print(n_state) 26 | n_action = env.action_space.n 27 | print(n_action) 28 | 29 | 30 | env.reset(seed=0) 31 | 32 | 33 | import matplotlib.pyplot as plt 34 | plt.imshow(env.render()) 35 | 36 | 37 | new_state, reward, terminated, truncated, info = env.step(2) 38 | is_done = terminated or truncated 39 | 40 | env.render() 41 | print(new_state) 42 | print(reward) 43 | print(is_done) 44 | print(info) 45 | 46 | 47 | plt.imshow(env.render()) 48 | 49 | 50 | def run_episode(env, policy): 51 | state, _ = env.reset() 52 | total_reward = 0 53 | is_done = False 54 | while not is_done: 55 | action = policy[state].item() 56 | state, reward, terminated, truncated, info = env.step(action) 57 | is_done = terminated or truncated 58 | total_reward += reward 59 | if is_done: 60 | break 61 | return total_reward 62 | 63 | 64 | import torch 65 | 66 | n_episode = 1000 67 | 68 | total_rewards = [] 
69 | for episode in range(n_episode): 70 | random_policy = torch.randint(high=n_action, size=(n_state,)) 71 | total_reward = run_episode(env, random_policy) 72 | total_rewards.append(total_reward) 73 | 74 | print(f'Average total reward under random policy: {sum(total_rewards)/n_episode}') 75 | 76 | 77 | print(env.env.P[6]) 78 | 79 | 80 | # ## Solving FrozenLake with the value iteration algorithm 81 | 82 | def value_iteration(env, gamma, threshold): 83 | """ 84 | Solve a given environment with value iteration algorithm 85 | @param env: Gymnasium environment 86 | @param gamma: discount factor 87 | @param threshold: the evaluation will stop once values for all states are less than the threshold 88 | @return: values of the optimal policy for the given environment 89 | """ 90 | n_state = env.observation_space.n 91 | n_action = env.action_space.n 92 | V = torch.zeros(n_state) 93 | while True: 94 | V_temp = torch.empty(n_state) 95 | for state in range(n_state): 96 | v_actions = torch.zeros(n_action) 97 | for action in range(n_action): 98 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 99 | v_actions[action] += trans_prob * (reward + gamma * V[new_state]) 100 | V_temp[state] = torch.max(v_actions) 101 | max_delta = torch.max(torch.abs(V - V_temp)) 102 | V = V_temp.clone() 103 | if max_delta <= threshold: 104 | break 105 | return V 106 | 107 | 108 | gamma = 0.99 109 | threshold = 0.0001 110 | 111 | V_optimal = value_iteration(env, gamma, threshold) 112 | print('Optimal values:\n', V_optimal) 113 | 114 | 115 | 116 | def extract_optimal_policy(env, V_optimal, gamma): 117 | """ 118 | Obtain the optimal policy based on the optimal values 119 | @param env: Gymnasium environment 120 | @param V_optimal: optimal values 121 | @param gamma: discount factor 122 | @return: optimal policy 123 | """ 124 | n_state = env.observation_space.n 125 | n_action = env.action_space.n 126 | optimal_policy = torch.zeros(n_state) 127 | for state in range(n_state): 128 | v_actions = torch.zeros(n_action) 129 | for action in range(n_action): 130 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 131 | v_actions[action] += trans_prob * (reward + gamma * V_optimal[new_state]) 132 | optimal_policy[state] = torch.argmax(v_actions) 133 | return optimal_policy 134 | 135 | 136 | optimal_policy = extract_optimal_policy(env, V_optimal, gamma) 137 | print('Optimal policy:\n', optimal_policy) 138 | 139 | 140 | def run_episode(env, policy): 141 | state, _ = env.reset() 142 | total_reward = 0 143 | is_done = False 144 | while not is_done: 145 | action = policy[state].item() 146 | state, reward, terminated, truncated, info = env.step(action) 147 | is_done = terminated or truncated 148 | total_reward += reward 149 | if is_done: 150 | break 151 | return total_reward 152 | 153 | 154 | n_episode = 1000 155 | total_rewards = [] 156 | for episode in range(n_episode): 157 | total_reward = run_episode(env, optimal_policy) 158 | total_rewards.append(total_reward) 159 | 160 | print('Average total reward under the optimal policy:', sum(total_rewards) / n_episode) 161 | 162 | 163 | # ## Solving FrozenLake with the policy iteration algorithm 164 | 165 | def policy_evaluation(env, policy, gamma, threshold): 166 | """ 167 | Perform policy evaluation 168 | @param env: Gymnasium environment 169 | @param policy: policy matrix containing actions and their probability in each state 170 | @param gamma: discount factor 171 | @param threshold: the evaluation will stop once values for all states are less than the threshold 172 
| @return: values of the given policy 173 | """ 174 | n_state = policy.shape[0] 175 | V = torch.zeros(n_state) 176 | while True: 177 | V_temp = torch.zeros(n_state) 178 | for state in range(n_state): 179 | action = policy[state].item() 180 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 181 | V_temp[state] += trans_prob * (reward + gamma * V[new_state]) 182 | max_delta = torch.max(torch.abs(V - V_temp)) 183 | V = V_temp.clone() 184 | if max_delta <= threshold: 185 | break 186 | return V 187 | 188 | 189 | def policy_improvement(env, V, gamma): 190 | """ 191 | Obtain an improved policy based on the values 192 | @param env: Gymnasium environment 193 | @param V: policy values 194 | @param gamma: discount factor 195 | @return: the policy 196 | """ 197 | n_state = env.observation_space.n 198 | n_action = env.action_space.n 199 | policy = torch.zeros(n_state) 200 | for state in range(n_state): 201 | v_actions = torch.zeros(n_action) 202 | for action in range(n_action): 203 | for trans_prob, new_state, reward, _ in env.env.P[state][action]: 204 | v_actions[action] += trans_prob * (reward + gamma * V[new_state]) 205 | policy[state] = torch.argmax(v_actions) 206 | return policy 207 | 208 | 209 | def policy_iteration(env, gamma, threshold): 210 | """ 211 | Solve a given environment with policy iteration algorithm 212 | @param env: Gymnasium environment 213 | @param gamma: discount factor 214 | @param threshold: the evaluation will stop once values for all states are less than the threshold 215 | @return: optimal values and the optimal policy for the given environment 216 | """ 217 | n_state = env.observation_space.n 218 | n_action = env.action_space.n 219 | policy = torch.randint(high=n_action, size=(n_state,)).float() 220 | while True: 221 | V = policy_evaluation(env, policy, gamma, threshold) 222 | policy_improved = policy_improvement(env, V, gamma) 223 | if torch.equal(policy_improved, policy): 224 | return V, policy_improved 225 | policy = policy_improved 226 | 227 | 228 | gamma = 0.99 229 | threshold = 0.0001 230 | 231 | 232 | V_optimal, optimal_policy = policy_iteration(env, gamma, threshold) 233 | print('Optimal values:\n', V_optimal) 234 | print('Optimal policy:\n', optimal_policy) 235 | 236 | 237 | # --- 238 | 239 | # Readers may ignore the next cell. 
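# A minimal optional sketch: the optimal policy found above can be printed as
# arrows on the default 4x4 FrozenLake grid, assuming the standard action
# encoding of FrozenLake-v1 (0 = left, 1 = down, 2 = right, 3 = up). It reuses
# `optimal_policy` and `n_state` from the cells above.

action_symbols = ['<', 'v', '>', '^']
grid_side = int(n_state ** 0.5)
for row in range(grid_side):
    line = ''
    for col in range(grid_side):
        action = int(optimal_policy[row * grid_side + col].item())
        line += action_symbols[action] + ' '
    print(line)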
240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /ch15/ch15_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 15 Making Decisions in Complex Environments with Reinforcement Learning 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Performing Monte Carlo learning 11 | 12 | # ## Simulating the Blackjack environment 13 | 14 | import gymnasium as gym 15 | 16 | env = gym.make('Blackjack-v1') 17 | 18 | env.reset(seed=0) 19 | 20 | 21 | env.step(1) 22 | 23 | 24 | env.step(1) 25 | 26 | 27 | env.step(0) 28 | 29 | 30 | # ## Performing Monte Carlo policy evaluation 31 | 32 | def run_episode(env, hold_score): 33 | state, _ = env.reset() 34 | rewards = [] 35 | states = [state] 36 | while True: 37 | action = 1 if state[0] < hold_score else 0 38 | state, reward, terminated, truncated, info = env.step(action) 39 | is_done = terminated or truncated 40 | states.append(state) 41 | rewards.append(reward) 42 | if is_done: 43 | break 44 | return states, rewards 45 | 46 | 47 | from collections import defaultdict 48 | 49 | def mc_prediction_first_visit(env, hold_score, gamma, n_episode): 50 | V = defaultdict(float) 51 | N = defaultdict(int) 52 | for episode in range(n_episode): 53 | states_t, rewards_t = run_episode(env, hold_score) 54 | return_t = 0 55 | G = {} 56 | for state_t, reward_t in zip(states_t[1::-1], rewards_t[::-1]): 57 | return_t = gamma * return_t + reward_t 58 | G[state_t] = return_t 59 | for state, return_t in G.items(): 60 | if state[0] <= 21: 61 | V[state] += return_t 62 | N[state] += 1 63 | for state in V: 64 | V[state] = V[state] / N[state] 65 | return V 66 | 67 | 68 | gamma = 1 69 | hold_score = 18 70 | n_episode = 500000 71 | 72 | value = mc_prediction_first_visit(env, hold_score, gamma, n_episode) 73 | 74 | print(value) 75 | 76 | print('Number of states:', len(value)) 77 | 78 | 79 | # ## Performing on-policy Monte Carlo control 80 | 81 | import torch 82 | 83 | def run_episode(env, Q, n_action): 84 | state, _ = env.reset() 85 | rewards = [] 86 | actions = [] 87 | states = [] 88 | action = torch.randint(0, n_action, [1]).item() 89 | while True: 90 | actions.append(action) 91 | states.append(state) 92 | state, reward, terminated, truncated, info = env.step(action) 93 | is_done = terminated or truncated 94 | rewards.append(reward) 95 | if is_done: 96 | break 97 | action = torch.argmax(Q[state]).item() 98 | return states, actions, rewards 99 | 100 | 101 | def mc_control_on_policy(env, gamma, n_episode): 102 | n_action = env.action_space.n 103 | G_sum = defaultdict(float) 104 | N = defaultdict(int) 105 | Q = defaultdict(lambda: torch.empty(n_action)) 106 | for episode in range(n_episode): 107 | states_t, actions_t, rewards_t = run_episode(env, Q, n_action) 108 | return_t = 0 109 | G = {} 110 | for state_t, action_t, reward_t in zip(states_t[::-1], actions_t[::-1], rewards_t[::-1]): 111 | return_t = gamma * return_t + reward_t 112 | G[(state_t, action_t)] = return_t 113 | for state_action, return_t in G.items(): 114 | state, action = state_action 115 | if state[0] <= 21: 116 | G_sum[state_action] += return_t 117 | N[state_action] += 1 118 | Q[state][action] = G_sum[state_action] / N[state_action] 119 | policy = {} 120 | for state, actions in Q.items(): 121 | policy[state] = torch.argmax(actions).item() 122 | return Q, 
policy 123 | 124 | 125 | gamma = 1 126 | n_episode = 500000 127 | 128 | optimal_Q, optimal_policy = mc_control_on_policy(env, gamma, n_episode) 129 | 130 | print(optimal_policy) 131 | 132 | 133 | def simulate_hold_episode(env, hold_score): 134 | state, _ = env.reset() 135 | while True: 136 | action = 1 if state[0] < hold_score else 0 137 | state, reward, terminated, truncated, info = env.step(action) 138 | is_done = terminated or truncated 139 | if is_done: 140 | return reward 141 | 142 | 143 | 144 | def simulate_episode(env, policy): 145 | state, _ = env.reset() 146 | while True: 147 | action = policy[state] 148 | state, reward, terminated, truncated, info = env.step(action) 149 | is_done = terminated or truncated 150 | if is_done: 151 | return reward 152 | 153 | 154 | 155 | n_episode = 100000 156 | hold_score = 18 157 | n_win_opt = 0 158 | n_win_hold = 0 159 | 160 | for _ in range(n_episode): 161 | reward = simulate_hold_episode(env, hold_score) 162 | if reward == 1: 163 | n_win_hold += 1 164 | reward = simulate_episode(env, optimal_policy) 165 | if reward == 1: 166 | n_win_opt += 1 167 | 168 | 169 | print(f'Winning probability under the simple policy: {n_win_hold/n_episode}') 170 | print(f'Winning probability under the optimal policy: {n_win_opt/n_episode}') 171 | 172 | 173 | # # Solving the Blackjack problem with the Q-learning algorithm 174 | 175 | # ## Developing the Q-learning algorithm 176 | 177 | def epsilon_greedy_policy(n_action, epsilon, state, Q): 178 | probs = torch.ones(n_action) * epsilon / n_action 179 | best_action = torch.argmax(Q[state]).item() 180 | probs[best_action] += 1.0 - epsilon 181 | action = torch.multinomial(probs, 1).item() 182 | return action 183 | 184 | 185 | def q_learning(env, gamma, n_episode, alpha, epsilon, final_epsilon): 186 | n_action = env.action_space.n 187 | Q = defaultdict(lambda: torch.zeros(n_action)) 188 | epsilon_decay = epsilon / (n_episode / 2) 189 | for episode in range(n_episode): 190 | state, _ = env.reset() 191 | is_done = False 192 | epsilon = max(final_epsilon, epsilon - epsilon_decay) 193 | 194 | while not is_done: 195 | action = epsilon_greedy_policy(n_action, epsilon, state, Q) 196 | next_state, reward, terminated, truncated, info = env.step(action) 197 | is_done = terminated or truncated 198 | delta = reward + gamma * torch.max(Q[next_state]) - Q[state][action] 199 | Q[state][action] += alpha * delta 200 | total_reward_episode[episode] += reward 201 | if is_done: 202 | break 203 | state = next_state 204 | policy = {} 205 | for state, actions in Q.items(): 206 | policy[state] = torch.argmax(actions).item() 207 | return Q, policy 208 | 209 | 210 | n_episode = 10000 211 | epsilon = 1.0 212 | final_epsilon = 0.1 213 | 214 | gamma = 1 215 | alpha = 0.003 216 | 217 | total_reward_episode = [0] * n_episode 218 | 219 | optimal_Q, optimal_policy = q_learning(env, gamma, n_episode, alpha, epsilon, final_epsilon) 220 | 221 | 222 | rolling_avg_reward = [total_reward_episode[0]] 223 | for i, reward in enumerate(total_reward_episode[1:], 1): 224 | rolling_avg_reward.append((rolling_avg_reward[-1]*i + reward)/(i+1)) 225 | 226 | 227 | import matplotlib.pyplot as plt 228 | plt.plot(rolling_avg_reward) 229 | plt.title('Average reward over time') 230 | plt.xlabel('Episode') 231 | plt.ylabel('Average reward') 232 | plt.ylim([-1, 1]) 233 | plt.show() 234 | 235 | 236 | n_episode = 100000 237 | n_win_opt = 0 238 | 239 | for _ in range(n_episode): 240 | reward = simulate_episode(env, optimal_policy) 241 | if reward == 1: 242 | n_win_opt += 1 243 | 244 | 
245 | print(f'Winning probability under the optimal policy: {n_win_opt/n_episode}') 246 | 247 | 248 | # --- 249 | 250 | # Readers may ignore the next cell. 251 | 252 | get_ipython().system('jupyter nbconvert --to python ch15_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 253 | 254 | -------------------------------------------------------------------------------- /ch2/ch2_part1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "38c139cd", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)\n", 10 | "\n", 11 | "Chapter 2 Building A Movie Recommendation Engine with Naive Bayes\n", 12 | "\n", 13 | "Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)\n", 14 | "\n", 15 | "\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "6740371d", 21 | "metadata": {}, 22 | "source": [ 23 | "# Implementing Naïve Bayes " 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "33286ee5", 29 | "metadata": {}, 30 | "source": [ 31 | "## Implementing Naïve Bayes from scratch" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "id": "88605dbc", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "e6695e41", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X_train = np.array([\n", 52 | " [0, 1, 1],\n", 53 | " [0, 0, 1],\n", 54 | " [0, 0, 0],\n", 55 | " [1, 1, 0]])\n", 56 | "\n", 57 | "Y_train = ['Y', 'N', 'Y', 'Y']\n", 58 | "\n", 59 | "X_test = np.array([[1, 1, 0]])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "38c76d45", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def get_label_indices(labels):\n", 70 | " \"\"\"\n", 71 | " Group samples based on their labels and return indices\n", 72 | " @param labels: list of labels\n", 73 | " @return: dict, {class1: [indices], class2: [indices]}\n", 74 | " \"\"\"\n", 75 | " from collections import defaultdict\n", 76 | " label_indices = defaultdict(list)\n", 77 | " for index, label in enumerate(labels):\n", 78 | " label_indices[label].append(index)\n", 79 | " return label_indices" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "4141a162", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "label_indices:\n", 93 | " defaultdict(, {'Y': [0, 2, 3], 'N': [1]})\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "label_indices = get_label_indices(Y_train)\n", 99 | "print('label_indices:\\n', label_indices)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "id": "c6b94f65", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def get_prior(label_indices):\n", 110 | " \"\"\"\n", 111 | " Compute prior based on training samples\n", 112 | " @param label_indices: grouped sample indices by class\n", 113 | " @return: dictionary, with class label as key, corresponding prior as the value\n", 114 | " \"\"\"\n", 115 | " prior = {label: len(indices) for label, indices in label_indices.items()}\n", 116 | " total_count = sum(prior.values())\n", 117 | " for label in prior:\n", 118 | " prior[label] /= total_count\n", 119 | " return prior" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 
6, 125 | "id": "428ecf6d", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Prior: {'Y': 0.75, 'N': 0.25}\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "prior = get_prior(label_indices)\n", 138 | "print('Prior:', prior)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "id": "ed3ba747", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def get_likelihood(features, label_indices, smoothing=0):\n", 149 | " \"\"\"\n", 150 | " Compute likelihood based on training samples\n", 151 | " @param features: matrix of features\n", 152 | " @param label_indices: grouped sample indices by class\n", 153 | " @param smoothing: integer, additive smoothing parameter\n", 154 | " @return: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value\n", 155 | " \"\"\"\n", 156 | " likelihood = {}\n", 157 | " for label, indices in label_indices.items():\n", 158 | " likelihood[label] = features[indices, :].sum(axis=0) + smoothing\n", 159 | " total_count = len(indices)\n", 160 | " likelihood[label] = likelihood[label] / (total_count + 2 * smoothing)\n", 161 | " return likelihood" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 8, 167 | "id": "e2b56969", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "Likelihood:\n", 175 | " {'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "smoothing = 1\n", 181 | "likelihood = get_likelihood(X_train, label_indices, smoothing)\n", 182 | "print('Likelihood:\\n', likelihood)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "id": "4559a3a2", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "def get_posterior(X, prior, likelihood):\n", 193 | " \"\"\"\n", 194 | " Compute posterior of testing samples, based on prior and likelihood\n", 195 | " @param X: testing samples\n", 196 | " @param prior: dictionary, with class label as key, corresponding prior as the value\n", 197 | " @param likelihood: dictionary, with class label as key, corresponding conditional probability vector as value\n", 198 | " @return: dictionary, with class label as key, corresponding posterior as value\n", 199 | " \"\"\"\n", 200 | " posteriors = []\n", 201 | " for x in X:\n", 202 | " # posterior is proportional to prior * likelihood\n", 203 | " posterior = prior.copy()\n", 204 | " for label, likelihood_label in likelihood.items():\n", 205 | " for index, bool_value in enumerate(x):\n", 206 | " posterior[label] *= likelihood_label[index] if bool_value else (1 - likelihood_label[index])\n", 207 | " # normalize so that all sums up to 1\n", 208 | " sum_posterior = sum(posterior.values())\n", 209 | " for label in posterior:\n", 210 | " if posterior[label] == float('inf'):\n", 211 | " posterior[label] = 1.0\n", 212 | " else:\n", 213 | " posterior[label] /= sum_posterior\n", 214 | " posteriors.append(posterior.copy())\n", 215 | " return posteriors\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "id": "6c559bfa", 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "Posterior:\n", 229 | " [{'Y': 0.9210360075805433, 'N': 0.07896399241945673}]\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | 
"posterior = get_posterior(X_test, prior, likelihood)\n", 235 | "print('Posterior:\\n', posterior)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "7e846661", 241 | "metadata": {}, 242 | "source": [ 243 | "## Implementing Naïve Bayes with scikit-learn " 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 11, 249 | "id": "8a54b509", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "from sklearn.naive_bayes import BernoulliNB\n" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 12, 259 | "id": "5e33349c", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "BernoulliNB()" 266 | ] 267 | }, 268 | "execution_count": 12, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "clf = BernoulliNB(alpha=1.0, fit_prior=True)\n", 275 | "clf.fit(X_train, Y_train)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 13, 281 | "id": "e8c0835e", 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "[scikit-learn] Predicted probabilities:\n", 289 | " [[0.07896399 0.92103601]]\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "pred_prob = clf.predict_proba(X_test)\n", 295 | "print('[scikit-learn] Predicted probabilities:\\n', pred_prob)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 14, 301 | "id": "22e775be", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "[scikit-learn] Prediction: ['Y']\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "pred = clf.predict(X_test)\n", 314 | "print('[scikit-learn] Prediction:', pred)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "id": "1fe54862", 320 | "metadata": {}, 321 | "source": [ 322 | "---" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "7e26d248", 328 | "metadata": {}, 329 | "source": [ 330 | "Readers may ignore the next cell." 
331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 15, 336 | "id": "6629bb2c", 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "[NbConvertApp] Converting notebook ch2_part1.ipynb to python\n", 344 | "[NbConvertApp] Writing 3985 bytes to ch2_part1.py\n" 345 | ] 346 | } 347 | ], 348 | "source": [ 349 | "!jupyter nbconvert --to python ch2_part1.ipynb --TemplateExporter.exclude_input_prompt=True" 350 | ] 351 | } 352 | ], 353 | "metadata": { 354 | "kernelspec": { 355 | "display_name": "Python 3 (ipykernel)", 356 | "language": "python", 357 | "name": "python3" 358 | }, 359 | "language_info": { 360 | "codemirror_mode": { 361 | "name": "ipython", 362 | "version": 3 363 | }, 364 | "file_extension": ".py", 365 | "mimetype": "text/x-python", 366 | "name": "python", 367 | "nbconvert_exporter": "python", 368 | "pygments_lexer": "ipython3", 369 | "version": "3.9.13" 370 | } 371 | }, 372 | "nbformat": 4, 373 | "nbformat_minor": 5 374 | } 375 | -------------------------------------------------------------------------------- /ch2/ch2_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 2 Building A Movie Recommendation Engine with Naive Bayes 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | # 12 | # 13 | 14 | # # Implementing Naïve Bayes 15 | 16 | # ## Implementing Naïve Bayes from scratch 17 | 18 | import numpy as np 19 | 20 | 21 | X_train = np.array([ 22 | [0, 1, 1], 23 | [0, 0, 1], 24 | [0, 0, 0], 25 | [1, 1, 0]]) 26 | 27 | Y_train = ['Y', 'N', 'Y', 'Y'] 28 | 29 | X_test = np.array([[1, 1, 0]]) 30 | 31 | 32 | def get_label_indices(labels): 33 | """ 34 | Group samples based on their labels and return indices 35 | @param labels: list of labels 36 | @return: dict, {class1: [indices], class2: [indices]} 37 | """ 38 | from collections import defaultdict 39 | label_indices = defaultdict(list) 40 | for index, label in enumerate(labels): 41 | label_indices[label].append(index) 42 | return label_indices 43 | 44 | 45 | label_indices = get_label_indices(Y_train) 46 | print('label_indices:\n', label_indices) 47 | 48 | 49 | def get_prior(label_indices): 50 | """ 51 | Compute prior based on training samples 52 | @param label_indices: grouped sample indices by class 53 | @return: dictionary, with class label as key, corresponding prior as the value 54 | """ 55 | prior = {label: len(indices) for label, indices in label_indices.items()} 56 | total_count = sum(prior.values()) 57 | for label in prior: 58 | prior[label] /= total_count 59 | return prior 60 | 61 | 62 | prior = get_prior(label_indices) 63 | print('Prior:', prior) 64 | 65 | 66 | def get_likelihood(features, label_indices, smoothing=0): 67 | """ 68 | Compute likelihood based on training samples 69 | @param features: matrix of features 70 | @param label_indices: grouped sample indices by class 71 | @param smoothing: integer, additive smoothing parameter 72 | @return: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value 73 | """ 74 | likelihood = {} 75 | for label, indices in label_indices.items(): 76 | likelihood[label] = features[indices, :].sum(axis=0) + smoothing 77 | total_count = len(indices) 78 | likelihood[label] = likelihood[label] / (total_count + 2 * smoothing) 79 | return 
likelihood 80 | 81 | 82 | smoothing = 1 83 | likelihood = get_likelihood(X_train, label_indices, smoothing) 84 | print('Likelihood:\n', likelihood) 85 | 86 | 87 | def get_posterior(X, prior, likelihood): 88 | """ 89 | Compute posterior of testing samples, based on prior and likelihood 90 | @param X: testing samples 91 | @param prior: dictionary, with class label as key, corresponding prior as the value 92 | @param likelihood: dictionary, with class label as key, corresponding conditional probability vector as value 93 | @return: dictionary, with class label as key, corresponding posterior as value 94 | """ 95 | posteriors = [] 96 | for x in X: 97 | # posterior is proportional to prior * likelihood 98 | posterior = prior.copy() 99 | for label, likelihood_label in likelihood.items(): 100 | for index, bool_value in enumerate(x): 101 | posterior[label] *= likelihood_label[index] if bool_value else (1 - likelihood_label[index]) 102 | # normalize so that all sums up to 1 103 | sum_posterior = sum(posterior.values()) 104 | for label in posterior: 105 | if posterior[label] == float('inf'): 106 | posterior[label] = 1.0 107 | else: 108 | posterior[label] /= sum_posterior 109 | posteriors.append(posterior.copy()) 110 | return posteriors 111 | 112 | 113 | posterior = get_posterior(X_test, prior, likelihood) 114 | print('Posterior:\n', posterior) 115 | 116 | 117 | # ## Implementing Naïve Bayes with scikit-learn 118 | 119 | from sklearn.naive_bayes import BernoulliNB 120 | 121 | 122 | clf = BernoulliNB(alpha=1.0, fit_prior=True) 123 | clf.fit(X_train, Y_train) 124 | 125 | 126 | pred_prob = clf.predict_proba(X_test) 127 | print('[scikit-learn] Predicted probabilities:\n', pred_prob) 128 | 129 | 130 | pred = clf.predict(X_test) 131 | print('[scikit-learn] Prediction:', pred) 132 | 133 | 134 | # --- 135 | 136 | # Readers may ignore the next cell. 
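# A minimal optional sketch: the same posterior computation carried out in log
# space. With many features, multiplying long chains of probabilities can
# underflow to 0.0; summing log-probabilities and normalizing with the
# log-sum-exp trick avoids that. The function name `get_posterior_log` is
# introduced here only for illustration; it reuses `prior`, `likelihood`, and
# `X_test` defined above.

def get_posterior_log(X, prior, likelihood):
    posteriors = []
    for x in X:
        # log-posterior is proportional to log(prior) + sum of log-likelihoods
        log_posterior = {label: np.log(p) for label, p in prior.items()}
        for label, likelihood_label in likelihood.items():
            for index, bool_value in enumerate(x):
                prob = likelihood_label[index] if bool_value else (1 - likelihood_label[index])
                log_posterior[label] += np.log(prob)
        # normalize with the log-sum-exp trick so the posteriors sum up to 1
        max_log = max(log_posterior.values())
        exp_shifted = {label: np.exp(lp - max_log) for label, lp in log_posterior.items()}
        total = sum(exp_shifted.values())
        posteriors.append({label: value / total for label, value in exp_shifted.items()})
    return posteriors

# This should reproduce the posterior computed earlier for X_test.
print('Posterior (log space):\n', get_posterior_log(X_test, prior, likelihood))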
137 | 138 | get_ipython().system('jupyter nbconvert --to python ch2_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 139 | 140 | -------------------------------------------------------------------------------- /ch2/ch2_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 2 Building A Movie Recommendation Engine with Naive Bayes 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Building a movie recommender with Naïve Bayes 13 | 14 | import numpy as np 15 | import pandas as pd 16 | 17 | 18 | data_path = 'ml-1m/ratings.dat' 19 | df = pd.read_csv(data_path, header=None, sep='::', engine='python') 20 | df.columns = ['user_id', 'movie_id', 'rating', 'timestamp'] 21 | print(df) 22 | 23 | 24 | n_users = df['user_id'].nunique() 25 | n_movies = df['movie_id'].nunique() 26 | print(f"Number of users: {n_users}") 27 | print(f"Number of movies: {n_movies}") 28 | 29 | 30 | def load_user_rating_data(df, n_users, n_movies): 31 | """ 32 | Load rating data from the raw dataframe and also return movieId index mapping 33 | @param df: raw dataframe read from ratings.csv 34 | @param n_users: number of users 35 | @param n_movies: number of movies that have ratings 36 | @return: rating data in the numpy array of [user, movie]; 37 | movie_id_mapping, {movie_id: column index in rating data} 38 | """ 39 | data = np.zeros([n_users, n_movies], dtype=np.intc) 40 | movie_id_mapping = {} 41 | for user_id, movie_id, rating in zip(df['user_id'], df['movie_id'], df['rating']): 42 | user_id = int(user_id) - 1 43 | if movie_id not in movie_id_mapping: 44 | movie_id_mapping[movie_id] = len(movie_id_mapping) 45 | data[user_id, movie_id_mapping[movie_id]] = rating 46 | return data, movie_id_mapping 47 | 48 | data, movie_id_mapping = load_user_rating_data(df, n_users, n_movies) 49 | 50 | 51 | values, counts = np.unique(data, return_counts=True) 52 | for value, count in zip(values, counts): 53 | print(f'Number of rating {value}: {count}') 54 | 55 | 56 | print(df['movie_id'].value_counts()) 57 | 58 | 59 | target_movie_id = 2858 60 | X_raw = np.delete(data, movie_id_mapping[target_movie_id], axis=1) 61 | Y_raw = data[:, movie_id_mapping[target_movie_id]] 62 | 63 | X = X_raw[Y_raw > 0] 64 | Y = Y_raw[Y_raw > 0] 65 | 66 | print('Shape of X:', X.shape) 67 | print('Shape of Y:', Y.shape) 68 | 69 | 70 | recommend = 3 71 | Y[Y <= recommend] = 0 72 | Y[Y > recommend] = 1 73 | 74 | n_pos = (Y == 1).sum() 75 | n_neg = (Y == 0).sum() 76 | print(f'{n_pos} positive samples and {n_neg} negative samples.') 77 | 78 | 79 | from sklearn.model_selection import train_test_split 80 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 81 | print(len(Y_train), len(Y_test)) 82 | 83 | 84 | from sklearn.naive_bayes import MultinomialNB 85 | clf = MultinomialNB(alpha=1.0, fit_prior=True) 86 | clf.fit(X_train, Y_train) 87 | 88 | 89 | prediction_prob = clf.predict_proba(X_test) 90 | print(prediction_prob[0:10]) 91 | 92 | prediction = clf.predict(X_test) 93 | print(prediction[:10]) 94 | 95 | accuracy = clf.score(X_test, Y_test) 96 | print(f'The accuracy is: {accuracy*100:.1f}%') 97 | 98 | 99 | # # Evaluating classification performance 100 | 101 | from sklearn.metrics import confusion_matrix 102 | print(confusion_matrix(Y_test, prediction, labels=[0, 1])) 103 | 104 | 105 | from 
sklearn.metrics import precision_score, recall_score, f1_score 106 | 107 | precision_score(Y_test, prediction, pos_label=1) 108 | 109 | 110 | recall_score(Y_test, prediction, pos_label=1) 111 | 112 | 113 | f1_score(Y_test, prediction, pos_label=1) 114 | 115 | 116 | f1_score(Y_test, prediction, pos_label=0) 117 | 118 | 119 | from sklearn.metrics import classification_report 120 | report = classification_report(Y_test, prediction) 121 | print(report) 122 | 123 | 124 | pos_prob = prediction_prob[:, 1] 125 | 126 | thresholds = np.arange(0.0, 1.1, 0.05) 127 | true_pos, false_pos = [0]*len(thresholds), [0]*len(thresholds) 128 | for pred, y in zip(pos_prob, Y_test): 129 | for i, threshold in enumerate(thresholds): 130 | if pred >= threshold: 131 | if y == 1: 132 | true_pos[i] += 1 133 | else: 134 | false_pos[i] += 1 135 | else: 136 | break 137 | 138 | n_pos_test = (Y_test == 1).sum() 139 | n_neg_test = (Y_test == 0).sum() 140 | true_pos_rate = [tp / n_pos_test for tp in true_pos] 141 | false_pos_rate = [fp / n_neg_test for fp in false_pos] 142 | 143 | 144 | import matplotlib.pyplot as plt 145 | plt.figure() 146 | lw = 2 147 | plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=lw) 148 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 149 | plt.xlim([0.0, 1.0]) 150 | plt.ylim([0.0, 1.05]) 151 | plt.xlabel('False Positive Rate') 152 | plt.ylabel('True Positive Rate') 153 | plt.title('Receiver Operating Characteristic') 154 | plt.legend(loc="lower right") 155 | plt.show() 156 | 157 | 158 | from sklearn.metrics import roc_auc_score 159 | print(roc_auc_score(Y_test, pos_prob)) 160 | 161 | 162 | # # Tuning models with cross-validation 163 | 164 | from sklearn.model_selection import StratifiedKFold 165 | k = 5 166 | k_fold = StratifiedKFold(n_splits=k, random_state=42) 167 | 168 | smoothing_factor_option = [1, 2, 3, 4, 5, 6] 169 | fit_prior_option = [True, False] 170 | auc_record = {} 171 | 172 | for train_indices, test_indices in k_fold.split(X, Y): 173 | X_train_k, X_test_k = X[train_indices], X[test_indices] 174 | Y_train_k, Y_test_k = Y[train_indices], Y[test_indices] 175 | for alpha in smoothing_factor_option: 176 | if alpha not in auc_record: 177 | auc_record[alpha] = {} 178 | for fit_prior in fit_prior_option: 179 | clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior) 180 | clf.fit(X_train_k, Y_train_k) 181 | prediction_prob = clf.predict_proba(X_test_k) 182 | pos_prob = prediction_prob[:, 1] 183 | auc = roc_auc_score(Y_test_k, pos_prob) 184 | auc_record[alpha][fit_prior] = auc + auc_record[alpha].get(fit_prior, 0.0) 185 | 186 | 187 | print('smoothing fit prior auc') 188 | for smoothing, smoothing_record in auc_record.items(): 189 | for fit_prior, auc in smoothing_record.items(): 190 | print(f' {smoothing} {fit_prior} {auc/k:.5f}') 191 | 192 | 193 | clf = MultinomialNB(alpha=2.0, fit_prior=False) 194 | clf.fit(X_train, Y_train) 195 | 196 | pos_prob = clf.predict_proba(X_test)[:, 1] 197 | print('AUC with the best model:', roc_auc_score(Y_test, pos_prob)) 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /ch3/ch3_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 3 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | 11 | # # The 
metrics for measuring a split 12 | 13 | # ## Gini Impurity 14 | 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | 18 | 19 | # Plot Gini Impurity in binary case 20 | pos_fraction = np.linspace(0.00, 1.00, 1000) 21 | gini = 1 - pos_fraction**2 - (1-pos_fraction)**2 22 | plt.plot(pos_fraction, gini) 23 | plt.xlabel('Positive fraction') 24 | plt.ylabel('Gini Impurity') 25 | plt.ylim(0, 1) 26 | plt.show() 27 | 28 | 29 | # Given labels of a data set, the Gini Impurity calculation function 30 | def gini_impurity(labels): 31 | # When the set is empty, it is also pure 32 | if len(labels) == 0: 33 | return 0 34 | # Count the occurrences of each label 35 | counts = np.unique(labels, return_counts=True)[1] 36 | fractions = counts / float(len(labels)) 37 | return 1 - np.sum(fractions ** 2) 38 | 39 | 40 | print(f'{gini_impurity([1, 1, 0, 1, 0]):.4f}') 41 | print(f'{gini_impurity([1, 1, 0, 1, 0, 0]):.4f}') 42 | print(f'{gini_impurity([1, 1, 1, 1]):.4f}') 43 | 44 | 45 | # ## Information Gain 46 | 47 | # Plot entropy in binary case 48 | pos_fraction = np.linspace(0.001, 0.999, 1000) 49 | ent = - (pos_fraction * np.log2(pos_fraction) + (1 - pos_fraction) * np.log2(1 - pos_fraction)) 50 | plt.plot(pos_fraction, ent) 51 | plt.xlabel('Positive fraction') 52 | plt.ylabel('Entropy') 53 | plt.ylim(0, 1) 54 | plt.show() 55 | 56 | 57 | # Given labels of a data set, the entropy calculation function 58 | def entropy(labels): 59 | if len(labels) == 0: 60 | return 0 61 | counts = np.unique(labels, return_counts=True)[1] 62 | fractions = counts / float(len(labels)) 63 | return - np.sum(fractions * np.log2(fractions)) 64 | 65 | print(f'{entropy([1, 1, 0, 1, 0]):.4f}') 66 | print(f'{entropy([1, 1, 0, 1, 0, 0]):.4f}') 67 | print(f'{entropy([1, 1, 1, 1]):.4f}') 68 | 69 | 70 | # def information_gain(y, mask, func=entropy): 71 | # s1 = np.sum(mask) 72 | # s2 = mask.size - s1 73 | # if (s1 == 0 | s2 == 0): return 0 74 | # return func(y) - s1 / float(s1 + s2) * func(y[mask]) - s2 / float(s1 + s2) * func(y[np.logical_not(mask)]) 75 | 76 | 77 | criterion_function = {'gini': gini_impurity, 'entropy': entropy} 78 | def weighted_impurity(groups, criterion='gini'): 79 | """ 80 | Calculate weighted impurity of children after a split 81 | @param groups: list of children, and a child consists a list of class labels 82 | @param criterion: metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain 83 | @return: float, weighted impurity 84 | """ 85 | total = sum(len(group) for group in groups) 86 | weighted_sum = 0.0 87 | for group in groups: 88 | weighted_sum += len(group) / float(total) * criterion_function[criterion](group) 89 | return weighted_sum 90 | 91 | 92 | children_1 = [[1, 0, 1], [0, 1]] 93 | children_2 = [[1, 1], [0, 0, 1]] 94 | print(f"Entropy of #1 split: {weighted_impurity(children_1, 'entropy'):.4f}") 95 | print(f"Entropy of #2 split: {weighted_impurity(children_2, 'entropy'):.4f}") 96 | 97 | 98 | # # Implementing a decision tree from scratch 99 | 100 | def split_node(X, y, index, value): 101 | """ 102 | Split data set X, y based on a feature and a value 103 | @param X: numpy.ndarray, dataset feature 104 | @param y: numpy.ndarray, dataset target 105 | @param index: int, index of the feature used for splitting 106 | @param value: value of the feature used for splitting 107 | @return: list, list: left and right child, a child is in the format of [X, y] 108 | """ 109 | x_index = X[:, index] 110 | # if this feature is numerical 111 | if X[0, index].dtype.kind in ['i', 
'f']: 112 | mask = x_index >= value 113 | # if this feature is categorical 114 | else: 115 | mask = x_index == value 116 | # split into left and right child 117 | left = [X[~mask, :], y[~mask]] 118 | right = [X[mask, :], y[mask]] 119 | return left, right 120 | 121 | 122 | def get_best_split(X, y, criterion): 123 | """ 124 | Obtain the best splitting point and resulting children for the data set X, y 125 | @param X: numpy.ndarray, dataset feature 126 | @param y: numpy.ndarray, dataset target 127 | @param criterion: gini or entropy 128 | @return: dict {index: index of the feature, value: feature value, children: left and right children} 129 | """ 130 | best_index, best_value, best_score, children = None, None, 1, None 131 | for index in range(len(X[0])): 132 | for value in np.sort(np.unique(X[:, index])): 133 | groups = split_node(X, y, index, value) 134 | impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion) 135 | if impurity < best_score: 136 | best_index, best_value, best_score, children = index, value, impurity, groups 137 | return {'index': best_index, 'value': best_value, 'children': children} 138 | 139 | 140 | def get_leaf(labels): 141 | # Obtain the leaf as the majority of the labels 142 | return np.bincount(labels).argmax() 143 | 144 | 145 | def split(node, max_depth, min_size, depth, criterion): 146 | """ 147 | Split children of a node to construct new nodes or assign them terminals 148 | @param node: dict, with children info 149 | @param max_depth: int, maximal depth of the tree 150 | @param min_size: int, minimal samples required to further split a child 151 | @param depth: int, current depth of the node 152 | @param criterion: gini or entropy 153 | """ 154 | left, right = node['children'] 155 | del (node['children']) 156 | if left[1].size == 0: 157 | node['right'] = get_leaf(right[1]) 158 | return 159 | if right[1].size == 0: 160 | node['left'] = get_leaf(left[1]) 161 | return 162 | # Check if the current depth exceeds the maximal depth 163 | if depth >= max_depth: 164 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 165 | return 166 | # Check if the left child has enough samples 167 | if left[1].size <= min_size: 168 | node['left'] = get_leaf(left[1]) 169 | else: 170 | # It has enough samples, we further split it 171 | result = get_best_split(left[0], left[1], criterion) 172 | result_left, result_right = result['children'] 173 | if result_left[1].size == 0: 174 | node['left'] = get_leaf(result_right[1]) 175 | elif result_right[1].size == 0: 176 | node['left'] = get_leaf(result_left[1]) 177 | else: 178 | node['left'] = result 179 | split(node['left'], max_depth, min_size, depth + 1, criterion) 180 | # Check if the right child has enough samples 181 | if right[1].size <= min_size: 182 | node['right'] = get_leaf(right[1]) 183 | else: 184 | # It has enough samples, we further split it 185 | result = get_best_split(right[0], right[1], criterion) 186 | result_left, result_right = result['children'] 187 | if result_left[1].size == 0: 188 | node['right'] = get_leaf(result_right[1]) 189 | elif result_right[1].size == 0: 190 | node['right'] = get_leaf(result_left[1]) 191 | else: 192 | node['right'] = result 193 | split(node['right'], max_depth, min_size, depth + 1, criterion) 194 | 195 | 196 | def train_tree(X_train, y_train, max_depth, min_size, criterion='gini'): 197 | """ 198 | Construction of a tree starts here 199 | @param X_train: list of training samples (feature) 200 | @param y_train: list of training samples (target) 201 | @param max_depth:
int, maximal depth of the tree 202 | @param min_size: int, minimal samples required to further split a child 203 | @param criterion: gini or entropy 204 | """ 205 | X = np.array(X_train) 206 | y = np.array(y_train) 207 | root = get_best_split(X, y, criterion) 208 | split(root, max_depth, min_size, 1, criterion) 209 | return root 210 | 211 | 212 | X_train = [['tech', 'professional'], 213 | ['fashion', 'student'], 214 | ['fashion', 'professional'], 215 | ['sports', 'student'], 216 | ['tech', 'student'], 217 | ['tech', 'retired'], 218 | ['sports', 'professional']] 219 | 220 | y_train = [1, 221 | 0, 222 | 0, 223 | 0, 224 | 1, 225 | 0, 226 | 1] 227 | 228 | tree = train_tree(X_train, y_train, 2, 2) 229 | 230 | 231 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 232 | 'categorical': {'yes': 'is', 'no': 'is not'}} 233 | def visualize_tree(node, depth=0): 234 | if isinstance(node, dict): 235 | if node['value'].dtype.kind in ['i', 'f']: 236 | condition = CONDITION['numerical'] 237 | else: 238 | condition = CONDITION['categorical'] 239 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 240 | if 'left' in node: 241 | visualize_tree(node['left'], depth + 1) 242 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 243 | if 'right' in node: 244 | visualize_tree(node['right'], depth + 1) 245 | else: 246 | print(f"{depth * ' '}[{node}]") 247 | 248 | 249 | visualize_tree(tree) 250 | 251 | 252 | X_train_n = [[6, 7], 253 | [2, 4], 254 | [7, 2], 255 | [3, 6], 256 | [4, 7], 257 | [5, 2], 258 | [1, 6], 259 | [2, 0], 260 | [6, 3], 261 | [4, 1]] 262 | 263 | y_train_n = [0, 264 | 0, 265 | 0, 266 | 0, 267 | 0, 268 | 1, 269 | 1, 270 | 1, 271 | 1, 272 | 1] 273 | 274 | tree = train_tree(X_train_n, y_train_n, 2, 2) 275 | visualize_tree(tree) 276 | 277 | 278 | # # Implementing a decision tree with scikit-learn 279 | 280 | from sklearn.tree import DecisionTreeClassifier 281 | tree_sk = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=2) 282 | tree_sk.fit(X_train_n, y_train_n) 283 | 284 | from sklearn.tree import export_graphviz 285 | export_graphviz(tree_sk, out_file='tree.dot', feature_names=['X1', 'X2'], impurity=False, filled=True, class_names=['0', '1']) 286 | 287 | 288 | # --- 289 | 290 | # Readers may ignore the next cell. 
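# --- An appended illustrative sketch (not part of the original chapter code) ---
# The helper below shows how a new sample could be classified by walking the nested node
# dictionaries produced by train_tree/split above. The name traverse_tree is hypothetical;
# it assumes the {'index', 'value', 'left'/'right'} node format and the same numerical vs.
# categorical test used in split_node.
def traverse_tree(node, x):
    if not isinstance(node, dict):
        return node  # a leaf stores the majority class label
    if node['value'].dtype.kind in ['i', 'f']:
        branch = 'right' if x[node['index']] >= node['value'] else 'left'
    else:
        branch = 'right' if x[node['index']] == node['value'] else 'left'
    if branch not in node:
        # degenerate split: only one side was kept, so follow the branch that exists
        branch = 'left' if branch == 'right' else 'right'
    return traverse_tree(node[branch], x)

print(traverse_tree(tree, np.array([3, 5])))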
291 | 292 | get_ipython().system('jupyter nbconvert --to python ch3_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 293 | 294 | -------------------------------------------------------------------------------- /ch3/ch3_part2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "05b0d160", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)\n", 10 | "\n", 11 | "Chapter 3 Predicting Online Ad Click-Through with Tree-Based Algorithms \n", 12 | "\n", 13 | "Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "96cab12c", 19 | "metadata": {}, 20 | "source": [ 21 | "# Predicting ad click-through with a decision tree" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "d12f1e67", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | " id click hour C1 banner_pos site_id site_domain \\\n", 35 | "0 1.000009e+18 0 14102100 1005 0 1fbe01fe f3845767 \n", 36 | "1 1.000017e+19 0 14102100 1005 0 1fbe01fe f3845767 \n", 37 | "2 1.000037e+19 0 14102100 1005 0 1fbe01fe f3845767 \n", 38 | "3 1.000064e+19 0 14102100 1005 0 1fbe01fe f3845767 \n", 39 | "4 1.000068e+19 0 14102100 1005 1 fe8cc448 9166c161 \n", 40 | "\n", 41 | " site_category app_id app_domain ... device_type device_conn_type C14 \\\n", 42 | "0 28905ebd ecad2386 7801e8d9 ... 1 2 15706 \n", 43 | "1 28905ebd ecad2386 7801e8d9 ... 1 0 15704 \n", 44 | "2 28905ebd ecad2386 7801e8d9 ... 1 0 15704 \n", 45 | "3 28905ebd ecad2386 7801e8d9 ... 1 0 15706 \n", 46 | "4 0569f928 ecad2386 7801e8d9 ... 
1 0 18993 \n", 47 | "\n", 48 | " C15 C16 C17 C18 C19 C20 C21 \n", 49 | "0 320 50 1722 0 35 -1 79 \n", 50 | "1 320 50 1722 0 35 100084 79 \n", 51 | "2 320 50 1722 0 35 100084 79 \n", 52 | "3 320 50 1722 0 35 100084 79 \n", 53 | "4 320 50 2161 0 35 -1 157 \n", 54 | "\n", 55 | "[5 rows x 24 columns]\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "n_rows = 300000\n", 62 | "df = pd.read_csv(\"train.csv\", nrows=n_rows)\n", 63 | "print(df.head(5))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "id": "2f3bed2c", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "(300000, 19)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values\n", 82 | "Y = df['click'].values\n", 83 | "\n", 84 | "print(X.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "id": "53b199ee", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | " (0, 2)\t1.0\n", 98 | " (0, 6)\t1.0\n", 99 | " (0, 188)\t1.0\n", 100 | " (0, 2608)\t1.0\n", 101 | " (0, 2679)\t1.0\n", 102 | " (0, 3771)\t1.0\n", 103 | " (0, 3885)\t1.0\n", 104 | " (0, 3929)\t1.0\n", 105 | " (0, 4879)\t1.0\n", 106 | " (0, 7315)\t1.0\n", 107 | " (0, 7319)\t1.0\n", 108 | " (0, 7475)\t1.0\n", 109 | " (0, 7824)\t1.0\n", 110 | " (0, 7828)\t1.0\n", 111 | " (0, 7869)\t1.0\n", 112 | " (0, 7977)\t1.0\n", 113 | " (0, 7982)\t1.0\n", 114 | " (0, 8021)\t1.0\n", 115 | " (0, 8189)\t1.0\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "n_train = int(n_rows * 0.9)\n", 121 | "X_train = X[:n_train]\n", 122 | "Y_train = Y[:n_train]\n", 123 | "X_test = X[n_train:]\n", 124 | "Y_test = Y[n_train:]\n", 125 | "\n", 126 | "from sklearn.preprocessing import OneHotEncoder\n", 127 | "enc = OneHotEncoder(handle_unknown='ignore')\n", 128 | "X_train_enc = enc.fit_transform(X_train)\n", 129 | "\n", 130 | "X_train_enc[0]\n", 131 | "print(X_train_enc[0])" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "id": "7bbea72b", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "X_test_enc = enc.transform(X_test)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "id": "76447984", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "{'max_depth': 10}\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "from sklearn.tree import DecisionTreeClassifier\n", 160 | "parameters = {'max_depth': [3, 10, None]}\n", 161 | "decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30)\n", 162 | "\n", 163 | "from sklearn.model_selection import GridSearchCV\n", 164 | "grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc')\n", 165 | "\n", 166 | "grid_search.fit(X_train_enc, Y_train)\n", 167 | "print(grid_search.best_params_)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "id": "d8b9655b", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "The ROC AUC on testing set is: 0.719\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "decision_tree_best = grid_search.best_estimator_\n", 186 | "pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1]\n", 187 | "\n", 188 | "from sklearn.metrics import 
roc_auc_score\n", 189 | "print(f'The ROC AUC on testing set is: {roc_auc_score(Y_test, pos_prob):.3f}')" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 7, 195 | "id": "f52d826e", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "The ROC AUC on testing set using random selection is: 0.499\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "import numpy as np\n", 208 | "pos_prob = np.zeros(len(Y_test))\n", 209 | "click_index = np.random.choice(len(Y_test), int(len(Y_test) * 51211.0/300000), replace=False)\n", 210 | "pos_prob[click_index] = 1\n", 211 | "\n", 212 | "print(f'The ROC AUC on testing set using random selection is: {roc_auc_score(Y_test, pos_prob):.3f}')" 213 | ] 214 | }, 215 | { 216 | "attachments": {}, 217 | "cell_type": "markdown", 218 | "id": "f325f60f", 219 | "metadata": {}, 220 | "source": [ 221 | "# Ensembling decision trees – random forest " 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 8, 227 | "id": "ade5a566", 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "{'max_depth': None}\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "from sklearn.ensemble import RandomForestClassifier\n", 240 | "\n", 241 | "random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1)\n", 242 | "grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc')\n", 243 | "grid_search.fit(X_train_enc, Y_train)\n", 244 | "print(grid_search.best_params_)\n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 9, 250 | "id": "370afc6f", 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "The ROC AUC on testing set using random forest is: 0.759\n" 258 | ] 259 | } 260 | ], 261 | "source": [ 262 | "random_forest_best = grid_search.best_estimator_\n", 263 | "pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1]\n", 264 | "print(f'The ROC AUC on testing set using random forest is: {roc_auc_score(Y_test, pos_prob):.3f}')" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "a5674331", 270 | "metadata": {}, 271 | "source": [ 272 | "# Ensembling decision trees – gradient boosted trees" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 10, 278 | "id": "81ced70c", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "The ROC AUC on testing set using GBT is: 0.771\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "import xgboost as xgb\n", 291 | "model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=1000)\n", 292 | "\n", 293 | "model.fit(X_train_enc, Y_train)\n", 294 | "pos_prob = model.predict_proba(X_test_enc)[:, 1]\n", 295 | "\n", 296 | "print(f'The ROC AUC on testing set using GBT is: {roc_auc_score(Y_test, pos_prob):.3f}')\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "4528421b", 302 | "metadata": {}, 303 | "source": [ 304 | "---" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "8902f0ae", 310 | "metadata": {}, 311 | "source": [ 312 | "Readers may ignore the next cell." 
313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 11, 318 | "id": "d5eda2fe", 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stderr", 323 | "output_type": "stream", 324 | "text": [ 325 | "[NbConvertApp] Converting notebook ch3_part2.ipynb to python\n", 326 | "[NbConvertApp] Writing 2830 bytes to ch3_part2.py\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "!jupyter nbconvert --to python ch3_part2.ipynb --TemplateExporter.exclude_input_prompt=True" 332 | ] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python 3 (ipykernel)", 338 | "language": "python", 339 | "name": "python3" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.9.16" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 5 356 | } 357 | -------------------------------------------------------------------------------- /ch3/ch3_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 3 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | 11 | # # Predicting ad click-through with a decision tree 12 | 13 | import pandas as pd 14 | n_rows = 300000 15 | df = pd.read_csv("train.csv", nrows=n_rows) 16 | print(df.head(5)) 17 | 18 | 19 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 20 | Y = df['click'].values 21 | 22 | print(X.shape) 23 | 24 | 25 | n_train = int(n_rows * 0.9) 26 | X_train = X[:n_train] 27 | Y_train = Y[:n_train] 28 | X_test = X[n_train:] 29 | Y_test = Y[n_train:] 30 | 31 | from sklearn.preprocessing import OneHotEncoder 32 | enc = OneHotEncoder(handle_unknown='ignore') 33 | X_train_enc = enc.fit_transform(X_train) 34 | 35 | X_train_enc[0] 36 | print(X_train_enc[0]) 37 | 38 | 39 | X_test_enc = enc.transform(X_test) 40 | 41 | 42 | from sklearn.tree import DecisionTreeClassifier 43 | parameters = {'max_depth': [3, 10, None]} 44 | decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30) 45 | 46 | from sklearn.model_selection import GridSearchCV 47 | grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 48 | 49 | grid_search.fit(X_train_enc, Y_train) 50 | print(grid_search.best_params_) 51 | 52 | 53 | decision_tree_best = grid_search.best_estimator_ 54 | pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1] 55 | 56 | from sklearn.metrics import roc_auc_score 57 | print(f'The ROC AUC on testing set is: {roc_auc_score(Y_test, pos_prob):.3f}') 58 | 59 | 60 | import numpy as np 61 | pos_prob = np.zeros(len(Y_test)) 62 | click_index = np.random.choice(len(Y_test), int(len(Y_test) * 51211.0/300000), replace=False) 63 | pos_prob[click_index] = 1 64 | 65 | print(f'The ROC AUC on testing set using random selection is: {roc_auc_score(Y_test, pos_prob):.3f}') 66 | 67 | 68 | # # Ensembling decision trees – random forest 69 | 70 | from sklearn.ensemble import RandomForestClassifier 71 | 72 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 73 | grid_search = 
GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 74 | grid_search.fit(X_train_enc, Y_train) 75 | print(grid_search.best_params_) 76 | 77 | 78 | random_forest_best = grid_search.best_estimator_ 79 | pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1] 80 | print(f'The ROC AUC on testing set using random forest is: {roc_auc_score(Y_test, pos_prob):.3f}') 81 | 82 | 83 | # # Ensembling decision trees – gradient boosted trees 84 | 85 | import xgboost as xgb 86 | model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=1000) 87 | 88 | model.fit(X_train_enc, Y_train) 89 | pos_prob = model.predict_proba(X_test_enc)[:, 1] 90 | 91 | print(f'The ROC AUC on testing set using GBT is: {roc_auc_score(Y_test, pos_prob):.3f}') 92 | 93 | 94 | # --- 95 | 96 | # Readers may ignore the next cell. 97 | 98 | get_ipython().system('jupyter nbconvert --to python ch3_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 99 | 100 | -------------------------------------------------------------------------------- /ch4/ch4_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 4 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Converting categorical features to numerical – one-hot encoding and ordinal encoding 13 | 14 | from sklearn.feature_extraction import DictVectorizer 15 | 16 | 17 | X_dict = [{'interest': 'tech', 'occupation': 'professional'}, 18 | {'interest': 'fashion', 'occupation': 'student'}, 19 | {'interest': 'fashion', 'occupation': 'professional'}, 20 | {'interest': 'sports', 'occupation': 'student'}, 21 | {'interest': 'tech', 'occupation': 'student'}, 22 | {'interest': 'tech', 'occupation': 'retired'}, 23 | {'interest': 'sports', 'occupation': 'professional'}] 24 | 25 | dict_one_hot_encoder = DictVectorizer(sparse=False) 26 | X_encoded = dict_one_hot_encoder.fit_transform(X_dict) 27 | print(X_encoded) 28 | 29 | 30 | print(dict_one_hot_encoder.vocabulary_) 31 | 32 | 33 | new_dict = [{'interest': 'sports', 'occupation': 'retired'}] 34 | new_encoded = dict_one_hot_encoder.transform(new_dict) 35 | print(new_encoded) 36 | 37 | 38 | print(dict_one_hot_encoder.inverse_transform(new_encoded)) 39 | 40 | 41 | # new category not encountered before 42 | new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'}, 43 | {'interest': 'tech', 'occupation': 'unseen_occupation'}] 44 | new_encoded = dict_one_hot_encoder.transform(new_dict) 45 | print(new_encoded) 46 | 47 | 48 | import pandas as pd 49 | df = pd.DataFrame({'score': ['low', 50 | 'high', 51 | 'medium', 52 | 'medium', 53 | 'low']}) 54 | print(df) 55 | 56 | mapping = {'low':1, 'medium':2, 'high':3} 57 | df['score'] = df['score'].replace(mapping) 58 | 59 | print(df) 60 | 61 | 62 | # # Classifying data with logistic regression 63 | 64 | # ## Getting started with the logistic function 65 | 66 | import numpy as np 67 | import matplotlib.pyplot as plt 68 | 69 | 70 | def sigmoid(input): 71 | return 1.0 / (1 + np.exp(-input)) 72 | 73 | 74 | z = np.linspace(-8, 8, 1000) 75 | y = sigmoid(z) 76 | plt.plot(z, y) 77 | plt.axhline(y=0, ls='dotted', color='k') 78 | plt.axhline(y=0.5, ls='dotted', color='k') 79 | plt.axhline(y=1, ls='dotted', color='k') 80 | plt.yticks([0.0, 0.25, 0.5, 0.75, 1.0]) 81 | plt.xlabel('z') 82 
| plt.ylabel('y(z)') 83 | plt.show() 84 | 85 | 86 | # ## Jumping from the logistic function to logistic regression 87 | 88 | # plot sample cost vs y_hat (prediction), for y (truth) = 1 89 | y_hat = np.linspace(0.001, 0.999, 1000) 90 | cost = -np.log(y_hat) 91 | plt.plot(y_hat, cost) 92 | plt.xlabel('Prediction') 93 | plt.ylabel('Cost') 94 | plt.xlim(0, 1) 95 | plt.ylim(0, 7) 96 | plt.show() 97 | 98 | 99 | # plot sample cost vs y_hat (prediction), for y (truth) = 0 100 | y_hat = np.linspace(0.001, 0.999, 1000) 101 | cost = -np.log(1 - y_hat) 102 | plt.plot(y_hat, cost) 103 | plt.xlabel('Prediction') 104 | plt.ylabel('Cost') 105 | plt.xlim(0, 1) 106 | plt.ylim(0, 7) 107 | plt.show() 108 | 109 | 110 | # # Training a logistic regression model 111 | 112 | # ## Training a logistic regression model using gradient descent 113 | 114 | # Gradient descent based logistic regression from scratch 115 | def compute_prediction(X, weights): 116 | """ 117 | Compute the prediction y_hat based on current weights 118 | """ 119 | z = np.dot(X, weights) 120 | return sigmoid(z) 121 | 122 | 123 | def update_weights_gd(X_train, y_train, weights, learning_rate): 124 | """ 125 | Update weights by one step 126 | """ 127 | predictions = compute_prediction(X_train, weights) 128 | weights_delta = np.dot(X_train.T, y_train - predictions) 129 | m = y_train.shape[0] 130 | weights += learning_rate / float(m) * weights_delta 131 | return weights 132 | 133 | 134 | def compute_cost(X, y, weights): 135 | """ 136 | Compute the cost J(w) 137 | """ 138 | predictions = compute_prediction(X, weights) 139 | cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions)) 140 | return cost 141 | 142 | 143 | def train_logistic_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 144 | """ Train a logistic regression model 145 | Args: 146 | X_train, y_train (numpy.ndarray, training data set) 147 | max_iter (int, number of iterations) 148 | learning_rate (float) 149 | fit_intercept (bool, with an intercept w0 or not) 150 | Returns: 151 | numpy.ndarray, learned weights 152 | """ 153 | if fit_intercept: 154 | intercept = np.ones((X_train.shape[0], 1)) 155 | X_train = np.hstack((intercept, X_train)) 156 | weights = np.zeros(X_train.shape[1]) 157 | for iteration in range(max_iter): 158 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 159 | # Check the cost for every 100 (for example) iterations 160 | if iteration % 100 == 0: 161 | print(compute_cost(X_train, y_train, weights)) 162 | return weights 163 | 164 | 165 | def predict(X, weights): 166 | if X.shape[1] == weights.shape[0] - 1: 167 | intercept = np.ones((X.shape[0], 1)) 168 | X = np.hstack((intercept, X)) 169 | return compute_prediction(X, weights) 170 | 171 | 172 | # A example 173 | X_train = np.array([[6, 7], 174 | [2, 4], 175 | [3, 6], 176 | [4, 7], 177 | [1, 6], 178 | [5, 2], 179 | [2, 0], 180 | [6, 3], 181 | [4, 1], 182 | [7, 2]]) 183 | 184 | y_train = np.array([0, 185 | 0, 186 | 0, 187 | 0, 188 | 0, 189 | 1, 190 | 1, 191 | 1, 192 | 1, 193 | 1]) 194 | 195 | 196 | weights = train_logistic_regression(X_train, y_train, max_iter=1000, learning_rate=0.1, fit_intercept=True) 197 | 198 | 199 | 200 | X_test = np.array([[6, 1], 201 | [1, 3], 202 | [3, 1], 203 | [4, 5]]) 204 | 205 | predictions = predict(X_test, weights) 206 | print(predictions) 207 | 208 | 209 | plt.scatter(X_train[:5,0], X_train[:5,1], c='b', marker='x') 210 | plt.scatter(X_train[5:,0], X_train[5:,1], c='k', marker='.') 211 | for i, prediction in 
enumerate(predictions): 212 | marker = 'X' if prediction < 0.5 else 'o' 213 | c = 'b' if prediction < 0.5 else 'k' 214 | plt.scatter(X_test[i,0], X_test[i,1], c=c, marker=marker) 215 | plt.show() 216 | 217 | 218 | # ## Predicting ad click-through with logistic regression using gradient descent 219 | 220 | import pandas as pd 221 | n_rows = 300000 222 | df = pd.read_csv("train.csv", nrows=n_rows) 223 | 224 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 225 | Y = df['click'].values 226 | 227 | n_train = 10000 228 | X_train = X[:n_train] 229 | Y_train = Y[:n_train] 230 | X_test = X[n_train:] 231 | Y_test = Y[n_train:] 232 | 233 | from sklearn.preprocessing import OneHotEncoder 234 | enc = OneHotEncoder(handle_unknown='ignore') 235 | X_train_enc = enc.fit_transform(X_train) 236 | 237 | X_test_enc = enc.transform(X_test) 238 | 239 | 240 | import timeit 241 | start_time = timeit.default_timer() 242 | weights = train_logistic_regression(X_train_enc.toarray(), Y_train, max_iter=10000, learning_rate=0.01, 243 | fit_intercept=True) 244 | print(f"--- {(timeit.default_timer() - start_time):.3f} seconds ---") 245 | 246 | 247 | pred = predict(X_test_enc.toarray(), weights) 248 | from sklearn.metrics import roc_auc_score 249 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 250 | 251 | 252 | # ## Training a logistic regression model using stochastic gradient descent 253 | 254 | def update_weights_sgd(X_train, y_train, weights, learning_rate): 255 | """ One weight update iteration: moving weights by one step based on each individual sample 256 | Args: 257 | X_train, y_train (numpy.ndarray, training data set) 258 | weights (numpy.ndarray) 259 | learning_rate (float) 260 | Returns: 261 | numpy.ndarray, updated weights 262 | """ 263 | for X_each, y_each in zip(X_train, y_train): 264 | prediction = compute_prediction(X_each, weights) 265 | weights_delta = X_each.T * (y_each - prediction) 266 | weights += learning_rate * weights_delta 267 | return weights 268 | 269 | 270 | def train_logistic_regression_sgd(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 271 | """ Train a logistic regression model via SGD 272 | Args: 273 | X_train, y_train (numpy.ndarray, training data set) 274 | max_iter (int, number of iterations) 275 | learning_rate (float) 276 | fit_intercept (bool, with an intercept w0 or not) 277 | Returns: 278 | numpy.ndarray, learned weights 279 | """ 280 | if fit_intercept: 281 | intercept = np.ones((X_train.shape[0], 1)) 282 | X_train = np.hstack((intercept, X_train)) 283 | weights = np.zeros(X_train.shape[1]) 284 | for iteration in range(max_iter): 285 | weights = update_weights_sgd(X_train, y_train, weights, learning_rate) 286 | # Check the cost for every 2 (for example) iterations 287 | if iteration % 2 == 0: 288 | print(compute_cost(X_train, y_train, weights)) 289 | return weights 290 | 291 | 292 | # Train the SGD model based on 100000 samples 293 | n_train = 100000 294 | X_train = X[:n_train] 295 | Y_train = Y[:n_train] 296 | X_test = X[n_train:] 297 | Y_test = Y[n_train:] 298 | 299 | from sklearn.preprocessing import OneHotEncoder 300 | enc = OneHotEncoder(handle_unknown='ignore') 301 | X_train_enc = enc.fit_transform(X_train) 302 | 303 | X_test_enc = enc.transform(X_test) 304 | 305 | start_time = timeit.default_timer() 306 | weights = train_logistic_regression_sgd(X_train_enc.toarray(), Y_train, max_iter=10, learning_rate=0.01, 307 | fit_intercept=True) 308 | print(f"--- {(timeit.default_timer() - 
start_time):.3f} seconds ---") 309 | pred = predict(X_test_enc.toarray(), weights) 310 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 311 | 312 | 313 | # # Use scikit-learn package 314 | from sklearn.linear_model import SGDClassifier 315 | sgd_lr = SGDClassifier(loss='log_loss', penalty=None, fit_intercept=True, max_iter=20, learning_rate='constant', eta0=0.01) 316 | 317 | 318 | sgd_lr.fit(X_train_enc.toarray(), Y_train) 319 | 320 | pred = sgd_lr.predict_proba(X_test_enc.toarray())[:, 1] 321 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 322 | 323 | 324 | # ## Feature selection using L1 regularization 325 | 326 | sgd_lr_l1 = SGDClassifier(loss='log_loss', 327 | penalty='l1', 328 | alpha=0.0001, 329 | fit_intercept=True, 330 | max_iter=10, 331 | learning_rate='constant', 332 | eta0=0.01, 333 | random_state=42) 334 | sgd_lr_l1.fit(X_train_enc.toarray(), Y_train) 335 | 336 | 337 | coef_abs = np.abs(sgd_lr_l1.coef_) 338 | print(coef_abs) 339 | 340 | 341 | # bottom 10 weights and the corresponding 10 least important features 342 | print(np.sort(coef_abs)[0][:10]) 343 | 344 | 345 | feature_names = enc.get_feature_names_out() 346 | bottom_10 = np.argsort(coef_abs)[0][:10] 347 | print('10 least important features are:\n', feature_names[bottom_10]) 348 | 349 | 350 | # top 10 weights and the corresponding 10 most important features 351 | print(np.sort(coef_abs)[0][-10:]) 352 | top_10 = np.argsort(coef_abs)[0][-10:] 353 | print('10 most important features are:\n', feature_names[top_10]) 354 | 355 | 356 | # --- 357 | 358 | # Readers may ignore the next cell. 359 | 360 | get_ipython().system('jupyter nbconvert --to python ch4_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 361 | 362 | -------------------------------------------------------------------------------- /ch4/ch4_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 4 Predicting Online Ad Click-Through with Tree-Based Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Training on large datasets with online learning 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import timeit 17 | from sklearn.linear_model import SGDClassifier 18 | from sklearn.metrics import roc_auc_score 19 | from sklearn.preprocessing import OneHotEncoder 20 | 21 | 22 | n_rows = 100000 * 11 23 | df = pd.read_csv("train.csv", nrows=n_rows) 24 | 25 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 26 | Y = df['click'].values 27 | 28 | n_train = 100000 * 10 29 | X_train = X[:n_train] 30 | Y_train = Y[:n_train] 31 | X_test = X[n_train:] 32 | Y_test = Y[n_train:] 33 | 34 | 35 | enc = OneHotEncoder(handle_unknown='ignore') 36 | enc.fit(X_train) 37 | 38 | 39 | # The number of iterations is set to 1 if using partial_fit. 
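# Note (added): SGDClassifier.partial_fit also accepts the scipy sparse matrix returned by
# OneHotEncoder, so each batch in the loop below could be fed without .toarray(), e.g.
# sgd_lr_online.partial_fit(x_train_enc, y_train, classes=[0, 1]),
# which keeps the memory footprint of the wide one-hot feature matrix much lower.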
40 | sgd_lr_online = SGDClassifier(loss='log_loss', 41 | penalty=None, 42 | fit_intercept=True, 43 | max_iter=1, 44 | learning_rate='constant', 45 | eta0=0.01, 46 | random_state=42) 47 | 48 | 49 | start_time = timeit.default_timer() 50 | 51 | # Use the first 1,000,000 samples for training, and the next 100,000 for testing 52 | for i in range(10): 53 | x_train = X_train[i*100000:(i+1)*100000] 54 | y_train = Y_train[i*100000:(i+1)*100000] 55 | x_train_enc = enc.transform(x_train) 56 | sgd_lr_online.partial_fit(x_train_enc.toarray(), y_train, classes=[0, 1]) 57 | 58 | print(f"--- {(timeit.default_timer() - start_time):.3f} seconds ---") 59 | 60 | 61 | x_test_enc = enc.transform(X_test) 62 | 63 | pred = sgd_lr_online.predict_proba(x_test_enc.toarray())[:, 1] 64 | print(f'Training samples: {n_train}, AUC on testing set: {roc_auc_score(Y_test, pred):.3f}') 65 | 66 | 67 | # # Handling multiclass classification 68 | 69 | from sklearn import datasets 70 | digits = datasets.load_digits() 71 | n_samples = len(digits.images) 72 | 73 | 74 | X = digits.images.reshape((n_samples, -1)) 75 | Y = digits.target 76 | 77 | 78 | from sklearn.model_selection import train_test_split 79 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 80 | 81 | 82 | from sklearn.model_selection import GridSearchCV 83 | parameters = {'penalty': ['l2', None], 84 | 'alpha': [1e-07, 1e-06, 1e-05, 1e-04], 85 | 'eta0': [0.01, 0.1, 1, 10]} 86 | 87 | sgd_lr = SGDClassifier(loss='log_loss', 88 | learning_rate='constant', 89 | fit_intercept=True, 90 | max_iter=50, 91 | random_state=42) 92 | 93 | grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=5) 94 | 95 | grid_search.fit(X_train, Y_train) 96 | print(grid_search.best_params_) 97 | 98 | 99 | sgd_lr_best = grid_search.best_estimator_ 100 | accuracy = sgd_lr_best.score(X_test, Y_test) 101 | print(f'The accuracy on testing set is: {accuracy*100:.1f}%') 102 | 103 | 104 | # # Implementing logistic regression using TensorFlow 105 | 106 | import tensorflow as tf 107 | 108 | 109 | n_rows = 100000 110 | df = pd.read_csv("train.csv", nrows=n_rows) 111 | 112 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 113 | Y = df['click'].values 114 | 115 | n_train = int(n_rows * 0.9) 116 | X_train = X[:n_train] 117 | Y_train = Y[:n_train] 118 | X_test = X[n_train:] 119 | Y_test = Y[n_train:] 120 | 121 | 122 | enc = OneHotEncoder(handle_unknown='ignore') 123 | X_train_enc = enc.fit_transform(X_train).toarray().astype('float32') 124 | X_test_enc = enc.transform(X_test).toarray().astype('float32') 125 | Y_train = Y_train.astype('float32') 126 | Y_test = Y_test.astype('float32') 127 | 128 | 129 | batch_size = 1000 130 | train_data = tf.data.Dataset.from_tensor_slices((X_train_enc, Y_train)) 131 | train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1) 132 | 133 | 134 | n_features = X_train_enc.shape[1] 135 | W = tf.Variable(tf.zeros([n_features, 1])) 136 | b = tf.Variable(tf.zeros([1])) 137 | 138 | 139 | learning_rate = 0.001 140 | optimizer = tf.optimizers.Adam(learning_rate) 141 | 142 | 143 | def run_optimization(x, y): 144 | with tf.GradientTape() as tape: 145 | logits = tf.add(tf.matmul(x, W), b)[:, 0] 146 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 147 | # Update the parameters with respect to the gradient calculations 148 | gradients = tape.gradient(loss, [W, b]) 149 | optimizer.apply_gradients(zip(gradients, [W, b])) 150 | 151 | 152 | 153
training_steps = 5000 154 | for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1): 155 | run_optimization(batch_x, batch_y) 156 | if step % 500 == 0: 157 | logits = tf.add(tf.matmul(batch_x, W), b)[:, 0] 158 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=batch_y, logits=logits)) 159 | print("step: %i, loss: %f" % (step, loss)) 160 | 161 | 162 | logits = tf.add(tf.matmul(X_test_enc, W), b)[:, 0] 163 | pred = tf.nn.sigmoid(logits) 164 | auc_metric = tf.keras.metrics.AUC() 165 | auc_metric.update_state(Y_test, pred) 166 | 167 | print(f'AUC on testing set: {auc_metric.result().numpy():.3f}') 168 | 169 | 170 | # # Feature selection using random forest 171 | 172 | X_train = X 173 | Y_train = Y 174 | 175 | enc = OneHotEncoder(handle_unknown='ignore') 176 | X_train_enc = enc.fit_transform(X_train) 177 | 178 | 179 | # Feature selection with random forest 180 | 181 | from sklearn.ensemble import RandomForestClassifier 182 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1, random_state=42) 183 | random_forest.fit(X_train_enc.toarray(), Y_train) 184 | 185 | 186 | feature_imp = random_forest.feature_importances_ 187 | print(feature_imp) 188 | 189 | 190 | # bottom 10 weights and the corresponding 10 least important features 191 | feature_names = enc.get_feature_names_out() 192 | print(np.sort(feature_imp)[:10]) 193 | bottom_10 = np.argsort(feature_imp)[:10] 194 | print('10 least important features are:\n', feature_names[bottom_10]) 195 | 196 | 197 | # top 10 weights and the corresponding 10 most important features 198 | print(np.sort(feature_imp)[-10:]) 199 | top_10 = np.argsort(feature_imp)[-10:] 200 | print('10 most important features are:\n', feature_names[top_10]) 201 | 202 | 203 | # --- 204 | 205 | # Readers may ignore the next cell. 
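# --- An appended illustrative sketch (not part of the original chapter code) ---
# Beyond inspecting the most and least important features above, the fitted random forest
# could be reused to shrink the one-hot feature matrix with SelectFromModel; the cap of
# 500 features is an arbitrary example value.
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(random_forest, prefit=True, max_features=500, threshold=-np.inf)
X_train_top = selector.transform(X_train_enc)
print(X_train_top.shape)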
206 | 207 | get_ipython().system('jupyter nbconvert --to python ch4_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 208 | 209 | -------------------------------------------------------------------------------- /ch5/20051201_20051210.csv: -------------------------------------------------------------------------------- 1 | Date,Open,High,Low,Close,Adj Close,Volume 2 | 2005-12-01,2244.850098,2269.389893,2244.709961,2267.169922,2267.169922,2010420000 3 | 2005-12-02,2266.169922,2273.610107,2261.129883,2273.370117,2273.370117,1758510000 4 | 2005-12-05,2269.070068,2269.479980,2250.840088,2257.639893,2257.639893,1659920000 5 | 2005-12-06,2267.760010,2278.159912,2259.370117,2260.760010,2260.760010,1788200000 6 | 2005-12-07,2263.290039,2264.909912,2244.620117,2252.010010,2252.010010,1733530000 7 | 2005-12-08,2254.800049,2261.610107,2233.739990,2246.459961,2246.459961,1908360000 8 | 2005-12-09,2247.280029,2258.669922,2241.030029,2256.729980,2256.729980,1658570000 -------------------------------------------------------------------------------- /ch5/ch5_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 5 Predicting Stock Price with Regression Algorithms 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Mining stock price data 13 | 14 | # ## Acquiring data and generating features 15 | 16 | import pandas as pd 17 | 18 | 19 | mydata = pd.read_csv('20051201_20051210.csv', index_col='Date') 20 | mydata 21 | 22 | 23 | def add_original_feature(df, df_new): 24 | df_new['open'] = df['Open'] 25 | df_new['open_1'] = df['Open'].shift(1) 26 | df_new['close_1'] = df['Close'].shift(1) 27 | df_new['high_1'] = df['High'].shift(1) 28 | df_new['low_1'] = df['Low'].shift(1) 29 | df_new['volume_1'] = df['Volume'].shift(1) 30 | 31 | 32 | 33 | def add_avg_price(df, df_new): 34 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 35 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 36 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 37 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 38 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 39 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 40 | 41 | 42 | 43 | def add_avg_volume(df, df_new): 44 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 45 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 46 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 47 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 48 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 49 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 50 | 51 | 52 | 53 | def add_std_price(df, df_new): 54 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 55 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 56 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 57 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 58 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 59 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 60 | 61 
| 62 | 63 | def add_std_volume(df, df_new): 64 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 65 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 66 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 67 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 68 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 69 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 70 | 71 | 72 | 73 | def add_return_feature(df, df_new): 74 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 75 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 76 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 77 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 78 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 79 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 80 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 81 | 82 | 83 | 84 | def generate_features(df): 85 | """ 86 | Generate features for a stock/index based on historical price and performance 87 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adj Close" 88 | @return: dataframe, data set with new features 89 | """ 90 | df_new = pd.DataFrame() 91 | # 6 original features 92 | add_original_feature(df, df_new) 93 | # 31 generated features 94 | # average price 95 | add_avg_price(df, df_new) 96 | # average volume 97 | add_avg_volume(df, df_new) 98 | # standard deviation of prices 99 | add_std_price(df, df_new) 100 | # standard deviation of volumes 101 | add_std_volume(df, df_new) 102 | # # return 103 | add_return_feature(df, df_new) 104 | # the target 105 | df_new['close'] = df['Close'] 106 | df_new = df_new.dropna(axis=0) 107 | return df_new 108 | 109 | 110 | data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date') 111 | data = generate_features(data_raw) 112 | print(data.round(decimals=3).head(5)) 113 | 114 | 115 | # # Estimating with linear regression 116 | 117 | # ## Implementing linear regression from scratch 118 | 119 | import numpy as np 120 | 121 | 122 | def compute_prediction(X, weights): 123 | """ 124 | Compute the prediction y_hat based on current weights 125 | """ 126 | return np.dot(X, weights) 127 | 128 | 129 | def update_weights_gd(X_train, y_train, weights, learning_rate): 130 | """ 131 | Update weights by one step and return updated wights 132 | """ 133 | predictions = compute_prediction(X_train, weights) 134 | weights_delta = np.dot(X_train.T, y_train - predictions) 135 | m = y_train.shape[0] 136 | weights += learning_rate / float(m) * weights_delta 137 | return weights 138 | 139 | 140 | def compute_loss(X, y, weights): 141 | """ 142 | Compute the loss J(w) 143 | """ 144 | predictions = compute_prediction(X, weights) 145 | return np.mean((predictions - y) ** 2 / 2.0) 146 | 147 | 148 | def train_linear_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False, display_loss=500): 149 | """ 150 | Train a linear regression model with gradient descent, and return trained model 151 | """ 152 | if fit_intercept: 153 | intercept = np.ones((X_train.shape[0], 1)) 154 | X_train = np.hstack((intercept, X_train)) 155 | weights = np.zeros(X_train.shape[1]) 156 | for iteration in range(max_iter): 157 
| weights = update_weights_gd(X_train, y_train, weights, learning_rate) 158 | # Check the cost for every 500 (by default) iterations 159 | if iteration % display_loss == 0: 160 | print(compute_loss(X_train, y_train, weights)) 161 | return weights 162 | 163 | 164 | def predict(X, weights): 165 | if X.shape[1] == weights.shape[0] - 1: 166 | intercept = np.ones((X.shape[0], 1)) 167 | X = np.hstack((intercept, X)) 168 | return compute_prediction(X, weights) 169 | 170 | 171 | # A small example 172 | X_train = np.array([[6], [2], [3], [4], [1], [5], [2], [6], [4], [7]]) 173 | y_train = np.array([5.5, 1.6, 2.2, 3.7, 0.8, 5.2, 1.5, 5.3, 4.4, 6.8]) 174 | 175 | 176 | weights = train_linear_regression(X_train, y_train, max_iter=100, learning_rate=0.01, fit_intercept=True) 177 | 178 | 179 | X_test = np.array([[1.3], [3.5], [5.2], [2.8]]) 180 | 181 | predictions = predict(X_test, weights) 182 | 183 | import matplotlib.pyplot as plt 184 | plt.scatter(X_train[:, 0], y_train, marker='o', c='b') 185 | plt.scatter(X_test[:, 0], predictions, marker='*', c='k') 186 | plt.xlabel('x') 187 | plt.ylabel('y') 188 | plt.show() 189 | 190 | 191 | # The diabetes example 192 | from sklearn import datasets 193 | diabetes = datasets.load_diabetes() 194 | print(diabetes.data.shape) 195 | 196 | num_test = 30 197 | X_train = diabetes.data[:-num_test, :] 198 | y_train = diabetes.target[:-num_test] 199 | 200 | 201 | weights = train_linear_regression(X_train, y_train, max_iter=5000, learning_rate=1, fit_intercept=True) 202 | 203 | X_test = diabetes.data[-num_test:, :] 204 | y_test = diabetes.target[-num_test:] 205 | 206 | predictions = predict(X_test, weights) 207 | 208 | print(predictions) 209 | print(y_test) 210 | 211 | 212 | # ## Implementing linear regression with scikit-learn 213 | 214 | # Directly use SGDRegressor from scikit-learn 215 | from sklearn.linear_model import SGDRegressor 216 | regressor = SGDRegressor(loss='squared_error', 217 | penalty='l2', 218 | alpha=0.0001, 219 | learning_rate='constant', 220 | eta0=0.2, 221 | max_iter=100, 222 | random_state=42) 223 | 224 | 225 | regressor.fit(X_train, y_train) 226 | predictions = regressor.predict(X_test) 227 | print(predictions) 228 | 229 | 230 | # ## Implementing linear regression with TensorFlow 231 | 232 | import tensorflow as tf 233 | 234 | 235 | layer0 = tf.keras.layers.Dense(units=1, input_shape=[X_train.shape[1]]) 236 | model = tf.keras.Sequential(layer0) 237 | 238 | 239 | model.compile(loss='mean_squared_error', 240 | optimizer=tf.keras.optimizers.Adam(1)) 241 | 242 | 243 | model.fit(X_train, y_train, epochs=100, verbose=True) 244 | 245 | 246 | predictions = model.predict(X_test)[:, 0] 247 | print(predictions) 248 | 249 | 250 | # --- 251 | 252 | # Readers may ignore the next cell. 
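# --- An appended illustrative sketch (not part of the original chapter code) ---
# Scoring the diabetes predictions above with standard regression metrics, instead of
# eyeballing the printed arrays.
from sklearn.metrics import mean_squared_error, r2_score
print(f'MSE: {mean_squared_error(y_test, predictions):.2f}, R^2: {r2_score(y_test, predictions):.2f}')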
253 | 254 | get_ipython().system('jupyter nbconvert --to python ch5_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 255 | 256 | -------------------------------------------------------------------------------- /ch6/ch6_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 6 Predicting Stock Prices with Artificial Neural Networks 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Demystifying neural networks 13 | 14 | # ## Starting with a single-layer neural network 15 | 16 | # ### Layers in neural networks 17 | 18 | import numpy as np 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | def sigmoid(z): 23 | return 1.0 / (1 + np.exp(-z)) 24 | 25 | z = np.linspace(-8, 8, 1000) 26 | y = sigmoid(z) 27 | plt.plot(z, y) 28 | plt.xlabel('z') 29 | plt.ylabel('y(z)') 30 | plt.title('logistic') 31 | plt.grid() 32 | plt.show() 33 | 34 | 35 | def tanh(z): 36 | return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z)) 37 | 38 | z = np.linspace(-8, 8, 1000) 39 | y = tanh(z) 40 | plt.plot(z, y) 41 | plt.xlabel('z') 42 | plt.ylabel('y(z)') 43 | plt.title('tanh') 44 | plt.grid() 45 | plt.show() 46 | 47 | 48 | def relu(z): 49 | return np.maximum(np.zeros_like(z), z) 50 | 51 | 52 | z = np.linspace(-8, 8, 1000) 53 | y = relu(z) 54 | plt.plot(z, y) 55 | plt.xlabel('z') 56 | plt.ylabel('y(z)') 57 | plt.title('relu') 58 | plt.grid() 59 | plt.show() 60 | 61 | 62 | # # Building neural networks 63 | 64 | # ## Implementing neural networks from scratch 65 | 66 | def sigmoid_derivative(z): 67 | return sigmoid(z) * (1.0 - sigmoid(z)) 68 | 69 | 70 | def train(X, y, n_hidden, learning_rate, n_iter): 71 | m, n_input = X.shape 72 | W1 = np.random.randn(n_input, n_hidden) 73 | b1 = np.zeros((1, n_hidden)) 74 | W2 = np.random.randn(n_hidden, 1) 75 | b2 = np.zeros((1, 1)) 76 | for i in range(1, n_iter+1): 77 | Z2 = np.matmul(X, W1) + b1 78 | A2 = sigmoid(Z2) 79 | Z3 = np.matmul(A2, W2) + b2 80 | A3 = Z3 81 | 82 | dZ3 = A3 - y 83 | dW2 = np.matmul(A2.T, dZ3) 84 | db2 = np.sum(dZ3, axis=0, keepdims=True) 85 | 86 | dZ2 = np.matmul(dZ3, W2.T) * sigmoid_derivative(Z2) 87 | dW1 = np.matmul(X.T, dZ2) 88 | db1 = np.sum(dZ2, axis=0) 89 | 90 | W2 = W2 - learning_rate * dW2 / m 91 | b2 = b2 - learning_rate * db2 / m 92 | W1 = W1 - learning_rate * dW1 / m 93 | b1 = b1 - learning_rate * db1 / m 94 | 95 | if i % 100 == 0: 96 | cost = np.mean((y - A3) ** 2) 97 | print('Iteration %i, training loss: %f' % (i, cost)) 98 | 99 | model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2} 100 | return model 101 | 102 | 103 | from sklearn import datasets 104 | housing = datasets.fetch_california_housing() 105 | 106 | num_test = 10 # the last 10 samples as testing set 107 | 108 | from sklearn import preprocessing 109 | scaler = preprocessing.StandardScaler() 110 | 111 | X_train = housing.data[:-num_test, :] 112 | X_train = scaler.fit_transform(X_train) 113 | y_train = housing.target[:-num_test].reshape(-1, 1) 114 | X_test = housing.data[-num_test:, :] 115 | X_test = scaler.transform(X_test) 116 | y_test = housing.target[-num_test:] 117 | 118 | 119 | n_hidden = 20 120 | learning_rate = 0.1 121 | n_iter = 2000 122 | 123 | model = train(X_train, y_train, n_hidden, learning_rate, n_iter) 124 | 125 | 126 | def predict(x, model): 127 | W1 = model['W1'] 128 | b1 = model['b1'] 129 | W2 = model['W2'] 130 | b2 = model['b2'] 131 
| A2 = sigmoid(np.matmul(x, W1) + b1) 132 | A3 = np.matmul(A2, W2) + b2 133 | return A3 134 | 135 | 136 | predictions = predict(X_test, model) 137 | print(predictions[:, 0]) 138 | print(y_test) 139 | 140 | 141 | # ## Implementing neural networks with scikit-learn 142 | 143 | from sklearn.neural_network import MLPRegressor 144 | nn_scikit = MLPRegressor(hidden_layer_sizes=(16, 8), 145 | activation='relu', 146 | solver='adam', 147 | learning_rate_init=0.001, 148 | random_state=42, 149 | max_iter=2000) 150 | 151 | 152 | nn_scikit.fit(X_train, y_train.ravel()) 153 | predictions = nn_scikit.predict(X_test) 154 | print(predictions) 155 | 156 | 157 | from sklearn.metrics import mean_squared_error 158 | print(mean_squared_error(y_test, predictions)) 159 | 160 | 161 | # ## Implementing neural networks with TensorFlow 162 | 163 | import tensorflow as tf 164 | from tensorflow import keras 165 | 166 | tf.random.set_seed(42) 167 | 168 | 169 | model = keras.Sequential([ 170 | keras.layers.Dense(units=16, activation='relu'), 171 | keras.layers.Dense(units=8, activation='relu'), 172 | keras.layers.Dense(units=1) 173 | ]) 174 | 175 | 176 | model.compile(loss='mean_squared_error', 177 | optimizer=tf.keras.optimizers.Adam(0.01)) 178 | 179 | 180 | model.fit(X_train, y_train, epochs=300) 181 | 182 | 183 | predictions = model.predict(X_test)[:, 0] 184 | print(predictions) 185 | 186 | print(mean_squared_error(y_test, predictions)) 187 | 188 | 189 | # ## Implementing neural networks with PyTorch 190 | 191 | import torch 192 | import torch.nn as nn 193 | 194 | 195 | torch.manual_seed(42) 196 | model = nn.Sequential(nn.Linear(X_train.shape[1], 16), 197 | nn.ReLU(), 198 | nn.Linear(16, 8), 199 | nn.ReLU(), 200 | nn.Linear(8, 1)) 201 | 202 | 203 | loss_function = nn.MSELoss() 204 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 205 | 206 | 207 | X_train_torch = torch.from_numpy(X_train.astype(np.float32)) 208 | y_train_torch = torch.from_numpy(y_train.astype(np.float32)) 209 | 210 | 211 | def train_step(model, X_train, y_train, loss_function, optimizer): 212 | pred_train = model(X_train) 213 | loss = loss_function(pred_train, y_train) 214 | 215 | model.zero_grad() 216 | loss.backward() 217 | 218 | optimizer.step() 219 | 220 | return loss.item() 221 | 222 | 223 | for epoch in range(500): 224 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 225 | 226 | if epoch % 100 == 0: 227 | print(f"Epoch {epoch} - loss: {loss}") 228 | 229 | 230 | 231 | X_test_torch = torch.from_numpy(X_test.astype(np.float32)) 232 | predictions = model(X_test_torch).detach().numpy()[:, 0] 233 | print(predictions) 234 | 235 | print(mean_squared_error(y_test, predictions)) 236 | 237 | 238 | # # Preventing overfitting in neural networks 239 | 240 | # ## Dropout 241 | 242 | torch.manual_seed(42) 243 | model_with_dropout = nn.Sequential(nn.Linear(X_train.shape[1], 16), 244 | nn.ReLU(), 245 | nn.Dropout(0.1), 246 | nn.Linear(16, 8), 247 | nn.ReLU(), 248 | nn.Linear(8, 1)) 249 | 250 | 251 | optimizer = torch.optim.Adam(model_with_dropout.parameters(), lr=0.01) 252 | 253 | 254 | for epoch in range(1000): 255 | loss = train_step(model_with_dropout, X_train_torch, y_train_torch, loss_function, optimizer) 256 | 257 | if epoch % 100 == 0: 258 | print(f"Epoch {epoch} - loss: {loss}") 259 | 260 | 261 | 262 | model_with_dropout.eval() 263 | predictions = model_with_dropout(X_test_torch).detach().numpy()[:, 0] 264 | 265 | print(mean_squared_error(y_test, predictions)) 266 | 267 | 268 | # ## Early stopping 269 | 270 | 
torch.manual_seed(42) 271 | model = nn.Sequential(nn.Linear(X_train.shape[1], 16), 272 | nn.ReLU(), 273 | nn.Linear(16, 8), 274 | nn.ReLU(), 275 | nn.Linear(8, 1)) 276 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 277 | 278 | 279 | patience = 100 280 | epochs_no_improve = 0 281 | best_test_loss = float('inf') 282 | 283 | 284 | import copy 285 | 286 | best_model = model 287 | 288 | for epoch in range(500): 289 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 290 | 291 | predictions = model(X_test_torch).detach().numpy()[:, 0] 292 | test_loss = mean_squared_error(y_test, predictions) 293 | if test_loss > best_test_loss: 294 | epochs_no_improve += 1 295 | if epochs_no_improve > patience: 296 | print(f"Early stopped at epoch {epoch}") 297 | break 298 | else: 299 | epochs_no_improve = 0 300 | best_test_loss = test_loss 301 | best_model = copy.deepcopy(model) 302 | 303 | 304 | 305 | predictions = best_model(X_test_torch).detach().numpy()[:, 0] 306 | 307 | print(mean_squared_error(y_test, predictions)) 308 | 309 | 310 | # --- 311 | 312 | # Readers may ignore the next cell. 313 | 314 | get_ipython().system('jupyter nbconvert --to python ch6_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 315 | 316 | -------------------------------------------------------------------------------- /ch6/ch6_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 6 Predicting Stock Prices with Artificial Neural Networks 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Predicting stock prices with neural networks 13 | 14 | # ## Training a simple neural network 15 | 16 | import pandas as pd 17 | import numpy as np 18 | from sklearn.preprocessing import StandardScaler 19 | import torch 20 | import torch.nn as nn 21 | 22 | 23 | # Reusing the feature generation function we developed 24 | def generate_features(df): 25 | """ 26 | Generate features for a stock/index based on historical price and performance 27 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adj Close" 28 | @return: dataframe, data set with new features 29 | """ 30 | df_new = pd.DataFrame() 31 | # 6 original features 32 | df_new['open'] = df['Open'] 33 | df_new['open_1'] = df['Open'].shift(1) 34 | df_new['close_1'] = df['Close'].shift(1) 35 | df_new['high_1'] = df['High'].shift(1) 36 | df_new['low_1'] = df['Low'].shift(1) 37 | df_new['volume_1'] = df['Volume'].shift(1) 38 | # 31 generated features 39 | # average price 40 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 41 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 42 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 43 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 44 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 45 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 46 | # average volume 47 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 48 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 49 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 50 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 51 | df_new['ratio_avg_volume_5_365'] = 
df_new['avg_volume_5'] / df_new['avg_volume_365'] 52 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 53 | # standard deviation of prices 54 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 55 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 56 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 57 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 58 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 59 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 60 | # standard deviation of volumes 61 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 62 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 63 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 64 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 65 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 66 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 67 | # # return 68 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 69 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 70 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 71 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 72 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 73 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 74 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 75 | # the target 76 | df_new['close'] = df['Close'] 77 | df_new = df_new.dropna(axis=0) 78 | return df_new 79 | 80 | 81 | data_raw = pd.read_csv('19900101_20230630.csv', index_col='Date') 82 | data = generate_features(data_raw) 83 | 84 | start_train = '1990-01-01' 85 | end_train = '2022-12-31' 86 | 87 | start_test = '2023-01-01' 88 | end_test = '2023-06-30' 89 | 90 | data_train = data.loc[start_train:end_train] 91 | X_train = data_train.drop('close', axis=1).values 92 | y_train = data_train['close'].values 93 | 94 | data_test = data.loc[start_test:end_test] 95 | X_test = data_test.drop('close', axis=1).values 96 | y_test = data_test['close'].values 97 | 98 | 99 | scaler = StandardScaler() 100 | X_scaled_train = scaler.fit_transform(X_train) 101 | X_scaled_test = scaler.transform(X_test) 102 | 103 | 104 | X_train_torch = torch.from_numpy(X_scaled_train.astype(np.float32)) 105 | X_test_torch = torch.from_numpy(X_scaled_test.astype(np.float32)) 106 | y_train = y_train.reshape(y_train.shape[0], 1) 107 | y_train_torch = torch.from_numpy(y_train.astype(np.float32)) 108 | 109 | 110 | torch.manual_seed(42) 111 | model = nn.Sequential(nn.Linear(X_train.shape[1], 32), 112 | nn.ReLU(), 113 | nn.Linear(32, 1)) 114 | 115 | 116 | loss_function = nn.MSELoss() 117 | optimizer = torch.optim.Adam(model.parameters(), lr=0.3) 118 | 119 | 120 | def train_step(model, X_train, y_train, loss_function, optimizer): 121 | pred_train = model(X_train) 122 | loss = loss_function(pred_train, y_train) 123 | 124 | model.zero_grad() 125 | loss.backward() 126 | 127 | optimizer.step() 128 | 129 | return loss.item() 130 | 131 | 132 | for epoch in range(1000): 133 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 
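    # Report the running training loss every 100 epochs to monitor convergence.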
134 | 135 | if epoch % 100 == 0: 136 | print(f"Epoch {epoch} - loss: {loss}") 137 | 138 | 139 | 140 | predictions = model(X_test_torch).detach().numpy()[:, 0] 141 | 142 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 143 | print(f'MSE: {mean_squared_error(y_test, predictions):.3f}') 144 | print(f'MAE: {mean_absolute_error(y_test, predictions):.3f}') 145 | print(f'R^2: {r2_score(y_test, predictions):.3f}') 146 | 147 | 148 | # ## Fine-tuning the neural network 149 | 150 | from torch.utils.tensorboard import SummaryWriter 151 | 152 | 153 | hparams_config = { 154 | "hidden_size": [16, 32], 155 | "epochs": [1000, 3000], 156 | "lr": [0.1, 0.3], 157 | } 158 | 159 | 160 | def train_validate_model(hidden_size, epochs, lr): 161 | model = nn.Sequential(nn.Linear(X_train.shape[1], hidden_size), 162 | nn.ReLU(), 163 | nn.Linear(hidden_size, 1)) 164 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 165 | 166 | # Create the TensorBoard writer 167 | writer_path = f"runs/{experiment_num}/{hidden_size}/{epochs}/{lr}" 168 | writer = SummaryWriter(log_dir=writer_path) 169 | 170 | for epoch in range(epochs): 171 | loss = train_step(model, X_train_torch, y_train_torch, loss_function, optimizer) 172 | 173 | predictions = model(X_test_torch).detach().numpy()[:, 0] 174 | test_mse = mean_squared_error(y_test, predictions) 175 | 176 | writer.add_scalar( 177 | tag="train loss", 178 | scalar_value=loss, 179 | global_step=epoch, 180 | ) 181 | writer.add_scalar( 182 | tag="test loss", 183 | scalar_value=test_mse, 184 | global_step=epoch, 185 | ) 186 | 187 | test_r2 = r2_score(y_test, predictions) 188 | print(f'R^2: {test_r2:.3f}\n') 189 | 190 | # Add the hyperparameters and metrics to TensorBoard 191 | writer.add_hparams( 192 | { 193 | "hidden_size": hidden_size, 194 | "epochs": epochs, 195 | "lr": lr, 196 | }, 197 | { 198 | "test MSE": test_mse, 199 | "test R^2": test_r2, 200 | }, 201 | ) 202 | 203 | 204 | experiment_num = 0 205 | 206 | torch.manual_seed(42) 207 | for hidden_size in hparams_config["hidden_size"]: 208 | for epochs in hparams_config["epochs"]: 209 | for lr in hparams_config["lr"]: 210 | experiment_num += 1 211 | print(f"Experiment {experiment_num}: hidden_size = {hidden_size}, epochs = {epochs}, lr = {lr}") 212 | train_validate_model(hidden_size, epochs, lr) 213 | 214 | 215 | 216 | hidden_size = 16 217 | epochs = 3000 218 | lr = 0.3 219 | best_model = nn.Sequential(nn.Linear(X_train.shape[1], hidden_size), 220 | nn.ReLU(), 221 | nn.Linear(hidden_size, 1)) 222 | optimizer = torch.optim.Adam(best_model.parameters(), lr=lr) 223 | for epoch in range(epochs): 224 | train_step(best_model, X_train_torch, y_train_torch, loss_function, optimizer) 225 | 226 | predictions = best_model(X_test_torch).detach().numpy()[:, 0] 227 | 228 | 229 | import matplotlib.pyplot as plt 230 | plt.rc('xtick', labelsize=10) 231 | plt.rc('ytick', labelsize=10) 232 | plt.plot(data_test.index, y_test, c='k') 233 | plt.plot(data_test.index, predictions, c='b') 234 | plt.xticks(range(0, 130, 10), rotation=60) 235 | plt.xlabel('Date', fontsize=10) 236 | plt.ylabel('Close price', fontsize=10) 237 | plt.legend(['Truth', 'Neural network'], fontsize=10) 238 | plt.show() 239 | 240 | 241 | # --- 242 | 243 | # Readers may ignore the next cell. 
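# Optional: a minimal sketch (not in the original notebook) of the same
# hyperparameter sweep written with itertools.product instead of three nested
# loops; it assumes hparams_config, train_validate_model, and the global
# experiment_num defined in the cells above.
from itertools import product

experiment_num = 0
torch.manual_seed(42)
for hidden_size, epochs, lr in product(hparams_config["hidden_size"],
                                       hparams_config["epochs"],
                                       hparams_config["lr"]):
    experiment_num += 1
    print(f"Experiment {experiment_num}: hidden_size = {hidden_size}, "
          f"epochs = {epochs}, lr = {lr}")
    train_validate_model(hidden_size, epochs, lr)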
244 | 245 | get_ipython().system('jupyter nbconvert --to python ch6_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 246 | 247 | -------------------------------------------------------------------------------- /ch7/ch7_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 5 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 6 | # 7 | # Chapter 7 Mining the 20 Newsgroups Dataset with Text Analysis Techniques 8 | # 9 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 10 | # 11 | 12 | # # Touring popular NLP libraries and picking up NLP basics 13 | 14 | # ## Corpora 15 | 16 | import nltk 17 | # nltk.download() 18 | 19 | 20 | from nltk.corpus import names 21 | print(names.words()[:10]) 22 | 23 | print(len(names.words())) 24 | 25 | 26 | # ## Tokenization 27 | 28 | from nltk.tokenize import word_tokenize 29 | sent = '''I am reading a book. 30 | It is Python Machine Learning By Example, 31 | 4th edition.''' 32 | 33 | print(word_tokenize(sent)) 34 | 35 | 36 | sent2 = 'I have been to U.K. and U.S.A.' 37 | print(word_tokenize(sent2)) 38 | 39 | 40 | import spacy 41 | 42 | nlp = spacy.load('en_core_web_sm') 43 | tokens2 = nlp(sent2) 44 | 45 | print([token.text for token in tokens2]) 46 | 47 | 48 | from nltk.tokenize import sent_tokenize 49 | print(sent_tokenize(sent)) 50 | 51 | 52 | # ## PoS tagging 53 | 54 | import nltk 55 | tokens = word_tokenize(sent) 56 | print(nltk.pos_tag(tokens)) 57 | 58 | 59 | nltk.help.upenn_tagset('PRP') 60 | nltk.help.upenn_tagset('VBP') 61 | 62 | 63 | print([(token.text, token.pos_) for token in tokens2]) 64 | 65 | 66 | # ## NER 67 | 68 | tokens3 = nlp('The book written by Hayden Liu in 2024 was sold at $30 in America') 69 | print([(token_ent.text, token_ent.label_) for token_ent in tokens3.ents]) 70 | 71 | 72 | # ## Stemming and lemmatization 73 | 74 | from nltk.stem.porter import PorterStemmer 75 | porter_stemmer = PorterStemmer() 76 | 77 | 78 | porter_stemmer.stem('machines') 79 | 80 | 81 | porter_stemmer.stem('learning') 82 | 83 | 84 | from nltk.stem import WordNetLemmatizer 85 | lemmatizer = WordNetLemmatizer() 86 | 87 | 88 | lemmatizer.lemmatize('machines') 89 | 90 | 91 | lemmatizer.lemmatize('learning') 92 | 93 | 94 | # # Getting the newsgroups data 95 | 96 | from sklearn.datasets import fetch_20newsgroups 97 | 98 | 99 | groups = fetch_20newsgroups() 100 | 101 | 102 | groups.keys() 103 | 104 | 105 | groups['target_names'] 106 | 107 | 108 | groups['target'] 109 | 110 | 111 | import numpy as np 112 | np.unique(groups.target) 113 | 114 | 115 | import seaborn as sns 116 | import matplotlib.pyplot as plt 117 | sns.histplot(groups.target, bins=20) 118 | plt.xticks(range(0, 20, 1)) 119 | plt.show() 120 | 121 | 122 | groups.data[0] 123 | 124 | 125 | groups.target[0] 126 | 127 | 128 | groups.target_names[groups.target[0]] 129 | 130 | 131 | # # Thinking about features for text data 132 | 133 | # ## Counting the occurrence of each word token 134 | 135 | from sklearn.feature_extraction.text import CountVectorizer 136 | 137 | 138 | count_vector = CountVectorizer(max_features=500) 139 | data_count = count_vector.fit_transform(groups.data) 140 | 141 | 142 | data_count 143 | 144 | 145 | data_count[0] 146 | 147 | 148 | data_count.toarray()[0] 149 | 150 | 151 | print(count_vector.get_feature_names_out()) 152 | 153 | 154 | # ## Text preprocessing 155 | 156 | data_cleaned = [] 157 | for doc in groups.data: 158 | doc_cleaned = ' '.join(word for 
word in doc.split() if word.isalpha()) 159 | data_cleaned.append(doc_cleaned) 160 | 161 | 162 | # ## Dropping stop words 163 | 164 | from sklearn.feature_extraction import _stop_words 165 | print(_stop_words.ENGLISH_STOP_WORDS) 166 | 167 | 168 | count_vector = CountVectorizer(stop_words="english",max_features=500) 169 | 170 | 171 | # ## Reducing inflectional and derivational forms of words 172 | 173 | all_names = set(names.words()) 174 | 175 | 176 | def get_cleaned_data(groups, lemmatizer, remove_words): 177 | data_cleaned = [] 178 | 179 | for doc in groups.data: 180 | doc = doc.lower() 181 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in remove_words) 182 | data_cleaned.append(doc_cleaned) 183 | 184 | return data_cleaned 185 | 186 | 187 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 188 | 189 | data_cleaned = get_cleaned_data(groups, lemmatizer, all_names) 190 | 191 | data_cleaned_count = count_vector_sw.fit_transform(data_cleaned) 192 | 193 | 194 | sum(len(set(doc.split())) for doc in data_cleaned) 195 | 196 | 197 | print(count_vector_sw.get_feature_names_out()) 198 | 199 | 200 | # # Visualizing the newsgroups data with t-SNE 201 | 202 | # ## t-SNE for dimensionality reduction 203 | 204 | from sklearn.manifold import TSNE 205 | 206 | 207 | categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space'] 208 | 209 | groups_3 = fetch_20newsgroups(categories=categories_3) 210 | 211 | 212 | data_cleaned = get_cleaned_data(groups_3, lemmatizer, all_names) 213 | 214 | data_cleaned_count_3 = count_vector_sw.fit_transform(data_cleaned) 215 | 216 | 217 | tsne_model = TSNE(n_components=2, perplexity=40, random_state=42, learning_rate=500) 218 | 219 | data_tsne = tsne_model.fit_transform(data_cleaned_count_3.toarray()) 220 | 221 | 222 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_3.target) 223 | plt.show() 224 | 225 | 226 | categories_5 = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 227 | 'comp.windows.x'] 228 | groups_5 = fetch_20newsgroups(categories=categories_5) 229 | 230 | data_cleaned = get_cleaned_data(groups_5, lemmatizer, all_names) 231 | 232 | data_cleaned_count_5 = count_vector_sw.fit_transform(data_cleaned) 233 | 234 | data_tsne = tsne_model.fit_transform(data_cleaned_count_5.toarray()) 235 | 236 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_5.target) 237 | 238 | plt.show() 239 | 240 | 241 | # # Building embedding models using shadow neural networks 242 | 243 | # ## Utilizing pre-trained embedding models 244 | 245 | import gensim.downloader as api 246 | model = api.load("glove-twitter-25") 247 | 248 | 249 | vector = model['computer'] 250 | print('Word computer is embedded into:\n', vector) 251 | 252 | 253 | similar_words = model.most_similar("computer") 254 | print('Top ten words most contextually relevant to computer:\n', 255 | similar_words) 256 | 257 | 258 | doc_sample = ['i', 'love', 'reading', 'python', 'machine', 259 | 'learning', 'by', 'example'] 260 | doc_vector = np.mean([model[word] for word in doc_sample], axis=0) 261 | print('The document sample is embedded into:\n', doc_vector) 262 | 263 | 264 | # --- 265 | 266 | # Readers may ignore the next cell. 
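# Optional: a small helper (an addition, not from the book) that averages GloVe
# vectors while skipping out-of-vocabulary tokens, so a document containing
# unknown words does not raise a KeyError; it assumes the 25-dimensional
# glove-twitter-25 `model` loaded above.
def embed_document(tokens, embedding_model, dim=25):
    vectors = [embedding_model[word] for word in tokens if word in embedding_model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

print(embed_document(['i', 'love', 'python', 'zzzunknowntoken'], model))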
267 | 268 | get_ipython().system('jupyter nbconvert --to python ch7_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 269 | 270 | -------------------------------------------------------------------------------- /ch8/ch8_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 8 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Getting started with k-means clustering 11 | 12 | # ## Implementing k-means from scratch 13 | 14 | from sklearn import datasets 15 | iris = datasets.load_iris() 16 | X = iris.data[:, 2:4] 17 | y = iris.target 18 | 19 | 20 | import numpy as np 21 | from matplotlib import pyplot as plt 22 | plt.scatter(X[:,0], X[:,1], c=y) 23 | plt.show() 24 | 25 | 26 | k = 3 27 | np.random.seed(0) 28 | random_index = np.random.choice(range(len(X)), k) 29 | centroids = X[random_index] 30 | 31 | 32 | def visualize_centroids(X, centroids): 33 | plt.scatter(X[:, 0], X[:, 1]) 34 | plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505') 35 | plt.show() 36 | 37 | visualize_centroids(X, centroids) 38 | 39 | 40 | def dist(a, b): 41 | return np.linalg.norm(a - b, axis=1) 42 | 43 | 44 | def assign_cluster(x, centroids): 45 | distances = dist(x, centroids) 46 | cluster = np.argmin(distances) 47 | return cluster 48 | 49 | 50 | def update_centroids(X, centroids, clusters): 51 | for i in range(k): 52 | cluster_i = np.where(clusters == i) 53 | centroids[i] = np.mean(X[cluster_i], axis=0) 54 | 55 | 56 | tol = 0.0001 57 | max_iter = 100 58 | 59 | iter = 0 60 | centroids_diff = 100000 61 | clusters = np.zeros(len(X)) 62 | 63 | 64 | from copy import deepcopy 65 | while iter < max_iter and centroids_diff > tol: 66 | for i in range(len(X)): 67 | clusters[i] = assign_cluster(X[i], centroids) 68 | centroids_prev = deepcopy(centroids) 69 | update_centroids(X, centroids, clusters) 70 | iter += 1 71 | centroids_diff = np.linalg.norm(centroids - centroids_prev) 72 | print('Iteration:', str(iter)) 73 | print('Centroids:\n', centroids) 74 | print(f'Centroids move: {centroids_diff:5.4f}') 75 | visualize_centroids(X, centroids) 76 | 77 | 78 | plt.scatter(X[:, 0], X[:, 1], c=clusters) 79 | plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='r') 80 | plt.show() 81 | 82 | 83 | # ## Implementing k-means with scikit-learn 84 | 85 | from sklearn.cluster import KMeans 86 | kmeans_sk = KMeans(n_clusters=3, n_init='auto', random_state=42) 87 | 88 | 89 | kmeans_sk.fit(X) 90 | 91 | 92 | clusters_sk = kmeans_sk.labels_ 93 | centroids_sk = kmeans_sk.cluster_centers_ 94 | 95 | 96 | plt.scatter(X[:, 0], X[:, 1], c=clusters_sk) 97 | plt.scatter(centroids_sk[:, 0], centroids_sk[:, 1], marker='*', s=200, c='r') 98 | plt.show() 99 | 100 | 101 | # ## Choosing the value of k 102 | 103 | X = iris.data 104 | y = iris.target 105 | k_list = list(range(1, 7)) 106 | sse_list = [0] * len(k_list) 107 | 108 | 109 | for k_ind, k in enumerate(k_list): 110 | kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42) 111 | kmeans.fit(X) 112 | clusters = kmeans.labels_ 113 | centroids = kmeans.cluster_centers_ 114 | 115 | sse = 0 116 | for i in range(k): 117 | cluster_i = np.where(clusters == i) 118 | 119 | sse += np.linalg.norm(X[cluster_i] - centroids[i]) 120 | 121 | print(f'k={k}, SSE={sse}') 
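    # Keep the SSE for this k; the list is plotted below to locate the elbow.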
122 | sse_list[k_ind] = sse 123 | 124 | 125 | plt.plot(k_list, sse_list) 126 | plt.show() 127 | 128 | 129 | # --- 130 | 131 | # Readers may ignore the next cell. 132 | 133 | get_ipython().system('jupyter nbconvert --to python ch8_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 134 | 135 | -------------------------------------------------------------------------------- /ch8/ch8_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 8 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Clustering newsgroups dataset 11 | 12 | # ## Clustering newsgroups data using k-means 13 | 14 | from sklearn.datasets import fetch_20newsgroups 15 | 16 | categories = [ 17 | 'alt.atheism', 18 | 'talk.religion.misc', 19 | 'comp.graphics', 20 | 'sci.space', 21 | ] 22 | 23 | groups = fetch_20newsgroups(subset='all', categories=categories) 24 | 25 | labels = groups.target 26 | label_names = groups.target_names 27 | 28 | 29 | from nltk.stem import WordNetLemmatizer 30 | from nltk.corpus import names 31 | all_names = set(names.words()) 32 | lemmatizer = WordNetLemmatizer() 33 | 34 | def get_cleaned_data(groups, lemmatizer, remove_words): 35 | data_cleaned = [] 36 | 37 | for doc in groups.data: 38 | doc = doc.lower() 39 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in remove_words) 40 | data_cleaned.append(doc_cleaned) 41 | 42 | return data_cleaned 43 | 44 | data_cleaned = get_cleaned_data(groups, lemmatizer, all_names) 45 | 46 | 47 | from sklearn.feature_extraction.text import CountVectorizer 48 | count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 49 | data_cv = count_vector.fit_transform(data_cleaned) 50 | 51 | 52 | from sklearn.cluster import KMeans 53 | k = 4 54 | kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42) 55 | 56 | kmeans.fit(data_cv) 57 | 58 | 59 | clusters = kmeans.labels_ 60 | 61 | from collections import Counter 62 | print(Counter(clusters)) 63 | 64 | 65 | from sklearn.feature_extraction.text import TfidfVectorizer 66 | tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2) 67 | 68 | 69 | data_tv = tfidf_vector.fit_transform(data_cleaned) 70 | kmeans.fit(data_tv) 71 | clusters = kmeans.labels_ 72 | print(Counter(clusters)) 73 | 74 | 75 | import numpy as np 76 | cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)} 77 | 78 | terms = tfidf_vector.get_feature_names_out() 79 | centroids = kmeans.cluster_centers_ 80 | for cluster, index_list in cluster_label.items(): 81 | counter = Counter(cluster_label[cluster]) 82 | print(f'cluster_{cluster}: {len(index_list)} samples') 83 | for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True): 84 | print(f'- {label_names[label_index]}: {count} samples') 85 | print('Top 10 terms:') 86 | for ind in centroids[cluster].argsort()[-10:]: 87 | print('%s ' % terms[ind], end="") 88 | print('\n') 89 | 90 | 91 | # ## Describing the clusters using GPT 92 | 93 | keywords = ' '.join(terms[ind] for ind in centroids[0].argsort()[-100:]) 94 | 95 | 96 | print(keywords) 97 | 98 | 99 | import openai 100 | 101 | 102 | # openai.api_key = '' 103 | 104 | 105 | def 
get_completion(prompt, model="text-davinci-003"): 106 | messages = [{"role": "user", "content": prompt}] 107 | response = openai.ChatCompletion.create( 108 | model=model, 109 | messages=messages, 110 | temperature=0 111 | ) 112 | return response.choices[0].message["content"] 113 | 114 | 115 | # response = get_completion(f"Describe a common topic based on the following keywords: {keywords}") 116 | # print(response) 117 | 118 | 119 | # # Discovering underlying topics in newsgroups 120 | 121 | # ## Topic modeling using NMF 122 | 123 | from sklearn.decomposition import NMF 124 | 125 | t = 20 126 | nmf = NMF(n_components=t, random_state=42) 127 | 128 | 129 | nmf.fit(data_cv) 130 | 131 | print(nmf.components_) 132 | 133 | 134 | terms_cv = count_vector.get_feature_names_out() 135 | for topic_idx, topic in enumerate(nmf.components_): 136 | print("Topic {}:" .format(topic_idx)) 137 | print(" ".join([terms_cv[i] for i in topic.argsort()[-10:]])) 138 | 139 | 140 | # ## Topic modeling using LDA 141 | 142 | from sklearn.decomposition import LatentDirichletAllocation 143 | 144 | t = 20 145 | lda = LatentDirichletAllocation(n_components=t, learning_method='batch',random_state=42) 146 | 147 | 148 | lda.fit(data_cv) 149 | 150 | print(lda.components_) 151 | 152 | 153 | for topic_idx, topic in enumerate(lda.components_): 154 | print("Topic {}:" .format(topic_idx)) 155 | print(" ".join([terms_cv[i] for i in topic.argsort()[-10:]])) 156 | 157 | 158 | data_cleaned = get_cleaned_data(groups_3, lemmatizer, all_names) 159 | 160 | 161 | data_embedding = [] 162 | 163 | for doc in data_cleaned: 164 | # print(doc) 165 | doc_vector = np.mean([model[word] for word in doc.split() if word in model], axis=0) 166 | data_embedding.append(doc_vector) 167 | 168 | 169 | data_tsne = tsne_model.fit_transform(np.array(data_embedding)) 170 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_3.target) 171 | 172 | plt.show() 173 | 174 | 175 | # --- 176 | 177 | # Readers may ignore the next cell. 
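# Note: get_completion() above pairs openai.ChatCompletion with
# "text-davinci-003", which is a completions-only model; the chat endpoint
# expects a chat model such as "gpt-3.5-turbo".
#
# The embedding/t-SNE cells above also reuse names defined in ch7_part1.py
# (groups_3, model, tsne_model, plt). A minimal set of definitions, mirroring
# that notebook, to run those cells standalone:
import matplotlib.pyplot as plt
import gensim.downloader as api
from sklearn.manifold import TSNE

categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space']
groups_3 = fetch_20newsgroups(categories=categories_3)
model = api.load("glove-twitter-25")
tsne_model = TSNE(n_components=2, perplexity=40, random_state=42, learning_rate=500)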
178 | 179 | get_ipython().system('jupyter nbconvert --to python ch8_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 180 | 181 | -------------------------------------------------------------------------------- /ch9/ch9_part1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 9 Recognizing Faces with Support Vector Machine 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Finding the separating boundary with SVM 11 | 12 | # ## Implementing SVM 13 | 14 | from sklearn.datasets import load_breast_cancer 15 | cancer_data = load_breast_cancer() 16 | 17 | X = cancer_data.data 18 | Y = cancer_data.target 19 | 20 | print('Input data size :', X.shape) 21 | print('Output data size :', Y.shape) 22 | print('Label names:', cancer_data.target_names) 23 | n_pos = (Y == 1).sum() 24 | n_neg = (Y == 0).sum() 25 | print(f'{n_pos} positive samples and {n_neg} negative samples.') 26 | 27 | 28 | from sklearn.model_selection import train_test_split 29 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) 30 | 31 | 32 | from sklearn.svm import SVC 33 | clf = SVC(kernel='linear', C=1.0, random_state=42) 34 | 35 | 36 | clf.fit(X_train, Y_train) 37 | 38 | 39 | accuracy = clf.score(X_test, Y_test) 40 | print(f'The accuracy is: {accuracy*100:.1f}%') 41 | 42 | 43 | # ## Scenario 4 – dealing with more than two classes 44 | 45 | from sklearn.datasets import load_wine 46 | wine_data = load_wine() 47 | X = wine_data.data 48 | Y = wine_data.target 49 | 50 | print('Input data size :', X.shape) 51 | print('Output data size :', Y.shape) 52 | print('Label names:', wine_data.target_names) 53 | n_class0 = (Y == 0).sum() 54 | n_class1 = (Y == 1).sum() 55 | n_class2 = (Y == 2).sum() 56 | print(f'{n_class0} class0 samples,\n{n_class1} class1 samples,\n{n_class2} class2 samples.') 57 | 58 | 59 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) 60 | 61 | 62 | clf = SVC(kernel='linear', C=1.0, random_state=42) 63 | clf.fit(X_train, Y_train) 64 | 65 | 66 | accuracy = clf.score(X_test, Y_test) 67 | print(f'The accuracy is: {accuracy*100:.1f}%') 68 | 69 | 70 | from sklearn.metrics import classification_report 71 | pred = clf.predict(X_test) 72 | print(classification_report(Y_test, pred)) 73 | 74 | 75 | # ## Scenario 5 – solving linearly non-separable problems with kernels 76 | 77 | import numpy as np 78 | import matplotlib.pyplot as plt 79 | 80 | 81 | X = np.c_[# negative class 82 | (.3, -.8), 83 | (-1.5, -1), 84 | (-1.3, -.8), 85 | (-1.1, -1.3), 86 | (-1.2, -.3), 87 | (-1.3, -.5), 88 | (-.6, 1.1), 89 | (-1.4, 2.2), 90 | (1, 1), 91 | # positive class 92 | (1.3, .8), 93 | (1.2, .5), 94 | (.2, -2), 95 | (.5, -2.4), 96 | (.2, -2.3), 97 | (0, -2.7), 98 | (1.3, 2.1)].T 99 | Y = [-1] * 8 + [1] * 8 100 | 101 | 102 | gamma_option = [1, 2, 4] 103 | 104 | 105 | for i, gamma in enumerate(gamma_option, 1): 106 | svm = SVC(kernel='rbf', gamma=gamma) 107 | svm.fit(X, Y) 108 | plt.scatter(X[:, 0], X[:, 1], c=['b']*8+['r']*8, zorder=10) 109 | plt.axis('tight') 110 | XX, YY = np.mgrid[-3:3:200j, -3:3:200j] 111 | Z = svm.decision_function(np.c_[XX.ravel(), YY.ravel()]) 112 | Z = Z.reshape(XX.shape) 113 | plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) 114 | plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) 115 | plt.title('gamma = %d' 
% gamma) 116 | plt.show() 117 | 118 | 119 | # --- 120 | 121 | # Readers may ignore the next cell. 122 | 123 | get_ipython().system('jupyter nbconvert --to python ch9_part1.ipynb --TemplateExporter.exclude_input_prompt=True') 124 | 125 | -------------------------------------------------------------------------------- /ch9/ch9_part2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing) 5 | # 6 | # Chapter 9 Recognizing Faces with Support Vector Machine 7 | # 8 | # Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com) 9 | 10 | # # Classifying face images with SVM 11 | 12 | # ## Exploring the face image dataset 13 | 14 | from sklearn.datasets import fetch_lfw_people 15 | 16 | # face_data = fetch_lfw_people(min_faces_per_person=80) 17 | face_data = fetch_lfw_people(data_home='./', min_faces_per_person=80, download_if_missing=False) 18 | 19 | 20 | X = face_data.data 21 | Y = face_data.target 22 | 23 | print('Input data size :', X.shape) 24 | print('Output data size :', Y.shape) 25 | print('Label names:', face_data.target_names) 26 | 27 | 28 | for i in range(5): 29 | print(f'Class {i} has {(Y == i).sum()} samples.') 30 | 31 | 32 | import matplotlib.pyplot as plt 33 | 34 | fig, ax = plt.subplots(3, 4) 35 | for i, axi in enumerate(ax.flat): 36 | axi.imshow(face_data.images[i], cmap='bone') 37 | axi.set(xticks=[], yticks=[], 38 | xlabel=face_data.target_names[face_data.target[i]]) 39 | 40 | plt.show() 41 | 42 | 43 | # ## Building an SVM-based image classifier 44 | 45 | from sklearn.model_selection import train_test_split 46 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) 47 | 48 | 49 | from sklearn.svm import SVC 50 | clf = SVC(class_weight='balanced', random_state=42) 51 | 52 | 53 | from sklearn.model_selection import GridSearchCV 54 | parameters = {'C': [10, 100, 300], 55 | 'gamma': [0.0001, 0.0003, 0.001], 56 | 'kernel' : ['rbf', 'linear'] } 57 | 58 | grid_search = GridSearchCV(clf, parameters, n_jobs=-1, cv=5) 59 | 60 | 61 | grid_search.fit(X_train, Y_train) 62 | 63 | 64 | print('The best model:\n', grid_search.best_params_) 65 | 66 | 67 | print('The best averaged performance:', grid_search.best_score_) 68 | 69 | 70 | clf_best = grid_search.best_estimator_ 71 | 72 | print(f'The accuracy is: {clf_best.score(X_test, Y_test)*100:.1f}%') 73 | 74 | 75 | pred = clf_best.predict(X_test) 76 | 77 | from sklearn.metrics import classification_report 78 | print(classification_report(Y_test, pred, target_names=face_data.target_names)) 79 | 80 | 81 | # ## Boosting image classification performance with PCA 82 | 83 | from sklearn.decomposition import PCA 84 | pca = PCA(n_components=100, whiten=True, random_state=42) 85 | svc = SVC(class_weight='balanced', kernel='rbf', random_state=42) 86 | 87 | from sklearn.pipeline import Pipeline 88 | model = Pipeline([('pca', pca), 89 | ('svc', svc)]) 90 | 91 | 92 | parameters_pipeline = {'svc__C': [1, 3, 10], 93 | 'svc__gamma': [0.01, 0.03, 0.003]} 94 | grid_search = GridSearchCV(model, parameters_pipeline, n_jobs=-1, cv=5) 95 | 96 | grid_search.fit(X_train, Y_train) 97 | 98 | 99 | print('The best model:\n', grid_search.best_params_) 100 | print('The best averaged performance:', grid_search.best_score_) 101 | 102 | model_best = grid_search.best_estimator_ 103 | print(f'The accuracy is: {model_best.score(X_test, Y_test)*100:.1f}%') 104 | pred = model_best.predict(X_test) 105 | 
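# Per-class precision, recall, and F1 for the PCA + SVC pipeline on the held-out faces: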
print(classification_report(Y_test, pred, target_names=face_data.target_names)) 106 | 107 | 108 | # # Estimating with support vector regression 109 | 110 | # ## Implementing SVR 111 | 112 | from sklearn import datasets 113 | diabetes = datasets.load_diabetes() 114 | 115 | X = diabetes.data 116 | Y = diabetes.target 117 | 118 | print('Input data size :', X.shape) 119 | print('Output data size :', Y.shape) 120 | 121 | 122 | 123 | num_test = 30 # the last 30 samples as testing set 124 | X_train = diabetes.data[:-num_test, :] 125 | y_train = diabetes.target[:-num_test] 126 | X_test = diabetes.data[-num_test:, :] 127 | y_test = diabetes.target[-num_test:] 128 | 129 | 130 | from sklearn.svm import SVR 131 | regressor = SVR(C=100, kernel='linear') 132 | regressor.fit(X_train, y_train) 133 | 134 | 135 | from sklearn.metrics import r2_score 136 | predictions = regressor.predict(X_test) 137 | print(r2_score(y_test, predictions)) 138 | 139 | 140 | parameters = {'C': [300, 500, 700], 141 | 'gamma': [0.3, 0.6, 1], 142 | 'kernel' : ['rbf', 'linear']} 143 | 144 | regressor = SVR() 145 | grid_search = GridSearchCV(regressor, parameters, n_jobs=-1, cv=5) 146 | 147 | 148 | grid_search.fit(X_train, y_train) 149 | 150 | 151 | print('The best model:\n', grid_search.best_params_) 152 | 153 | 154 | model_best = grid_search.best_estimator_ 155 | predictions = model_best.predict(X_test) 156 | 157 | print(r2_score(y_test, predictions)) 158 | 159 | 160 | # --- 161 | 162 | # Readers may ignore the next cell. 163 | 164 | get_ipython().system('jupyter nbconvert --to python ch9_part2.ipynb --TemplateExporter.exclude_input_prompt=True') 165 | 166 | --------------------------------------------------------------------------------