├── Chapter01
│   ├── supervised_helloworld.py
│   └── unsupervised_helloworld.py
├── Chapter02
│   ├── breast_cancer_analysis.py
│   ├── distance_test.py
│   ├── distance_test_2.py
│   ├── inertia_blobs.py
│   ├── knn.py
│   ├── vq_gaussian.py
│   └── vq_image.py
├── Chapter03
│   ├── absenteeism_at_work_analysis.py
│   ├── k_medoids.py
│   ├── mean_shift.py
│   ├── online_clustering.py
│   └── spectral_clustering.py
├── Chapter04
│   ├── connectivity_constraints.py
│   ├── dendrogram.py
│   └── water_treatment_plant_analysis.py
├── Chapter05
│   ├── bayesian_gaussian_mixture.py
│   ├── fuzzy_cmeans.py
│   ├── gaussian_mixture.py
│   └── generative_gaussian_mixture.py
├── Chapter06
│   ├── histogram.py
│   ├── isolation_forest.py
│   ├── kddcup99_anomaly_detection.py
│   ├── kernel_density_estimation.py
│   └── one_class_svm.py
├── Chapter07
│   ├── factor_analysis.py
│   ├── fastica.py
│   ├── kernel_pca.py
│   ├── lda.py
│   ├── nnmf.py
│   ├── pca.py
│   ├── sparse_pca.py
│   └── whitening.py
├── Chapter08
│   ├── deep_convolutional_autoencoder.py
│   ├── denoising_autoencoder.py
│   ├── rubner-tavan-network.py
│   ├── sanger_network.py
│   ├── sparse_autoencoder.py
│   ├── unsupervised_dbn.py
│   └── variational_autoencoder.py
├── Chapter09
│   ├── dcgan.py
│   ├── som.py
│   └── wgan.py
├── LICENSE
└── README.md
/Chapter01/supervised_helloworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | from sklearn.linear_model import LinearRegression 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Create the dataset 15 | T = np.expand_dims(np.linspace(0.0, 10.0, num=100), axis=1) 16 | X = (T * np.random.uniform(1.0, 1.5, size=(100, 1))) + np.random.normal(0.0, 3.5, size=(100, 1)) 17 | df = pd.DataFrame(np.concatenate([T, X], axis=1), columns=['t', 'x']) 18 | 19 | # Perform the linear regression 20 | lr = LinearRegression() 21 | lr.fit(T, X) 22 | 23 | # Print the equation 24 | print('x(t) = {0:.3f}t + {1:.3f}'.format(lr.coef_[0][0], lr.intercept_[0])) 25 | 26 | # Show the diagram 27 | sns.set(style="white", palette="muted", color_codes=True) 28 | ax = sns.lmplot(data=df, x='t', y='x', height=8) 29 | plt.show() --------------------------------------------------------------------------------
/Chapter01/unsupervised_helloworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from scipy.stats import poisson 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Create the initial observation set 15 | obs = np.array([7, 11, 9, 9, 8, 11, 9, 9, 8, 7, 11, 8, 9, 9, 11, 7, 10, 9, 10, 9, 7, 8, 9, 10, 13]) 16 | mu = np.mean(obs) 17 | 18 | print('mu = {}'.format(mu)) 19 | 20 | # Show the distribution 21 | sns.set(style="white", palette="muted", color_codes=True) 22 | fig, ax = plt.subplots(figsize=(14, 7), frameon=False) 23 | 24 | sns.distplot(obs, kde=True, color="b", ax=ax) 25 | ax.spines['top'].set_visible(False) 26 | ax.spines['right'].set_visible(False) 27 | plt.show() 28 | 29 | # Print some probabilities 30 | print('P(more than 8 trains) = {}'.format(poisson.sf(8, mu))) 31 | print('P(more than 9 trains) = {}'.format(poisson.sf(9, mu))) 32 | print('P(more than 10 trains) = {}'.format(poisson.sf(10, mu))) 33 | print('P(more than 11 trains) = {}'.format(poisson.sf(11, mu))) 34 | 35 | # Add new observations 36 | new_obs = np.array([13, 14, 11, 10, 11, 13, 13, 9, 11, 14, 12, 11, 12,
37 | 14, 8, 13, 10, 14, 12, 13, 10, 9, 14, 13, 11, 14, 13, 14]) 38 | 39 | obs = np.concatenate([obs, new_obs]) 40 | mu = np.mean(obs) 41 | 42 | print('mu = {}'.format(mu)) 43 | 44 | # Repeat the analysis of the same probabilities 45 | print('P(more than 8 trains) = {}'.format(poisson.sf(8, mu))) 46 | print('P(more than 9 trains) = {}'.format(poisson.sf(9, mu))) 47 | print('P(more than 10 trains) = {}'.format(poisson.sf(10, mu))) 48 | print('P(more than 11 trains) = {}'.format(poisson.sf(11, mu))) 49 | 50 | # Generate 2000 samples from the Poisson process 51 | syn = poisson.rvs(mu, size=2000) 52 | 53 | # Plot the complete distribution 54 | fig, ax = plt.subplots(figsize=(14, 7), frameon=False) 55 | 56 | sns.distplot(syn, kde=True, color="b", ax=ax) 57 | ax.spines['top'].set_visible(False) 58 | ax.spines['right'].set_visible(False) 59 | plt.show() 60 | 61 | -------------------------------------------------------------------------------- /Chapter02/breast_cancer_analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | import seaborn as sns 7 | 8 | from sklearn.cluster import KMeans 9 | from sklearn.manifold import TSNE 10 | 11 | from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, \ 12 | adjusted_mutual_info_score, adjusted_rand_score 13 | from sklearn.metrics import silhouette_samples 14 | from sklearn.metrics.cluster import contingency_matrix 15 | 16 | 17 | # For reproducibility 18 | np.random.seed(1000) 19 | 20 | 21 | # Download from: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data 22 | # Change with the actual path 23 | bc_dataset_path = '/wdbc.data' 24 | 25 | bc_dataset_columns = ['id','diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 26 | 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 27 | 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 28 | 'radius_se','texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 29 | 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 30 | 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 31 | 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 32 | 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'] 33 | 34 | if __name__ == '__main__': 35 | # Load the dataset 36 | df = pd.read_csv(bc_dataset_path, index_col=0, names=bc_dataset_columns).fillna(0.0) 37 | 38 | # Show the overall statistical properties 39 | print(df.describe()) 40 | 41 | # Show the pair-plot 42 | sns.set() 43 | 44 | with sns.plotting_context("notebook", font_scale=1.2): 45 | sns.pairplot(df, 46 | vars=['perimeter_mean', 'area_mean', 'smoothness_mean', 'concavity_mean', 'symmetry_mean'], 47 | hue="diagnosis") 48 | 49 | plt.show() 50 | 51 | # Project the dataset on a bidimensional plane 52 | cdf = df.drop(['diagnosis'], axis=1) 53 | 54 | tsne = TSNE(n_components=2, perplexity=10, random_state=1000) 55 | data_tsne = tsne.fit_transform(cdf) 56 | 57 | df_tsne = pd.DataFrame(data_tsne, columns=['x', 'y'], index=cdf.index) 58 | dff = pd.concat([df, df_tsne], axis=1) 59 | 60 | # Show the diagram 61 | fig, ax = plt.subplots(figsize=(18, 11)) 62 | 63 | with sns.plotting_context("notebook", font_scale=1.5): 64 | sns.scatterplot(x='x', 65 | y='y', 66 | hue='diagnosis', 67 | size='area_mean', 68 | style='diagnosis', 69 | sizes=(30, 400), 70 
| palette=sns.color_palette("husl", 2), 71 | data=dff, 72 | ax=ax) 73 | 74 | ax.set_xlabel(r'$x$') 75 | ax.set_ylabel(r'$y$') 76 | 77 | plt.show() 78 | 79 | # Perform a K-Means clustering with K=2 80 | km = KMeans(n_clusters=2, max_iter=1000, random_state=1000) 81 | Y_pred = km.fit_predict(cdf) 82 | 83 | df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index) 84 | kmdff = pd.concat([dff, df_km], axis=1) 85 | 86 | # Show the clustering result 87 | fig, ax = plt.subplots(figsize=(18, 11)) 88 | 89 | with sns.plotting_context("notebook", font_scale=1.5): 90 | sns.scatterplot(x='x', 91 | y='y', 92 | hue='prediction', 93 | size='area_mean', 94 | style='diagnosis', 95 | sizes=(30, 400), 96 | palette=sns.color_palette("husl", 2), 97 | data=kmdff, 98 | ax=ax) 99 | 100 | ax.set_xlabel(r'$x$') 101 | ax.set_ylabel(r'$y$') 102 | 103 | plt.show() 104 | 105 | # Compute the inertia 106 | inertias = [] 107 | 108 | for i in range(2, 51): 109 | km = KMeans(n_clusters=i, max_iter=1000, random_state=1000) 110 | km.fit(cdf) 111 | inertias.append(km.inertia_) 112 | 113 | # Show the plot inertia vs. no. clusters 114 | fig, ax = plt.subplots(figsize=(18, 8)) 115 | 116 | ax.plot(np.arange(2, 51, 1), inertias) 117 | ax.set_xlabel('Number of clusters', fontsize=14) 118 | ax.set_ylabel('Inertia', fontsize=14) 119 | ax.set_xticks(np.arange(2, 51, 2)) 120 | 121 | plt.show() 122 | 123 | nb_clusters = [2, 4, 6, 8] 124 | mapping = [(0, 0), (0, 1), (1, 0), (1, 1)] 125 | 126 | # Show the silhouette plots 127 | fig, ax = plt.subplots(2, 2, figsize=(15, 10)) 128 | 129 | for i, n in enumerate(nb_clusters): 130 | km = KMeans(n_clusters=n, random_state=1000) 131 | Y = km.fit_predict(cdf) 132 | df_km = pd.DataFrame(Y, columns=['prediction'], index=cdf.index) 133 | kmdff = pd.concat([dff, df_km], axis=1) 134 | 135 | silhouette_values = silhouette_samples(cdf, kmdff['prediction']) 136 | 137 | ax[mapping[i]].set_xticks([-0.15, 0.0, 0.25, 0.5, 0.75, 1.0]) 138 | ax[mapping[i]].set_yticks([]) 139 | ax[mapping[i]].set_title('%d clusters' % n) 140 | ax[mapping[i]].set_xlim([-0.15, 1]) 141 | y_lower = 20 142 | 143 | for t in range(n): 144 | ct_values = silhouette_values[Y == t] 145 | ct_values.sort() 146 | 147 | y_upper = y_lower + ct_values.shape[0] 148 | 149 | color = cm.Accent(float(t) / n) 150 | ax[mapping[i]].fill_betweenx(np.arange(y_lower, y_upper), 0, ct_values, facecolor=color, edgecolor=color) 151 | 152 | y_lower = y_upper + 20 153 | 154 | plt.show() 155 | 156 | # Compute the other metrics for K=2 157 | km = KMeans(n_clusters=2, max_iter=1000, random_state=1000) 158 | Y_pred = km.fit_predict(cdf) 159 | df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index) 160 | kmdff = pd.concat([dff, df_km], axis=1) 161 | 162 | print('Completeness: {}'.format(completeness_score(kmdff['diagnosis'], kmdff['prediction']))) 163 | print('Homogeneity: {}'.format(homogeneity_score(kmdff['diagnosis'], kmdff['prediction']))) 164 | print('V-Score: {}'.format(v_measure_score(kmdff['diagnosis'], kmdff['prediction']))) 165 | print('Adj. Mutual info: {}'.format(adjusted_mutual_info_score(kmdff['diagnosis'], kmdff['prediction']))) 166 | print('Adj.
Rand score: {}'.format(adjusted_rand_score(kmdff['diagnosis'], kmdff['prediction']))) 167 | 168 | # Compute and show the contingency matrix 169 | cm = contingency_matrix(kmdff['diagnosis'].apply(lambda x: 0 if x == 'B' else 1), kmdff['prediction']) 170 | 171 | fig, ax = plt.subplots(figsize=(8, 6)) 172 | 173 | with sns.plotting_context("notebook", font_scale=1.5): 174 | sns.heatmap(cm, annot=True, fmt='d', ax=ax) 175 | 176 | plt.show() 177 | 178 | # Perform a K-Means clustering with K=8 179 | km = KMeans(n_clusters=8, max_iter=1000, random_state=1000) 180 | Y_pred = km.fit_predict(cdf) 181 | 182 | df_km = pd.DataFrame(Y_pred, columns=['prediction'], index=cdf.index) 183 | kmdff = pd.concat([dff, df_km], axis=1) 184 | 185 | # Show the result 186 | fig, ax = plt.subplots(figsize=(18, 11)) 187 | 188 | with sns.plotting_context("notebook", font_scale=1.5): 189 | sns.scatterplot(x='x', 190 | y='y', 191 | hue='prediction', 192 | size='area_mean', 193 | style='diagnosis', 194 | sizes=(30, 400), 195 | palette=sns.color_palette("husl", 8), 196 | data=kmdff, 197 | ax=ax) 198 | 199 | ax.set_xlabel(r'$x$') 200 | ax.set_ylabel(r'$y$') 201 | 202 | plt.show() 203 | 204 | # Show the statistical description of two clusters 205 | sdff = dff[(dff.x > -25.0) & (dff.x < 30.0) & (dff.y > -60.0) & (dff.y < -40.0)] 206 | print(sdff[['perimeter_mean', 'area_mean', 'smoothness_mean', 207 | 'concavity_mean', 'symmetry_mean']].describe()) 208 | 209 | sdff = dff[(dff.x > -10.0) & (dff.y > 20.0) & (dff.y < 50.0)] 210 | print(sdff[['perimeter_mean', 'area_mean', 'smoothness_mean', 211 | 'concavity_mean', 'symmetry_mean']].describe()) -------------------------------------------------------------------------------- /Chapter02/distance_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from scipy.spatial.distance import cdist 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Create the distance matrix 15 | distances = np.zeros(shape=(8, 100)) 16 | 17 | for i in range(1, distances.shape[0] + 1): 18 | for j in range(1, distances.shape[1] + 1): 19 | distances[i - 1, j - 1] = np.log(cdist(np.zeros(shape=(1, j)), np.ones(shape=(1, j)), 20 | metric='minkowski', p=i)[0][0]) 21 | 22 | # Show the distances 23 | sns.set() 24 | 25 | fig, ax = plt.subplots(figsize=(16, 9)) 26 | 27 | for i in range(distances.shape[0]): 28 | ax.plot(np.arange(1, distances.shape[1] + 1, 1), distances[i], label='p={}'.format(i)) 29 | 30 | ax.set_xlabel('Dimensionality', fontsize=14) 31 | ax.set_ylabel('Minkowski distances (log-scale)', fontsize=14) 32 | ax.legend() 33 | ax.set_xticks(np.arange(1, distances.shape[1] + 2, 5)) 34 | ax.set_yticks(np.arange(0, 5, 0.5)) 35 | 36 | plt.show() 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /Chapter02/distance_test_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from scipy.spatial.distance import cdist 4 | 5 | 6 | # For reproducibility 7 | np.random.seed(1000) 8 | 9 | 10 | if __name__ == '__main__': 11 | # Create the distance matrix 12 | distances = [] 13 | 14 | for i in range(1, 2500, 10): 15 | d = cdist(np.array([[0, 0]]), np.array([[5, float(i / 500)]]), metric='minkowski', p=15)[0][0] 16 | distances.append(d) 17 | 18 | print('Avg(distances) = {}'.format(np.mean(distances))) 19 | 
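# With p=15 the Minkowski distance is already close to the Chebyshev (max-coordinate) distance,
# so the mean printed above stays near 5.0 (the dominant coordinate difference) and the
# standard deviation reported below is small.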
print('Std(distances) = {}'.format(np.std(distances))) 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Chapter02/inertia_blobs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from sklearn.cluster import KMeans 7 | from sklearn.datasets import make_blobs 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | X, Y = make_blobs(n_samples=2000, n_features=2, centers=12, 16 | cluster_std=0.05, center_box=[-5, 5], random_state=100) 17 | 18 | # Show the blobs 19 | sns.set() 20 | 21 | fig, ax = plt.subplots(figsize=(12, 8)) 22 | 23 | for i in range(12): 24 | ax.scatter(X[Y == i, 0], X[Y == i, 1], label='Blob {}'.format(i + 1)) 25 | 26 | ax.set_xlabel(r'$x_0$') 27 | ax.set_ylabel(r'$x_1$') 28 | ax.legend() 29 | 30 | plt.show() 31 | 32 | # Compute the inertia 33 | inertias = [] 34 | 35 | for i in range(2, 21): 36 | km = KMeans(n_clusters=i, max_iter=1000, random_state=1000) 37 | km.fit(X) 38 | inertias.append(km.inertia_) 39 | 40 | # Show the plot inertia vs. no. clusters 41 | fig, ax = plt.subplots(figsize=(18, 8)) 42 | 43 | ax.plot(np.arange(2, 21, 1), inertias) 44 | 45 | ax.set_xlabel('Number of clusters', fontsize=14) 46 | ax.set_ylabel('Inertia', fontsize=14) 47 | ax.set_xticks(np.arange(2, 21, 1)) 48 | 49 | plt.show() -------------------------------------------------------------------------------- /Chapter02/knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | from sklearn.neighbors import NearestNeighbors 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load the dataset 16 | faces = fetch_olivetti_faces() 17 | X = faces['data'] 18 | 19 | # Create and fit the model 20 | knn = NearestNeighbors(n_neighbors=10, metric='minkowski', p=2, radius=20.0, algorithm='ball_tree') 21 | knn.fit(X) 22 | 23 | # Test face 24 | i = 20 25 | test_face = X[i] + np.random.normal(0.0, 0.1, size=(X[0].shape[0])) 26 | 27 | # Show the test face 28 | sns.set() 29 | 30 | fig, ax = plt.subplots(figsize=(5, 3)) 31 | 32 | ax.imshow(test_face.reshape((64, 64)) * 255.0, cmap='gray') 33 | ax.grid(False) 34 | ax.set_xticks([]) 35 | ax.set_yticks([]) 36 | 37 | plt.show() 38 | 39 | # Compute and show the top 10 neighbors 40 | distances, neighbors = knn.kneighbors(test_face.reshape(1, -1)) 41 | 42 | fig, ax = plt.subplots(1, 10, figsize=(18, 8)) 43 | 44 | for i in range(10): 45 | idx = neighbors[0][i] 46 | ax[i].imshow(X[idx].reshape((64, 64)) * 255.0, cmap='gray') 47 | ax[i].grid(False) 48 | ax[i].set_xticks([]) 49 | ax[i].set_yticks([]) 50 | ax[i].set_xlabel('d={0:.2f}'.format(distances[0][i])) 51 | 52 | plt.show() 53 | 54 | # Compute and show the nearest 20 neighbors within a radius = 100 55 | distances, neighbors = knn.radius_neighbors(test_face.reshape(1, -1), radius=100.0) 56 | sd, sd_arg = np.sort(distances[0]), np.argsort(distances[0]) 57 | 58 | fig, ax = plt.subplots(2, 10, figsize=(18, 4)) 59 | 60 | for i in range(2): 61 | for j in range(10): 62 | idx = neighbors[0][sd_arg[(i * 10) + j]] 63 | ax[i, j].imshow(X[idx].reshape((64, 64)) * 255.0, cmap='gray') 64 | ax[i, j].grid(False) 65 | ax[i, j].set_xticks([]) 66 | ax[i, j].set_yticks([]) 67 | 
ax[i, j].set_xlabel('d={0:.2f}'.format(sd[(i * 10) + j])) 68 | 69 | plt.show() 70 | -------------------------------------------------------------------------------- /Chapter02/vq_gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | from scipy.spatial.distance import cdist 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_samples = 1000 15 | n_vectors = 16 16 | delta = 0.05 17 | n_iterations = 1000 18 | 19 | 20 | if __name__ == '__main__': 21 | # Initialize the dataset and the vectors 22 | data = np.random.normal(0.0, 1.5, size=(nb_samples, 2)) 23 | qv = np.random.normal(0.0, 1.5, size=(n_vectors, 2)) 24 | 25 | # Show the initial configuration 26 | sns.set() 27 | 28 | fig, ax = plt.subplots(figsize=(10, 7)) 29 | 30 | ax.scatter(data[:, 0], data[:, 1], marker='d', s=15, label='Samples') 31 | ax.scatter(qv[:, 0], qv[:, 1], s=100, label='QVs') 32 | ax.set_xlabel(r'$x_0$') 33 | ax.set_ylabel(r'$x_1$') 34 | ax.legend() 35 | 36 | plt.show() 37 | 38 | # Perform the computation 39 | for i in range(n_iterations): 40 | for p in data: 41 | distances = cdist(qv, np.expand_dims(p, axis=0)) 42 | qvi = np.argmin(distances) 43 | alpha = p - qv[qvi] 44 | qv[qvi] += (delta * alpha) 45 | 46 | distances = cdist(data, qv) 47 | Y_qv = np.argmin(distances, axis=1) 48 | 49 | # Show the final configuration 50 | fig, ax = plt.subplots(1, 2, figsize=(20, 7), sharey=True) 51 | 52 | ax[0].scatter(data[:, 0], data[:, 1], marker='d', s=20, label='Samples') 53 | ax[0].scatter(qv[:, 0], qv[:, 1], s=100, label='QVs') 54 | ax[0].set_xlabel(r'$x_0$') 55 | ax[0].set_ylabel(r'$x_1$') 56 | ax[0].legend() 57 | 58 | for i in range(n_vectors): 59 | ax[1].scatter(data[Y_qv == i, 0], data[Y_qv == i, 1], marker='o', s=20, c=cm.tab20(i), 60 | label='VQ {}'.format(i + 1)) 61 | ax[1].set_xlabel(r'$x_0$') 62 | ax[1].legend() 63 | 64 | plt.show() -------------------------------------------------------------------------------- /Chapter02/vq_image.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from scipy.misc import face 7 | from skimage.transform import resize 8 | 9 | from sklearn.cluster import KMeans 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | square_fragment_size = 2 17 | n_qvectors = 24 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load and resize the image 22 | picture = resize(face(gray=False), output_shape=(192, 256), mode='reflect') 23 | 24 | # Show the original image 25 | sns.set() 26 | 27 | fig, ax = plt.subplots(figsize=(8, 6)) 28 | ax.imshow(picture) 29 | ax.grid(False) 30 | 31 | plt.show() 32 | 33 | # Collect all 2x2 fragments 34 | n_fragments = int(picture.shape[0] * picture.shape[1] / (square_fragment_size ** 2)) 35 | 36 | fragments = np.zeros(shape=(n_fragments, square_fragment_size ** 2 * picture.shape[2])) 37 | idx = 0 38 | 39 | for i in range(0, picture.shape[0], square_fragment_size): 40 | for j in range(0, picture.shape[1], square_fragment_size): 41 | fragments[idx] = picture[i:i + square_fragment_size, 42 | j:j + square_fragment_size, :].flatten() 43 | idx += 1 44 | 45 | # Perform the quantization 46 | km = KMeans(n_clusters=n_qvectors, random_state=1000) 47 | km.fit(fragments) 48 | 49 | qvs = km.predict(fragments) 50 | 51 | # Build the quantized image 52 
| qv_picture = np.zeros(shape=(192, 256, 3)) 53 | idx = 0 54 | 55 | for i in range(0, 192, square_fragment_size): 56 | for j in range(0, 256, square_fragment_size): 57 | qv_picture[i:i + square_fragment_size, 58 | j:j + square_fragment_size, :] = \ 59 | km.cluster_centers_[qvs[idx]].\ 60 | reshape((square_fragment_size, square_fragment_size, 3)) 61 | idx += 1 62 | 63 | # Show the final image 64 | fig, ax = plt.subplots(figsize=(8, 6)) 65 | ax.imshow(qv_picture) 66 | ax.grid(False) 67 | 68 | plt.show() 69 | 70 | # Show the histograms 71 | fig, ax = plt.subplots(2, 3, figsize=(18, 8), sharex=True, sharey=True) 72 | 73 | sns.distplot(picture[:, :, 0].flatten() / 255.0, kde=True, bins=32, ax=ax[0, 0], color='r') 74 | sns.distplot(picture[:, :, 1].flatten() / 255.0, kde=True, bins=32, ax=ax[0, 1], color='g') 75 | sns.distplot(picture[:, :, 2].flatten() / 255.0, kde=True, bins=32, ax=ax[0, 2], color='b') 76 | 77 | sns.distplot(qv_picture[:, :, 0].flatten() / 255.0, kde=True, bins=32, ax=ax[1, 0], color='r') 78 | sns.distplot(qv_picture[:, :, 1].flatten() / 255.0, kde=True, bins=32, ax=ax[1, 1], color='g') 79 | sns.distplot(qv_picture[:, :, 2].flatten() / 255.0, kde=True, bins=32, ax=ax[1, 2], color='b') 80 | 81 | ax[0, 1].set_title('Original image', fontsize=14) 82 | ax[1, 1].set_title('Quantized image', fontsize=14) 83 | 84 | ax[1, 0].set_xlabel('Red channel', fontsize=13) 85 | ax[1, 1].set_xlabel('Green channel', fontsize=13) 86 | ax[1, 2].set_xlabel('Blue channel', fontsize=13) 87 | 88 | ax[1, 0].set_xticks(np.arange(0, 256, 25)) 89 | 90 | plt.show() 91 | 92 | # Compute the entropy of the red channels 93 | hist_original, _ = np.histogram(picture[:, :, 0].flatten() * 255.0, bins=256) 94 | hist_q, _ = np.histogram(qv_picture[:, :, 0].flatten() * 255.0, bins=256) 95 | 96 | p_original = hist_original / np.sum(hist_original) 97 | H_original = -np.sum(p_original * np.log2(p_original + 1e-8)) 98 | 99 | p_q = hist_q / np.sum(hist_q) 100 | H_q = -np.sum(p_q * np.log2(p_q + 1e-8)) 101 | 102 | print('Original entropy: {0:.3f} bits - Quantized entropy: {1:.3f} bits'.format(H_original, H_q)) 103 | 104 | -------------------------------------------------------------------------------- /Chapter03/absenteeism_at_work_analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | import pandas as pd 5 | import seaborn as sns 6 | 7 | from sklearn.cluster import DBSCAN 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.manifold import TSNE 10 | from sklearn.metrics import silhouette_score, calinski_harabaz_score 11 | from sklearn.metrics.pairwise import pairwise_distances 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | # Download the dataset from https://archive.ics.uci.edu/ml/machine-learning-databases/00445/ 19 | # unzip and set the path to .csv file 20 | data_path = '/Absenteeism_at_work.csv' 21 | 22 | 23 | n_perturbed = 20 24 | n_data = [] 25 | 26 | 27 | if __name__ == '__main__': 28 | # Load the dataset 29 | df = pd.read_csv(data_path, sep=';', header=0, index_col=0).fillna(0.0) 30 | print(df.count()) 31 | 32 | # Preprocess the dataset 33 | cdf = pd.get_dummies(df, columns=['Reason for absence', 'Month of absence', 'Day of the week', 34 | 'Seasons', 'Disciplinary failure', 'Education', 35 | 'Social drinker', 'Social smoker']) 36 | 37 | cdf = cdf.drop(labels=['Reason for absence', 'Month of absence', 'Day of the week', 38 | 'Seasons', 
'Disciplinary failure', 'Education', 'Social drinker', 39 | 'Social smoker']).astype(np.float64) 40 | 41 | # Standardize the dataset 42 | ss = StandardScaler(with_std=False) 43 | sdf = ss.fit_transform(cdf) 44 | 45 | # Perform the TSNE non-linear dimensionality reduction 46 | tsne = TSNE(n_components=2, perplexity=15, random_state=1000) 47 | data_tsne = tsne.fit_transform(sdf) 48 | 49 | df_tsne = pd.DataFrame(data_tsne, columns=['x', 'y'], index=cdf.index) 50 | dff = pd.concat([cdf, df_tsne], axis=1) 51 | 52 | # Show the dataset 53 | sns.set() 54 | 55 | fig, ax = plt.subplots(figsize=(18, 11)) 56 | 57 | with sns.plotting_context("notebook", font_scale=1.5): 58 | sns.scatterplot(x='x', 59 | y='y', 60 | size='Age', 61 | sizes=(30, 400), 62 | palette=sns.color_palette("husl", 2), 63 | data=dff, 64 | ax=ax) 65 | 66 | ax.set_xlabel(r'$x$', fontsize=14) 67 | ax.set_ylabel(r'$y$', fontsize=14) 68 | 69 | plt.show() 70 | 71 | # Perform the preliminary analysis 72 | n_clusters = [] 73 | n_noise_points = [] 74 | silhouette_scores = [] 75 | calinski_harabaz_scores = [] 76 | 77 | for p in [2, 4, 8, 12]: 78 | n_clusters_p = [] 79 | n_noise_points_p = [] 80 | silhouette_scores_p = [] 81 | calinski_harabaz_scores_p = [] 82 | 83 | for eps in np.arange(15, 30, 0.5): 84 | dst = DBSCAN(eps=eps, min_samples=3, metric='minkowski', p=p) 85 | Y_pred_t = dst.fit_predict(sdf) 86 | 87 | n_clusters_p.append(np.max(Y_pred_t) + 1) 88 | n_noise_points_p.append(np.sum(Y_pred_t == -1)) 89 | silhouette_scores_p.append(silhouette_score(dff, Y_pred_t, metric='minkowski', p=p)) 90 | calinski_harabaz_scores_p.append(calinski_harabaz_score(dff, Y_pred_t)) 91 | 92 | n_clusters.append(n_clusters_p) 93 | n_noise_points.append(n_noise_points_p) 94 | silhouette_scores.append(silhouette_scores_p) 95 | calinski_harabaz_scores.append(calinski_harabaz_scores_p) 96 | 97 | # Show the results of the preliminary analysis 98 | fig, ax = plt.subplots(4, 4, figsize=(18, 12), sharex=True) 99 | 100 | for idx, p in enumerate([2, 4, 8, 12]): 101 | x = np.arange(15, 30, 0.5) 102 | 103 | ax[idx, 0].plot(x, n_clusters[idx], label='p={}'.format(p)) 104 | ax[idx, 1].plot(x, n_noise_points[idx], label='p={}'.format(p)) 105 | ax[idx, 2].plot(x, silhouette_scores[idx], label='p={}'.format(p)) 106 | ax[idx, 3].plot(x, calinski_harabaz_scores[idx], label='p={}'.format(p)) 107 | 108 | ax[0, 0].set_title('Number of clusters', fontsize=14) 109 | ax[0, 1].set_title('Number of noise points', fontsize=14) 110 | ax[0, 2].set_title('Silhouette score', fontsize=14) 111 | ax[0, 3].set_title('Calinski-Harabasz score', fontsize=14) 112 | 113 | for i in range(4): 114 | ax[i, 0].set_yticks(np.arange(5, 60, 7)) 115 | ax[i, 0].legend() 116 | ax[3, i].set_xlabel(r'$\epsilon$') 117 | 118 | plt.show() 119 | 120 | # Perform the clustering 121 | ds = DBSCAN(eps=25, min_samples=3, metric='minkowski', p=12) 122 | Y_pred = ds.fit_predict(sdf) 123 | 124 | print('Number of clusters: {}'.format(np.max(Y_pred) + 1)) 125 | print('Number of noise points: {}'.format(np.sum(Y_pred == -1))) 126 | 127 | print('Silhouette score: {:.3f}'.format(silhouette_score(dff, Y_pred, metric='minkowski', p=12))) 128 | print('Calinski-Harabaz score: {:.3f}'.format(calinski_harabaz_score(dff, Y_pred))) 129 | 130 | # Show the clustering results 131 | fig, ax = plt.subplots(figsize=(18, 11)) 132 | 133 | for i in range(np.max(np.unique(Y_pred)) + 1): 134 | ax.scatter(data_tsne[Y_pred == i, 0], data_tsne[Y_pred == i, 1], s=100, c=cm.Paired(i), 135 | label='C{}'.format(i + 1)) 136 | 137 | 
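# The loop above only draws the clusters labelled 0..max(Y_pred); DBSCAN marks noise with -1,
# so those samples are added separately below with 'x' markers.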
ax.scatter(data_tsne[Y_pred == -1, 0], data_tsne[Y_pred == -1, 1], marker='x', c='black', s=150, label='Noisy') 138 | 139 | ax.set_xlabel(r'$x$') 140 | ax.set_ylabel(r'$y$') 141 | ax.legend() 142 | 143 | plt.show() 144 | 145 | # Describe the region x < -45 146 | sdff = dff[(dff.x < -45.0)] 147 | print(sdff[sdff.columns[0:10]].describe()) 148 | 149 | # Describe the region x > 20 and -20 < y < 20 150 | sdff = dff[(dff.x > 20.0) & (dff.y > -20.0) & (dff.y < 20.0)] 151 | print(sdff[sdff.columns[0:10]].describe()) 152 | 153 | # Perform the instability analysis 154 | data = sdf.copy() 155 | 156 | data_mean = np.mean(data, axis=0) 157 | data_cov = np.cov(data.T) / 4.0 158 | 159 | for i in range(n_perturbed): 160 | gaussian_noise = np.random.multivariate_normal(data_mean, data_cov, size=(data.shape[0],)) 161 | noise = gaussian_noise * np.random.uniform(0.0, 1.0, size=(data.shape[0], data.shape[1])) 162 | n_data.append(data.copy() + noise) 163 | 164 | instabilities = [] 165 | 166 | for eps in np.arange(5.0, 31.0, 1.5): 167 | Yn = [] 168 | 169 | for nd in n_data: 170 | ds = DBSCAN(eps=eps, min_samples=3, metric='minkowski', p=12) 171 | Yn.append(ds.fit_predict(nd)) 172 | 173 | distances = [] 174 | 175 | for i in range(len(Yn) - 1): 176 | for j in range(i, len(Yn)): 177 | d = pairwise_distances(Yn[i].reshape(-1, 1), Yn[j].reshape(-1, 1), 'hamming') 178 | distances.append(d[0, 0]) 179 | 180 | instability = (2.0 * np.sum(distances)) / float(n_perturbed ** 2) 181 | instabilities.append(instability) 182 | 183 | # Show the results 184 | fig, ax = plt.subplots(figsize=(18, 8)) 185 | 186 | ax.plot(np.arange(5.0, 31.0, 1.5), instabilities) 187 | ax.set_xlabel(r'$\epsilon$', fontsize=14) 188 | ax.set_ylabel('Instability', fontsize=14) 189 | 190 | plt.show() 191 | 192 | -------------------------------------------------------------------------------- /Chapter03/k_medoids.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from scipy.spatial.distance import pdist, cdist, squareform 6 | 7 | from sklearn.cluster import KMeans 8 | from sklearn.datasets import make_blobs 9 | from sklearn.metrics import adjusted_rand_score 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | nb_samples = 1000 17 | nb_clusters = 8 18 | 19 | metric = 'minkowski' 20 | p = 7 21 | tolerance = 0.001 22 | 23 | 24 | if __name__ == '__main__': 25 | # Create the dataset 26 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=nb_clusters, 27 | cluster_std=1.2, center_box=[-5.0, 5.0], random_state=1000) 28 | 29 | # Perform K-Means clustering 30 | km = KMeans(n_clusters=nb_clusters, random_state=1000) 31 | C_km = km.fit_predict(X) 32 | 33 | print('Adjusted Rand score K-Means: {}'.format(adjusted_rand_score(Y, C_km))) 34 | 35 | # Perform K-Medoids clustering 36 | C = np.random.randint(0, nb_clusters, size=(X.shape[0],), dtype=np.int32) 37 | mu_idxs = np.zeros(shape=(nb_clusters, X.shape[1])) 38 | 39 | mu_copy = np.ones_like(mu_idxs) 40 | 41 | while np.linalg.norm(mu_idxs - mu_copy) > tolerance: 42 | for i in range(nb_clusters): 43 | Di = squareform(pdist(X[C == i], metric=metric, p=p)) 44 | SDi = np.sum(Di, axis=1) 45 | 46 | mu_copy[i] = mu_idxs[i].copy() 47 | idx = np.argmin(SDi) 48 | mu_idxs[i] = X[C == i][idx].copy() 49 | 50 | C = np.argmin(cdist(X, mu_idxs, metric=metric, p=p), axis=1) 51 | 52 | print('Adjusted Rand score K-Medoids: {}'.format(adjusted_rand_score(Y, C))) 53 | 54 | # 
Show the final results 55 | sns.set() 56 | 57 | fig, ax = plt.subplots(1, 3, figsize=(26, 8), sharey=True) 58 | 59 | for i in range(nb_clusters): 60 | ax[0].scatter(X[Y == i, 0], X[Y == i, 1], label='Blob {}'.format(i + 1)) 61 | ax[1].scatter(X[C_km == i, 0], X[C_km == i, 1], label='Cluster {}'.format(i + 1)) 62 | ax[2].scatter(X[C == i, 0], X[C == i, 1], label='Cluster {}'.format(i + 1)) 63 | 64 | ax[0].set_xlabel(r'$x_0$', fontsize=22) 65 | ax[1].set_xlabel(r'$x_0$', fontsize=22) 66 | ax[2].set_xlabel(r'$x_0$', fontsize=22) 67 | ax[0].set_ylabel(r'$x_1$', fontsize=22) 68 | 69 | ax[0].set_title('Ground truth', fontsize=18) 70 | ax[1].set_title('K-Means', fontsize=18) 71 | ax[2].set_title('K-Medoids with Minkowski metric and p=7', fontsize=18) 72 | 73 | ax[0].legend() 74 | ax[1].legend() 75 | ax[2].legend() 76 | 77 | plt.show() 78 | 79 | -------------------------------------------------------------------------------- /Chapter03/mean_shift.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from sklearn.cluster import MeanShift 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_samples = 500 14 | mss = [] 15 | Y_preds = [] 16 | bandwidths = [0.9, 1.0, 1.2, 1.5] 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | data_1 = np.random.multivariate_normal([-2.0, 0.0], np.diag([1.0, 0.5]), size=(nb_samples,)) 22 | data_2 = np.random.multivariate_normal([0.0, 2.0], np.diag([1.5, 1.5]), size=(nb_samples,)) 23 | data_3 = np.random.multivariate_normal([2.0, 0.0], np.diag([0.5, 1.0]), size=(nb_samples,)) 24 | 25 | data = np.concatenate([data_1, data_2, data_3], axis=0) 26 | 27 | # Show the original dataset 28 | sns.set() 29 | 30 | fig, ax = plt.subplots(figsize=(12, 8)) 31 | 32 | ax.scatter(data[:, 0], data[:, 1]) 33 | ax.set_xlabel(r'$x_0$', fontsize=14) 34 | ax.set_ylabel(r'$x_1$', fontsize=14) 35 | 36 | plt.show() 37 | 38 | # Perform the clustering with different bandwidths 39 | for b in bandwidths: 40 | ms = MeanShift(bandwidth=b) 41 | Y_preds.append(ms.fit_predict(data)) 42 | mss.append(ms) 43 | 44 | # Show the results 45 | fig, ax = plt.subplots(1, 4, figsize=(20, 6), sharey=True) 46 | 47 | for j, b in enumerate(bandwidths): 48 | for i in range(mss[j].cluster_centers_.shape[0]): 49 | ax[j].scatter(data[Y_preds[j] == i, 0], data[Y_preds[j] == i, 1], marker='o', s=15, 50 | label='Cluster {}'.format(i + 1)) 51 | 52 | ax[j].set_xlabel(r'$x_0$', fontsize=14) 53 | ax[j].set_title('Bandwidth: {}'.format(b), fontsize=14) 54 | ax[j].legend() 55 | 56 | ax[0].set_ylabel(r'$x_1$', fontsize=14) 57 | 58 | plt.show() -------------------------------------------------------------------------------- /Chapter03/online_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.cluster import KMeans, MiniBatchKMeans, Birch 6 | from sklearn.datasets import make_blobs 7 | from sklearn.metrics import adjusted_rand_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_clusters = 8 15 | nb_samples = 2000 16 | batch_size = 50 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=nb_clusters, 22 | cluster_std=0.25, center_box=[-1.5, 1.5], shuffle=True, random_state=100) 23 | 24 | # Show the dataset 
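# make_blobs was called with shuffle=True, so the consecutive slices X[i:i + batch_size]
# processed later in this script behave like an approximately i.i.d. stream for the
# partial_fit updates of MiniBatchKMeans and Birch.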
25 | sns.set() 26 | 27 | fig, ax = plt.subplots(figsize=(12, 8)) 28 | 29 | for i in range(nb_clusters): 30 | ax.scatter(X[Y == i, 0], X[Y == i, 1], label='Blob {}'.format(i + 1)) 31 | 32 | ax.set_xlabel(r'$x_0$', fontsize=14) 33 | ax.set_ylabel(r'$x_1$', fontsize=14) 34 | ax.legend() 35 | 36 | plt.show() 37 | 38 | # Perform a K-Means clustering 39 | km = KMeans(n_clusters=nb_clusters, random_state=1000) 40 | Y_pred_km = km.fit_predict(X) 41 | 42 | print('Adjusted Rand score: {}'.format(adjusted_rand_score(Y, Y_pred_km))) 43 | 44 | # Perform the online clustering 45 | mbkm = MiniBatchKMeans(n_clusters=nb_clusters, batch_size=batch_size, reassignment_ratio=0.001, random_state=1000) 46 | birch = Birch(n_clusters=nb_clusters, threshold=0.2, branching_factor=350) 47 | 48 | scores_mbkm = [] 49 | scores_birch = [] 50 | 51 | for i in range(0, nb_samples, batch_size): 52 | X_batch, Y_batch = X[i:i + batch_size], Y[i:i + batch_size] 53 | 54 | mbkm.partial_fit(X_batch) 55 | birch.partial_fit(X_batch) 56 | 57 | scores_mbkm.append(adjusted_rand_score(Y[:i + batch_size], mbkm.predict(X[:i + batch_size]))) 58 | scores_birch.append(adjusted_rand_score(Y[:i + batch_size], birch.predict(X[:i + batch_size]))) 59 | 60 | Y_pred_mbkm = mbkm.predict(X) 61 | Y_pred_birch = birch.predict(X) 62 | 63 | print('Adjusted Rand score Mini-Batch K-Means: {}'.format(adjusted_rand_score(Y, Y_pred_mbkm))) 64 | print('Adjusted Rand score BIRCH: {}'.format(adjusted_rand_score(Y, Y_pred_birch))) 65 | 66 | # Show the incremental Adjusted Rand scores 67 | fig, ax = plt.subplots(figsize=(15, 8)) 68 | 69 | ax.plot(range(0, nb_samples, batch_size), scores_mbkm, label='Mini-Batch K-Means') 70 | ax.plot(range(0, nb_samples, batch_size), scores_birch, label='Birch') 71 | 72 | ax.set_xlabel('Number of samples', fontsize=14) 73 | ax.set_ylabel('Incremental Adjusted Rand score', fontsize=14) 74 | ax.legend() 75 | 76 | plt.show() 77 | 78 | -------------------------------------------------------------------------------- /Chapter03/spectral_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | from sklearn.cluster import SpectralClustering, KMeans 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_samples = 2000 14 | 15 | 16 | if __name__ == '__main__': 17 | # Create the dataset 18 | X0 = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, nb_samples), axis=1) 19 | Y0 = -2.0 - np.cos(2.0 * X0) + np.random.uniform(0.0, 2.0, size=(nb_samples, 1)) 20 | 21 | X1 = np.expand_dims(np.linspace(-2 * np.pi, 2 * np.pi, nb_samples), axis=1) 22 | Y1 = 2.0 - np.cos(2.0 * X0) + np.random.uniform(0.0, 2.0, size=(nb_samples, 1)) 23 | 24 | data_0 = np.concatenate([X0, Y0], axis=1) 25 | data_1 = np.concatenate([X1, Y1], axis=1) 26 | data = np.concatenate([data_0, data_1], axis=0) 27 | 28 | # Show the dataset 29 | sns.set() 30 | 31 | fig, ax = plt.subplots(figsize=(12, 8)) 32 | 33 | ax.scatter(data[:, 0], data[:, 1]) 34 | ax.set_xlabel(r'$x_0$', fontsize=14) 35 | ax.set_ylabel(r'$x_1$', fontsize=14) 36 | 37 | plt.show() 38 | 39 | # Perform the clustering 40 | km = KMeans(n_clusters=2, random_state=1000) 41 | sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=2.0, random_state=1000) 42 | 43 | Y_pred_km = km.fit_predict(data) 44 | Y_pred_sc = sc.fit_predict(data) 45 | 46 | # Show the results 47 | fig, ax = plt.subplots(1, 3, figsize=(20, 6), sharey=True) 48 | 49 | ax[0].scatter(data[:, 0], 
data[:, 1], c='b', s=5) 50 | 51 | ax[1].scatter(data[Y_pred_sc == 0, 0], data[Y_pred_sc == 0, 1], marker='o', s=5, c='b', label='Cluster 1') 52 | ax[1].scatter(data[Y_pred_sc == 1, 0], data[Y_pred_sc == 1, 1], marker='d', s=5, c='gray', label='Cluster 2') 53 | 54 | ax[2].scatter(data[Y_pred_km == 0, 0], data[Y_pred_km == 0, 1], marker='o', c='b', s=5, label='Cluster 1') 55 | ax[2].scatter(data[Y_pred_km == 1, 0], data[Y_pred_km == 1, 1], marker='d', c='gray', s=5, label='Cluster 2') 56 | 57 | ax[0].set_title('Dataset', fontsize=14) 58 | ax[0].set_xlabel(r'$x_0$', fontsize=14) 59 | ax[0].set_ylabel(r'$x_1$', fontsize=14) 60 | 61 | ax[1].set_title('Spectral Clustering', fontsize=14) 62 | ax[1].set_xlabel(r'$x_0$', fontsize=14) 63 | ax[1].legend() 64 | 65 | ax[2].set_title('K-Means', fontsize=14) 66 | ax[2].set_xlabel(r'$x_0$', fontsize=14) 67 | ax[2].legend() 68 | 69 | plt.show() -------------------------------------------------------------------------------- /Chapter04/connectivity_constraints.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.cm as cm 4 | import seaborn as sns 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import AgglomerativeClustering 8 | from sklearn.neighbors import kneighbors_graph 9 | 10 | from scipy.spatial.distance import pdist 11 | from scipy.cluster.hierarchy import linkage, dendrogram 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | nb_samples = 50 19 | nb_centers = 8 20 | 21 | 22 | if __name__ == '__main__': 23 | # Create the dataset 24 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, center_box=[-1, 1], centers=nb_centers, random_state=1000) 25 | 26 | # Show the dataset 27 | sns.set() 28 | 29 | fig, ax = plt.subplots(figsize=(12, 8)) 30 | 31 | for i, x in enumerate(X): 32 | ax.scatter(x[0], x[1], s=120) 33 | ax.annotate('%d' % i, xy=(x[0] + 0.05, x[1] + 0.05), fontsize=12) 34 | 35 | ax.set_xlabel(r'$x_0$', fontsize=14) 36 | ax.set_ylabel(r'$x_1$', fontsize=14) 37 | 38 | plt.show() 39 | 40 | # Show the dendrogram with average linkage 41 | dm = pdist(X, metric='euclidean') 42 | Z = linkage(dm, method='average') 43 | 44 | fig, ax = plt.subplots(figsize=(20, 10)) 45 | 46 | d = dendrogram(Z, orientation='right', truncate_mode='lastp', p=20, ax=ax) 47 | 48 | ax.set_xlabel('Dissimilarity', fontsize=18) 49 | ax.set_ylabel('Samples', fontsize=18) 50 | 51 | plt.show() 52 | 53 | # Perform the standard clustering 54 | ag = AgglomerativeClustering(n_clusters=8, affinity='euclidean', linkage='average') 55 | Y_pred = ag.fit_predict(X) 56 | 57 | # Show the results 58 | fig, ax = plt.subplots(figsize=(12, 8)) 59 | 60 | clusters = set() 61 | 62 | for i, x in enumerate(X): 63 | y = Y_pred[i] 64 | if y in clusters: 65 | label = False 66 | else: 67 | clusters.add(y) 68 | label = True 69 | 70 | ax.scatter(x[0], x[1], s=120, c=cm.Set1(y), label='Cluster {}'.format(y + 1) if label else None) 71 | ax.annotate('%d' % i, xy=(x[0] + 0.05, x[1] + 0.05), fontsize=12) 72 | 73 | ax.set_xlabel(r'$x_0$', fontsize=14) 74 | ax.set_ylabel(r'$x_1$', fontsize=14) 75 | ax.legend() 76 | 77 | plt.show() 78 | 79 | # Build the connectivity matrix 80 | cma = kneighbors_graph(X, n_neighbors=2) 81 | 82 | # Perform the clustering with connectivity constraints 83 | ag = AgglomerativeClustering(n_clusters=8, affinity='euclidean', linkage='average', connectivity=cma) 84 | Y_pred = ag.fit_predict(X) 85 | 86 | # Show the new results 87 | 
fig, ax = plt.subplots(figsize=(12, 8)) 88 | 89 | clusters = set() 90 | 91 | for i, x in enumerate(X): 92 | y = Y_pred[i] 93 | if y in clusters: 94 | label = False 95 | else: 96 | clusters.add(y) 97 | label = True 98 | 99 | ax.scatter(x[0], x[1], s=120, c=cm.Set1(y), label='Cluster {}'.format(y + 1) if label else None) 100 | ax.annotate('%d' % i, xy=(x[0] + 0.05, x[1] + 0.05), fontsize=12) 101 | 102 | ax.set_xlabel(r'$x_0$', fontsize=14) 103 | ax.set_ylabel(r'$x_1$', fontsize=14) 104 | ax.legend() 105 | 106 | plt.show() 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /Chapter04/dendrogram.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import make_blobs 6 | 7 | from scipy.spatial.distance import pdist 8 | from scipy.cluster.hierarchy import linkage, dendrogram, cophenet 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 12 16 | nb_centers = 4 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, center_box=[-1, 1], centers=nb_centers, random_state=1000) 22 | 23 | # Show the dataset 24 | sns.set() 25 | 26 | fig, ax = plt.subplots(figsize=(12, 8)) 27 | 28 | for i, x in enumerate(X): 29 | ax.scatter(x[0], x[1], s=120) 30 | ax.annotate('%d' % i, xy=(x[0] + 0.05, x[1] + 0.05), fontsize=12) 31 | 32 | ax.set_xlabel(r'$x_0$', fontsize=14) 33 | ax.set_ylabel(r'$x_1$', fontsize=14) 34 | 35 | plt.show() 36 | 37 | # Compute the distance matrix 38 | dm = pdist(X, metric='euclidean') 39 | 40 | # Show the dendrogram with Ward's linkage 41 | Z = linkage(dm, method='ward') 42 | 43 | fig, ax = plt.subplots(figsize=(12, 8)) 44 | 45 | d = dendrogram(Z, show_leaf_counts=True, leaf_font_size=14, ax=ax) 46 | 47 | ax.set_xlabel('Samples', fontsize=14) 48 | ax.set_yticks(np.arange(0, 6.0, 0.25)) 49 | 50 | plt.show() 51 | 52 | # Show the dendrogram with single linkage 53 | Z = linkage(dm, method='single') 54 | 55 | fig, ax = plt.subplots(figsize=(12, 8)) 56 | 57 | d = dendrogram(Z, show_leaf_counts=True, leaf_font_size=14, ax=ax) 58 | 59 | ax.set_xlabel('Samples', fontsize=14) 60 | ax.set_yticks(np.arange(0, 2.0, 0.25)) 61 | 62 | plt.show() 63 | 64 | # Print the cophenetic correlations 65 | cpc, cp = cophenet(linkage(dm, method='ward'), dm) 66 | print('CPC Ward\'s linkage: {:.3f}'.format(cpc)) 67 | 68 | cpc, cp = cophenet(linkage(dm, method='single'), dm) 69 | print('CPC Single linkage: {:.3f}'.format(cpc)) 70 | 71 | cpc, cp = cophenet(linkage(dm, method='complete'), dm) 72 | print('CPC Complete linkage: {:.3f}'.format(cpc)) 73 | 74 | cpc, cp = cophenet(linkage(dm, method='average'), dm) 75 | print('CPC Average linkage: {:.3f}'.format(cpc)) 76 | 77 | -------------------------------------------------------------------------------- /Chapter04/water_treatment_plant_analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import seaborn as sns 5 | 6 | from sklearn.preprocessing import StandardScaler 7 | from sklearn.manifold import TSNE 8 | from sklearn.cluster import AgglomerativeClustering 9 | from sklearn.metrics import silhouette_score 10 | 11 | from scipy.spatial.distance import pdist 12 | from scipy.cluster.hierarchy import linkage, dendrogram, cophenet 13 | 
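# Note: this script and the other Chapter04 examples target an older scikit-learn release;
# in recent versions the AgglomerativeClustering parameter 'affinity' used below has been
# renamed to 'metric', so the calls may need that substitution depending on the installed version.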
14 | 15 | # For reproducibility 16 | np.random.seed(1000) 17 | 18 | 19 | # Download the dataset from https://archive.ics.uci.edu/ml/machine-learning-databases/water-treatment/ 20 | # and set the path to .data file 21 | data_path = '/water-treatment.data' 22 | 23 | 24 | nb_clusters = [4, 6, 8, 10] 25 | linkages = ['single', 'complete', 'ward', 'average'] 26 | 27 | 28 | if __name__ == '__main__': 29 | # Read the dataset 30 | df = pd.read_csv(data_path, header=None, index_col=0, na_values='?').astype(np.float64) 31 | df.fillna(df.mean(), inplace=True) 32 | 33 | # Standardize the dataset 34 | ss = StandardScaler(with_std=False) 35 | sdf = ss.fit_transform(df) 36 | 37 | # Perform the TSNE non-linear dimensionality reduction 38 | tsne = TSNE(n_components=2, perplexity=10, random_state=1000) 39 | data_tsne = tsne.fit_transform(sdf) 40 | 41 | df_tsne = pd.DataFrame(data_tsne, columns=['x', 'y'], index=df.index) 42 | dff = pd.concat([df, df_tsne], axis=1) 43 | 44 | # Show the dataset 45 | sns.set() 46 | 47 | fig, ax = plt.subplots(figsize=(18, 11)) 48 | 49 | with sns.plotting_context("notebook", font_scale=1.5): 50 | sns.scatterplot(x='x', 51 | y='y', 52 | size=0, 53 | sizes=(120, 120), 54 | data=dff, 55 | legend=False, 56 | ax=ax) 57 | 58 | ax.set_xlabel(r'$x$') 59 | ax.set_ylabel(r'$y$') 60 | 61 | plt.show() 62 | 63 | # Analyze the result of different linkages and number of clusters 64 | cpcs = np.zeros(shape=(len(linkages), len(nb_clusters))) 65 | silhouette_scores = np.zeros(shape=(len(linkages), len(nb_clusters))) 66 | 67 | for i, l in enumerate(linkages): 68 | for j, nbc in enumerate(nb_clusters): 69 | dm = pdist(sdf, metric='minkowski', p=2) 70 | Z = linkage(dm, method=l) 71 | cpc, _ = cophenet(Z, dm) 72 | cpcs[i, j] = cpc 73 | 74 | ag = AgglomerativeClustering(n_clusters=nbc, affinity='euclidean', linkage=l) 75 | Y_pred = ag.fit_predict(sdf) 76 | sls = silhouette_score(sdf, Y_pred, random_state=1000) 77 | silhouette_scores[i, j] = sls 78 | 79 | fig, ax = plt.subplots(len(nb_clusters), 2, figsize=(20, 20), sharex=True) 80 | 81 | for i in range(len(nb_clusters)): 82 | ax[i, 0].plot(cpcs[:, i]) 83 | ax[i, 0].set_ylabel('Cophenetic correlation', fontsize=14) 84 | ax[i, 0].set_title('Number of clusters: {}'.format(nb_clusters[i]), fontsize=14) 85 | 86 | ax[i, 1].plot(silhouette_scores[:, i]) 87 | ax[i, 1].set_ylabel('Silhouette score', fontsize=14) 88 | ax[i, 1].set_title('Number of clusters: {}'.format(nb_clusters[i]), fontsize=14) 89 | 90 | plt.xticks(np.arange(len(linkages)), linkages) 91 | 92 | plt.show() 93 | 94 | # Show the truncated dendrogram for a complete linkage 95 | dm = pdist(sdf, metric='euclidean') 96 | Z = linkage(dm, method='complete') 97 | 98 | fig, ax = plt.subplots(figsize=(25, 20)) 99 | 100 | d = dendrogram(Z, orientation='right', truncate_mode='lastp', p=80, no_labels=True, ax=ax) 101 | 102 | ax.set_xlabel('Dissimilarity', fontsize=18) 103 | ax.set_ylabel('Samples (80 leaves)', fontsize=18) 104 | 105 | plt.show() 106 | 107 | # Perform the clustering (both with 8 and 2 clusters) 108 | for n in (8, 2): 109 | ag = AgglomerativeClustering(n_clusters=n, affinity='euclidean', linkage='complete') 110 | Y_pred = ag.fit_predict(sdf) 111 | 112 | df_pred = pd.Series(Y_pred, name='Cluster', index=df.index) 113 | pdff = pd.concat([dff, df_pred], axis=1) 114 | 115 | # Show the results of the clustering 116 | fig, ax = plt.subplots(figsize=(18, 11)) 117 | 118 | with sns.plotting_context("notebook", font_scale=1.5): 119 | sns.scatterplot(x='x', 120 | y='y', 121 | hue='Cluster', 122 | 
size='Cluster', 123 | sizes=(120, 120), 124 | palette=sns.color_palette("husl", n), 125 | data=pdff, 126 | ax=ax) 127 | 128 | ax.set_xlabel(r'$x$') 129 | ax.set_ylabel(r'$y$') 130 | 131 | plt.show() 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /Chapter05/bayesian_gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import make_blobs 6 | from sklearn.mixture import BayesianGaussianMixture 7 | 8 | from matplotlib.patches import Ellipse 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_samples = 500 15 | nb_centers = 5 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create the dataset 20 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, center_box=[-5, 5], 21 | centers=nb_centers, random_state=1000) 22 | 23 | # Train the model with concentration 1000 and 0.1 24 | for c in (1000.0, 0.1): 25 | gm = BayesianGaussianMixture(n_components=5, weight_concentration_prior=c, 26 | max_iter=10000, random_state=1000) 27 | gm.fit(X) 28 | 29 | print('Weights: {}'.format(gm.weights_)) 30 | 31 | Y_pred = gm.fit_predict(X) 32 | 33 | print((Y_pred == 0).sum()) 34 | print((Y_pred == 1).sum()) 35 | print((Y_pred == 2).sum()) 36 | print((Y_pred == 3).sum()) 37 | print((Y_pred == 4).sum()) 38 | 39 | # Compute the parameters of the Gaussian mixture 40 | m1 = gm.means_[0] 41 | m2 = gm.means_[1] 42 | m3 = gm.means_[2] 43 | m4 = gm.means_[3] 44 | m5 = gm.means_[4] 45 | 46 | c1 = gm.covariances_[0] 47 | c2 = gm.covariances_[1] 48 | c3 = gm.covariances_[2] 49 | c4 = gm.covariances_[3] 50 | c5 = gm.covariances_[4] 51 | 52 | we1 = 1 + gm.weights_[0] 53 | we2 = 1 + gm.weights_[1] 54 | we3 = 1 + gm.weights_[2] 55 | we4 = 1 + gm.weights_[3] 56 | we5 = 1 + gm.weights_[4] 57 | 58 | w1, v1 = np.linalg.eigh(c1) 59 | w2, v2 = np.linalg.eigh(c2) 60 | w3, v3 = np.linalg.eigh(c3) 61 | w4, v4 = np.linalg.eigh(c4) 62 | w5, v5 = np.linalg.eigh(c5) 63 | 64 | nv1 = v1 / np.linalg.norm(v1) 65 | nv2 = v2 / np.linalg.norm(v2) 66 | nv3 = v3 / np.linalg.norm(v3) 67 | nv4 = v4 / np.linalg.norm(v4) 68 | nv5 = v5 / np.linalg.norm(v5) 69 | 70 | a1 = np.arccos(np.dot(nv1[:, 1], [1.0, 0.0]) / np.linalg.norm(nv1[:, 1])) * 180.0 / np.pi 71 | a2 = np.arccos(np.dot(nv2[:, 1], [1.0, 0.0]) / np.linalg.norm(nv2[:, 1])) * 180.0 / np.pi 72 | a3 = np.arccos(np.dot(nv3[:, 1], [1.0, 0.0]) / np.linalg.norm(nv3[:, 1])) * 180.0 / np.pi 73 | a4 = np.arccos(np.dot(nv4[:, 1], [1.0, 0.0]) / np.linalg.norm(nv4[:, 1])) * 180.0 / np.pi 74 | a5 = np.arccos(np.dot(nv5[:, 1], [1.0, 0.0]) / np.linalg.norm(nv5[:, 1])) * 180.0 / np.pi 75 | 76 | # Show the results 77 | sns.set() 78 | 79 | fig, ax = plt.subplots(figsize=(22, 12)) 80 | 81 | ax.scatter(X[Y_pred == 0, 0], X[Y_pred == 0, 1], s=80, marker='x', label='Gaussian 1') 82 | ax.scatter(X[Y_pred == 1, 0], X[Y_pred == 1, 1], s=80, marker='o', label='Gaussian 2') 83 | ax.scatter(X[Y_pred == 2, 0], X[Y_pred == 2, 1], s=80, marker='d', label='Gaussian 3') 84 | ax.scatter(X[Y_pred == 3, 0], X[Y_pred == 3, 1], s=80, marker='s', label='Gaussian 4') 85 | if c == 1000: 86 | ax.scatter(X[Y_pred == 4, 0], X[Y_pred == 4, 1], s=80, marker='^', label='Gaussian 5') 87 | 88 | g1 = Ellipse(xy=m1, width=w1[1] * 3, height=w1[0] * 3, fill=False, linestyle='dashed', angle=a1, color='black', 89 | linewidth=1) 90 | g1_1 = Ellipse(xy=m1, width=w1[1] * 2, 
height=w1[0] * 2, fill=False, linestyle='dashed', angle=a1, color='black', 91 | linewidth=2) 92 | g1_2 = Ellipse(xy=m1, width=w1[1] * 1.4, height=w1[0] * 1.4, fill=False, linestyle='dashed', angle=a1, 93 | color='black', linewidth=3) 94 | 95 | g2 = Ellipse(xy=m2, width=w2[1] * 3, height=w2[0] * 3, fill=False, linestyle='dashed', angle=a2, color='black', 96 | linewidth=1) 97 | g2_1 = Ellipse(xy=m2, width=w2[1] * 2, height=w2[0] * 2, fill=False, linestyle='dashed', angle=a2, color='black', 98 | linewidth=2) 99 | g2_2 = Ellipse(xy=m2, width=w2[1] * 1.4, height=w2[0] * 1.4, fill=False, linestyle='dashed', angle=a2, 100 | color='black', linewidth=3) 101 | 102 | g3 = Ellipse(xy=m3, width=w3[1] * 3, height=w3[0] * 3, fill=False, linestyle='dashed', angle=a3, color='black', 103 | linewidth=1) 104 | g3_1 = Ellipse(xy=m3, width=w3[1] * 2, height=w3[0] * 2, fill=False, linestyle='dashed', angle=a3, color='black', 105 | linewidth=2) 106 | g3_2 = Ellipse(xy=m3, width=w3[1] * 1.4, height=w3[0] * 1.4, fill=False, linestyle='dashed', angle=a3, 107 | color='black', linewidth=3) 108 | 109 | g4 = Ellipse(xy=m4, width=w4[1] * 3, height=w4[0] * 3, fill=False, linestyle='dashed', angle=a4, color='black', 110 | linewidth=1) 111 | g4_1 = Ellipse(xy=m4, width=w4[1] * 2, height=w4[0] * 2, fill=False, linestyle='dashed', angle=a4, color='black', 112 | linewidth=2) 113 | g4_2 = Ellipse(xy=m4, width=w4[1] * 1.4, height=w4[0] * 1.4, fill=False, linestyle='dashed', angle=a4, 114 | color='black', linewidth=3) 115 | 116 | ax.set_xlabel(r'$x_0$', fontsize=16) 117 | ax.set_ylabel(r'$x_1$', fontsize=16) 118 | 119 | ax.add_artist(g1) 120 | ax.add_artist(g1_1) 121 | ax.add_artist(g1_2) 122 | ax.add_artist(g2) 123 | ax.add_artist(g2_1) 124 | ax.add_artist(g2_2) 125 | ax.add_artist(g3) 126 | ax.add_artist(g3_1) 127 | ax.add_artist(g3_2) 128 | ax.add_artist(g4) 129 | ax.add_artist(g4_1) 130 | ax.add_artist(g4_2) 131 | 132 | if c == 1000: 133 | g5 = Ellipse(xy=m5, width=w5[1] * 3, height=w5[0] * 3, fill=False, linestyle='dashed', angle=a5, 134 | color='black', 135 | linewidth=1) 136 | g5_1 = Ellipse(xy=m5, width=w5[1] * 2, height=w5[0] * 2, fill=False, linestyle='dashed', angle=a5, 137 | color='black', 138 | linewidth=2) 139 | g5_2 = Ellipse(xy=m5, width=w5[1] * 1.4, height=w5[0] * 1.4, fill=False, linestyle='dashed', angle=a5, 140 | color='black', linewidth=3) 141 | 142 | ax.add_artist(g5) 143 | ax.add_artist(g5_1) 144 | ax.add_artist(g5_2) 145 | 146 | ax.legend(fontsize=16) 147 | 148 | plt.show() 149 | 150 | 151 | -------------------------------------------------------------------------------- /Chapter05/fuzzy_cmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import load_digits 6 | from sklearn.metrics import adjusted_rand_score 7 | 8 | from skfuzzy.cluster import cmeans 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load the dataset 17 | digits = load_digits() 18 | X = digits['data'] / 255.0 19 | Y = digits['target'] 20 | 21 | # Perform a preliminary analysis 22 | Ws = [] 23 | pcs = [] 24 | 25 | for m in np.linspace(1.05, 1.5, 5): 26 | fc, W, _, _, _, _, pc = cmeans(X.T, c=10, m=m, error=1e-6, maxiter=20000, seed=1000) 27 | Ws.append(W) 28 | pcs.append(pc) 29 | 30 | # Show the results 31 | sns.set() 32 | 33 | fig, ax = plt.subplots(1, 5, figsize=(20, 4)) 34 | 35 | for i, m in enumerate(np.linspace(1.05, 1.5, 
5)): 36 | ax[i].bar(np.arange(10), -np.log(Ws[i][:, 0])) 37 | ax[i].set_xticks(np.arange(10)) 38 | ax[i].set_title(r'$m={}, P_C={:.2f}$'.format(m, pcs[i])) 39 | 40 | ax[0].set_ylabel(r'$-log(w_0j)$') 41 | 42 | plt.show() 43 | 44 | # Perform the clustering 45 | fc, W, _, _, _, _, pc = cmeans(X.T, c=10, m=1.2, error=1e-6, maxiter=20000, seed=1000) 46 | Mu = fc.reshape((10, 8, 8)) 47 | 48 | # Show the centroids 49 | fig, ax = plt.subplots(1, 10, figsize=(20, 4)) 50 | 51 | for i in range(10): 52 | ax[i].imshow(Mu[i] * 255, cmap='gray') 53 | ax[i].grid(False) 54 | ax[i].set_xticks([]) 55 | ax[i].set_yticks([]) 56 | 57 | plt.show() 58 | 59 | # Show the assignments for X[0] 60 | print(W[:, 0]) 61 | 62 | # Compute the adjusted Rand score 63 | Y_pred = np.argmax(W.T, axis=1) 64 | 65 | print(adjusted_rand_score(Y, Y_pred)) 66 | 67 | im = np.argmin(np.std(W.T, axis=1)) 68 | 69 | print(im) 70 | print(Y[im]) 71 | print(W[:, im]) -------------------------------------------------------------------------------- /Chapter05/gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import make_blobs 6 | from sklearn.mixture import GaussianMixture 7 | from sklearn.cluster import KMeans 8 | 9 | from matplotlib.patches import Ellipse 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 300 16 | nb_centers = 2 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, center_box=[-1, 1], centers=nb_centers, 22 | cluster_std=[1.0, 0.6], random_state=1000) 23 | 24 | # Show the dataset 25 | sns.set() 26 | 27 | fig, ax = plt.subplots(figsize=(15, 9)) 28 | 29 | ax.scatter(X[:, 0], X[:, 1], s=120) 30 | 31 | ax.set_xlabel(r'$x_0$', fontsize=14) 32 | ax.set_ylabel(r'$x_1$', fontsize=14) 33 | 34 | plt.show() 35 | 36 | # Train the model 37 | gm = GaussianMixture(n_components=2, random_state=1000) 38 | gm.fit(X) 39 | Y_pred = gm.fit_predict(X) 40 | 41 | print('Means: \n{}'.format(gm.means_)) 42 | print('Covariance matrices: \n{}'.format(gm.covariances_)) 43 | print('Weights: \n{}'.format(gm.weights_)) 44 | 45 | m1 = gm.means_[0] 46 | m2 = gm.means_[1] 47 | 48 | c1 = gm.covariances_[0] 49 | c2 = gm.covariances_[1] 50 | 51 | we1 = 1 + gm.weights_[0] 52 | we2 = 1 + gm.weights_[1] 53 | 54 | # Eigendecompose the covariances 55 | w1, v1 = np.linalg.eigh(c1) 56 | w2, v2 = np.linalg.eigh(c2) 57 | 58 | nv1 = v1 / np.linalg.norm(v1) 59 | nv2 = v2 / np.linalg.norm(v2) 60 | 61 | print('Eigenvalues 1: \n{}'.format(w1)) 62 | print('Eigenvectors 1: \n{}'.format(nv1)) 63 | 64 | print('Eigenvalues 2: \n{}'.format(w2)) 65 | print('Eigenvectors 2: \n{}'.format(nv2)) 66 | 67 | a1 = np.arccos(np.dot(nv1[:, 1], [1.0, 0.0]) / np.linalg.norm(nv1[:, 1])) * 180.0 / np.pi 68 | a2 = np.arccos(np.dot(nv2[:, 1], [1.0, 0.0]) / np.linalg.norm(nv2[:, 1])) * 180.0 / np.pi 69 | 70 | # Perform K-Means clustering 71 | km = KMeans(n_clusters=2, random_state=1000) 72 | km.fit(X) 73 | Y_pred_km = km.predict(X) 74 | 75 | # Show the comparison of the results 76 | fig, ax = plt.subplots(1, 2, figsize=(22, 9), sharey=True) 77 | 78 | ax[0].scatter(X[Y_pred == 0, 0], X[Y_pred == 0, 1], s=80, marker='o', label='Gaussian 1') 79 | ax[0].scatter(X[Y_pred == 1, 0], X[Y_pred == 1, 1], s=80, marker='d', label='Gaussian 2') 80 | 81 | g1 = Ellipse(xy=m1, width=w1[1] * 3, height=w1[0] * 3, fill=False, linestyle='dashed', 
angle=a1, color='black', 82 | linewidth=1) 83 | g1_1 = Ellipse(xy=m1, width=w1[1] * 2, height=w1[0] * 2, fill=False, linestyle='dashed', angle=a1, color='black', 84 | linewidth=2) 85 | g1_2 = Ellipse(xy=m1, width=w1[1] * 1.4, height=w1[0] * 1.4, fill=False, linestyle='dashed', angle=a1, 86 | color='black', linewidth=3) 87 | 88 | g2 = Ellipse(xy=m2, width=w2[1] * 3, height=w2[0] * 3, fill=False, linestyle='dashed', angle=a2, color='black', 89 | linewidth=1) 90 | g2_1 = Ellipse(xy=m2, width=w2[1] * 2, height=w2[0] * 2, fill=False, linestyle='dashed', angle=a2, color='black', 91 | linewidth=2) 92 | g2_2 = Ellipse(xy=m2, width=w2[1] * 1.4, height=w2[0] * 1.4, fill=False, linestyle='dashed', angle=a2, 93 | color='black', linewidth=3) 94 | 95 | ax[0].set_xlabel(r'$x_0$', fontsize=16) 96 | ax[0].set_ylabel(r'$x_1$', fontsize=16) 97 | 98 | ax[0].add_artist(g1) 99 | ax[0].add_artist(g1_1) 100 | ax[0].add_artist(g1_2) 101 | ax[0].add_artist(g2) 102 | ax[0].add_artist(g2_1) 103 | ax[0].add_artist(g2_2) 104 | 105 | ax[0].set_title('Gaussian Mixture', fontsize=16) 106 | 107 | ax[0].legend(fontsize=16) 108 | 109 | ax[1].scatter(X[Y_pred_km == 0, 0], X[Y_pred_km == 0, 1], s=80, marker='o', label='Cluster 1') 110 | ax[1].scatter(X[Y_pred_km == 1, 0], X[Y_pred_km == 1, 1], s=80, marker='d', label='Cluster 2') 111 | 112 | ax[1].set_xlabel(r'$x_0$', fontsize=16) 113 | 114 | ax[1].set_title('K-Means', fontsize=16) 115 | 116 | ax[1].legend(fontsize=16) 117 | 118 | # Predict the probability of some sample points 119 | print('P([0, -2]=G1) = {:.3f} and P([0, -2]=G2) = {:.3f}'.format(*list(gm.predict_proba([[0.0, -2.0]]).squeeze()))) 120 | print('P([1, -1]=G1) = {:.3f} and P([1, -1]=G2) = {:.3f}'.format(*list(gm.predict_proba([[1.0, -1.0]]).squeeze()))) 121 | print('P([1, 0]=G1) = {:.3f} and P([1, 0]=G2) = {:.3f}'.format(*list(gm.predict_proba([[1.0, 0.0]]).squeeze()))) 122 | 123 | plt.show() 124 | 125 | # Compute AICs, BICs, and log-likelihood 126 | n_max_components = 20 127 | 128 | aics = [] 129 | bics = [] 130 | log_likelihoods = [] 131 | 132 | for n in range(1, n_max_components + 1): 133 | gm = GaussianMixture(n_components=n, random_state=1000) 134 | gm.fit(X) 135 | aics.append(gm.aic(X)) 136 | bics.append(gm.bic(X)) 137 | log_likelihoods.append(gm.score(X) * nb_samples) 138 | 139 | # Show the results 140 | fig, ax = plt.subplots(1, 3, figsize=(20, 6)) 141 | 142 | ax[0].plot(range(1, n_max_components + 1), aics) 143 | ax[0].set_xticks(range(1, n_max_components + 1)) 144 | 145 | ax[0].set_xlabel('Number of Gaussians', fontsize=14) 146 | ax[0].set_title('AIC', fontsize=14) 147 | 148 | ax[1].plot(range(1, n_max_components + 1), bics) 149 | ax[1].set_xticks(range(1, n_max_components + 1)) 150 | 151 | ax[1].set_xlabel('Number of Gaussians', fontsize=14) 152 | ax[1].set_title('BIC', fontsize=14) 153 | 154 | ax[2].plot(range(1, n_max_components + 1), log_likelihoods) 155 | ax[2].set_xticks(range(1, n_max_components + 1)) 156 | 157 | ax[2].set_xlabel('Number of Gaussians', fontsize=14) 158 | ax[2].set_title('Log-likelihood', fontsize=14) 159 | 160 | plt.show() 161 | 162 | -------------------------------------------------------------------------------- /Chapter05/generative_gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import make_blobs 6 | 7 | from matplotlib.patches import Ellipse 8 | 9 | from scipy.stats import multivariate_normal 10 | 11 | 12 | # 
For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | nb_samples = 500 17 | nb_unlabeled = 400 18 | nb_iterations = 10 19 | 20 | 21 | m1 = np.array([-2.0, -2.5]) 22 | c1 = np.array([[1.0, 1.0], 23 | [1.0, 2.0]]) 24 | q1 = 0.5 25 | 26 | m2 = np.array([1.0, 3.0]) 27 | c2 = np.array([[2.0, -1.0], 28 | [-1.0, 3.5]]) 29 | q2 = 0.5 30 | 31 | 32 | if __name__ == '__main__': 33 | # Create the dataset 34 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=2, cluster_std=1.5, random_state=100) 35 | 36 | unlabeled_idx = np.random.choice(np.arange(0, nb_samples, 1), replace=False, size=nb_unlabeled) 37 | Y[unlabeled_idx] = -1 38 | 39 | # Show the initial configuration 40 | w1, v1 = np.linalg.eigh(c1) 41 | w2, v2 = np.linalg.eigh(c2) 42 | 43 | nv1 = v1 / np.linalg.norm(v1) 44 | nv2 = v2 / np.linalg.norm(v2) 45 | 46 | a1 = np.arccos(np.dot(nv1[:, 1], [1.0, 0.0]) / np.linalg.norm(nv1[:, 1])) * 180.0 / np.pi 47 | a2 = np.arccos(np.dot(nv2[:, 1], [1.0, 0.0]) / np.linalg.norm(nv2[:, 1])) * 180.0 / np.pi 48 | 49 | sns.set() 50 | 51 | fig, ax = plt.subplots(figsize=(22, 12)) 52 | 53 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], s=80, marker='o', label='Class 1') 54 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], s=80, marker='d', label='Class 2') 55 | ax.scatter(X[Y == -1, 0], X[Y == -1, 1], s=100, marker='x', label='Unlabeled') 56 | 57 | g1 = Ellipse(xy=m1, width=w1[1] * 3, height=w1[0] * 3, fill=False, linestyle='dashed', angle=a1, color='black', 58 | linewidth=1) 59 | g1_1 = Ellipse(xy=m1, width=w1[1] * 2, height=w1[0] * 2, fill=False, linestyle='dashed', angle=a1, color='black', 60 | linewidth=2) 61 | g1_2 = Ellipse(xy=m1, width=w1[1] * 1.4, height=w1[0] * 1.4, fill=False, linestyle='dashed', angle=a1, 62 | color='black', linewidth=3) 63 | 64 | g2 = Ellipse(xy=m2, width=w2[1] * 3, height=w2[0] * 3, fill=False, linestyle='dashed', angle=a2, color='black', 65 | linewidth=1) 66 | g2_1 = Ellipse(xy=m2, width=w2[1] * 2, height=w2[0] * 2, fill=False, linestyle='dashed', angle=a2, color='black', 67 | linewidth=2) 68 | g2_2 = Ellipse(xy=m2, width=w2[1] * 1.4, height=w2[0] * 1.4, fill=False, linestyle='dashed', angle=a2, 69 | color='black', linewidth=3) 70 | 71 | ax.set_xlabel(r'$x_0$', fontsize=16) 72 | ax.set_ylabel(r'$x_1$', fontsize=16) 73 | 74 | ax.add_artist(g1) 75 | ax.add_artist(g1_1) 76 | ax.add_artist(g1_2) 77 | ax.add_artist(g2) 78 | ax.add_artist(g2_1) 79 | ax.add_artist(g2_2) 80 | 81 | ax.legend(fontsize=16) 82 | 83 | plt.show() 84 | 85 | # Train the model 86 | for i in range(nb_iterations): 87 | Pij = np.zeros((nb_samples, 2)) 88 | 89 | for i in range(nb_samples): 90 | 91 | if Y[i] == -1: 92 | p1 = multivariate_normal.pdf(X[i], m1, c1, allow_singular=True) * q1 93 | p2 = multivariate_normal.pdf(X[i], m2, c2, allow_singular=True) * q2 94 | Pij[i] = [p1, p2] / (p1 + p2) 95 | else: 96 | Pij[i, :] = [1.0, 0.0] if Y[i] == 0 else [0.0, 1.0] 97 | 98 | n = np.sum(Pij, axis=0) 99 | m = np.sum(np.dot(Pij.T, X), axis=0) 100 | 101 | m1 = np.dot(Pij[:, 0], X) / n[0] 102 | m2 = np.dot(Pij[:, 1], X) / n[1] 103 | 104 | q1 = n[0] / float(nb_samples) 105 | q2 = n[1] / float(nb_samples) 106 | 107 | c1 = np.zeros((2, 2)) 108 | c2 = np.zeros((2, 2)) 109 | 110 | for t in range(nb_samples): 111 | c1 += Pij[t, 0] * np.outer(X[t] - m1, X[t] - m1) 112 | c2 += Pij[t, 1] * np.outer(X[t] - m2, X[t] - m2) 113 | 114 | c1 /= n[0] 115 | c2 /= n[1] 116 | 117 | print('Gaussian 1:') 118 | print(q1) 119 | print(m1) 120 | print(c1) 121 | 122 | print('\nGaussian 2:') 123 | print(q2) 124 | print(m2) 125 | print(c2) 126 | 127 | # 
Show the final configuration 128 | # Show the initial configuration 129 | w1, v1 = np.linalg.eigh(c1) 130 | w2, v2 = np.linalg.eigh(c2) 131 | 132 | nv1 = v1 / np.linalg.norm(v1) 133 | nv2 = v2 / np.linalg.norm(v2) 134 | 135 | a1 = np.arccos(np.dot(nv1[:, 1], [1.0, 0.0]) / np.linalg.norm(nv1[:, 1])) * 180.0 / np.pi 136 | a2 = np.arccos(np.dot(nv2[:, 1], [1.0, 0.0]) / np.linalg.norm(nv2[:, 1])) * 180.0 / np.pi 137 | 138 | sns.set() 139 | 140 | fig, ax = plt.subplots(figsize=(22, 12)) 141 | 142 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], s=80, marker='o', label='Class 1') 143 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], s=80, marker='d', label='Class 2') 144 | ax.scatter(X[Y == -1, 0], X[Y == -1, 1], s=100, marker='x', label='Unlabeled') 145 | 146 | g1 = Ellipse(xy=m1, width=w1[1] * 3, height=w1[0] * 3, fill=False, linestyle='dashed', angle=a1, color='black', 147 | linewidth=1) 148 | g1_1 = Ellipse(xy=m1, width=w1[1] * 2, height=w1[0] * 2, fill=False, linestyle='dashed', angle=a1, color='black', 149 | linewidth=2) 150 | g1_2 = Ellipse(xy=m1, width=w1[1] * 1.4, height=w1[0] * 1.4, fill=False, linestyle='dashed', angle=a1, 151 | color='black', linewidth=3) 152 | 153 | g2 = Ellipse(xy=m2, width=w2[1] * 3, height=w2[0] * 3, fill=False, linestyle='dashed', angle=a2, color='black', 154 | linewidth=1) 155 | g2_1 = Ellipse(xy=m2, width=w2[1] * 2, height=w2[0] * 2, fill=False, linestyle='dashed', angle=a2, color='black', 156 | linewidth=2) 157 | g2_2 = Ellipse(xy=m2, width=w2[1] * 1.4, height=w2[0] * 1.4, fill=False, linestyle='dashed', angle=a2, 158 | color='black', linewidth=3) 159 | 160 | ax.set_xlabel(r'$x_0$', fontsize=16) 161 | ax.set_ylabel(r'$x_1$', fontsize=16) 162 | 163 | ax.add_artist(g1) 164 | ax.add_artist(g1_1) 165 | ax.add_artist(g1_2) 166 | ax.add_artist(g2) 167 | ax.add_artist(g2_1) 168 | ax.add_artist(g2_2) 169 | 170 | ax.legend(fontsize=16) 171 | 172 | plt.show() -------------------------------------------------------------------------------- /Chapter06/histogram.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | 6 | # For reproducibility 7 | np.random.seed(1000) 8 | 9 | 10 | if __name__ == '__main__': 11 | # Generate the dataset 12 | nb_samples = [1000, 800, 500, 380, 280, 150, 120, 100, 50, 30] 13 | 14 | ages = [] 15 | 16 | for n in nb_samples: 17 | i = np.random.uniform(10, 80, size=2) 18 | a = np.random.uniform(i[0], i[1], size=n).astype(np.int32) 19 | ages.append(a) 20 | 21 | ages = np.concatenate(ages) 22 | 23 | # Compute the histogram 24 | # Any FutureWarning is related to SciPy deprecations which are still employed by NumPy but 25 | # it doesn't affect the results 26 | h, e = np.histogram(ages, bins='auto') 27 | 28 | print('Histograms counts: {}'.format(h)) 29 | print('Bin edges: {}'.format(e)) 30 | 31 | # Show the histogram 32 | sns.set() 33 | 34 | fig, ax = plt.subplots(figsize=(16, 10)) 35 | 36 | sns.distplot(ages, kde=False, ax=ax, label='Age count') 37 | 38 | ax.set_xlabel('Age', fontsize=14) 39 | ax.set_ylabel('Number of entries', fontsize=14) 40 | 41 | ax.set_xticks(e) 42 | 43 | ax.legend() 44 | 45 | plt.show() 46 | 47 | # Compute the probability for a sample interval 48 | d = e[1] - e[0] 49 | p50 = float(h[12]) / float(ages.shape[0]) 50 | 51 | print('P(48.84 < x < 51.58) = {:.2f} ({:.2f}%)'.format(p50, p50 * 100.0)) -------------------------------------------------------------------------------- /Chapter06/isolation_forest.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import load_wine 6 | from sklearn.preprocessing import StandardScaler 7 | from sklearn.ensemble import IsolationForest 8 | from sklearn.manifold import TSNE 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 2000 16 | nb_test_samples = 200 17 | 18 | 19 | if __name__ == '__main__': 20 | # Load the dataset 21 | wine = load_wine() 22 | X = wine['data'].astype(np.float64) 23 | 24 | # Normalize the dataset 25 | ss = StandardScaler() 26 | X = ss.fit_transform(X) 27 | 28 | # Train the isolation forest 29 | isf = IsolationForest(n_estimators=150, behaviour='new', contamination=0.01, random_state=1000) 30 | Y_pred = isf.fit_predict(X) 31 | 32 | print('Outliers in the training set: {}'.format(np.sum(Y_pred == -1))) 33 | 34 | # Create the test set 35 | X_test_1 = np.mean(X) + np.random.normal(0.0, 1.0, size=(50, 13)) 36 | X_test_2 = np.mean(X) + np.random.normal(0.0, 2.0, size=(50, 13)) 37 | X_test = np.concatenate([X_test_1, X_test_2], axis=0) 38 | 39 | Y_test = isf.predict(X_test) * 2 40 | 41 | Xf = np.concatenate([X, X_test], axis=0) 42 | Yf = np.concatenate([Y_pred, Y_test], axis=0) 43 | 44 | print(Yf[::-1]) 45 | 46 | # Perform the t-SNE dimensionality reduction 47 | tsne = TSNE(n_components=2, perplexity=5, n_iter=5000, random_state=1000) 48 | X_tsne = tsne.fit_transform(Xf) 49 | 50 | # Show the results 51 | sns.set() 52 | 53 | fig, ax = plt.subplots(figsize=(15, 10)) 54 | 55 | ax.scatter(X_tsne[Yf == 1, 0], X_tsne[Yf == 1, 1], marker='o', s=100, label='Inliers') 56 | ax.scatter(X_tsne[Yf == -1, 0], X_tsne[Yf == -1, 1], marker='x', s=100, label='Ouliers') 57 | ax.scatter(X_tsne[Yf == 2, 0], X_tsne[Yf == 2, 1], marker='^', s=80, label='Test inliers') 58 | ax.scatter(X_tsne[Yf == -2, 0], X_tsne[Yf == -2, 1], marker='v', s=80, label='Test ouliers') 59 | 60 | ax.set_xlabel(r'$x_1$', fontsize=14) 61 | ax.set_ylabel(r'$x_2$', fontsize=14) 62 | 63 | ax.legend(fontsize=14) 64 | 65 | plt.show() 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter06/kddcup99_anomaly_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import fetch_kddcup99 6 | from sklearn.neighbors import KernelDensity 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | def is_anomaly(kd, source, destination, medium_thr=0.03, high_thr=0.015): 14 | xs = np.log(source + 0.1) 15 | xd = np.log(destination + 0.1) 16 | data = np.array([[xs, xd]]) 17 | 18 | density = np.exp(kd.score_samples(data))[0] 19 | 20 | if density >= medium_thr: 21 | return density, 'Normal connection' 22 | elif density >= high_thr: 23 | return density, 'Medium risk' 24 | else: 25 | return density, 'High risk' 26 | 27 | 28 | if __name__ == '__main__': 29 | # Load the dataset 30 | kddcup99 = fetch_kddcup99(subset='http', percent10=True, random_state=1000) 31 | 32 | X = kddcup99['data'].astype(np.float64) 33 | Y = kddcup99['target'] 34 | 35 | print('Statuses: {}'.format(np.unique(Y))) 36 | print('Normal samples: {}'.format(X[Y == b'normal.'].shape[0])) 37 | print('Anomalies: {}'.format(X[Y != b'normal.'].shape[0])) 38 | 39 | means = np.mean(X, axis=0) 40 | stds = np.std(X, axis=0) 41 | IQRs = np.percentile(X, 75, 
axis=0) - np.percentile(X, 25, axis=0) 42 | 43 | # Show the histogram of the durations 44 | # Any FutureWarning is related to SciPy deprecations which are still employed by NumPy but 45 | # it doesn't affect the results 46 | h0, e0 = np.histogram(X[:, 0], bins='auto') 47 | 48 | sns.set() 49 | 50 | fig, ax = plt.subplots(figsize=(16, 10)) 51 | 52 | sns.distplot(X[:, 0], kde=False, ax=ax) 53 | 54 | ax.set_xlabel('Duration', fontsize=14) 55 | ax.set_ylabel('Number of entries', fontsize=14) 56 | 57 | ax.set_xticks(e0) 58 | 59 | plt.show() 60 | 61 | # Compute the optimal bandwidth 62 | N = float(X.shape[0]) 63 | 64 | h0 = 0.9 * np.min([stds[0], IQRs[0] / 1.34]) * np.power(N, -0.2) 65 | h1 = 0.9 * np.min([stds[1], IQRs[1] / 1.34]) * np.power(N, -0.2) 66 | h2 = 0.9 * np.min([stds[2], IQRs[2] / 1.34]) * np.power(N, -0.2) 67 | 68 | print('h0 = {:.3f}, h1 = {:.3f}, h2 = {:.3f}'.format(h0, h1, h2)) 69 | 70 | # Show the KDE for normal and malicious connections 71 | fig, ax = plt.subplots(2, 3, figsize=(22, 10)) 72 | 73 | sns.distplot(X[Y == b'normal.', 0], kde=True, ax=ax[0, 0], label='KDE') 74 | sns.distplot(X[Y == b'normal.', 1], kde=True, ax=ax[0, 1], label='KDE') 75 | sns.distplot(X[Y == b'normal.', 2], kde=True, ax=ax[0, 2], label='KDE') 76 | 77 | sns.distplot(X[Y != b'normal.', 0], kde=True, ax=ax[1, 0], label='KDE') 78 | sns.distplot(X[Y != b'normal.', 1], kde=True, ax=ax[1, 1], label='KDE') 79 | sns.distplot(X[Y != b'normal.', 2], kde=True, ax=ax[1, 2], label='KDE') 80 | 81 | ax[0, 0].set_title('Duration', fontsize=16) 82 | ax[0, 1].set_title('Source bytes', fontsize=16) 83 | ax[0, 2].set_title('Destination bytes', fontsize=16) 84 | 85 | ax[0, 0].set_xticks(np.arange(-4, 12, 2)) 86 | ax[1, 0].set_xticks(np.arange(-4, 12, 2)) 87 | 88 | ax[0, 1].set_xticks(np.arange(-10, 16, 2)) 89 | ax[1, 1].set_xticks(np.arange(-10, 16, 2)) 90 | 91 | ax[0, 2].set_xticks(np.arange(-2, 14, 2)) 92 | ax[1, 2].set_xticks(np.arange(-2, 14, 2)) 93 | 94 | plt.show() 95 | 96 | # Perform the KDE 97 | X = X[:, 1:] 98 | 99 | kd = KernelDensity(kernel='gaussian', bandwidth=0.025) 100 | kd.fit(X[Y == b'normal.']) 101 | 102 | Yn = np.exp(kd.score_samples(X[Y == b'normal.'])) 103 | Ya = np.exp(kd.score_samples(X[Y != b'normal.'])) 104 | 105 | print('Mean normal: {:.5f} - Std: {:.5f}'.format(np.mean(Yn), np.std(Yn))) 106 | print('Mean anomalies: {:.5f} - Std: {:.5f}'.format(np.mean(Ya), np.std(Ya))) 107 | 108 | print(np.sum(Yn < 0.05)) 109 | print(np.sum(Yn < 0.03)) 110 | print(np.sum(Yn < 0.02)) 111 | print(np.sum(Yn < 0.015)) 112 | 113 | print(np.sum(Ya < 0.015)) 114 | 115 | # Perform some sample anomaly detections 116 | print('p = {:.2f} - {}'.format(*is_anomaly(kd, 200, 1100))) 117 | print('p = {:.2f} - {}'.format(*is_anomaly(kd, 360, 200))) 118 | print('p = {:.2f} - {}'.format(*is_anomaly(kd, 800, 1800))) 119 | 120 | # Show the bivariate KDE plot 121 | fig, ax = plt.subplots(figsize=(13, 8)) 122 | 123 | sns.kdeplot(X[Y != b'normal.', 0], X[Y != b'normal.', 1], cmap="Reds", shade=True, shade_lowest=False, kernel='gau', 124 | bw=0.025, ax=ax, label='Anomaly') 125 | sns.kdeplot(X[Y == b'normal.', 0], X[Y == b'normal.', 1], cmap="Blues", shade=True, shade_lowest=False, 126 | kernel='gau', bw=0.025, ax=ax, label='Normal') 127 | 128 | ax.set_xlabel('Source Bytes (logarithmic)', fontsize=14) 129 | ax.set_ylabel('Destination Bytes (logarithmic)', fontsize=14) 130 | 131 | ax.set_xlim(4, 12) 132 | ax.set_ylim(5, 11) 133 | 134 | ax.legend() 135 | 136 | plt.show() 
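    # --- Added note (not part of the original script) ---
    # The KernelDensity model above uses a hand-picked bandwidth (0.025) even though the
    # rule-of-thumb values h0, h1, h2 are computed earlier. A minimal, optional sketch of
    # selecting the bandwidth by cross-validated log-likelihood follows; the candidate grid
    # and the 2,000-sample subsample are illustrative assumptions, not values from the book.
    from sklearn.model_selection import GridSearchCV

    # Subsample the normal connections to keep the grid search fast
    Xn_sub = X[Y == b'normal.'][:2000]

    search = GridSearchCV(KernelDensity(kernel='gaussian'),
                          param_grid={'bandwidth': [0.01, 0.025, 0.05, 0.1, 0.2]},
                          cv=5)
    search.fit(Xn_sub)

    print('Cross-validated bandwidth: {:.3f}'.format(search.best_params_['bandwidth']))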
-------------------------------------------------------------------------------- /Chapter06/kernel_density_estimation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.neighbors import KernelDensity 6 | 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | 12 | if __name__ == '__main__': 13 | # Generate the dataset 14 | nb_samples = [1000, 800, 500, 380, 280, 150, 120, 100, 50, 30] 15 | 16 | ages = [] 17 | 18 | for n in nb_samples: 19 | i = np.random.uniform(10, 80, size=2) 20 | a = np.random.uniform(i[0], i[1], size=n).astype(np.int32) 21 | ages.append(a) 22 | 23 | ages = np.concatenate(ages) 24 | 25 | # Train KDE with Gaussian kernels and 3 bandwidths 26 | kd_01 = KernelDensity(kernel='gaussian', bandwidth=0.1) 27 | kd_05 = KernelDensity(kernel='gaussian', bandwidth=0.5) 28 | kd_15 = KernelDensity(kernel='gaussian', bandwidth=1.5) 29 | 30 | kd_01.fit(ages.reshape(-1, 1)) 31 | kd_05.fit(ages.reshape(-1, 1)) 32 | kd_15.fit(ages.reshape(-1, 1)) 33 | 34 | # Show the results 35 | sns.set() 36 | 37 | fig, ax = plt.subplots(3, 1, figsize=(14, 20), sharex=True) 38 | 39 | data = np.arange(10, 70, 0.05).reshape(-1, 1) 40 | 41 | ax[0].plot(data, np.exp(kd_01.score_samples(data))) 42 | ax[0].set_title('Bandwidth = 0.1', fontsize=14) 43 | ax[0].set_ylabel('Density', fontsize=14) 44 | 45 | ax[1].plot(data, np.exp(kd_05.score_samples(data))) 46 | ax[1].set_title('Bandwidth = 0.5', fontsize=14) 47 | ax[1].set_ylabel('Density', fontsize=14) 48 | 49 | ax[2].plot(data, np.exp(kd_15.score_samples(data))) 50 | ax[2].set_title('Bandwidth = 1.5', fontsize=14) 51 | ax[2].set_xlabel('Age', fontsize=14) 52 | ax[2].set_ylabel('Density', fontsize=14) 53 | 54 | plt.show() 55 | 56 | # Compute the optimal bandwidth (method 1) 57 | N = float(ages.shape[0]) 58 | h = 1.06 * np.std(ages) * np.power(N, -0.2) 59 | 60 | print('h = {:.3f}'.format(h)) 61 | 62 | # Compute the optimal bandwidth (method 2) 63 | IQR = np.percentile(ages, 75) - np.percentile(ages, 25) 64 | h = 0.9 * np.min([np.std(ages), IQR / 1.34]) * np.power(N, -0.2) 65 | 66 | print('h = {:.3f}'.format(h)) 67 | 68 | # Train KDE with different kernels and bandwidth = 2.0 69 | kd_gaussian = KernelDensity(kernel='gaussian', bandwidth=2.0) 70 | kd_epanechnikov = KernelDensity(kernel='epanechnikov', bandwidth=2.0) 71 | kd_exponential = KernelDensity(kernel='exponential', bandwidth=2.0) 72 | 73 | kd_gaussian.fit(ages.reshape(-1, 1)) 74 | kd_epanechnikov.fit(ages.reshape(-1, 1)) 75 | kd_exponential.fit(ages.reshape(-1, 1)) 76 | 77 | # Show the results 78 | fig, ax = plt.subplots(3, 1, figsize=(14, 20), sharex=False) 79 | 80 | data = np.arange(10, 70, 0.05).reshape(-1, 1) 81 | 82 | ax[0].plot(data, np.exp(kd_gaussian.score_samples(data))) 83 | ax[0].set_title('Gaussian Kernel', fontsize=14) 84 | ax[0].set_ylabel('Density', fontsize=14) 85 | 86 | ax[1].plot(data, np.exp(kd_epanechnikov.score_samples(data))) 87 | ax[1].set_title('Epanechnikov Kernel', fontsize=14) 88 | ax[1].set_ylabel('Density', fontsize=14) 89 | ax[1].set_xlabel('Age', fontsize=14) 90 | 91 | ax[2].plot(data, np.exp(kd_exponential.score_samples(data))) 92 | ax[2].set_title('Exponential Kernel', fontsize=14) 93 | ax[2].set_xlabel('Age', fontsize=14) 94 | ax[2].set_ylabel('Density', fontsize=14) 95 | 96 | plt.show() 97 | 98 | # Perform a sample anomaly detection 99 | test_data = np.array([12, 15, 18, 20, 25, 30, 40, 50, 55, 60, 65, 70, 75, 80, 85, 
90]).reshape(-1, 1) 100 | 101 | test_densities_epanechnikov = np.exp(kd_epanechnikov.score_samples(test_data)) 102 | test_densities_gaussian = np.exp(kd_gaussian.score_samples(test_data)) 103 | 104 | for age, density in zip(np.squeeze(test_data), test_densities_epanechnikov): 105 | print('p(Age = {:d}) = {:.7f} ({})'.format(age, density, 'Anomaly' if density < 0.005 else 'Normal')) 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /Chapter06/one_class_svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.svm import OneClassSVM 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_samples = 2000 14 | nb_test_samples = 200 15 | 16 | 17 | if __name__ == '__main__': 18 | # Generate the dataset 19 | X = np.empty(shape=(nb_samples + nb_test_samples, 2)) 20 | 21 | X[:nb_samples] = np.random.multivariate_normal([15, 160], np.diag([1.5, 10]), size=nb_samples) 22 | X[nb_samples:, 0] = np.random.uniform(11, 19, size=nb_test_samples) 23 | X[nb_samples:, 1] = np.random.uniform(120, 210, size=nb_test_samples) 24 | 25 | # Normalize the dataset 26 | ss = StandardScaler() 27 | Xs = ss.fit_transform(X) 28 | 29 | # Show the dataset 30 | sns.set() 31 | 32 | fig, ax = plt.subplots(figsize=(13, 8)) 33 | 34 | ax.scatter(Xs[nb_samples:, 0], Xs[nb_samples:, 1], marker='^', s=80, label='Test samples') 35 | ax.scatter(Xs[:nb_samples, 0], Xs[:nb_samples, 1], label='Inliers') 36 | 37 | ax.set_xlabel('Age', fontsize=14) 38 | ax.set_ylabel('Height', fontsize=14) 39 | 40 | ax.legend(fontsize=14) 41 | 42 | plt.show() 43 | 44 | # Train the One-Class SVM 45 | ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.2) 46 | Ys = ocsvm.fit_predict(Xs) 47 | 48 | # Show the results 49 | fig, ax = plt.subplots(1, 2, figsize=(22, 10), sharey=True) 50 | 51 | ax[0].scatter(Xs[Ys == -1, 0], Xs[Ys == -1, 1], marker='x', s=100, label='Ouliers') 52 | ax[0].scatter(Xs[Ys == 1, 0], Xs[Ys == 1, 1], marker='o', label='Inliers') 53 | 54 | ax[1].scatter(Xs[Ys == -1, 0], Xs[Ys == -1, 1], marker='x', s=100) 55 | 56 | ax[0].set_xlabel('Age', fontsize=16) 57 | ax[0].set_ylabel('Height', fontsize=16) 58 | 59 | ax[1].set_xlabel('Age', fontsize=16) 60 | 61 | ax[0].set_title('All samples', fontsize=16) 62 | ax[1].set_title('Outliers', fontsize=16) 63 | 64 | ax[0].legend(fontsize=16) 65 | 66 | plt.show() 67 | 68 | -------------------------------------------------------------------------------- /Chapter07/factor_analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import fetch_olivetti_faces 6 | from sklearn.decomposition import PCA, FactorAnalysis 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load the dataset 15 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 16 | X = faces['data'] 17 | Xz = X - np.mean(X, axis=0) 18 | 19 | # Create a noisy version 20 | C = np.diag(np.random.uniform(0.0, 0.1, size=Xz.shape[1])) 21 | Xnz = Xz + np.random.multivariate_normal(np.zeros(shape=Xz.shape[1]), C, size=Xz.shape[0]) 22 | 23 | # Show some samples 24 | sns.set() 25 | 26 | fig, ax = plt.subplots(2, 10, figsize=(22, 6)) 27 | 28 | Xn = Xnz + np.mean(X, axis=0) 29 | 30 | 
for i in range(10): 31 | ax[0, i].imshow(X[i].reshape((64, 64)), cmap='gray') 32 | ax[0, i].set_xticks([]) 33 | ax[0, i].set_yticks([]) 34 | 35 | ax[1, i].imshow(Xn[i].reshape((64, 64)), cmap='gray') 36 | ax[1, i].set_xticks([]) 37 | ax[1, i].set_yticks([]) 38 | 39 | plt.show() 40 | 41 | # Perform the evaluations 42 | pca = PCA(n_components=128, random_state=1000) 43 | pca.fit(Xz) 44 | print('PCA log-likelihood(Xz): {}'.format(pca.score(Xz))) 45 | 46 | pcan = PCA(n_components=128, random_state=1000) 47 | pcan.fit(Xnz) 48 | print('PCA log-likelihood(Xnz): {}'.format(pcan.score(Xnz))) 49 | 50 | fa = FactorAnalysis(n_components=128, random_state=1000) 51 | fa.fit(Xnz) 52 | print('Factor Analysis log-likelihood(Xnz): {}'.format(fa.score(Xnz))) 53 | -------------------------------------------------------------------------------- /Chapter07/fastica.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import load_digits, fetch_olivetti_faces 6 | from sklearn.decomposition import FastICA 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load the dataset 15 | digits = load_digits() 16 | X = digits['data'] / np.max(digits['data']) 17 | 18 | # Perform the fast ICA 19 | ica = FastICA(n_components=50, max_iter=10000, tol=1e-5, random_state=1000) 20 | ica.fit(X) 21 | 22 | # Show the components 23 | sns.set() 24 | 25 | fig, ax = plt.subplots(5, 10, figsize=(15, 10)) 26 | 27 | for i in range(5): 28 | for j in range(10): 29 | ax[i, j].imshow(ica.components_[(2 * j) + i].reshape((8, 8)), cmap='gray') 30 | ax[i, j].set_xticks([]) 31 | ax[i, j].set_yticks([]) 32 | 33 | plt.show() 34 | 35 | # Load the Olivetti faces dataset 36 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 37 | 38 | # Show the first 10 faces 39 | fig, ax = plt.subplots(1, 10, figsize=(22, 12)) 40 | 41 | for i in range(10): 42 | ax[i].imshow(faces['images'][i], cmap='gray') 43 | ax[i].set_xticks([]) 44 | ax[i].set_yticks([]) 45 | 46 | plt.show() 47 | 48 | # Perform the fast ICA 49 | for n in (100, 350): 50 | ica = FastICA(n_components=n, max_iter=10000, tol=1e-5, random_state=1000) 51 | ica.fit(faces['data']) 52 | 53 | # Show the first 50 components 54 | fig, ax = plt.subplots(5, 10, figsize=(15, 10)) 55 | 56 | for i in range(5): 57 | for j in range(10): 58 | ax[i, j].imshow(ica.components_[(5 * j) + i].reshape((64, 64)), cmap='gray') 59 | ax[i, j].set_xticks([]) 60 | ax[i, j].set_yticks([]) 61 | 62 | plt.show() 63 | 64 | -------------------------------------------------------------------------------- /Chapter07/kernel_pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import make_moons 6 | from sklearn.decomposition import KernelPCA 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Create the dataset 15 | X, Y = make_moons(n_samples=800, noise=0.05, random_state=1000) 16 | 17 | # Perform a Kernel PCA 18 | kpca = KernelPCA(n_components=2, kernel='rbf', gamma=10.0, random_state=1000) 19 | X_pca = kpca.fit_transform(X) 20 | 21 | # Show the results 22 | sns.set() 23 | 24 | fig, ax = plt.subplots(1, 2, figsize=(22, 8)) 25 | 26 | ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1]) 27 | ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1]) 28 | 
ax[0].set_xlabel(r'$x_1$', fontsize=16) 29 | ax[0].set_ylabel(r'$x_2$', fontsize=16) 30 | ax[0].set_title('Original dataset', fontsize=16) 31 | 32 | ax[1].scatter(X_pca[Y == 0, 0], X_pca[Y == 0, 1]) 33 | ax[1].scatter(X_pca[Y == 1, 0], X_pca[Y == 1, 1]) 34 | ax[1].set_xlabel('First component', fontsize=16) 35 | ax[1].set_ylabel('Second component', fontsize=16) 36 | ax[1].set_title('RBF Kernel PCA projected dataset', fontsize=16) 37 | 38 | plt.show() -------------------------------------------------------------------------------- /Chapter07/lda.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import fetch_20newsgroups 6 | from sklearn.feature_extraction.text import CountVectorizer 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load the dataset 16 | news = fetch_20newsgroups(subset='all', 17 | categories=('rec.autos', 'comp.sys.mac.hardware'), 18 | remove=('headers', 'footers', 'quotes'), random_state=1000) 19 | 20 | corpus = news['data'] 21 | labels = news['target'] 22 | 23 | # Vectorize the dataset 24 | cv = CountVectorizer(strip_accents='unicode', stop_words='english', analyzer='word', token_pattern='[a-z]+') 25 | Xc = cv.fit_transform(corpus) 26 | 27 | print(len(cv.vocabulary_)) 28 | 29 | # Perform the LDA 30 | lda = LatentDirichletAllocation(n_components=2, learning_method='online', max_iter=100, random_state=1000) 31 | Xl = lda.fit_transform(Xc) 32 | 33 | # Show the top-10 words per topic 34 | Mwts_lda = np.argsort(lda.components_, axis=1)[::-1] 35 | 36 | for t in range(2): 37 | print('\nTopic ' + str(t)) 38 | for i in range(10): 39 | print(cv.get_feature_names()[Mwts_lda[t, i]]) 40 | 41 | # Show the sample messages 42 | print(corpus[100]) 43 | print(corpus[200]) 44 | 45 | # Show the topic mixtures 46 | print(Xl[100]) 47 | print(Xl[200]) 48 | 49 | # Show the mixtures for both sub-categories 50 | sns.set() 51 | 52 | fig, ax = plt.subplots(1, 2, figsize=(22, 8), sharey=True) 53 | 54 | x0 = Xl[labels == 0] 55 | x1 = Xl[labels == 1] 56 | 57 | ax[0].scatter(x0[:, 0], x0[:, 1]) 58 | ax[0].set_xlabel('Topic 0', fontsize=16) 59 | ax[0].set_ylabel('Topic 1', fontsize=16) 60 | ax[0].set_title('comp.sys.mac.hardware', fontsize=16) 61 | 62 | ax[1].scatter(x1[:, 0], x1[:, 1]) 63 | ax[1].set_xlabel('Topic 0', fontsize=16) 64 | ax[1].set_title('rec.autos', fontsize=16) 65 | 66 | plt.show() 67 | 68 | -------------------------------------------------------------------------------- /Chapter07/nnmf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import load_digits 6 | from sklearn.decomposition import NMF 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load the dataset 15 | digits = load_digits() 16 | X = digits['data'] / np.max(digits['data']) 17 | 18 | # Perform a Non-negative matrix factorization 19 | nmf = NMF(n_components=50, alpha=2.0, l1_ratio=0.1, random_state=1000) 20 | nmf.fit(X) 21 | 22 | # Show the components 23 | sns.set() 24 | 25 | fig, ax = plt.subplots(5, 10, figsize=(22, 15)) 26 | 27 | for i in range(5): 28 | for j in range(10): 29 | ax[i, j].imshow(nmf.components_[(5 * j) + i].reshape((8, 8)), 
cmap='gray') 30 | ax[i, j].set_xticks([]) 31 | ax[i, j].set_yticks([]) 32 | 33 | plt.show() 34 | 35 | # Transform X[0] 36 | y = nmf.transform(X[0].reshape(1, -1)).squeeze() 37 | 38 | # Show the absolute magnitudes 39 | fig, ax = plt.subplots(figsize=(22, 10)) 40 | 41 | ax.bar(np.arange(1, 51, 1), np.abs(y)) 42 | ax.set_xticks(np.arange(1, 51, 1)) 43 | ax.set_xlabel('Component', fontsize=16) 44 | ax.set_ylabel('Coefficient (absolute values)', fontsize=16) 45 | 46 | plt.show() 47 | 48 | 49 | -------------------------------------------------------------------------------- /Chapter07/pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import load_digits 6 | from sklearn.decomposition import PCA 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load the dataset 15 | digits = load_digits() 16 | X = digits['data'] / np.max(digits['data']) 17 | 18 | # Compute the eigenvalues of C 19 | C = np.cov(X.T) 20 | l, v = np.linalg.eig(C) 21 | l = np.sort(l)[::-1] 22 | d = l[:l.shape[0] - 1] - l[1:] 23 | 24 | # Show the differences 25 | sns.set() 26 | 27 | fig, ax = plt.subplots(figsize=(22, 12)) 28 | 29 | ax.bar(np.arange(2, len(d) + 2, 1), d) 30 | ax.set_xticks(np.arange(2, len(d) + 2, 1)) 31 | ax.set_xlabel('Component', fontsize=16) 32 | ax.set_ylabel('Eigenvalue difference', fontsize=16) 33 | 34 | plt.show() 35 | 36 | # Perform the PCA 37 | pca = PCA(n_components=16, random_state=1000) 38 | digits_pca = pca.fit_transform(X) 39 | 40 | # Show some sample digits 41 | fig, ax = plt.subplots(2, 10, figsize=(22, 6)) 42 | 43 | for i in range(10): 44 | ax[0, i].imshow(X[i].reshape((8, 8)), cmap='gray') 45 | ax[0, i].set_xticks([]) 46 | ax[0, i].set_yticks([]) 47 | 48 | ax[1, i].imshow(pca.inverse_transform(digits_pca[i]).reshape((8, 8)), cmap='gray') 49 | ax[1, i].set_xticks([]) 50 | ax[1, i].set_yticks([]) 51 | 52 | plt.show() 53 | 54 | # Total explained variance (as ratio) 55 | print(np.sum(pca.explained_variance_ratio_)) 56 | 57 | # Show the explained variance ratio per component 58 | ev = pca.explained_variance_ratio_ 59 | 60 | fig, ax = plt.subplots(figsize=(10, 6)) 61 | 62 | ax.bar(np.arange(1, len(ev) + 1, 1), ev) 63 | ax.set_xticks(np.arange(1, len(ev) + 1, 1)) 64 | ax.set_xlabel('Components') 65 | ax.set_ylabel('Explained variance ratio') 66 | 67 | plt.show() 68 | 69 | -------------------------------------------------------------------------------- /Chapter07/sparse_pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import load_digits 6 | from sklearn.decomposition import SparsePCA 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load the dataset 15 | digits = load_digits() 16 | X = digits['data'] / np.max(digits['data']) 17 | 18 | # Perform a sparse PCA 19 | spca = SparsePCA(n_components=30, alpha=2.0, normalize_components=True, random_state=1000) 20 | spca.fit(X) 21 | 22 | # Show the components 23 | sns.set() 24 | 25 | fig, ax = plt.subplots(3, 10, figsize=(22, 8)) 26 | 27 | for i in range(3): 28 | for j in range(10): 29 | ax[i, j].imshow(spca.components_[(3 * j) + i].reshape((8, 8)), cmap='gray') 30 | ax[i, j].set_xticks([]) 31 | ax[i, j].set_yticks([]) 32 | 33 | plt.show() 34 | 35 | 
# Transform X[0] 36 | y = spca.transform(X[0].reshape(1, -1)).squeeze() 37 | 38 | # Show the absolute magnitudes 39 | fig, ax = plt.subplots(figsize=(22, 10)) 40 | 41 | ax.bar(np.arange(1, 31, 1), np.abs(y)) 42 | ax.set_xticks(np.arange(1, 31, 1)) 43 | ax.set_xlabel('Component', fontsize=16) 44 | ax.set_ylabel('Coefficient (absolute values)', fontsize=16) 45 | 46 | plt.show() 47 | 48 | 49 | -------------------------------------------------------------------------------- /Chapter07/whitening.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from sklearn.datasets import make_blobs 6 | 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | 12 | def zero_center(X): 13 | return X - np.mean(X, axis=0) 14 | 15 | 16 | def whiten(X, correct=True): 17 | Xc = zero_center(X) 18 | _, L, V = np.linalg.svd(Xc) 19 | W = np.dot(V.T, np.diag(1.0 / L)) 20 | return np.dot(Xc, W) * np.sqrt(X.shape[0]) if correct else 1.0 21 | 22 | 23 | if __name__ == '__main__': 24 | # Create the dataset 25 | X, _ = make_blobs(n_samples=300, centers=1, cluster_std=2.5, random_state=1000) 26 | 27 | print(np.cov(X.T)) 28 | 29 | Xw = whiten(X) 30 | 31 | # Show the plots 32 | sns.set() 33 | 34 | fig, ax = plt.subplots(1, 2, figsize=(22, 8)) 35 | 36 | ax[0].scatter(X[:, 0], X[:, 1]) 37 | ax[0].set_xlim([-10, 10]) 38 | ax[0].set_ylim([-16, 16]) 39 | ax[0].set_xlabel(r'$x_1$', fontsize=16) 40 | ax[0].set_ylabel(r'$x_2$', fontsize=16) 41 | ax[0].set_title('Original dataset', fontsize=16) 42 | 43 | ax[1].scatter(Xw[:, 0], Xw[:, 1]) 44 | ax[1].set_xlim([-10, 10]) 45 | ax[1].set_ylim([-16, 16]) 46 | ax[1].set_xlabel(r'$x_1$', fontsize=16) 47 | ax[1].set_ylabel(r'$x_2$', fontsize=16) 48 | ax[1].set_title('Whitened dataset', fontsize=16) 49 | 50 | plt.show() 51 | 52 | -------------------------------------------------------------------------------- /Chapter08/deep_convolutional_autoencoder.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | import tensorflow as tf 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_epochs = 600 14 | batch_size = 50 15 | code_length = 256 16 | width = 32 17 | height = 32 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load the dataset 22 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 23 | X_train = faces['images'] 24 | 25 | # Create graph 26 | graph = tf.Graph() 27 | 28 | with graph.as_default(): 29 | input_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) 30 | input_images = tf.image.resize_images(input_images_xl, (width, height), 31 | method=tf.image.ResizeMethod.BICUBIC) 32 | 33 | # Encoder 34 | conv_0 = tf.layers.conv2d(inputs=input_images, 35 | filters=16, 36 | kernel_size=(3, 3), 37 | strides=(2, 2), 38 | activation=tf.nn.relu, 39 | padding='same') 40 | 41 | conv_1 = tf.layers.conv2d(inputs=conv_0, 42 | filters=32, 43 | kernel_size=(3, 3), 44 | activation=tf.nn.relu, 45 | padding='same') 46 | 47 | conv_2 = tf.layers.conv2d(inputs=conv_1, 48 | filters=64, 49 | kernel_size=(3, 3), 50 | activation=tf.nn.relu, 51 | padding='same') 52 | 53 | conv_3 = tf.layers.conv2d(inputs=conv_2, 54 | filters=128, 55 | kernel_size=(3, 3), 56 | activation=tf.nn.relu, 57 | padding='same') 58 | 59 | # Code layer 
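        # (Added explanatory comments) conv_3 produces 16x16x128 feature maps: the 32x32 resized
        # input is halved once by the strided first convolution and kept at 16x16 by the remaining
        # 'same'-padded layers. The maps are flattened and projected by a dense sigmoid layer onto
        # a code of code_length (256) values; code_mean is the per-sample average code activation
        # and is reported only as a monitoring metric in the training loop, it does not enter the loss.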
60 | code_input = tf.layers.flatten(inputs=conv_3) 61 | 62 | code_layer = tf.layers.dense(inputs=code_input, 63 | units=code_length, 64 | activation=tf.nn.sigmoid) 65 | 66 | code_mean = tf.reduce_mean(code_layer, axis=1) 67 | 68 | # Decoder 69 | decoder_input = tf.reshape(code_layer, (-1, int(width / 2), int(height / 2), 1)) 70 | 71 | convt_0 = tf.layers.conv2d_transpose(inputs=decoder_input, 72 | filters=128, 73 | kernel_size=(3, 3), 74 | strides=(2, 2), 75 | activation=tf.nn.relu, 76 | padding='same') 77 | 78 | convt_1 = tf.layers.conv2d_transpose(inputs=convt_0, 79 | filters=64, 80 | kernel_size=(3, 3), 81 | activation=tf.nn.relu, 82 | padding='same') 83 | 84 | convt_2 = tf.layers.conv2d_transpose(inputs=convt_1, 85 | filters=32, 86 | kernel_size=(3, 3), 87 | activation=tf.nn.relu, 88 | padding='same') 89 | 90 | convt_3 = tf.layers.conv2d_transpose(inputs=convt_2, 91 | filters=1, 92 | kernel_size=(3, 3), 93 | activation=tf.sigmoid, 94 | padding='same') 95 | 96 | output_images = tf.image.resize_images(convt_3, (X_train.shape[1], X_train.shape[2]), 97 | method=tf.image.ResizeMethod.BICUBIC) 98 | 99 | # Loss 100 | loss = tf.nn.l2_loss(convt_3 - input_images) 101 | 102 | # Training step 103 | training_step = tf.train.AdamOptimizer(0.001).minimize(loss) 104 | 105 | # Train the model 106 | session = tf.InteractiveSession(graph=graph) 107 | tf.global_variables_initializer().run() 108 | 109 | for e in range(nb_epochs): 110 | np.random.shuffle(X_train) 111 | 112 | total_loss = 0.0 113 | code_means = [] 114 | 115 | for i in range(0, X_train.shape[0] - batch_size, batch_size): 116 | X = np.expand_dims(X_train[i:i + batch_size, :, :], axis=3).astype(np.float32) 117 | 118 | _, n_loss, c_mean = session.run([training_step, loss, code_mean], 119 | feed_dict={ 120 | input_images_xl: X 121 | }) 122 | total_loss += n_loss 123 | code_means.append(c_mean) 124 | 125 | print('Epoch {}) Average loss per sample: {} (Code mean: {})'. 
126 | format(e + 1, total_loss / float(X_train.shape[0]), np.mean(code_means))) 127 | 128 | # Show some examples 129 | Xs = np.reshape(X_train[0:batch_size], (batch_size, X_train.shape[1], X_train.shape[2], 1)) 130 | 131 | Ys = session.run([output_images], 132 | feed_dict={ 133 | input_images_xl: Xs 134 | }) 135 | 136 | Ys = np.squeeze(Ys[0] * 255.0) 137 | 138 | fig, ax = plt.subplots(3, 10, figsize=(22, 8)) 139 | sns.set() 140 | 141 | for i in range(10): 142 | ax[0, i].imshow(Ys[i], cmap='gray') 143 | ax[0, i].set_xticks([]) 144 | ax[0, i].set_yticks([]) 145 | 146 | ax[1, i].imshow(Ys[i + 10], cmap='gray') 147 | ax[1, i].set_xticks([]) 148 | ax[1, i].set_yticks([]) 149 | 150 | ax[2, i].imshow(Ys[i + 20], cmap='gray') 151 | ax[2, i].set_xticks([]) 152 | ax[2, i].set_yticks([]) 153 | 154 | plt.show() 155 | 156 | session.close() -------------------------------------------------------------------------------- /Chapter08/denoising_autoencoder.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | import tensorflow as tf 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_epochs = 600 14 | batch_size = 50 15 | code_length = 256 16 | width = 32 17 | height = 32 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load the dataset 22 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 23 | X_train = faces['images'] 24 | 25 | # Create graph 26 | graph = tf.Graph() 27 | 28 | with graph.as_default(): 29 | input_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) 30 | input_noisy_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) 31 | 32 | input_images = tf.image.resize_images(input_images_xl, (width, height), method=tf.image.ResizeMethod.BICUBIC) 33 | input_noisy_images = tf.image.resize_images(input_noisy_images_xl, (width, height), 34 | method=tf.image.ResizeMethod.BICUBIC) 35 | 36 | # Encoder 37 | conv_0 = tf.layers.conv2d(inputs=input_noisy_images, 38 | filters=16, 39 | kernel_size=(3, 3), 40 | strides=(2, 2), 41 | activation=tf.nn.relu, 42 | padding='same') 43 | 44 | conv_1 = tf.layers.conv2d(inputs=conv_0, 45 | filters=32, 46 | kernel_size=(3, 3), 47 | activation=tf.nn.relu, 48 | padding='same') 49 | 50 | conv_2 = tf.layers.conv2d(inputs=conv_1, 51 | filters=64, 52 | kernel_size=(3, 3), 53 | activation=tf.nn.relu, 54 | padding='same') 55 | 56 | conv_3 = tf.layers.conv2d(inputs=conv_2, 57 | filters=128, 58 | kernel_size=(3, 3), 59 | activation=tf.nn.relu, 60 | padding='same') 61 | 62 | # Code layer 63 | code_input = tf.layers.flatten(inputs=conv_3) 64 | 65 | code_layer = tf.layers.dense(inputs=code_input, 66 | units=code_length, 67 | activation=tf.nn.sigmoid) 68 | 69 | code_mean = tf.reduce_mean(code_layer, axis=1) 70 | 71 | # Decoder 72 | decoder_input = tf.reshape(code_layer, (-1, int(width / 2), int(height / 2), 1)) 73 | 74 | convt_0 = tf.layers.conv2d_transpose(inputs=decoder_input, 75 | filters=128, 76 | kernel_size=(3, 3), 77 | strides=(2, 2), 78 | activation=tf.nn.relu, 79 | padding='same') 80 | 81 | convt_1 = tf.layers.conv2d_transpose(inputs=convt_0, 82 | filters=64, 83 | kernel_size=(3, 3), 84 | activation=tf.nn.relu, 85 | padding='same') 86 | 87 | convt_2 = tf.layers.conv2d_transpose(inputs=convt_1, 88 | filters=32, 89 | kernel_size=(3, 3), 90 | activation=tf.nn.relu, 91 | 
padding='same') 92 | 93 | convt_3 = tf.layers.conv2d_transpose(inputs=convt_2, 94 | filters=1, 95 | kernel_size=(3, 3), 96 | activation=tf.sigmoid, 97 | padding='same') 98 | 99 | output_images = tf.image.resize_images(convt_3, (X_train.shape[1], X_train.shape[2]), 100 | method=tf.image.ResizeMethod.BICUBIC) 101 | 102 | # Loss 103 | loss = tf.nn.l2_loss(convt_3 - input_images) 104 | 105 | # Training step 106 | training_step = tf.train.AdamOptimizer(0.001).minimize(loss) 107 | 108 | # Train the model 109 | session = tf.InteractiveSession(graph=graph) 110 | tf.global_variables_initializer().run() 111 | 112 | for e in range(nb_epochs): 113 | np.random.shuffle(X_train) 114 | 115 | total_loss = 0.0 116 | code_means = [] 117 | 118 | for i in range(0, X_train.shape[0] - batch_size, batch_size): 119 | X = np.expand_dims(X_train[i:i + batch_size, :, :], axis=3).astype(np.float32) 120 | Xn = np.clip(X + np.random.normal(0.0, 0.2, size=(batch_size, X_train.shape[1], X_train.shape[2], 1)), 0.0, 121 | 1.0) 122 | 123 | _, n_loss, c_mean = session.run([training_step, loss, code_mean], 124 | feed_dict={ 125 | input_images_xl: X, 126 | input_noisy_images_xl: Xn 127 | }) 128 | total_loss += n_loss 129 | code_means.append(c_mean) 130 | 131 | print('Epoch {}) Average loss per sample: {} (Code mean: {})'. 132 | format(e + 1, total_loss / float(X_train.shape[0]), np.mean(code_means))) 133 | 134 | # Show some examples 135 | Xs = np.reshape(X_train[0:10], (10, X_train.shape[1], X_train.shape[2], 1)) 136 | Xn = np.clip(Xs + np.random.normal(0.0, 0.2, size=(10, X_train.shape[1], X_train.shape[2], 1)), 0.0, 1.0) 137 | 138 | Ys = session.run([output_images], 139 | feed_dict={ 140 | input_noisy_images_xl: Xn 141 | }) 142 | 143 | Ys = np.squeeze(Ys[0] * 255.0) 144 | 145 | fig, ax = plt.subplots(2, 10, figsize=(22, 6)) 146 | sns.set() 147 | 148 | for i in range(10): 149 | ax[0, i].imshow(np.squeeze(Xn[i]), cmap='gray') 150 | ax[0, i].set_xticks([]) 151 | ax[0, i].set_yticks([]) 152 | 153 | ax[1, i].imshow(Ys[i], cmap='gray') 154 | ax[1, i].set_xticks([]) 155 | ax[1, i].set_yticks([]) 156 | 157 | plt.show() 158 | 159 | session.close() -------------------------------------------------------------------------------- /Chapter08/rubner-tavan-network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import make_blobs 4 | 5 | # Set random seed for reproducibility 6 | np.random.seed(1000) 7 | 8 | 9 | n_components = 2 10 | learning_rate = 0.0001 11 | max_iterations = 1000 12 | stabilization_cycles = 5 13 | threshold = 0.00001 14 | 15 | 16 | def zero_center(Xd): 17 | return Xd - np.mean(Xd, axis=0) 18 | 19 | 20 | if __name__ == '__main__': 21 | # Create the dataset 22 | X, _ = make_blobs(n_samples=500, centers=3, cluster_std=[5.0, 1.0, 2.5], random_state=1000) 23 | Xs = zero_center(X) 24 | 25 | Q = np.cov(Xs.T) 26 | eigu, eigv = np.linalg.eig(Q) 27 | 28 | print('Eigenvalues: {}'.format(eigu)) 29 | print('Eigenvectors: {}'.format(eigv.T)) 30 | 31 | # Initialize the variables 32 | W = np.random.normal(0.0, 0.5, size=(Xs.shape[1], n_components)) 33 | V = np.tril(np.random.normal(0.0, 0.01, size=(n_components, n_components))) 34 | np.fill_diagonal(V, 0.0) 35 | 36 | prev_W = np.zeros((Xs.shape[1], n_components)) 37 | t = 0 38 | 39 | # Perform the training cycle 40 | while np.linalg.norm(W - prev_W, ord='fro') > threshold and t < max_iterations: 41 | prev_W = W.copy() 42 | t += 1 43 | 44 | for i in range(Xs.shape[0]): 45 | y_p = 
np.zeros((n_components, 1)) 46 | xi = np.expand_dims(Xs[i], 1) 47 | y = None 48 | 49 | for _ in range(stabilization_cycles): 50 | y = np.dot(W.T, xi) + np.dot(V, y_p) 51 | y_p = y.copy() 52 | 53 | dW = np.zeros((Xs.shape[1], n_components)) 54 | dV = np.zeros((n_components, n_components)) 55 | 56 | for t in range(n_components): 57 | y2 = np.power(y[t], 2) 58 | dW[:, t] = np.squeeze((y[t] * xi) + (y2 * np.expand_dims(W[:, t], 1))) 59 | dV[t, :] = -np.squeeze((y[t] * y) + (y2 * np.expand_dims(V[t, :], 1))) 60 | 61 | W += (learning_rate * dW) 62 | V += (learning_rate * dV) 63 | 64 | V = np.tril(V) 65 | np.fill_diagonal(V, 0.0) 66 | 67 | W /= np.linalg.norm(W, axis=0).reshape((1, n_components)) 68 | 69 | print('Final weights: {}'.format(W)) 70 | 71 | # Compute the covariance matrix 72 | Y_comp = np.zeros((Xs.shape[0], n_components)) 73 | 74 | for i in range(Xs.shape[0]): 75 | y_p = np.zeros((n_components, 1)) 76 | xi = np.expand_dims(Xs[i], 1) 77 | 78 | for _ in range(stabilization_cycles): 79 | Y_comp[i] = np.squeeze(np.dot(W.T, xi) + np.dot(V.T, y_p)) 80 | y_p = y.copy() 81 | 82 | print('Final covariance matrix: {}'.format(np.cov(Y_comp.T))) 83 | -------------------------------------------------------------------------------- /Chapter08/sanger_network.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_blobs 6 | 7 | # Set random seed for reproducibility 8 | np.random.seed(1000) 9 | 10 | 11 | n_components = 2 12 | learning_rate = 0.01 13 | nb_iterations = 5000 14 | t = 0.0 15 | 16 | 17 | def zero_center(Xd): 18 | return Xd - np.mean(Xd, axis=0) 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create the dataset 23 | X, _ = make_blobs(n_samples=500, centers=3, cluster_std=[5.0, 1.0, 2.5], random_state=1000) 24 | Xs = zero_center(X) 25 | 26 | Q = np.cov(Xs.T) 27 | eigu, eigv = np.linalg.eig(Q) 28 | 29 | print('Covariance matrix: {}'.format(Q)) 30 | print('Eigenvalues: {}'.format(eigu)) 31 | print('Eigenvectors: {}'.format(eigv.T)) 32 | 33 | # Initialize the weights 34 | W_sanger = np.random.normal(scale=0.5, size=(n_components, Xs.shape[1])) 35 | W_sanger /= np.linalg.norm(W_sanger, axis=1).reshape((n_components, 1)) 36 | 37 | # Show the initial configuration 38 | sns.set() 39 | fig, ax = plt.subplots(figsize=(10, 10)) 40 | 41 | ax.scatter(Xs[:, 0], Xs[:, 1]) 42 | ax.set_xlabel(r'$x_0$', fontsize=16) 43 | ax.set_ylabel(r'$x_1$', fontsize=16) 44 | W = W_sanger * 10.0 45 | 46 | ax.arrow(0, 0, W[0, 0], W[0, 1], head_width=1.0, head_length=0.5, fc='k', ec='k') 47 | ax.annotate(r'$w_0$', xy=(1.0, 1.0), xycoords='data', xytext=(W[0, 0] + 0.5, W[0, 1] + 0.5), textcoords='data', 48 | size=20) 49 | 50 | ax.arrow(0, 0, W[1, 0], W[1, 1], head_width=1.0, head_length=0.5, fc='k', ec='k') 51 | ax.annotate(r'$w_1$', xy=(1.0, 1.0), xycoords='data', xytext=(W[1, 0] + 0.5, W[1, 1] + 1.0), textcoords='data', 52 | size=20) 53 | 54 | ax.grid() 55 | plt.show() 56 | 57 | # Perform the training cycle 58 | for i in range(nb_iterations): 59 | dw = np.zeros((n_components, Xs.shape[1])) 60 | t += 1.0 61 | 62 | for j in range(Xs.shape[0]): 63 | Ysj = np.dot(W_sanger, Xs[j]).reshape((n_components, 1)) 64 | QYd = np.tril(np.dot(Ysj, Ysj.T)) 65 | dw += np.dot(Ysj, Xs[j].reshape((1, X.shape[1]))) - np.dot(QYd, W_sanger) 66 | 67 | W_sanger += (learning_rate / t) * dw 68 | W_sanger /= np.linalg.norm(W_sanger, axis=1).reshape((n_components, 1)) 69 | 70 | print('Final 
weights: {}'.format(W_sanger)) 71 | print('Final covariance matrix: {}'.format(np.cov(np.dot(Xs, W_sanger.T).T))) 72 | 73 | # Plot the final configuration 74 | fig, ax = plt.subplots(figsize=(10, 10)) 75 | 76 | ax.scatter(Xs[:, 0], Xs[:, 1]) 77 | ax.set_xlabel(r'$x_0$', fontsize=16) 78 | ax.set_ylabel(r'$x_1$', fontsize=16) 79 | 80 | # Compute a normalization factor for the length of the components 81 | nf = np.sort(np.expand_dims(eigu / np.linalg.norm(eigu), axis=1))[::-1] 82 | W = W_sanger * (15 * nf) 83 | 84 | ax.arrow(0, 0, W[0, 0], W[0, 1], head_width=1.0, head_length=0.5, fc='k', ec='k') 85 | ax.annotate(r'$w_0$', xy=(1.0, 1.0), xycoords='data', xytext=(W[0, 0] + 1.0, W[0, 1]), textcoords='data', 86 | size=20) 87 | 88 | ax.arrow(0, 0, W[1, 0], W[1, 1], head_width=1.0, head_length=0.5, fc='k', ec='k') 89 | ax.annotate(r'$w_1$', xy=(1.0, 1.0), xycoords='data', xytext=(W[1, 0] + 0.5, W[1, 1] + 1.0), textcoords='data', 90 | size=20) 91 | 92 | ax.grid() 93 | plt.show() 94 | -------------------------------------------------------------------------------- /Chapter08/sparse_autoencoder.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | import tensorflow as tf 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_epochs = 600 14 | batch_size = 50 15 | code_length = 256 16 | width = 32 17 | height = 32 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load the dataset 22 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 23 | X_train = faces['images'] 24 | 25 | # Create graph 26 | graph = tf.Graph() 27 | 28 | with graph.as_default(): 29 | input_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) 30 | input_noisy_images_xl = tf.placeholder(tf.float32, shape=(None, X_train.shape[1], X_train.shape[2], 1)) 31 | 32 | input_images = tf.image.resize_images(input_images_xl, (width, height), method=tf.image.ResizeMethod.BICUBIC) 33 | input_noisy_images = tf.image.resize_images(input_noisy_images_xl, (width, height), 34 | method=tf.image.ResizeMethod.BICUBIC) 35 | 36 | # Encoder 37 | conv_0 = tf.layers.conv2d(inputs=input_noisy_images, 38 | filters=16, 39 | kernel_size=(3, 3), 40 | strides=(2, 2), 41 | activation=tf.nn.relu, 42 | padding='same') 43 | 44 | conv_1 = tf.layers.conv2d(inputs=conv_0, 45 | filters=32, 46 | kernel_size=(3, 3), 47 | activation=tf.nn.relu, 48 | padding='same') 49 | 50 | conv_2 = tf.layers.conv2d(inputs=conv_1, 51 | filters=64, 52 | kernel_size=(3, 3), 53 | activation=tf.nn.relu, 54 | padding='same') 55 | 56 | conv_3 = tf.layers.conv2d(inputs=conv_2, 57 | filters=128, 58 | kernel_size=(3, 3), 59 | activation=tf.nn.relu, 60 | padding='same') 61 | 62 | # Code layer 63 | code_input = tf.layers.flatten(inputs=conv_3) 64 | 65 | code_layer = tf.layers.dense(inputs=code_input, 66 | units=code_length, 67 | activation=tf.nn.sigmoid) 68 | 69 | code_mean = tf.reduce_mean(code_layer, axis=1) 70 | 71 | # Decoder 72 | decoder_input = tf.reshape(code_layer, (-1, int(width / 2), int(height / 2), 1)) 73 | 74 | convt_0 = tf.layers.conv2d_transpose(inputs=decoder_input, 75 | filters=128, 76 | kernel_size=(3, 3), 77 | strides=(2, 2), 78 | activation=tf.nn.relu, 79 | padding='same') 80 | 81 | convt_1 = tf.layers.conv2d_transpose(inputs=convt_0, 82 | filters=64, 83 | kernel_size=(3, 3), 84 | activation=tf.nn.relu, 85 | 
padding='same') 86 | 87 | convt_2 = tf.layers.conv2d_transpose(inputs=convt_1, 88 | filters=32, 89 | kernel_size=(3, 3), 90 | activation=tf.nn.relu, 91 | padding='same') 92 | 93 | convt_3 = tf.layers.conv2d_transpose(inputs=convt_2, 94 | filters=1, 95 | kernel_size=(3, 3), 96 | activation=tf.sigmoid, 97 | padding='same') 98 | 99 | output_images = tf.image.resize_images(convt_3, (X_train.shape[1], X_train.shape[2]), 100 | method=tf.image.ResizeMethod.BICUBIC) 101 | 102 | # Loss 103 | sparsity_constraint = 0.01 * tf.reduce_sum(tf.norm(code_layer, ord=1, axis=1)) 104 | loss = tf.nn.l2_loss(convt_3 - input_images) + sparsity_constraint 105 | 106 | # Training step 107 | training_step = tf.train.AdamOptimizer(0.001).minimize(loss) 108 | 109 | # Train the model 110 | session = tf.InteractiveSession(graph=graph) 111 | tf.global_variables_initializer().run() 112 | 113 | for e in range(nb_epochs): 114 | np.random.shuffle(X_train) 115 | 116 | total_loss = 0.0 117 | code_means = [] 118 | 119 | for i in range(0, X_train.shape[0] - batch_size, batch_size): 120 | X = np.expand_dims(X_train[i:i + batch_size, :, :], axis=3).astype(np.float32) 121 | Xn = np.clip(X + np.random.normal(0.0, 0.2, size=(batch_size, X_train.shape[1], X_train.shape[2], 1)), 0.0, 122 | 1.0) 123 | 124 | _, n_loss, c_mean = session.run([training_step, loss, code_mean], 125 | feed_dict={ 126 | input_images_xl: X, 127 | input_noisy_images_xl: Xn 128 | }) 129 | total_loss += n_loss 130 | code_means.append(c_mean) 131 | 132 | print('Epoch {}) Average loss per sample: {} (Code mean: {})'. 133 | format(e + 1, total_loss / float(X_train.shape[0]), np.mean(code_means))) 134 | 135 | # Show some examples 136 | Xs = np.reshape(X_train[0:10], (10, X_train.shape[1], X_train.shape[2], 1)) 137 | Xn = np.clip(Xs + np.random.normal(0.0, 0.2, size=(10, X_train.shape[1], X_train.shape[2], 1)), 0.0, 1.0) 138 | 139 | Ys = session.run([output_images], 140 | feed_dict={ 141 | input_noisy_images_xl: Xn 142 | }) 143 | 144 | Ys = np.squeeze(Ys[0] * 255.0) 145 | 146 | fig, ax = plt.subplots(2, 10, figsize=(22, 6)) 147 | sns.set() 148 | 149 | for i in range(10): 150 | ax[0, i].imshow(np.squeeze(Xn[i]), cmap='gray') 151 | ax[0, i].set_xticks([]) 152 | ax[0, i].set_yticks([]) 153 | 154 | ax[1, i].imshow(Ys[i], cmap='gray') 155 | ax[1, i].set_xticks([]) 156 | ax[1, i].set_yticks([]) 157 | 158 | plt.show() 159 | 160 | session.close() -------------------------------------------------------------------------------- /Chapter08/unsupervised_dbn.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | 5 | # To install the DBN package: pip install git+git://github.com/albertbup/deep-belief-network.git 6 | # Further information: https://github.com/albertbup/deep-belief-network 7 | from dbn import UnsupervisedDBN 8 | 9 | from sklearn.datasets import load_digits 10 | from sklearn.manifold import TSNE 11 | from sklearn.utils import shuffle 12 | 13 | 14 | # Set random seed for reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | nb_samples = 500 19 | 20 | 21 | if __name__ == '__main__': 22 | # Load the dataset 23 | digits = load_digits() 24 | X_train = digits['data'] / np.max(digits['data']) 25 | Y_train = digits['target'] 26 | 27 | X_train, Y_train = shuffle(X_train, Y_train, random_state=1000) 28 | X_train = X_train[0:nb_samples] 29 | Y_train = Y_train[0:nb_samples] 30 | 31 | # Train the unsupervised DBN 32 | unsupervised_dbn = 
UnsupervisedDBN(hidden_layers_structure=[32, 32, 16], 33 | learning_rate_rbm=0.025, 34 | n_epochs_rbm=500, 35 | batch_size=16, 36 | activation_function='sigmoid') 37 | 38 | X_dbn = unsupervised_dbn.fit_transform(X_train) 39 | 40 | # Perform t-SNE 41 | tsne = TSNE(n_components=2, perplexity=10, random_state=1000) 42 | X_tsne = tsne.fit_transform(X_dbn) 43 | 44 | # Show the result 45 | fig, ax = plt.subplots(figsize=(22, 14)) 46 | sns.set() 47 | 48 | markers = ['o', 'd', 'x', '^', 'v', '<', '>', 'P', 's', 'p'] 49 | 50 | for i in range(10): 51 | ax.scatter(X_tsne[Y_train == i, 0], X_tsne[Y_train == i, 1], marker=markers[i], s=150, 52 | label='Class {}'.format(i + 1)) 53 | 54 | ax.set_xlabel(r'$x_0$', fontsize=16) 55 | ax.set_ylabel(r'$x_1$', fontsize=16) 56 | ax.grid(True) 57 | ax.legend(fontsize=16) 58 | 59 | plt.show() -------------------------------------------------------------------------------- /Chapter08/variational_autoencoder.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import seaborn as sns 4 | import tensorflow as tf 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_epochs = 800 14 | batch_size = 100 15 | code_length = 512 16 | width = 32 17 | height = 32 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load the dataset 22 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 23 | X_train = faces['images'] 24 | 25 | # Create graph 26 | graph = tf.Graph() 27 | 28 | with graph.as_default(): 29 | input_images_xl = tf.placeholder(tf.float32, shape=(batch_size, X_train.shape[1], X_train.shape[2], 1)) 30 | input_images = tf.image.resize_images(input_images_xl, (width, height), method=tf.image.ResizeMethod.BICUBIC) 31 | 32 | # Encoder 33 | conv_0 = tf.layers.conv2d(inputs=input_images, 34 | filters=16, 35 | kernel_size=(3, 3), 36 | strides=(2, 2), 37 | activation=tf.nn.relu, 38 | padding='same') 39 | 40 | conv_1 = tf.layers.conv2d(inputs=conv_0, 41 | filters=32, 42 | kernel_size=(3, 3), 43 | activation=tf.nn.relu, 44 | padding='same') 45 | 46 | conv_2 = tf.layers.conv2d(inputs=conv_1, 47 | filters=64, 48 | kernel_size=(3, 3), 49 | activation=tf.nn.relu, 50 | padding='same') 51 | 52 | conv_3 = tf.layers.conv2d(inputs=conv_2, 53 | filters=128, 54 | kernel_size=(3, 3), 55 | activation=tf.nn.relu, 56 | padding='same') 57 | 58 | # Code layer 59 | code_input = tf.layers.flatten(inputs=conv_3) 60 | 61 | code_mean = tf.layers.dense(inputs=code_input, 62 | units=width * height) 63 | 64 | code_log_variance = tf.layers.dense(inputs=code_input, 65 | units=width * height) 66 | 67 | code_std = tf.sqrt(tf.exp(code_log_variance)) 68 | 69 | # Normal samples 70 | normal_samples = tf.random_normal(mean=0.0, stddev=1.0, shape=(batch_size, width * height)) 71 | 72 | # Sampled code 73 | sampled_code = (normal_samples * code_std) + code_mean 74 | 75 | # Decoder 76 | decoder_input = tf.reshape(sampled_code, (-1, int(width / 4), int(height / 4), 16)) 77 | 78 | convt_0 = tf.layers.conv2d_transpose(inputs=decoder_input, 79 | filters=128, 80 | kernel_size=(3, 3), 81 | strides=(2, 2), 82 | activation=tf.nn.relu, 83 | padding='same') 84 | 85 | convt_1 = tf.layers.conv2d_transpose(inputs=convt_0, 86 | filters=128, 87 | kernel_size=(3, 3), 88 | strides=(2, 2), 89 | activation=tf.nn.relu, 90 | padding='same') 91 | 92 | convt_2 = tf.layers.conv2d_transpose(inputs=convt_1, 93 | filters=32, 94 | kernel_size=(3, 
3), 95 | activation=tf.nn.relu, 96 | padding='same') 97 | 98 | convt_3 = tf.layers.conv2d_transpose(inputs=convt_2, 99 | filters=1, 100 | kernel_size=(3, 3), 101 | padding='same') 102 | 103 | convt_output = tf.nn.sigmoid(convt_3) 104 | 105 | output_images = tf.image.resize_images(convt_output, (X_train.shape[1], X_train.shape[2]), 106 | method=tf.image.ResizeMethod.BICUBIC) 107 | 108 | # Loss 109 | reconstruction = tf.nn.sigmoid_cross_entropy_with_logits(logits=convt_3, labels=input_images) 110 | kl_divergence = 0.5 * tf.reduce_sum( 111 | tf.square(code_mean) + tf.square(code_std) - tf.log(1e-8 + tf.square(code_std)) - 1, axis=1) 112 | 113 | loss = tf.reduce_sum(tf.reduce_sum(reconstruction) + kl_divergence) 114 | 115 | # Training step 116 | training_step = tf.train.AdamOptimizer(0.001).minimize(loss) 117 | 118 | # Train the model 119 | session = tf.InteractiveSession(graph=graph) 120 | tf.global_variables_initializer().run() 121 | 122 | for e in range(nb_epochs): 123 | np.random.shuffle(X_train) 124 | 125 | total_loss = 0.0 126 | 127 | for i in range(0, X_train.shape[0] - batch_size, batch_size): 128 | X = np.zeros((batch_size, 64, 64, 1), dtype=np.float32) 129 | X[:, :, :, 0] = X_train[i:i + batch_size, :, :] 130 | 131 | _, n_loss = session.run([training_step, loss], 132 | feed_dict={ 133 | input_images_xl: X 134 | }) 135 | total_loss += n_loss 136 | 137 | print('Epoch {}) Average loss per sample: {}'.format(e + 1, total_loss / float(batch_size))) 138 | 139 | # Show some examples 140 | Xs = np.reshape(X_train[0:batch_size], (batch_size, 64, 64, 1)) 141 | 142 | Ys = session.run([output_images], 143 | feed_dict={ 144 | input_images_xl: Xs 145 | }) 146 | 147 | Ys = np.squeeze(Ys[0] * 255.0) 148 | 149 | fig, ax = plt.subplots(3, 10, figsize=(22, 8)) 150 | sns.set() 151 | 152 | for i in range(10): 153 | ax[0, i].imshow(Ys[i], cmap='gray') 154 | ax[0, i].set_xticks([]) 155 | ax[0, i].set_yticks([]) 156 | 157 | ax[1, i].imshow(Ys[i + 10], cmap='gray') 158 | ax[1, i].set_xticks([]) 159 | ax[1, i].set_yticks([]) 160 | 161 | ax[2, i].imshow(Ys[i + 20], cmap='gray') 162 | ax[2, i].set_xticks([]) 163 | ax[2, i].set_yticks([]) 164 | 165 | plt.show() 166 | 167 | session.close() -------------------------------------------------------------------------------- /Chapter09/dcgan.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | tf.set_random_seed(1000) 12 | 13 | 14 | nb_samples = 400 15 | code_length = 512 16 | nb_epochs = 500 17 | batch_size = 50 18 | nb_iterations = int(nb_samples / batch_size) 19 | 20 | 21 | def generator(z, is_training=True): 22 | with tf.variable_scope('generator'): 23 | conv_0 = tf.layers.conv2d_transpose(inputs=z, 24 | filters=1024, 25 | kernel_size=(4, 4), 26 | padding='valid') 27 | 28 | b_conv_0 = tf.layers.batch_normalization(inputs=conv_0, training=is_training) 29 | 30 | conv_1 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_0), 31 | filters=512, 32 | kernel_size=(4, 4), 33 | strides=(2, 2), 34 | padding='same') 35 | 36 | b_conv_1 = tf.layers.batch_normalization(inputs=conv_1, training=is_training) 37 | 38 | conv_2 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_1), 39 | filters=256, 40 | kernel_size=(4, 4), 41 | strides=(2, 2), 42 | padding='same') 43 | 44 | b_conv_2 = 
tf.layers.batch_normalization(inputs=conv_2, training=is_training) 45 | 46 | conv_3 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_2), 47 | filters=128, 48 | kernel_size=(4, 4), 49 | strides=(2, 2), 50 | padding='same') 51 | 52 | b_conv_3 = tf.layers.batch_normalization(inputs=conv_3, training=is_training) 53 | 54 | conv_4 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_3), 55 | filters=1, 56 | kernel_size=(4, 4), 57 | strides=(2, 2), 58 | padding='same') 59 | 60 | return tf.nn.tanh(conv_4) 61 | 62 | 63 | def discriminator(x, is_training=True, reuse_variables=True): 64 | with tf.variable_scope('discriminator', reuse=reuse_variables): 65 | conv_0 = tf.layers.conv2d(inputs=x, 66 | filters=128, 67 | kernel_size=(4, 4), 68 | strides=(2, 2), 69 | padding='same') 70 | 71 | conv_1 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(conv_0), 72 | filters=256, 73 | kernel_size=(4, 4), 74 | strides=(2, 2), 75 | padding='same') 76 | 77 | b_conv_1 = tf.layers.batch_normalization(inputs=conv_1, training=is_training) 78 | 79 | conv_2 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(b_conv_1), 80 | filters=512, 81 | kernel_size=(4, 4), 82 | strides=(2, 2), 83 | padding='same') 84 | 85 | b_conv_2 = tf.layers.batch_normalization(inputs=conv_2, training=is_training) 86 | 87 | conv_3 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(b_conv_2), 88 | filters=1024, 89 | kernel_size=(4, 4), 90 | strides=(2, 2), 91 | padding='same') 92 | 93 | b_conv_3 = tf.layers.batch_normalization(inputs=conv_3, training=is_training) 94 | 95 | conv_4 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(b_conv_3), 96 | filters=1, 97 | kernel_size=(4, 4), 98 | padding='valid') 99 | 100 | return conv_4 101 | 102 | 103 | if __name__ == '__main__': 104 | # Load the dataset 105 | faces = fetch_olivetti_faces(shuffle=True, random_state=1000) 106 | X_train = faces['images'] 107 | X_train = (2.0 * X_train) - 1.0 108 | width = X_train.shape[1] 109 | height = X_train.shape[2] 110 | 111 | # Show some samples 112 | sns.set() 113 | 114 | fig, ax = plt.subplots(1, 10, figsize=(22, 12)) 115 | 116 | for i in range(10): 117 | ax[i].imshow(faces['images'][i], cmap='gray') 118 | ax[i].set_xticks([]) 119 | ax[i].set_yticks([]) 120 | 121 | plt.show() 122 | 123 | # Create the graph 124 | graph = tf.Graph() 125 | 126 | with graph.as_default(): 127 | input_x = tf.placeholder(tf.float32, shape=(None, width, height, 1)) 128 | input_z = tf.placeholder(tf.float32, shape=(None, code_length)) 129 | is_training = tf.placeholder(tf.bool) 130 | 131 | gen = generator(z=tf.reshape(input_z, (-1, 1, 1, code_length)), is_training=is_training) 132 | 133 | discr_1_l = discriminator(x=input_x, is_training=is_training, reuse_variables=False) 134 | discr_2_l = discriminator(x=gen, is_training=is_training, reuse_variables=True) 135 | 136 | loss_d_1 = tf.reduce_mean( 137 | tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(discr_1_l), logits=discr_1_l)) 138 | loss_d_2 = tf.reduce_mean( 139 | tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(discr_2_l), logits=discr_2_l)) 140 | loss_d = loss_d_1 + loss_d_2 141 | 142 | loss_g = tf.reduce_mean( 143 | tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(discr_2_l), logits=discr_2_l)) 144 | 145 | variables_g = [variable for variable in tf.trainable_variables() if variable.name.startswith('generator')] 146 | variables_d = [variable for variable in tf.trainable_variables() if variable.name.startswith('discriminator')] 147 | 148 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 149 | 
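            # tf.layers.batch_normalization registers its moving-mean/variance update ops
            # in the UPDATE_OPS collection; wrapping the optimizers in this control
            # dependency forces those updates to run at every training step, so the
            # batch-norm layers use correct statistics when is_training=False at inference.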
training_step_d = tf.train.AdamOptimizer(0.0001, beta1=0.5).minimize(loss=loss_d, var_list=variables_d) 150 | training_step_g = tf.train.AdamOptimizer(0.0005, beta1=0.5).minimize(loss=loss_g, var_list=variables_g) 151 | 152 | # Train the model 153 | session = tf.InteractiveSession(graph=graph) 154 | tf.global_variables_initializer().run() 155 | 156 | samples_range = np.arange(nb_samples) 157 | 158 | for e in range(nb_epochs): 159 | d_losses = [] 160 | g_losses = [] 161 | 162 | for i in range(nb_iterations): 163 | Xi = np.random.choice(samples_range, size=batch_size) 164 | X = np.expand_dims(X_train[Xi], axis=3) 165 | Z = np.random.uniform(-1.0, 1.0, size=(batch_size, code_length)).astype(np.float32) 166 | 167 | _, d_loss = session.run([training_step_d, loss_d], 168 | feed_dict={ 169 | input_x: X, 170 | input_z: Z, 171 | is_training: True 172 | }) 173 | d_losses.append(d_loss) 174 | 175 | Z = np.random.uniform(-1.0, 1.0, size=(batch_size, code_length)).astype(np.float32) 176 | 177 | _, g_loss = session.run([training_step_g, loss_g], 178 | feed_dict={ 179 | input_x: X, 180 | input_z: Z, 181 | is_training: True 182 | }) 183 | 184 | g_losses.append(g_loss) 185 | 186 | print('Epoch {}) Avg. discriminator loss: {} - Avg. generator loss: {}'.format(e + 1, 187 | np.mean(d_losses), 188 | np.mean(g_losses))) 189 | 190 | # Show some results 191 | Z = np.random.uniform(-1.0, 1.0, size=(20, code_length)).astype(np.float32) 192 | 193 | Ys = session.run([gen], 194 | feed_dict={ 195 | input_z: Z, 196 | is_training: False 197 | }) 198 | 199 | Ys = np.squeeze((Ys[0] + 1.0) * 0.5 * 255.0).astype(np.uint8) 200 | 201 | fig, ax = plt.subplots(2, 10, figsize=(22, 5)) 202 | 203 | for i in range(2): 204 | for j in range(10): 205 | ax[i, j].imshow(Ys[(i * 10) + j], cmap='gray') 206 | ax[i, j].set_xticks([]) 207 | ax[i, j].set_yticks([]) 208 | 209 | plt.show() 210 | 211 | session.close() 212 | -------------------------------------------------------------------------------- /Chapter09/som.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | 5 | from sklearn.datasets import fetch_olivetti_faces 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | # Set random seed for reproducibility 9 | np.random.seed(1000) 10 | 11 | 12 | nb_iterations = 1000 13 | nb_adj_iterations = 500 14 | pattern_length = 64 * 64 15 | pattern_width = pattern_height = 64 16 | eta0 = 1.0 17 | sigma0 = 3.0 18 | tau = 100.0 19 | matrix_side = 8 20 | 21 | W = np.random.normal(0, 0.1, size=(matrix_side, matrix_side, pattern_length)) 22 | precomputed_distances = np.zeros((matrix_side, matrix_side, matrix_side, matrix_side)) 23 | 24 | 25 | def winning_unit(xt): 26 | global W 27 | distances = np.linalg.norm(W - xt, ord=2, axis=2) 28 | max_activation_unit = np.argmax(distances) 29 | return int(np.floor(max_activation_unit / matrix_side)), max_activation_unit % matrix_side 30 | 31 | 32 | def eta(t): 33 | return eta0 * np.exp(-float(t) / tau) 34 | 35 | 36 | def sigma(t): 37 | return float(sigma0) * np.exp(-float(t) / tau) 38 | 39 | 40 | def distance_matrix(xt, yt, sigmat): 41 | global precomputed_distances 42 | dm = precomputed_distances[xt, yt, :, :] 43 | de = 2.0 * np.power(sigmat, 2) 44 | return np.exp(-dm / de) 45 | 46 | 47 | if __name__ == '__main__': 48 | # Load the dataset 49 | faces = fetch_olivetti_faces(shuffle=True) 50 | Xcomplete = faces['data'].astype(np.float64) / np.max(faces['data']) 51 | 
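    # Shuffle the normalized faces and keep only the first 100 images as the training subset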
np.random.shuffle(Xcomplete) 52 | X = Xcomplete[0:100] 53 | 54 | # Pre-compute distances 55 | for i in range(matrix_side): 56 | for j in range(matrix_side): 57 | for k in range(matrix_side): 58 | for t in range(matrix_side): 59 | precomputed_distances[i, j, k, t] = \ 60 | np.power(float(i) - float(k), 2) + np.power(float(j) - float(t), 2) 61 | 62 | # Perform training cycle 63 | sequence = np.arange(0, X.shape[0]) 64 | t = 0 65 | 66 | for e in range(nb_iterations): 67 | np.random.shuffle(sequence) 68 | t += 1 69 | 70 | if e < nb_adj_iterations: 71 | etat = eta(t) 72 | sigmat = sigma(t) 73 | else: 74 | etat = 0.2 75 | sigmat = 1.0 76 | 77 | for n in sequence: 78 | x_sample = X[n] 79 | 80 | xw, yw = winning_unit(x_sample) 81 | dm = distance_matrix(xw, yw, sigmat) 82 | 83 | dW = etat * np.expand_dims(dm, axis=2) * (x_sample - W) 84 | W += dW 85 | 86 | W /= np.linalg.norm(W, axis=2).reshape((matrix_side, matrix_side, 1)) 87 | 88 | if e > 0 and e % 100 == 0: 89 | print('Training step: {}'.format(t-1)) 90 | 91 | # Show the final W matrix 92 | sc = StandardScaler(with_std=False) 93 | Ws = sc.fit_transform(W.reshape((matrix_side * matrix_side, pattern_length))) 94 | 95 | matrix_w = np.zeros((matrix_side * pattern_height, matrix_side * pattern_width)) 96 | 97 | Ws = Ws.reshape((matrix_side, matrix_side, pattern_length)) 98 | 99 | for i in range(matrix_side): 100 | for j in range(matrix_side): 101 | matrix_w[i * pattern_height:i * pattern_height + pattern_height, 102 | j * pattern_height:j * pattern_height + pattern_width] = W[i, j].reshape((pattern_height, pattern_width)) * 255.0 103 | 104 | fig, ax = plt.subplots(figsize=(8, 8)) 105 | sns.set() 106 | 107 | ax.matshow(matrix_w.tolist(), cmap='gray') 108 | ax.set_xticks([]) 109 | ax.set_yticks([]) 110 | plt.show() -------------------------------------------------------------------------------- /Chapter09/wgan.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from keras.datasets import fashion_mnist 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | tf.set_random_seed(1000) 12 | 13 | 14 | nb_samples = 5000 15 | nb_epochs = 100 16 | nb_critic = 5 17 | batch_size = 64 18 | nb_iterations = int(nb_samples / batch_size) 19 | code_length = 100 20 | 21 | 22 | def generator(z, is_training=True): 23 | with tf.variable_scope('generator'): 24 | conv_0 = tf.layers.conv2d_transpose(inputs=z, 25 | filters=1024, 26 | kernel_size=(4, 4), 27 | padding='valid') 28 | 29 | b_conv_0 = tf.layers.batch_normalization(inputs=conv_0, training=is_training) 30 | 31 | conv_1 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_0), 32 | filters=512, 33 | kernel_size=(4, 4), 34 | strides=(2, 2), 35 | padding='same') 36 | 37 | b_conv_1 = tf.layers.batch_normalization(inputs=conv_1, training=is_training) 38 | 39 | conv_2 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_1), 40 | filters=256, 41 | kernel_size=(4, 4), 42 | strides=(2, 2), 43 | padding='same') 44 | 45 | b_conv_2 = tf.layers.batch_normalization(inputs=conv_2, training=is_training) 46 | 47 | conv_3 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_2), 48 | filters=128, 49 | kernel_size=(4, 4), 50 | strides=(2, 2), 51 | padding='same') 52 | 53 | b_conv_3 = tf.layers.batch_normalization(inputs=conv_3, training=is_training) 54 | 55 | conv_4 = tf.layers.conv2d_transpose(inputs=tf.nn.leaky_relu(b_conv_3), 56 | 
filters=1, 57 | kernel_size=(4, 4), 58 | strides=(2, 2), 59 | padding='same') 60 | 61 | return tf.nn.tanh(conv_4) 62 | 63 | 64 | def critic(x, is_training=True, reuse_variables=True): 65 | with tf.variable_scope('critic', reuse=reuse_variables): 66 | conv_0 = tf.layers.conv2d(inputs=x, 67 | filters=128, 68 | kernel_size=(4, 4), 69 | strides=(2, 2), 70 | padding='same') 71 | 72 | conv_1 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(conv_0), 73 | filters=256, 74 | kernel_size=(4, 4), 75 | strides=(2, 2), 76 | padding='same') 77 | 78 | b_conv_1 = tf.layers.batch_normalization(inputs=conv_1, training=is_training) 79 | 80 | conv_2 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(b_conv_1), 81 | filters=512, 82 | kernel_size=(4, 4), 83 | strides=(2, 2), 84 | padding='same') 85 | 86 | b_conv_2 = tf.layers.batch_normalization(inputs=conv_2, training=is_training) 87 | 88 | conv_3 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(b_conv_2), 89 | filters=1024, 90 | kernel_size=(4, 4), 91 | strides=(2, 2), 92 | padding='same') 93 | 94 | b_conv_3 = tf.layers.batch_normalization(inputs=conv_3, training=is_training) 95 | 96 | conv_4 = tf.layers.conv2d(inputs=tf.nn.leaky_relu(b_conv_3), 97 | filters=1, 98 | kernel_size=(4, 4), 99 | padding='valid') 100 | 101 | return conv_4 102 | 103 | 104 | if __name__ == '__main__': 105 | # Load the dataset 106 | (X_train, _), (_, _) = fashion_mnist.load_data() 107 | X_train = X_train.astype(np.float32)[0:nb_samples] / 255.0 108 | X_train = (2.0 * X_train) - 1.0 109 | 110 | width = X_train.shape[1] 111 | height = X_train.shape[2] 112 | 113 | # Show some samples 114 | sns.set() 115 | 116 | fig, ax = plt.subplots(1, 10, figsize=(22, 12)) 117 | 118 | for i in range(10): 119 | ax[i].imshow(X_train[i], cmap='gray') 120 | ax[i].set_xticks([]) 121 | ax[i].set_yticks([]) 122 | 123 | plt.show() 124 | 125 | # Create the graph 126 | graph = tf.Graph() 127 | 128 | with graph.as_default(): 129 | input_x = tf.placeholder(tf.float32, shape=(None, width, height, 1)) 130 | input_z = tf.placeholder(tf.float32, shape=(None, code_length)) 131 | is_training = tf.placeholder(tf.bool) 132 | 133 | gen = generator(z=tf.reshape(input_z, (-1, 1, 1, code_length)), is_training=is_training) 134 | 135 | r_input_x = tf.image.resize_images(images=input_x, size=(64, 64), 136 | method=tf.image.ResizeMethod.BICUBIC) 137 | 138 | crit_1_l = critic(x=r_input_x, is_training=is_training, reuse_variables=False) 139 | crit_2_l = critic(x=gen, is_training=is_training, reuse_variables=True) 140 | 141 | loss_c = tf.reduce_mean(crit_2_l - crit_1_l) 142 | loss_g = tf.reduce_mean(-crit_2_l) 143 | 144 | variables_g = [variable for variable in tf.trainable_variables() 145 | if variable.name.startswith('generator')] 146 | variables_c = [variable for variable in tf.trainable_variables() 147 | if variable.name.startswith('critic')] 148 | 149 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 150 | optimizer_c = tf.train.AdamOptimizer(0.00005, beta1=0.5, beta2=0.9).\ 151 | minimize(loss=loss_c, var_list=variables_c) 152 | 153 | with tf.control_dependencies([optimizer_c]): 154 | training_step_c = tf.tuple(tensors=[ 155 | tf.assign(variable, tf.clip_by_value(variable, -0.01, 0.01)) 156 | for variable in variables_c]) 157 | 158 | training_step_g = tf.train.AdamOptimizer(0.00005, beta1=0.5, beta2=0.9).\ 159 | minimize(loss=loss_g, var_list=variables_g) 160 | 161 | # Train the model 162 | session = tf.InteractiveSession(graph=graph) 163 | tf.global_variables_initializer().run() 164 | 165 | samples_range = 
np.arange(nb_samples) 166 | 167 | for e in range(nb_epochs): 168 | c_losses = [] 169 | g_losses = [] 170 | 171 | for i in range(nb_iterations): 172 | for j in range(nb_critic): 173 | Xi = np.random.choice(samples_range, size=batch_size) 174 | X = np.expand_dims(X_train[Xi], axis=3) 175 | Z = np.random.uniform(-1.0, 1.0, size=(batch_size, code_length)).astype(np.float32) 176 | 177 | _, c_loss = session.run([training_step_c, loss_c], 178 | feed_dict={ 179 | input_x: X, 180 | input_z: Z, 181 | is_training: True 182 | }) 183 | c_losses.append(c_loss) 184 | 185 | Z = np.random.uniform(-1.0, 1.0, size=(batch_size, code_length)).astype(np.float32) 186 | 187 | _, g_loss = session.run([training_step_g, loss_g], 188 | feed_dict={ 189 | input_x: np.zeros(shape=(batch_size, width, height, 1)), 190 | input_z: Z, 191 | is_training: True 192 | }) 193 | 194 | g_losses.append(g_loss) 195 | 196 | print('Epoch {}) Avg. critic loss: {} - Avg. generator loss: {}'.format(e + 1, np.mean(c_losses), 197 | np.mean(g_losses))) 198 | 199 | # Show some results 200 | Z = np.random.uniform(-1.0, 1.0, size=(30, code_length)).astype(np.float32) 201 | 202 | Ys = session.run([gen], 203 | feed_dict={ 204 | input_z: Z, 205 | is_training: False 206 | }) 207 | 208 | Ys = np.squeeze((Ys[0] + 1.0) * 0.5 * 255.0).astype(np.uint8) 209 | 210 | fig, ax = plt.subplots(3, 10, figsize=(15, 8)) 211 | 212 | for i in range(3): 213 | for j in range(10): 214 | ax[i, j].imshow(Ys[(i * 10) + j], cmap='gray') 215 | ax[i, j].set_xticks([]) 216 | ax[i, j].set_yticks([]) 217 | 218 | plt.show() 219 | 220 | session.close() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hands-On Unsupervised Learning with Python 2 | 3 | Book Name 4 | 5 | This is the code repository for [Hands-On Unsupervised Learning with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-unsupervised-learning-python?utm_source=github&utm_medium=repository&utm_campaign=9781789348279), published by Packt. 
6 | 
7 | **Implement machine learning and deep learning models using Scikit-Learn, TensorFlow, and more**
8 | 
9 | ## What is this book about?
10 | Unsupervised learning is about making use of raw, untagged data and applying learning algorithms to it to help a machine predict its outcome. With this book, you will use Python to explore the concept of unsupervised learning, cluster large sets of data, and analyze them repeatedly until the desired outcome is found.
11 | 
12 | This book covers the following exciting features:
13 | * Use clustering algorithms to identify and optimize natural groups of data
14 | * Explore advanced non-linear and hierarchical clustering in action
15 | * Apply soft label assignments with fuzzy c-means and Gaussian mixture models
16 | * Detect anomalies through density estimation
17 | * Perform principal component analysis using neural network models
18 | 
19 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789349273) today!
20 | 
21 | https://www.packtpub.com/
23 | 
24 | 
25 | ## Instructions and Navigations
26 | All of the code is organized into folders. For example, Chapter02.
27 | 
28 | The code will look like the following:
29 | ```
30 | X_train = faces['images']
31 | X_train = (2.0 * X_train) - 1.0
32 | 
33 | width = X_train.shape[1]
34 | height = X_train.shape[2]
35 | ```
36 | 
37 | **Following is what you need for this book:**
38 | This book is intended for statisticians, data scientists, machine learning developers, and deep learning practitioners who want to build smart applications by implementing the key building blocks of unsupervised learning, and to master the new techniques and algorithms offered by machine learning and deep learning using real-world examples. Some prior knowledge of machine learning concepts and statistics is desirable.
39 | 
40 | With the following software and hardware list you can run all the code files present in the book (Chapters 1-9).
41 | 
42 | ### Software and Hardware List
43 | 
44 | | Chapter | Software required | OS required |
45 | | -------- | ------------------------------------| -----------------------------------|
46 | | 1-9 | Python 3.5+, Jupyter Notebook | Windows, Mac OS X, and Linux (Any) |
47 | 
48 | 
49 | 
50 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/9781789348279_ColorImages.pdf).
51 | 
52 | ## Errata
53 | The graphics-bundle hyperlink in the ebook is incorrect; refer to this link for the graphics bundle instead: (https://www.packtpub.com/sites/default/files/downloads/9781789348279_ColorImages.pdf)
54 | 
55 | 
56 | ### Related products
57 | * Building Machine Learning Systems with Python - Third Edition [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/building-machine-learning-systems-python-third-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788623223) [[Amazon]](https://www.amazon.com/dp/1788622227)
58 | 
59 | * Machine Learning Algorithms - Second Edition [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781789347999) [[Amazon]](https://www.amazon.com/dp/1789345480)
60 | 
61 | ## Get to Know the Author
62 | **Giuseppe Bonaccorso**
63 | is an experienced manager in the fields of AI, data science, and machine learning. He has been involved in solution design, management, and delivery in different business contexts.
He received his M.Sc. Eng. in electronics in 2005 from the University of Catania, Italy, and continued his studies at the University of Rome Tor Vergata, Italy, and the University of Essex, UK. His main interests include machine/deep learning, reinforcement learning, big data, bio-inspired adaptive systems, neuroscience, and natural language processing.
64 | 
65 | 
66 | ## Other books by the author
67 | * [Mastering Machine Learning Algorithms](https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781788621113)
68 | * [Machine Learning Algorithms - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781789347999)
69 | 
70 | ### Suggestions and Feedback
71 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
72 | 
73 | 
--------------------------------------------------------------------------------