├── LICENSE ├── README.md ├── Section02 ├── 1categorical.py ├── 1missing_features.py ├── 2data_normalization.py ├── 2data_scaling.py ├── 3feature_filtering.py ├── 3feature_selection.py ├── 4pca.py ├── 5dictionary_learning.py ├── 5kernel_pca.py └── 5nmf.py ├── Section03 ├── 1twoD_linear_regression.py ├── 2multiple_linear_regression.py ├── 3ridge_lasso_elasticnet.py ├── 4ransac_regression.py ├── 5polynomial_regression.py └── 6isotonic_regression.py ├── Section04 ├── 1logistic_regression.py ├── 2perceptron.py ├── 3grid_search.py ├── 3grid_search_2.py ├── 4classification_metrics.py └── 5roc_curve.py ├── Section05 ├── 1bernoulli.py ├── 2multinomial.py └── 3gaussian.py ├── Section06 ├── 1linear_svm.py ├── 2kernel_svm.py ├── 2kernel_svm_1.py ├── 2kernel_svm_2.py ├── 3controlled_svm.py └── 4svr.py ├── Section07 ├── 1decision_tree.py ├── 2decision_tree_2.py ├── 3random_forest.py ├── 4random_forest_2.py ├── 5adaboost.py ├── 6adaboost_2.py ├── 7gradient_tree_boosting.py └── 8voting_classifier.py ├── Section08 ├── 1k_means.py ├── 1k_means_2.py ├── 2dbscan.py ├── 3spectral_clustering.py └── 3spectral_clustering_2.py ├── Section09 ├── 1dendrogram.py ├── 2agglomerative_clustering.py └── 3connectivity_constraints.py └── Section10 ├── 1user_based.py ├── 2content-based.py ├── 3memory_based_cf.py ├── 4model_based_cf.py └── 5als_spark.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fundamentals of Machine Learning with scikit-learn [Video] 2 | This is the code repository for [Fundamentals of Machine Learning with scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/fundamentals-machine-learning-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789134377), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish. 3 | ## About the Video Course 4 | As the amount of data continues to grow at an almost incomprehensible rate, being able to understand and process data is becoming a key differentiator for competitive organizations. 
Machine Learning applications are everywhere, from self-driving cars, spam detection, document searches, and trading strategies, to speech recognition. This makes machine learning well-suited to the present-day era of big data and data science. The main challenge is how to transform data into actionable knowledge. 5 | 6 | 7 | In this course you will learn all the important Machine Learning algorithms that are commonly used in the field of data science. These algorithms can be used for supervised, unsupervised, semi-supervised, and reinforcement learning. A few famous algorithms covered in this course are: Linear Regression, Logistic Regression, SVM, Naive Bayes, K-Means, and Random Forest, along with feature engineering. In this course, you will also learn how these algorithms work and how to implement them in practice to solve your problems. 8 | 9 | 10 | ## What You Will Learn
14 | 15 | ## Instructions and Navigation 16 | ### Assumed Knowledge 17 | To fully benefit from the coverage included in this course, you will need: 18 | This course is for IT professionals who want to enter the field of data science and are very new to Machine Learning. Familiarity with languages such as R and Python will be invaluable here. 19 | ### Technical Requirements 20 | This course has the following software requirements:
21 | 24 | 25 | ## Related Products 26 | * [Hands-On Machine Learning with Python and Scikit-Learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-machine-learning-python-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781788991056) 27 | 28 | * [Machine Learning with scikit-learn and Tensorflow [Video]](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-scikit-learn-and-tensorflow-video?utm_source=github&utm_medium=repository&utm_campaign=9781788629928) 29 | 30 | * [Introduction to ML Classification Models using scikit-learn [Video]](https://www.packtpub.com/application-development/introduction-ml-classification-models-using-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345926) 31 | 32 | -------------------------------------------------------------------------------- /Section02/1categorical.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder 6 | from sklearn.feature_extraction import DictVectorizer, FeatureHasher 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | Y = np.random.choice(('Male', 'Female'), size=(10)) 14 | 15 | # Encode the labels 16 | print('Label encoding') 17 | le = LabelEncoder() 18 | yt = le.fit_transform(Y) 19 | print(yt) 20 | 21 | # Decode a dummy output 22 | print('Label decoding') 23 | output = [1, 0, 1, 1, 0, 0] 24 | decoded_output = [le.classes_[i] for i in output] 25 | print(decoded_output) 26 | 27 | # Binarize the labels 28 | print('Label binarization') 29 | lb = LabelBinarizer() 30 | yb = lb.fit_transform(Y) 31 | print(yb) 32 | 33 | # Decode the binarized labels 34 | print('Label decoding') 35 | lb.inverse_transform(yb) 36 | 37 | # Define some dictionary data 38 | data = [ 39 | {'feature_1': 10, 'feature_2': 15}, 40 | {'feature_1': -5, 'feature_3': 22}, 41 | {'feature_3': -2, 'feature_4': 10} 42 | ] 43 | 44 | # Vectorize the dictionary data 45 | print('Dictionary data vectorization') 46 | dv = DictVectorizer() 47 | Y_dict = dv.fit_transform(data) 48 | print(Y_dict.todense()) 49 | 50 | print('Vocabulary:') 51 | print(dv.vocabulary_) 52 | 53 | # Feature hashing 54 | print('Feature hashing') 55 | fh = FeatureHasher() 56 | Y_hashed = fh.fit_transform(data) 57 | 58 | # Decode the features 59 | print('Feature decoding') 60 | print(Y_hashed.todense()) 61 | 62 | # One-hot encoding 63 | data1 = [ 64 | [0, 10], 65 | [1, 11], 66 | [1, 8], 67 | [0, 12], 68 | [0, 15] 69 | ] 70 | 71 | # Encode data 72 | oh = OneHotEncoder(categorical_features=[0]) 73 | Y_oh = oh.fit_transform(data1) 74 | print(Y_oh.todense()) 75 | -------------------------------------------------------------------------------- /Section02/1missing_features.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import Imputer 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | data = np.array([[1, np.nan, 2], [2, 3, np.nan], [-1, 4, 2]]) 12 | print(data) 13 | 14 | # Imputer with mean-strategy 15 | print('Mean strategy') 16 | imp = Imputer(strategy='mean') 17 | print(imp.fit_transform(data)) 18 | 19 | # Imputer with median-strategy 20 | print('Median strategy') 21 | imp = 
Imputer(strategy='median') 22 | print(imp.fit_transform(data)) 23 | 24 | # Imputer with most-frequent-strategy 25 | print('Most-frequent strategy') 26 | imp = Imputer(strategy='most_frequent') 27 | print(imp.fit_transform(data)) 28 | 29 | -------------------------------------------------------------------------------- /Section02/2data_normalization.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import Normalizer 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | # Create a dummy dataset 12 | data = np.array([1.0, 2.0]) 13 | print(data) 14 | 15 | # Max normalization 16 | n_max = Normalizer(norm='max') 17 | nm = n_max.fit_transform(data.reshape(1, -1)) 18 | print(nm) 19 | 20 | # L1 normalization 21 | n_l1 = Normalizer(norm='l1') 22 | nl1 = n_l1.fit_transform(data.reshape(1, -1)) 23 | print(nl1) 24 | 25 | # L2 normalization 26 | n_l2 = Normalizer(norm='l2') 27 | nl2 = n_l2.fit_transform(data.reshape(1, -1)) 28 | print(nl2) -------------------------------------------------------------------------------- /Section02/2data_scaling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.preprocessing import StandardScaler, RobustScaler 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Create a dummy dataset 13 | data = np.ndarray(shape=(100, 2)) 14 | 15 | for i in range(100): 16 | data[i, 0] = 2.0 + np.random.normal(1.5, 3.0) 17 | data[i, 1] = 0.5 + np.random.normal(1.5, 3.0) 18 | 19 | # Show the original and the scaled dataset 20 | fig, ax = plt.subplots(1, 2, figsize=(14, 5)) 21 | 22 | ax[0].scatter(data[:, 0], data[:, 1]) 23 | ax[0].set_xlim([-10, 10]) 24 | ax[0].set_ylim([-10, 10]) 25 | ax[0].grid() 26 | ax[0].set_xlabel('X') 27 | ax[0].set_ylabel('Y') 28 | ax[0].set_title('Raw data') 29 | 30 | # Scale data 31 | ss = StandardScaler() 32 | scaled_data = ss.fit_transform(data) 33 | 34 | ax[1].scatter(scaled_data[:, 0], scaled_data[:, 1]) 35 | ax[1].set_xlim([-10, 10]) 36 | ax[1].set_ylim([-10, 10]) 37 | ax[1].grid() 38 | ax[1].set_xlabel('X') 39 | ax[1].set_ylabel('Y') 40 | ax[1].set_title('Scaled data') 41 | 42 | plt.show() 43 | 44 | # Scale data using a Robust Scaler 45 | fig, ax = plt.subplots(2, 2, figsize=(8, 8)) 46 | 47 | ax[0, 0].scatter(data[:, 0], data[:, 1]) 48 | ax[0, 0].set_xlim([-10, 10]) 49 | ax[0, 0].set_ylim([-10, 10]) 50 | ax[0, 0].grid() 51 | ax[0, 0].set_xlabel('X') 52 | ax[0, 0].set_ylabel('Y') 53 | ax[0, 0].set_title('Raw data') 54 | 55 | rs = RobustScaler(quantile_range=(15, 85)) 56 | scaled_data = rs.fit_transform(data) 57 | 58 | ax[0, 1].scatter(scaled_data[:, 0], scaled_data[:, 1]) 59 | ax[0, 1].set_xlim([-10, 10]) 60 | ax[0, 1].set_ylim([-10, 10]) 61 | ax[0, 1].grid() 62 | ax[0, 1].set_xlabel('X') 63 | ax[0, 1].set_ylabel('Y') 64 | ax[0, 1].set_title('Scaled data (15% - 85%)') 65 | 66 | rs1 = RobustScaler(quantile_range=(25, 75)) 67 | scaled_data1 = rs1.fit_transform(data) 68 | 69 | ax[1, 0].scatter(scaled_data1[:, 0], scaled_data1[:, 1]) 70 | ax[1, 0].set_xlim([-10, 10]) 71 | ax[1, 0].set_ylim([-10, 10]) 72 | ax[1, 0].grid() 73 | ax[1, 0].set_xlabel('X') 74 | ax[1, 0].set_ylabel('Y') 75 | ax[1, 0].set_title('Scaled data (25% - 75%)') 76 | 77 | rs2 = RobustScaler(quantile_range=(30, 
65)) 78 | scaled_data2 = rs2.fit_transform(data) 79 | 80 | ax[1, 1].scatter(scaled_data2[:, 0], scaled_data2[:, 1]) 81 | ax[1, 1].set_xlim([-10, 10]) 82 | ax[1, 1].set_ylim([-10, 10]) 83 | ax[1, 1].grid() 84 | ax[1, 1].set_xlabel('X') 85 | ax[1, 1].set_ylabel('Y') 86 | ax[1, 1].set_title('Scaled data (30% - 60%)') 87 | 88 | plt.show() 89 | 90 | -------------------------------------------------------------------------------- /Section02/3feature_filtering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston, load_iris 6 | from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_regression 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Load Boston data 13 | regr_data = load_boston() 14 | print('Boston data shape') 15 | print(regr_data.data.shape) 16 | 17 | # Select the best k features with regression test 18 | kb_regr = SelectKBest(f_regression) 19 | X_b = kb_regr.fit_transform(regr_data.data, regr_data.target) 20 | print('K-Best-filtered Boston dataset shape') 21 | print(X_b.shape) 22 | print('K-Best scores') 23 | print(kb_regr.scores_) 24 | 25 | # Load iris data 26 | class_data = load_iris() 27 | print('Iris dataset shape') 28 | print(class_data.data.shape) 29 | 30 | # Select the best k features using Chi^2 classification test 31 | perc_class = SelectPercentile(chi2, percentile=15) 32 | X_p = perc_class.fit_transform(class_data.data, class_data.target) 33 | print('Chi2-filtered Iris dataset shape') 34 | print(X_p.shape) 35 | print('Chi2 scores') 36 | print(perc_class.scores_) 37 | 38 | -------------------------------------------------------------------------------- /Section02/3feature_selection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.feature_selection import VarianceThreshold 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Create a dummy dataset 13 | X = np.ndarray(shape=(100, 3)) 14 | 15 | X[:, 0] = np.random.normal(0.0, 5.0, size=100) 16 | X[:, 1] = np.random.normal(0.5, 5.0, size=100) 17 | X[:, 2] = np.random.normal(1.0, 0.5, size=100) 18 | 19 | # Show the dataset 20 | fig, ax = plt.subplots(1, 1, figsize=(12, 8)) 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | ax.plot(X[:, 0], label='STD = 5.0') 26 | ax.plot(X[:, 1], label='STD = 5.0') 27 | ax.plot(X[:, 2], label='STD = 0.5') 28 | 29 | plt.legend() 30 | plt.show() 31 | 32 | # Impose a variance threshold 33 | print('Samples before variance thresholding') 34 | print(X[0:3, :]) 35 | 36 | vt = VarianceThreshold(threshold=1.5) 37 | X_t = vt.fit_transform(X) 38 | 39 | # After the filter has removed the componenents 40 | print('Samples after variance thresholding') 41 | print(X_t[0:3, :]) -------------------------------------------------------------------------------- /Section02/4pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load MNIST digits 14 | 
digits = load_digits() 15 | 16 | # Show some random digits 17 | selection = np.random.randint(0, 1797, size=100) 18 | 19 | fig, ax = plt.subplots(10, 10, figsize=(10, 10)) 20 | 21 | samples = [digits.data[x].reshape((8, 8)) for x in selection] 22 | 23 | for i in range(10): 24 | for j in range(10): 25 | ax[i, j].set_axis_off() 26 | ax[i, j].imshow(samples[(i * 8) + j], cmap='gray') 27 | 28 | plt.show() 29 | 30 | # Perform a PCA on the digits dataset 31 | pca = PCA(n_components=36, whiten=True) 32 | X_pca = pca.fit_transform(digits.data / 255) 33 | 34 | print('Explained variance ratio') 35 | print(pca.explained_variance_ratio_) 36 | 37 | # Plot the explained variance ratio 38 | fig, ax = plt.subplots(1, 2, figsize=(16, 6)) 39 | 40 | ax[0].set_xlabel('Component') 41 | ax[0].set_ylabel('Variance ratio (%)') 42 | ax[0].bar(np.arange(36), pca.explained_variance_ratio_ * 100.0) 43 | 44 | ax[1].set_xlabel('Component') 45 | ax[1].set_ylabel('Cumulative variance (%)') 46 | ax[1].bar(np.arange(36), np.cumsum(pca.explained_variance_)[::-1]) 47 | 48 | plt.show() 49 | 50 | # Rebuild from PCA and show the result 51 | fig, ax = plt.subplots(10, 10, figsize=(10, 10)) 52 | 53 | samples = [pca.inverse_transform(X_pca[x]).reshape((8, 8)) for x in selection] 54 | 55 | for i in range(10): 56 | for j in range(10): 57 | ax[i, j].set_axis_off() 58 | ax[i, j].imshow(samples[(i * 8) + j], cmap='gray') 59 | 60 | plt.show() 61 | 62 | -------------------------------------------------------------------------------- /Section02/5dictionary_learning.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import DictionaryLearning 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load MNIST digits 14 | digits = load_digits() 15 | 16 | # Perform a dictionary learning (and atom extraction) from the MNIST dataset 17 | dl = DictionaryLearning(n_components=36, fit_algorithm='lars', transform_algorithm='lasso_lars') 18 | X_dict = dl.fit_transform(digits.data) 19 | 20 | # Show the atoms that have been extracted 21 | fig, ax = plt.subplots(6, 6, figsize=(8, 8)) 22 | 23 | samples = [dl.components_[x].reshape((8, 8)) for x in range(34)] 24 | 25 | for i in range(6): 26 | for j in range(6): 27 | ax[i, j].set_axis_off() 28 | ax[i, j].imshow(samples[(i * 5) + j], cmap='gray') 29 | 30 | plt.show() 31 | 32 | -------------------------------------------------------------------------------- /Section02/5kernel_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets.samples_generator import make_blobs 7 | from sklearn.decomposition import KernelPCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Create a dummy dataset 14 | Xb, Yb = make_blobs(n_samples=500, centers=3, n_features=3) 15 | 16 | # Show the dataset 17 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 18 | ax.scatter(Xb[:, 0], Xb[:, 1]) 19 | ax.set_xlabel('X') 20 | ax.set_ylabel('Y') 21 | ax.grid() 22 | 23 | plt.show() 24 | 25 | # Perform a kernel PCA (with radial basis function) 26 | kpca = KernelPCA(n_components=2, kernel='rbf', fit_inverse_transform=True) 27 | X_kpca = kpca.fit_transform(Xb) 28 | 29 | # Plot 
the dataset after PCA 30 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 31 | ax.scatter(kpca.X_transformed_fit_[:, 0], kpca.X_transformed_fit_[:, 1]) 32 | ax.set_xlabel('First component: Variance') 33 | ax.set_ylabel('Second component: Mean') 34 | ax.grid() 35 | 36 | plt.show() -------------------------------------------------------------------------------- /Section02/5nmf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.decomposition import NMF 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Load iris dataset 13 | iris = load_iris() 14 | print('Irid dataset shape') 15 | print(iris.data.shape) 16 | 17 | # Perform a non-negative matrix factorization 18 | nmf = NMF(n_components=3, init='random', l1_ratio=0.1) 19 | Xt = nmf.fit_transform(iris.data) 20 | 21 | print('Reconstruction error') 22 | print(nmf.reconstruction_err_) 23 | 24 | print('Original Iris sample') 25 | print(iris.data[0]) 26 | 27 | print('Compressed Iris sample (via Non-Negative Matrix Factorization)') 28 | print(Xt[0]) 29 | 30 | print('Rebuilt sample') 31 | print(nmf.inverse_transform(Xt[0])) -------------------------------------------------------------------------------- /Section03/1twoD_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from scipy.optimize import minimize 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | # Number of samples 13 | nb_samples = 200 14 | 15 | 16 | def loss(v): 17 | e = 0.0 18 | for i in range(nb_samples): 19 | e += np.square(v[0] + v[1]*X[i] - Y[i]) 20 | return 0.5 * e 21 | 22 | 23 | def gradient(v): 24 | g = np.zeros(shape=2) 25 | for i in range(nb_samples): 26 | g[0] += (v[0] + v[1]*X[i] - Y[i]) 27 | g[1] += ((v[0] + v[1]*X[i] - Y[i]) * X[i]) 28 | return g 29 | 30 | 31 | def show_dataset(X, Y): 32 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 33 | 34 | ax.scatter(X, Y) 35 | ax.set_xlabel('X') 36 | ax.set_ylabel('Y') 37 | ax.grid() 38 | 39 | plt.show() 40 | 41 | 42 | if __name__ == '__main__': 43 | # Create dataset 44 | X = np.arange(-5, 5, 0.05) 45 | 46 | Y = X + 2 47 | Y += np.random.uniform(-0.5, 0.5, size=nb_samples) 48 | 49 | # Show the dataset 50 | show_dataset(X, Y) 51 | 52 | # Minimize loss function 53 | result = minimize(fun=loss, x0=np.array([0.0, 0.0]), jac=gradient, method='L-BFGS-B') 54 | 55 | print('Interpolating rect:') 56 | print('y = %.2fx + %2.f' % (result.x[1], result.x[0])) 57 | 58 | # Compute the absolute error 59 | err = 0.0 60 | 61 | for i in range(nb_samples): 62 | err += np.abs(Y[i] - (result.x[1]*X[i] + result.x[0])) 63 | 64 | print('Absolute error: %.2f' % err) -------------------------------------------------------------------------------- /Section03/2multiple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_boston 7 | from sklearn.linear_model import LinearRegression 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | def show_dataset(data): 16 | fig, ax = plt.subplots(4, 
3, figsize=(20, 15)) 17 | 18 | for i in range(4): 19 | for j in range(3): 20 | ax[i, j].plot(data.data[:, i + (j + 1) * 3]) 21 | ax[i, j].grid() 22 | 23 | plt.show() 24 | 25 | 26 | if __name__ == '__main__': 27 | # Load dataset 28 | boston = load_boston() 29 | 30 | # Show dataset 31 | show_dataset(boston) 32 | 33 | # Create a linear regressor instance 34 | lr = LinearRegression(normalize=True) 35 | 36 | # Split dataset 37 | X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.1) 38 | 39 | # Train the model 40 | lr.fit(X_train, Y_train) 41 | 42 | print('Score %.3f' % lr.score(X_test, Y_test)) 43 | 44 | # CV score 45 | scores = cross_val_score(lr, boston.data, boston.target, cv=7, scoring='neg_mean_squared_error') 46 | print('CV Negative mean squared errors mean: %.3f' % scores.mean()) 47 | print('CV Negative mean squared errors std: %.3f' % scores.std()) 48 | 49 | # CV R2 score 50 | r2_scores = cross_val_score(lr, boston.data, boston.target, cv=10, scoring='r2') 51 | print('CV R2 score: %.3f' % r2_scores.mean()) 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /Section03/3ridge_lasso_elasticnet.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_diabetes 6 | from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | diabetes = load_diabetes() 16 | 17 | # Create a linear regressor and compute CV score 18 | lr = LinearRegression(normalize=True) 19 | lr_scores = cross_val_score(lr, diabetes.data, diabetes.target, cv=10) 20 | print('Linear regression CV score: %.6f' % lr_scores.mean()) 21 | 22 | # Create a Ridge regressor and compute CV score 23 | rg = Ridge(0.005, normalize=True) 24 | rg_scores = cross_val_score(rg, diabetes.data, diabetes.target, cv=10) 25 | print('Ridge regression CV score: %.6f' % rg_scores.mean()) 26 | 27 | # Create a Lasso regressor and compute CV score 28 | ls = Lasso(0.01, normalize=True) 29 | ls_scores = cross_val_score(ls, diabetes.data, diabetes.target, cv=10) 30 | print('Lasso regression CV score: %.6f' % ls_scores.mean()) 31 | 32 | # Create ElasticNet regressor and compute CV score 33 | en = ElasticNet(alpha=0.001, l1_ratio=0.8, normalize=True) 34 | en_scores = cross_val_score(en, diabetes.data, diabetes.target, cv=10) 35 | print('ElasticNet regression CV score: %.6f' % en_scores.mean()) 36 | 37 | # Find the optimal alpha value for Ridge regression 38 | rgcv = RidgeCV(alphas=(1.0, 0.1, 0.01, 0.001, 0.005, 0.0025, 0.001, 0.00025), normalize=True) 39 | rgcv.fit(diabetes.data, diabetes.target) 40 | print('Ridge optimal alpha: %.3f' % rgcv.alpha_) 41 | 42 | # Find the optimal alpha value for Lasso regression 43 | lscv = LassoCV(alphas=(1.0, 0.1, 0.01, 0.001, 0.005, 0.0025, 0.001, 0.00025), normalize=True) 44 | lscv.fit(diabetes.data, diabetes.target) 45 | print('Lasso optimal alpha: %.3f' % lscv.alpha_) 46 | 47 | # Find the optimal alpha and l1_ratio for Elastic Net 48 | encv = ElasticNetCV(alphas=(0.1, 0.01, 0.005, 0.0025, 0.001), l1_ratio=(0.1, 0.25, 0.5, 0.75, 0.8), normalize=True) 49 | encv.fit(diabetes.data, diabetes.target) 50 | print('ElasticNet optimal alpha: %.3f and L1 ratio: %.4f' % (encv.alpha_, encv.l1_ratio_)) 51 | 
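# --- Added illustration (not part of the original course code) ---
# A minimal, self-contained sketch of the practical effect of the L1 penalty used in
# 3ridge_lasso_elasticnet.py: after fitting, Lasso drives many coefficients exactly to
# zero, which is why it is often used for implicit feature selection. The alpha value
# below is deliberately large to make the sparsity visible; it is not a tuned value.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso

diabetes = load_diabetes()

ls = Lasso(alpha=1.0)
ls.fit(diabetes.data, diabetes.target)

# Zero entries correspond to features discarded by the L1 penalty
print('Lasso coefficients:')
print(ls.coef_)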
52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /Section03/4ransac_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression, RANSACRegressor 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 200 13 | nb_noise_samples = 150 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.scatter(X, Y) 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | ax.grid() 23 | 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Create dataset 29 | X = np.arange(-5, 5, 0.05) 30 | 31 | Y = X + 2 32 | Y += np.random.uniform(-0.5, 0.5, size=nb_samples) 33 | 34 | for i in range(nb_noise_samples, nb_samples): 35 | Y[i] += np.random.uniform(12, 15) 36 | 37 | # Show the dataset 38 | show_dataset(X, Y) 39 | 40 | # Create a linear regressor 41 | lr = LinearRegression(normalize=True) 42 | lr.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 43 | print('Standard regressor: y = %.3fx + %.3f' % (lr.coef_, lr.intercept_)) 44 | 45 | # Create RANSAC regressor 46 | rs = RANSACRegressor(lr) 47 | rs.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 48 | print('RANSAC regressor: y = %.3fx + %.3f' % (rs.estimator_.coef_, rs.estimator_.intercept_)) -------------------------------------------------------------------------------- /Section03/5polynomial_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 200 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.scatter(X, Y) 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | ax.grid() 24 | 25 | plt.show() 26 | 27 | 28 | if __name__ == '__main__': 29 | # Create dataset 30 | X = np.arange(-5, 5, 0.05) 31 | 32 | Y = X + 2 33 | Y += X**2 + np.random.uniform(-0.5, 0.5, size=nb_samples) 34 | 35 | # Show the dataset 36 | show_dataset(X, Y) 37 | 38 | # Split dataset 39 | X_train, X_test, Y_train, Y_test = train_test_split(X.reshape(-1, 1), Y.reshape(-1, 1), test_size=0.25) 40 | 41 | lr = LinearRegression(normalize=True) 42 | lr.fit(X_train, Y_train) 43 | print('Linear regression score: %.3f' % lr.score(X_train, Y_train)) 44 | 45 | # Create polynomial features 46 | pf = PolynomialFeatures(degree=2) 47 | X_train = pf.fit_transform(X_train) 48 | X_test = pf.fit_transform(X_test) 49 | 50 | lr.fit(X_train, Y_train) 51 | print('Second degree polynomial regression score: %.3f' % lr.score(X_train, Y_train)) -------------------------------------------------------------------------------- /Section03/6isotonic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from matplotlib.collections import LineCollection 7 | 8 | from sklearn.isotonic import IsotonicRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 100 15 | 16 | 17 | 
def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.plot(X, Y, 'b.-') 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | plt.show() 26 | 27 | 28 | def show_isotonic_regression_segments(X, Y, Yi, segments): 29 | lc = LineCollection(segments, zorder=0) 30 | lc.set_array(np.ones(len(Y))) 31 | lc.set_linewidths(0.5 * np.ones(nb_samples)) 32 | 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.plot(X, Y, 'b.', markersize=8) 36 | ax.plot(X, Yi, 'g.-', markersize=8) 37 | ax.grid() 38 | ax.set_xlabel('X') 39 | ax.set_ylabel('Y') 40 | 41 | plt.show() 42 | 43 | 44 | if __name__ == '__main__': 45 | # Create dataset 46 | X = np.arange(-5, 5, 0.1) 47 | Y = X + np.random.uniform(-0.5, 1, size=X.shape) 48 | 49 | # Show original dataset 50 | show_dataset(X, Y) 51 | 52 | # Create an isotonic regressor 53 | ir = IsotonicRegression(-6, 10) 54 | Yi = ir.fit_transform(X, Y) 55 | 56 | # Create a segment list 57 | segments = [[[i, Y[i]], [i, Yi[i]]] for i in range(nb_samples)] 58 | 59 | # Show isotonic interpolation 60 | show_isotonic_regression_segments(X, Y, Yi, segments) 61 | -------------------------------------------------------------------------------- /Section04/1logistic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | def show_classification_areas(X, Y, lr): 34 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 35 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 36 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02)) 37 | Z = lr.predict(np.c_[xx.ravel(), yy.ravel()]) 38 | 39 | Z = Z.reshape(xx.shape) 40 | plt.figure(1, figsize=(30, 25)) 41 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1) 42 | 43 | # Plot also the training points 44 | plt.scatter(X[:, 0], X[:, 1], c=np.abs(Y - 1), edgecolors='k', cmap=plt.cm.coolwarm) 45 | plt.xlabel('X') 46 | plt.ylabel('Y') 47 | 48 | plt.xlim(xx.min(), xx.max()) 49 | plt.ylim(yy.min(), yy.max()) 50 | plt.xticks(()) 51 | plt.yticks(()) 52 | 53 | plt.show() 54 | 55 | 56 | if __name__ == '__main__': 57 | # Create dataset 58 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 59 | n_clusters_per_class=1) 60 | 61 | # Show dataset 62 | show_dataset(X, Y) 63 | 64 | # Split dataset 65 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 66 | 67 | # Create logistic regressor 68 | lr = LogisticRegression() 69 | lr.fit(X_train, Y_train) 70 | print('Logistic regression score: %.3f' % lr.score(X_test, Y_test)) 71 | 72 | # Compute CV score 73 | lr_scores = cross_val_score(lr, X, Y, scoring='accuracy', cv=10) 74 | print('Logistic regression CV average score: %.3f' % lr_scores.mean()) 75 | 76 | # Show 
classification areas 77 | show_classification_areas(X, Y, lr) 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Section04/2perceptron.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.linear_model import SGDClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 36 | n_clusters_per_class=1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Create perceptron as SGD instance 42 | # The same result can be obtained using directly the class sklearn.linear_model.Perceptron 43 | sgd = SGDClassifier(loss='perceptron', learning_rate='optimal', n_iter=10) 44 | sgd_scores = cross_val_score(sgd, X, Y, scoring='accuracy', cv=10) 45 | print('Perceptron CV average score: %.3f' % sgd_scores.mean()) 46 | 47 | -------------------------------------------------------------------------------- /Section04/3grid_search.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import GridSearchCV, cross_val_score 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | iris = load_iris() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'penalty': ['l1', 'l2'], 23 | 'C': [0.5, 1.0, 1.5, 1.8, 2.0, 2.5] 24 | } 25 | ] 26 | 27 | # Create and train a grid search 28 | gs = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, 29 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 30 | gs.fit(iris.data, iris.target) 31 | 32 | # Best estimator 33 | print(gs.best_estimator_) 34 | 35 | gs_scores = cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10) 36 | print('Best estimator CV average score: %.3f' % gs_scores.mean()) 37 | 38 | -------------------------------------------------------------------------------- /Section04/3grid_search_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import GridSearchCV, cross_val_score 8 | from sklearn.linear_model import SGDClassifier 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | if __name__ == '__main__': 15 | # Load dataset 16 | iris = load_iris() 17 | 18 | # Define a param grid 19 | param_grid = [ 20 | { 21 | 'penalty': ['l1', 'l2', 'elasticnet'], 22 | 'alpha': 
[1e-5, 1e-4, 5e-4, 1e-3, 2.3e-3, 5e-3, 1e-2], 23 | 'l1_ratio': [0.01, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 0.8] 24 | } 25 | ] 26 | 27 | # Create SGD classifier 28 | sgd = SGDClassifier(loss='perceptron', learning_rate='optimal') 29 | 30 | # Create and train a grid search 31 | gs = GridSearchCV(estimator=sgd, param_grid=param_grid, scoring='accuracy', cv=10, 32 | n_jobs=multiprocessing.cpu_count()) 33 | gs.fit(iris.data, iris.target) 34 | 35 | # Best estimator 36 | print(gs.best_estimator_) 37 | 38 | gs_scores = cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10) 39 | print('Best estimator CV average score: %.3f' % gs_scores.mean()) -------------------------------------------------------------------------------- /Section04/4classification_metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.metrics import accuracy_score, zero_one_loss, jaccard_similarity_score, confusion_matrix, \ 9 | precision_score, recall_score, fbeta_score 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 21 | n_clusters_per_class=1) 22 | 23 | # Split dataset 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 25 | 26 | # Create and train logistic regressor 27 | lr = LogisticRegression() 28 | lr.fit(X_train, Y_train) 29 | 30 | print('Accuracy score: %.3f' % accuracy_score(Y_test, lr.predict(X_test))) 31 | print('Zero-one loss (normalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test))) 32 | print('Zero-one loss (unnormalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test), normalize=False)) 33 | print('Jaccard similarity score: %.3f' % jaccard_similarity_score(Y_test, lr.predict(X_test))) 34 | 35 | # Compute confusion matrix 36 | cm = confusion_matrix(y_true=Y_test, y_pred=lr.predict(X_test)) 37 | print('Confusion matrix:') 38 | print(cm) 39 | 40 | print('Precision score: %.3f' % precision_score(Y_test, lr.predict(X_test))) 41 | print('Recall score: %.3f' % recall_score(Y_test, lr.predict(X_test))) 42 | print('F-Beta score (1): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1)) 43 | print('F-Beta score (0.75): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=0.75)) 44 | print('F-Beta score (1.25): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1.25)) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Section04/5roc_curve.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.metrics import roc_curve, auc 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 21 | 
n_clusters_per_class=1) 22 | 23 | # Split dataset 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 25 | 26 | #Create and train logistic regressor 27 | lr = LogisticRegression() 28 | lr.fit(X_train, Y_train) 29 | 30 | # Compute ROC curve 31 | Y_score = lr.decision_function(X_test) 32 | fpr, tpr, thresholds = roc_curve(Y_test, Y_score) 33 | 34 | plt.figure(figsize=(30, 25)) 35 | 36 | plt.plot(fpr, tpr, color='red', label='Logistic regression (AUC: %.2f)' % auc(fpr, tpr)) 37 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 38 | plt.xlim([0.0, 1.0]) 39 | plt.ylim([0.0, 1.01]) 40 | plt.title('ROC Curve') 41 | plt.xlabel('False Positive Rate') 42 | plt.ylabel('True Positive Rate') 43 | plt.legend(loc="lower right") 44 | 45 | plt.show() -------------------------------------------------------------------------------- /Section05/1bernoulli.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.naive_bayes import BernoulliNB 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 300 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0) 36 | 37 | # Show dataset 38 | show_dataset(X, Y) 39 | 40 | # Split dataset 41 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 42 | 43 | # Create and train Bernoulli Naive Bayes classifier 44 | bnb = BernoulliNB(binarize=0.0) 45 | bnb.fit(X_train, Y_train) 46 | 47 | print('Bernoulli Naive Bayes score: %.3f' % bnb.score(X_test, Y_test)) 48 | 49 | # Compute CV score 50 | bnb_scores = cross_val_score(bnb, X, Y, scoring='accuracy', cv=10) 51 | print('Bernoulli Naive Bayes CV average score: %.3f' % bnb_scores.mean()) 52 | 53 | # Predict some values 54 | data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 55 | Yp = bnb.predict(data) 56 | print(Yp) 57 | 58 | -------------------------------------------------------------------------------- /Section05/2multinomial.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.feature_extraction import DictVectorizer 6 | from sklearn.naive_bayes import MultinomialNB 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Prepare a dummy dataset 15 | data = [ 16 | {'house': 100, 'street': 50, 'shop': 25, 'car': 100, 'tree': 20}, 17 | {'house': 5, 'street': 5, 'shop': 0, 'car': 10, 'tree': 500, 'river': 1} 18 | ] 19 | 20 | # Create and train a dictionary vectorizer 21 | dv = DictVectorizer(sparse=False) 22 | X = dv.fit_transform(data) 23 | Y = np.array([1, 0]) 24 | 25 | # Create and train a Multinomial Naive Bayes classifier 26 | mnb = MultinomialNB() 27 | mnb.fit(X, Y) 28 | 29 | # Create dummy test data 30 | test_data = data = [ 31 | 
{'house': 80, 'street': 20, 'shop': 15, 'car': 70, 'tree': 10, 'river': 1}, 32 | {'house': 10, 'street': 5, 'shop': 1, 'car': 8, 'tree': 300, 'river': 0} 33 | ] 34 | 35 | Yp = mnb.predict(dv.fit_transform(test_data)) 36 | print(Yp) 37 | -------------------------------------------------------------------------------- /Section05/3gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.naive_bayes import GaussianNB 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.metrics import roc_curve, auc 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | nb_samples = 300 17 | 18 | 19 | def show_dataset(X, Y): 20 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 21 | 22 | ax.grid() 23 | ax.set_xlabel('X') 24 | ax.set_ylabel('Y') 25 | 26 | for i in range(nb_samples): 27 | if Y[i] == 0: 28 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 29 | else: 30 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 31 | 32 | plt.show() 33 | 34 | 35 | if __name__ == '__main__': 36 | # Create dataset 37 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0) 38 | 39 | # Show dataset 40 | show_dataset(X, Y) 41 | 42 | # Split dataset 43 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 44 | 45 | # Create and train Gaussian Naive Bayes classifier 46 | gnb = GaussianNB() 47 | gnb.fit(X_train, Y_train) 48 | 49 | # Create and train a Logistic regressor (for comparison) 50 | lr = LogisticRegression() 51 | lr.fit(X_train, Y_train) 52 | 53 | # Compute ROC Curve 54 | Y_gnb_score = gnb.predict_proba(X_test) 55 | Y_lr_score = lr.decision_function(X_test) 56 | 57 | fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(Y_test, Y_gnb_score[:, 1]) 58 | fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_lr_score) 59 | 60 | # Plot ROC Curve 61 | plt.figure(figsize=(30, 25)) 62 | 63 | plt.plot(fpr_gnb, tpr_gnb, color='red', label='Naive Bayes (AUC: %.2f)' % auc(fpr_gnb, tpr_gnb)) 64 | plt.plot(fpr_lr, tpr_lr, color='green', label='Logistic Regression (AUC: %.2f)' % auc(fpr_lr, tpr_lr)) 65 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 66 | plt.xlim([0.0, 1.0]) 67 | plt.ylim([0.0, 1.01]) 68 | plt.title('ROC Curve') 69 | plt.xlabel('False Positive Rate') 70 | plt.ylabel('True Positive Rate') 71 | plt.legend(loc="lower right") 72 | 73 | plt.show() 74 | -------------------------------------------------------------------------------- /Section06/1linear_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.svm import SVC 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == 
'__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 36 | n_clusters_per_class=1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Create a SVM with linear kernel 42 | svc = SVC(kernel='linear') 43 | 44 | # Compute CV score 45 | svc_scores = cross_val_score(svc, X, Y, scoring='accuracy', cv=10) 46 | print('Linear SVM CV average score: %.3f' % svc_scores.mean()) 47 | 48 | -------------------------------------------------------------------------------- /Section06/2kernel_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import multiprocessing 6 | 7 | from sklearn.datasets import make_circles 8 | from sklearn.model_selection import GridSearchCV 9 | from sklearn.svm import SVC 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | def show_dataset(X, Y): 19 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 20 | 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | for i in range(nb_samples): 26 | if Y[i] == 0: 27 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 28 | else: 29 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 30 | 31 | plt.show() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Create datasets 36 | X, Y = make_circles(n_samples=nb_samples, noise=0.1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Define a param grid 42 | param_grid = [ 43 | { 44 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 45 | 'C': [0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0] 46 | } 47 | ] 48 | 49 | # Create a train grid search on SVM classifier 50 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 51 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 52 | gs.fit(X, Y) 53 | 54 | print(gs.best_estimator_) 55 | print('Kernel SVM score: %.3f' % gs.best_score_) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /Section06/2kernel_svm_1.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.svm import SVC 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | digits = load_digits() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 23 | 'C': [0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0] 24 | } 25 | ] 26 | 27 | # Create a train grid search on SVM classifier 28 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 29 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 30 | gs.fit(digits.data, digits.target) 31 | 32 | print(gs.best_estimator_) 33 | print('Kernel SVM score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /Section06/2kernel_svm_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.svm import 
SVC 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | # Set a local folder here 15 | olivetti_home = '' 16 | 17 | 18 | if __name__ == '__main__': 19 | # Load dataset 20 | 21 | faces = fetch_olivetti_faces(data_home=olivetti_home) 22 | # Define a param grid 23 | param_grid = [ 24 | { 25 | 'kernel': ['rbf', 'poly'], 26 | 'C': [0.1, 0.5, 1.0, 1.5], 27 | 'degree': [2, 3, 4, 5], 28 | 'gamma': [0.001, 0.01, 0.1, 0.5] 29 | } 30 | ] 31 | 32 | # Create a train grid search on SVM classifier 33 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 34 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 35 | gs.fit(faces.data, faces.target) 36 | 37 | print(gs.best_estimator_) 38 | print('Kernel SVM score: %.3f' % gs.best_score_) 39 | -------------------------------------------------------------------------------- /Section06/3controlled_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.svm import SVC, NuSVC 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Create dataset 34 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 35 | n_clusters_per_class=1) 36 | 37 | # Show dataset 38 | show_dataset(X, Y) 39 | 40 | # Create and train a linear SVM 41 | svc = SVC(kernel='linear') 42 | svc.fit(X, Y) 43 | print('Number of support vectors: %d' % len(svc.support_vectors_)) 44 | 45 | # Create and train a Nu-SVM classifier 46 | nusvc = NuSVC(kernel='linear', nu=0.05) 47 | nusvc.fit(X, Y) 48 | print('Number of support vectors (nu=0.05): %d' % len(nusvc.support_vectors_)) -------------------------------------------------------------------------------- /Section06/4svr.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.svm import SVR 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 50 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | ax.scatter(X, Y) 24 | 25 | plt.show() 26 | 27 | 28 | if __name__ == '__main__': 29 | # Create dataset 30 | X = np.arange(-nb_samples, nb_samples, 1) 31 | Y = np.zeros(shape=(2 * nb_samples,)) 32 | 33 | for x in X: 34 | Y[int(x) + nb_samples] = np.power(x * 6, 2.0) / 1e4 + np.random.uniform(-2, 2) 35 | 36 | # Show dataset 37 | #show_dataset(X, Y) 38 | 39 | # Create and train a Support Vector regressor 40 | svr = SVR(kernel='poly', degree=2, C=1.5, epsilon=0.5) 41 | svr_scores = cross_val_score(svr, X.reshape((nb_samples*2, 1)), Y, scoring='neg_mean_squared_error', cv=10) 42 | print('SVR CV average negative squared error: %.3f' % svr_scores.mean()) 43 | 44 | 
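# --- Added illustration (not part of the original course code) ---
# Minimal sketch, rebuilding the same parabolic toy data as 4svr.py, that fits the
# Support Vector regressor once on the whole set and compares a few predictions with
# the true targets (the script above only reports the cross-validation score).
import numpy as np
from sklearn.svm import SVR

nb_samples = 50
X = np.arange(-nb_samples, nb_samples, 1).reshape(-1, 1)
Y = np.power(X.ravel() * 6, 2.0) / 1e4 + np.random.uniform(-2, 2, size=X.shape[0])

svr = SVR(kernel='poly', degree=2, C=1.5, epsilon=0.5)
svr.fit(X, Y)

print('True targets:    ', np.round(Y[:5], 2))
print('SVR predictions: ', np.round(svr.predict(X[:5]), 2))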
-------------------------------------------------------------------------------- /Section07/1decision_tree.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | # Set a folder to store the graph in 16 | graph_folder = '' 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create dataset 21 | X, Y = make_classification(n_samples=nb_samples, n_features=3, n_informative=3, n_redundant=0, n_classes=3, 22 | n_clusters_per_class=1) 23 | 24 | # Create a Decision tree classifier 25 | dt = DecisionTreeClassifier() 26 | dt_scores = cross_val_score(dt, X, Y, scoring='accuracy', cv=10) 27 | print('Decision tree score: %.3f' % dt_scores.mean()) 28 | 29 | # Save in Graphviz format 30 | dt.fit(X, Y) 31 | 32 | with open('dt.dot', 'w') as df: 33 | df = export_graphviz(dt, out_file=df, 34 | feature_names=['A', 'B', 'C'], 35 | class_names=['C1', 'C2', 'C3']) 36 | -------------------------------------------------------------------------------- /Section07/2decision_tree_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.model_selection import GridSearchCV 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | digits = load_digits() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'criterion': ['gini', 'entropy'], 23 | 'max_features': ['auto', 'log2', None], 24 | 'min_samples_split': [2, 10, 25, 100, 200], 25 | 'max_depth': [5, 10, 15, None] 26 | } 27 | ] 28 | 29 | # Create and train a grid search 30 | gs = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, 31 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 32 | gs.fit(digits.data, digits.target) 33 | 34 | print(gs.best_estimator_) 35 | print('Decision tree score: %.3f' % gs.best_score_) 36 | -------------------------------------------------------------------------------- /Section07/3random_forest.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | rf_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(RandomForestClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | rf_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(rf_accuracy) 35 | plt.show() 
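# --- Added illustration (not part of the original course code) ---
# Minimal sketch: in addition to the accuracy curve computed in 3random_forest.py, a
# fitted random forest exposes feature_importances_, which can be used to rank the 64
# pixels of the digits dataset. n_estimators=50 is an arbitrary example value.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

digits = load_digits()

rf = RandomForestClassifier(n_estimators=50)
rf.fit(digits.data, digits.target)

# Indices of the ten most informative pixels, in decreasing order of importance
print(np.argsort(rf.feature_importances_)[::-1][:10])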
-------------------------------------------------------------------------------- /Section07/4random_forest_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | et_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(ExtraTreesClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | et_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(et_accuracy) 35 | plt.show() 36 | -------------------------------------------------------------------------------- /Section07/5adaboost.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import AdaBoostClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | ab_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(AdaBoostClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | ab_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(ab_accuracy) 35 | plt.show() -------------------------------------------------------------------------------- /Section07/6adaboost_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.ensemble import AdaBoostClassifier 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load dataset 16 | iris = load_iris() 17 | 18 | # Create and train an AdaBoost classifier 19 | ada = AdaBoostClassifier(n_estimators=100, learning_rate=1.0) 20 | ada_scores = cross_val_score(ada, iris.data, iris.target, scoring='accuracy', cv=10) 21 | print('AdaBoost score: %.3f' % ada_scores.mean()) 22 | 23 | -------------------------------------------------------------------------------- /Section07/7gradient_tree_boosting.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.ensemble import GradientBoostingClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 
nb_samples = 500 14 | 15 | if __name__ == '__main__': 16 | # Create the dataset 17 | X, Y = make_classification(n_samples=nb_samples, n_features=4, n_informative=3, n_redundant=1, n_classes=3) 18 | 19 | # Collect the scores for n_estimators in (1, 50) 20 | a = [] 21 | max_estimators = 50 22 | 23 | for i in range(1, max_estimators): 24 | score = cross_val_score(GradientBoostingClassifier(n_estimators=i, learning_rate=10.0 / float(i)), X, Y, 25 | cv=10, scoring='accuracy').mean() 26 | a.append(score) 27 | 28 | # Plot the results 29 | plt.figure(figsize=(30, 25)) 30 | plt.xlabel('Number of estimators') 31 | plt.ylabel('Average CV accuracy') 32 | plt.grid(True) 33 | plt.plot(a) 34 | plt.show() -------------------------------------------------------------------------------- /Section07/8voting_classifier.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.svm import SVC 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.ensemble import VotingClassifier 11 | from sklearn.model_selection import cross_val_score 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | nb_samples = 500 17 | 18 | 19 | def compute_accuracies(lr, dt, svc, vc, X, Y): 20 | accuracies = [] 21 | 22 | accuracies.append(cross_val_score(lr, X, Y, scoring='accuracy', cv=10).mean()) 23 | accuracies.append(cross_val_score(dt, X, Y, scoring='accuracy', cv=10).mean()) 24 | accuracies.append(cross_val_score(svc, X, Y, scoring='accuracy', cv=10).mean()) 25 | accuracies.append(cross_val_score(vc, X, Y, scoring='accuracy', cv=10).mean()) 26 | 27 | print('Accuracies:') 28 | print(np.array(accuracies)) 29 | 30 | return accuracies 31 | 32 | 33 | def plot_accuracies(accuracies): 34 | fig, ax = plt.subplots(figsize=(12, 8)) 35 | positions = np.array([0, 1, 2, 3]) 36 | 37 | ax.bar(positions, accuracies, 0.5) 38 | ax.set_ylabel('Accuracy') 39 | ax.set_xticklabels(('Logistic Regression', 'Decision Tree', 'SVM', 'Ensemble')) 40 | ax.set_xticks(positions + (5.0 / 20)) 41 | plt.ylim([0.80, 0.93]) 42 | plt.show() 43 | 44 | 45 | if __name__ == '__main__': 46 | # Create the dataset 47 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, n_classes=2) 48 | 49 | # Show the dataset 50 | fig, ax = plt.subplots(figsize=(12, 12)) 51 | 52 | for i, x in enumerate(X): 53 | if Y[i] == 0: 54 | ax.scatter(x[0], x[1], marker='s', color='blue') 55 | else: 56 | ax.scatter(x[0], x[1], marker='d', color='red') 57 | 58 | ax.set_xlabel(r'$X_0$') 59 | ax.set_ylabel(r'$X_1$') 60 | plt.show() 61 | 62 | # Create the classifiers 63 | lr = LogisticRegression() 64 | svc = SVC(kernel='poly', probability=True) 65 | dt = DecisionTreeClassifier() 66 | 67 | classifiers = [('lr', lr), 68 | ('dt', dt), 69 | ('svc', svc)] 70 | 71 | # Hard voting 72 | vc = VotingClassifier(estimators=classifiers, voting='hard') 73 | 74 | # Compute and plot accuracies 75 | hard_accuracies = compute_accuracies(lr, dt, svc, vc, X, Y) 76 | plot_accuracies(hard_accuracies) 77 | 78 | # Soft weighted voting 79 | weights = [1.5, 0.5, 0.75] 80 | 81 | vc = VotingClassifier(estimators=classifiers, weights=weights, voting='soft') 82 | 83 | # Compute and plot accuracies 84 | soft_accuracies = compute_accuracies(lr, dt, svc, vc, X, Y) 85 | plot_accuracies(soft_accuracies) 86 | 87 | 
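A further sketch (not in the original script): 7gradient_tree_boosting.py scales the learning rate as 10/n_estimators so that the product stays constant across the sweep. For comparison, a conventional setup keeps the learning rate fixed and cross-validates a single configuration; the values below are assumed for illustration.

import numpy as np

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# For reproducibility
np.random.seed(1000)

nb_samples = 500

if __name__ == '__main__':
    # Same dataset construction as in 7gradient_tree_boosting.py
    X, Y = make_classification(n_samples=nb_samples, n_features=4, n_informative=3, n_redundant=1, n_classes=3)

    # Assumed, conventional hyperparameters for comparison
    gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1)
    scores = cross_val_score(gbc, X, Y, scoring='accuracy', cv=10)
    print('Gradient boosting average CV accuracy: %.3f' % scores.mean())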
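A brief sketch (not part of 8voting_classifier.py): with voting='soft', the ensemble averages the members' class probabilities using the given weights, and those averaged probabilities can be inspected directly. The snippet reuses the same three classifiers and weights as the soft-voting example above.

import numpy as np

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# For reproducibility
np.random.seed(1000)

nb_samples = 500

if __name__ == '__main__':
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, n_classes=2)

    classifiers = [('lr', LogisticRegression()),
                   ('dt', DecisionTreeClassifier()),
                   ('svc', SVC(kernel='poly', probability=True))]

    # Same weights as the soft-voting example above
    vc = VotingClassifier(estimators=classifiers, weights=[1.5, 0.5, 0.75], voting='soft')
    vc.fit(X, Y)

    # Weighted average of the per-classifier class probabilities for five samples
    print(vc.predict_proba(X[:5]))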
-------------------------------------------------------------------------------- /Section08/1k_means.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 24 | 25 | plt.show() 26 | 27 | 28 | def show_clustered_dataset(X, km): 29 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 30 | 31 | ax.grid() 32 | ax.set_xlabel('X') 33 | ax.set_ylabel('Y') 34 | 35 | for i in range(nb_samples): 36 | c = km.predict(X[i].reshape(1, -1)) 37 | if c == 0: 38 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 39 | elif c == 1: 40 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 41 | else: 42 | ax.scatter(X[i, 0], X[i, 1], marker='d', color='g') 43 | 44 | plt.show() 45 | 46 | 47 | if __name__ == '__main__': 48 | # Create dataset 49 | X, _ = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=1.5) 50 | 51 | # Show dataset 52 | show_dataset(X) 53 | 54 | # Create and train K-Means 55 | km = KMeans(n_clusters=3) 56 | km.fit(X) 57 | 58 | # Show the centroids 59 | print(km.cluster_centers_) 60 | 61 | # Show clustered dataset 62 | show_clustered_dataset(X, km) 63 | -------------------------------------------------------------------------------- /Section08/1k_means_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_circles 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | def show_clustered_dataset(X, km): 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.grid() 36 | ax.set_xlabel('X') 37 | ax.set_ylabel('Y') 38 | 39 | for i in range(nb_samples): 40 | c = km.predict(X[i].reshape(1, -1)) 41 | if c == 0: 42 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 43 | else: 44 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 45 | 46 | plt.show() 47 | 48 | 49 | if __name__ == '__main__': 50 | # Create dataset 51 | X, Y = make_circles(n_samples=nb_samples, noise=0.05) 52 | 53 | # Show dataset 54 | show_dataset(X) 55 | 56 | # Create and train K-Means 57 | km = KMeans(n_clusters=2) 58 | km.fit(X) 59 | 60 | # Show clustered dataset 61 | show_clustered_dataset(X, km) 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /Section08/2dbscan.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_moons 7 | from 
sklearn.cluster import DBSCAN 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | def show_clustered_dataset(X, Y): 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.grid() 36 | ax.set_xlabel('X') 37 | ax.set_ylabel('Y') 38 | 39 | for i in range(nb_samples): 40 | if Y[i] == 0: 41 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 42 | else: 43 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 44 | 45 | plt.show() 46 | 47 | 48 | if __name__ == '__main__': 49 | # Create dataset 50 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 51 | 52 | # Show dataset 53 | show_dataset(X, Y) 54 | 55 | # Create and train DBSCAN 56 | dbs = DBSCAN(eps=0.1) 57 | Y = dbs.fit_predict(X) 58 | 59 | # Show clustered dataset 60 | show_clustered_dataset(X, Y) 61 | 62 | -------------------------------------------------------------------------------- /Section08/3spectral_clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import warnings 6 | 7 | from sklearn.datasets import make_moons 8 | from sklearn.cluster import SpectralClustering 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 1000 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | def show_clustered_dataset(X, Y): 34 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 35 | 36 | ax.grid() 37 | ax.set_xlabel('X') 38 | ax.set_ylabel('Y') 39 | 40 | for i in range(nb_samples): 41 | if Y[i] == 0: 42 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 43 | else: 44 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 45 | 46 | plt.show() 47 | 48 | 49 | if __name__ == '__main__': 50 | warnings.simplefilter("ignore") 51 | 52 | # Create dataset 53 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 54 | 55 | # Show dataset 56 | show_dataset(X, Y) 57 | 58 | # Create and train Spectral Clustering 59 | sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors') 60 | Y = sc.fit_predict(X) 61 | 62 | # Show clustered dataset 63 | show_clustered_dataset(X, Y) -------------------------------------------------------------------------------- /Section08/3spectral_clustering_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_moons 7 | from sklearn.cluster import SpectralClustering 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | if __name__ == '__main__': 17 | # Create dataset 18 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 19 | 20 | # Try different gammas with a RBF affinity 21 | Yss = [] 22 | 
gammas = np.linspace(0, 12, 4) 23 | 24 | for gamma in gammas: 25 | sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma) 26 | Yss.append(sc.fit_predict(X)) 27 | 28 | # Show data 29 | fig, ax = plt.subplots(1, 4, figsize=(30, 10), sharey=True) 30 | 31 | for x in range(4): 32 | ax[x].grid() 33 | ax[x].set_title('Gamma = %.0f' % gammas[x]) 34 | 35 | for i in range(nb_samples): 36 | c = Yss[x][i] 37 | 38 | if c == 0: 39 | ax[x].scatter(X[i, 0], X[i, 1], marker='o', color='r') 40 | else: 41 | ax[x].scatter(X[i, 0], X[i, 1], marker='^', color='b') 42 | 43 | plt.show() -------------------------------------------------------------------------------- /Section09/1dendrogram.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | 8 | from scipy.spatial.distance import pdist 9 | from scipy.cluster.hierarchy import linkage 10 | from scipy.cluster.hierarchy import dendrogram 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 25 16 | 17 | if __name__ == '__main__': 18 | # Create the dataset 19 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=1.5) 20 | 21 | # Show the dataset 22 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 23 | 24 | ax.grid() 25 | ax.set_xlabel('X') 26 | ax.set_ylabel('Y') 27 | 28 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 29 | plt.show() 30 | 31 | # Compute the distance matrix 32 | Xdist = pdist(X, metric='euclidean') 33 | 34 | # Compute the linkage 35 | Xl = linkage(Xdist, method='ward') 36 | 37 | # Compute and show the dendrogram 38 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 39 | Xd = dendrogram(Xl) 40 | plt.show() -------------------------------------------------------------------------------- /Section09/2agglomerative_clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import AgglomerativeClustering 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 3000 13 | 14 | 15 | def plot_clustered_dataset(X, Y): 16 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 17 | 18 | ax.grid() 19 | ax.set_xlabel('X') 20 | ax.set_ylabel('Y') 21 | 22 | markers = ['o', 'd', '^', 'x', '1', '2', '3', 's'] 23 | colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', '#cccfff'] 24 | 25 | for i in range(nb_samples): 26 | ax.scatter(X[i, 0], X[i, 1], marker=markers[Y[i]], color=colors[Y[i]]) 27 | 28 | plt.show() 29 | 30 | 31 | if __name__ == '__main__': 32 | # Create the dataset 33 | X, _ = make_blobs(n_samples=nb_samples, n_features=2, centers=8, cluster_std=2.0) 34 | 35 | # Show the dataset 36 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 37 | 38 | ax.grid() 39 | ax.set_xlabel('X') 40 | ax.set_ylabel('Y') 41 | 42 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 43 | plt.show() 44 | 45 | # Complete linkage 46 | print('Complete linkage') 47 | ac = AgglomerativeClustering(n_clusters=8, linkage='complete') 48 | Y = ac.fit_predict(X) 49 | 50 | # Show the clustered dataset 51 | plot_clustered_dataset(X, Y) 52 | 53 | # Average linkage 54 | print('Average linkage') 55 | ac = AgglomerativeClustering(n_clusters=8, linkage='average') 56 | Y = ac.fit_predict(X) 57 | 58 | # Show the clustered dataset 59 | 
plot_clustered_dataset(X, Y) 60 | 61 | # Ward linkage 62 | print('Ward linkage') 63 | ac = AgglomerativeClustering(n_clusters=8) 64 | Y = ac.fit_predict(X) 65 | 66 | # Show the clustered dataset 67 | plot_clustered_dataset(X, Y) 68 | 69 | 70 | -------------------------------------------------------------------------------- /Section09/3connectivity_constraints.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_circles 7 | from sklearn.cluster import AgglomerativeClustering 8 | from sklearn.neighbors import kneighbors_graph 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 3000 14 | 15 | if __name__ == '__main__': 16 | # Create the dataset 17 | X, _ = make_circles(n_samples=nb_samples, noise=0.05) 18 | 19 | # Show the dataset 20 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 21 | 22 | ax.grid() 23 | ax.set_xlabel('X') 24 | ax.set_ylabel('Y') 25 | 26 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 27 | plt.show() 28 | 29 | # Unstructured clustering with average linkage 30 | print('Unstructured clustering with average linkage') 31 | ac = AgglomerativeClustering(n_clusters=20, linkage='average') 32 | ac.fit(X) 33 | 34 | # Plot the clustered dataset 35 | fig, ax = plt.subplots(1, 1, figsize=(12, 10)) 36 | 37 | ax.grid() 38 | ax.set_xlabel('X') 39 | ax.set_ylabel('Y') 40 | ax.scatter(X[:, 0], X[:, 1], marker='o', cmap=plt.cm.spectral, c=ac.labels_) 41 | plt.show() 42 | 43 | # Connectivity constraints 44 | print('Imposing connectivity constraints') 45 | 46 | acc = [] 47 | k = [50, 100, 200, 500] 48 | 49 | ac = AgglomerativeClustering(n_clusters=20, connectivity=None, linkage='average') 50 | ac.fit(X) 51 | 52 | for i in range(4): 53 | kng = kneighbors_graph(X, k[i]) 54 | ac1 = AgglomerativeClustering(n_clusters=20, connectivity=kng, linkage='average') 55 | ac1.fit(X) 56 | acc.append(ac1) 57 | 58 | # Show the four plots 59 | fig, ax = plt.subplots(2, 2, figsize=(14, 10)) 60 | 61 | ax[0, 0].grid() 62 | ax[0, 0].set_title('K = 50') 63 | ax[0, 0].set_xlabel('X') 64 | ax[0, 0].set_ylabel('Y') 65 | ax[0, 0].scatter(X[:, 0], X[:, 1], marker='o', cmap=plt.cm.spectral, c=acc[0].labels_) 66 | 67 | ax[0, 1].grid() 68 | ax[0, 1].set_title('K = 100') 69 | ax[0, 1].set_xlabel('X') 70 | ax[0, 1].set_ylabel('Y') 71 | ax[0, 1].scatter(X[:, 0], X[:, 1], marker='o', cmap=plt.cm.spectral, c=acc[1].labels_) 72 | 73 | ax[1, 0].grid() 74 | ax[1, 0].set_title('K = 200') 75 | ax[1, 0].set_xlabel('X') 76 | ax[1, 0].set_ylabel('Y') 77 | ax[1, 0].scatter(X[:, 0], X[:, 1], marker='o', cmap=plt.cm.spectral, c=acc[2].labels_) 78 | 79 | ax[1, 1].grid() 80 | ax[1, 1].set_title('K = 500') 81 | ax[1, 1].set_xlabel('X') 82 | ax[1, 1].set_ylabel('Y') 83 | ax[1, 1].scatter(X[:, 0], X[:, 1], marker='o', cmap=plt.cm.spectral, c=acc[3].labels_) 84 | plt.show() 85 | 86 | -------------------------------------------------------------------------------- /Section10/1user_based.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_users = 1000 11 | nb_product = 20 12 | 13 | if __name__ == '__main__': 14 | # Create the user dataset 15 | users = np.zeros(shape=(nb_users, 4)) 16 | 17 | for i in range(nb_users): 18 | 
users[i, 0] = np.random.randint(0, 4) 19 | users[i, 1] = np.random.randint(0, 2) 20 | users[i, 2] = np.random.randint(0, 5) 21 | users[i, 3] = np.random.randint(0, 5) 22 | 23 | # Create user-product dataset 24 | user_products = np.random.randint(0, nb_product, size=(nb_users, 5)) 25 | 26 | # Fit k-nearest neighbors 27 | nn = NearestNeighbors(n_neighbors=20, radius=2.0) 28 | nn.fit(users) 29 | 30 | # Create a test user 31 | test_user = np.array([2, 0, 3, 2]) 32 | 33 | # Determine the neighbors 34 | d, neighbors = nn.kneighbors(test_user.reshape(1, -1)) 35 | 36 | print('Neighbors:') 37 | print(neighbors) 38 | 39 | # Determine the suggested products 40 | suggested_products = [] 41 | 42 | for n in neighbors: 43 | for products in user_products[n]: 44 | for product in products: 45 | if product != 0 and product not in suggested_products: 46 | suggested_products.append(product) 47 | 48 | print('Suggested products:') 49 | print(suggested_products) 50 | 51 | 52 | 53 | 
-------------------------------------------------------------------------------- /Section10/2content-based.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_items = 1000 11 | 12 | if __name__ == '__main__': 13 | # Create the item dataset 14 | items = np.zeros(shape=(nb_items, 4)) 15 | 16 | for i in range(nb_items): 17 | items[i, 0] = np.random.randint(0, 100) 18 | items[i, 1] = np.random.randint(0, 100) 19 | items[i, 2] = np.random.randint(0, 100) 20 | items[i, 3] = np.random.randint(0, 100) 21 | 22 | metrics = ['euclidean', 'hamming', 'jaccard'] 23 | 24 | for metric in metrics: 25 | print('Metric: %r' % metric) 26 | 27 | # Fit k-nearest neighbors 28 | nn = NearestNeighbors(n_neighbors=10, radius=5.0, metric=metric) 29 | nn.fit(items) 30 | 31 | # Create a test product 32 | test_product = np.array([15, 60, 28, 73]) 33 | 34 | # Determine the neighbors with different radii 35 | d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=20) 36 | 37 | print('Suggestions (radius=20):') 38 | print(suggestions) 39 | 40 | d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=30) 41 | 42 | print('Suggestions (radius=30):') 43 | print(suggestions) 
-------------------------------------------------------------------------------- /Section10/3memory_based_cf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from scikits.crab.models import MatrixPreferenceDataModel 7 | from scikits.crab.similarities import UserSimilarity 8 | from scikits.crab.metrics import euclidean_distances 9 | from scikits.crab.recommenders.knn import UserBasedRecommender 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | if __name__ == '__main__': 15 | # Define a user-item matrix 16 | user_item_matrix = { 17 | 1: {1: 2, 2: 5, 3: 3}, 18 | 2: {1: 5, 4: 2}, 19 | 3: {2: 3, 4: 5, 3: 2}, 20 | 4: {3: 5, 5: 1}, 21 | 5: {1: 3, 2: 3, 4: 1, 5: 3} 22 | } 23 | 24 | # Build a matrix preference model 25 | model = MatrixPreferenceDataModel(user_item_matrix) 26 | 27 | # Build a similarity matrix 28 | similarity_matrix = UserSimilarity(model, euclidean_distances) 29 | 30 | # Create a recommender 31 | recommender = UserBasedRecommender(model, similarity_matrix, with_preference=True) 32 | 33 | #
Test the recommender for user 2 34 | with warnings.catch_warnings(): 35 | warnings.simplefilter("ignore") 36 | print(recommender.recommend(2)) 37 | -------------------------------------------------------------------------------- /Section10/4model_based_cf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from scipy.linalg import svd 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | # Create a dummy user-item matrix 12 | M = np.random.randint(0, 6, size=(20, 10)) 13 | 14 | print('User-Item matrix:') 15 | print(M) 16 | 17 | # Decompose M 18 | U, s, V = svd(M, full_matrices=True) 19 | S = np.diag(s) 20 | 21 | print('U -> %r' % str(U.shape)) 22 | print('S -> %r' % str(S.shape)) 23 | print('V -> %r' % str(V.shape)) 24 | 25 | # Select the first 8 singular values 26 | Uk = U[:, 0:8] 27 | Sk = S[0:8, 0:8] 28 | Vk = V[0:8, :] 29 | 30 | # Compute the user and product vectors 31 | Su = Uk.dot(np.sqrt(Sk).T) 32 | Si = np.sqrt(Sk).dot(Vk).T 33 | 34 | # Compute the average rating per user 35 | Er = np.mean(M, axis=1) 36 | 37 | # Perform a prediction for the user 5 and item 2 38 | r5_2 = Er[5] + Su[5].dot(Si[2]) 39 | print(r5_2) 40 | 41 | -------------------------------------------------------------------------------- /Section10/5als_spark.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pyspark import SparkContext, SparkConf 4 | from pyspark.mllib.recommendation import Rating 5 | from pyspark.mllib.recommendation import ALS 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_users = 200 11 | nb_products = 100 12 | ratings = [] 13 | 14 | if __name__ == '__main__': 15 | conf = SparkConf().setAppName('ALS').setMaster('local[*]') 16 | sc = SparkContext(conf=conf) 17 | 18 | for _ in range(10): 19 | for i in range(nb_users): 20 | rating = Rating(user=i, product=np.random.randint(1, nb_products), rating=np.random.randint(0, 5)) 21 | ratings.append(rating) 22 | 23 | # Parallelize the ratings 24 | ratings = sc.parallelize(ratings) 25 | 26 | # Train the model 27 | model = ALS.train(ratings, rank=5, iterations=10) 28 | 29 | # Test the model 30 | test = ratings.map(lambda rating: (rating.user, rating.product)) 31 | 32 | predictions = model.predictAll(test) 33 | full_predictions = predictions.map(lambda pred: ((pred.user, pred.product), pred.rating)) 34 | 35 | # Compute MSE 36 | split_ratings = ratings.map(lambda rating: ((rating.user, rating.product), rating.rating)) 37 | joined_predictions = split_ratings.join(full_predictions) 38 | mse = joined_predictions.map(lambda x: (x[1][0] - x[1][1]) ** 2).mean() 39 | 40 | print('MSE: %.3f' % mse) 41 | 42 | # Perform a single prediction 43 | prediction = model.predict(10, 20) 44 | print('Prediction: %.3f' % prediction) 45 | --------------------------------------------------------------------------------
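A note on 3memory_based_cf.py: scikits.crab is an old package that may be difficult to install on recent Python versions. As an assumption-laden alternative, the same user-based idea can be sketched with plain NumPy on the same user-item dictionary. The inverse-distance similarity below is an illustrative choice, not the exact logic of crab's UserBasedRecommender.

import numpy as np

user_item_matrix = {
    1: {1: 2, 2: 5, 3: 3},
    2: {1: 5, 4: 2},
    3: {2: 3, 4: 5, 3: 2},
    4: {3: 5, 5: 1},
    5: {1: 3, 2: 3, 4: 1, 5: 3}
}


def similarity(u, v):
    # Similarity from the Euclidean distance over co-rated items; 0 if no overlap
    common = set(user_item_matrix[u]) & set(user_item_matrix[v])
    if not common:
        return 0.0
    d = np.sqrt(sum((user_item_matrix[u][i] - user_item_matrix[v][i]) ** 2 for i in common))
    return 1.0 / (1.0 + d)


if __name__ == '__main__':
    target = 2
    all_items = {i for ratings in user_item_matrix.values() for i in ratings}
    unrated = all_items - set(user_item_matrix[target])

    # Similarity-weighted average of the other users' ratings for each unrated item
    for item in sorted(unrated):
        num, den = 0.0, 0.0
        for other, ratings in user_item_matrix.items():
            if other == target or item not in ratings:
                continue
            w = similarity(target, other)
            num += w * ratings[item]
            den += w
        if den > 0.0:
            print('Predicted rating for user %d, item %d: %.2f' % (target, item, num / den))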
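Also worth noting for 4model_based_cf.py: the single-entry prediction generalises to the whole user-item matrix with one matrix product. This sketch repeats the decomposition from that script and only adds the broadcasted per-user mean and the product of the user and item factors.

import numpy as np

from scipy.linalg import svd

# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Same dummy user-item matrix as in 4model_based_cf.py
    M = np.random.randint(0, 6, size=(20, 10))

    U, s, V = svd(M, full_matrices=True)
    S = np.diag(s)

    # Keep the first 8 singular values, as in the original script
    Uk, Sk, Vk = U[:, 0:8], S[0:8, 0:8], V[0:8, :]

    Su = Uk.dot(np.sqrt(Sk).T)    # user factors (20 x 8)
    Si = np.sqrt(Sk).dot(Vk).T    # item factors (10 x 8)
    Er = np.mean(M, axis=1)       # average rating per user

    # Predicted ratings for every (user, item) pair, following the same formula
    R_pred = Er[:, np.newaxis] + Su.dot(Si.T)
    print('Prediction matrix shape: %s' % str(R_pred.shape))
    print('Prediction for user 5, item 2: %.3f' % R_pred[5, 2])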