├── Chapter14 ├── vader_sentiment_analysis.py ├── lsa_2.py ├── word2vec.py ├── lsa_1.py ├── lda.py ├── lsa.py ├── sentiment_analysis.py └── plsa.py ├── Chapter13 ├── language_detection.py ├── corpora.py ├── stopwords_removal.py ├── pos_ner.py ├── stemming.py ├── tokenizing.py ├── reuters_text_classifier.py └── vectorizing.py ├── Chapter08 ├── adaboost_2.py ├── adaboost.py ├── random_forest.py ├── decision_tree_2.py ├── gradient_tree_boosting.py ├── decision_tree.py ├── random_forest_2.py ├── voting_classifier.py ├── decision_tree_regression.py └── dt.dot ├── Chapter03 ├── data_normalization.py ├── toy_dataset.py ├── missing_features.py ├── nmf.py ├── dictionary_learning.py ├── tsne.py ├── sparse_pca.py ├── kernel_pca.py ├── feature_selection.py ├── feature_filtering.py ├── whitening.py ├── pca.py ├── fastica.py ├── categorical.py └── data_scaling.py ├── Chapter02 ├── MLE.py ├── resampling.py └── SMOTE.py ├── Chapter07 ├── kernel_svm_1.py ├── kernel_svm_2.py ├── linear_svm.py ├── controlled_svm.py ├── svr.py ├── kernel_svm.py ├── svr_airfoil.py └── s3vm.py ├── Chapter12 ├── model_based_cf.py ├── memory_based_cf.py ├── content-based.py ├── user_based.py └── als_spark.py ├── Chapter17 ├── vectorization.py ├── numpy_cupy.py ├── feature_union.py ├── pipeline.py └── pipeline_2.py ├── Chapter11 ├── dendrogram.py ├── agglomerative_clustering.py └── connectivity_constraints.py ├── LICENSE ├── Chapter06 ├── multinomial.py ├── newsgroups.py ├── discriminant_analysis.py ├── bernoulli.py └── gaussian.py ├── Chapter05 ├── grid_search.py ├── learning_curve.py ├── grid_search_2.py ├── perceptron.py ├── roc_curve.py ├── confusion_matrix.py ├── passive_aggressive_classification.py ├── classification_metrics.py ├── passive_aggressive_regression.py └── logistic_regression.py ├── Chapter10 ├── spectral_clustering_2.py ├── dbscan.py ├── birch.py ├── biclustering.py ├── mini_batch_kmeans.py └── spectral_clustering.py ├── Chapter04 ├── ransac_regression.py ├── huber_regression.py ├── polynomial_regression.py ├── isotonic_regression.py ├── bayesian_regression.py ├── multiple_linear_regression.py ├── 2d_linear_regression.py └── ridge_lasso_elasticnet.py ├── Chapter16 ├── convolution.py ├── gradients.py ├── mlp.py ├── logistic_regression.py ├── dcn.py └── lstm.py ├── Chapter09 ├── k_means.py ├── knn.py ├── k_means_2.py ├── gaussian_mixture.py └── evaluation_metrics.py ├── Chapter15 ├── keras_scikit_learn.py └── mlp.py └── README.md /Chapter14/vader_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 4 | 5 | if __name__ == '__main__': 6 | text = 'This is a very interesting and quite powerful sentiment analyzer' 7 | 8 | vader = SentimentIntensityAnalyzer() 9 | print(vader.polarity_scores(text)) -------------------------------------------------------------------------------- /Chapter13/language_detection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from langdetect import detect, detect_langs 4 | 5 | if __name__ == '__main__': 6 | # Simple language detection 7 | print(detect('This is English')) 8 | print(detect('Dies ist Deutsch')) 9 | 10 | # Probabilistic language detection 11 | print(detect_langs('I really love you mon doux amour!')) -------------------------------------------------------------------------------- /Chapter13/corpora.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.corpus import gutenberg 4 | 5 | if __name__ == '__main__': 6 | # Print all Gutenberg corpus documents 7 | print('Gutenberg corpus files:') 8 | print(gutenberg.fileids()) 9 | 10 | # Print a raw corpus 11 | print(gutenberg.raw('milton-paradise.txt')) 12 | 13 | # Print 2 sentences from a corpus 14 | print(gutenberg.sents('milton-paradise.txt')[0:2]) 15 | 16 | # Print 20 words from a corpus 17 | print(gutenberg.words('milton-paradise.txt')[0:20]) -------------------------------------------------------------------------------- /Chapter08/adaboost_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.ensemble import AdaBoostClassifier 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load dataset 16 | iris = load_iris() 17 | 18 | # Create and train an AdaBoost classifier 19 | ada = AdaBoostClassifier(n_estimators=100, learning_rate=1.0) 20 | ada_scores = cross_val_score(ada, iris.data, iris.target, scoring='accuracy', cv=10) 21 | print('AdaBoost score: %.3f' % ada_scores.mean()) 22 | 23 | -------------------------------------------------------------------------------- /Chapter13/stopwords_removal.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.corpus import stopwords 4 | from nltk.tokenize import RegexpTokenizer 5 | 6 | if __name__ == '__main__': 7 | # Load English stopwords 8 | sw = set(stopwords.words('english')) 9 | 10 | print('English stopwords:') 11 | print(sw) 12 | 13 | # Tokenize and remove stopwords 14 | complex_text = 'This isn\'t a simple text. Count 1, 2, 3 and then go!' 
15 | 16 | ret = RegexpTokenizer('[a-zA-Z\']+') 17 | tokens = ret.tokenize(complex_text) 18 | clean_tokens = [t for t in tokens if t not in sw] 19 | 20 | print('Tokenized and cleaned complex text') 21 | print(clean_tokens) -------------------------------------------------------------------------------- /Chapter13/pos_ner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk import word_tokenize, pos_tag, ne_chunk, tree2conlltags 4 | 5 | 6 | if __name__ == '__main__': 7 | sentence_1 = 'My friend John lives in Paris' 8 | 9 | # Perform a POS tagging 10 | tokens_1 = word_tokenize(sentence_1) 11 | tags_1 = pos_tag(tokens_1) 12 | 13 | print(sentence_1) 14 | print(tags_1) 15 | 16 | # Peform a POS and NER tagging 17 | sentence_2 = 'Search a hotel in Cambridge near the MIT' 18 | 19 | tokens_2 = word_tokenize(sentence_2) 20 | tags_2 = pos_tag(tokens_2) 21 | 22 | print('\n') 23 | print(sentence_2) 24 | print(tree2conlltags(ne_chunk(tags_2))) 25 | 26 | -------------------------------------------------------------------------------- /Chapter03/data_normalization.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import Normalizer 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | # Create a dummy dataset 12 | data = np.array([1.0, 2.0]) 13 | print(data) 14 | 15 | # Max normalization 16 | n_max = Normalizer(norm='max') 17 | nm = n_max.fit_transform(data.reshape(1, -1)) 18 | print(nm) 19 | 20 | # L1 normalization 21 | n_l1 = Normalizer(norm='l1') 22 | nl1 = n_l1.fit_transform(data.reshape(1, -1)) 23 | print(nl1) 24 | 25 | # L2 normalization 26 | n_l2 = Normalizer(norm='l2') 27 | nl2 = n_l2.fit_transform(data.reshape(1, -1)) 28 | print(nl2) -------------------------------------------------------------------------------- /Chapter02/MLE.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from scipy.optimize import minimize 6 | 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | 12 | nb_samples = 100 13 | 14 | # Create the dataset 15 | X_data = np.random.normal(loc=0.0, scale=np.sqrt(2.0), size=nb_samples) 16 | 17 | 18 | def negative_log_likelihood(v): 19 | l = 0.0 20 | f1 = 1.0 / np.sqrt(2.0 * np.pi * v[1]) 21 | f2 = 2.0 * v[1] 22 | 23 | for x in X_data: 24 | l += np.log(f1 * np.exp(-np.square(x - v[0]) / f2)) 25 | 26 | return -l 27 | 28 | 29 | if __name__ == '__main__': 30 | # Create the dataset 31 | res = minimize(fun=negative_log_likelihood, x0=np.array([0.0, 1.0])) 32 | 33 | print(res) 34 | 35 | -------------------------------------------------------------------------------- /Chapter03/toy_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.utils import check_random_state 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load the dataset 16 | boston = load_boston() 17 | X = boston.data 18 | Y = boston.target 19 | 20 | print(X.shape) 21 | print(Y.shape) 22 | 23 | # Create train and test sets 24 | X_train, X_test, Y_train, Y_test 
= train_test_split(X, Y, test_size=0.25, random_state=1000) 25 | 26 | # Use a random state 27 | rs = check_random_state(1000) 28 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=rs) -------------------------------------------------------------------------------- /Chapter03/missing_features.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import Imputer 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | data = np.array([[1, np.nan, 2], [2, 3, np.nan], [-1, 4, 2]]) 12 | print(data) 13 | 14 | # Imputer with mean-strategy 15 | print('Mean strategy') 16 | imp = Imputer(strategy='mean') 17 | print(imp.fit_transform(data)) 18 | 19 | # Imputer with median-strategy 20 | print('Median strategy') 21 | imp = Imputer(strategy='median') 22 | print(imp.fit_transform(data)) 23 | 24 | # Imputer with most-frequent-strategy 25 | print('Most-frequent strategy') 26 | imp = Imputer(strategy='most_frequent') 27 | print(imp.fit_transform(data)) 28 | 29 | -------------------------------------------------------------------------------- /Chapter03/nmf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.decomposition import NMF 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Load iris dataset 13 | iris = load_iris() 14 | print('Irid dataset shape') 15 | print(iris.data.shape) 16 | 17 | # Perform a non-negative matrix factorization 18 | nmf = NMF(n_components=3, init='random', l1_ratio=0.1) 19 | Xt = nmf.fit_transform(iris.data) 20 | 21 | print('Reconstruction error') 22 | print(nmf.reconstruction_err_) 23 | 24 | print('Original Iris sample') 25 | print(iris.data[0]) 26 | 27 | print('Compressed Iris sample (via Non-Negative Matrix Factorization)') 28 | print(Xt[0]) 29 | 30 | print('Rebuilt sample') 31 | print(nmf.inverse_transform(Xt[0])) -------------------------------------------------------------------------------- /Chapter08/adaboost.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import AdaBoostClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | ab_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(AdaBoostClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | ab_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(ab_accuracy) 35 | plt.show() -------------------------------------------------------------------------------- /Chapter07/kernel_svm_1.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | 
from sklearn.datasets import load_digits 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.svm import SVC 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | digits = load_digits() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 23 | 'C': [0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0] 24 | } 25 | ] 26 | 27 | # Create and train a grid search on the SVM classifier 28 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 29 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 30 | gs.fit(digits.data, digits.target) 31 | 32 | print(gs.best_estimator_) 33 | print('Kernel SVM score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /Chapter08/random_forest.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | rf_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(RandomForestClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | rf_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(rf_accuracy) 35 | plt.show() -------------------------------------------------------------------------------- /Chapter03/dictionary_learning.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import DictionaryLearning 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load MNIST digits 14 | digits = load_digits() 15 | 16 | # Perform a dictionary learning (and atom extraction) from the MNIST dataset 17 | dl = DictionaryLearning(n_components=36, fit_algorithm='lars', transform_algorithm='lasso_lars') 18 | X_dict = dl.fit_transform(digits.data) 19 | 20 | # Show the atoms that have been extracted 21 | fig, ax = plt.subplots(6, 6, figsize=(8, 8)) 22 | 23 | samples = [dl.components_[x].reshape((8, 8)) for x in range(36)] 24 | 25 | for i in range(6): 26 | for j in range(6): 27 | ax[i, j].set_axis_off() 28 | ax[i, j].imshow(samples[(i * 6) + j], cmap='gray') 29 | 30 | plt.show() 31 | 32 | -------------------------------------------------------------------------------- /Chapter13/stemming.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.stem.snowball import SnowballStemmer 4 | from nltk.stem.snowball import PorterStemmer 5 | from nltk.stem.lancaster import LancasterStemmer 6 | 7 | if __name__ == '__main__': 8 | print('English Snowball stemming:') 9 | ess = SnowballStemmer('english', ignore_stopwords=True) 10 |
print(ess.stem('flies')) 11 | 12 | print('French Snowball stemming:') 13 | fss = SnowballStemmer('french', ignore_stopwords=True) 14 | print(fss.stem('courais')) 15 | 16 | print('English Snowball stemming:') 17 | print(ess.stem('teeth')) 18 | 19 | print('Porter stemming:') 20 | ps = PorterStemmer() 21 | print(ps.stem('teeth')) 22 | 23 | print('Lancaster stemming:') 24 | ls = LancasterStemmer() 25 | print(ls.stem('teeth')) 26 | 27 | print('Porter stemming:') 28 | print(ps.stem('teen')) 29 | print(ps.stem('teenager')) 30 | 31 | print('Lancaster stemming:') 32 | print(ls.stem('teen')) 33 | print(ls.stem('teenager')) -------------------------------------------------------------------------------- /Chapter03/tsne.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib.cm as cm 5 | import numpy as np 6 | 7 | from sklearn.datasets import load_digits 8 | from sklearn.manifold import TSNE 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load the dataset 17 | digits = load_digits() 18 | X = digits['data'] / np.max(digits['data']) 19 | 20 | # Perform a t-SNE 21 | tsne = TSNE(n_components=2, perplexity=20, random_state=1000) 22 | X_tsne = tsne.fit_transform(X) 23 | 24 | # Plot the t-SNE result 25 | fig, ax = plt.subplots(figsize=(18, 10)) 26 | 27 | for i in range(400): 28 | ax.scatter(X_tsne[:, 0], X_tsne[:, 1], color=cm.rainbow(digits['target'] * 10), marker='o', s=20) 29 | ax.annotate('%d' % digits['target'][i], xy=(X_tsne[i, 0] + 1, X_tsne[i, 1] + 1)) 30 | 31 | ax.set_xlabel(r'$x_0$') 32 | ax.set_ylabel(r'$x_1$') 33 | ax.grid() 34 | 35 | plt.show() -------------------------------------------------------------------------------- /Chapter12/model_based_cf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from scipy.linalg import svd 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | # Create a dummy user-item matrix 12 | M = np.random.randint(0, 6, size=(20, 10)) 13 | 14 | print('User-Item matrix:') 15 | print(M) 16 | 17 | # Decompose M 18 | U, s, V = svd(M, full_matrices=True) 19 | S = np.diag(s) 20 | 21 | print('U -> %r' % str(U.shape)) 22 | print('S -> %r' % str(S.shape)) 23 | print('V -> %r' % str(V.shape)) 24 | 25 | # Select the first 8 singular values 26 | Uk = U[:, 0:8] 27 | Sk = S[0:8, 0:8] 28 | Vk = V[0:8, :] 29 | 30 | # Compute the user and product vectors 31 | Su = Uk.dot(np.sqrt(Sk).T) 32 | Si = np.sqrt(Sk).dot(Vk).T 33 | 34 | # Compute the average rating per user 35 | Er = np.mean(M, axis=1) 36 | 37 | # Perform a prediction for the user 5 and item 2 38 | r5_2 = Er[5] + Su[5].dot(Si[2]) 39 | print(r5_2) 40 | 41 | -------------------------------------------------------------------------------- /Chapter03/sparse_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import SparsePCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load MNIST digits 15 | digits = load_digits() 16 | 17 | # Show some random digits 18 | selection = np.random.randint(0, 1797, 
size=100) 19 | 20 | fig, ax = plt.subplots(10, 10, figsize=(10, 10)) 21 | 22 | samples = [digits.data[x].reshape((8, 8)) for x in selection] 23 | 24 | for i in range(10): 25 | for j in range(10): 26 | ax[i, j].set_axis_off() 27 | ax[i, j].imshow(samples[(i * 10) + j], cmap='gray') 28 | 29 | plt.show() 30 | 31 | # Perform a sparse PCA on the digits dataset 32 | spca = SparsePCA(n_components=60, alpha=0.1) 33 | X_spca = spca.fit_transform(digits.data / 255) 34 | 35 | print('SPCA components shape:') 36 | print(spca.components_.shape) 37 | 38 | 39 | -------------------------------------------------------------------------------- /Chapter17/vectorization.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import time 5 | 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | 11 | size = 500 12 | 13 | 14 | if __name__ == '__main__': 15 | # Create the matrices 16 | A1 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 17 | A2 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 18 | 19 | # Non-vectorized computation 20 | D = np.zeros(shape=(size, size)).astype(np.float32) 21 | 22 | start_time = time.time() 23 | 24 | for i in range(size): 25 | for j in range(size): 26 | d = 0.0 27 | for k in range(size): 28 | d += A1[i, k] * A2[k, j] 29 | D[i, j] = d 30 | 31 | end_time = time.time() 32 | elapsed = end_time - start_time 33 | print(elapsed) 34 | 35 | # Vectorized computation 36 | start_time = time.time() 37 | 38 | D = np.dot(A1, A2) 39 | 40 | end_time = time.time() 41 | elapsed = end_time - start_time 42 | print(elapsed) -------------------------------------------------------------------------------- /Chapter08/decision_tree_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.model_selection import GridSearchCV 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | digits = load_digits() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'criterion': ['gini', 'entropy'], 23 | 'max_features': ['auto', 'log2', None], 24 | 'min_samples_split': [2, 10, 25, 100, 200], 25 | 'max_depth': [5, 10, 15, None] 26 | } 27 | ] 28 | 29 | # Create and train a grid search 30 | gs = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, 31 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 32 | gs.fit(digits.data, digits.target) 33 | 34 | print(gs.best_estimator_) 35 | print('Decision tree score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /Chapter11/dendrogram.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | 8 | from scipy.spatial.distance import pdist 9 | from scipy.cluster.hierarchy import linkage 10 | from scipy.cluster.hierarchy import dendrogram 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 25 16 | 17 | if __name__ == '__main__': 18 | # Create the dataset 19 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=3,
cluster_std=1.5) 20 | 21 | # Show the dataset 22 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 23 | 24 | ax.grid() 25 | ax.set_xlabel('X') 26 | ax.set_ylabel('Y') 27 | 28 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 29 | plt.show() 30 | 31 | # Compute the distance matrix 32 | Xdist = pdist(X, metric='euclidean') 33 | 34 | # Compute the linkage 35 | Xl = linkage(Xdist, method='ward') 36 | 37 | # Compute and show the dendrogram 38 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 39 | Xd = dendrogram(Xl) 40 | plt.show() -------------------------------------------------------------------------------- /Chapter07/kernel_svm_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.svm import SVC 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | # Set a local folder here 15 | olivetti_home = '' 16 | 17 | 18 | if __name__ == '__main__': 19 | # Load dataset 20 | faces = fetch_olivetti_faces(data_home=olivetti_home) 21 | 22 | # Define a param grid 23 | param_grid = [ 24 | { 25 | 'kernel': ['rbf', 'poly'], 26 | 'C': [0.1, 0.5, 1.0, 1.5], 27 | 'degree': [2, 3, 4, 5], 28 | 'gamma': [0.001, 0.01, 0.1, 0.5] 29 | } 30 | ] 31 | 32 | # Create a train grid search on SVM classifier 33 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 34 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 35 | gs.fit(faces.data, faces.target) 36 | 37 | print(gs.best_estimator_) 38 | print('Kernel SVM score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Chapter03/kernel_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_circles 7 | from sklearn.decomposition import KernelPCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Create a dummy dataset 14 | Xb, Yb = make_circles(n_samples=500, factor=0.1, noise=0.05) 15 | 16 | # Show the dataset 17 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 18 | ax.scatter(Xb[:, 0], Xb[:, 1]) 19 | ax.set_xlabel('X') 20 | ax.set_ylabel('Y') 21 | ax.grid() 22 | 23 | plt.show() 24 | 25 | # Perform a kernel PCA (with radial basis function) 26 | kpca = KernelPCA(n_components=2, kernel='rbf', fit_inverse_transform=True, gamma=1.0) 27 | X_kpca = kpca.fit_transform(Xb) 28 | 29 | # Plot the dataset after the kernel PCA 30 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 31 | ax.scatter(kpca.X_transformed_fit_[:, 0], kpca.X_transformed_fit_[:, 1]) 32 | ax.set_xlabel('First component') 33 | ax.set_ylabel('Second component') 34 | ax.grid() 35 | 36 | plt.show() -------------------------------------------------------------------------------- /Chapter06/multinomial.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.feature_extraction import DictVectorizer 6 | from sklearn.naive_bayes import MultinomialNB 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Prepare a dummy dataset 15 | data = [ 16 | {'house': 100, 'street': 50, 'shop': 25, 'car': 100, 'tree': 20}, 17 | {'house': 5, 'street': 5, 'shop': 0, 'car': 10, 'tree': 500, 'river': 1} 18 | ] 19 | 20 | # Create and train a dictionary vectorizer 21 | dv = DictVectorizer(sparse=False) 22 | X = dv.fit_transform(data) 23 | Y = np.array([1, 0]) 24 | 25 | # Create and train a Multinomial Naive Bayes classifier 26 | mnb = MultinomialNB() 27 | mnb.fit(X, Y) 28 | 29 | # Create dummy test data (vectorized with the already fitted DictVectorizer) 30 | test_data = [ 31 | {'house': 80, 'street': 20, 'shop': 15, 'car': 70, 'tree': 10, 'river': 1}, 32 | {'house': 10, 'street': 5, 'shop': 1, 'car': 8, 'tree': 300, 'river': 0} 33 | ] 34 | 35 | Yp = mnb.predict(dv.transform(test_data)) 36 | print(Yp) 37 | -------------------------------------------------------------------------------- /Chapter05/grid_search.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import GridSearchCV, cross_val_score 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | iris = load_iris() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'penalty': ['l1', 'l2'], 23 | 'C': [0.5, 1.0, 1.5, 1.8, 2.0, 2.5] 24 | } 25 | ] 26 | 27 | # Create and train a grid search 28 | gs = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, 29 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 30 | gs.fit(iris.data, iris.target) 31 | 32 | # Best estimator 33 | print(gs.best_estimator_) 34 | 35 | gs_scores
= cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10) 36 | print('Best estimator CV average score: %.3f' % gs_scores.mean()) 37 | 38 | -------------------------------------------------------------------------------- /Chapter08/gradient_tree_boosting.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.ensemble import GradientBoostingClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | if __name__ == '__main__': 16 | # Create the dataset 17 | X, Y = make_classification(n_samples=nb_samples, n_features=4, n_informative=3, n_redundant=1, n_classes=3) 18 | 19 | # Collect the scores for n_estimators in (1, 50) 20 | a = [] 21 | max_estimators = 50 22 | 23 | for i in range(1, max_estimators): 24 | score = cross_val_score(GradientBoostingClassifier(n_estimators=i, learning_rate=10.0 / float(i)), X, Y, 25 | cv=10, scoring='accuracy').mean() 26 | a.append(score) 27 | 28 | # Plot the results 29 | plt.figure(figsize=(30, 25)) 30 | plt.xlabel('Number of estimators') 31 | plt.ylabel('Average CV accuracy') 32 | plt.grid(True) 33 | plt.plot(a) 34 | plt.show() -------------------------------------------------------------------------------- /Chapter14/lsa_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import brown 6 | 7 | from sklearn.decomposition import TruncatedSVD 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Compose a corpus 17 | sentences = brown.sents(categories=['news', 'fiction']) 18 | corpus = [] 19 | 20 | for s in sentences: 21 | corpus.append(' '.join(s)) 22 | 23 | # Vectorize the corpus 24 | vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True) 25 | Xc = vectorizer.fit_transform(corpus) 26 | 27 | rank = 2 28 | 29 | # Perform a truncated SVD 30 | tsvd = TruncatedSVD(n_components=rank) 31 | Xt = tsvd.fit_transform(Xc) 32 | 33 | # Check the top-10 words per topic (sort each topic row in descending order) 34 | Mwts = np.argsort(tsvd.components_, axis=1)[:, ::-1] 35 | 36 | for t in range(rank): 37 | print('\nTopic ' + str(t)) 38 | for i in range(10): 39 | print(vectorizer.get_feature_names()[Mwts[t, i]]) -------------------------------------------------------------------------------- /Chapter05/learning_curve.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_wine 7 | from sklearn.model_selection import learning_curve 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.utils import shuffle 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | if __name__ == '__main__': 17 | # Load the dataset 18 | wine = load_wine() 19 | 20 | # Shuffle the dataset and compute the learning curves 21 | X, Y = shuffle(wine['data'], wine['target']) 22 | tsize, training_score, test_score = learning_curve(LogisticRegression(), X, Y, cv=20, random_state=1000) 23 | 24 | # 
Show the learning curve 25 | avg_tr_scores = np.mean(training_score, axis=1) 26 | avg_test_scores = np.mean(test_score, axis=1) 27 | 28 | fig, ax = plt.subplots(figsize=(15, 8)) 29 | 30 | ax.plot(tsize, avg_tr_scores, label='Training score') 31 | ax.plot(tsize, avg_test_scores, label='CV score') 32 | ax.set_xlabel('Number of samples') 33 | ax.set_ylabel('Accuracy') 34 | ax.legend() 35 | ax.grid() 36 | 37 | plt.show() -------------------------------------------------------------------------------- /Chapter08/decision_tree.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | # Set a folder to store the graph in 16 | graph_folder = './dt.dot' 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create dataset 21 | X, Y = make_classification(n_samples=nb_samples, n_features=3, n_informative=3, n_redundant=0, n_classes=3, 22 | n_clusters_per_class=1) 23 | 24 | # Create a Decision tree classifier 25 | dt = DecisionTreeClassifier() 26 | dt_scores = cross_val_score(dt, X, Y, scoring='accuracy', cv=10) 27 | print('Decision tree score: %.3f' % dt_scores.mean()) 28 | 29 | # Save in Graphviz format 30 | dt.fit(X, Y) 31 | 32 | with open(graph_folder, 'w') as df: 33 | df = export_graphviz(dt, out_file=df, 34 | feature_names=['A', 'B', 'C'], 35 | class_names=['C1', 'C2', 'C3']) -------------------------------------------------------------------------------- /Chapter03/feature_selection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.feature_selection import VarianceThreshold 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Create a dummy dataset 13 | X = np.ndarray(shape=(100, 3)) 14 | 15 | X[:, 0] = np.random.normal(0.0, 5.0, size=100) 16 | X[:, 1] = np.random.normal(0.5, 5.0, size=100) 17 | X[:, 2] = np.random.normal(1.0, 0.5, size=100) 18 | 19 | # Show the dataset 20 | fig, ax = plt.subplots(1, 1, figsize=(12, 8)) 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | ax.plot(X[:, 0], label='STD = 5.0') 26 | ax.plot(X[:, 1], label='STD = 5.0') 27 | ax.plot(X[:, 2], label='STD = 0.5') 28 | 29 | plt.legend() 30 | plt.show() 31 | 32 | # Impose a variance threshold 33 | print('Samples before variance thresholding') 34 | print(X[0:3, :]) 35 | 36 | vt = VarianceThreshold(threshold=1.5) 37 | X_t = vt.fit_transform(X) 38 | 39 | # After the filter has removed the componenents 40 | print('Samples after variance thresholding') 41 | print(X_t[0:3, :]) -------------------------------------------------------------------------------- /Chapter12/memory_based_cf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from scikits.crab.models import MatrixPreferenceDataModel 7 | from scikits.crab.similarities import UserSimilarity 8 | from scikits.crab.metrics import euclidean_distances 9 | from scikits.crab.recommenders.knn import UserBasedRecommender 10 | 11 | # For reproducibility 12 | 
np.random.seed(1000) 13 | 14 | if __name__ == '__main__': 15 | # Define a user-item matrix 16 | user_item_matrix = { 17 | 1: {1: 2, 2: 5, 3: 3}, 18 | 2: {1: 5, 4: 2}, 19 | 3: {2: 3, 4: 5, 3: 2}, 20 | 4: {3: 5, 5: 1}, 21 | 5: {1: 3, 2: 3, 4: 1, 5: 3} 22 | } 23 | 24 | # Build a matrix preference model 25 | model = MatrixPreferenceDataModel(user_item_matrix) 26 | 27 | # Build a similarity matrix 28 | similarity_matrix = UserSimilarity(model, euclidean_distances) 29 | 30 | # Create a recommender 31 | recommender = UserBasedRecommender(model, similarity_matrix, with_preference=True) 32 | 33 | # Test the recommender for user 2 34 | with warnings.catch_warnings(): 35 | warnings.simplefilter("ignore") 36 | print(recommender.recommend(2)) 37 | -------------------------------------------------------------------------------- /Chapter10/spectral_clustering_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_moons 7 | from sklearn.cluster import SpectralClustering 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | if __name__ == '__main__': 17 | # Create dataset 18 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 19 | 20 | # Try different gammas with a RBF affinity 21 | Yss = [] 22 | gammas = np.linspace(0, 12, 4) 23 | 24 | for gamma in gammas: 25 | sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma) 26 | Yss.append(sc.fit_predict(X)) 27 | 28 | # Show data 29 | fig, ax = plt.subplots(1, 4, figsize=(30, 10), sharey=True) 30 | 31 | for x in range(4): 32 | ax[x].grid() 33 | ax[x].set_title('Gamma = %.0f' % gammas[x]) 34 | 35 | for i in range(nb_samples): 36 | c = Yss[x][i] 37 | 38 | if c == 0: 39 | ax[x].scatter(X[i, 0], X[i, 1], marker='o', color='r') 40 | else: 41 | ax[x].scatter(X[i, 0], X[i, 1], marker='^', color='b') 42 | 43 | plt.show() -------------------------------------------------------------------------------- /Chapter03/feature_filtering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston, load_iris 6 | from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_regression 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Load Boston data 13 | regr_data = load_boston() 14 | print('Boston data shape') 15 | print(regr_data.data.shape) 16 | 17 | # Select the best k features with regression test 18 | kb_regr = SelectKBest(f_regression) 19 | X_b = kb_regr.fit_transform(regr_data.data, regr_data.target) 20 | print('K-Best-filtered Boston dataset shape') 21 | print(X_b.shape) 22 | print('K-Best scores') 23 | print(kb_regr.scores_) 24 | 25 | # Load iris data 26 | class_data = load_iris() 27 | print('Iris dataset shape') 28 | print(class_data.data.shape) 29 | 30 | # Select the best k features using Chi^2 classification test 31 | perc_class = SelectPercentile(chi2, percentile=15) 32 | X_p = perc_class.fit_transform(class_data.data, class_data.target) 33 | print('Chi2-filtered Iris dataset shape') 34 | print(X_p.shape) 35 | print('Chi2 scores') 36 | print(perc_class.scores_) 37 | 38 | -------------------------------------------------------------------------------- /Chapter17/numpy_cupy.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | # For further information, please visit https://cupy.chainer.org/ 5 | import cupy as cp 6 | import time 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | cp.random.seed(1000) 12 | 13 | size = 5000 14 | 15 | 16 | if __name__ == '__main__': 17 | # Create the matrices using NumPy 18 | A1 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 19 | A2 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 20 | 21 | # Perform the measurement using NumPy 22 | Ad = A1.copy() 23 | 24 | start_time = time.time() 25 | 26 | for _ in range(100): 27 | Ad = np.dot(Ad, A2) 28 | 29 | end_time = time.time() 30 | elapsed = end_time - start_time 31 | print(elapsed) 32 | 33 | # Create the matrices using CuPy 34 | B1 = cp.random.normal(0.0, 2.0, size=(size, size)) 35 | B2 = cp.random.normal(0.0, 2.0, size=(size, size)) 36 | 37 | # Perform the measurement using CuPy with GPU support 38 | Bd = B1.copy() 39 | 40 | start_time = time.time() 41 | 42 | for _ in range(100): 43 | Bd = cp.dot(Bd, B2) 44 | 45 | end_time = time.time() 46 | elapsed = end_time - start_time 47 | print(elapsed) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Chapter17/feature_union.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA 8 | from sklearn.feature_selection import SelectKBest, f_classif 9 | from sklearn.model_selection import cross_val_score 10 | from sklearn.pipeline import Pipeline, FeatureUnion 11 | from sklearn.preprocessing import StandardScaler 12 | from sklearn.svm import SVC 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | if __name__ == '__main__': 19 | warnings.simplefilter("ignore") 20 | 21 | # Load the dataset 22 | digits = load_digits() 23 | 24 | # Create the steps for a feature union 25 | steps_fu = [ 26 | ('pca', PCA(n_components=10)), 27 | ('kbest', SelectKBest(f_classif, k=5)), 28 | ] 29 | 30 | # Create the steps for the pipeline 31 | fu = FeatureUnion(steps_fu) 32 | scaler = StandardScaler() 33 | svc = SVC(kernel='rbf', C=5.0, gamma=0.05) 34 | 35 | pipeline_steps = [ 36 | ('fu', fu), 37 | ('scaler', scaler), 38 | ('classifier', svc) 39 | ] 40 | 41 | pipeline = Pipeline(pipeline_steps) 42 | 43 | print('Cross-validation score:') 44 | print(cross_val_score(pipeline, digits.data, digits.target, cv=10).mean()) -------------------------------------------------------------------------------- /Chapter05/grid_search_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import GridSearchCV, cross_val_score 8 | from sklearn.linear_model import SGDClassifier 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | if __name__ == '__main__': 15 | # Load dataset 16 | iris = load_iris() 17 | 18 | # Define a param grid 19 | param_grid = [ 20 | { 21 | 'penalty': ['l1', 'l2', 'elasticnet'], 22 | 'alpha': [1e-5, 1e-4, 5e-4, 1e-3, 2.3e-3, 5e-3, 1e-2], 23 | 'l1_ratio': [0.01, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 0.8] 24 | } 25 | ] 26 | 27 | 
# Create SGD classifier 28 | sgd = SGDClassifier(loss='perceptron', learning_rate='optimal') 29 | 30 | # Create and train a grid search 31 | gs = GridSearchCV(estimator=sgd, param_grid=param_grid, scoring='accuracy', cv=10, 32 | n_jobs=multiprocessing.cpu_count()) 33 | gs.fit(iris.data, iris.target) 34 | 35 | # Best estimator 36 | print(gs.best_estimator_) 37 | 38 | gs_scores = cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10) 39 | print('Best estimator CV average score: %.3f' % gs_scores.mean()) -------------------------------------------------------------------------------- /Chapter07/linear_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.svm import SVC 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 36 | n_clusters_per_class=1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Create a SVM with linear kernel 42 | svc = SVC(kernel='linear') 43 | 44 | # Compute CV score 45 | svc_scores = cross_val_score(svc, X, Y, scoring='accuracy', cv=10) 46 | print('Linear SVM CV average score: %.3f' % svc_scores.mean()) 47 | 48 | -------------------------------------------------------------------------------- /Chapter14/word2vec.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from nltk.corpus import brown 7 | from nltk.corpus import stopwords 8 | 9 | # Install Gensim using: pip install -U gensim 10 | # Further information: https://radimrehurek.com/gensim/ 11 | from gensim.models import Word2Vec 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | if __name__ == '__main__': 19 | sw = set(stopwords.words('english')) 20 | 21 | # Prepare the corpus 22 | brown_corpus = brown.sents() 23 | 24 | corpus = [] 25 | 26 | for sent in brown_corpus: 27 | c_sent = [w.strip().lower() for w in sent if w.strip().lower() not in sw] 28 | corpus.append(c_sent) 29 | 30 | # Train the Word2Vec model 31 | # A UserWarning: detected Windows; can be discarded 32 | model = Word2Vec(corpus, size=300, window=10, min_count=1, workers=multiprocessing.cpu_count()) 33 | wv = model.wv 34 | del model 35 | 36 | # Show a feature vector 37 | print(wv['committee']) 38 | 39 | print('\n') 40 | 41 | # Show the words most similar to "house" 42 | print(wv.most_similar('house')) 43 | 44 | print('\n') 45 | 46 | # Show the similarity between "committee" and "president" 47 | print(wv.similarity('committee', 'president')) -------------------------------------------------------------------------------- /Chapter04/ransac_regression.py: -------------------------------------------------------------------------------- 1 
| from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression, RANSACRegressor 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 200 13 | nb_noise_samples = 150 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.scatter(X, Y) 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | ax.grid() 23 | 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Create dataset 29 | X = np.arange(-5, 5, 0.05) 30 | 31 | Y = X + 2 32 | Y += np.random.uniform(-0.5, 0.5, size=nb_samples) 33 | 34 | for i in range(nb_noise_samples, nb_samples): 35 | Y[i] += np.random.uniform(12, 15) 36 | 37 | # Show the dataset 38 | show_dataset(X, Y) 39 | 40 | # Create a linear regressor 41 | lr = LinearRegression(normalize=True) 42 | lr.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 43 | print('Standard regressor: y = %.3fx + %.3f' % (lr.coef_, lr.intercept_)) 44 | 45 | # Create RANSAC regressor 46 | rs = RANSACRegressor(lr) 47 | rs.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 48 | print('RANSAC regressor: y = %.3fx + %.3f' % (rs.estimator_.coef_, rs.estimator_.intercept_)) -------------------------------------------------------------------------------- /Chapter12/content-based.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_items = 1000 11 | 12 | if __name__ == '__main__': 13 | # Create the item dataset 14 | items = np.zeros(shape=(nb_items, 4)) 15 | 16 | for i in range(nb_items): 17 | items[i, 0] = np.random.randint(0, 100) 18 | items[i, 1] = np.random.randint(0, 100) 19 | items[i, 2] = np.random.randint(0, 100) 20 | items[i, 3] = np.random.randint(0, 100) 21 | 22 | metrics = ['euclidean', 'hamming', 'jaccard'] 23 | 24 | for metric in metrics: 25 | print('Metric: %r' % metric) 26 | 27 | # Fit k-nearest neighbors 28 | nn = NearestNeighbors(n_neighbors=10, radius=5.0, metric=metric) 29 | nn.fit(items) 30 | 31 | # Create a test product 32 | test_product = np.array([15, 60, 28, 73]) 33 | 34 | # Determine the neighbors with different radii 35 | d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=20) 36 | 37 | print('Suggestions (radius=20):') 38 | print(suggestions) 39 | 40 | d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=30) 41 | 42 | print('Suggestions (radius=30):') 43 | print(suggestions) -------------------------------------------------------------------------------- /Chapter08/random_forest_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | rf_accuracy = [] 23 | et_accuracy = [] 24 | 25 | for i in range(1, nb_classifications): 26 | a = cross_val_score(RandomForestClassifier(n_estimators=i), digits.data, digits.target,
scoring='accuracy', 27 | cv=10).mean() 28 | rf_accuracy.append(a) 29 | 30 | b = cross_val_score(ExtraTreesClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 31 | cv=10).mean() 32 | et_accuracy.append(b) 33 | 34 | # Show results 35 | plt.figure(figsize=(30, 25)) 36 | plt.xlabel('Number of trees') 37 | plt.ylabel('Accuracy') 38 | plt.grid(True) 39 | plt.plot(rf_accuracy, color='blue', label='Random Forest') 40 | plt.plot(et_accuracy, color='red', label='Extra Random Forest') 41 | plt.legend(loc="lower right") 42 | plt.show() -------------------------------------------------------------------------------- /Chapter07/controlled_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.svm import SVC, NuSVC 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Create dataset 34 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 35 | n_clusters_per_class=1) 36 | 37 | # Show dataset 38 | show_dataset(X, Y) 39 | 40 | # Create and train a linear SVM 41 | svc = SVC(kernel='linear') 42 | svc.fit(X, Y) 43 | print('Number of support vectors: %d' % len(svc.support_vectors_)) 44 | 45 | # Create and train a Nu-SVM classifier 46 | nusvc = NuSVC(kernel='linear', nu=0.05) 47 | nusvc.fit(X, Y) 48 | print('Number of support vectors (nu=0.05): %d' % len(nusvc.support_vectors_)) -------------------------------------------------------------------------------- /Chapter16/convolution.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from scipy.misc import face 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load the image 14 | img = face(gray=True) 15 | 16 | # Show the original image 17 | plt.imshow(img, cmap='gray') 18 | plt.show() 19 | 20 | # Define the kernel 21 | kernel = np.array( 22 | [[0, 1, 0], 23 | [1, -4, 0], 24 | [0, 1, 0]], 25 | dtype=np.float32) 26 | 27 | cfilter = np.zeros((3, 3, 1, 1), dtype=np.float32) 28 | cfilter[:, :, 0, 0] = kernel 29 | 30 | # Create the graph 31 | graph = tf.Graph() 32 | 33 | with graph.as_default(): 34 | x = tf.placeholder(tf.float32, shape=(None, 768, 1024, 1), name='image') 35 | f = tf.constant(cfilter) 36 | 37 | # tf.nn.conv2d expects the uppercase padding value 'SAME' (or 'VALID') 38 | y = tf.nn.conv2d(x, f, strides=[1, 1, 1, 1], padding='SAME') 39 | 40 | session = tf.InteractiveSession(graph=graph) 41 | 42 | # Compute the convolution 43 | c_img = session.run([y], feed_dict={x: img.reshape((1, 768, 1024, 1))}) 44 | n_img = np.array(c_img).reshape((768, 1024)) 45 | 46 | # Show the final image 47 | plt.imshow(n_img, cmap='gray') 48 | plt.show() 49 | 50 | 51 | 52 | --------------------------------------------------------------------------------
/Chapter02/resampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.utils import resample 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_samples = 1000 15 | weights = (0.95, 0.05) 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create an unbalanced dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, weights=weights, random_state=1000) 21 | 22 | # Show the shapes 23 | print(X[Y == 0].shape) 24 | print(X[Y == 1].shape) 25 | 26 | # Show the dataset 27 | fig, ax = plt.subplots(figsize=(10, 8)) 28 | 29 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 1') 30 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 2') 31 | ax.set_xlabel(r'$x_0$') 32 | ax.set_ylabel(r'$x_1$') 33 | ax.set_title('Unbalanced dataset') 34 | ax.legend() 35 | ax.grid() 36 | 37 | plt.show() 38 | 39 | # Resample the dataset 40 | X_1_resampled = resample(X[Y == 1], n_samples=X[Y == 0].shape[0], random_state=1000) 41 | 42 | Xu = np.concatenate((X[Y == 0], X_1_resampled)) 43 | Yu = np.concatenate((Y[Y == 0], np.ones(shape=(X[Y == 0].shape[0],), dtype=np.int32))) 44 | 45 | # Show the new shapes 46 | print(Xu[Yu == 0].shape) 47 | print(Xu[Yu == 1].shape) 48 | 49 | -------------------------------------------------------------------------------- /Chapter04/huber_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression, HuberRegressor 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 500 13 | nb_noise_samples = 50 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.scatter(X, Y) 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | ax.grid() 23 | 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Create dataset 29 | X = np.arange(-5, 5, 10.0 / float(nb_samples)) 30 | 31 | Y = X + 2 32 | Y += np.random.uniform(-0.5, 0.5, size=nb_samples) 33 | 34 | noisy_samples = np.random.choice(np.arange(0, nb_samples), size=nb_noise_samples, replace=False) 35 | 36 | for i in noisy_samples: 37 | Y[i] += np.random.uniform(0, 10.0) 38 | 39 | # Show the dataset 40 | show_dataset(X, Y) 41 | 42 | # Create a linear regressor 43 | lr = LinearRegression(normalize=True) 44 | lr.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 45 | print('Standard regressor: y = %.3fx + %.3f' % (lr.coef_, lr.intercept_)) 46 | 47 | # Create a Huber regressor 48 | hr = HuberRegressor(epsilon=1.25) 49 | hr.fit(X.reshape(-1, 1), Y) 50 | print('Huber regressor: y = %.3fx + %.3f' % (hr.coef_, hr.intercept_)) -------------------------------------------------------------------------------- /Chapter05/perceptron.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.linear_model import SGDClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = 
plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 36 | n_clusters_per_class=1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Create perceptron as SGD instance 42 | # The same result can be obtained using directly the class sklearn.linear_model.Perceptron 43 | sgd = SGDClassifier(loss='perceptron', learning_rate='optimal', n_iter=10) 44 | sgd_scores = cross_val_score(sgd, X, Y, scoring='accuracy', cv=10) 45 | print('Perceptron CV average score: %.3f' % sgd_scores.mean()) 46 | 47 | -------------------------------------------------------------------------------- /Chapter05/roc_curve.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.metrics import roc_curve, auc 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 21 | n_clusters_per_class=1) 22 | 23 | # Split dataset 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 25 | 26 | #Create and train logistic regressor 27 | lr = LogisticRegression() 28 | lr.fit(X_train, Y_train) 29 | 30 | # Compute ROC curve 31 | Y_score = lr.decision_function(X_test) 32 | fpr, tpr, thresholds = roc_curve(Y_test, Y_score) 33 | 34 | plt.figure(figsize=(30, 25)) 35 | 36 | plt.plot(fpr, tpr, color='red', label='Logistic regression (AUC: %.2f)' % auc(fpr, tpr)) 37 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 38 | plt.xlim([0.0, 1.0]) 39 | plt.ylim([0.0, 1.01]) 40 | plt.title('ROC Curve') 41 | plt.xlabel('False Positive Rate') 42 | plt.ylabel('True Positive Rate') 43 | plt.legend(loc="lower right") 44 | 45 | plt.show() -------------------------------------------------------------------------------- /Chapter04/polynomial_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 200 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.scatter(X, Y) 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | ax.grid() 24 | 25 | plt.show() 26 | 27 | 28 | if __name__ == '__main__': 29 | # Create dataset 30 | X = np.arange(-5, 5, 0.05) 31 | 32 | Y = X + 2 33 | Y += X**2 + np.random.uniform(-0.5, 0.5, size=nb_samples) 34 | 35 | # Show the dataset 36 | show_dataset(X, Y) 37 | 38 | # Split dataset 39 | X_train, X_test, Y_train, Y_test = 
train_test_split(X.reshape(-1, 1), Y.reshape(-1, 1), test_size=0.25) 40 | 41 | lr = LinearRegression(normalize=True) 42 | lr.fit(X_train, Y_train) 43 | print('Linear regression score: %.3f' % lr.score(X_train, Y_train)) 44 | 45 | # Create polynomial features 46 | pf = PolynomialFeatures(degree=2) 47 | X_train = pf.fit_transform(X_train) 48 | X_test = pf.fit_transform(X_test) 49 | 50 | lr.fit(X_train, Y_train) 51 | print('Second degree polynomial regression score: %.3f' % lr.score(X_train, Y_train)) -------------------------------------------------------------------------------- /Chapter16/gradients.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_points = 100 11 | 12 | if __name__ == '__main__': 13 | # Create the dataset 14 | X = np.linspace(-nb_points, nb_points, 200, dtype=np.float32) 15 | 16 | # Plot the dataset 17 | fig, ax = plt.subplots(figsize=(8, 6)) 18 | ax.plot(X, X) 19 | ax.grid() 20 | plt.show() 21 | 22 | # Create the graph 23 | graph = tf.Graph() 24 | 25 | with graph.as_default(): 26 | Xt = tf.placeholder(tf.float32, shape=(None, 1), name='x') 27 | Y = tf.pow(Xt, 3.0, name='x_3') 28 | Yd = tf.gradients(Y, Xt, name='dx') 29 | Yd2 = tf.gradients(Yd, Xt, name='d2x') 30 | 31 | session = tf.InteractiveSession(graph=graph) 32 | 33 | # Compute the gradients 34 | X2, dX, d2X = session.run([Y, Yd, Yd2], feed_dict={Xt: X.reshape((nb_points * 2, 1))}) 35 | 36 | # Plot the gradients 37 | fig, ax = plt.subplots(1, 3, figsize=(20, 5)) 38 | 39 | ax[0].plot(X, X2) 40 | ax[0].grid() 41 | ax[0].set_xlabel('x') 42 | ax[0].set_ylabel(r'$x^2$') 43 | 44 | ax[1].plot(X, dX[0]) 45 | ax[1].grid() 46 | ax[1].set_xlabel('x') 47 | ax[1].set_ylabel(r'$dx/dy$') 48 | 49 | ax[2].plot(X, d2X[0]) 50 | ax[2].grid() 51 | ax[2].set_xlabel('x') 52 | ax[2].set_ylabel(r'$d^2x/dy^2$') 53 | 54 | plt.show() -------------------------------------------------------------------------------- /Chapter17/pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.decomposition import PCA 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.svm import SVC 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create the dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_informative=15, n_redundant=5, n_classes=2) 21 | 22 | # Create the steps for the pipeline 23 | pca = PCA(n_components=10) 24 | scaler = StandardScaler() 25 | svc = SVC(kernel='poly', gamma=3) 26 | 27 | steps = [ 28 | ('pca', pca), 29 | ('scaler', scaler), 30 | ('classifier', svc) 31 | ] 32 | 33 | # Create the pipeline 34 | pipeline = Pipeline(steps) 35 | 36 | # Perform a grid search 37 | param_grid = { 38 | 'pca__n_components': [5, 10, 12, 15, 18, 20], 39 | 'classifier__kernel': ['rbf', 'poly'], 40 | 'classifier__gamma': [0.05, 0.1, 0.2, 0.5], 41 | 'classifier__degree': [2, 3, 5] 42 | } 43 | 44 | gs = GridSearchCV(pipeline, param_grid) 45 | gs.fit(X, Y) 46 | 47 | print('Best estimator:') 48 | print(gs.best_estimator_) 49 | 50 | print('Best score:') 51 | 
print(gs.best_score_) 52 | -------------------------------------------------------------------------------- /Chapter07/svr.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.svm import SVR 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 50 14 | 15 | 16 | def show_dataset(X, Y, Y_pred=None): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | ax.scatter(X, Y) 24 | 25 | if Y_pred is not None: 26 | ax.plot(X, Y_pred, c='r') 27 | 28 | plt.show() 29 | 30 | 31 | if __name__ == '__main__': 32 | # Create dataset 33 | X = np.arange(-nb_samples, nb_samples, 1) 34 | Y = np.zeros(shape=(2 * nb_samples,)) 35 | 36 | for x in X: 37 | Y[int(x) + nb_samples] = np.power(x * 6, 2.0) / 1e4 + np.random.uniform(-2, 2) 38 | 39 | # Show dataset 40 | show_dataset(X, Y) 41 | 42 | # Create and train a Support Vector regressor 43 | svr = SVR(kernel='poly', degree=2, C=1.5, epsilon=0.5) 44 | svr_scores = cross_val_score(svr, X.reshape((nb_samples*2, 1)), Y, scoring='neg_mean_squared_error', cv=10) 45 | print('SVR CV average negative squared error: %.3f' % svr_scores.mean()) 46 | 47 | # Fit the model 48 | svr.fit(X.reshape(-1, 1), Y.ravel()) 49 | Y_pred = svr.predict(X.reshape(-1, 1)) 50 | 51 | # Show the dataset together with the prediction 52 | show_dataset(X, Y, Y_pred) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Chapter06/newsgroups.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | from sklearn.datasets import fetch_20newsgroups_vectorized 8 | from sklearn.naive_bayes import MultinomialNB 9 | from sklearn.metrics import confusion_matrix 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | def plot_confusion_matrix(Y_test, Y_pred, targets): 17 | cmatrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred) 18 | cm_fig, cm_ax = plt.subplots(figsize=(12, 12)) 19 | cm_ax.matshow(cmatrix, cmap=cm.GnBu) 20 | 21 | x = y = np.arange(0, len(targets)) 22 | plt.xticks(x, targets, rotation='vertical') 23 | plt.yticks(y, targets) 24 | 25 | for i in range(len(targets)): 26 | for j in range(len(targets)): 27 | cm_ax.text(x=j, y=i, s=cmatrix[i, j], va='center', ha='center', size='x-large') 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Load the dataset 34 | train_data = fetch_20newsgroups_vectorized(subset='train') 35 | test_data = fetch_20newsgroups_vectorized(subset='test') 36 | 37 | # Create and train the model 38 | mnb = MultinomialNB(alpha=0.01) 39 | mnb.fit(train_data['data'], train_data['target']) 40 | 41 | print(mnb.score(test_data['data'], test_data['target'])) 42 | 43 | # Plot the confusion matrix 44 | plot_confusion_matrix(test_data['target'], mnb.predict(test_data['data']), list(test_data['target_names'])) -------------------------------------------------------------------------------- /Chapter12/user_based.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | 
# For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_users = 1000 11 | nb_product = 20 12 | 13 | if __name__ == '__main__': 14 | # Create the user dataset 15 | users = np.zeros(shape=(nb_users, 4)) 16 | 17 | for i in range(nb_users): 18 | users[i, 0] = np.random.randint(0, 4) 19 | users[i, 1] = np.random.randint(0, 2) 20 | users[i, 2] = np.random.randint(0, 5) 21 | users[i, 3] = np.random.randint(0, 5) 22 | 23 | # Create user-product dataset 24 | user_products = np.random.randint(0, nb_product, size=(nb_users, 5)) 25 | 26 | # Fit k-nearest neighbors 27 | nn = NearestNeighbors(n_neighbors=20, radius=2.0) 28 | nn.fit(users) 29 | 30 | # Create a test user 31 | test_user = np.array([2, 0, 3, 2]) 32 | 33 | # Determine the neighbors 34 | d, neighbors = nn.kneighbors(test_user.reshape(1, -1)) 35 | 36 | print('Neighbors:') 37 | print(neighbors) 38 | 39 | # Determine the suggested products 40 | suggested_products = [] 41 | 42 | for n in neighbors: 43 | for products in user_products[n]: 44 | for product in products: 45 | if product != 0 and product not in suggested_products: 46 | suggested_products.append(product) 47 | 48 | print('Suggested products:') 49 | print(suggested_products) 50 | 51 | 52 | 53 | --------------------------------------------------------------------------------
/Chapter05/confusion_matrix.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | 6 | import numpy as np 7 | 8 | from sklearn.datasets import load_wine 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import confusion_matrix 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | def plot_confusion_matrix(Y_test, Y_pred, targets): 19 | cmatrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred) 20 | cm_fig, cm_ax = plt.subplots(figsize=(8.0, 8.0)) 21 | cm_ax.matshow(cmatrix, cmap=cm.GnBu) 22 | 23 | cm_ax.set_xticklabels([''] + targets) 24 | cm_ax.set_yticklabels([''] + targets) 25 | 26 | for i in range(len(targets)): 27 | for j in range(len(targets)): 28 | cm_ax.text(x=j, y=i, s=cmatrix[i, j], va='center', ha='center', size='x-large') 29 | 30 | plt.title('Confusion matrix') 31 | plt.show() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Load the dataset 36 | wine = load_wine() 37 | 38 | # Split the dataset 39 | X_train, X_test, Y_train, Y_test = train_test_split(wine['data'], wine['target'], test_size=0.25) 40 | 41 | # Train the model 42 | lr = LogisticRegression() 43 | lr.fit(X_train, Y_train) 44 | 45 | # Plot the confusion matrix (ground truth first, predictions second) 46 | targets = list(wine['target_names']) 47 | plot_confusion_matrix(Y_test, lr.predict(X_test), targets) 48 | 49 | 50 | 51 | --------------------------------------------------------------------------------
/Chapter10/dbscan.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_moons 7 | from sklearn.cluster import DBSCAN 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], 
marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | def show_clustered_dataset(X, Y): 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.grid() 36 | ax.set_xlabel('X') 37 | ax.set_ylabel('Y') 38 | 39 | for i in range(nb_samples): 40 | if Y[i] == 0: 41 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 42 | else: 43 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 44 | 45 | plt.show() 46 | 47 | 48 | if __name__ == '__main__': 49 | # Create dataset 50 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 51 | 52 | # Show dataset 53 | show_dataset(X, Y) 54 | 55 | # Create and train DBSCAN 56 | dbs = DBSCAN(eps=0.1) 57 | Y = dbs.fit_predict(X) 58 | 59 | # Show clustered dataset 60 | show_clustered_dataset(X, Y) 61 | 62 | -------------------------------------------------------------------------------- /Chapter06/discriminant_analysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | # Total number of samples 16 | nb_samples = 1000 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=2, cluster_std=[1.0, 10.0], random_state=1000) 22 | 23 | # Show the dataset 24 | fig, ax = plt.subplots(figsize=(11, 7)) 25 | 26 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 0') 27 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 1') 28 | ax.set_xlabel(r'$x_0$') 29 | ax.set_ylabel(r'$x_1$') 30 | ax.grid() 31 | ax.legend() 32 | 33 | plt.show() 34 | 35 | # Show the covariance matrices 36 | print('Covariance matrix for class 0:') 37 | print(np.cov(X[Y == 0].T)) 38 | 39 | print('\nCovariance matrix for class 1:') 40 | print(np.cov(X[Y == 1].T)) 41 | 42 | # Show the CV scores 43 | lda = LinearDiscriminantAnalysis() 44 | print('\nLDA average CV accuracy: %.3f' % cross_val_score(lda, X, Y, cv=10).mean()) 45 | 46 | qda = QuadraticDiscriminantAnalysis() 47 | print('QDA average CV accuracy: %.3f' % cross_val_score(qda, X, Y, cv=10).mean()) 48 | 49 | -------------------------------------------------------------------------------- /Chapter12/als_spark.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pyspark import SparkContext, SparkConf 4 | from pyspark.mllib.recommendation import Rating 5 | from pyspark.mllib.recommendation import ALS 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_users = 200 11 | nb_products = 100 12 | ratings = [] 13 | 14 | if __name__ == '__main__': 15 | conf = SparkConf().setAppName('ALS').setMaster('local[*]') 16 | sc = SparkContext(conf=conf) 17 | 18 | for _ in range(10): 19 | for i in range(nb_users): 20 | rating = Rating(user=i, product=np.random.randint(1, nb_products), rating=np.random.randint(0, 5)) 21 | ratings.append(rating) 22 | 23 | # Parallelize the ratings 24 | ratings = sc.parallelize(ratings) 25 | 26 | # Train the model 27 | model = ALS.train(ratings, rank=5, iterations=10) 28 | 29 | # Test the model 30 | test = ratings.map(lambda rating: (rating.user, rating.product)) 31 | 32 | predictions = 
model.predictAll(test) 33 | full_predictions = predictions.map(lambda pred: ((pred.user, pred.product), pred.rating)) 34 | 35 | # Compute MSE 36 | split_ratings = ratings.map(lambda rating: ((rating.user, rating.product), rating.rating)) 37 | joined_predictions = split_ratings.join(full_predictions) 38 | mse = joined_predictions.map(lambda x: (x[1][0] - x[1][1]) ** 2).mean() 39 | 40 | print('MSE: %.3f' % mse) 41 | 42 | # Perform a single prediction 43 | prediction = model.predict(10, 20) 44 | print('Prediction: %.3f' % prediction) -------------------------------------------------------------------------------- /Chapter07/kernel_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import multiprocessing 6 | 7 | from sklearn.datasets import make_circles 8 | from sklearn.model_selection import GridSearchCV 9 | from sklearn.svm import SVC 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | def show_dataset(X, Y): 19 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 20 | 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | for i in range(nb_samples): 26 | if Y[i] == 0: 27 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 28 | else: 29 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 30 | 31 | plt.show() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Create datasets 36 | X, Y = make_circles(n_samples=nb_samples, noise=0.1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Define a param grid 42 | param_grid = [ 43 | { 44 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 45 | 'C': [0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0] 46 | } 47 | ] 48 | 49 | # Create a train grid search on SVM classifier 50 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 51 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 52 | gs.fit(X, Y) 53 | 54 | print(gs.best_estimator_) 55 | print('Kernel SVM score: %.3f' % gs.best_score_) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /Chapter10/birch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import Birch 8 | from sklearn.metrics import adjusted_rand_score 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 2000 16 | batch_size = 80 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=5, cluster_std=1.5, random_state=1000) 22 | 23 | # Create an instance of BIRCH 24 | birch = Birch(n_clusters=5, threshold=0.15, branching_factor=100) 25 | 26 | # Train the model 27 | X_batch = [] 28 | Y_preds = [] 29 | 30 | for i in range(0, nb_samples, batch_size): 31 | birch.partial_fit(X[i:i + batch_size]) 32 | X_batch.append(X[:i + batch_size]) 33 | Y_preds.append(birch.predict(X[:i + batch_size])) 34 | 35 | print(adjusted_rand_score(birch.predict(X), Y)) 36 | 37 | # Show the training steps 38 | fig, ax = plt.subplots(5, 5, figsize=(20, 12)) 39 | 40 | for i in range(5): 41 | for j in range(5): 42 | idx = (i * 5) + j 43 | 44 | for k in range(5): 45 | ax[i][j].scatter(X_batch[idx][Y_preds[idx] == k, 0], X_batch[idx][Y_preds[idx] 
== k, 1], s=3) 46 | 47 | ax[i][j].set_xticks([]) 48 | ax[i][j].set_yticks([]) 49 | ax[i][j].set_title('{} samples'.format(batch_size * (idx + 1))) 50 | 51 | plt.show() 52 | 53 | 54 | -------------------------------------------------------------------------------- /Chapter04/isotonic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from matplotlib.collections import LineCollection 7 | 8 | from sklearn.isotonic import IsotonicRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 100 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.plot(X, Y, 'b.-') 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | plt.show() 26 | 27 | 28 | def show_isotonic_regression_segments(X, Y, Yi, segments): 29 | lc = LineCollection(segments, zorder=0) 30 | lc.set_array(np.ones(len(Y))) 31 | lc.set_linewidths(0.5 * np.ones(nb_samples)) 32 | 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.plot(X, Y, 'b.', markersize=8) 36 | ax.plot(X, Yi, 'g.-', markersize=8) 37 | ax.grid() 38 | ax.set_xlabel('X') 39 | ax.set_ylabel('Y') 40 | 41 | plt.show() 42 | 43 | 44 | if __name__ == '__main__': 45 | # Create dataset 46 | X = np.arange(-5, 5, 0.1) 47 | Y = X + np.random.uniform(-0.5, 1, size=X.shape) 48 | 49 | # Show original dataset 50 | show_dataset(X, Y) 51 | 52 | # Create an isotonic regressor 53 | ir = IsotonicRegression(-6, 10) 54 | Yi = ir.fit_transform(X, Y) 55 | 56 | # Create a segment list 57 | segments = [[[i, Y[i]], [i, Yi[i]]] for i in range(nb_samples)] 58 | 59 | # Show isotonic interpolation 60 | show_isotonic_regression_segments(X, Y, Yi, segments) 61 | -------------------------------------------------------------------------------- /Chapter09/k_means.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 24 | 25 | plt.show() 26 | 27 | 28 | def show_clustered_dataset(X, km): 29 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 30 | 31 | ax.grid() 32 | ax.set_xlabel('X') 33 | ax.set_ylabel('Y') 34 | 35 | for i in range(nb_samples): 36 | c = km.predict(X[i].reshape(1, -1)) 37 | if c == 0: 38 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 39 | elif c == 1: 40 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 41 | else: 42 | ax.scatter(X[i, 0], X[i, 1], marker='d', color='g') 43 | 44 | plt.show() 45 | 46 | 47 | if __name__ == '__main__': 48 | # Create dataset 49 | X, _ = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=1.5, random_state=1000) 50 | 51 | # Show dataset 52 | show_dataset(X) 53 | 54 | # Create and train K-Means 55 | km = KMeans(n_clusters=3) 56 | km.fit(X) 57 | 58 | # Show the centroids 59 | print(km.cluster_centers_) 60 | 61 | # Show clustered dataset 62 | show_clustered_dataset(X, km) 63 | 64 | 
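/Chapter09/k_means.py above fixes n_clusters=3, which matches the three blobs generated by make_blobs. When the number of clusters is not known in advance, a common heuristic is to fit K-Means for several values of k and inspect the inertia (within-cluster sum of squared distances) exposed by scikit-learn, looking for an elbow in the curve. A minimal sketch of this check on the same synthetic dataset follows; it is an illustration, not part of the repository:

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Same synthetic dataset as in k_means.py
X, _ = make_blobs(n_samples=1000, n_features=2, centers=3, cluster_std=1.5, random_state=1000)

# Inertia (sum of squared distances to the closest centroid) for k = 1..9
inertias = [KMeans(n_clusters=k, random_state=1000).fit(X).inertia_ for k in range(1, 10)]

plt.plot(range(1, 10), inertias, 'o-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.grid(True)
plt.show()

With this dataset the curve should flatten visibly after k=3, supporting the choice made in the script.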
-------------------------------------------------------------------------------- /Chapter10/biclustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.cluster.bicluster import SpectralBiclustering 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_users = 100 14 | nb_products = 150 15 | max_rating = 10 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create the user-product matrix 20 | up_matrix = np.random.randint(0, max_rating + 1, size=(nb_users, nb_products)) 21 | mask_matrix = np.random.randint(0, 2, size=(nb_users, nb_products)) 22 | up_matrix *= mask_matrix 23 | 24 | # Show the matrix 25 | fig, ax = plt.subplots(figsize=(12, 6)) 26 | 27 | matx = ax.matshow(up_matrix) 28 | fig.colorbar(matx) 29 | 30 | ax.set_xticks([]) 31 | ax.set_yticks([]) 32 | ax.set_xlabel('Products') 33 | ax.set_ylabel('Users') 34 | 35 | plt.show() 36 | 37 | # Perform a Spectral Biclustering 38 | sbc = SpectralBiclustering(n_clusters=10, random_state=1000) 39 | sbc.fit(up_matrix) 40 | 41 | # Show the clustered matrix 42 | up_clustered = np.outer(np.sort(sbc.row_labels_) + 1, np.sort(sbc.column_labels_) + 1) 43 | 44 | fig, ax = plt.subplots(figsize=(12, 6)) 45 | 46 | matx = ax.matshow(up_clustered) 47 | 48 | ax.set_xticks([]) 49 | ax.set_yticks([]) 50 | ax.set_xlabel('Products') 51 | ax.set_ylabel('Users') 52 | 53 | plt.show() 54 | 55 | # Show some examples of users and products associated with ranking 6 56 | print(np.where(sbc.rows_[6, :] == True)) 57 | print(np.where(sbc.columns_[6, :] == True)) -------------------------------------------------------------------------------- /Chapter14/lsa_1.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from nltk.corpus import brown 7 | 8 | from scipy.linalg import svd 9 | 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | def scatter_documents(X): 18 | fig, ax = plt.subplots(1, 1, figsize=(10, 6)) 19 | 20 | ax.scatter(X[:, 0], X[:, 1]) 21 | ax.set_xlabel('t0') 22 | ax.set_ylabel('t1') 23 | ax.grid() 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Compose a corpus 29 | sentences = sentences = brown.sents(categories=['news', 'fiction']) 30 | corpus = [] 31 | 32 | for s in sentences: 33 | corpus.append(' '.join(s)) 34 | 35 | # Vectorize the corpus 36 | vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True) 37 | Xc = vectorizer.fit_transform(corpus).todense() 38 | 39 | # Perform SVD 40 | U, s, V = svd(Xc, full_matrices=False) 41 | 42 | # Extract a sub-space with rank=2 43 | rank = 2 44 | 45 | Uk = U[:, 0:rank] 46 | sk = np.diag(s)[0:rank, 0:rank] 47 | Vk = V[0:rank, :] 48 | 49 | # Check the top-10 word per topic 50 | Mwts = np.argsort(np.abs(Vk), axis=1)[::-1] 51 | 52 | for t in range(rank): 53 | print('\nTopic ' + str(t)) 54 | for i in range(10): 55 | print(vectorizer.get_feature_names()[Mwts[t, i]]) 56 | 57 | # Show a scatter plot of all documents 58 | Mdtk = Uk.dot(sk) 59 | scatter_documents(Mdtk) 60 | -------------------------------------------------------------------------------- /Chapter13/tokenizing.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from __future__ import print_function 4 | 5 | from nltk.tokenize import sent_tokenize 6 | from nltk.tokenize import TreebankWordTokenizer 7 | from nltk.tokenize import RegexpTokenizer 8 | 9 | if __name__ == '__main__': 10 | # Sentence tokenizing 11 | print('Generic text:') 12 | generic_text = 'Lorem ipsum dolor sit amet, amet minim temporibus in sit. Vel ne impedit consequat intellegebat.' 13 | print(sent_tokenize(generic_text)) 14 | 15 | print('English text:') 16 | english_text = 'Where is the closest train station? I need to reach London' 17 | print(sent_tokenize(english_text, language='english')) 18 | 19 | print('Spanish text:') 20 | spanish_text = u'¿Dónde está la estación más cercana? Inmediatamente me tengo que ir a Barcelona.' 21 | for sentence in sent_tokenize(spanish_text, language='spanish'): 22 | print(sentence) 23 | 24 | # Word tokenizing 25 | # Create a Treebank word tokenizer 26 | tbwt = TreebankWordTokenizer() 27 | 28 | print('Simple text:') 29 | simple_text = 'This is a simple text.' 30 | print(tbwt.tokenize(simple_text)) 31 | 32 | print('Complex text:') 33 | complex_text = 'This isn\'t a simple text' 34 | print(tbwt.tokenize(complex_text)) 35 | 36 | # Create a Regexp tokenizer 37 | ret = RegexpTokenizer('[a-zA-Z0-9\'\.]+') 38 | print(ret.tokenize(complex_text)) 39 | 40 | # Create a more restrictive Regexp tokenizer 41 | ret = RegexpTokenizer('[a-zA-Z\']+') 42 | 43 | complex_text = 'This isn\'t a simple text. Count 1, 2, 3 and then go!' 44 | print(ret.tokenize(complex_text)) -------------------------------------------------------------------------------- /Chapter03/whitening.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | 11 | nb_samples = 1000 12 | 13 | 14 | def zero_center(X): 15 | return X - np.mean(X, axis=0) 16 | 17 | 18 | def whiten(X, correct=True): 19 | Xc = zero_center(X) 20 | _, L, V = np.linalg.svd(Xc) 21 | W = np.dot(V.T, np.diag(1.0 / L)) 22 | return np.dot(Xc, W) * np.sqrt(X.shape[0]) if correct else 1.0 23 | 24 | 25 | if __name__ == '__main__': 26 | # Create the dataset 27 | X = np.random.normal(0.0, [2.5, 1.0], size=(nb_samples, 2)) 28 | 29 | theta = np.pi / 4.0 30 | R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) 31 | 32 | Xr = np.dot(X, R) 33 | 34 | # Create a whitened version 35 | Xw = whiten(Xr) 36 | 37 | # Print the whitened covariance matrix 38 | print(np.cov(Xw.T)) 39 | 40 | # Show original and whitened datasets 41 | fig, ax = plt.subplots(1, 2, figsize=(15, 5)) 42 | 43 | ax[0].scatter(Xr[:, 0], Xr[:, 1]) 44 | ax[0].set_xticks(np.arange(-10, 10), 2) 45 | ax[0].set_yticks(np.arange(-8, 8), 2) 46 | ax[0].set_xlabel(r'$x_1$') 47 | ax[0].set_ylabel(r'$x_2$') 48 | ax[0].set_title(r'Original dataset') 49 | ax[0].grid() 50 | 51 | ax[1].scatter(Xw[:, 0], Xw[:, 1]) 52 | ax[1].set_xticks(np.arange(-10, 10), 2) 53 | ax[1].set_yticks(np.arange(-8, 8), 2) 54 | ax[1].set_xlabel(r'$x_1$') 55 | ax[1].set_ylabel(r'$x_2$') 56 | ax[1].set_title(r'Whitened dataset') 57 | ax[1].grid() 58 | 59 | plt.show() 60 | 61 | -------------------------------------------------------------------------------- /Chapter02/SMOTE.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Install Imbalanced-Learning with: pip install -U imbalanced-learn 7 | # For further information: http://contrib.scikit-learn.org/imbalanced-learn/stable/index.html 8 | from imblearn.over_sampling import SMOTE 9 | 10 | from sklearn.datasets import make_classification 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | nb_samples = 1000 18 | weights = (0.95, 0.05) 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create an unbalanced dataset 23 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, weights=weights, random_state=1000) 24 | 25 | # Create and train a SMOTE instance 26 | smote = SMOTE() 27 | X_resampled, Y_resampled = smote.fit_sample(X, Y) 28 | 29 | # Show original and resampled datasets 30 | fig, ax = plt.subplots(1, 2, figsize=(20, 8)) 31 | 32 | ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 1') 33 | ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 2') 34 | ax[0].set_xlabel(r'$x_0$') 35 | ax[0].set_ylabel(r'$x_1$') 36 | ax[0].set_title('Unbalanced dataset') 37 | ax[0].legend() 38 | ax[0].grid() 39 | 40 | ax[1].scatter(X_resampled[Y_resampled == 0, 0], X_resampled[Y_resampled == 0, 1], label='Class 1') 41 | ax[1].scatter(X_resampled[Y_resampled == 1, 0], X_resampled[Y_resampled == 1, 1], label='Class 2') 42 | ax[1].set_xlabel(r'$x_0$') 43 | ax[1].set_ylabel(r'$x_1$') 44 | ax[1].set_title('SMOTE balancing') 45 | ax[1].legend() 46 | ax[1].grid() 47 | 48 | plt.show() 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /Chapter05/passive_aggressive_classification.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.linear_model import PassiveAggressiveClassifier 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | if __name__ == '__main__': 17 | # Load and scale the dataset 18 | iris = load_iris() 19 | 20 | ss = StandardScaler() 21 | 22 | X = ss.fit_transform(iris['data']) 23 | Y = iris['target'] 24 | 25 | # Split the dataset 26 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1000) 27 | 28 | # Create the model 29 | pac = PassiveAggressiveClassifier(C=0.05, loss='squared_hinge', max_iter=2000, random_state=1000) 30 | 31 | # Train with the start-up samples 32 | nb_initial_samples = int(X_train.shape[0] / 1.5) 33 | pac.fit(X_train[0:nb_initial_samples], Y_train[0:nb_initial_samples]) 34 | 35 | # Continue with the incremental samples 36 | validation_accuracies = [] 37 | 38 | for (x, y) in zip(X_train[nb_initial_samples:], Y_train[nb_initial_samples:]): 39 | pac.partial_fit(x.reshape(1, -1), y.ravel(), classes=np.unique(iris['target'])) 40 | validation_accuracies.append(pac.score(X_test, Y_test)) 41 | 42 | # Show the validation plot 43 | fig, ax = plt.subplots(figsize=(18, 8)) 44 | 45 | ax.plot(validation_accuracies) 46 | ax.set_xlabel('Online sample') 47 | ax.set_ylabel('Validation accuracy') 48 | ax.grid() 49 | 50 | plt.show() 51 | -------------------------------------------------------------------------------- /Chapter06/bernoulli.py: -------------------------------------------------------------------------------- 1 
| from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.naive_bayes import BernoulliNB 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 300 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0) 36 | 37 | # Show dataset 38 | show_dataset(X, Y) 39 | 40 | # Split dataset 41 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 42 | 43 | # Create and train Bernoulli Naive Bayes classifier 44 | bnb = BernoulliNB(binarize=0.0) 45 | bnb.fit(X_train, Y_train) 46 | 47 | print('Bernoulli Naive Bayes score: %.3f' % bnb.score(X_test, Y_test)) 48 | 49 | # Compute CV score 50 | bnb_scores = cross_val_score(bnb, X, Y, scoring='accuracy', cv=10) 51 | print('Bernoulli Naive Bayes CV average score: %.3f' % bnb_scores.mean()) 52 | 53 | # Predict some values 54 | data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 55 | Yp = bnb.predict(data) 56 | print(Yp) 57 | 58 | -------------------------------------------------------------------------------- /Chapter14/lda.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import brown 6 | 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | if __name__ == '__main__': 14 | # Compose a corpus 15 | sentences_1 = brown.sents(categories=['reviews'])[0:1000] 16 | sentences_2 = brown.sents(categories=['government'])[0:1000] 17 | sentences_3 = brown.sents(categories=['fiction'])[0:1000] 18 | sentences_4 = brown.sents(categories=['news'])[0:1000] 19 | corpus = [] 20 | 21 | for s in sentences_1 + sentences_2 + sentences_3 + sentences_4: 22 | corpus.append(' '.join(s)) 23 | 24 | # Vectorize the corpus 25 | cv = CountVectorizer(strip_accents='unicode', stop_words='english', analyzer='word') 26 | Xc = cv.fit_transform(corpus) 27 | 28 | # Perform LDA 29 | lda = LatentDirichletAllocation(n_topics=8, learning_method='online', max_iter=25) 30 | Xl = lda.fit_transform(Xc) 31 | 32 | # Show the top 5 words per topic 33 | Mwts_lda = np.argsort(lda.components_, axis=1)[::-1] 34 | 35 | for t in range(8): 36 | print('\nTopic ' + str(t)) 37 | for i in range(5): 38 | print(cv.get_feature_names()[Mwts_lda[t, i]]) 39 | 40 | # Test the model with new document 41 | print('Document 0:') 42 | print(corpus[0]) 43 | print(Xl[0]) 44 | 45 | print('Document 2500:') 46 | print(corpus[2500]) 47 | print(Xl[2500]) 48 | 49 | test_doc = corpus[0] + ' ' + corpus[2500] 50 | y_test = lda.transform(cv.transform([test_doc])) 51 | print(y_test) 52 | 53 | -------------------------------------------------------------------------------- /Chapter09/knn.py: -------------------------------------------------------------------------------- 1 | from 
__future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.neighbors import NearestNeighbors 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load the dataset 17 | digits = load_digits() 18 | 19 | # Scale the dataset 20 | ss = StandardScaler(with_std=False) 21 | X = ss.fit_transform(digits['data']) 22 | 23 | # Create and train the model 24 | knn = NearestNeighbors(n_neighbors=25, leaf_size=30, algorithm='ball_tree') 25 | knn.fit(X) 26 | 27 | # Create a noisy sample (and show it) 28 | X_noise = X[50] + np.random.normal(0.0, 1.5, size=(64,)) 29 | 30 | fig, ax = plt.subplots(1, 2, figsize=(4, 8)) 31 | 32 | ax[0].imshow(digits['images'][50], cmap='gray') 33 | ax[0].set_xticks([]) 34 | ax[0].set_yticks([]) 35 | 36 | ax[1].imshow(ss.inverse_transform(X_noise).reshape((8, 8)), cmap='gray') 37 | ax[1].set_xticks([]) 38 | ax[1].set_yticks([]) 39 | 40 | plt.show() 41 | 42 | # Compute the neighbors 43 | distances, neighbors = knn.kneighbors(X_noise.reshape(1, -1), return_distance=True) 44 | 45 | print('Distances:\n') 46 | print(distances[0]) 47 | 48 | # Show the neighbors 49 | fig, ax = plt.subplots(5, 5, figsize=(8, 8)) 50 | 51 | for y in range(5): 52 | for x in range(5): 53 | idx = neighbors[0][(x + (y * 5))] 54 | ax[y, x].matshow(digits['images'][idx], cmap='gray') 55 | ax[y, x].set_xticks([]) 56 | ax[y, x].set_yticks([]) 57 | 58 | plt.show() -------------------------------------------------------------------------------- /Chapter14/lsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from nltk.corpus import brown 7 | 8 | from scipy.linalg import svd 9 | 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | def scatter_documents(X): 18 | fig, ax = plt.subplots(1, 1, figsize=(10, 6)) 19 | 20 | ax.scatter(X[:, 0], X[:, 1]) 21 | ax.set_xlabel('t0') 22 | ax.set_ylabel('t1') 23 | ax.grid() 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Compose a corpus 29 | sentences = brown.sents(categories=['news'])[0:500] 30 | corpus = [] 31 | 32 | for s in sentences: 33 | corpus.append(' '.join(s)) 34 | 35 | # Vectorize the corpus 36 | vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True) 37 | Xc = vectorizer.fit_transform(corpus).todense() 38 | 39 | # Perform SVD 40 | U, s, V = svd(Xc, full_matrices=False) 41 | 42 | # Extract a sub-space with rank=2 43 | rank = 2 44 | 45 | Uk = U[:, 0:rank] 46 | sk = np.diag(s)[0:rank, 0:rank] 47 | Vk = V[0:rank, :] 48 | 49 | # Check the top-10 word per topic 50 | Mwts = np.argsort(np.abs(Vk), axis=1)[::-1] 51 | 52 | for t in range(rank): 53 | print('\nTopic ' + str(t)) 54 | for i in range(10): 55 | print(vectorizer.get_feature_names()[Mwts[t, i]]) 56 | 57 | # Compute the structure of a document 58 | print('\nSample document:') 59 | print(corpus[0]) 60 | 61 | Mdtk = Uk.dot(sk) 62 | print('\nSample document in the topic sub-space:') 63 | print('d0 = %.2f*t1 + %.2f*t2' % (Mdtk[0][0], Mdtk[0][1])) 64 | 65 | # Show a scatter plot of all documents 66 | scatter_documents(Mdtk) 67 | 
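/Chapter14/lsa.py above converts the TF-IDF matrix to a dense array with todense() before calling scipy.linalg.svd, which becomes expensive for larger corpora. scikit-learn's TruncatedSVD performs the same rank-k factorization directly on the sparse matrix: its fit_transform output corresponds (up to sign) to Uk.dot(sk), and components_ plays the role of Vk. A minimal sketch under these assumptions (illustrative only, not part of the repository):

from nltk.corpus import brown

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus construction as in lsa.py
corpus = [' '.join(s) for s in brown.sents(categories=['news'])[0:500]]

vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True)
Xs = vectorizer.fit_transform(corpus)          # kept sparse, no todense()

svd = TruncatedSVD(n_components=2, random_state=1000)
Mdtk = svd.fit_transform(Xs)                   # document-topic coordinates, analogous to Uk.dot(sk)
Vk = svd.components_                           # topic-term matrix, analogous to Vk

print(Mdtk.shape, Vk.shape)

The resulting document coordinates have shape (n_documents, 2) and can be passed to the same scatter_documents helper defined in the script.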
-------------------------------------------------------------------------------- /Chapter10/mini_batch_kmeans.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import KMeans, MiniBatchKMeans 8 | from sklearn.metrics import adjusted_rand_score 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 2000 16 | batch_size = 80 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=5, cluster_std=1.5, random_state=1000) 22 | 23 | # Create an instance of Mini-Batch k-Means 24 | mbkm = MiniBatchKMeans(n_clusters=5, max_iter=1000, batch_size=batch_size, random_state=1000) 25 | 26 | # Train the model 27 | X_batch = [] 28 | Y_preds = [] 29 | 30 | for i in range(0, nb_samples, batch_size): 31 | mbkm.partial_fit(X[i:i + batch_size]) 32 | 33 | X_batch.append(X[:i + batch_size]) 34 | Y_preds.append(mbkm.predict(X[:i + batch_size])) 35 | 36 | # Show the training steps 37 | fig, ax = plt.subplots(5, 5, figsize=(20, 12)) 38 | 39 | for i in range(5): 40 | for j in range(5): 41 | idx = (i * 5) + j 42 | 43 | for k in range(5): 44 | ax[i][j].scatter(X_batch[idx][Y_preds[idx] == k, 0], X_batch[idx][Y_preds[idx] == k, 1], s=3) 45 | 46 | ax[i][j].set_xticks([]) 47 | ax[i][j].set_yticks([]) 48 | ax[i][j].set_title('{} samples'.format(batch_size * (idx + 1))) 49 | 50 | plt.show() 51 | 52 | # Compute the Adjusted-Rand score and compare it with a standard K-Means 53 | print(adjusted_rand_score(mbkm.predict(X), Y)) 54 | 55 | km = KMeans(n_clusters=5, max_iter=1000, random_state=1000) 56 | km.fit(X) 57 | 58 | print(adjusted_rand_score(km.predict(X), Y)) 59 | 60 | -------------------------------------------------------------------------------- /Chapter03/pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load MNIST digits 14 | digits = load_digits() 15 | 16 | # Show some random digits 17 | selection = np.random.randint(0, 1797, size=100) 18 | 19 | fig, ax = plt.subplots(10, 10, figsize=(10, 10)) 20 | 21 | samples = [digits.data[x].reshape((8, 8)) for x in selection] 22 | 23 | for i in range(10): 24 | for j in range(10): 25 | ax[i, j].set_axis_off() 26 | ax[i, j].imshow(samples[(i * 8) + j], cmap='gray') 27 | 28 | plt.show() 29 | 30 | # Perform a PCA on the digits dataset 31 | pca = PCA(n_components=36, whiten=True) 32 | X_pca = pca.fit_transform(digits.data / 255) 33 | 34 | print('Explained variance ratio') 35 | print(pca.explained_variance_ratio_) 36 | 37 | # Plot the explained variance ratio 38 | fig, ax = plt.subplots(1, 2, figsize=(16, 6)) 39 | 40 | ax[0].set_xlabel('Component') 41 | ax[0].set_ylabel('Variance ratio (%)') 42 | ax[0].bar(np.arange(36), pca.explained_variance_ratio_ * 100.0) 43 | 44 | ax[1].set_xlabel('Component') 45 | ax[1].set_ylabel('Cumulative variance (%)') 46 | ax[1].bar(np.arange(36), np.cumsum(pca.explained_variance_)[::-1]) 47 | 48 | plt.show() 49 | 50 | # Rebuild from PCA and show the result 51 | fig, ax = plt.subplots(10, 10, 
figsize=(10, 10)) 52 | 53 | samples = [pca.inverse_transform(X_pca[x]).reshape((8, 8)) for x in selection] 54 | 55 | for i in range(10): 56 | for j in range(10): 57 | ax[i, j].set_axis_off() 58 | ax[i, j].imshow(samples[(i * 8) + j], cmap='gray') 59 | 60 | plt.show() 61 | 62 | -------------------------------------------------------------------------------- /Chapter04/bayesian_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_boston 7 | from sklearn.linear_model import BayesianRidge 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | def show_dataset(data): 16 | fig, ax = plt.subplots(4, 3, figsize=(20, 15)) 17 | 18 | for i in range(4): 19 | for j in range(3): 20 | ax[i, j].plot(data.data[:, i + (j + 1) * 3]) 21 | ax[i, j].grid() 22 | 23 | plt.show() 24 | 25 | 26 | if __name__ == '__main__': 27 | # Load dataset 28 | boston = load_boston() 29 | 30 | # Show dataset 31 | show_dataset(boston) 32 | 33 | # Create a Bayesian ridge regressor instance 34 | br = BayesianRidge(n_iter=1000) 35 | 36 | # Split dataset 37 | X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.1) 38 | 39 | # Train the model 40 | br.fit(X_train, Y_train) 41 | 42 | print('Score %.3f' % br.score(X_test, Y_test)) 43 | 44 | # CV score 45 | scores = cross_val_score(br, boston.data, boston.target, cv=7, scoring='neg_mean_squared_error') 46 | print('CV Negative mean squared errors mean: %.3f' % scores.mean()) 47 | print('CV Negative mean squared errors std: %.3f' % scores.std()) 48 | 49 | # CV R2 score 50 | r2_scores = cross_val_score(br, boston.data, boston.target, cv=10, scoring='r2') 51 | print('CV R2 score mean: %.3f' % r2_scores.mean()) 52 | print('CV R2 score std: %.3f' % r2_scores.std()) 53 | 54 | # Explained variance score 55 | ev_scores = cross_val_score(br, boston.data, boston.target, cv=10, scoring='explained_variance') 56 | print('CV explained variance score mean: %.3f' % ev_scores.mean()) 57 | print('CV explained variance score std: %.3f' % ev_scores.std()) -------------------------------------------------------------------------------- /Chapter04/multiple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_boston 7 | from sklearn.linear_model import LinearRegression 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | def show_dataset(data): 16 | fig, ax = plt.subplots(4, 3, figsize=(20, 15)) 17 | 18 | for i in range(4): 19 | for j in range(3): 20 | ax[i, j].plot(data.data[:, i + (j + 1) * 3]) 21 | ax[i, j].grid() 22 | 23 | plt.show() 24 | 25 | 26 | if __name__ == '__main__': 27 | # Load dataset 28 | boston = load_boston() 29 | 30 | # Show dataset 31 | show_dataset(boston) 32 | 33 | # Create a linear regressor instance 34 | lr = LinearRegression(normalize=True) 35 | 36 | # Split dataset 37 | X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.1) 38 | 39 | # Train the model 40 | lr.fit(X_train, Y_train) 41 | 42 | print('Score %.3f' % lr.score(X_test, Y_test)) 43 | 
44 | # CV score 45 | scores = cross_val_score(lr, boston.data, boston.target, cv=7, scoring='neg_mean_squared_error') 46 | print('CV Negative mean squared errors mean: %.3f' % scores.mean()) 47 | print('CV Negative mean squared errors std: %.3f' % scores.std()) 48 | 49 | # CV R2 score 50 | r2_scores = cross_val_score(lr, boston.data, boston.target, cv=10, scoring='r2') 51 | print('CV R2 score mean: %.3f' % r2_scores.mean()) 52 | print('CV R2 score std: %.3f' % r2_scores.std()) 53 | 54 | # Explained variance score 55 | ev_scores = cross_val_score(lr, boston.data, boston.target, cv=10, scoring='explained_variance') 56 | print('CV explained variance score mean: %.3f' % ev_scores.mean()) 57 | print('CV explained variance score std: %.3f' % ev_scores.std()) 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /Chapter03/fastica.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import os 6 | 7 | from shutil import copyfileobj 8 | from six.moves import urllib 9 | 10 | from sklearn.datasets.base import get_data_home 11 | from sklearn.datasets import fetch_mldata 12 | from sklearn.decomposition import FastICA 13 | 14 | 15 | # Set random seed for reproducibility 16 | np.random.seed(1000) 17 | 18 | 19 | # mldata.org can be subject to outages 20 | # Alternative original MNIST source (provided by Aurélien Geron) 21 | def fetch_mnist(data_home=None): 22 | mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat" 23 | data_home = get_data_home(data_home=data_home) 24 | data_home = os.path.join(data_home, 'mldata') 25 | if not os.path.exists(data_home): 26 | os.makedirs(data_home) 27 | mnist_save_path = os.path.join(data_home, "mnist-original.mat") 28 | if not os.path.exists(mnist_save_path): 29 | mnist_url = urllib.request.urlopen(mnist_alternative_url) 30 | with open(mnist_save_path, "wb") as matlab_file: 31 | copyfileobj(mnist_url, matlab_file) 32 | 33 | 34 | def zero_center(Xd): 35 | return Xd - np.mean(Xd, axis=0) 36 | 37 | 38 | if __name__ == '__main__': 39 | # Load the dataset 40 | mnist = fetch_mnist() 41 | digits = fetch_mldata("MNIST original") 42 | X = zero_center(digits['data'].astype(np.float64)) 43 | np.random.shuffle(X) 44 | 45 | # Peform Fast ICA with 64 components 46 | fastica = FastICA(n_components=256, max_iter=5000, random_state=1000) 47 | fastica.fit(X) 48 | 49 | # Plot the indipendent components 50 | fig, ax = plt.subplots(8, 8, figsize=(11, 11)) 51 | 52 | for i in range(8): 53 | for j in range(8): 54 | ax[i, j].imshow(fastica.components_[(i * 8) + j].reshape((28, 28)), cmap='gray') 55 | ax[i, j].axis('off') 56 | 57 | plt.show() 58 | -------------------------------------------------------------------------------- /Chapter03/categorical.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder 6 | from sklearn.feature_extraction import DictVectorizer, FeatureHasher 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | Y = np.random.choice(('Male', 'Female'), size=(10)) 14 | 15 | # Encode the labels 16 | print('Label encoding') 17 | le = LabelEncoder() 18 | yt = le.fit_transform(Y) 19 | print(yt) 20 | 21 | # Decode a 
dummy output 22 | print('Label decoding') 23 | output = [1, 0, 1, 1, 0, 0] 24 | decoded_output = [le.classes_[int(i)] for i in output] 25 | print(decoded_output) 26 | 27 | # Binarize the labels 28 | print('Label binarization') 29 | lb = LabelBinarizer() 30 | yb = lb.fit_transform(Y) 31 | print(yb) 32 | 33 | # Decode the binarized labels 34 | print('Label decoding') 35 | lb.inverse_transform(yb) 36 | 37 | # Define some dictionary data 38 | data = [ 39 | {'feature_1': 10.0, 'feature_2': 15.0}, 40 | {'feature_1': -5.0, 'feature_3': 22.0}, 41 | {'feature_3': -2.0, 'feature_4': 10.0} 42 | ] 43 | 44 | # Vectorize the dictionary data 45 | print('Dictionary data vectorization') 46 | dv = DictVectorizer() 47 | Y_dict = dv.fit_transform(data) 48 | print(Y_dict.todense()) 49 | 50 | print('Vocabulary:') 51 | print(dv.vocabulary_) 52 | 53 | # Feature hashing 54 | print('Feature hashing') 55 | fh = FeatureHasher() 56 | Y_hashed = fh.fit_transform(data) 57 | 58 | # Decode the features 59 | print('Feature decoding') 60 | print(Y_hashed.todense()) 61 | 62 | # One-hot encoding 63 | data1 = [ 64 | [0.0, 10.0], 65 | [1.0, 11.0], 66 | [1.0, 8.0], 67 | [0.0, 12.0], 68 | [0.0, 15.0] 69 | ] 70 | 71 | # Encode data 72 | oh = OneHotEncoder(categorical_features=[0]) 73 | Y_oh = oh.fit_transform(data1) 74 | print(Y_oh.todense()) 75 | -------------------------------------------------------------------------------- /Chapter09/k_means_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_circles 7 | from sklearn.cluster import KMeans 8 | 9 | from scipy.spatial.distance import pdist 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 1000 16 | 17 | 18 | def show_dataset(X): 19 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 20 | 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | for i in range(nb_samples): 26 | if Y[i] == 0: 27 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 28 | else: 29 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 30 | 31 | plt.show() 32 | 33 | 34 | def show_clustered_dataset(X, km): 35 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 36 | 37 | ax.grid() 38 | ax.set_xlabel('X') 39 | ax.set_ylabel('Y') 40 | 41 | for i in range(nb_samples): 42 | c = km.predict(X[i].reshape(1, -1)) 43 | if c == 0: 44 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 45 | else: 46 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 47 | 48 | plt.show() 49 | 50 | 51 | if __name__ == '__main__': 52 | # Create dataset 53 | X, Y = make_circles(n_samples=nb_samples, noise=0.05) 54 | 55 | # Show dataset 56 | show_dataset(X) 57 | 58 | # Create and train K-Means 59 | km = KMeans(n_clusters=2) 60 | km.fit(X) 61 | 62 | # Show clustered dataset 63 | show_clustered_dataset(X, km) 64 | 65 | # Compute the average intra-cluster distances 66 | Y_pred = km.predict(X) 67 | 68 | sampled_X = np.random.choice(X[Y_pred == 0, 0], replace=False, size=300).astype(np.int32) 69 | 70 | true_distances = pdist(X[Y == 0], metric='euclidean') 71 | distances = pdist(X[sampled_X], metric='euclidean') 72 | 73 | print('True average distance: %.3f' % np.mean(true_distances)) 74 | print('Clustering agerage distance: %.3f' % np.mean(distances)) 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /Chapter13/reuters_text_classifier.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import reuters, stopwords 6 | from nltk.tokenize import RegexpTokenizer 7 | from nltk.stem.snowball import SnowballStemmer 8 | 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | ret = RegexpTokenizer('[a-zA-Z0-9\']+') 17 | sw = set(stopwords.words('english')) 18 | ess = SnowballStemmer('english', ignore_stopwords=True) 19 | 20 | 21 | def tokenizer(sentence): 22 | tokens = ret.tokenize(sentence) 23 | return [ess.stem(t) for t in tokens if t not in sw] 24 | 25 | 26 | if __name__ == '__main__': 27 | # Compose the corpus 28 | Xr = np.array(reuters.sents(categories=['rubber'])) 29 | Xc = np.array(reuters.sents(categories=['cotton'])) 30 | Xw = np.concatenate((Xr, Xc)) 31 | X = [] 32 | 33 | for document in Xw: 34 | X.append(' '.join(document).strip().lower()) 35 | 36 | # Create the label vectors 37 | Yr = np.zeros(shape=Xr.shape) 38 | Yc = np.ones(shape=Xc.shape) 39 | Y = np.concatenate((Yr, Yc)) 40 | 41 | # Vectorize 42 | tfidfv = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1, 2), norm='l2') 43 | Xv = tfidfv.fit_transform(X) 44 | 45 | # Prepare train and test sets 46 | X_train, X_test, Y_train, Y_test = train_test_split(Xv, Y, test_size=0.25) 47 | 48 | # Create and train a Random Forest classifier 49 | rf = RandomForestClassifier(n_estimators=25) 50 | rf.fit(X_train, Y_train) 51 | 52 | # Test classifier 53 | score = rf.score(X_test, Y_test) 54 | print('Score: %.3f' % score) 55 | 56 | test_newsline = [ 57 | 'Trading tobacco is reducing the amount of requests for cotton and this has a negative impact on our economy'] 58 | yvt = tfidfv.transform(test_newsline) 59 | category = rf.predict(yvt) 60 | print('Predicted category: %d' % int(category[0])) 61 | 62 | -------------------------------------------------------------------------------- /Chapter04/2d_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from scipy.optimize import minimize 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | # Number of samples 13 | nb_samples = 200 14 | 15 | 16 | def loss(v): 17 | e = 0.0 18 | for i in range(nb_samples): 19 | e += np.square(v[0] + v[1]*X[i] - Y[i]) 20 | return 0.5 * e 21 | 22 | 23 | def gradient(v): 24 | g = np.zeros(shape=2) 25 | for i in range(nb_samples): 26 | g[0] += (v[0] + v[1]*X[i] - Y[i]) 27 | g[1] += ((v[0] + v[1]*X[i] - Y[i]) * X[i]) 28 | return g 29 | 30 | 31 | def show_dataset(X, Y): 32 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 33 | 34 | ax.scatter(X, Y) 35 | ax.set_xlabel('X') 36 | ax.set_ylabel('Y') 37 | ax.grid() 38 | 39 | plt.show() 40 | 41 | 42 | if __name__ == '__main__': 43 | # Create dataset 44 | X = np.arange(-5, 5, 0.05) 45 | 46 | Y = X + 2 47 | Y += np.random.normal(0.0, 0.5, size=nb_samples) 48 | 49 | # Show the dataset 50 | show_dataset(X, Y) 51 | 52 | # Minimize loss function 53 | result = minimize(fun=loss, x0=np.array([0.0, 0.0]), jac=gradient, method='L-BFGS-B') 54 | print(result) 55 | 56 | print('Interpolating rect:') 57 | print('y = %.2fx + %2.f' % (result.x[1], result.x[0])) 58 | 59 | # Compute the absolute error 
60 | err = 0.0 61 | 62 | for i in range(nb_samples): 63 | err += np.abs(Y[i] - (result.x[1]*X[i] + result.x[0])) 64 | 65 | print('Absolute error: %.2f' % err) 66 | 67 | # Repeat the process using the Moore-Penrose pseudo-inverse 68 | Xs = np.expand_dims(X, axis=1) 69 | Ys = np.expand_dims(Y, axis=1) 70 | Xs = np.concatenate((Xs, np.ones_like(Xs)), axis=1) 71 | 72 | result = np.linalg.inv(np.dot(Xs.T, Xs)).dot(Xs.T).dot(Y) 73 | 74 | print('Interpolating rect:') 75 | print('y = %.2fx + %2.f' % (result[0], result[1])) 76 | 77 | # Compute the estimator covariance 78 | covariance = (0.5 ** 2) * np.linalg.inv(np.dot(Xs.T, Xs)) 79 | 80 | print('Estimator covariance matrix:') 81 | print(covariance) 82 | -------------------------------------------------------------------------------- /Chapter05/classification_metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.metrics import accuracy_score, zero_one_loss, jaccard_similarity_score, confusion_matrix, \ 9 | precision_score, recall_score, fbeta_score, cohen_kappa_score, classification_report 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 21 | n_clusters_per_class=1, random_state=1000) 22 | 23 | # Split dataset 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1000) 25 | 26 | # Create and train logistic regressor 27 | lr = LogisticRegression() 28 | lr.fit(X_train, Y_train) 29 | 30 | print('Accuracy score: %.3f' % accuracy_score(Y_test, lr.predict(X_test))) 31 | print('Zero-one loss (normalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test))) 32 | print('Zero-one loss (unnormalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test), normalize=False)) 33 | print('Jaccard similarity score: %.3f' % jaccard_similarity_score(Y_test, lr.predict(X_test))) 34 | 35 | # Compute confusion matrix 36 | cm = confusion_matrix(y_true=Y_test, y_pred=lr.predict(X_test)) 37 | print('Confusion matrix:') 38 | print(cm[::-1, ::-1]) 39 | 40 | print('Precision score: %.3f' % precision_score(Y_test, lr.predict(X_test))) 41 | print('Recall score: %.3f' % recall_score(Y_test, lr.predict(X_test))) 42 | print('F-Beta score (1): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1)) 43 | print('F-Beta score (0.75): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=0.75)) 44 | print('F-Beta score (1.25): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1.25)) 45 | print('Cohen-Kappa score: %.3f' % cohen_kappa_score(Y_test, lr.predict(X_test))) 46 | 47 | # Print the classification report 48 | print('\n\nClassification report:') 49 | print(classification_report(Y_test, lr.predict(X_test))) 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /Chapter06/gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.naive_bayes import GaussianNB 8 | from sklearn.model_selection import 
train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.metrics import roc_curve, auc 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | nb_samples = 300 17 | 18 | 19 | def show_dataset(X, Y): 20 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 21 | 22 | ax.grid() 23 | ax.set_xlabel('X') 24 | ax.set_ylabel('Y') 25 | 26 | for i in range(nb_samples): 27 | if Y[i] == 0: 28 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 29 | else: 30 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 31 | 32 | plt.show() 33 | 34 | 35 | if __name__ == '__main__': 36 | # Create dataset 37 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0) 38 | 39 | # Show dataset 40 | show_dataset(X, Y) 41 | 42 | # Split dataset 43 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 44 | 45 | # Create and train Gaussian Naive Bayes classifier 46 | gnb = GaussianNB() 47 | gnb.fit(X_train, Y_train) 48 | 49 | # Create and train a Logistic regressor (for comparison) 50 | lr = LogisticRegression() 51 | lr.fit(X_train, Y_train) 52 | 53 | # Compute ROC Curve 54 | Y_gnb_score = gnb.predict_proba(X_test) 55 | Y_lr_score = lr.decision_function(X_test) 56 | 57 | fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(Y_test, Y_gnb_score[:, 1]) 58 | fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_lr_score) 59 | 60 | # Plot ROC Curve 61 | plt.figure(figsize=(30, 25)) 62 | 63 | plt.plot(fpr_gnb, tpr_gnb, color='red', label='Naive Bayes (AUC: %.2f)' % auc(fpr_gnb, tpr_gnb)) 64 | plt.plot(fpr_lr, tpr_lr, color='green', label='Logistic Regression (AUC: %.2f)' % auc(fpr_lr, tpr_lr)) 65 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 66 | plt.xlim([0.0, 1.0]) 67 | plt.ylim([0.0, 1.01]) 68 | plt.title('ROC Curve') 69 | plt.xlabel('False Positive Rate') 70 | plt.ylabel('True Positive Rate') 71 | plt.legend(loc="lower right") 72 | 73 | plt.show() 74 | -------------------------------------------------------------------------------- /Chapter05/passive_aggressive_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from sklearn.datasets import make_regression 7 | from sklearn.linear_model import PassiveAggressiveRegressor 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples_1 = 300 14 | nb_samples_2 = 500 15 | 16 | 17 | if __name__ == '__main__': 18 | # Create the dataset 19 | X, Y = make_regression(n_samples=nb_samples_1, n_features=5, random_state=1000) 20 | 21 | # Create the model 22 | par = PassiveAggressiveRegressor(C=0.01, loss='squared_epsilon_insensitive', epsilon=0.001, max_iter=2000, 23 | random_state=1000) 24 | 25 | # Fit the model incrementally and collect the squared errors 26 | squared_errors = [] 27 | 28 | for (x, y) in zip(X, Y): 29 | par.partial_fit(x.reshape(1, -1), y.ravel()) 30 | y_pred = par.predict(x.reshape(1, -1)) 31 | squared_errors.append(np.power(y_pred - y, 2)) 32 | 33 | # Show the error plot 34 | fig, ax = plt.subplots(figsize=(18, 8)) 35 | 36 | ax.plot(squared_errors) 37 | ax.set_xlabel('Sample') 38 | ax.set_ylabel('Squared error') 39 | ax.grid() 40 | 41 | plt.show() 42 | 43 | # Repeat the example with a discontinuous dataset 44 | X1, Y1 = make_regression(n_samples=nb_samples_2, n_features=5, random_state=1000) 45 | X2, Y2 = make_regression(n_samples=nb_samples_2, n_features=5, random_state=1000) 46 | 47 | 
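# Shifting the second block by max(X1) and offsetting its targets introduces an
# abrupt change halfway through the stream, so the second error plot should show
# a spike where the Passive Aggressive regressor has to re-adapt online.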
X2 += np.max(X1) 48 | Y2 += 0.5 49 | 50 | X = np.concatenate((X1, X2)) 51 | Y = np.concatenate((Y1, Y2)) 52 | 53 | par = PassiveAggressiveRegressor(C=0.01, loss='squared_epsilon_insensitive', epsilon=0.001, max_iter=2000, 54 | random_state=1000) 55 | 56 | # Fit the model incrementally and collect the squared errors 57 | squared_errors = [] 58 | 59 | for (x, y) in zip(X, Y): 60 | par.partial_fit(x.reshape(1, -1), y.ravel()) 61 | y_pred = par.predict(x.reshape(1, -1)) 62 | squared_errors.append(np.power(y_pred - y, 2)) 63 | 64 | # Show the error plot 65 | fig, ax = plt.subplots(figsize=(18, 8)) 66 | 67 | ax.plot(squared_errors) 68 | ax.set_xlabel('Sample') 69 | ax.set_ylabel('Squared error') 70 | ax.grid() 71 | 72 | plt.show() -------------------------------------------------------------------------------- /Chapter10/spectral_clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import warnings 6 | 7 | from sklearn.datasets import make_moons 8 | from sklearn.cluster import SpectralClustering 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 1000 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | def show_clustered_dataset(X, Y): 34 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 35 | 36 | ax.grid() 37 | ax.set_xlabel('X') 38 | ax.set_ylabel('Y') 39 | 40 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], marker='o', color='r') 41 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], marker='^', color='b') 42 | 43 | plt.show() 44 | 45 | 46 | if __name__ == '__main__': 47 | warnings.simplefilter("ignore") 48 | 49 | # Create dataset 50 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 51 | 52 | # Show dataset 53 | show_dataset(X, Y) 54 | 55 | # Cluster the dataset for different gamma values 56 | Yss = [] 57 | gammas = np.linspace(0, 12, 4) 58 | 59 | for gamma in gammas: 60 | sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma) 61 | Yss.append(sc.fit_predict(X)) 62 | 63 | # Show the result 64 | # The colors can be inverted with respect to the figure in the book 65 | fig, ax = plt.subplots(1, 4, figsize=(18, 8)) 66 | 67 | for i in range(4): 68 | ax[i].scatter(X[Yss[i] == 1, 0], X[Yss[i] == 1, 1], marker='o', color='r') 69 | ax[i].scatter(X[Yss[i] == 0, 0], X[Yss[i] == 0, 1], marker='^', color='b') 70 | ax[i].grid() 71 | ax[i].set_xlabel('X') 72 | ax[i].set_ylabel('Y') 73 | ax[i].set_title('Gamma = {}'.format(i * 4)) 74 | 75 | plt.show() 76 | 77 | # Create and train Spectral Clustering 78 | sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors') 79 | Ys = sc.fit_predict(X) 80 | 81 | # Show clustered dataset 82 | show_clustered_dataset(X, Y) 83 | -------------------------------------------------------------------------------- /Chapter04/ridge_lasso_elasticnet.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston 6 | from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV 7 | from 
sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | boston = load_boston() 16 | 17 | # Create a linear regressor and compute CV score 18 | lr = LinearRegression(normalize=True) 19 | lr_scores = cross_val_score(lr, boston.data, boston.target, cv=10) 20 | print('Linear regression CV average score: %.6f' % lr_scores.mean()) 21 | 22 | # Create a Ridge regressor and compute CV score 23 | rg = Ridge(0.05, normalize=True) 24 | rg_scores = cross_val_score(rg, boston.data, boston.target, cv=10) 25 | print('Ridge regression CV average score: %.6f' % rg_scores.mean()) 26 | 27 | # Create a Lasso regressor and compute CV score 28 | ls = Lasso(0.01, normalize=True) 29 | ls_scores = cross_val_score(ls, boston.data, boston.target, cv=10) 30 | print('Lasso regression CV average score: %.6f' % ls_scores.mean()) 31 | 32 | # Create ElasticNet regressor and compute CV score 33 | en = ElasticNet(alpha=0.001, l1_ratio=0.8, normalize=True) 34 | en_scores = cross_val_score(en, boston.data, boston.target, cv=10) 35 | print('ElasticNet regression CV average score: %.6f' % en_scores.mean()) 36 | 37 | # Find the optimal alpha value for Ridge regression 38 | rgcv = RidgeCV(alphas=(1.0, 0.1, 0.01, 0.001, 0.005, 0.0025, 0.001, 0.00025), normalize=True) 39 | rgcv.fit(boston.data, boston.target) 40 | print('Ridge optimal alpha: %.3f' % rgcv.alpha_) 41 | 42 | # Find the optimal alpha value for Lasso regression 43 | lscv = LassoCV(alphas=(1.0, 0.1, 0.01, 0.001, 0.005, 0.0025, 0.001, 0.00025), normalize=True) 44 | lscv.fit(boston.data, boston.target) 45 | print('Lasso optimal alpha: %.3f' % lscv.alpha_) 46 | 47 | # Find the optimal alpha and l1_ratio for Elastic Net 48 | encv = ElasticNetCV(alphas=(0.1, 0.01, 0.005, 0.0025, 0.001), l1_ratio=(0.1, 0.25, 0.5, 0.75, 0.8), normalize=True) 49 | encv.fit(boston.data, boston.target) 50 | print('ElasticNet optimal alpha: %.3f and L1 ratio: %.4f' % (encv.alpha_, encv.l1_ratio_)) 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /Chapter15/keras_scikit_learn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.utils import shuffle 7 | from sklearn.model_selection import GridSearchCV 8 | 9 | from keras.models import Sequential 10 | from keras.layers import Dense, Activation 11 | from keras.optimizers import Adam 12 | from keras.utils import to_categorical 13 | from keras.wrappers.scikit_learn import KerasClassifier 14 | 15 | 16 | # For reproducibility 17 | np.random.seed(1000) 18 | 19 | nb_samples = 2000 20 | 21 | 22 | def build_model(lr=0.001): 23 | model = Sequential() 24 | 25 | model.add(Dense(64, input_dim=2)) 26 | model.add(Activation('relu')) 27 | 28 | model.add(Dense(32)) 29 | model.add(Activation('relu')) 30 | 31 | model.add(Dense(16)) 32 | model.add(Activation('relu')) 33 | 34 | model.add(Dense(2)) 35 | model.add(Activation('softmax')) 36 | 37 | model.compile(optimizer=Adam(lr=lr), 38 | loss='categorical_crossentropy', 39 | metrics=['accuracy']) 40 | 41 | return model 42 | 43 | 44 | if __name__ == '__main__': 45 | # Create the dataset 46 | X = np.zeros(shape=(nb_samples, 2), dtype=np.float32) 47 | Y = np.zeros(shape=(nb_samples,), dtype=np.float32) 48 | 49 | t = 15.0 * np.random.uniform(0.0, 1.0, size=(int(nb_samples / 2), 1)) 50 | 51 | 
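# The two classes are generated as a pair of interleaved spirals (the radius grows
# with the angle t, with opposite orientations) plus uniform noise, which makes the
# problem non-linearly separable.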
X[0:int(nb_samples / 2), :] = t * np.hstack([-np.cos(t), np.sin(t)]) + \ 52 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 53 | Y[0:int(nb_samples / 2)] = 0 54 | 55 | X[int(nb_samples / 2):, :] = t * np.hstack([np.cos(t), -np.sin(t)]) + \ 56 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 57 | Y[int(nb_samples / 2):] = 1 58 | 59 | ss = StandardScaler() 60 | X = ss.fit_transform(X) 61 | 62 | X, Y = shuffle(X, Y, random_state=1000) 63 | 64 | # Wrap the Keras model 65 | skmodel = KerasClassifier(build_fn=build_model, epochs=100, batch_size=32, lr=0.001) 66 | 67 | # Perform a grid search 68 | parameters = { 69 | 'lr': [0.001, 0.01, 0.1], 70 | 'batch_size': [32, 64, 128] 71 | } 72 | 73 | gs = GridSearchCV(skmodel, parameters, cv=5) 74 | gs.fit(X, to_categorical(Y, 2)) 75 | 76 | # Show the best score and parameters 77 | print(gs.best_score_) 78 | print(gs.best_params_) 79 | 80 | -------------------------------------------------------------------------------- /Chapter07/svr_airfoil.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | 7 | from sklearn.svm import SVR 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | # Download the dataset from: https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise 16 | # Change with the folder where the file is stored 17 | file_path = '/airfoil_self_noise.dat' 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load the dataset 22 | df = pd.read_csv(file_path, sep='\t', header=None) 23 | 24 | # Show the statistics 25 | print(df.describe()) 26 | 27 | # Extract the independent and dependent variables 28 | X = df.iloc[:, 0:5].values 29 | Y = df.iloc[:, 5].values 30 | 31 | # Scale the data 32 | ssx, ssy = StandardScaler(), StandardScaler() 33 | 34 | Xs = ssx.fit_transform(X) 35 | Ys = ssy.fit_transform(Y.reshape(-1, 1)) 36 | 37 | # Create train and test sets 38 | X_train, X_test, Y_train, Y_test = train_test_split(Xs, Ys.ravel(), test_size=300, random_state=1000) 39 | 40 | # Instantiate and train the SVR 41 | svr = SVR(kernel='rbf', gamma=0.75, C=2.8, cache_size=500, epsilon=0.1) 42 | svr.fit(X_train, Y_train) 43 | 44 | # Print the R^2 scores 45 | print('Training R^2 score: %.3f' % svr.score(X_train, Y_train)) 46 | print('Test R^2 score: %.3f' % svr.score(X_test, Y_test)) 47 | 48 | # Show both original dataset and predictions 49 | fig, ax = plt.subplots(2, 1, figsize=(15, 9)) 50 | 51 | ax[0].plot(ssy.inverse_transform(Ys)) 52 | ax[0].set_title('Original dataset') 53 | ax[0].set_ylabel('Scaled sound pressure (dB)') 54 | ax[0].grid() 55 | 56 | ax[1].plot(ssy.inverse_transform(svr.predict(Xs))) 57 | ax[1].set_title('Predictions') 58 | ax[1].set_xlabel('Sample') 59 | ax[1].set_ylabel('Scaled sound pressure (dB)') 60 | ax[1].grid() 61 | 62 | plt.show() 63 | 64 | # Show the absolute errors 65 | fig, ax = plt.subplots(figsize=(15, 4)) 66 | 67 | Y = np.squeeze(ssy.inverse_transform(Ys)) 68 | Yp = ssy.inverse_transform(svr.predict(Xs)) 69 | 70 | ax.plot(np.abs(Y - Yp)) 71 | ax.set_title('Absolute errors') 72 | ax.set_xlabel('Sample') 73 | ax.set_ylabel(r'$|Y-Yp|$') 74 | ax.grid() 75 | 76 | plt.show() -------------------------------------------------------------------------------- /Chapter05/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | def show_classification_areas(X, Y, lr): 34 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 35 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 36 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02)) 37 | Z = lr.predict(np.c_[xx.ravel(), yy.ravel()]) 38 | 39 | Z = Z.reshape(xx.shape) 40 | plt.figure(1, figsize=(30, 25)) 41 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1) 42 | 43 | # Plot also the training points 44 | plt.scatter(X[:, 0], X[:, 1], c=np.abs(Y - 1), edgecolors='k', cmap=plt.cm.coolwarm) 45 | plt.xlabel('X') 46 | plt.ylabel('Y') 47 | 48 | plt.xlim(xx.min(), xx.max()) 49 | plt.ylim(yy.min(), yy.max()) 50 | plt.xticks(()) 51 | plt.yticks(()) 52 | 53 | plt.show() 54 | 55 | 56 | if __name__ == '__main__': 57 | # Create dataset 58 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 59 | n_clusters_per_class=1) 60 | 61 | # Show dataset 62 | show_dataset(X, Y) 63 | 64 | # Split dataset 65 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 66 | 67 | # Create logistic regressor 68 | lr = LogisticRegression() 69 | lr.fit(X_train, Y_train) 70 | print('Logistic regression score: %.3f' % lr.score(X_test, Y_test)) 71 | 72 | # Compute CV score 73 | lr_scores = cross_val_score(lr, X, Y, scoring='accuracy', cv=10) 74 | print('Logistic regression CV average score: %.3f' % lr_scores.mean()) 75 | 76 | # Show classification areas 77 | show_classification_areas(X, Y, lr) 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Chapter11/agglomerative_clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import AgglomerativeClustering 8 | from sklearn.metrics import silhouette_score, adjusted_rand_score 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 3000 14 | 15 | 16 | def plot_clustered_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | markers = ['o', 'd', '^', 'x', '1', '2', '3', 's'] 24 | colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', '#cccfff'] 25 | 26 | for i in range(nb_samples): 27 | ax.scatter(X[i, 0], X[i, 1], marker=markers[Y[i]], color=colors[Y[i]]) 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Create the dataset 34 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=8, cluster_std=2.0) 35 | 36 | # Show the dataset 37 | 
fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 38 | 39 | ax.grid() 40 | ax.set_xlabel('X') 41 | ax.set_ylabel('Y') 42 | 43 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 44 | plt.show() 45 | 46 | # Complete linkage 47 | print('Complete linkage') 48 | ac = AgglomerativeClustering(n_clusters=8, linkage='complete') 49 | Y_pred = ac.fit_predict(X) 50 | 51 | print('Silhouette score (Complete): %.3f' % silhouette_score(X, Y_pred)) 52 | print('Adjusted Rand score (Complete): %.3f' % adjusted_rand_score(Y, Y_pred)) 53 | 54 | # Show the clustered dataset 55 | plot_clustered_dataset(X, Y) 56 | 57 | # Average linkage 58 | print('Average linkage') 59 | ac = AgglomerativeClustering(n_clusters=8, linkage='average') 60 | Y_pred = ac.fit_predict(X) 61 | 62 | print('Silhouette score (Average): %.3f' % silhouette_score(X, Y_pred)) 63 | print('Adjusted Rand score (Average): %.3f' % adjusted_rand_score(Y, Y_pred)) 64 | 65 | # Show the clustered dataset 66 | plot_clustered_dataset(X, Y) 67 | 68 | # Ward linkage 69 | print('Ward linkage') 70 | ac = AgglomerativeClustering(n_clusters=8) 71 | Y_pred = ac.fit_predict(X) 72 | 73 | print('Silhouette score (Ward): %.3f' % silhouette_score(X, Y_pred)) 74 | print('Adjusted Rand score (Ward): %.3f' % adjusted_rand_score(Y, Y_pred)) 75 | 76 | # Show the clustered dataset 77 | plot_clustered_dataset(X, Y) 78 | 79 | 80 | -------------------------------------------------------------------------------- /Chapter13/vectorizing.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import stopwords 6 | from nltk.tokenize import RegexpTokenizer 7 | from nltk.stem.snowball import SnowballStemmer 8 | 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | ret = RegexpTokenizer('[a-zA-Z0-9\']+') 15 | sw = set(stopwords.words('english')) 16 | ess = SnowballStemmer('english', ignore_stopwords=True) 17 | 18 | 19 | def tokenizer(sentence): 20 | tokens = ret.tokenize(sentence) 21 | return [ess.stem(t) for t in tokens if t not in sw] 22 | 23 | 24 | if __name__ == '__main__': 25 | # Create a corpus 26 | corpus = [ 27 | 'This is a simple test corpus', 28 | 'A corpus is a set of text documents', 29 | 'We want to analyze the corpus and the documents', 30 | 'Documents can be automatically tokenized' 31 | ] 32 | 33 | # Create a count vectorizer 34 | print('Count vectorizer:') 35 | cv = CountVectorizer() 36 | 37 | vectorized_corpus = cv.fit_transform(corpus) 38 | print(vectorized_corpus.todense()) 39 | 40 | print('CV Vocabulary:') 41 | print(cv.vocabulary_) 42 | 43 | # Perform an inverse transformation 44 | vector = [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1] 45 | print(cv.inverse_transform(vector)) 46 | 47 | # Use a complete external tokenizer 48 | print('CV with external tokenizer:') 49 | cv = CountVectorizer(tokenizer=tokenizer) 50 | vectorized_corpus = cv.fit_transform(corpus) 51 | print(vectorized_corpus.todense()) 52 | 53 | # Use an n-gram range equal to (1, 2) 54 | print('CV witn n-gram range (1, 2):') 55 | cv = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 2)) 56 | vectorized_corpus = cv.fit_transform(corpus) 57 | print(vectorized_corpus.todense()) 58 | 59 | print('N-gram range (1,2) vocabulary:') 60 | print(cv.vocabulary_) 61 | 62 | # Create a Tf-Idf vectorizer 63 | print('Tf-Idf vectorizer:') 64 | tfidfv = TfidfVectorizer() 65 | 
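# With the default settings (use_idf=True, smooth_idf=True, norm='l2'), the matrix
# printed below contains tf-idf weights, i.e. tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
# where n is the number of documents, with each document row rescaled to unit L2 norm.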
vectorized_corpus = tfidfv.fit_transform(corpus) 66 | print(vectorized_corpus.todense()) 67 | 68 | print('Tf-Idf vocabulary:') 69 | print(tfidfv.vocabulary_) 70 | 71 | # Use n-gram range equal to (1, 2) and L2 normalization 72 | print('Tf-Idf witn n-gram range (1, 2) and L2 normalization:') 73 | tfidfv = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1, 2), norm='l2') 74 | vectorized_corpus = tfidfv.fit_transform(corpus) 75 | print(vectorized_corpus.todense()) 76 | 77 | -------------------------------------------------------------------------------- /Chapter14/sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import multiprocessing 5 | import numpy as np 6 | 7 | from nltk.corpus import stopwords 8 | from nltk.tokenize import RegexpTokenizer 9 | from nltk.stem.lancaster import LancasterStemmer 10 | 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.metrics import precision_score, recall_score, roc_curve, auc 15 | 16 | # For reproducibility 17 | np.random.seed(1000) 18 | 19 | # Path to the dataset (http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip) 20 | dataset = 'dataset.csv' 21 | 22 | rt = RegexpTokenizer('[a-zA-Z0-9\.]+') 23 | sw = set(stopwords.words('english')) 24 | ls = LancasterStemmer() 25 | 26 | 27 | def tokenizer(sentence): 28 | tokens = rt.tokenize(sentence) 29 | return [ls.stem(t.lower()) for t in tokens if t not in sw] 30 | 31 | 32 | if __name__ == '__main__': 33 | # Load corpus and labels 34 | corpus = [] 35 | labels = [] 36 | 37 | with open(dataset, 'r', encoding='utf-8') as df: 38 | for i, line in enumerate(df): 39 | if i == 0: 40 | continue 41 | 42 | parts = line.strip().split(',') 43 | labels.append(float(parts[1].strip())) 44 | corpus.append(parts[3].strip()) 45 | 46 | # Vectorize the corpus (only 100000 records) 47 | tfv = TfidfVectorizer(tokenizer=tokenizer, sublinear_tf=True, ngram_range=(1, 2), norm='l2') 48 | X = tfv.fit_transform(corpus[0:100000]) 49 | Y = np.array(labels[0:100000]) 50 | 51 | # Prepare train and test sets 52 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1) 53 | 54 | # Create and train a Random Forest 55 | rf = RandomForestClassifier(n_estimators=20, n_jobs=multiprocessing.cpu_count()) 56 | rf.fit(X_train, Y_train) 57 | 58 | # Compute scores 59 | print('Precision: %.3f' % precision_score(Y_test, rf.predict(X_test))) 60 | print('Recall: %.3f' % recall_score(Y_test, rf.predict(X_test))) 61 | 62 | # Compute the ROC curve 63 | y_score = rf.predict_proba(X_test) 64 | fpr, tpr, thresholds = roc_curve(Y_test, y_score[:, 1]) 65 | 66 | plt.figure(figsize=(8, 8)) 67 | plt.plot(fpr, tpr, color='red', label='Random Forest (AUC: %.2f)' % auc(fpr, tpr)) 68 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 69 | plt.xlim([0.0, 1.0]) 70 | plt.ylim([0.0, 1.01]) 71 | plt.title('ROC Curve') 72 | plt.xlabel('False Positive Rate') 73 | plt.ylabel('True Positive Rate') 74 | plt.legend(loc="lower right") 75 | plt.show() 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Chapter03/data_scaling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import 
matplotlib.pyplot as plt 5 | 6 | from sklearn.preprocessing import StandardScaler, RobustScaler 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Create a dummy dataset 13 | data = np.ndarray(shape=(100, 2)) 14 | 15 | for i in range(100): 16 | data[i, 0] = 2.0 + np.random.normal(1.5, 3.0) 17 | data[i, 1] = 0.5 + np.random.normal(1.5, 3.0) 18 | 19 | # Show the original and the scaled dataset 20 | fig, ax = plt.subplots(1, 2, figsize=(14, 5)) 21 | 22 | ax[0].scatter(data[:, 0], data[:, 1]) 23 | ax[0].set_xlim([-10, 10]) 24 | ax[0].set_ylim([-10, 10]) 25 | ax[0].grid() 26 | ax[0].set_xlabel('X') 27 | ax[0].set_ylabel('Y') 28 | ax[0].set_title('Raw data') 29 | 30 | # Scale data 31 | ss = StandardScaler() 32 | scaled_data = ss.fit_transform(data) 33 | 34 | ax[1].scatter(scaled_data[:, 0], scaled_data[:, 1]) 35 | ax[1].set_xlim([-10, 10]) 36 | ax[1].set_ylim([-10, 10]) 37 | ax[1].grid() 38 | ax[1].set_xlabel('X') 39 | ax[1].set_ylabel('Y') 40 | ax[1].set_title('Scaled data') 41 | 42 | plt.show() 43 | 44 | # Scale data using a Robust Scaler 45 | fig, ax = plt.subplots(2, 2, figsize=(8, 8)) 46 | 47 | ax[0, 0].scatter(data[:, 0], data[:, 1]) 48 | ax[0, 0].set_xlim([-10, 10]) 49 | ax[0, 0].set_ylim([-10, 10]) 50 | ax[0, 0].grid() 51 | ax[0, 0].set_xlabel('X') 52 | ax[0, 0].set_ylabel('Y') 53 | ax[0, 0].set_title('Raw data') 54 | 55 | rs = RobustScaler(quantile_range=(15, 85)) 56 | scaled_data = rs.fit_transform(data) 57 | 58 | ax[0, 1].scatter(scaled_data[:, 0], scaled_data[:, 1]) 59 | ax[0, 1].set_xlim([-10, 10]) 60 | ax[0, 1].set_ylim([-10, 10]) 61 | ax[0, 1].grid() 62 | ax[0, 1].set_xlabel('X') 63 | ax[0, 1].set_ylabel('Y') 64 | ax[0, 1].set_title('Scaled data (15% - 85%)') 65 | 66 | rs1 = RobustScaler(quantile_range=(25, 75)) 67 | scaled_data1 = rs1.fit_transform(data) 68 | 69 | ax[1, 0].scatter(scaled_data1[:, 0], scaled_data1[:, 1]) 70 | ax[1, 0].set_xlim([-10, 10]) 71 | ax[1, 0].set_ylim([-10, 10]) 72 | ax[1, 0].grid() 73 | ax[1, 0].set_xlabel('X') 74 | ax[1, 0].set_ylabel('Y') 75 | ax[1, 0].set_title('Scaled data (25% - 75%)') 76 | 77 | rs2 = RobustScaler(quantile_range=(30, 65)) 78 | scaled_data2 = rs2.fit_transform(data) 79 | 80 | ax[1, 1].scatter(scaled_data2[:, 0], scaled_data2[:, 1]) 81 | ax[1, 1].set_xlim([-10, 10]) 82 | ax[1, 1].set_ylim([-10, 10]) 83 | ax[1, 1].grid() 84 | ax[1, 1].set_xlabel('X') 85 | ax[1, 1].set_ylabel('Y') 86 | ax[1, 1].set_title('Scaled data (30% - 60%)') 87 | 88 | plt.show() 89 | 90 | -------------------------------------------------------------------------------- /Chapter08/voting_classifier.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.svm import SVC 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.ensemble import VotingClassifier 11 | from sklearn.model_selection import cross_val_score 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | nb_samples = 500 17 | 18 | 19 | def compute_accuracies(lr, dt, svc, vc, X, Y): 20 | accuracies = [] 21 | 22 | accuracies.append(cross_val_score(lr, X, Y, scoring='accuracy', cv=10).mean()) 23 | accuracies.append(cross_val_score(dt, X, Y, scoring='accuracy', cv=10).mean()) 24 | accuracies.append(cross_val_score(svc, X, Y, scoring='accuracy', cv=10).mean()) 25 | 
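# Together with the three scores above, the ensemble score below completes the list of
# mean 10-fold CV accuracies in the order Logistic Regression, Decision Tree, SVM,
# Voting ensemble (the same order used for the x-tick labels in plot_accuracies()).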
accuracies.append(cross_val_score(vc, X, Y, scoring='accuracy', cv=10).mean()) 26 | 27 | print('Accuracies:') 28 | print(np.array(accuracies)) 29 | 30 | return accuracies 31 | 32 | 33 | def plot_accuracies(accuracies): 34 | fig, ax = plt.subplots(figsize=(12, 8)) 35 | positions = np.array([0, 1, 2, 3]) 36 | 37 | ax.bar(positions, accuracies, 0.5) 38 | ax.set_ylabel('Accuracy') 39 | ax.set_xticklabels(('Logistic Regression', 'Decision Tree', 'SVM', 'Ensemble')) 40 | ax.set_xticks(positions + (5.0 / 20)) 41 | plt.ylim([0.80, 0.93]) 42 | plt.show() 43 | 44 | 45 | if __name__ == '__main__': 46 | # Create the dataset 47 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, n_classes=2) 48 | 49 | # Show the dataset 50 | fig, ax = plt.subplots(figsize=(12, 12)) 51 | 52 | for i, x in enumerate(X): 53 | if Y[i] == 0: 54 | ax.scatter(x[0], x[1], marker='s', color='blue') 55 | else: 56 | ax.scatter(x[0], x[1], marker='d', color='red') 57 | 58 | ax.set_xlabel(r'$X_0$') 59 | ax.set_ylabel(r'$X_1$') 60 | plt.show() 61 | 62 | # Create the classifiers 63 | lr = LogisticRegression() 64 | svc = SVC(kernel='poly', probability=True) 65 | dt = DecisionTreeClassifier() 66 | 67 | classifiers = [('lr', lr), 68 | ('dt', dt), 69 | ('svc', svc)] 70 | 71 | # Hard voting 72 | vc = VotingClassifier(estimators=classifiers, voting='hard') 73 | 74 | # Compute and plot accuracies 75 | hard_accuracies = compute_accuracies(lr, dt, svc, vc, X, Y) 76 | plot_accuracies(hard_accuracies) 77 | 78 | # Soft weighted voting 79 | weights = [1.5, 0.5, 0.75] 80 | 81 | vc = VotingClassifier(estimators=classifiers, weights=weights, voting='soft') 82 | 83 | # Compute and plot accuracies 84 | soft_accuracies = compute_accuracies(lr, dt, svc, vc, X, Y) 85 | plot_accuracies(soft_accuracies) 86 | 87 | -------------------------------------------------------------------------------- /Chapter08/decision_tree_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.tree import DecisionTreeRegressor, export_graphviz 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | # Download the dataset from: https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength 16 | # Change with the folder where the file is stored 17 | file_path = '/Concrete_Data.xls' 18 | graphviz_path = '/Concrete_Data.dot' 19 | 20 | 21 | if __name__ == '__main__': 22 | # Load the dataset 23 | df = pd.read_excel(file_path, header=0) 24 | X = df.iloc[:, 0:8].values 25 | Y = df.iloc[:, 8].values 26 | 27 | # Print the statistic summary 28 | print(df.describe()) 29 | 30 | # Print the CV scores 31 | print(cross_val_score(DecisionTreeRegressor(criterion='mse', max_depth=11, random_state=1000), X, Y, cv=20)) 32 | 33 | # Create train and test sets 34 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=200, random_state=1000) 35 | 36 | # Train the Decision Tree Regressor 37 | dtr = DecisionTreeRegressor(criterion='mse', max_depth=11, random_state=1000) 38 | dtr.fit(X_train, Y_train) 39 | 40 | # Export the tree in Graphviz format 41 | # You can use http://www.webgraphviz.com to visualize the tree 42 | export_graphviz(dtr, out_file=graphviz_path, 43 | feature_names=['Cement', 'Blast furnace slag', 'Fly ash', 'Water', 44 | 'Superplasticizer', 
'Coarse Aggregate', 'Fine Aggregate', 'Age']) 45 | 46 | print('Training R^2 score: %.3f' % dtr.score(X_train, Y_train)) 47 | print('Validation R^2 score: %.3f' % dtr.score(X_test, Y_test)) 48 | 49 | # Compute the predictions 50 | Y_pred = dtr.predict(X) 51 | 52 | # Show the dataset, predictions and absolute errors 53 | fig, ax = plt.subplots(3, 1, figsize=(18, 15)) 54 | 55 | ax[0].plot(Y) 56 | ax[0].set_title('Original dataset') 57 | ax[0].set_ylabel('Concrete Compressive Strength') 58 | ax[0].grid() 59 | 60 | ax[1].plot(Y_pred) 61 | ax[1].set_title('Predictions') 62 | ax[1].set_xlabel('Sample') 63 | ax[1].set_ylabel('Concrete Compressive Strength') 64 | ax[1].grid() 65 | 66 | ax[2].plot(np.abs(Y_pred - Y)) 67 | ax[2].set_yticks(np.arange(0.0, 81.0, 10.0)) 68 | ax[2].set_xlabel('Sample') 69 | ax[2].set_ylabel('Absolute error') 70 | ax[2].grid() 71 | 72 | plt.show() 73 | 74 | # Show the absolute error histogram 75 | fig, ax = plt.subplots(figsize=(14, 8)) 76 | 77 | ax.hist(np.abs(Y_pred - Y), bins='auto', log=True) 78 | ax.set_xlabel('Absolute error') 79 | ax.set_ylabel('Sample count') 80 | ax.grid() 81 | 82 | plt.show() -------------------------------------------------------------------------------- /Chapter16/mlp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | 7 | from sklearn.datasets import make_classification 8 | from sklearn.model_selection import train_test_split 9 | 10 | from mpl_toolkits.mplot3d import Axes3D 11 | 12 | nb_samples = 1000 13 | nb_features = 3 14 | nb_epochs = 200 15 | batch_size = 50 16 | 17 | # For reproducibility 18 | np.random.seed(1000) 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create the dataset 23 | X, Y = make_classification(n_samples=nb_samples, n_features=nb_features, 24 | n_informative=3, n_redundant=0, n_classes=2, n_clusters_per_class=3) 25 | 26 | # Show the dataset 27 | fig = plt.figure(figsize=(11, 11)) 28 | ax = fig.add_subplot(111, projection='3d') 29 | 30 | for i, x in enumerate(X): 31 | if Y[i] == 0: 32 | ax.scatter(x[0], x[1], x[2], marker='s', color='blue') 33 | elif Y[i] == 1: 34 | ax.scatter(x[0], x[1], x[2], marker='d', color='red') 35 | 36 | ax.set_xlabel(r'$X_0$') 37 | ax.set_ylabel(r'$X_1$') 38 | ax.set_zlabel(r'$X_2$') 39 | plt.show() 40 | 41 | # Create train and test sets 42 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2) 43 | 44 | # Create the graph 45 | graph = tf.Graph() 46 | 47 | with graph.as_default(): 48 | Xt = tf.placeholder(tf.float32, shape=(None, nb_features), name='X') 49 | Yt = tf.placeholder(tf.float32, shape=(None, 1), name='Y') 50 | 51 | layer_1 = tf.layers.dense(inputs=Xt, units=50, activation=tf.nn.tanh) 52 | layer_2 = tf.layers.dense(inputs=layer_1, units=1, activation=tf.nn.sigmoid) 53 | 54 | Yo = tf.round(layer_2) 55 | 56 | loss = tf.nn.l2_loss(layer_2 - Yt) 57 | training_step = tf.train.GradientDescentOptimizer(0.025).minimize(loss) 58 | 59 | session = tf.InteractiveSession(graph=graph) 60 | tf.global_variables_initializer().run() 61 | 62 | # Run the training cycle 63 | for e in range(nb_epochs): 64 | total_loss = 0.0 65 | Xb = np.ndarray(shape=(batch_size, nb_features), dtype=np.float32) 66 | Yb = np.ndarray(shape=(batch_size, 1), dtype=np.float32) 67 | 68 | for i in range(0, X_train.shape[0] - batch_size, batch_size): 69 | Xb[:, :] = X_train[i:i + batch_size, :] 70 | Yb[:, 0] = Y_train[i:i + batch_size] 71 | 72 
| loss_value, _ = session.run([loss, training_step], feed_dict={Xt: Xb, Yt: Yb}) 73 | total_loss += loss_value 74 | 75 | Y_predicted = session.run([Yo], feed_dict={Xt: X_test.reshape((X_test.shape[0], nb_features))}) 76 | accuracy = 1.0 - (np.sum(np.abs(np.array(Y_predicted[0]).squeeze(axis=1) - Y_test)) / float(Y_test.shape[0])) 77 | 78 | print('Epoch %d) Total loss: %.2f - Accuracy: %.2f' % (e, total_loss, accuracy)) 79 | -------------------------------------------------------------------------------- /Chapter11/connectivity_constraints.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | 7 | from sklearn.datasets import make_circles 8 | from sklearn.cluster import AgglomerativeClustering 9 | from sklearn.neighbors import kneighbors_graph 10 | from sklearn.metrics import silhouette_score 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | nb_samples = 3000 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_circles(n_samples=nb_samples, noise=0.05) 22 | 23 | # Show the dataset 24 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 25 | 26 | ax.grid() 27 | ax.set_xlabel('X') 28 | ax.set_ylabel('Y') 29 | 30 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 31 | plt.show() 32 | 33 | # Unstructured clustering with average linkage 34 | print('Unstructured clustering with average linkage') 35 | ac = AgglomerativeClustering(n_clusters=20, linkage='average') 36 | Y_pred = ac.fit_predict(X) 37 | 38 | print('Silhouette score: %.3f' % silhouette_score(X, Y_pred)) 39 | 40 | # Plot the clustered dataset 41 | fig, ax = plt.subplots(1, 1, figsize=(12, 10)) 42 | 43 | ax.grid() 44 | ax.set_xlabel('X') 45 | ax.set_ylabel('Y') 46 | ax.scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=ac.labels_) 47 | plt.show() 48 | 49 | # Connectivity constraints 50 | print('Imposing connectivity constraints') 51 | 52 | acc = [] 53 | k = [50, 100, 200, 500] 54 | 55 | ac = AgglomerativeClustering(n_clusters=20, connectivity=None, linkage='average') 56 | ac.fit(X) 57 | 58 | for i in k: 59 | kng = kneighbors_graph(X, i) 60 | ac1 = AgglomerativeClustering(n_clusters=20, connectivity=kng, linkage='average') 61 | Y_pred = ac1.fit_predict(X) 62 | print('Silhouette score (k=%d): %.3f' % (i, silhouette_score(X, Y_pred))) 63 | acc.append(ac1) 64 | 65 | # Show the four plots 66 | fig, ax = plt.subplots(2, 2, figsize=(14, 10)) 67 | 68 | ax[0, 0].grid() 69 | ax[0, 0].set_title('K = 50') 70 | ax[0, 0].set_xlabel('X') 71 | ax[0, 0].set_ylabel('Y') 72 | ax[0, 0].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[0].labels_) 73 | 74 | ax[0, 1].grid() 75 | ax[0, 1].set_title('K = 100') 76 | ax[0, 1].set_xlabel('X') 77 | ax[0, 1].set_ylabel('Y') 78 | ax[0, 1].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[1].labels_) 79 | 80 | ax[1, 0].grid() 81 | ax[1, 0].set_title('K = 200') 82 | ax[1, 0].set_xlabel('X') 83 | ax[1, 0].set_ylabel('Y') 84 | ax[1, 0].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[2].labels_) 85 | 86 | ax[1, 1].grid() 87 | ax[1, 1].set_title('K = 500') 88 | ax[1, 1].set_xlabel('X') 89 | ax[1, 1].set_ylabel('Y') 90 | ax[1, 1].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[3].labels_) 91 | plt.show() 92 | 93 | -------------------------------------------------------------------------------- /Chapter16/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | 7 | from sklearn.datasets import make_classification 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 500 13 | 14 | if __name__ == '__main__': 15 | # Create the dataset 16 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, n_classes=2) 17 | 18 | # Plot the dataset 19 | fig, ax = plt.subplots(figsize=(9, 7)) 20 | ax.set_xlabel(r'$X_0$') 21 | ax.set_ylabel(r'$X_1$') 22 | 23 | for i, x in enumerate(X): 24 | if Y[i] == 0: 25 | ax.scatter(x[0], x[1], marker='d', color='blue') 26 | else: 27 | ax.scatter(x[0], x[1], marker='s', color='red') 28 | 29 | plt.show() 30 | 31 | # Create the graph 32 | graph = tf.Graph() 33 | 34 | with graph.as_default(): 35 | Xt = tf.placeholder(tf.float32, shape=(None, 2), name='points') 36 | Yt = tf.placeholder(tf.float32, shape=(None, 1), name='classes') 37 | 38 | W = tf.Variable(tf.zeros((2, 1)), name='weights') 39 | bias = tf.Variable(tf.zeros((1, 1)), name='bias') 40 | 41 | Ye = tf.matmul(Xt, W) + bias 42 | Yc = tf.round(tf.sigmoid(Ye)) 43 | 44 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=Ye, labels=Yt)) 45 | training_step = tf.train.GradientDescentOptimizer(0.025).minimize(loss) 46 | 47 | session = tf.InteractiveSession(graph=graph) 48 | tf.global_variables_initializer().run() 49 | 50 | feed_dict = { 51 | Xt: X, 52 | Yt: Y.reshape((nb_samples, 1)) 53 | } 54 | 55 | for i in range(10000): 56 | loss_value, _ = session.run([loss, training_step], feed_dict=feed_dict) 57 | if i % 100 == 0: 58 | print('Step %d, Loss: %.3f' % (i, loss_value)) 59 | 60 | # Retrieve coefficients and intercept 61 | Wc, Wb = W.eval(), bias.eval() 62 | 63 | print('Coefficients:') 64 | print(Wc) 65 | 66 | print('Intercept:') 67 | print(Wb) 68 | 69 | # Plot the dataset with the separating hyperplane 70 | h = .02 71 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 72 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 73 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 74 | 75 | Z = np.array(session.run([Yc], feed_dict={Xt: np.c_[xx.ravel(), yy.ravel()]})) 76 | 77 | # Put the result into a color plot 78 | Z = Z.reshape(xx.shape) 79 | plt.figure(1, figsize=(12, 12)) 80 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1) 81 | 82 | # Plot also the training points 83 | for i, x in enumerate(X): 84 | if Y[i] == 0: 85 | plt.scatter(x[0], x[1], marker='d', color='blue') 86 | else: 87 | plt.scatter(x[0], x[1], marker='s', color='red') 88 | 89 | plt.xlabel(r'$X_0$') 90 | plt.ylabel(r'$X_1$') 91 | 92 | plt.xlim(xx.min(), xx.max()) 93 | plt.ylim(yy.min(), yy.max()) 94 | plt.xticks(()) 95 | plt.yticks(()) 96 | 97 | plt.show() -------------------------------------------------------------------------------- /Chapter16/dcn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from keras.datasets import mnist 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation, Dropout, Conv2D, AveragePooling2D, Flatten 9 | from keras.optimizers import Adam 10 | from keras.utils import to_categorical 11 | 12 | 13 | # Set random seed for reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | if __name__ == '__main__': 
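# The steps below load MNIST through keras.datasets, rescale the pixel values to the
# [0, 1] range, one-hot encode the labels and train a dropout-regularized convolutional
# network with average pooling and a softmax output.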
18 | # Load the dataset 19 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 20 | 21 | width = height = X_train.shape[1] 22 | 23 | X_train = X_train.reshape((X_train.shape[0], width, height, 1)).astype(np.float32) / 255.0 24 | X_test = X_test.reshape((X_test.shape[0], width, height, 1)).astype(np.float32) / 255.0 25 | 26 | Y_train = to_categorical(Y_train, num_classes=10) 27 | Y_test = to_categorical(Y_test, num_classes=10) 28 | 29 | # Create the model 30 | model = Sequential() 31 | 32 | model.add(Dropout(0.25, input_shape=(width, height, 1), seed=1000)) 33 | 34 | model.add(Conv2D(16, kernel_size=(3, 3), padding='same')) 35 | model.add(Activation('relu')) 36 | model.add(Dropout(0.5, seed=1000)) 37 | 38 | model.add(Conv2D(16, kernel_size=(3, 3), padding='same')) 39 | model.add(Activation('relu')) 40 | model.add(Dropout(0.5, seed=1000)) 41 | 42 | model.add(AveragePooling2D(pool_size=(2, 2), padding='same')) 43 | 44 | model.add(Conv2D(32, kernel_size=(3, 3), padding='same')) 45 | model.add(Activation('relu')) 46 | 47 | model.add(AveragePooling2D(pool_size=(2, 2), padding='same')) 48 | 49 | model.add(Conv2D(64, kernel_size=(3, 3), padding='same')) 50 | model.add(Activation('relu')) 51 | model.add(Dropout(0.5, seed=1000)) 52 | 53 | model.add(AveragePooling2D(pool_size=(2, 2), padding='same')) 54 | 55 | model.add(Flatten()) 56 | 57 | model.add(Dense(512)) 58 | model.add(Activation('relu')) 59 | model.add(Dropout(0.5, seed=1000)) 60 | 61 | model.add(Dense(10)) 62 | model.add(Activation('softmax')) 63 | 64 | # Compile the model 65 | model.compile(optimizer=Adam(lr=0.001, decay=1e-5), 66 | loss='categorical_crossentropy', 67 | metrics=['accuracy']) 68 | 69 | history = model.fit(X_train, Y_train, 70 | epochs=200, 71 | batch_size=256, 72 | validation_data=(X_test, Y_test)) 73 | 74 | # Show the results 75 | fig, ax = plt.subplots(1, 2, figsize=(18, 6)) 76 | 77 | ax[0].plot(history.history['acc'], label='Training accuracy') 78 | ax[0].plot(history.history['val_acc'], label='Validation accuracy') 79 | ax[0].set_xlabel('Epoch') 80 | ax[0].set_ylabel('Accuracy') 81 | ax[0].legend() 82 | ax[0].grid() 83 | 84 | ax[1].plot(history.history['loss'], label='Training loss') 85 | ax[1].plot(history.history['val_loss'], label='Validation loss') 86 | ax[1].set_xlabel('Epoch') 87 | ax[1].set_ylabel('Loss') 88 | ax[1].set_yticks(np.linspace(0.0, 1.0, 10)) 89 | ax[1].legend() 90 | ax[1].grid() 91 | plt.show() -------------------------------------------------------------------------------- /Chapter17/pipeline_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA, NMF 8 | from sklearn.feature_selection import SelectKBest, f_classif 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.svm import SVC 14 | 15 | # For reproducibility 16 | np.random.seed(1000) 17 | 18 | 19 | if __name__ == '__main__': 20 | warnings.simplefilter("ignore") 21 | 22 | # Load the dataset 23 | digits = load_digits() 24 | 25 | # Create the steps for the pipeline 26 | pca = PCA() 27 | nmf = NMF() 28 | scaler = StandardScaler() 29 | kbest = SelectKBest(f_classif) 30 | lr = LogisticRegression() 31 | svc = SVC() 32 | 33 | pipeline_steps = [ 34 | 
('dimensionality_reduction', pca), 35 | ('normalization', scaler), 36 | ('classification', lr) 37 | ] 38 | 39 | # Create the pipeline 40 | pipeline = Pipeline(pipeline_steps) 41 | 42 | # Perform a grid search 43 | pca_nmf_components = [10, 20, 30] 44 | 45 | param_grid = [ 46 | { 47 | 'dimensionality_reduction': [pca], 48 | 'dimensionality_reduction__n_components': pca_nmf_components, 49 | 'classification': [lr], 50 | 'classification__C': [1, 5, 10, 20] 51 | }, 52 | { 53 | 'dimensionality_reduction': [pca], 54 | 'dimensionality_reduction__n_components': pca_nmf_components, 55 | 'classification': [svc], 56 | 'classification__kernel': ['rbf', 'poly'], 57 | 'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0], 58 | 'classification__degree': [2, 3, 5], 59 | 'classification__C': [1, 5, 10, 20] 60 | }, 61 | { 62 | 'dimensionality_reduction': [nmf], 63 | 'dimensionality_reduction__n_components': pca_nmf_components, 64 | 'classification': [lr], 65 | 'classification__C': [1, 5, 10, 20] 66 | }, 67 | { 68 | 'dimensionality_reduction': [nmf], 69 | 'dimensionality_reduction__n_components': pca_nmf_components, 70 | 'classification': [svc], 71 | 'classification__kernel': ['rbf', 'poly'], 72 | 'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0], 73 | 'classification__degree': [2, 3, 5], 74 | 'classification__C': [1, 5, 10, 20] 75 | }, 76 | { 77 | 'dimensionality_reduction': [kbest], 78 | 'classification': [svc], 79 | 'classification__kernel': ['rbf', 'poly'], 80 | 'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0], 81 | 'classification__degree': [2, 3, 5], 82 | 'classification__C': [1, 5, 10, 20] 83 | }, 84 | ] 85 | 86 | gs = GridSearchCV(pipeline, param_grid) 87 | gs.fit(digits.data, digits.target) 88 | 89 | print('Best estimator:') 90 | print(gs.best_estimator_) 91 | 92 | print('Best score:') 93 | print(gs.best_score_) 94 | -------------------------------------------------------------------------------- /Chapter14/plsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import brown 6 | 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | rank = 2 14 | alpha_1 = 1000.0 15 | alpha_2 = 10.0 16 | 17 | # Compose a corpus 18 | sentences_1 = brown.sents(categories=['editorial'])[0:20] 19 | sentences_2 = brown.sents(categories=['fiction'])[0:20] 20 | corpus = [] 21 | 22 | for s in sentences_1 + sentences_2: 23 | corpus.append(' '.join(s)) 24 | 25 | # Vectorize the corpus 26 | cv = CountVectorizer(strip_accents='unicode', stop_words='english') 27 | Xc = np.array(cv.fit_transform(corpus).todense()) 28 | 29 | # Define the probability matrices 30 | Ptd = np.random.uniform(0.0, 1.0, size=(len(corpus), rank)) 31 | Pwt = np.random.uniform(0.0, 1.0, size=(rank, len(cv.vocabulary_))) 32 | Ptdw = np.zeros(shape=(len(cv.vocabulary_), len(corpus), rank)) 33 | 34 | # Normalize the probability matrices 35 | for d in range(len(corpus)): 36 | nf = np.sum(Ptd[d, :]) 37 | for t in range(rank): 38 | Ptd[d, t] /= nf 39 | 40 | for t in range(rank): 41 | nf = np.sum(Pwt[t, :]) 42 | for w in range(len(cv.vocabulary_)): 43 | Pwt[t, w] /= nf 44 | 45 | 46 | def log_likelihood(): 47 | value = 0.0 48 | 49 | for d in range(len(corpus)): 50 | for w in range(len(cv.vocabulary_)): 51 | real_topic_value = 0.0 52 | 53 | for t in range(rank): 54 | real_topic_value += Ptd[d, t] * Pwt[t, w] 55 | 56 | if real_topic_value > 0.0: 57 | value += 
Xc[d, w] * np.log(real_topic_value) 58 | 59 | return value 60 | 61 | 62 | def expectation(): 63 | global Ptd, Pwt, Ptdw 64 | 65 | for d in range(len(corpus)): 66 | for w in range(len(cv.vocabulary_)): 67 | nf = 0.0 68 | 69 | for t in range(rank): 70 | Ptdw[w, d, t] = Ptd[d, t] * Pwt[t, w] 71 | nf += Ptdw[w, d, t] 72 | 73 | Ptdw[w, d, :] = (Ptdw[w, d, :] / nf) if nf != 0.0 else 0.0 74 | 75 | 76 | def maximization(): 77 | global Ptd, Pwt, Ptdw 78 | 79 | for t in range(rank): 80 | nf = 0.0 81 | 82 | for d in range(len(corpus)): 83 | ps = 0.0 84 | 85 | for w in range(len(cv.vocabulary_)): 86 | ps += Xc[d, w] * Ptdw[w, d, t] 87 | 88 | Pwt[t, w] = ps 89 | nf += Pwt[t, w] 90 | 91 | Pwt[:, w] /= nf if nf != 0.0 else alpha_1 92 | 93 | for d in range(len(corpus)): 94 | for t in range(rank): 95 | ps = 0.0 96 | nf = 0.0 97 | 98 | for w in range(len(cv.vocabulary_)): 99 | ps += Xc[d, w] * Ptdw[w, d, t] 100 | nf += Xc[d, w] 101 | 102 | Ptd[d, t] = ps / (nf if nf != 0.0 else alpha_2) 103 | 104 | 105 | if __name__ == '__main__': 106 | print('Initial Log-Likelihood: %f' % log_likelihood()) 107 | 108 | for i in range(30): 109 | expectation() 110 | maximization() 111 | print('Step %d - Log-Likelihood: %f' % (i, log_likelihood())) 112 | 113 | # Show the top 5 words per topic 114 | Pwts = np.argsort(Pwt, axis=1)[::-1] 115 | 116 | for t in range(rank): 117 | print('\nTopic ' + str(t)) 118 | for i in range(5): 119 | print(cv.get_feature_names()[Pwts[t, i]]) -------------------------------------------------------------------------------- /Chapter16/lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from keras.models import Sequential 7 | from keras.layers import LSTM, Dense, Activation 8 | from keras.optimizers import Adam 9 | 10 | # Install with pip -U install datapackage. 
For further information: https://datahub.io/core/global-temp#python 11 | from datapackage import Package 12 | 13 | from sklearn.preprocessing import MinMaxScaler 14 | 15 | 16 | # Set random seed for reproducibility 17 | np.random.seed(1000) 18 | 19 | 20 | nb_samples = 1600 21 | nb_test_samples = 200 22 | sequence_length = 20 23 | 24 | 25 | if __name__ == '__main__': 26 | # Load the dataset 27 | package = Package('https://datahub.io/core/global-temp/datapackage.json') 28 | 29 | for resource in package.resources: 30 | if resource.descriptor['datahub']['type'] == 'derived/csv': 31 | data = resource.read() 32 | 33 | # Extract the time series 34 | data_gcag = data[0:len(data):2][::-1] 35 | 36 | Y = np.zeros(shape=(len(data_gcag), 1), dtype=np.float32) 37 | 38 | for i, y in enumerate(data_gcag): 39 | Y[i - 1, 0] = y[2] 40 | 41 | # Scale between -1.0 and 1.0 42 | mmscaler = MinMaxScaler((-1.0, 1.0)) 43 | Y = mmscaler.fit_transform(Y) 44 | 45 | # Show the time-series 46 | fig, ax = plt.subplots(figsize=(20, 10)) 47 | 48 | ax.plot(Y) 49 | ax.grid() 50 | ax.set_xlabel('Time steps') 51 | ax.set_ylabel('Monthly Avg Temperature Anomaly') 52 | 53 | plt.show() 54 | 55 | # Create the training and test sets 56 | X_ts = np.zeros(shape=(nb_samples - sequence_length, sequence_length, 1), dtype=np.float32) 57 | Y_ts = np.zeros(shape=(nb_samples - sequence_length, 1), dtype=np.float32) 58 | 59 | for i in range(0, nb_samples - sequence_length): 60 | X_ts[i] = Y[i:i + sequence_length] 61 | Y_ts[i] = Y[i + sequence_length] 62 | 63 | X_ts_train = X_ts[0:nb_samples - nb_test_samples, :] 64 | Y_ts_train = Y_ts[0:nb_samples - nb_test_samples] 65 | 66 | X_ts_test = X_ts[nb_samples - nb_test_samples:, :] 67 | Y_ts_test = Y_ts[nb_samples - nb_test_samples:] 68 | 69 | # Create the model 70 | model = Sequential() 71 | 72 | model.add(LSTM(8, stateful=True, batch_input_shape=(20, sequence_length, 1))) 73 | 74 | model.add(Dense(1)) 75 | model.add(Activation('linear')) 76 | 77 | # Compile the model 78 | model.compile(optimizer=Adam(lr=0.001, decay=0.0001), 79 | loss='mse', 80 | metrics=['mse']) 81 | 82 | # Train the model 83 | model.fit(X_ts_train, Y_ts_train, 84 | batch_size=20, 85 | epochs=100, 86 | shuffle=False, 87 | validation_data=(X_ts_test, Y_ts_test)) 88 | 89 | # Show the predictions on the training set 90 | fig, ax = plt.subplots(figsize=(20, 10)) 91 | 92 | ax.plot(Y_ts_train, label='True values') 93 | ax.plot(model.predict(X_ts_train, batch_size=20), label='Predicted values') 94 | ax.grid() 95 | ax.set_xlabel('Time steps') 96 | ax.set_ylabel('Monthly Avg Temperature Anomaly') 97 | ax.legend() 98 | 99 | plt.show() 100 | 101 | # Show the predictions on the test set 102 | fig, ax = plt.subplots(figsize=(20, 10)) 103 | 104 | ax.plot(Y_ts_test, label='True values') 105 | ax.plot(model.predict(X_ts_test, batch_size=20), label='Predicted values') 106 | ax.grid() 107 | ax.set_xlabel('Time steps') 108 | ax.set_ylabel('Monthly Avg Temperature Anomaly') 109 | ax.legend() 110 | 111 | plt.show() 112 | 113 | -------------------------------------------------------------------------------- /Chapter08/dt.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="C <= -0.367\ngini = 0.667\nsamples = 500\nvalue = [168, 165, 167]\nclass = C1"] ; 4 | 1 [label="A <= -0.447\ngini = 0.078\nsamples = 172\nvalue = [0, 7, 165]\nclass = C3"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="gini = 0.0\nsamples = 4\nvalue = [0, 4, 
0]\nclass = C2"] ; 7 | 1 -> 2 ; 8 | 3 [label="A <= -0.171\ngini = 0.035\nsamples = 168\nvalue = [0, 3, 165]\nclass = C3"] ; 9 | 1 -> 3 ; 10 | 4 [label="A <= -0.212\ngini = 0.48\nsamples = 5\nvalue = [0, 2, 3]\nclass = C3"] ; 11 | 3 -> 4 ; 12 | 5 [label="gini = 0.0\nsamples = 3\nvalue = [0, 0, 3]\nclass = C3"] ; 13 | 4 -> 5 ; 14 | 6 [label="gini = 0.0\nsamples = 2\nvalue = [0, 2, 0]\nclass = C2"] ; 15 | 4 -> 6 ; 16 | 7 [label="A <= -0.016\ngini = 0.012\nsamples = 163\nvalue = [0, 1, 162]\nclass = C3"] ; 17 | 3 -> 7 ; 18 | 8 [label="A <= -0.025\ngini = 0.219\nsamples = 8\nvalue = [0, 1, 7]\nclass = C3"] ; 19 | 7 -> 8 ; 20 | 9 [label="gini = 0.0\nsamples = 7\nvalue = [0, 0, 7]\nclass = C3"] ; 21 | 8 -> 9 ; 22 | 10 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 23 | 8 -> 10 ; 24 | 11 [label="gini = 0.0\nsamples = 155\nvalue = [0, 0, 155]\nclass = C3"] ; 25 | 7 -> 11 ; 26 | 12 [label="B <= -0.299\ngini = 0.506\nsamples = 328\nvalue = [168, 158, 2]\nclass = C1"] ; 27 | 0 -> 12 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; 28 | 13 [label="C <= 2.115\ngini = 0.109\nsamples = 173\nvalue = [163, 10, 0]\nclass = C1"] ; 29 | 12 -> 13 ; 30 | 14 [label="A <= -1.63\ngini = 0.047\nsamples = 167\nvalue = [163, 4, 0]\nclass = C1"] ; 31 | 13 -> 14 ; 32 | 15 [label="C <= 0.942\ngini = 0.238\nsamples = 29\nvalue = [25, 4, 0]\nclass = C1"] ; 33 | 14 -> 15 ; 34 | 16 [label="gini = 0.0\nsamples = 24\nvalue = [24, 0, 0]\nclass = C1"] ; 35 | 15 -> 16 ; 36 | 17 [label="A <= -1.7\ngini = 0.32\nsamples = 5\nvalue = [1, 4, 0]\nclass = C2"] ; 37 | 15 -> 17 ; 38 | 18 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0, 0]\nclass = C1"] ; 39 | 17 -> 18 ; 40 | 19 [label="gini = 0.0\nsamples = 4\nvalue = [0, 4, 0]\nclass = C2"] ; 41 | 17 -> 19 ; 42 | 20 [label="gini = 0.0\nsamples = 138\nvalue = [138, 0, 0]\nclass = C1"] ; 43 | 14 -> 20 ; 44 | 21 [label="gini = 0.0\nsamples = 6\nvalue = [0, 6, 0]\nclass = C2"] ; 45 | 13 -> 21 ; 46 | 22 [label="A <= -0.19\ngini = 0.087\nsamples = 155\nvalue = [5, 148, 2]\nclass = C2"] ; 47 | 12 -> 22 ; 48 | 23 [label="B <= -0.154\ngini = 0.052\nsamples = 151\nvalue = [3, 147, 1]\nclass = C2"] ; 49 | 22 -> 23 ; 50 | 24 [label="C <= 0.802\ngini = 0.32\nsamples = 10\nvalue = [2, 8, 0]\nclass = C2"] ; 51 | 23 -> 24 ; 52 | 25 [label="C <= 0.269\ngini = 0.444\nsamples = 3\nvalue = [2, 1, 0]\nclass = C1"] ; 53 | 24 -> 25 ; 54 | 26 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 55 | 25 -> 26 ; 56 | 27 [label="gini = 0.0\nsamples = 2\nvalue = [2, 0, 0]\nclass = C1"] ; 57 | 25 -> 27 ; 58 | 28 [label="gini = 0.0\nsamples = 7\nvalue = [0, 7, 0]\nclass = C2"] ; 59 | 24 -> 28 ; 60 | 29 [label="C <= 0.195\ngini = 0.028\nsamples = 141\nvalue = [1, 139, 1]\nclass = C2"] ; 61 | 23 -> 29 ; 62 | 30 [label="C <= 0.178\ngini = 0.194\nsamples = 19\nvalue = [1, 17, 1]\nclass = C2"] ; 63 | 29 -> 30 ; 64 | 31 [label="C <= 0.135\ngini = 0.105\nsamples = 18\nvalue = [0, 17, 1]\nclass = C2"] ; 65 | 30 -> 31 ; 66 | 32 [label="gini = 0.0\nsamples = 16\nvalue = [0, 16, 0]\nclass = C2"] ; 67 | 31 -> 32 ; 68 | 33 [label="C <= 0.156\ngini = 0.5\nsamples = 2\nvalue = [0, 1, 1]\nclass = C2"] ; 69 | 31 -> 33 ; 70 | 34 [label="gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = C3"] ; 71 | 33 -> 34 ; 72 | 35 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 73 | 33 -> 35 ; 74 | 36 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0, 0]\nclass = C1"] ; 75 | 30 -> 36 ; 76 | 37 [label="gini = 0.0\nsamples = 122\nvalue = [0, 122, 0]\nclass = C2"] ; 77 | 29 -> 37 ; 78 | 38 
[label="C <= 0.8\ngini = 0.625\nsamples = 4\nvalue = [2, 1, 1]\nclass = C1"] ; 79 | 22 -> 38 ; 80 | 39 [label="B <= 1.822\ngini = 0.5\nsamples = 2\nvalue = [0, 1, 1]\nclass = C2"] ; 81 | 38 -> 39 ; 82 | 40 [label="gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = C3"] ; 83 | 39 -> 40 ; 84 | 41 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 85 | 39 -> 41 ; 86 | 42 [label="gini = 0.0\nsamples = 2\nvalue = [2, 0, 0]\nclass = C1"] ; 87 | 38 -> 42 ; 88 | } -------------------------------------------------------------------------------- /Chapter09/gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from matplotlib.patches import Ellipse 7 | 8 | from sklearn.datasets import make_blobs 9 | from sklearn.mixture import GaussianMixture 10 | 11 | 12 | # Set random seed for reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | # Total number of samples 17 | nb_samples = 800 18 | 19 | 20 | if __name__ == '__main__': 21 | # Create the dataset 22 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=2.2, random_state=1000) 23 | 24 | # Show the original dataset 25 | fig, ax = plt.subplots(figsize=(15, 8)) 26 | 27 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], c='r', s=20, marker='p', label='Class 0') 28 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], c='g', s=20, marker='d', label='Class 1') 29 | ax.scatter(X[Y == 2, 0], X[Y == 2, 1], c='b', s=20, marker='s', label='Class 2') 30 | ax.set_xlabel(r'$x_0$') 31 | ax.set_ylabel(r'$x_1$') 32 | ax.legend() 33 | ax.grid() 34 | 35 | plt.show() 36 | 37 | # Create a fit a Gaussian Mixture model 38 | gm = GaussianMixture(n_components=3, max_iter=1000, random_state=1000) 39 | gm.fit(X) 40 | 41 | # Print means, covariances, and weights 42 | print('Means:\n') 43 | print(gm.means_) 44 | 45 | print('\nCovariances:\n') 46 | print(gm.covariances_) 47 | 48 | print('\nWeights:\n') 49 | print(gm.weights_) 50 | 51 | # Show the clustered dataset with the final Gaussian distributions 52 | fig, ax = plt.subplots(figsize=(15, 8)) 53 | 54 | c = gm.covariances_ 55 | m = gm.means_ 56 | 57 | g1 = Ellipse(xy=m[0], width=4 * np.sqrt(c[0][0, 0]), height=4 * np.sqrt(c[0][1, 1]), fill=False, linestyle='dashed', 58 | linewidth=2) 59 | g1_1 = Ellipse(xy=m[0], width=3 * np.sqrt(c[0][0, 0]), height=3 * np.sqrt(c[0][1, 1]), fill=False, 60 | linestyle='dashed', linewidth=3) 61 | g1_2 = Ellipse(xy=m[0], width=1.5 * np.sqrt(c[0][0, 0]), height=1.5 * np.sqrt(c[0][1, 1]), fill=False, 62 | linestyle='dashed', linewidth=4) 63 | 64 | g2 = Ellipse(xy=m[1], width=4 * np.sqrt(c[1][0, 0]), height=4 * np.sqrt(c[1][1, 1]), fill=False, linestyle='dashed', 65 | linewidth=2) 66 | g2_1 = Ellipse(xy=m[1], width=3 * np.sqrt(c[1][0, 0]), height=3 * np.sqrt(c[1][1, 1]), fill=False, 67 | linestyle='dashed', linewidth=3) 68 | g2_2 = Ellipse(xy=m[1], width=1.5 * np.sqrt(c[1][0, 0]), height=1.5 * np.sqrt(c[1][1, 1]), fill=False, 69 | linestyle='dashed', linewidth=4) 70 | 71 | g3 = Ellipse(xy=m[2], width=4 * np.sqrt(c[2][0, 0]), height=4 * np.sqrt(c[2][1, 1]), fill=False, linestyle='dashed', 72 | linewidth=2) 73 | g3_1 = Ellipse(xy=m[2], width=3 * np.sqrt(c[2][0, 0]), height=3 * np.sqrt(c[2][1, 1]), fill=False, 74 | linestyle='dashed', linewidth=3) 75 | g3_2 = Ellipse(xy=m[2], width=1.5 * np.sqrt(c[2][0, 0]), height=1.5 * np.sqrt(c[2][1, 1]), fill=False, 76 | linestyle='dashed', linewidth=4) 77 | 78 | ax.add_artist(g1) 79 | 
ax.add_artist(g1_1) 80 | ax.add_artist(g1_2) 81 | ax.add_artist(g2) 82 | ax.add_artist(g2_1) 83 | ax.add_artist(g2_2) 84 | ax.add_artist(g3) 85 | ax.add_artist(g3_1) 86 | ax.add_artist(g3_2) 87 | 88 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], c='r', s=20, marker='p', label='Class 0') 89 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], c='g', s=20, marker='d', label='Class 1') 90 | ax.scatter(X[Y == 2, 0], X[Y == 2, 1], c='b', s=20, marker='s', label='Class 2') 91 | ax.set_xlabel(r'$x_0$') 92 | ax.set_ylabel(r'$x_1$') 93 | ax.legend() 94 | ax.grid() 95 | 96 | plt.show() 97 | 98 | # Compute AICs and BICs 99 | nb_components = [2, 3, 4, 5, 6, 7, 8] 100 | 101 | aics = [] 102 | bics = [] 103 | 104 | for n in nb_components: 105 | gm = GaussianMixture(n_components=n, max_iter=1000, random_state=1000) 106 | gm.fit(X) 107 | aics.append(gm.aic(X)) 108 | bics.append(gm.bic(X)) 109 | 110 | fig, ax = plt.subplots(2, 1, figsize=(15, 8)) 111 | 112 | ax[0].plot(nb_components, aics) 113 | ax[0].set_ylabel('AIC') 114 | ax[0].grid() 115 | 116 | ax[1].plot(nb_components, bics) 117 | ax[1].set_xlabel('Number of components') 118 | ax[1].set_ylabel('BIC') 119 | ax[1].grid() 120 | 121 | plt.show() -------------------------------------------------------------------------------- /Chapter15/mlp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.model_selection import cross_val_score, train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.utils import shuffle 10 | 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Activation 13 | from keras.utils import to_categorical 14 | 15 | 16 | # For reproducibility 17 | np.random.seed(1000) 18 | 19 | nb_samples = 2000 20 | 21 | 22 | if __name__ == '__main__': 23 | # Create the dataset 24 | X = np.zeros(shape=(nb_samples, 2), dtype=np.float32) 25 | Y = np.zeros(shape=(nb_samples,), dtype=np.float32) 26 | 27 | t = 15.0 * np.random.uniform(0.0, 1.0, size=(int(nb_samples / 2), 1)) 28 | 29 | X[0:int(nb_samples / 2), :] = t * np.hstack([-np.cos(t), np.sin(t)]) + \ 30 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 31 | Y[0:int(nb_samples / 2)] = 0 32 | 33 | X[int(nb_samples / 2):, :] = t * np.hstack([np.cos(t), -np.sin(t)]) + \ 34 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 35 | Y[int(nb_samples / 2):] = 1 36 | 37 | ss = StandardScaler() 38 | X = ss.fit_transform(X) 39 | 40 | X, Y = shuffle(X, Y, random_state=1000) 41 | 42 | # Show the dataset 43 | fig, ax = plt.subplots(figsize=(8, 8)) 44 | 45 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 0') 46 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 1') 47 | ax.set_xlabel(r'$x_0$') 48 | ax.set_ylabel(r'$x_1$') 49 | ax.legend() 50 | ax.grid() 51 | 52 | plt.show() 53 | 54 | # Perform a Logistic Regression cross-validation 55 | lr = LogisticRegression(penalty='l2', C=0.01, random_state=1000) 56 | print(np.mean(cross_val_score(lr, X, Y, cv=10))) 57 | 58 | # Show the classification result 59 | lr.fit(X, Y) 60 | Y_pred_lr = lr.predict(X) 61 | 62 | fig, ax = plt.subplots(figsize=(8, 8)) 63 | 64 | ax.scatter(X[Y_pred_lr == 0, 0], X[Y_pred_lr == 0, 1], label='Class 0') 65 | ax.scatter(X[Y_pred_lr == 1, 0], X[Y_pred_lr == 1, 1], label='Class 1') 66 | ax.set_xlabel(r'$x_0$') 67 | ax.set_ylabel(r'$x_1$') 68 | ax.legend() 69 | ax.grid() 70 | 71 | 
plt.show() 72 | 73 | # Create a Keras model 74 | model = Sequential() 75 | 76 | model.add(Dense(64, input_dim=2)) 77 | model.add(Activation('relu')) 78 | 79 | model.add(Dense(32)) 80 | model.add(Activation('relu')) 81 | 82 | model.add(Dense(16)) 83 | model.add(Activation('relu')) 84 | 85 | model.add(Dense(2)) 86 | model.add(Activation('softmax')) 87 | 88 | # Compile the model 89 | model.compile(optimizer='adam', 90 | loss='categorical_crossentropy', 91 | metrics=['accuracy']) 92 | 93 | # Split the dataset into train and test sets 94 | X_train, X_test, Y_train, Y_test = \ 95 | train_test_split(X, to_categorical(Y), test_size=0.2, random_state=1000) 96 | 97 | # Train the model 98 | model.fit(X_train, Y_train, 99 | epochs=100, 100 | batch_size=32, 101 | validation_data=(X_test, Y_test)) 102 | 103 | # Show the classification result 104 | Y_pred_mlp = np.argmax(model.predict(X), axis=1) 105 | 106 | fig, ax = plt.subplots(figsize=(8, 8)) 107 | 108 | ax.scatter(X[Y_pred_mlp == 0, 0], X[Y_pred_mlp == 0, 1], label='Class 0') 109 | ax.scatter(X[Y_pred_mlp == 1, 0], X[Y_pred_mlp == 1, 1], label='Class 1') 110 | ax.set_xlabel(r'$x_0$') 111 | ax.set_ylabel(r'$x_1$') 112 | ax.legend() 113 | ax.grid() 114 | 115 | plt.show() 116 | 117 | # Show the decision surfaces 118 | Xm = np.linspace(-2.0, 2.0, 1000) 119 | Ym = np.linspace(-2.0, 2.0, 1000) 120 | Xmg, Ymg = np.meshgrid(Xm, Ym) 121 | X_eval = np.vstack([Xmg.ravel(), Ymg.ravel()]).T 122 | 123 | Y_eval_lr = lr.predict(X_eval) 124 | Y_eval_mlp = np.argmax(model.predict(X_eval), axis=1) 125 | 126 | fig, ax = plt.subplots(1, 2, figsize=(16, 8)) 127 | 128 | ax[0].scatter(X_eval[Y_eval_lr == 0, 0], X_eval[Y_eval_lr == 0, 1]) 129 | ax[0].scatter(X_eval[Y_eval_lr == 1, 0], X_eval[Y_eval_lr == 1, 1]) 130 | ax[0].set_xlabel(r'$x_0$') 131 | ax[0].set_ylabel(r'$x_1$') 132 | ax[0].set_title('Logistic Regression') 133 | 134 | ax[1].scatter(X_eval[Y_eval_mlp == 0, 0], X_eval[Y_eval_mlp == 0, 1]) 135 | ax[1].scatter(X_eval[Y_eval_mlp == 1, 0], X_eval[Y_eval_mlp == 1, 1]) 136 | ax[1].set_xlabel(r'$x_0$') 137 | ax[1].set_ylabel(r'$x_1$') 138 | ax[1].set_title('MLP') 139 | 140 | plt.show() 141 | 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Algorithms Second Edition 2 | 3 | 4 | 5 | This is the code repository for [Machine Learning Algorithms Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms-second-edition?utm_source=github&utm_medium=reposiory), published by Packt. 6 | 7 | **Popular algorithms for data science and machine learning** 8 | 9 | ## What is this book about? 10 | Machine learning has gained tremendous popularity thanks to the fast, accurate predictions it can deliver on large datasets. The real force behind this output, however, is a set of complex algorithms, grounded in statistical analysis, that churn through the data and generate substantial insight.
11 | 12 | This book covers the following exciting features: 13 | * Study feature selection and the feature engineering process 14 | * Assess performance and error trade-offs for linear regression 15 | * Build a data model and understand how it works by using different types of algorithms 16 | * Learn to tune the parameters of Support Vector Machines (SVM) 17 | * Explore the concept of natural language processing (NLP) and recommendation systems 18 | 19 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789347998) today! 20 | 21 | 22 | 23 | 24 | 25 | ## Instructions and Navigations 26 | All of the code is organized into folders. For example, Chapter02. 27 | 28 | The code will look like the following: 29 | ``` 30 | from sklearn.svm import SVC 31 | from sklearn.model_selection import cross_val_score 32 | svc = SVC(kernel='linear') 33 | print(cross_val_score(svc, X, Y, scoring='accuracy', cv=10).mean()) 34 | 0.93191356542617032 35 | ``` 36 | 37 | **Following is what you need for this book:** 38 | Machine Learning Algorithms is for you if you are a machine learning engineer, data engineer, or junior data scientist who wants to advance in the field of predictive analytics and machine learning. Familiarity with R and Python will be an added advantage for getting the best from this book. 39 | 40 | With the following software and hardware list, you can run all of the code files present in the book (Chapters 2-17). 41 | 42 | ### Software and Hardware List 43 | 44 | | Chapter | Software required | OS required | 45 | | -------- | ------------------------------------| -----------------------------------| 46 | | 2-17 | Python 2.7/3.5, SciPy 0.18, | Windows, Mac OS X, and Linux (Any) | 47 | | | NumPy 1.11+, Matplotlib 2.0, | | 48 | | | scikit-learn 0.18+, Crab, | | 49 | | | Apache Spark 2+, NLTK, langdetect, | | 50 | | | Gensim, Keras 2+, CuPy | | 51 | 52 | 53 | 54 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/MachineLearningAlgorithmsSecondEdition_ColorImages.pdf). 55 | 56 | 57 | ### Related products 58 | * Mastering Machine Learning Algorithms [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781788621113) [[Amazon]](https://www.amazon.com/dp/1788621115) 59 | 60 | * Python Deep Learning [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/python-deep-learning?utm_source=github&utm_medium=repository&utm_campaign=9781786464453) [[Amazon]](https://www.amazon.com/dp/1786464454) 61 | 62 | ## Get to Know the Author 63 | **Giuseppe Bonaccorso** 64 | is an experienced team leader/manager in AI, machine/deep learning solution design, management, and delivery. He received his MScEng in electronics in 2005 from the University of Catania, Italy, and continued his studies at the University of Rome Tor Vergata and the University of Essex, UK. His main interests include machine/deep learning, reinforcement learning, big data, bio-inspired adaptive systems, cryptocurrencies, and NLP.
65 | 66 | 67 | 68 | ## Other books by the author 69 | * [Mastering Machine Learning Algorithms](https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781788621113) 70 | * [Machine Learning Algorithms](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781785889622) 71 | 72 | ### Suggestions and Feedback 73 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 74 | -------------------------------------------------------------------------------- /Chapter09/evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | 7 | from sklearn.datasets import make_blobs 8 | from sklearn.cluster import KMeans 9 | from sklearn.metrics import silhouette_score, silhouette_samples, calinski_harabaz_score, \ 10 | homogeneity_score, completeness_score, adjusted_rand_score 11 | from sklearn.metrics.pairwise import pairwise_distances 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | nb_samples = 1000 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create the dataset (keep the ground-truth labels for the external metrics computed at the end) 23 | X, Y_true = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=1.5, random_state=1000) 24 | 25 | # Show the dataset 26 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 27 | 28 | ax.grid() 29 | ax.set_xlabel('X') 30 | ax.set_ylabel('Y') 31 | 32 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 33 | 34 | plt.show() 35 | 36 | # Analyze the inertia 37 | nb_clusters = [2, 3, 5, 6, 7, 8, 9, 10] 38 | 39 | inertias = [] 40 | 41 | for n in nb_clusters: 42 | km = KMeans(n_clusters=n) 43 | km.fit(X) 44 | inertias.append(km.inertia_) 45 | 46 | fig, ax = plt.subplots(figsize=(15, 8)) 47 | 48 | ax.plot(nb_clusters, inertias) 49 | ax.set_xlabel('Number of clusters') 50 | ax.set_ylabel('Inertia') 51 | ax.grid() 52 | 53 | plt.show() 54 | 55 | # Analyze the silhouette scores 56 | avg_silhouettes = [] 57 | 58 | for n in nb_clusters: 59 | km = KMeans(n_clusters=n) 60 | Y = km.fit_predict(X) 61 | avg_silhouettes.append(silhouette_score(X, Y)) 62 | 63 | fig, ax = plt.subplots(figsize=(15, 8)) 64 | 65 | ax.plot(nb_clusters, avg_silhouettes) 66 | ax.set_xlabel('Number of clusters') 67 | ax.set_ylabel('Average Silhouette score') 68 | ax.grid() 69 | 70 | plt.show() 71 | 72 | # Draw the silhouette plots 73 | fig, ax = plt.subplots(2, 2, figsize=(15, 10)) 74 | 75 | nb_clusters = [2, 3, 4, 8] 76 | mapping = [(0, 0), (0, 1), (1, 0), (1, 1)] 77 | 78 | for i, n in enumerate(nb_clusters): 79 | km = KMeans(n_clusters=n) 80 | Y = km.fit_predict(X) 81 | 82 | silhouette_values = silhouette_samples(X, Y) 83 | 84 | ax[mapping[i]].set_xticks([-0.15, 0.0, 0.25, 0.5, 0.75, 1.0]) 85 | ax[mapping[i]].set_yticks([]) 86 | ax[mapping[i]].set_title('%d clusters' % n) 87 | ax[mapping[i]].set_xlim([-0.15, 1]) 88 | ax[mapping[i]].grid() 89 | y_lower = 20 90 | 91 | for t in range(n): 92 | ct_values = silhouette_values[Y == t] 93 | ct_values.sort() 94 | 95 | y_upper = y_lower + ct_values.shape[0] 96 | 97 | color = cm.Accent(float(t) / n) 98 | ax[mapping[i]].fill_betweenx(np.arange(y_lower, y_upper), 0, 99 | ct_values, facecolor=color, edgecolor=color) 100 | 101 | y_lower = y_upper + 20
102 | 103 | # Analyze the Calinski-Harabasz scores 104 | ch_scores = [] 105 | 106 | km = KMeans(n_clusters=n) 107 | Y = km.fit_predict(X) 108 | 109 | for n in nb_clusters: 110 | km = KMeans(n_clusters=n) 111 | Y = km.fit_predict(X) 112 | ch_scores.append(calinski_harabaz_score(X, Y)) 113 | 114 | fig, ax = plt.subplots(figsize=(15, 8)) 115 | 116 | ax.plot(nb_clusters, ch_scores) 117 | ax.set_xlabel('Number of clusters') 118 | ax.set_ylabel('Calinski-Harabasz scores') 119 | ax.grid() 120 | 121 | plt.show() 122 | 123 | # Analyze the cluster instability 124 | nb_noisy_datasets = 10 125 | 126 | X_noise = [] 127 | 128 | for _ in range(nb_noisy_datasets): 129 | Xn = np.ndarray(shape=(1000, 2)) 130 | 131 | for i, x in enumerate(X): 132 | if np.random.uniform(0, 1) < 0.25: 133 | Xn[i] = X[i] + np.random.uniform(-2.0, 2.0) 134 | else: 135 | Xn[i] = X[i] 136 | 137 | X_noise.append(Xn) 138 | 139 | instabilities = [] 140 | 141 | for n in nb_clusters: 142 | Yn = [] 143 | 144 | for Xn in X_noise: 145 | km = KMeans(n_clusters=n) 146 | Yn.append(km.fit_predict(Xn)) 147 | 148 | distances = [] 149 | 150 | for i in range(len(Yn) - 1): 151 | for j in range(i, len(Yn)): 152 | d = pairwise_distances(Yn[i].reshape(1, -1), Yn[j].reshape(1, -1), 'hamming') 153 | distances.append(d[0, 0]) 154 | 155 | instability = (2.0 * np.sum(distances)) / float(nb_noisy_datasets ** 2) 156 | instabilities.append(instability) 157 | 158 | fig, ax = plt.subplots(figsize=(15, 8)) 159 | 160 | ax.plot(nb_clusters, instabilities) 161 | ax.set_xlabel('Number of clusters') 162 | ax.set_ylabel('Cluster instability') 163 | ax.grid() 164 | 165 | plt.show() 166 | 167 | # Analyze the homogeneity, completeness, and Adjusted Rand score against the ground-truth labels 168 | km = KMeans(n_clusters=3) 169 | Yp = km.fit_predict(X) 170 | 171 | print('Homogeneity: %.3f' % homogeneity_score(Y_true, Yp)) 172 | print('Completeness: %.3f' % completeness_score(Y_true, Yp)) 173 | print('Adjusted Rand score: %.3f' % adjusted_rand_score(Y_true, Yp)) 174 | 175 | 176 | -------------------------------------------------------------------------------- /Chapter07/s3vm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from scipy.optimize import minimize 7 | 8 | from sklearn.datasets import make_classification 9 | 10 | # Set random seed for reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_samples = 200 15 | nb_unlabeled = 150 16 | 17 | 18 | # Create dataset 19 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, random_state=1000) 20 | Y[Y == 0] = -1 21 | Y[nb_samples - nb_unlabeled:nb_samples] = 0 22 | 23 | 24 | # Initialize S3VM variables 25 | w = np.random.uniform(-0.1, 0.1, size=X.shape[1]) 26 | eta = np.random.uniform(0.0, 0.1, size=nb_samples - nb_unlabeled) 27 | xi = np.random.uniform(0.0, 0.1, size=nb_unlabeled) 28 | zi = np.random.uniform(0.0, 0.1, size=nb_unlabeled) 29 | b = np.random.uniform(-0.1, 0.1, size=1) 30 | C = 0.5 31 | 32 | 33 | # Stack all variables into a single vector 34 | theta0 = np.hstack((w, eta, xi, zi, b)) 35 | 36 | 37 | # Vectorize the min() function 38 | vmin = np.vectorize(lambda x1, x2: x1 if x1 <= x2 else x2) 39 | 40 | 41 | def svm_target(theta, Xd, Yd): 42 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 43 | 44 | s_eta = np.sum(theta[2:2 + nb_samples - nb_unlabeled]) 45 | s_min_xi_zi = np.sum(vmin(theta[2 + nb_samples - nb_unlabeled:2 + nb_samples], 46 | theta[2 + nb_samples:2 + nb_samples +
nb_unlabeled])) 47 | 48 | return C * (s_eta + s_min_xi_zi) + 0.5 * np.dot(wt.T, wt) 49 | 50 | 51 | def labeled_constraint(theta, Xd, Yd, idx): 52 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 53 | 54 | c = Yd[idx] * (np.dot(Xd[idx], wt) + theta[-1]) + \ 55 | theta[2:2 + nb_samples - nb_unlabeled][idx] - 1.0 56 | 57 | return (c >= 0)[0] 58 | 59 | 60 | def unlabeled_constraint_1(theta, Xd, idx): 61 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 62 | 63 | c = np.dot(Xd[idx], wt) - theta[-1] + \ 64 | theta[2 + nb_samples - nb_unlabeled:2 + nb_samples][idx - nb_samples + nb_unlabeled] - 1.0 65 | 66 | return (c >= 0)[0] 67 | 68 | 69 | def unlabeled_constraint_2(theta, Xd, idx): 70 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 71 | 72 | c = -(np.dot(Xd[idx], wt) - theta[-1]) + \ 73 | theta[2 + nb_samples:2 + nb_samples + nb_unlabeled][idx - nb_samples + nb_unlabeled] - 1.0 74 | 75 | return (c >= 0)[0] 76 | 77 | 78 | def eta_constraint(theta, idx): 79 | return theta[2:2 + nb_samples - nb_unlabeled][idx] >= 0 80 | 81 | 82 | def xi_constraint(theta, idx): 83 | return theta[2 + nb_samples - nb_unlabeled:2 + nb_samples][idx - nb_samples + nb_unlabeled] >= 0 84 | 85 | 86 | def zi_constraint(theta, idx): 87 | return theta[2 + nb_samples:2 + nb_samples+nb_unlabeled ][idx - nb_samples + nb_unlabeled] >= 0 88 | 89 | 90 | if __name__ == '__main__': 91 | # Show the initial dataset 92 | fig, ax = plt.subplots(figsize=(12, 9)) 93 | 94 | ax.scatter(X[Y == -1, 0], X[Y == -1, 1], color='b', marker='s', s=80, label='Class +1') 95 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], color='g', marker='o', s=80, label='Class -1') 96 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], color='r', marker='x', s=80, label='Unlabeled') 97 | 98 | ax.set_xlabel(r'$x_0$') 99 | ax.set_ylabel(r'$x_1$') 100 | ax.legend() 101 | ax.grid() 102 | 103 | plt.show() 104 | 105 | # Setup all the constraints 106 | svm_constraints = [] 107 | 108 | for i in range(nb_samples - nb_unlabeled): 109 | svm_constraints.append({ 110 | 'type': 'ineq', 111 | 'fun': labeled_constraint, 112 | 'args': (X, Y, i) 113 | }) 114 | svm_constraints.append({ 115 | 'type': 'ineq', 116 | 'fun': eta_constraint, 117 | 'args': (i,) 118 | }) 119 | 120 | for i in range(nb_samples - nb_unlabeled, nb_samples): 121 | svm_constraints.append({ 122 | 'type': 'ineq', 123 | 'fun': unlabeled_constraint_1, 124 | 'args': (X, i) 125 | }) 126 | svm_constraints.append({ 127 | 'type': 'ineq', 128 | 'fun': unlabeled_constraint_2, 129 | 'args': (X, i) 130 | }) 131 | svm_constraints.append({ 132 | 'type': 'ineq', 133 | 'fun': xi_constraint, 134 | 'args': (i,) 135 | }) 136 | svm_constraints.append({ 137 | 'type': 'ineq', 138 | 'fun': zi_constraint, 139 | 'args': (i,) 140 | }) 141 | 142 | # Optimize the objective 143 | print('Optimizing...') 144 | result = minimize(fun=svm_target, 145 | x0=theta0, 146 | constraints=svm_constraints, 147 | args=(X, Y), 148 | method='SLSQP', 149 | tol=0.0001, 150 | options={'maxiter': 1000}) 151 | 152 | # Extract the last parameters 153 | theta_end = result['x'] 154 | w = theta_end[0:2] 155 | b = theta_end[-1] 156 | 157 | Xu = X[nb_samples - nb_unlabeled:nb_samples] 158 | yu = -np.sign(np.dot(Xu, w) + b) 159 | 160 | # Show the final plots 161 | fig, ax = plt.subplots(1, 2, figsize=(18, 8)) 162 | 163 | ax[0].scatter(X[Y == -1, 0], X[Y == -1, 1], color='b', marker='s', s=80, label='Class +1') 164 | ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1], color='g', marker='o', s=80, label='Class -1') 165 | ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1], color='r', marker='x', s=80, label='Unlabeled') 166 | 
167 | ax[0].set_xlabel(r'$x_0$') 168 | ax[0].set_ylabel(r'$x_1$') 169 | ax[0].legend() 170 | ax[0].grid() 171 | 172 | ax[1].scatter(X[Y == -1, 0], X[Y == -1, 1], color='b', marker='s', s=80, label='Class +1') 173 | ax[1].scatter(X[Y == 1, 0], X[Y == 1, 1], color='g', marker='o', s=80, label='Class -1') 174 | 175 | ax[1].scatter(Xu[yu == -1, 0], Xu[yu == -1, 1], color='b', marker='s', s=80) 176 | ax[1].scatter(Xu[yu == 1, 0], Xu[yu == 1, 1], color='g', marker='o', s=80) 177 | 178 | ax[1].set_xlabel(r'$x_0$') 179 | ax[1].set_ylabel(r'$x_1$') 180 | ax[1].legend() 181 | ax[1].grid() 182 | 183 | plt.show() --------------------------------------------------------------------------------