├── Chapter14 ├── vader_sentiment_analysis.py ├── lsa_2.py ├── word2vec.py ├── lsa_1.py ├── lda.py ├── lsa.py ├── sentiment_analysis.py └── plsa.py ├── Chapter13 ├── language_detection.py ├── corpora.py ├── stopwords_removal.py ├── pos_ner.py ├── stemming.py ├── tokenizing.py ├── reuters_text_classifier.py └── vectorizing.py ├── Chapter08 ├── adaboost_2.py ├── adaboost.py ├── random_forest.py ├── decision_tree_2.py ├── gradient_tree_boosting.py ├── decision_tree.py ├── random_forest_2.py ├── voting_classifier.py ├── decision_tree_regression.py └── dt.dot ├── Chapter03 ├── data_normalization.py ├── toy_dataset.py ├── missing_features.py ├── nmf.py ├── dictionary_learning.py ├── tsne.py ├── sparse_pca.py ├── kernel_pca.py ├── feature_selection.py ├── feature_filtering.py ├── whitening.py ├── pca.py ├── fastica.py ├── categorical.py └── data_scaling.py ├── Chapter02 ├── MLE.py ├── resampling.py └── SMOTE.py ├── Chapter07 ├── kernel_svm_1.py ├── kernel_svm_2.py ├── linear_svm.py ├── controlled_svm.py ├── svr.py ├── kernel_svm.py ├── svr_airfoil.py └── s3vm.py ├── Chapter12 ├── model_based_cf.py ├── memory_based_cf.py ├── content-based.py ├── user_based.py └── als_spark.py ├── Chapter17 ├── vectorization.py ├── numpy_cupy.py ├── feature_union.py ├── pipeline.py └── pipeline_2.py ├── Chapter11 ├── dendrogram.py ├── agglomerative_clustering.py └── connectivity_constraints.py ├── LICENSE ├── Chapter06 ├── multinomial.py ├── newsgroups.py ├── discriminant_analysis.py ├── bernoulli.py └── gaussian.py ├── Chapter05 ├── grid_search.py ├── learning_curve.py ├── grid_search_2.py ├── perceptron.py ├── roc_curve.py ├── confusion_matrix.py ├── passive_aggressive_classification.py ├── classification_metrics.py ├── passive_aggressive_regression.py └── logistic_regression.py ├── Chapter10 ├── spectral_clustering_2.py ├── dbscan.py ├── birch.py ├── biclustering.py ├── mini_batch_kmeans.py └── spectral_clustering.py ├── Chapter04 ├── ransac_regression.py ├── huber_regression.py ├── polynomial_regression.py ├── isotonic_regression.py ├── bayesian_regression.py ├── multiple_linear_regression.py ├── 2d_linear_regression.py └── ridge_lasso_elasticnet.py ├── Chapter16 ├── convolution.py ├── gradients.py ├── mlp.py ├── logistic_regression.py ├── dcn.py └── lstm.py ├── Chapter09 ├── k_means.py ├── knn.py ├── k_means_2.py ├── gaussian_mixture.py └── evaluation_metrics.py ├── Chapter15 ├── keras_scikit_learn.py └── mlp.py └── README.md /Chapter14/vader_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.sentiment.vader import SentimentIntensityAnalyzer 4 | 5 | if __name__ == '__main__': 6 | text = 'This is a very interesting and quite powerful sentiment analyzer' 7 | 8 | vader = SentimentIntensityAnalyzer() 9 | print(vader.polarity_scores(text)) -------------------------------------------------------------------------------- /Chapter13/language_detection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from langdetect import detect, detect_langs 4 | 5 | if __name__ == '__main__': 6 | # Simple language detection 7 | print(detect('This is English')) 8 | print(detect('Dies ist Deutsch')) 9 | 10 | # Probabilistic language detection 11 | print(detect_langs('I really love you mon doux amour!')) -------------------------------------------------------------------------------- /Chapter13/corpora.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.corpus import gutenberg 4 | 5 | if __name__ == '__main__': 6 | # Print all Gutenberg corpus documents 7 | print('Gutenberg corpus files:') 8 | print(gutenberg.fileids()) 9 | 10 | # Print a raw corpus 11 | print(gutenberg.raw('milton-paradise.txt')) 12 | 13 | # Print 2 sentences from a corpus 14 | print(gutenberg.sents('milton-paradise.txt')[0:2]) 15 | 16 | # Print 20 words from a corpus 17 | print(gutenberg.words('milton-paradise.txt')[0:20]) -------------------------------------------------------------------------------- /Chapter08/adaboost_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.ensemble import AdaBoostClassifier 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load dataset 16 | iris = load_iris() 17 | 18 | # Create and train an AdaBoost classifier 19 | ada = AdaBoostClassifier(n_estimators=100, learning_rate=1.0) 20 | ada_scores = cross_val_score(ada, iris.data, iris.target, scoring='accuracy', cv=10) 21 | print('AdaBoost score: %.3f' % ada_scores.mean()) 22 | 23 | -------------------------------------------------------------------------------- /Chapter13/stopwords_removal.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.corpus import stopwords 4 | from nltk.tokenize import RegexpTokenizer 5 | 6 | if __name__ == '__main__': 7 | # Load English stopwords 8 | sw = set(stopwords.words('english')) 9 | 10 | print('English stopwords:') 11 | print(sw) 12 | 13 | # Tokenize and remove stopwords 14 | complex_text = 'This isn\'t a simple text. Count 1, 2, 3 and then go!' 
15 | 16 | ret = RegexpTokenizer('[a-zA-Z\']+') 17 | tokens = ret.tokenize(complex_text) 18 | clean_tokens = [t for t in tokens if t not in sw] 19 | 20 | print('Tokenized and cleaned complex text') 21 | print(clean_tokens) -------------------------------------------------------------------------------- /Chapter13/pos_ner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk import word_tokenize, pos_tag, ne_chunk, tree2conlltags 4 | 5 | 6 | if __name__ == '__main__': 7 | sentence_1 = 'My friend John lives in Paris' 8 | 9 | # Perform a POS tagging 10 | tokens_1 = word_tokenize(sentence_1) 11 | tags_1 = pos_tag(tokens_1) 12 | 13 | print(sentence_1) 14 | print(tags_1) 15 | 16 | # Peform a POS and NER tagging 17 | sentence_2 = 'Search a hotel in Cambridge near the MIT' 18 | 19 | tokens_2 = word_tokenize(sentence_2) 20 | tags_2 = pos_tag(tokens_2) 21 | 22 | print('\n') 23 | print(sentence_2) 24 | print(tree2conlltags(ne_chunk(tags_2))) 25 | 26 | -------------------------------------------------------------------------------- /Chapter03/data_normalization.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import Normalizer 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | # Create a dummy dataset 12 | data = np.array([1.0, 2.0]) 13 | print(data) 14 | 15 | # Max normalization 16 | n_max = Normalizer(norm='max') 17 | nm = n_max.fit_transform(data.reshape(1, -1)) 18 | print(nm) 19 | 20 | # L1 normalization 21 | n_l1 = Normalizer(norm='l1') 22 | nl1 = n_l1.fit_transform(data.reshape(1, -1)) 23 | print(nl1) 24 | 25 | # L2 normalization 26 | n_l2 = Normalizer(norm='l2') 27 | nl2 = n_l2.fit_transform(data.reshape(1, -1)) 28 | print(nl2) -------------------------------------------------------------------------------- /Chapter02/MLE.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from scipy.optimize import minimize 6 | 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | 12 | nb_samples = 100 13 | 14 | # Create the dataset 15 | X_data = np.random.normal(loc=0.0, scale=np.sqrt(2.0), size=nb_samples) 16 | 17 | 18 | def negative_log_likelihood(v): 19 | l = 0.0 20 | f1 = 1.0 / np.sqrt(2.0 * np.pi * v[1]) 21 | f2 = 2.0 * v[1] 22 | 23 | for x in X_data: 24 | l += np.log(f1 * np.exp(-np.square(x - v[0]) / f2)) 25 | 26 | return -l 27 | 28 | 29 | if __name__ == '__main__': 30 | # Create the dataset 31 | res = minimize(fun=negative_log_likelihood, x0=np.array([0.0, 1.0])) 32 | 33 | print(res) 34 | 35 | -------------------------------------------------------------------------------- /Chapter03/toy_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.utils import check_random_state 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | # Load the dataset 16 | boston = load_boston() 17 | X = boston.data 18 | Y = boston.target 19 | 20 | print(X.shape) 21 | print(Y.shape) 22 | 23 | # Create train and test sets 24 | X_train, X_test, Y_train, Y_test 
= train_test_split(X, Y, test_size=0.25, random_state=1000) 25 | 26 | # Use a random state 27 | rs = check_random_state(1000) 28 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=rs) -------------------------------------------------------------------------------- /Chapter03/missing_features.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import Imputer 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | data = np.array([[1, np.nan, 2], [2, 3, np.nan], [-1, 4, 2]]) 12 | print(data) 13 | 14 | # Imputer with mean-strategy 15 | print('Mean strategy') 16 | imp = Imputer(strategy='mean') 17 | print(imp.fit_transform(data)) 18 | 19 | # Imputer with median-strategy 20 | print('Median strategy') 21 | imp = Imputer(strategy='median') 22 | print(imp.fit_transform(data)) 23 | 24 | # Imputer with most-frequent-strategy 25 | print('Most-frequent strategy') 26 | imp = Imputer(strategy='most_frequent') 27 | print(imp.fit_transform(data)) 28 | 29 | -------------------------------------------------------------------------------- /Chapter03/nmf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_iris 6 | from sklearn.decomposition import NMF 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Load iris dataset 13 | iris = load_iris() 14 | print('Irid dataset shape') 15 | print(iris.data.shape) 16 | 17 | # Perform a non-negative matrix factorization 18 | nmf = NMF(n_components=3, init='random', l1_ratio=0.1) 19 | Xt = nmf.fit_transform(iris.data) 20 | 21 | print('Reconstruction error') 22 | print(nmf.reconstruction_err_) 23 | 24 | print('Original Iris sample') 25 | print(iris.data[0]) 26 | 27 | print('Compressed Iris sample (via Non-Negative Matrix Factorization)') 28 | print(Xt[0]) 29 | 30 | print('Rebuilt sample') 31 | print(nmf.inverse_transform(Xt[0])) -------------------------------------------------------------------------------- /Chapter08/adaboost.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import AdaBoostClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | ab_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(AdaBoostClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | ab_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(ab_accuracy) 35 | plt.show() -------------------------------------------------------------------------------- /Chapter07/kernel_svm_1.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | 
from sklearn.datasets import load_digits 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.svm import SVC 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | digits = load_digits() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 23 | 'C': [0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0] 24 | } 25 | ] 26 | 27 | # Create and train a grid search on the SVM classifier 28 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 29 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 30 | gs.fit(digits.data, digits.target) 31 | 32 | print(gs.best_estimator_) 33 | print('Kernel SVM score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /Chapter08/random_forest.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | rf_accuracy = [] 23 | 24 | for i in range(1, nb_classifications): 25 | a = cross_val_score(RandomForestClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 26 | cv=10).mean() 27 | rf_accuracy.append(a) 28 | 29 | # Show results 30 | plt.figure(figsize=(30, 25)) 31 | plt.xlabel('Number of trees') 32 | plt.ylabel('Accuracy') 33 | plt.grid(True) 34 | plt.plot(rf_accuracy) 35 | plt.show() -------------------------------------------------------------------------------- /Chapter03/dictionary_learning.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import DictionaryLearning 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load MNIST digits 14 | digits = load_digits() 15 | 16 | # Perform a dictionary learning (and atom extraction) from the MNIST dataset 17 | dl = DictionaryLearning(n_components=36, fit_algorithm='lars', transform_algorithm='lasso_lars') 18 | X_dict = dl.fit_transform(digits.data) 19 | 20 | # Show the atoms that have been extracted 21 | fig, ax = plt.subplots(6, 6, figsize=(8, 8)) 22 | 23 | samples = [dl.components_[x].reshape((8, 8)) for x in range(36)] 24 | 25 | for i in range(6): 26 | for j in range(6): 27 | ax[i, j].set_axis_off() 28 | ax[i, j].imshow(samples[(i * 6) + j], cmap='gray') 29 | 30 | plt.show() 31 | 32 | -------------------------------------------------------------------------------- /Chapter13/stemming.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from nltk.stem.snowball import SnowballStemmer 4 | from nltk.stem.snowball import PorterStemmer 5 | from nltk.stem.lancaster import LancasterStemmer 6 | 7 | if __name__ == '__main__': 8 | print('English Snowball stemming:') 9 | ess = SnowballStemmer('english', ignore_stopwords=True) 10 |
print(ess.stem('flies')) 11 | 12 | print('French Snowball stemming:') 13 | fss = SnowballStemmer('french', ignore_stopwords=True) 14 | print(fss.stem('courais')) 15 | 16 | print('English Snowball stemming:') 17 | print(ess.stem('teeth')) 18 | 19 | print('Porter stemming:') 20 | ps = PorterStemmer() 21 | print(ps.stem('teeth')) 22 | 23 | print('Lancaster stemming:') 24 | ls = LancasterStemmer() 25 | print(ls.stem('teeth')) 26 | 27 | print('Porter stemming:') 28 | print(ps.stem('teen')) 29 | print(ps.stem('teenager')) 30 | 31 | print('Lancaster stemming:') 32 | print(ls.stem('teen')) 33 | print(ls.stem('teenager')) -------------------------------------------------------------------------------- /Chapter03/tsne.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib.cm as cm 5 | import numpy as np 6 | 7 | from sklearn.datasets import load_digits 8 | from sklearn.manifold import TSNE 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load the dataset 17 | digits = load_digits() 18 | X = digits['data'] / np.max(digits['data']) 19 | 20 | # Perform a t-SNE 21 | tsne = TSNE(n_components=2, perplexity=20, random_state=1000) 22 | X_tsne = tsne.fit_transform(X) 23 | 24 | # Plot the t-SNE result 25 | fig, ax = plt.subplots(figsize=(18, 10)) 26 | 27 | for i in range(400): 28 | ax.scatter(X_tsne[:, 0], X_tsne[:, 1], color=cm.rainbow(digits['target'] * 10), marker='o', s=20) 29 | ax.annotate('%d' % digits['target'][i], xy=(X_tsne[i, 0] + 1, X_tsne[i, 1] + 1)) 30 | 31 | ax.set_xlabel(r'$x_0$') 32 | ax.set_ylabel(r'$x_1$') 33 | ax.grid() 34 | 35 | plt.show() -------------------------------------------------------------------------------- /Chapter12/model_based_cf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from scipy.linalg import svd 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | if __name__ == '__main__': 11 | # Create a dummy user-item matrix 12 | M = np.random.randint(0, 6, size=(20, 10)) 13 | 14 | print('User-Item matrix:') 15 | print(M) 16 | 17 | # Decompose M 18 | U, s, V = svd(M, full_matrices=True) 19 | S = np.diag(s) 20 | 21 | print('U -> %r' % str(U.shape)) 22 | print('S -> %r' % str(S.shape)) 23 | print('V -> %r' % str(V.shape)) 24 | 25 | # Select the first 8 singular values 26 | Uk = U[:, 0:8] 27 | Sk = S[0:8, 0:8] 28 | Vk = V[0:8, :] 29 | 30 | # Compute the user and product vectors 31 | Su = Uk.dot(np.sqrt(Sk).T) 32 | Si = np.sqrt(Sk).dot(Vk).T 33 | 34 | # Compute the average rating per user 35 | Er = np.mean(M, axis=1) 36 | 37 | # Perform a prediction for the user 5 and item 2 38 | r5_2 = Er[5] + Su[5].dot(Si[2]) 39 | print(r5_2) 40 | 41 | -------------------------------------------------------------------------------- /Chapter03/sparse_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import SparsePCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Load MNIST digits 15 | digits = load_digits() 16 | 17 | # Show some random digits 18 | selection = np.random.randint(0, 1797, 
size=100) 19 | 20 | fig, ax = plt.subplots(10, 10, figsize=(10, 10)) 21 | 22 | samples = [digits.data[x].reshape((8, 8)) for x in selection] 23 | 24 | for i in range(10): 25 | for j in range(10): 26 | ax[i, j].set_axis_off() 27 | ax[i, j].imshow(samples[(i * 10) + j], cmap='gray') 28 | 29 | plt.show() 30 | 31 | # Perform a sparse PCA on the digits dataset 32 | spca = SparsePCA(n_components=60, alpha=0.1) 33 | X_spca = spca.fit_transform(digits.data / 255) 34 | 35 | print('SPCA components shape:') 36 | print(spca.components_.shape) 37 | 38 | 39 | -------------------------------------------------------------------------------- /Chapter17/vectorization.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import time 5 | 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | 11 | size = 500 12 | 13 | 14 | if __name__ == '__main__': 15 | # Create the matrices 16 | A1 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 17 | A2 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 18 | 19 | # Non-vectorized computation 20 | D = np.zeros(shape=(size, size)).astype(np.float32) 21 | 22 | start_time = time.time() 23 | 24 | for i in range(size): 25 | for j in range(size): 26 | d = 0.0 27 | for k in range(size): 28 | d += A1[i, k] * A2[k, j] 29 | D[i, j] = d 30 | 31 | end_time = time.time() 32 | elapsed = end_time - start_time 33 | print(elapsed) 34 | 35 | # Vectorized computation 36 | start_time = time.time() 37 | 38 | D = np.dot(A1, A2) 39 | 40 | end_time = time.time() 41 | elapsed = end_time - start_time 42 | print(elapsed) -------------------------------------------------------------------------------- /Chapter08/decision_tree_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.model_selection import GridSearchCV 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | digits = load_digits() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'criterion': ['gini', 'entropy'], 23 | 'max_features': ['auto', 'log2', None], 24 | 'min_samples_split': [2, 10, 25, 100, 200], 25 | 'max_depth': [5, 10, 15, None] 26 | } 27 | ] 28 | 29 | # Create and train a grid search 30 | gs = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, 31 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 32 | gs.fit(digits.data, digits.target) 33 | 34 | print(gs.best_estimator_) 35 | print('Decision tree score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /Chapter11/dendrogram.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | 8 | from scipy.spatial.distance import pdist 9 | from scipy.cluster.hierarchy import linkage 10 | from scipy.cluster.hierarchy import dendrogram 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 25 16 | 17 | if __name__ == '__main__': 18 | # Create the dataset 19 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=3,
cluster_std=1.5) 20 | 21 | # Show the dataset 22 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 23 | 24 | ax.grid() 25 | ax.set_xlabel('X') 26 | ax.set_ylabel('Y') 27 | 28 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 29 | plt.show() 30 | 31 | # Compute the distance matrix 32 | Xdist = pdist(X, metric='euclidean') 33 | 34 | # Compute the linkage 35 | Xl = linkage(Xdist, method='ward') 36 | 37 | # Compute and show the dendrogram 38 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 39 | Xd = dendrogram(Xl) 40 | plt.show() -------------------------------------------------------------------------------- /Chapter07/kernel_svm_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import fetch_olivetti_faces 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.svm import SVC 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | # Set a local folder here 15 | olivetti_home = '' 16 | 17 | 18 | if __name__ == '__main__': 19 | # Load dataset 20 | faces = fetch_olivetti_faces(data_home=olivetti_home) 21 | 22 | # Define a param grid 23 | param_grid = [ 24 | { 25 | 'kernel': ['rbf', 'poly'], 26 | 'C': [0.1, 0.5, 1.0, 1.5], 27 | 'degree': [2, 3, 4, 5], 28 | 'gamma': [0.001, 0.01, 0.1, 0.5] 29 | } 30 | ] 31 | 32 | # Create a train grid search on SVM classifier 33 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 34 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 35 | gs.fit(faces.data, faces.target) 36 | 37 | print(gs.best_estimator_) 38 | print('Kernel SVM score: %.3f' % gs.best_score_) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Chapter03/kernel_pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_circles 7 | from sklearn.decomposition import KernelPCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Create a dummy dataset 14 | Xb, Yb = make_circles(n_samples=500, factor=0.1, noise=0.05) 15 | 16 | # Show the dataset 17 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 18 | ax.scatter(Xb[:, 0], Xb[:, 1]) 19 | ax.set_xlabel('X') 20 | ax.set_ylabel('Y') 21 | ax.grid() 22 | 23 | plt.show() 24 | 25 | # Perform a kernel PCA (with radial basis function) 26 | kpca = KernelPCA(n_components=2, kernel='rbf', fit_inverse_transform=True, gamma=1.0) 27 | X_kpca = kpca.fit_transform(Xb) 28 | 29 | # Plot the dataset after the kernel PCA 30 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 31 | ax.scatter(kpca.X_transformed_fit_[:, 0], kpca.X_transformed_fit_[:, 1]) 32 | ax.set_xlabel('First component') 33 | ax.set_ylabel('Second component') 34 | ax.grid() 35 | 36 | plt.show() -------------------------------------------------------------------------------- /Chapter06/multinomial.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.feature_extraction import DictVectorizer 6 | from sklearn.naive_bayes import MultinomialNB 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | if __name__ == '__main__': 14 | # Prepare a dummy dataset 15 | data = [ 16 | {'house': 100, 'street': 50, 'shop': 25, 'car': 100, 'tree': 20}, 17 | {'house': 5, 'street': 5, 'shop': 0, 'car': 10, 'tree': 500, 'river': 1} 18 | ] 19 | 20 | # Create and train a dictionary vectorizer 21 | dv = DictVectorizer(sparse=False) 22 | X = dv.fit_transform(data) 23 | Y = np.array([1, 0]) 24 | 25 | # Create and train a Multinomial Naive Bayes classifier 26 | mnb = MultinomialNB() 27 | mnb.fit(X, Y) 28 | 29 | # Create dummy test data (vectorized with the already fitted DictVectorizer) 30 | test_data = [ 31 | {'house': 80, 'street': 20, 'shop': 15, 'car': 70, 'tree': 10, 'river': 1}, 32 | {'house': 10, 'street': 5, 'shop': 1, 'car': 8, 'tree': 300, 'river': 0} 33 | ] 34 | 35 | Yp = mnb.predict(dv.transform(test_data)) 36 | print(Yp) 37 | -------------------------------------------------------------------------------- /Chapter05/grid_search.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import GridSearchCV, cross_val_score 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load dataset 17 | iris = load_iris() 18 | 19 | # Define a param grid 20 | param_grid = [ 21 | { 22 | 'penalty': ['l1', 'l2'], 23 | 'C': [0.5, 1.0, 1.5, 1.8, 2.0, 2.5] 24 | } 25 | ] 26 | 27 | # Create and train a grid search 28 | gs = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, 29 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 30 | gs.fit(iris.data, iris.target) 31 | 32 | # Best estimator 33 | print(gs.best_estimator_) 34 | 35 | gs_scores
= cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10) 36 | print('Best estimator CV average score: %.3f' % gs_scores.mean()) 37 | 38 | -------------------------------------------------------------------------------- /Chapter08/gradient_tree_boosting.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.ensemble import GradientBoostingClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | if __name__ == '__main__': 16 | # Create the dataset 17 | X, Y = make_classification(n_samples=nb_samples, n_features=4, n_informative=3, n_redundant=1, n_classes=3) 18 | 19 | # Collect the scores for n_estimators in (1, 50) 20 | a = [] 21 | max_estimators = 50 22 | 23 | for i in range(1, max_estimators): 24 | score = cross_val_score(GradientBoostingClassifier(n_estimators=i, learning_rate=10.0 / float(i)), X, Y, 25 | cv=10, scoring='accuracy').mean() 26 | a.append(score) 27 | 28 | # Plot the results 29 | plt.figure(figsize=(30, 25)) 30 | plt.xlabel('Number of estimators') 31 | plt.ylabel('Average CV accuracy') 32 | plt.grid(True) 33 | plt.plot(a) 34 | plt.show() -------------------------------------------------------------------------------- /Chapter14/lsa_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import brown 6 | 7 | from sklearn.decomposition import TruncatedSVD 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Compose a corpus 17 | sentences = brown.sents(categories=['news', 'fiction']) 18 | corpus = [] 19 | 20 | for s in sentences: 21 | corpus.append(' '.join(s)) 22 | 23 | # Vectorize the corpus 24 | vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True) 25 | Xc = vectorizer.fit_transform(corpus) 26 | 27 | rank = 2 28 | 29 | # Perform a truncated SVD 30 | tsvd = TruncatedSVD(n_components=rank) 31 | Xt = tsvd.fit_transform(Xc) 32 | 33 | # Check the top-10 words per topic (sort each topic row in descending order) 34 | Mwts = np.argsort(tsvd.components_, axis=1)[:, ::-1] 35 | 36 | for t in range(rank): 37 | print('\nTopic ' + str(t)) 38 | for i in range(10): 39 | print(vectorizer.get_feature_names()[Mwts[t, i]]) -------------------------------------------------------------------------------- /Chapter05/learning_curve.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_wine 7 | from sklearn.model_selection import learning_curve 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.utils import shuffle 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | if __name__ == '__main__': 17 | # Load the dataset 18 | wine = load_wine() 19 | 20 | # Shuffle the dataset and compute the learning curves 21 | X, Y = shuffle(wine['data'], wine['target']) 22 | tsize, training_score, test_score = learning_curve(LogisticRegression(), X, Y, cv=20, random_state=1000) 23 | 24 | # 
Show the learning curve 25 | avg_tr_scores = np.mean(training_score, axis=1) 26 | avg_test_scores = np.mean(test_score, axis=1) 27 | 28 | fig, ax = plt.subplots(figsize=(15, 8)) 29 | 30 | ax.plot(tsize, avg_tr_scores, label='Training score') 31 | ax.plot(tsize, avg_test_scores, label='CV score') 32 | ax.set_xlabel('Number of samples') 33 | ax.set_ylabel('Accuracy') 34 | ax.legend() 35 | ax.grid() 36 | 37 | plt.show() -------------------------------------------------------------------------------- /Chapter08/decision_tree.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | # Set a folder to store the graph in 16 | graph_folder = './dt.dot' 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create dataset 21 | X, Y = make_classification(n_samples=nb_samples, n_features=3, n_informative=3, n_redundant=0, n_classes=3, 22 | n_clusters_per_class=1) 23 | 24 | # Create a Decision tree classifier 25 | dt = DecisionTreeClassifier() 26 | dt_scores = cross_val_score(dt, X, Y, scoring='accuracy', cv=10) 27 | print('Decision tree score: %.3f' % dt_scores.mean()) 28 | 29 | # Save in Graphviz format 30 | dt.fit(X, Y) 31 | 32 | with open(graph_folder, 'w') as df: 33 | df = export_graphviz(dt, out_file=df, 34 | feature_names=['A', 'B', 'C'], 35 | class_names=['C1', 'C2', 'C3']) -------------------------------------------------------------------------------- /Chapter03/feature_selection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.feature_selection import VarianceThreshold 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Create a dummy dataset 13 | X = np.ndarray(shape=(100, 3)) 14 | 15 | X[:, 0] = np.random.normal(0.0, 5.0, size=100) 16 | X[:, 1] = np.random.normal(0.5, 5.0, size=100) 17 | X[:, 2] = np.random.normal(1.0, 0.5, size=100) 18 | 19 | # Show the dataset 20 | fig, ax = plt.subplots(1, 1, figsize=(12, 8)) 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | ax.plot(X[:, 0], label='STD = 5.0') 26 | ax.plot(X[:, 1], label='STD = 5.0') 27 | ax.plot(X[:, 2], label='STD = 0.5') 28 | 29 | plt.legend() 30 | plt.show() 31 | 32 | # Impose a variance threshold 33 | print('Samples before variance thresholding') 34 | print(X[0:3, :]) 35 | 36 | vt = VarianceThreshold(threshold=1.5) 37 | X_t = vt.fit_transform(X) 38 | 39 | # After the filter has removed the componenents 40 | print('Samples after variance thresholding') 41 | print(X_t[0:3, :]) -------------------------------------------------------------------------------- /Chapter12/memory_based_cf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from scikits.crab.models import MatrixPreferenceDataModel 7 | from scikits.crab.similarities import UserSimilarity 8 | from scikits.crab.metrics import euclidean_distances 9 | from scikits.crab.recommenders.knn import UserBasedRecommender 10 | 11 | # For reproducibility 12 | 
np.random.seed(1000) 13 | 14 | if __name__ == '__main__': 15 | # Define a user-item matrix 16 | user_item_matrix = { 17 | 1: {1: 2, 2: 5, 3: 3}, 18 | 2: {1: 5, 4: 2}, 19 | 3: {2: 3, 4: 5, 3: 2}, 20 | 4: {3: 5, 5: 1}, 21 | 5: {1: 3, 2: 3, 4: 1, 5: 3} 22 | } 23 | 24 | # Build a matrix preference model 25 | model = MatrixPreferenceDataModel(user_item_matrix) 26 | 27 | # Build a similarity matrix 28 | similarity_matrix = UserSimilarity(model, euclidean_distances) 29 | 30 | # Create a recommender 31 | recommender = UserBasedRecommender(model, similarity_matrix, with_preference=True) 32 | 33 | # Test the recommender for user 2 34 | with warnings.catch_warnings(): 35 | warnings.simplefilter("ignore") 36 | print(recommender.recommend(2)) 37 | -------------------------------------------------------------------------------- /Chapter10/spectral_clustering_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_moons 7 | from sklearn.cluster import SpectralClustering 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | if __name__ == '__main__': 17 | # Create dataset 18 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 19 | 20 | # Try different gammas with a RBF affinity 21 | Yss = [] 22 | gammas = np.linspace(0, 12, 4) 23 | 24 | for gamma in gammas: 25 | sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma) 26 | Yss.append(sc.fit_predict(X)) 27 | 28 | # Show data 29 | fig, ax = plt.subplots(1, 4, figsize=(30, 10), sharey=True) 30 | 31 | for x in range(4): 32 | ax[x].grid() 33 | ax[x].set_title('Gamma = %.0f' % gammas[x]) 34 | 35 | for i in range(nb_samples): 36 | c = Yss[x][i] 37 | 38 | if c == 0: 39 | ax[x].scatter(X[i, 0], X[i, 1], marker='o', color='r') 40 | else: 41 | ax[x].scatter(X[i, 0], X[i, 1], marker='^', color='b') 42 | 43 | plt.show() -------------------------------------------------------------------------------- /Chapter03/feature_filtering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston, load_iris 6 | from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_regression 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Load Boston data 13 | regr_data = load_boston() 14 | print('Boston data shape') 15 | print(regr_data.data.shape) 16 | 17 | # Select the best k features with regression test 18 | kb_regr = SelectKBest(f_regression) 19 | X_b = kb_regr.fit_transform(regr_data.data, regr_data.target) 20 | print('K-Best-filtered Boston dataset shape') 21 | print(X_b.shape) 22 | print('K-Best scores') 23 | print(kb_regr.scores_) 24 | 25 | # Load iris data 26 | class_data = load_iris() 27 | print('Iris dataset shape') 28 | print(class_data.data.shape) 29 | 30 | # Select the best k features using Chi^2 classification test 31 | perc_class = SelectPercentile(chi2, percentile=15) 32 | X_p = perc_class.fit_transform(class_data.data, class_data.target) 33 | print('Chi2-filtered Iris dataset shape') 34 | print(X_p.shape) 35 | print('Chi2 scores') 36 | print(perc_class.scores_) 37 | 38 | -------------------------------------------------------------------------------- /Chapter17/numpy_cupy.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | # For further information, please visit https://cupy.chainer.org/ 5 | import cupy as cp 6 | import time 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | cp.random.seed(1000) 12 | 13 | size = 5000 14 | 15 | 16 | if __name__ == '__main__': 17 | # Create the matrices using NumPy 18 | A1 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 19 | A2 = np.random.normal(0.0, 2.0, size=(size, size)).astype(np.float32) 20 | 21 | # Perform the measurement using NumPy 22 | Ad = A1.copy() 23 | 24 | start_time = time.time() 25 | 26 | for _ in range(100): 27 | Ad = np.dot(Ad, A2) 28 | 29 | end_time = time.time() 30 | elapsed = end_time - start_time 31 | print(elapsed) 32 | 33 | # Create the matrices using CuPy 34 | B1 = cp.random.normal(0.0, 2.0, size=(size, size)) 35 | B2 = cp.random.normal(0.0, 2.0, size=(size, size)) 36 | 37 | # Perform the measurement using CuPy with GPU support 38 | Bd = B1.copy() 39 | 40 | start_time = time.time() 41 | 42 | for _ in range(100): 43 | Bd = cp.dot(Bd, B2) 44 | 45 | end_time = time.time() 46 | elapsed = end_time - start_time 47 | print(elapsed) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Chapter17/feature_union.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA 8 | from sklearn.feature_selection import SelectKBest, f_classif 9 | from sklearn.model_selection import cross_val_score 10 | from sklearn.pipeline import Pipeline, FeatureUnion 11 | from sklearn.preprocessing import StandardScaler 12 | from sklearn.svm import SVC 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | if __name__ == '__main__': 19 | warnings.simplefilter("ignore") 20 | 21 | # Load the dataset 22 | digits = load_digits() 23 | 24 | # Create the steps for a feature union 25 | steps_fu = [ 26 | ('pca', PCA(n_components=10)), 27 | ('kbest', SelectKBest(f_classif, k=5)), 28 | ] 29 | 30 | # Create the steps for the pipeline 31 | fu = FeatureUnion(steps_fu) 32 | scaler = StandardScaler() 33 | svc = SVC(kernel='rbf', C=5.0, gamma=0.05) 34 | 35 | pipeline_steps = [ 36 | ('fu', fu), 37 | ('scaler', scaler), 38 | ('classifier', svc) 39 | ] 40 | 41 | pipeline = Pipeline(pipeline_steps) 42 | 43 | print('Cross-validation score:') 44 | print(cross_val_score(pipeline, digits.data, digits.target, cv=10).mean()) -------------------------------------------------------------------------------- /Chapter05/grid_search_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import GridSearchCV, cross_val_score 8 | from sklearn.linear_model import SGDClassifier 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | if __name__ == '__main__': 15 | # Load dataset 16 | iris = load_iris() 17 | 18 | # Define a param grid 19 | param_grid = [ 20 | { 21 | 'penalty': ['l1', 'l2', 'elasticnet'], 22 | 'alpha': [1e-5, 1e-4, 5e-4, 1e-3, 2.3e-3, 5e-3, 1e-2], 23 | 'l1_ratio': [0.01, 0.05, 0.1, 0.15, 0.25, 0.35, 0.5, 0.75, 0.8] 24 | } 25 | ] 26 | 27 | 
# Create SGD classifier 28 | sgd = SGDClassifier(loss='perceptron', learning_rate='optimal') 29 | 30 | # Create and train a grid search 31 | gs = GridSearchCV(estimator=sgd, param_grid=param_grid, scoring='accuracy', cv=10, 32 | n_jobs=multiprocessing.cpu_count()) 33 | gs.fit(iris.data, iris.target) 34 | 35 | # Best estimator 36 | print(gs.best_estimator_) 37 | 38 | gs_scores = cross_val_score(gs.best_estimator_, iris.data, iris.target, scoring='accuracy', cv=10) 39 | print('Best estimator CV average score: %.3f' % gs_scores.mean()) -------------------------------------------------------------------------------- /Chapter07/linear_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.svm import SVC 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 36 | n_clusters_per_class=1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Create a SVM with linear kernel 42 | svc = SVC(kernel='linear') 43 | 44 | # Compute CV score 45 | svc_scores = cross_val_score(svc, X, Y, scoring='accuracy', cv=10) 46 | print('Linear SVM CV average score: %.3f' % svc_scores.mean()) 47 | 48 | -------------------------------------------------------------------------------- /Chapter14/word2vec.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import multiprocessing 5 | 6 | from nltk.corpus import brown 7 | from nltk.corpus import stopwords 8 | 9 | # Install Gensim using: pip install -U gensim 10 | # Further information: https://radimrehurek.com/gensim/ 11 | from gensim.models import Word2Vec 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | if __name__ == '__main__': 19 | sw = set(stopwords.words('english')) 20 | 21 | # Prepare the corpus 22 | brown_corpus = brown.sents() 23 | 24 | corpus = [] 25 | 26 | for sent in brown_corpus: 27 | c_sent = [w.strip().lower() for w in sent if w.strip().lower() not in sw] 28 | corpus.append(c_sent) 29 | 30 | # Train the Word2Vec model 31 | # A UserWarning: detected Windows; can be discarded 32 | model = Word2Vec(corpus, size=300, window=10, min_count=1, workers=multiprocessing.cpu_count()) 33 | wv = model.wv 34 | del model 35 | 36 | # Show a feature vector 37 | print(wv['committee']) 38 | 39 | print('\n') 40 | 41 | # Show the words most similar to "house" 42 | print(wv.most_similar('house')) 43 | 44 | print('\n') 45 | 46 | # Show the similarity between "committee" and "president" 47 | print(wv.similarity('committee', 'president')) -------------------------------------------------------------------------------- /Chapter04/ransac_regression.py: -------------------------------------------------------------------------------- 1 
| from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression, RANSACRegressor 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 200 13 | nb_noise_samples = 150 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.scatter(X, Y) 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | ax.grid() 23 | 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Create dataset 29 | X = np.arange(-5, 5, 0.05) 30 | 31 | Y = X + 2 32 | Y += np.random.uniform(-0.5, 0.5, size=nb_samples) 33 | 34 | for i in range(nb_noise_samples, nb_samples): 35 | Y[i] += np.random.uniform(12, 15) 36 | 37 | # Show the dataset 38 | show_dataset(X, Y) 39 | 40 | # Create a linear regressor 41 | lr = LinearRegression(normalize=True) 42 | lr.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 43 | print('Standard regressor: y = %.3fx + %.3f' % (lr.coef_, lr.intercept_)) 44 | 45 | # Create RANSAC regressor 46 | rs = RANSACRegressor(lr) 47 | rs.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 48 | print('RANSAC regressor: y = %.3fx + %.3f' % (rs.estimator_.coef_, rs.estimator_.intercept_)) -------------------------------------------------------------------------------- /Chapter12/content-based.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_items = 1000 11 | 12 | if __name__ == '__main__': 13 | # Create the item dataset 14 | items = np.zeros(shape=(nb_items, 4)) 15 | 16 | for i in range(nb_items): 17 | items[i, 0] = np.random.randint(0, 100) 18 | items[i, 1] = np.random.randint(0, 100) 19 | items[i, 2] = np.random.randint(0, 100) 20 | items[i, 3] = np.random.randint(0, 100) 21 | 22 | metrics = ['euclidean', 'hamming', 'jaccard'] 23 | 24 | for metric in metrics: 25 | print('Metric: %r' % metric) 26 | 27 | # Fit k-nearest neighbors 28 | nn = NearestNeighbors(n_neighbors=10, radius=5.0, metric=metric) 29 | nn.fit(items) 30 | 31 | # Create a test product 32 | test_product = np.array([15, 60, 28, 73]) 33 | 34 | # Determine the neighbors with different radii 35 | d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=20) 36 | 37 | print('Suggestions (radius=20):') 38 | print(suggestions) 39 | 40 | d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=30) 41 | 42 | print('Suggestions (radius=30):') 43 | print(suggestions) -------------------------------------------------------------------------------- /Chapter08/random_forest_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_classifications = 100 15 | 16 | 17 | if __name__ == '__main__': 18 | # Load dataset 19 | digits = load_digits() 20 | 21 | # Collect accuracies 22 | rf_accuracy = [] 23 | et_accuracy = [] 24 | 25 | for i in range(1, nb_classifications): 26 | a = cross_val_score(RandomForestClassifier(n_estimators=i), digits.data, digits.target,
scoring='accuracy', 27 | cv=10).mean() 28 | rf_accuracy.append(a) 29 | 30 | b = cross_val_score(ExtraTreesClassifier(n_estimators=i), digits.data, digits.target, scoring='accuracy', 31 | cv=10).mean() 32 | et_accuracy.append(b) 33 | 34 | # Show results 35 | plt.figure(figsize=(30, 25)) 36 | plt.xlabel('Number of trees') 37 | plt.ylabel('Accuracy') 38 | plt.grid(True) 39 | plt.plot(rf_accuracy, color='blue', label='Random Forest') 40 | plt.plot(et_accuracy, color='red', label='Extra Random Forest') 41 | plt.legend(loc="lower right") 42 | plt.show() -------------------------------------------------------------------------------- /Chapter07/controlled_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.svm import SVC, NuSVC 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 500 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Create dataset 34 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 35 | n_clusters_per_class=1) 36 | 37 | # Show dataset 38 | show_dataset(X, Y) 39 | 40 | # Create and train a linear SVM 41 | svc = SVC(kernel='linear') 42 | svc.fit(X, Y) 43 | print('Number of support vectors: %d' % len(svc.support_vectors_)) 44 | 45 | # Create and train a Nu-SVM classifier 46 | nusvc = NuSVC(kernel='linear', nu=0.05) 47 | nusvc.fit(X, Y) 48 | print('Number of support vectors (nu=0.05): %d' % len(nusvc.support_vectors_)) -------------------------------------------------------------------------------- /Chapter16/convolution.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from scipy.misc import face 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load the image 14 | img = face(gray=True) 15 | 16 | # Show the original image 17 | plt.imshow(img, cmap='gray') 18 | plt.show() 19 | 20 | # Define the kernel 21 | kernel = np.array( 22 | [[0, 1, 0], 23 | [1, -4, 0], 24 | [0, 1, 0]], 25 | dtype=np.float32) 26 | 27 | cfilter = np.zeros((3, 3, 1, 1), dtype=np.float32) 28 | cfilter[:, :, 0, 0] = kernel 29 | 30 | # Create the graph 31 | graph = tf.Graph() 32 | 33 | with graph.as_default(): 34 | x = tf.placeholder(tf.float32, shape=(None, 768, 1024, 1), name='image') 35 | f = tf.constant(cfilter) 36 | 37 | # tf.nn.conv2d expects the uppercase padding value 'SAME' (or 'VALID') 38 | y = tf.nn.conv2d(x, f, strides=[1, 1, 1, 1], padding='SAME') 39 | 40 | session = tf.InteractiveSession(graph=graph) 41 | 42 | # Compute the convolution 43 | c_img = session.run([y], feed_dict={x: img.reshape((1, 768, 1024, 1))}) 44 | n_img = np.array(c_img).reshape((768, 1024)) 45 | 46 | # Show the final image 47 | plt.imshow(n_img, cmap='gray') 48 | plt.show() 49 | 50 | 51 | 52 | --------------------------------------------------------------------------------
/Chapter02/resampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.utils import resample 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_samples = 1000 15 | weights = (0.95, 0.05) 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create an unbalanced dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, weights=weights, random_state=1000) 21 | 22 | # Show the shapes 23 | print(X[Y == 0].shape) 24 | print(X[Y == 1].shape) 25 | 26 | # Show the dataset 27 | fig, ax = plt.subplots(figsize=(10, 8)) 28 | 29 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 1') 30 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 2') 31 | ax.set_xlabel(r'$x_0$') 32 | ax.set_ylabel(r'$x_1$') 33 | ax.set_title('Unbalanced dataset') 34 | ax.legend() 35 | ax.grid() 36 | 37 | plt.show() 38 | 39 | # Resample the dataset 40 | X_1_resampled = resample(X[Y == 1], n_samples=X[Y == 0].shape[0], random_state=1000) 41 | 42 | Xu = np.concatenate((X[Y == 0], X_1_resampled)) 43 | Yu = np.concatenate((Y[Y == 0], np.ones(shape=(X[Y == 0].shape[0],), dtype=np.int32))) 44 | 45 | # Show the new shapes 46 | print(Xu[Yu == 0].shape) 47 | print(Xu[Yu == 1].shape) 48 | 49 | -------------------------------------------------------------------------------- /Chapter04/huber_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression, HuberRegressor 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 500 13 | nb_noise_samples = 50 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.scatter(X, Y) 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | ax.grid() 23 | 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Create dataset 29 | X = np.arange(-5, 5, 10.0 / float(nb_samples)) 30 | 31 | Y = X + 2 32 | Y += np.random.uniform(-0.5, 0.5, size=nb_samples) 33 | 34 | noisy_samples = np.random.choice(np.arange(0, nb_samples), size=nb_noise_samples, replace=False) 35 | 36 | for i in noisy_samples: 37 | Y[i] += np.random.uniform(0, 10.0) 38 | 39 | # Show the dataset 40 | show_dataset(X, Y) 41 | 42 | # Create a linear regressor 43 | lr = LinearRegression(normalize=True) 44 | lr.fit(X.reshape(-1, 1), Y.reshape(-1, 1)) 45 | print('Standard regressor: y = %.3fx + %.3f' % (lr.coef_, lr.intercept_)) 46 | 47 | # Create a Huber regressor 48 | hr = HuberRegressor(epsilon=1.25) 49 | hr.fit(X.reshape(-1, 1), Y) 50 | print('Huber regressor: y = %.3fx + %.3f' % (hr.coef_, hr.intercept_)) -------------------------------------------------------------------------------- /Chapter05/perceptron.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.linear_model import SGDClassifier 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = 
plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 36 | n_clusters_per_class=1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Create perceptron as SGD instance 42 | # The same result can be obtained using directly the class sklearn.linear_model.Perceptron 43 | sgd = SGDClassifier(loss='perceptron', learning_rate='optimal', n_iter=10) 44 | sgd_scores = cross_val_score(sgd, X, Y, scoring='accuracy', cv=10) 45 | print('Perceptron CV average score: %.3f' % sgd_scores.mean()) 46 | 47 | -------------------------------------------------------------------------------- /Chapter05/roc_curve.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.metrics import roc_curve, auc 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 21 | n_clusters_per_class=1) 22 | 23 | # Split dataset 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 25 | 26 | #Create and train logistic regressor 27 | lr = LogisticRegression() 28 | lr.fit(X_train, Y_train) 29 | 30 | # Compute ROC curve 31 | Y_score = lr.decision_function(X_test) 32 | fpr, tpr, thresholds = roc_curve(Y_test, Y_score) 33 | 34 | plt.figure(figsize=(30, 25)) 35 | 36 | plt.plot(fpr, tpr, color='red', label='Logistic regression (AUC: %.2f)' % auc(fpr, tpr)) 37 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 38 | plt.xlim([0.0, 1.0]) 39 | plt.ylim([0.0, 1.01]) 40 | plt.title('ROC Curve') 41 | plt.xlabel('False Positive Rate') 42 | plt.ylabel('True Positive Rate') 43 | plt.legend(loc="lower right") 44 | 45 | plt.show() -------------------------------------------------------------------------------- /Chapter04/polynomial_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import PolynomialFeatures 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 200 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.scatter(X, Y) 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | ax.grid() 24 | 25 | plt.show() 26 | 27 | 28 | if __name__ == '__main__': 29 | # Create dataset 30 | X = np.arange(-5, 5, 0.05) 31 | 32 | Y = X + 2 33 | Y += X**2 + np.random.uniform(-0.5, 0.5, size=nb_samples) 34 | 35 | # Show the dataset 36 | show_dataset(X, Y) 37 | 38 | # Split dataset 39 | X_train, X_test, Y_train, Y_test = 
train_test_split(X.reshape(-1, 1), Y.reshape(-1, 1), test_size=0.25) 40 | 41 | lr = LinearRegression(normalize=True) 42 | lr.fit(X_train, Y_train) 43 | print('Linear regression score: %.3f' % lr.score(X_train, Y_train)) 44 | 45 | # Create polynomial features 46 | pf = PolynomialFeatures(degree=2) 47 | X_train = pf.fit_transform(X_train) 48 | X_test = pf.fit_transform(X_test) 49 | 50 | lr.fit(X_train, Y_train) 51 | print('Second degree polynomial regression score: %.3f' % lr.score(X_train, Y_train)) -------------------------------------------------------------------------------- /Chapter16/gradients.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_points = 100 11 | 12 | if __name__ == '__main__': 13 | # Create the dataset 14 | X = np.linspace(-nb_points, nb_points, 200, dtype=np.float32) 15 | 16 | # Plot the dataset 17 | fig, ax = plt.subplots(figsize=(8, 6)) 18 | ax.plot(X, X) 19 | ax.grid() 20 | plt.show() 21 | 22 | # Create the graph 23 | graph = tf.Graph() 24 | 25 | with graph.as_default(): 26 | Xt = tf.placeholder(tf.float32, shape=(None, 1), name='x') 27 | Y = tf.pow(Xt, 3.0, name='x_3') 28 | Yd = tf.gradients(Y, Xt, name='dx') 29 | Yd2 = tf.gradients(Yd, Xt, name='d2x') 30 | 31 | session = tf.InteractiveSession(graph=graph) 32 | 33 | # Compute the gradients 34 | X2, dX, d2X = session.run([Y, Yd, Yd2], feed_dict={Xt: X.reshape((nb_points * 2, 1))}) 35 | 36 | # Plot the gradients 37 | fig, ax = plt.subplots(1, 3, figsize=(20, 5)) 38 | 39 | ax[0].plot(X, X2) 40 | ax[0].grid() 41 | ax[0].set_xlabel('x') 42 | ax[0].set_ylabel(r'$x^2$') 43 | 44 | ax[1].plot(X, dX[0]) 45 | ax[1].grid() 46 | ax[1].set_xlabel('x') 47 | ax[1].set_ylabel(r'$dx/dy$') 48 | 49 | ax[2].plot(X, d2X[0]) 50 | ax[2].grid() 51 | ax[2].set_xlabel('x') 52 | ax[2].set_ylabel(r'$d^2x/dy^2$') 53 | 54 | plt.show() -------------------------------------------------------------------------------- /Chapter17/pipeline.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.decomposition import PCA 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.svm import SVC 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create the dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_informative=15, n_redundant=5, n_classes=2) 21 | 22 | # Create the steps for the pipeline 23 | pca = PCA(n_components=10) 24 | scaler = StandardScaler() 25 | svc = SVC(kernel='poly', gamma=3) 26 | 27 | steps = [ 28 | ('pca', pca), 29 | ('scaler', scaler), 30 | ('classifier', svc) 31 | ] 32 | 33 | # Create the pipeline 34 | pipeline = Pipeline(steps) 35 | 36 | # Perform a grid search 37 | param_grid = { 38 | 'pca__n_components': [5, 10, 12, 15, 18, 20], 39 | 'classifier__kernel': ['rbf', 'poly'], 40 | 'classifier__gamma': [0.05, 0.1, 0.2, 0.5], 41 | 'classifier__degree': [2, 3, 5] 42 | } 43 | 44 | gs = GridSearchCV(pipeline, param_grid) 45 | gs.fit(X, Y) 46 | 47 | print('Best estimator:') 48 | print(gs.best_estimator_) 49 | 50 | print('Best score:') 51 | 
print(gs.best_score_) 52 | -------------------------------------------------------------------------------- /Chapter07/svr.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.svm import SVR 7 | from sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 50 14 | 15 | 16 | def show_dataset(X, Y, Y_pred=None): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | ax.scatter(X, Y) 24 | 25 | if Y_pred is not None: 26 | ax.plot(X, Y_pred, c='r') 27 | 28 | plt.show() 29 | 30 | 31 | if __name__ == '__main__': 32 | # Create dataset 33 | X = np.arange(-nb_samples, nb_samples, 1) 34 | Y = np.zeros(shape=(2 * nb_samples,)) 35 | 36 | for x in X: 37 | Y[int(x) + nb_samples] = np.power(x * 6, 2.0) / 1e4 + np.random.uniform(-2, 2) 38 | 39 | # Show dataset 40 | show_dataset(X, Y) 41 | 42 | # Create and train a Support Vector regressor 43 | svr = SVR(kernel='poly', degree=2, C=1.5, epsilon=0.5) 44 | svr_scores = cross_val_score(svr, X.reshape((nb_samples*2, 1)), Y, scoring='neg_mean_squared_error', cv=10) 45 | print('SVR CV average negative squared error: %.3f' % svr_scores.mean()) 46 | 47 | # Fit the model 48 | svr.fit(X.reshape(-1, 1), Y.ravel()) 49 | Y_pred = svr.predict(X.reshape(-1, 1)) 50 | 51 | # Show the dataset together with the prediction 52 | show_dataset(X, Y, Y_pred) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Chapter06/newsgroups.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | from sklearn.datasets import fetch_20newsgroups_vectorized 8 | from sklearn.naive_bayes import MultinomialNB 9 | from sklearn.metrics import confusion_matrix 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | def plot_confusion_matrix(Y_test, Y_pred, targets): 17 | cmatrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred) 18 | cm_fig, cm_ax = plt.subplots(figsize=(12, 12)) 19 | cm_ax.matshow(cmatrix, cmap=cm.GnBu) 20 | 21 | x = y = np.arange(0, len(targets)) 22 | plt.xticks(x, targets, rotation='vertical') 23 | plt.yticks(y, targets) 24 | 25 | for i in range(len(targets)): 26 | for j in range(len(targets)): 27 | cm_ax.text(x=j, y=i, s=cmatrix[i, j], va='center', ha='center', size='x-large') 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Load the dataset 34 | train_data = fetch_20newsgroups_vectorized(subset='train') 35 | test_data = fetch_20newsgroups_vectorized(subset='test') 36 | 37 | # Create and train the model 38 | mnb = MultinomialNB(alpha=0.01) 39 | mnb.fit(train_data['data'], train_data['target']) 40 | 41 | print(mnb.score(test_data['data'], test_data['target'])) 42 | 43 | # Plot the confusion matrix 44 | plot_confusion_matrix(test_data['target'], mnb.predict(test_data['data']), list(test_data['target_names'])) -------------------------------------------------------------------------------- /Chapter12/user_based.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.neighbors import NearestNeighbors 6 | 7 | 
# For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_users = 1000 11 | nb_product = 20 12 | 13 | if __name__ == '__main__': 14 | # Create the user dataset 15 | users = np.zeros(shape=(nb_users, 4)) 16 | 17 | for i in range(nb_users): 18 | users[i, 0] = np.random.randint(0, 4) 19 | users[i, 1] = np.random.randint(0, 2) 20 | users[i, 2] = np.random.randint(0, 5) 21 | users[i, 3] = np.random.randint(0, 5) 22 | 23 | # Create user-product dataset 24 | user_products = np.random.randint(0, nb_product, size=(nb_users, 5)) 25 | 26 | # Fit k-nearest neighbors 27 | nn = NearestNeighbors(n_neighbors=20, radius=2.0) 28 | nn.fit(users) 29 | 30 | # Create a test user 31 | test_user = np.array([2, 0, 3, 2]) 32 | 33 | # Determine the neighbors 34 | d, neighbors = nn.kneighbors(test_user.reshape(1, -1)) 35 | 36 | print('Neighbors:') 37 | print(neighbors) 38 | 39 | # Determine the suggested products 40 | suggested_products = [] 41 | 42 | for n in neighbors: 43 | for products in user_products[n]: 44 | for product in products: 45 | if product != 0 and product not in suggested_products: 46 | suggested_products.append(product) 47 | 48 | print('Suggested products:') 49 | print(suggested_products) 50 | 51 | 52 | 53 | --------------------------------------------------------------------------------
/Chapter05/confusion_matrix.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | 6 | import numpy as np 7 | 8 | from sklearn.datasets import load_wine 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import confusion_matrix 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | def plot_confusion_matrix(Y_test, Y_pred, targets): 19 | cmatrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred) 20 | cm_fig, cm_ax = plt.subplots(figsize=(8.0, 8.0)) 21 | cm_ax.matshow(cmatrix, cmap=cm.GnBu) 22 | 23 | cm_ax.set_xticklabels([''] + targets) 24 | cm_ax.set_yticklabels([''] + targets) 25 | 26 | for i in range(len(targets)): 27 | for j in range(len(targets)): 28 | cm_ax.text(x=j, y=i, s=cmatrix[i, j], va='center', ha='center', size='x-large') 29 | 30 | plt.title('Confusion matrix') 31 | plt.show() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Load the dataset 36 | wine = load_wine() 37 | 38 | # Split the dataset 39 | X_train, X_test, Y_train, Y_test = train_test_split(wine['data'], wine['target'], test_size=0.25) 40 | 41 | # Train the model 42 | lr = LogisticRegression() 43 | lr.fit(X_train, Y_train) 44 | 45 | # Plot the confusion matrix (ground truth first, predictions second) 46 | targets = list(wine['target_names']) 47 | plot_confusion_matrix(Y_test, lr.predict(X_test), targets) 48 | 49 | 50 | 51 | --------------------------------------------------------------------------------
/Chapter10/dbscan.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_moons 7 | from sklearn.cluster import DBSCAN 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | for i in range(nb_samples): 24 | if Y[i] == 0: 25 | ax.scatter(X[i, 0], X[i, 1], 
marker='o', color='r') 26 | else: 27 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 28 | 29 | plt.show() 30 | 31 | 32 | def show_clustered_dataset(X, Y): 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.grid() 36 | ax.set_xlabel('X') 37 | ax.set_ylabel('Y') 38 | 39 | for i in range(nb_samples): 40 | if Y[i] == 0: 41 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 42 | else: 43 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 44 | 45 | plt.show() 46 | 47 | 48 | if __name__ == '__main__': 49 | # Create dataset 50 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 51 | 52 | # Show dataset 53 | show_dataset(X, Y) 54 | 55 | # Create and train DBSCAN 56 | dbs = DBSCAN(eps=0.1) 57 | Y = dbs.fit_predict(X) 58 | 59 | # Show clustered dataset 60 | show_clustered_dataset(X, Y) 61 | 62 | -------------------------------------------------------------------------------- /Chapter06/discriminant_analysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 8 | from sklearn.model_selection import cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | # Total number of samples 16 | nb_samples = 1000 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=2, cluster_std=[1.0, 10.0], random_state=1000) 22 | 23 | # Show the dataset 24 | fig, ax = plt.subplots(figsize=(11, 7)) 25 | 26 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 0') 27 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 1') 28 | ax.set_xlabel(r'$x_0$') 29 | ax.set_ylabel(r'$x_1$') 30 | ax.grid() 31 | ax.legend() 32 | 33 | plt.show() 34 | 35 | # Show the covariance matrices 36 | print('Covariance matrix for class 0:') 37 | print(np.cov(X[Y == 0].T)) 38 | 39 | print('\nCovariance matrix for class 1:') 40 | print(np.cov(X[Y == 1].T)) 41 | 42 | # Show the CV scores 43 | lda = LinearDiscriminantAnalysis() 44 | print('\nLDA average CV accuracy: %.3f' % cross_val_score(lda, X, Y, cv=10).mean()) 45 | 46 | qda = QuadraticDiscriminantAnalysis() 47 | print('QDA average CV accuracy: %.3f' % cross_val_score(qda, X, Y, cv=10).mean()) 48 | 49 | -------------------------------------------------------------------------------- /Chapter12/als_spark.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pyspark import SparkContext, SparkConf 4 | from pyspark.mllib.recommendation import Rating 5 | from pyspark.mllib.recommendation import ALS 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | nb_users = 200 11 | nb_products = 100 12 | ratings = [] 13 | 14 | if __name__ == '__main__': 15 | conf = SparkConf().setAppName('ALS').setMaster('local[*]') 16 | sc = SparkContext(conf=conf) 17 | 18 | for _ in range(10): 19 | for i in range(nb_users): 20 | rating = Rating(user=i, product=np.random.randint(1, nb_products), rating=np.random.randint(0, 5)) 21 | ratings.append(rating) 22 | 23 | # Parallelize the ratings 24 | ratings = sc.parallelize(ratings) 25 | 26 | # Train the model 27 | model = ALS.train(ratings, rank=5, iterations=10) 28 | 29 | # Test the model 30 | test = ratings.map(lambda rating: (rating.user, rating.product)) 31 | 32 | predictions = 
model.predictAll(test) 33 | full_predictions = predictions.map(lambda pred: ((pred.user, pred.product), pred.rating)) 34 | 35 | # Compute MSE 36 | split_ratings = ratings.map(lambda rating: ((rating.user, rating.product), rating.rating)) 37 | joined_predictions = split_ratings.join(full_predictions) 38 | mse = joined_predictions.map(lambda x: (x[1][0] - x[1][1]) ** 2).mean() 39 | 40 | print('MSE: %.3f' % mse) 41 | 42 | # Perform a single prediction 43 | prediction = model.predict(10, 20) 44 | print('Prediction: %.3f' % prediction) -------------------------------------------------------------------------------- /Chapter07/kernel_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import multiprocessing 6 | 7 | from sklearn.datasets import make_circles 8 | from sklearn.model_selection import GridSearchCV 9 | from sklearn.svm import SVC 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | def show_dataset(X, Y): 19 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 20 | 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | for i in range(nb_samples): 26 | if Y[i] == 0: 27 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 28 | else: 29 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 30 | 31 | plt.show() 32 | 33 | 34 | if __name__ == '__main__': 35 | # Create datasets 36 | X, Y = make_circles(n_samples=nb_samples, noise=0.1) 37 | 38 | # Show dataset 39 | show_dataset(X, Y) 40 | 41 | # Define a param grid 42 | param_grid = [ 43 | { 44 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 45 | 'C': [0.1, 0.2, 0.4, 0.5, 1.0, 1.5, 1.8, 2.0, 2.5, 3.0] 46 | } 47 | ] 48 | 49 | # Create a train grid search on SVM classifier 50 | gs = GridSearchCV(estimator=SVC(), param_grid=param_grid, 51 | scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count()) 52 | gs.fit(X, Y) 53 | 54 | print(gs.best_estimator_) 55 | print('Kernel SVM score: %.3f' % gs.best_score_) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /Chapter10/birch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import Birch 8 | from sklearn.metrics import adjusted_rand_score 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 2000 16 | batch_size = 80 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=5, cluster_std=1.5, random_state=1000) 22 | 23 | # Create an instance of BIRCH 24 | birch = Birch(n_clusters=5, threshold=0.15, branching_factor=100) 25 | 26 | # Train the model 27 | X_batch = [] 28 | Y_preds = [] 29 | 30 | for i in range(0, nb_samples, batch_size): 31 | birch.partial_fit(X[i:i + batch_size]) 32 | X_batch.append(X[:i + batch_size]) 33 | Y_preds.append(birch.predict(X[:i + batch_size])) 34 | 35 | print(adjusted_rand_score(birch.predict(X), Y)) 36 | 37 | # Show the training steps 38 | fig, ax = plt.subplots(5, 5, figsize=(20, 12)) 39 | 40 | for i in range(5): 41 | for j in range(5): 42 | idx = (i * 5) + j 43 | 44 | for k in range(5): 45 | ax[i][j].scatter(X_batch[idx][Y_preds[idx] == k, 0], X_batch[idx][Y_preds[idx] 
== k, 1], s=3) 46 | 47 | ax[i][j].set_xticks([]) 48 | ax[i][j].set_yticks([]) 49 | ax[i][j].set_title('{} samples'.format(batch_size * (idx + 1))) 50 | 51 | plt.show() 52 | 53 | 54 | -------------------------------------------------------------------------------- /Chapter04/isotonic_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from matplotlib.collections import LineCollection 7 | 8 | from sklearn.isotonic import IsotonicRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 100 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.plot(X, Y, 'b.-') 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | plt.show() 26 | 27 | 28 | def show_isotonic_regression_segments(X, Y, Yi, segments): 29 | lc = LineCollection(segments, zorder=0) 30 | lc.set_array(np.ones(len(Y))) 31 | lc.set_linewidths(0.5 * np.ones(nb_samples)) 32 | 33 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 34 | 35 | ax.plot(X, Y, 'b.', markersize=8) 36 | ax.plot(X, Yi, 'g.-', markersize=8) 37 | ax.grid() 38 | ax.set_xlabel('X') 39 | ax.set_ylabel('Y') 40 | 41 | plt.show() 42 | 43 | 44 | if __name__ == '__main__': 45 | # Create dataset 46 | X = np.arange(-5, 5, 0.1) 47 | Y = X + np.random.uniform(-0.5, 1, size=X.shape) 48 | 49 | # Show original dataset 50 | show_dataset(X, Y) 51 | 52 | # Create an isotonic regressor 53 | ir = IsotonicRegression(-6, 10) 54 | Yi = ir.fit_transform(X, Y) 55 | 56 | # Create a segment list 57 | segments = [[[i, Y[i]], [i, Yi[i]]] for i in range(nb_samples)] 58 | 59 | # Show isotonic interpolation 60 | show_isotonic_regression_segments(X, Y, Yi, segments) 61 | -------------------------------------------------------------------------------- /Chapter09/k_means.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import KMeans 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 1000 14 | 15 | 16 | def show_dataset(X): 17 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 24 | 25 | plt.show() 26 | 27 | 28 | def show_clustered_dataset(X, km): 29 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 30 | 31 | ax.grid() 32 | ax.set_xlabel('X') 33 | ax.set_ylabel('Y') 34 | 35 | for i in range(nb_samples): 36 | c = km.predict(X[i].reshape(1, -1)) 37 | if c == 0: 38 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 39 | elif c == 1: 40 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 41 | else: 42 | ax.scatter(X[i, 0], X[i, 1], marker='d', color='g') 43 | 44 | plt.show() 45 | 46 | 47 | if __name__ == '__main__': 48 | # Create dataset 49 | X, _ = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=1.5, random_state=1000) 50 | 51 | # Show dataset 52 | show_dataset(X) 53 | 54 | # Create and train K-Means 55 | km = KMeans(n_clusters=3) 56 | km.fit(X) 57 | 58 | # Show the centroids 59 | print(km.cluster_centers_) 60 | 61 | # Show clustered dataset 62 | show_clustered_dataset(X, km) 63 | 64 | 
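/Chapter09/k_means.py above fixes n_clusters=3, which matches the three blobs generated by make_blobs. When the number of clusters is not known in advance, a common heuristic is to fit K-Means for several values of k and inspect the inertia (within-cluster sum of squared distances) exposed by scikit-learn, looking for an elbow in the curve. A minimal sketch of this check on the same synthetic dataset follows; it is an illustration, not part of the repository:

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Same synthetic dataset as in k_means.py
X, _ = make_blobs(n_samples=1000, n_features=2, centers=3, cluster_std=1.5, random_state=1000)

# Inertia (sum of squared distances to the closest centroid) for k = 1..9
inertias = [KMeans(n_clusters=k, random_state=1000).fit(X).inertia_ for k in range(1, 10)]

plt.plot(range(1, 10), inertias, 'o-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.grid(True)
plt.show()

With this dataset the curve should flatten visibly after k=3, supporting the choice made in the script.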
-------------------------------------------------------------------------------- /Chapter10/biclustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.cluster.bicluster import SpectralBiclustering 7 | 8 | 9 | # Set random seed for reproducibility 10 | np.random.seed(1000) 11 | 12 | 13 | nb_users = 100 14 | nb_products = 150 15 | max_rating = 10 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create the user-product matrix 20 | up_matrix = np.random.randint(0, max_rating + 1, size=(nb_users, nb_products)) 21 | mask_matrix = np.random.randint(0, 2, size=(nb_users, nb_products)) 22 | up_matrix *= mask_matrix 23 | 24 | # Show the matrix 25 | fig, ax = plt.subplots(figsize=(12, 6)) 26 | 27 | matx = ax.matshow(up_matrix) 28 | fig.colorbar(matx) 29 | 30 | ax.set_xticks([]) 31 | ax.set_yticks([]) 32 | ax.set_xlabel('Products') 33 | ax.set_ylabel('Users') 34 | 35 | plt.show() 36 | 37 | # Perform a Spectral Biclustering 38 | sbc = SpectralBiclustering(n_clusters=10, random_state=1000) 39 | sbc.fit(up_matrix) 40 | 41 | # Show the clustered matrix 42 | up_clustered = np.outer(np.sort(sbc.row_labels_) + 1, np.sort(sbc.column_labels_) + 1) 43 | 44 | fig, ax = plt.subplots(figsize=(12, 6)) 45 | 46 | matx = ax.matshow(up_clustered) 47 | 48 | ax.set_xticks([]) 49 | ax.set_yticks([]) 50 | ax.set_xlabel('Products') 51 | ax.set_ylabel('Users') 52 | 53 | plt.show() 54 | 55 | # Show some examples of users and products associated with ranking 6 56 | print(np.where(sbc.rows_[6, :] == True)) 57 | print(np.where(sbc.columns_[6, :] == True)) -------------------------------------------------------------------------------- /Chapter14/lsa_1.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from nltk.corpus import brown 7 | 8 | from scipy.linalg import svd 9 | 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | def scatter_documents(X): 18 | fig, ax = plt.subplots(1, 1, figsize=(10, 6)) 19 | 20 | ax.scatter(X[:, 0], X[:, 1]) 21 | ax.set_xlabel('t0') 22 | ax.set_ylabel('t1') 23 | ax.grid() 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Compose a corpus 29 | sentences = sentences = brown.sents(categories=['news', 'fiction']) 30 | corpus = [] 31 | 32 | for s in sentences: 33 | corpus.append(' '.join(s)) 34 | 35 | # Vectorize the corpus 36 | vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True) 37 | Xc = vectorizer.fit_transform(corpus).todense() 38 | 39 | # Perform SVD 40 | U, s, V = svd(Xc, full_matrices=False) 41 | 42 | # Extract a sub-space with rank=2 43 | rank = 2 44 | 45 | Uk = U[:, 0:rank] 46 | sk = np.diag(s)[0:rank, 0:rank] 47 | Vk = V[0:rank, :] 48 | 49 | # Check the top-10 word per topic 50 | Mwts = np.argsort(np.abs(Vk), axis=1)[::-1] 51 | 52 | for t in range(rank): 53 | print('\nTopic ' + str(t)) 54 | for i in range(10): 55 | print(vectorizer.get_feature_names()[Mwts[t, i]]) 56 | 57 | # Show a scatter plot of all documents 58 | Mdtk = Uk.dot(sk) 59 | scatter_documents(Mdtk) 60 | -------------------------------------------------------------------------------- /Chapter13/tokenizing.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from __future__ import print_function 4 | 5 | from nltk.tokenize import sent_tokenize 6 | from nltk.tokenize import TreebankWordTokenizer 7 | from nltk.tokenize import RegexpTokenizer 8 | 9 | if __name__ == '__main__': 10 | # Sentence tokenizing 11 | print('Generic text:') 12 | generic_text = 'Lorem ipsum dolor sit amet, amet minim temporibus in sit. Vel ne impedit consequat intellegebat.' 13 | print(sent_tokenize(generic_text)) 14 | 15 | print('English text:') 16 | english_text = 'Where is the closest train station? I need to reach London' 17 | print(sent_tokenize(english_text, language='english')) 18 | 19 | print('Spanish text:') 20 | spanish_text = u'¿Dónde está la estación más cercana? Inmediatamente me tengo que ir a Barcelona.' 21 | for sentence in sent_tokenize(spanish_text, language='spanish'): 22 | print(sentence) 23 | 24 | # Word tokenizing 25 | # Create a Treebank word tokenizer 26 | tbwt = TreebankWordTokenizer() 27 | 28 | print('Simple text:') 29 | simple_text = 'This is a simple text.' 30 | print(tbwt.tokenize(simple_text)) 31 | 32 | print('Complex text:') 33 | complex_text = 'This isn\'t a simple text' 34 | print(tbwt.tokenize(complex_text)) 35 | 36 | # Create a Regexp tokenizer 37 | ret = RegexpTokenizer('[a-zA-Z0-9\'\.]+') 38 | print(ret.tokenize(complex_text)) 39 | 40 | # Create a more restrictive Regexp tokenizer 41 | ret = RegexpTokenizer('[a-zA-Z\']+') 42 | 43 | complex_text = 'This isn\'t a simple text. Count 1, 2, 3 and then go!' 44 | print(ret.tokenize(complex_text)) -------------------------------------------------------------------------------- /Chapter03/whitening.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | # For reproducibility 8 | np.random.seed(1000) 9 | 10 | 11 | nb_samples = 1000 12 | 13 | 14 | def zero_center(X): 15 | return X - np.mean(X, axis=0) 16 | 17 | 18 | def whiten(X, correct=True): 19 | Xc = zero_center(X) 20 | _, L, V = np.linalg.svd(Xc) 21 | W = np.dot(V.T, np.diag(1.0 / L)) 22 | return np.dot(Xc, W) * np.sqrt(X.shape[0]) if correct else 1.0 23 | 24 | 25 | if __name__ == '__main__': 26 | # Create the dataset 27 | X = np.random.normal(0.0, [2.5, 1.0], size=(nb_samples, 2)) 28 | 29 | theta = np.pi / 4.0 30 | R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) 31 | 32 | Xr = np.dot(X, R) 33 | 34 | # Create a whitened version 35 | Xw = whiten(Xr) 36 | 37 | # Print the whitened covariance matrix 38 | print(np.cov(Xw.T)) 39 | 40 | # Show original and whitened datasets 41 | fig, ax = plt.subplots(1, 2, figsize=(15, 5)) 42 | 43 | ax[0].scatter(Xr[:, 0], Xr[:, 1]) 44 | ax[0].set_xticks(np.arange(-10, 10), 2) 45 | ax[0].set_yticks(np.arange(-8, 8), 2) 46 | ax[0].set_xlabel(r'$x_1$') 47 | ax[0].set_ylabel(r'$x_2$') 48 | ax[0].set_title(r'Original dataset') 49 | ax[0].grid() 50 | 51 | ax[1].scatter(Xw[:, 0], Xw[:, 1]) 52 | ax[1].set_xticks(np.arange(-10, 10), 2) 53 | ax[1].set_yticks(np.arange(-8, 8), 2) 54 | ax[1].set_xlabel(r'$x_1$') 55 | ax[1].set_ylabel(r'$x_2$') 56 | ax[1].set_title(r'Whitened dataset') 57 | ax[1].grid() 58 | 59 | plt.show() 60 | 61 | -------------------------------------------------------------------------------- /Chapter02/SMOTE.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Install Imbalanced-Learning with: pip install -U imbalanced-learn 7 | # For further information: http://contrib.scikit-learn.org/imbalanced-learn/stable/index.html 8 | from imblearn.over_sampling import SMOTE 9 | 10 | from sklearn.datasets import make_classification 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | nb_samples = 1000 18 | weights = (0.95, 0.05) 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create an unbalanced dataset 23 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, weights=weights, random_state=1000) 24 | 25 | # Create and train a SMOTE instance 26 | smote = SMOTE() 27 | X_resampled, Y_resampled = smote.fit_sample(X, Y) 28 | 29 | # Show original and resampled datasets 30 | fig, ax = plt.subplots(1, 2, figsize=(20, 8)) 31 | 32 | ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 1') 33 | ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 2') 34 | ax[0].set_xlabel(r'$x_0$') 35 | ax[0].set_ylabel(r'$x_1$') 36 | ax[0].set_title('Unbalanced dataset') 37 | ax[0].legend() 38 | ax[0].grid() 39 | 40 | ax[1].scatter(X_resampled[Y_resampled == 0, 0], X_resampled[Y_resampled == 0, 1], label='Class 1') 41 | ax[1].scatter(X_resampled[Y_resampled == 1, 0], X_resampled[Y_resampled == 1, 1], label='Class 2') 42 | ax[1].set_xlabel(r'$x_0$') 43 | ax[1].set_ylabel(r'$x_1$') 44 | ax[1].set_title('SMOTE balancing') 45 | ax[1].legend() 46 | ax[1].grid() 47 | 48 | plt.show() 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /Chapter05/passive_aggressive_classification.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.linear_model import PassiveAggressiveClassifier 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | if __name__ == '__main__': 17 | # Load and scale the dataset 18 | iris = load_iris() 19 | 20 | ss = StandardScaler() 21 | 22 | X = ss.fit_transform(iris['data']) 23 | Y = iris['target'] 24 | 25 | # Split the dataset 26 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1000) 27 | 28 | # Create the model 29 | pac = PassiveAggressiveClassifier(C=0.05, loss='squared_hinge', max_iter=2000, random_state=1000) 30 | 31 | # Train with the start-up samples 32 | nb_initial_samples = int(X_train.shape[0] / 1.5) 33 | pac.fit(X_train[0:nb_initial_samples], Y_train[0:nb_initial_samples]) 34 | 35 | # Continue with the incremental samples 36 | validation_accuracies = [] 37 | 38 | for (x, y) in zip(X_train[nb_initial_samples:], Y_train[nb_initial_samples:]): 39 | pac.partial_fit(x.reshape(1, -1), y.ravel(), classes=np.unique(iris['target'])) 40 | validation_accuracies.append(pac.score(X_test, Y_test)) 41 | 42 | # Show the validation plot 43 | fig, ax = plt.subplots(figsize=(18, 8)) 44 | 45 | ax.plot(validation_accuracies) 46 | ax.set_xlabel('Online sample') 47 | ax.set_ylabel('Validation accuracy') 48 | ax.grid() 49 | 50 | plt.show() 51 | -------------------------------------------------------------------------------- /Chapter06/bernoulli.py: -------------------------------------------------------------------------------- 1 
| from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.naive_bayes import BernoulliNB 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 300 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | if __name__ == '__main__': 34 | # Create dataset 35 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0) 36 | 37 | # Show dataset 38 | show_dataset(X, Y) 39 | 40 | # Split dataset 41 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 42 | 43 | # Create and train Bernoulli Naive Bayes classifier 44 | bnb = BernoulliNB(binarize=0.0) 45 | bnb.fit(X_train, Y_train) 46 | 47 | print('Bernoulli Naive Bayes score: %.3f' % bnb.score(X_test, Y_test)) 48 | 49 | # Compute CV score 50 | bnb_scores = cross_val_score(bnb, X, Y, scoring='accuracy', cv=10) 51 | print('Bernoulli Naive Bayes CV average score: %.3f' % bnb_scores.mean()) 52 | 53 | # Predict some values 54 | data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 55 | Yp = bnb.predict(data) 56 | print(Yp) 57 | 58 | -------------------------------------------------------------------------------- /Chapter14/lda.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import brown 6 | 7 | from sklearn.decomposition import LatentDirichletAllocation 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | if __name__ == '__main__': 14 | # Compose a corpus 15 | sentences_1 = brown.sents(categories=['reviews'])[0:1000] 16 | sentences_2 = brown.sents(categories=['government'])[0:1000] 17 | sentences_3 = brown.sents(categories=['fiction'])[0:1000] 18 | sentences_4 = brown.sents(categories=['news'])[0:1000] 19 | corpus = [] 20 | 21 | for s in sentences_1 + sentences_2 + sentences_3 + sentences_4: 22 | corpus.append(' '.join(s)) 23 | 24 | # Vectorize the corpus 25 | cv = CountVectorizer(strip_accents='unicode', stop_words='english', analyzer='word') 26 | Xc = cv.fit_transform(corpus) 27 | 28 | # Perform LDA 29 | lda = LatentDirichletAllocation(n_topics=8, learning_method='online', max_iter=25) 30 | Xl = lda.fit_transform(Xc) 31 | 32 | # Show the top 5 words per topic 33 | Mwts_lda = np.argsort(lda.components_, axis=1)[::-1] 34 | 35 | for t in range(8): 36 | print('\nTopic ' + str(t)) 37 | for i in range(5): 38 | print(cv.get_feature_names()[Mwts_lda[t, i]]) 39 | 40 | # Test the model with new document 41 | print('Document 0:') 42 | print(corpus[0]) 43 | print(Xl[0]) 44 | 45 | print('Document 2500:') 46 | print(corpus[2500]) 47 | print(Xl[2500]) 48 | 49 | test_doc = corpus[0] + ' ' + corpus[2500] 50 | y_test = lda.transform(cv.transform([test_doc])) 51 | print(y_test) 52 | 53 | -------------------------------------------------------------------------------- /Chapter09/knn.py: -------------------------------------------------------------------------------- 1 | from 
__future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.neighbors import NearestNeighbors 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | if __name__ == '__main__': 16 | # Load the dataset 17 | digits = load_digits() 18 | 19 | # Scale the dataset 20 | ss = StandardScaler(with_std=False) 21 | X = ss.fit_transform(digits['data']) 22 | 23 | # Create and train the model 24 | knn = NearestNeighbors(n_neighbors=25, leaf_size=30, algorithm='ball_tree') 25 | knn.fit(X) 26 | 27 | # Create a noisy sample (and show it) 28 | X_noise = X[50] + np.random.normal(0.0, 1.5, size=(64,)) 29 | 30 | fig, ax = plt.subplots(1, 2, figsize=(4, 8)) 31 | 32 | ax[0].imshow(digits['images'][50], cmap='gray') 33 | ax[0].set_xticks([]) 34 | ax[0].set_yticks([]) 35 | 36 | ax[1].imshow(ss.inverse_transform(X_noise).reshape((8, 8)), cmap='gray') 37 | ax[1].set_xticks([]) 38 | ax[1].set_yticks([]) 39 | 40 | plt.show() 41 | 42 | # Compute the neighbors 43 | distances, neighbors = knn.kneighbors(X_noise.reshape(1, -1), return_distance=True) 44 | 45 | print('Distances:\n') 46 | print(distances[0]) 47 | 48 | # Show the neighbors 49 | fig, ax = plt.subplots(5, 5, figsize=(8, 8)) 50 | 51 | for y in range(5): 52 | for x in range(5): 53 | idx = neighbors[0][(x + (y * 5))] 54 | ax[y, x].matshow(digits['images'][idx], cmap='gray') 55 | ax[y, x].set_xticks([]) 56 | ax[y, x].set_yticks([]) 57 | 58 | plt.show() -------------------------------------------------------------------------------- /Chapter14/lsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from nltk.corpus import brown 7 | 8 | from scipy.linalg import svd 9 | 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | def scatter_documents(X): 18 | fig, ax = plt.subplots(1, 1, figsize=(10, 6)) 19 | 20 | ax.scatter(X[:, 0], X[:, 1]) 21 | ax.set_xlabel('t0') 22 | ax.set_ylabel('t1') 23 | ax.grid() 24 | plt.show() 25 | 26 | 27 | if __name__ == '__main__': 28 | # Compose a corpus 29 | sentences = brown.sents(categories=['news'])[0:500] 30 | corpus = [] 31 | 32 | for s in sentences: 33 | corpus.append(' '.join(s)) 34 | 35 | # Vectorize the corpus 36 | vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True) 37 | Xc = vectorizer.fit_transform(corpus).todense() 38 | 39 | # Perform SVD 40 | U, s, V = svd(Xc, full_matrices=False) 41 | 42 | # Extract a sub-space with rank=2 43 | rank = 2 44 | 45 | Uk = U[:, 0:rank] 46 | sk = np.diag(s)[0:rank, 0:rank] 47 | Vk = V[0:rank, :] 48 | 49 | # Check the top-10 word per topic 50 | Mwts = np.argsort(np.abs(Vk), axis=1)[::-1] 51 | 52 | for t in range(rank): 53 | print('\nTopic ' + str(t)) 54 | for i in range(10): 55 | print(vectorizer.get_feature_names()[Mwts[t, i]]) 56 | 57 | # Compute the structure of a document 58 | print('\nSample document:') 59 | print(corpus[0]) 60 | 61 | Mdtk = Uk.dot(sk) 62 | print('\nSample document in the topic sub-space:') 63 | print('d0 = %.2f*t1 + %.2f*t2' % (Mdtk[0][0], Mdtk[0][1])) 64 | 65 | # Show a scatter plot of all documents 66 | scatter_documents(Mdtk) 67 | 
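/Chapter14/lsa.py above converts the TF-IDF matrix to a dense array with todense() before calling scipy.linalg.svd, which becomes expensive for larger corpora. scikit-learn's TruncatedSVD performs the same rank-k factorization directly on the sparse matrix: its fit_transform output corresponds (up to sign) to Uk.dot(sk), and components_ plays the role of Vk. A minimal sketch under these assumptions (illustrative only, not part of the repository):

from nltk.corpus import brown

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus construction as in lsa.py
corpus = [' '.join(s) for s in brown.sents(categories=['news'])[0:500]]

vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True)
Xs = vectorizer.fit_transform(corpus)          # kept sparse, no todense()

svd = TruncatedSVD(n_components=2, random_state=1000)
Mdtk = svd.fit_transform(Xs)                   # document-topic coordinates, analogous to Uk.dot(sk)
Vk = svd.components_                           # topic-term matrix, analogous to Vk

print(Mdtk.shape, Vk.shape)

The resulting document coordinates have shape (n_documents, 2) and can be passed to the same scatter_documents helper defined in the script.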
-------------------------------------------------------------------------------- /Chapter10/mini_batch_kmeans.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import KMeans, MiniBatchKMeans 8 | from sklearn.metrics import adjusted_rand_score 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | nb_samples = 2000 16 | batch_size = 80 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=5, cluster_std=1.5, random_state=1000) 22 | 23 | # Create an instance of Mini-Batch k-Means 24 | mbkm = MiniBatchKMeans(n_clusters=5, max_iter=1000, batch_size=batch_size, random_state=1000) 25 | 26 | # Train the model 27 | X_batch = [] 28 | Y_preds = [] 29 | 30 | for i in range(0, nb_samples, batch_size): 31 | mbkm.partial_fit(X[i:i + batch_size]) 32 | 33 | X_batch.append(X[:i + batch_size]) 34 | Y_preds.append(mbkm.predict(X[:i + batch_size])) 35 | 36 | # Show the training steps 37 | fig, ax = plt.subplots(5, 5, figsize=(20, 12)) 38 | 39 | for i in range(5): 40 | for j in range(5): 41 | idx = (i * 5) + j 42 | 43 | for k in range(5): 44 | ax[i][j].scatter(X_batch[idx][Y_preds[idx] == k, 0], X_batch[idx][Y_preds[idx] == k, 1], s=3) 45 | 46 | ax[i][j].set_xticks([]) 47 | ax[i][j].set_yticks([]) 48 | ax[i][j].set_title('{} samples'.format(batch_size * (idx + 1))) 49 | 50 | plt.show() 51 | 52 | # Compute the Adjusted-Rand score and compare it with a standard K-Means 53 | print(adjusted_rand_score(mbkm.predict(X), Y)) 54 | 55 | km = KMeans(n_clusters=5, max_iter=1000, random_state=1000) 56 | km.fit(X) 57 | 58 | print(adjusted_rand_score(km.predict(X), Y)) 59 | 60 | -------------------------------------------------------------------------------- /Chapter03/pca.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | # Load MNIST digits 14 | digits = load_digits() 15 | 16 | # Show some random digits 17 | selection = np.random.randint(0, 1797, size=100) 18 | 19 | fig, ax = plt.subplots(10, 10, figsize=(10, 10)) 20 | 21 | samples = [digits.data[x].reshape((8, 8)) for x in selection] 22 | 23 | for i in range(10): 24 | for j in range(10): 25 | ax[i, j].set_axis_off() 26 | ax[i, j].imshow(samples[(i * 8) + j], cmap='gray') 27 | 28 | plt.show() 29 | 30 | # Perform a PCA on the digits dataset 31 | pca = PCA(n_components=36, whiten=True) 32 | X_pca = pca.fit_transform(digits.data / 255) 33 | 34 | print('Explained variance ratio') 35 | print(pca.explained_variance_ratio_) 36 | 37 | # Plot the explained variance ratio 38 | fig, ax = plt.subplots(1, 2, figsize=(16, 6)) 39 | 40 | ax[0].set_xlabel('Component') 41 | ax[0].set_ylabel('Variance ratio (%)') 42 | ax[0].bar(np.arange(36), pca.explained_variance_ratio_ * 100.0) 43 | 44 | ax[1].set_xlabel('Component') 45 | ax[1].set_ylabel('Cumulative variance (%)') 46 | ax[1].bar(np.arange(36), np.cumsum(pca.explained_variance_)[::-1]) 47 | 48 | plt.show() 49 | 50 | # Rebuild from PCA and show the result 51 | fig, ax = plt.subplots(10, 10, 
figsize=(10, 10)) 52 | 53 | samples = [pca.inverse_transform(X_pca[x]).reshape((8, 8)) for x in selection] 54 | 55 | for i in range(10): 56 | for j in range(10): 57 | ax[i, j].set_axis_off() 58 | ax[i, j].imshow(samples[(i * 8) + j], cmap='gray') 59 | 60 | plt.show() 61 | 62 | -------------------------------------------------------------------------------- /Chapter04/bayesian_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_boston 7 | from sklearn.linear_model import BayesianRidge 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | def show_dataset(data): 16 | fig, ax = plt.subplots(4, 3, figsize=(20, 15)) 17 | 18 | for i in range(4): 19 | for j in range(3): 20 | ax[i, j].plot(data.data[:, i + (j + 1) * 3]) 21 | ax[i, j].grid() 22 | 23 | plt.show() 24 | 25 | 26 | if __name__ == '__main__': 27 | # Load dataset 28 | boston = load_boston() 29 | 30 | # Show dataset 31 | show_dataset(boston) 32 | 33 | # Create a Bayesian ridge regressor instance 34 | br = BayesianRidge(n_iter=1000) 35 | 36 | # Split dataset 37 | X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.1) 38 | 39 | # Train the model 40 | br.fit(X_train, Y_train) 41 | 42 | print('Score %.3f' % br.score(X_test, Y_test)) 43 | 44 | # CV score 45 | scores = cross_val_score(br, boston.data, boston.target, cv=7, scoring='neg_mean_squared_error') 46 | print('CV Negative mean squared errors mean: %.3f' % scores.mean()) 47 | print('CV Negative mean squared errors std: %.3f' % scores.std()) 48 | 49 | # CV R2 score 50 | r2_scores = cross_val_score(br, boston.data, boston.target, cv=10, scoring='r2') 51 | print('CV R2 score mean: %.3f' % r2_scores.mean()) 52 | print('CV R2 score std: %.3f' % r2_scores.std()) 53 | 54 | # Explained variance score 55 | ev_scores = cross_val_score(br, boston.data, boston.target, cv=10, scoring='explained_variance') 56 | print('CV explained variance score mean: %.3f' % ev_scores.mean()) 57 | print('CV explained variance score std: %.3f' % ev_scores.std()) -------------------------------------------------------------------------------- /Chapter04/multiple_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import load_boston 7 | from sklearn.linear_model import LinearRegression 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | def show_dataset(data): 16 | fig, ax = plt.subplots(4, 3, figsize=(20, 15)) 17 | 18 | for i in range(4): 19 | for j in range(3): 20 | ax[i, j].plot(data.data[:, i + (j + 1) * 3]) 21 | ax[i, j].grid() 22 | 23 | plt.show() 24 | 25 | 26 | if __name__ == '__main__': 27 | # Load dataset 28 | boston = load_boston() 29 | 30 | # Show dataset 31 | show_dataset(boston) 32 | 33 | # Create a linear regressor instance 34 | lr = LinearRegression(normalize=True) 35 | 36 | # Split dataset 37 | X_train, X_test, Y_train, Y_test = train_test_split(boston.data, boston.target, test_size=0.1) 38 | 39 | # Train the model 40 | lr.fit(X_train, Y_train) 41 | 42 | print('Score %.3f' % lr.score(X_test, Y_test)) 43 | 
44 | # CV score 45 | scores = cross_val_score(lr, boston.data, boston.target, cv=7, scoring='neg_mean_squared_error') 46 | print('CV Negative mean squared errors mean: %.3f' % scores.mean()) 47 | print('CV Negative mean squared errors std: %.3f' % scores.std()) 48 | 49 | # CV R2 score 50 | r2_scores = cross_val_score(lr, boston.data, boston.target, cv=10, scoring='r2') 51 | print('CV R2 score mean: %.3f' % r2_scores.mean()) 52 | print('CV R2 score std: %.3f' % r2_scores.std()) 53 | 54 | # Explained variance score 55 | ev_scores = cross_val_score(lr, boston.data, boston.target, cv=10, scoring='explained_variance') 56 | print('CV explained variance score mean: %.3f' % ev_scores.mean()) 57 | print('CV explained variance score std: %.3f' % ev_scores.std()) 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /Chapter03/fastica.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import os 6 | 7 | from shutil import copyfileobj 8 | from six.moves import urllib 9 | 10 | from sklearn.datasets.base import get_data_home 11 | from sklearn.datasets import fetch_mldata 12 | from sklearn.decomposition import FastICA 13 | 14 | 15 | # Set random seed for reproducibility 16 | np.random.seed(1000) 17 | 18 | 19 | # mldata.org can be subject to outages 20 | # Alternative original MNIST source (provided by Aurélien Geron) 21 | def fetch_mnist(data_home=None): 22 | mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat" 23 | data_home = get_data_home(data_home=data_home) 24 | data_home = os.path.join(data_home, 'mldata') 25 | if not os.path.exists(data_home): 26 | os.makedirs(data_home) 27 | mnist_save_path = os.path.join(data_home, "mnist-original.mat") 28 | if not os.path.exists(mnist_save_path): 29 | mnist_url = urllib.request.urlopen(mnist_alternative_url) 30 | with open(mnist_save_path, "wb") as matlab_file: 31 | copyfileobj(mnist_url, matlab_file) 32 | 33 | 34 | def zero_center(Xd): 35 | return Xd - np.mean(Xd, axis=0) 36 | 37 | 38 | if __name__ == '__main__': 39 | # Load the dataset 40 | mnist = fetch_mnist() 41 | digits = fetch_mldata("MNIST original") 42 | X = zero_center(digits['data'].astype(np.float64)) 43 | np.random.shuffle(X) 44 | 45 | # Peform Fast ICA with 64 components 46 | fastica = FastICA(n_components=256, max_iter=5000, random_state=1000) 47 | fastica.fit(X) 48 | 49 | # Plot the indipendent components 50 | fig, ax = plt.subplots(8, 8, figsize=(11, 11)) 51 | 52 | for i in range(8): 53 | for j in range(8): 54 | ax[i, j].imshow(fastica.components_[(i * 8) + j].reshape((28, 28)), cmap='gray') 55 | ax[i, j].axis('off') 56 | 57 | plt.show() 58 | -------------------------------------------------------------------------------- /Chapter03/categorical.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder 6 | from sklearn.feature_extraction import DictVectorizer, FeatureHasher 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | if __name__ == '__main__': 13 | Y = np.random.choice(('Male', 'Female'), size=(10)) 14 | 15 | # Encode the labels 16 | print('Label encoding') 17 | le = LabelEncoder() 18 | yt = le.fit_transform(Y) 19 | print(yt) 20 | 21 | # Decode a 
dummy output 22 | print('Label decoding') 23 | output = [1, 0, 1, 1, 0, 0] 24 | decoded_output = [le.classes_[int(i)] for i in output] 25 | print(decoded_output) 26 | 27 | # Binarize the labels 28 | print('Label binarization') 29 | lb = LabelBinarizer() 30 | yb = lb.fit_transform(Y) 31 | print(yb) 32 | 33 | # Decode the binarized labels 34 | print('Label decoding') 35 | lb.inverse_transform(yb) 36 | 37 | # Define some dictionary data 38 | data = [ 39 | {'feature_1': 10.0, 'feature_2': 15.0}, 40 | {'feature_1': -5.0, 'feature_3': 22.0}, 41 | {'feature_3': -2.0, 'feature_4': 10.0} 42 | ] 43 | 44 | # Vectorize the dictionary data 45 | print('Dictionary data vectorization') 46 | dv = DictVectorizer() 47 | Y_dict = dv.fit_transform(data) 48 | print(Y_dict.todense()) 49 | 50 | print('Vocabulary:') 51 | print(dv.vocabulary_) 52 | 53 | # Feature hashing 54 | print('Feature hashing') 55 | fh = FeatureHasher() 56 | Y_hashed = fh.fit_transform(data) 57 | 58 | # Decode the features 59 | print('Feature decoding') 60 | print(Y_hashed.todense()) 61 | 62 | # One-hot encoding 63 | data1 = [ 64 | [0.0, 10.0], 65 | [1.0, 11.0], 66 | [1.0, 8.0], 67 | [0.0, 12.0], 68 | [0.0, 15.0] 69 | ] 70 | 71 | # Encode data 72 | oh = OneHotEncoder(categorical_features=[0]) 73 | Y_oh = oh.fit_transform(data1) 74 | print(Y_oh.todense()) 75 | -------------------------------------------------------------------------------- /Chapter09/k_means_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_circles 7 | from sklearn.cluster import KMeans 8 | 9 | from scipy.spatial.distance import pdist 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 1000 16 | 17 | 18 | def show_dataset(X): 19 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 20 | 21 | ax.grid() 22 | ax.set_xlabel('X') 23 | ax.set_ylabel('Y') 24 | 25 | for i in range(nb_samples): 26 | if Y[i] == 0: 27 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 28 | else: 29 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 30 | 31 | plt.show() 32 | 33 | 34 | def show_clustered_dataset(X, km): 35 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 36 | 37 | ax.grid() 38 | ax.set_xlabel('X') 39 | ax.set_ylabel('Y') 40 | 41 | for i in range(nb_samples): 42 | c = km.predict(X[i].reshape(1, -1)) 43 | if c == 0: 44 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 45 | else: 46 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 47 | 48 | plt.show() 49 | 50 | 51 | if __name__ == '__main__': 52 | # Create dataset 53 | X, Y = make_circles(n_samples=nb_samples, noise=0.05) 54 | 55 | # Show dataset 56 | show_dataset(X) 57 | 58 | # Create and train K-Means 59 | km = KMeans(n_clusters=2) 60 | km.fit(X) 61 | 62 | # Show clustered dataset 63 | show_clustered_dataset(X, km) 64 | 65 | # Compute the average intra-cluster distances 66 | Y_pred = km.predict(X) 67 | 68 | sampled_X = np.random.choice(X[Y_pred == 0, 0], replace=False, size=300).astype(np.int32) 69 | 70 | true_distances = pdist(X[Y == 0], metric='euclidean') 71 | distances = pdist(X[sampled_X], metric='euclidean') 72 | 73 | print('True average distance: %.3f' % np.mean(true_distances)) 74 | print('Clustering agerage distance: %.3f' % np.mean(distances)) 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /Chapter13/reuters_text_classifier.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import reuters, stopwords 6 | from nltk.tokenize import RegexpTokenizer 7 | from nltk.stem.snowball import SnowballStemmer 8 | 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | ret = RegexpTokenizer('[a-zA-Z0-9\']+') 17 | sw = set(stopwords.words('english')) 18 | ess = SnowballStemmer('english', ignore_stopwords=True) 19 | 20 | 21 | def tokenizer(sentence): 22 | tokens = ret.tokenize(sentence) 23 | return [ess.stem(t) for t in tokens if t not in sw] 24 | 25 | 26 | if __name__ == '__main__': 27 | # Compose the corpus 28 | Xr = np.array(reuters.sents(categories=['rubber'])) 29 | Xc = np.array(reuters.sents(categories=['cotton'])) 30 | Xw = np.concatenate((Xr, Xc)) 31 | X = [] 32 | 33 | for document in Xw: 34 | X.append(' '.join(document).strip().lower()) 35 | 36 | # Create the label vectors 37 | Yr = np.zeros(shape=Xr.shape) 38 | Yc = np.ones(shape=Xc.shape) 39 | Y = np.concatenate((Yr, Yc)) 40 | 41 | # Vectorize 42 | tfidfv = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1, 2), norm='l2') 43 | Xv = tfidfv.fit_transform(X) 44 | 45 | # Prepare train and test sets 46 | X_train, X_test, Y_train, Y_test = train_test_split(Xv, Y, test_size=0.25) 47 | 48 | # Create and train a Random Forest classifier 49 | rf = RandomForestClassifier(n_estimators=25) 50 | rf.fit(X_train, Y_train) 51 | 52 | # Test classifier 53 | score = rf.score(X_test, Y_test) 54 | print('Score: %.3f' % score) 55 | 56 | test_newsline = [ 57 | 'Trading tobacco is reducing the amount of requests for cotton and this has a negative impact on our economy'] 58 | yvt = tfidfv.transform(test_newsline) 59 | category = rf.predict(yvt) 60 | print('Predicted category: %d' % int(category[0])) 61 | 62 | -------------------------------------------------------------------------------- /Chapter04/2d_linear_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from scipy.optimize import minimize 7 | 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | # Number of samples 13 | nb_samples = 200 14 | 15 | 16 | def loss(v): 17 | e = 0.0 18 | for i in range(nb_samples): 19 | e += np.square(v[0] + v[1]*X[i] - Y[i]) 20 | return 0.5 * e 21 | 22 | 23 | def gradient(v): 24 | g = np.zeros(shape=2) 25 | for i in range(nb_samples): 26 | g[0] += (v[0] + v[1]*X[i] - Y[i]) 27 | g[1] += ((v[0] + v[1]*X[i] - Y[i]) * X[i]) 28 | return g 29 | 30 | 31 | def show_dataset(X, Y): 32 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 33 | 34 | ax.scatter(X, Y) 35 | ax.set_xlabel('X') 36 | ax.set_ylabel('Y') 37 | ax.grid() 38 | 39 | plt.show() 40 | 41 | 42 | if __name__ == '__main__': 43 | # Create dataset 44 | X = np.arange(-5, 5, 0.05) 45 | 46 | Y = X + 2 47 | Y += np.random.normal(0.0, 0.5, size=nb_samples) 48 | 49 | # Show the dataset 50 | show_dataset(X, Y) 51 | 52 | # Minimize loss function 53 | result = minimize(fun=loss, x0=np.array([0.0, 0.0]), jac=gradient, method='L-BFGS-B') 54 | print(result) 55 | 56 | print('Interpolating rect:') 57 | print('y = %.2fx + %2.f' % (result.x[1], result.x[0])) 58 | 59 | # Compute the absolute error 
60 | err = 0.0 61 | 62 | for i in range(nb_samples): 63 | err += np.abs(Y[i] - (result.x[1]*X[i] + result.x[0])) 64 | 65 | print('Absolute error: %.2f' % err) 66 | 67 | # Repeat the process using the Moore-Penrose pseudo-inverse 68 | Xs = np.expand_dims(X, axis=1) 69 | Ys = np.expand_dims(Y, axis=1) 70 | Xs = np.concatenate((Xs, np.ones_like(Xs)), axis=1) 71 | 72 | result = np.linalg.inv(np.dot(Xs.T, Xs)).dot(Xs.T).dot(Y) 73 | 74 | print('Interpolating rect:') 75 | print('y = %.2fx + %2.f' % (result[0], result[1])) 76 | 77 | # Compute the estimator covariance 78 | covariance = (0.5 ** 2) * np.linalg.inv(np.dot(Xs.T, Xs)) 79 | 80 | print('Estimator covariance matrix:') 81 | print(covariance) 82 | -------------------------------------------------------------------------------- /Chapter05/classification_metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import make_classification 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.metrics import accuracy_score, zero_one_loss, jaccard_similarity_score, confusion_matrix, \ 9 | precision_score, recall_score, fbeta_score, cohen_kappa_score, classification_report 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | nb_samples = 500 16 | 17 | 18 | if __name__ == '__main__': 19 | # Create dataset 20 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 21 | n_clusters_per_class=1, random_state=1000) 22 | 23 | # Split dataset 24 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1000) 25 | 26 | # Create and train logistic regressor 27 | lr = LogisticRegression() 28 | lr.fit(X_train, Y_train) 29 | 30 | print('Accuracy score: %.3f' % accuracy_score(Y_test, lr.predict(X_test))) 31 | print('Zero-one loss (normalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test))) 32 | print('Zero-one loss (unnormalized): %.3f' % zero_one_loss(Y_test, lr.predict(X_test), normalize=False)) 33 | print('Jaccard similarity score: %.3f' % jaccard_similarity_score(Y_test, lr.predict(X_test))) 34 | 35 | # Compute confusion matrix 36 | cm = confusion_matrix(y_true=Y_test, y_pred=lr.predict(X_test)) 37 | print('Confusion matrix:') 38 | print(cm[::-1, ::-1]) 39 | 40 | print('Precision score: %.3f' % precision_score(Y_test, lr.predict(X_test))) 41 | print('Recall score: %.3f' % recall_score(Y_test, lr.predict(X_test))) 42 | print('F-Beta score (1): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1)) 43 | print('F-Beta score (0.75): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=0.75)) 44 | print('F-Beta score (1.25): %.3f' % fbeta_score(Y_test, lr.predict(X_test), beta=1.25)) 45 | print('Cohen-Kappa score: %.3f' % cohen_kappa_score(Y_test, lr.predict(X_test))) 46 | 47 | # Print the classification report 48 | print('\n\nClassification report:') 49 | print(classification_report(Y_test, lr.predict(X_test))) 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /Chapter06/gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.naive_bayes import GaussianNB 8 | from sklearn.model_selection import 
train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.metrics import roc_curve, auc 11 | 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | nb_samples = 300 17 | 18 | 19 | def show_dataset(X, Y): 20 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 21 | 22 | ax.grid() 23 | ax.set_xlabel('X') 24 | ax.set_ylabel('Y') 25 | 26 | for i in range(nb_samples): 27 | if Y[i] == 0: 28 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 29 | else: 30 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 31 | 32 | plt.show() 33 | 34 | 35 | if __name__ == '__main__': 36 | # Create dataset 37 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0) 38 | 39 | # Show dataset 40 | show_dataset(X, Y) 41 | 42 | # Split dataset 43 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 44 | 45 | # Create and train Gaussian Naive Bayes classifier 46 | gnb = GaussianNB() 47 | gnb.fit(X_train, Y_train) 48 | 49 | # Create and train a Logistic regressor (for comparison) 50 | lr = LogisticRegression() 51 | lr.fit(X_train, Y_train) 52 | 53 | # Compute ROC Curve 54 | Y_gnb_score = gnb.predict_proba(X_test) 55 | Y_lr_score = lr.decision_function(X_test) 56 | 57 | fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(Y_test, Y_gnb_score[:, 1]) 58 | fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_lr_score) 59 | 60 | # Plot ROC Curve 61 | plt.figure(figsize=(30, 25)) 62 | 63 | plt.plot(fpr_gnb, tpr_gnb, color='red', label='Naive Bayes (AUC: %.2f)' % auc(fpr_gnb, tpr_gnb)) 64 | plt.plot(fpr_lr, tpr_lr, color='green', label='Logistic Regression (AUC: %.2f)' % auc(fpr_lr, tpr_lr)) 65 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 66 | plt.xlim([0.0, 1.0]) 67 | plt.ylim([0.0, 1.01]) 68 | plt.title('ROC Curve') 69 | plt.xlabel('False Positive Rate') 70 | plt.ylabel('True Positive Rate') 71 | plt.legend(loc="lower right") 72 | 73 | plt.show() 74 | -------------------------------------------------------------------------------- /Chapter05/passive_aggressive_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from sklearn.datasets import make_regression 7 | from sklearn.linear_model import PassiveAggressiveRegressor 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples_1 = 300 14 | nb_samples_2 = 500 15 | 16 | 17 | if __name__ == '__main__': 18 | # Create the dataset 19 | X, Y = make_regression(n_samples=nb_samples_1, n_features=5, random_state=1000) 20 | 21 | # Create the model 22 | par = PassiveAggressiveRegressor(C=0.01, loss='squared_epsilon_insensitive', epsilon=0.001, max_iter=2000, 23 | random_state=1000) 24 | 25 | # Fit the model incrementally and collect the squared errors 26 | squared_errors = [] 27 | 28 | for (x, y) in zip(X, Y): 29 | par.partial_fit(x.reshape(1, -1), y.ravel()) 30 | y_pred = par.predict(x.reshape(1, -1)) 31 | squared_errors.append(np.power(y_pred - y, 2)) 32 | 33 | # Show the error plot 34 | fig, ax = plt.subplots(figsize=(18, 8)) 35 | 36 | ax.plot(squared_errors) 37 | ax.set_xlabel('Sample') 38 | ax.set_ylabel('Squared error') 39 | ax.grid() 40 | 41 | plt.show() 42 | 43 | # Repeat the example with a discontinuous dataset 44 | X1, Y1 = make_regression(n_samples=nb_samples_2, n_features=5, random_state=1000) 45 | X2, Y2 = make_regression(n_samples=nb_samples_2, n_features=5, random_state=1000) 46 | 47 | 
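# Shifting the second block by max(X1) and offsetting its targets introduces an
# abrupt change halfway through the stream, so the second error plot should show
# a spike where the Passive Aggressive regressor has to re-adapt online.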
X2 += np.max(X1) 48 | Y2 += 0.5 49 | 50 | X = np.concatenate((X1, X2)) 51 | Y = np.concatenate((Y1, Y2)) 52 | 53 | par = PassiveAggressiveRegressor(C=0.01, loss='squared_epsilon_insensitive', epsilon=0.001, max_iter=2000, 54 | random_state=1000) 55 | 56 | # Fit the model incrementally and collect the squared errors 57 | squared_errors = [] 58 | 59 | for (x, y) in zip(X, Y): 60 | par.partial_fit(x.reshape(1, -1), y.ravel()) 61 | y_pred = par.predict(x.reshape(1, -1)) 62 | squared_errors.append(np.power(y_pred - y, 2)) 63 | 64 | # Show the error plot 65 | fig, ax = plt.subplots(figsize=(18, 8)) 66 | 67 | ax.plot(squared_errors) 68 | ax.set_xlabel('Sample') 69 | ax.set_ylabel('Squared error') 70 | ax.grid() 71 | 72 | plt.show() -------------------------------------------------------------------------------- /Chapter10/spectral_clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import warnings 6 | 7 | from sklearn.datasets import make_moons 8 | from sklearn.cluster import SpectralClustering 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 1000 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | def show_clustered_dataset(X, Y): 34 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 35 | 36 | ax.grid() 37 | ax.set_xlabel('X') 38 | ax.set_ylabel('Y') 39 | 40 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], marker='o', color='r') 41 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], marker='^', color='b') 42 | 43 | plt.show() 44 | 45 | 46 | if __name__ == '__main__': 47 | warnings.simplefilter("ignore") 48 | 49 | # Create dataset 50 | X, Y = make_moons(n_samples=nb_samples, noise=0.05) 51 | 52 | # Show dataset 53 | show_dataset(X, Y) 54 | 55 | # Cluster the dataset for different gamma values 56 | Yss = [] 57 | gammas = np.linspace(0, 12, 4) 58 | 59 | for gamma in gammas: 60 | sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma) 61 | Yss.append(sc.fit_predict(X)) 62 | 63 | # Show the result 64 | # The colors can be inverted with respect to the figure in the book 65 | fig, ax = plt.subplots(1, 4, figsize=(18, 8)) 66 | 67 | for i in range(4): 68 | ax[i].scatter(X[Yss[i] == 1, 0], X[Yss[i] == 1, 1], marker='o', color='r') 69 | ax[i].scatter(X[Yss[i] == 0, 0], X[Yss[i] == 0, 1], marker='^', color='b') 70 | ax[i].grid() 71 | ax[i].set_xlabel('X') 72 | ax[i].set_ylabel('Y') 73 | ax[i].set_title('Gamma = {}'.format(i * 4)) 74 | 75 | plt.show() 76 | 77 | # Create and train Spectral Clustering 78 | sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors') 79 | Ys = sc.fit_predict(X) 80 | 81 | # Show clustered dataset 82 | show_clustered_dataset(X, Y) 83 | -------------------------------------------------------------------------------- /Chapter04/ridge_lasso_elasticnet.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.datasets import load_boston 6 | from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV 7 | from 
sklearn.model_selection import cross_val_score 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | if __name__ == '__main__': 15 | boston = load_boston() 16 | 17 | # Create a linear regressor and compute CV score 18 | lr = LinearRegression(normalize=True) 19 | lr_scores = cross_val_score(lr, boston.data, boston.target, cv=10) 20 | print('Linear regression CV average score: %.6f' % lr_scores.mean()) 21 | 22 | # Create a Ridge regressor and compute CV score 23 | rg = Ridge(0.05, normalize=True) 24 | rg_scores = cross_val_score(rg, boston.data, boston.target, cv=10) 25 | print('Ridge regression CV average score: %.6f' % rg_scores.mean()) 26 | 27 | # Create a Lasso regressor and compute CV score 28 | ls = Lasso(0.01, normalize=True) 29 | ls_scores = cross_val_score(ls, boston.data, boston.target, cv=10) 30 | print('Lasso regression CV average score: %.6f' % ls_scores.mean()) 31 | 32 | # Create ElasticNet regressor and compute CV score 33 | en = ElasticNet(alpha=0.001, l1_ratio=0.8, normalize=True) 34 | en_scores = cross_val_score(en, boston.data, boston.target, cv=10) 35 | print('ElasticNet regression CV average score: %.6f' % en_scores.mean()) 36 | 37 | # Find the optimal alpha value for Ridge regression 38 | rgcv = RidgeCV(alphas=(1.0, 0.1, 0.01, 0.001, 0.005, 0.0025, 0.001, 0.00025), normalize=True) 39 | rgcv.fit(boston.data, boston.target) 40 | print('Ridge optimal alpha: %.3f' % rgcv.alpha_) 41 | 42 | # Find the optimal alpha value for Lasso regression 43 | lscv = LassoCV(alphas=(1.0, 0.1, 0.01, 0.001, 0.005, 0.0025, 0.001, 0.00025), normalize=True) 44 | lscv.fit(boston.data, boston.target) 45 | print('Lasso optimal alpha: %.3f' % lscv.alpha_) 46 | 47 | # Find the optimal alpha and l1_ratio for Elastic Net 48 | encv = ElasticNetCV(alphas=(0.1, 0.01, 0.005, 0.0025, 0.001), l1_ratio=(0.1, 0.25, 0.5, 0.75, 0.8), normalize=True) 49 | encv.fit(boston.data, boston.target) 50 | print('ElasticNet optimal alpha: %.3f and L1 ratio: %.4f' % (encv.alpha_, encv.l1_ratio_)) 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /Chapter15/keras_scikit_learn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.utils import shuffle 7 | from sklearn.model_selection import GridSearchCV 8 | 9 | from keras.models import Sequential 10 | from keras.layers import Dense, Activation 11 | from keras.optimizers import Adam 12 | from keras.utils import to_categorical 13 | from keras.wrappers.scikit_learn import KerasClassifier 14 | 15 | 16 | # For reproducibility 17 | np.random.seed(1000) 18 | 19 | nb_samples = 2000 20 | 21 | 22 | def build_model(lr=0.001): 23 | model = Sequential() 24 | 25 | model.add(Dense(64, input_dim=2)) 26 | model.add(Activation('relu')) 27 | 28 | model.add(Dense(32)) 29 | model.add(Activation('relu')) 30 | 31 | model.add(Dense(16)) 32 | model.add(Activation('relu')) 33 | 34 | model.add(Dense(2)) 35 | model.add(Activation('softmax')) 36 | 37 | model.compile(optimizer=Adam(lr=lr), 38 | loss='categorical_crossentropy', 39 | metrics=['accuracy']) 40 | 41 | return model 42 | 43 | 44 | if __name__ == '__main__': 45 | # Create the dataset 46 | X = np.zeros(shape=(nb_samples, 2), dtype=np.float32) 47 | Y = np.zeros(shape=(nb_samples,), dtype=np.float32) 48 | 49 | t = 15.0 * np.random.uniform(0.0, 1.0, size=(int(nb_samples / 2), 1)) 50 | 51 | 
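# The two classes are generated as a pair of interleaved spirals (the radius grows
# with the angle t, with opposite orientations) plus uniform noise, which makes the
# problem non-linearly separable.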
X[0:int(nb_samples / 2), :] = t * np.hstack([-np.cos(t), np.sin(t)]) + \ 52 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 53 | Y[0:int(nb_samples / 2)] = 0 54 | 55 | X[int(nb_samples / 2):, :] = t * np.hstack([np.cos(t), -np.sin(t)]) + \ 56 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 57 | Y[int(nb_samples / 2):] = 1 58 | 59 | ss = StandardScaler() 60 | X = ss.fit_transform(X) 61 | 62 | X, Y = shuffle(X, Y, random_state=1000) 63 | 64 | # Wrap the Keras model 65 | skmodel = KerasClassifier(build_fn=build_model, epochs=100, batch_size=32, lr=0.001) 66 | 67 | # Perform a grid search 68 | parameters = { 69 | 'lr': [0.001, 0.01, 0.1], 70 | 'batch_size': [32, 64, 128] 71 | } 72 | 73 | gs = GridSearchCV(skmodel, parameters, cv=5) 74 | gs.fit(X, to_categorical(Y, 2)) 75 | 76 | # Show the best score and parameters 77 | print(gs.best_score_) 78 | print(gs.best_params_) 79 | 80 | -------------------------------------------------------------------------------- /Chapter07/svr_airfoil.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import pandas as pd 6 | 7 | from sklearn.svm import SVR 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | # Download the dataset from: https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise 16 | # Change with the folder where the file is stored 17 | file_path = '/airfoil_self_noise.dat' 18 | 19 | 20 | if __name__ == '__main__': 21 | # Load the dataset 22 | df = pd.read_csv(file_path, sep='\t', header=None) 23 | 24 | # Show the statistics 25 | print(df.describe()) 26 | 27 | # Extract the independent and dependent variables 28 | X = df.iloc[:, 0:5].values 29 | Y = df.iloc[:, 5].values 30 | 31 | # Scale the data 32 | ssx, ssy = StandardScaler(), StandardScaler() 33 | 34 | Xs = ssx.fit_transform(X) 35 | Ys = ssy.fit_transform(Y.reshape(-1, 1)) 36 | 37 | # Create train and test sets 38 | X_train, X_test, Y_train, Y_test = train_test_split(Xs, Ys.ravel(), test_size=300, random_state=1000) 39 | 40 | # Instantiate and train the SVR 41 | svr = SVR(kernel='rbf', gamma=0.75, C=2.8, cache_size=500, epsilon=0.1) 42 | svr.fit(X_train, Y_train) 43 | 44 | # Print the R^2 scores 45 | print('Training R^2 score: %.3f' % svr.score(X_train, Y_train)) 46 | print('Test R^2 score: %.3f' % svr.score(X_test, Y_test)) 47 | 48 | # Show both original dataset and predictions 49 | fig, ax = plt.subplots(2, 1, figsize=(15, 9)) 50 | 51 | ax[0].plot(ssy.inverse_transform(Ys)) 52 | ax[0].set_title('Original dataset') 53 | ax[0].set_ylabel('Scaled sound pressure (dB)') 54 | ax[0].grid() 55 | 56 | ax[1].plot(ssy.inverse_transform(svr.predict(Xs))) 57 | ax[1].set_title('Predictions') 58 | ax[1].set_xlabel('Sample') 59 | ax[1].set_ylabel('Scaled sound pressure (dB)') 60 | ax[1].grid() 61 | 62 | plt.show() 63 | 64 | # Show the absolute errors 65 | fig, ax = plt.subplots(figsize=(15, 4)) 66 | 67 | Y = np.squeeze(ssy.inverse_transform(Ys)) 68 | Yp = ssy.inverse_transform(svr.predict(Xs)) 69 | 70 | ax.plot(np.abs(Y - Yp)) 71 | ax.set_title('Absolute errors') 72 | ax.set_xlabel('Sample') 73 | ax.set_ylabel(r'$|Y-Yp|$') 74 | ax.grid() 75 | 76 | plt.show() -------------------------------------------------------------------------------- /Chapter05/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.model_selection import train_test_split, cross_val_score 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | nb_samples = 500 15 | 16 | 17 | def show_dataset(X, Y): 18 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 19 | 20 | ax.grid() 21 | ax.set_xlabel('X') 22 | ax.set_ylabel('Y') 23 | 24 | for i in range(nb_samples): 25 | if Y[i] == 0: 26 | ax.scatter(X[i, 0], X[i, 1], marker='o', color='r') 27 | else: 28 | ax.scatter(X[i, 0], X[i, 1], marker='^', color='b') 29 | 30 | plt.show() 31 | 32 | 33 | def show_classification_areas(X, Y, lr): 34 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 35 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 36 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02)) 37 | Z = lr.predict(np.c_[xx.ravel(), yy.ravel()]) 38 | 39 | Z = Z.reshape(xx.shape) 40 | plt.figure(1, figsize=(30, 25)) 41 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1) 42 | 43 | # Plot also the training points 44 | plt.scatter(X[:, 0], X[:, 1], c=np.abs(Y - 1), edgecolors='k', cmap=plt.cm.coolwarm) 45 | plt.xlabel('X') 46 | plt.ylabel('Y') 47 | 48 | plt.xlim(xx.min(), xx.max()) 49 | plt.ylim(yy.min(), yy.max()) 50 | plt.xticks(()) 51 | plt.yticks(()) 52 | 53 | plt.show() 54 | 55 | 56 | if __name__ == '__main__': 57 | # Create dataset 58 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, 59 | n_clusters_per_class=1) 60 | 61 | # Show dataset 62 | show_dataset(X, Y) 63 | 64 | # Split dataset 65 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) 66 | 67 | # Create logistic regressor 68 | lr = LogisticRegression() 69 | lr.fit(X_train, Y_train) 70 | print('Logistic regression score: %.3f' % lr.score(X_test, Y_test)) 71 | 72 | # Compute CV score 73 | lr_scores = cross_val_score(lr, X, Y, scoring='accuracy', cv=10) 74 | print('Logistic regression CV average score: %.3f' % lr_scores.mean()) 75 | 76 | # Show classification areas 77 | show_classification_areas(X, Y, lr) 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Chapter11/agglomerative_clustering.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_blobs 7 | from sklearn.cluster import AgglomerativeClustering 8 | from sklearn.metrics import silhouette_score, adjusted_rand_score 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | nb_samples = 3000 14 | 15 | 16 | def plot_clustered_dataset(X, Y): 17 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 18 | 19 | ax.grid() 20 | ax.set_xlabel('X') 21 | ax.set_ylabel('Y') 22 | 23 | markers = ['o', 'd', '^', 'x', '1', '2', '3', 's'] 24 | colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', '#cccfff'] 25 | 26 | for i in range(nb_samples): 27 | ax.scatter(X[i, 0], X[i, 1], marker=markers[Y[i]], color=colors[Y[i]]) 28 | 29 | plt.show() 30 | 31 | 32 | if __name__ == '__main__': 33 | # Create the dataset 34 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=8, cluster_std=2.0) 35 | 36 | # Show the dataset 37 | 
fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 38 | 39 | ax.grid() 40 | ax.set_xlabel('X') 41 | ax.set_ylabel('Y') 42 | 43 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 44 | plt.show() 45 | 46 | # Complete linkage 47 | print('Complete linkage') 48 | ac = AgglomerativeClustering(n_clusters=8, linkage='complete') 49 | Y_pred = ac.fit_predict(X) 50 | 51 | print('Silhouette score (Complete): %.3f' % silhouette_score(X, Y_pred)) 52 | print('Adjusted Rand score (Complete): %.3f' % adjusted_rand_score(Y, Y_pred)) 53 | 54 | # Show the clustered dataset 55 | plot_clustered_dataset(X, Y) 56 | 57 | # Average linkage 58 | print('Average linkage') 59 | ac = AgglomerativeClustering(n_clusters=8, linkage='average') 60 | Y_pred = ac.fit_predict(X) 61 | 62 | print('Silhouette score (Average): %.3f' % silhouette_score(X, Y_pred)) 63 | print('Adjusted Rand score (Average): %.3f' % adjusted_rand_score(Y, Y_pred)) 64 | 65 | # Show the clustered dataset 66 | plot_clustered_dataset(X, Y) 67 | 68 | # Ward linkage 69 | print('Ward linkage') 70 | ac = AgglomerativeClustering(n_clusters=8) 71 | Y_pred = ac.fit_predict(X) 72 | 73 | print('Silhouette score (Ward): %.3f' % silhouette_score(X, Y_pred)) 74 | print('Adjusted Rand score (Ward): %.3f' % adjusted_rand_score(Y, Y_pred)) 75 | 76 | # Show the clustered dataset 77 | plot_clustered_dataset(X, Y) 78 | 79 | 80 | -------------------------------------------------------------------------------- /Chapter13/vectorizing.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import stopwords 6 | from nltk.tokenize import RegexpTokenizer 7 | from nltk.stem.snowball import SnowballStemmer 8 | 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 10 | 11 | # For reproducibility 12 | np.random.seed(1000) 13 | 14 | ret = RegexpTokenizer('[a-zA-Z0-9\']+') 15 | sw = set(stopwords.words('english')) 16 | ess = SnowballStemmer('english', ignore_stopwords=True) 17 | 18 | 19 | def tokenizer(sentence): 20 | tokens = ret.tokenize(sentence) 21 | return [ess.stem(t) for t in tokens if t not in sw] 22 | 23 | 24 | if __name__ == '__main__': 25 | # Create a corpus 26 | corpus = [ 27 | 'This is a simple test corpus', 28 | 'A corpus is a set of text documents', 29 | 'We want to analyze the corpus and the documents', 30 | 'Documents can be automatically tokenized' 31 | ] 32 | 33 | # Create a count vectorizer 34 | print('Count vectorizer:') 35 | cv = CountVectorizer() 36 | 37 | vectorized_corpus = cv.fit_transform(corpus) 38 | print(vectorized_corpus.todense()) 39 | 40 | print('CV Vocabulary:') 41 | print(cv.vocabulary_) 42 | 43 | # Perform an inverse transformation 44 | vector = [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1] 45 | print(cv.inverse_transform(vector)) 46 | 47 | # Use a complete external tokenizer 48 | print('CV with external tokenizer:') 49 | cv = CountVectorizer(tokenizer=tokenizer) 50 | vectorized_corpus = cv.fit_transform(corpus) 51 | print(vectorized_corpus.todense()) 52 | 53 | # Use an n-gram range equal to (1, 2) 54 | print('CV witn n-gram range (1, 2):') 55 | cv = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 2)) 56 | vectorized_corpus = cv.fit_transform(corpus) 57 | print(vectorized_corpus.todense()) 58 | 59 | print('N-gram range (1,2) vocabulary:') 60 | print(cv.vocabulary_) 61 | 62 | # Create a Tf-Idf vectorizer 63 | print('Tf-Idf vectorizer:') 64 | tfidfv = TfidfVectorizer() 65 | 
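# With the default settings (use_idf=True, smooth_idf=True, norm='l2'), the matrix
# printed below contains tf-idf weights, i.e. tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
# where n is the number of documents, with each document row rescaled to unit L2 norm.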
vectorized_corpus = tfidfv.fit_transform(corpus) 66 | print(vectorized_corpus.todense()) 67 | 68 | print('Tf-Idf vocabulary:') 69 | print(tfidfv.vocabulary_) 70 | 71 | # Use n-gram range equal to (1, 2) and L2 normalization 72 | print('Tf-Idf witn n-gram range (1, 2) and L2 normalization:') 73 | tfidfv = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1, 2), norm='l2') 74 | vectorized_corpus = tfidfv.fit_transform(corpus) 75 | print(vectorized_corpus.todense()) 76 | 77 | -------------------------------------------------------------------------------- /Chapter14/sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import multiprocessing 5 | import numpy as np 6 | 7 | from nltk.corpus import stopwords 8 | from nltk.tokenize import RegexpTokenizer 9 | from nltk.stem.lancaster import LancasterStemmer 10 | 11 | from sklearn.feature_extraction.text import TfidfVectorizer 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.metrics import precision_score, recall_score, roc_curve, auc 15 | 16 | # For reproducibility 17 | np.random.seed(1000) 18 | 19 | # Path to the dataset (http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip) 20 | dataset = 'dataset.csv' 21 | 22 | rt = RegexpTokenizer('[a-zA-Z0-9\.]+') 23 | sw = set(stopwords.words('english')) 24 | ls = LancasterStemmer() 25 | 26 | 27 | def tokenizer(sentence): 28 | tokens = rt.tokenize(sentence) 29 | return [ls.stem(t.lower()) for t in tokens if t not in sw] 30 | 31 | 32 | if __name__ == '__main__': 33 | # Load corpus and labels 34 | corpus = [] 35 | labels = [] 36 | 37 | with open(dataset, 'r', encoding='utf-8') as df: 38 | for i, line in enumerate(df): 39 | if i == 0: 40 | continue 41 | 42 | parts = line.strip().split(',') 43 | labels.append(float(parts[1].strip())) 44 | corpus.append(parts[3].strip()) 45 | 46 | # Vectorize the corpus (only 100000 records) 47 | tfv = TfidfVectorizer(tokenizer=tokenizer, sublinear_tf=True, ngram_range=(1, 2), norm='l2') 48 | X = tfv.fit_transform(corpus[0:100000]) 49 | Y = np.array(labels[0:100000]) 50 | 51 | # Prepare train and test sets 52 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1) 53 | 54 | # Create and train a Random Forest 55 | rf = RandomForestClassifier(n_estimators=20, n_jobs=multiprocessing.cpu_count()) 56 | rf.fit(X_train, Y_train) 57 | 58 | # Compute scores 59 | print('Precision: %.3f' % precision_score(Y_test, rf.predict(X_test))) 60 | print('Recall: %.3f' % recall_score(Y_test, rf.predict(X_test))) 61 | 62 | # Compute the ROC curve 63 | y_score = rf.predict_proba(X_test) 64 | fpr, tpr, thresholds = roc_curve(Y_test, y_score[:, 1]) 65 | 66 | plt.figure(figsize=(8, 8)) 67 | plt.plot(fpr, tpr, color='red', label='Random Forest (AUC: %.2f)' % auc(fpr, tpr)) 68 | plt.plot([0, 1], [0, 1], color='blue', linestyle='--') 69 | plt.xlim([0.0, 1.0]) 70 | plt.ylim([0.0, 1.01]) 71 | plt.title('ROC Curve') 72 | plt.xlabel('False Positive Rate') 73 | plt.ylabel('True Positive Rate') 74 | plt.legend(loc="lower right") 75 | plt.show() 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Chapter03/data_scaling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import 
matplotlib.pyplot as plt 5 | 6 | from sklearn.preprocessing import StandardScaler, RobustScaler 7 | 8 | # For reproducibility 9 | np.random.seed(1000) 10 | 11 | if __name__ == '__main__': 12 | # Create a dummy dataset 13 | data = np.ndarray(shape=(100, 2)) 14 | 15 | for i in range(100): 16 | data[i, 0] = 2.0 + np.random.normal(1.5, 3.0) 17 | data[i, 1] = 0.5 + np.random.normal(1.5, 3.0) 18 | 19 | # Show the original and the scaled dataset 20 | fig, ax = plt.subplots(1, 2, figsize=(14, 5)) 21 | 22 | ax[0].scatter(data[:, 0], data[:, 1]) 23 | ax[0].set_xlim([-10, 10]) 24 | ax[0].set_ylim([-10, 10]) 25 | ax[0].grid() 26 | ax[0].set_xlabel('X') 27 | ax[0].set_ylabel('Y') 28 | ax[0].set_title('Raw data') 29 | 30 | # Scale data 31 | ss = StandardScaler() 32 | scaled_data = ss.fit_transform(data) 33 | 34 | ax[1].scatter(scaled_data[:, 0], scaled_data[:, 1]) 35 | ax[1].set_xlim([-10, 10]) 36 | ax[1].set_ylim([-10, 10]) 37 | ax[1].grid() 38 | ax[1].set_xlabel('X') 39 | ax[1].set_ylabel('Y') 40 | ax[1].set_title('Scaled data') 41 | 42 | plt.show() 43 | 44 | # Scale data using a Robust Scaler 45 | fig, ax = plt.subplots(2, 2, figsize=(8, 8)) 46 | 47 | ax[0, 0].scatter(data[:, 0], data[:, 1]) 48 | ax[0, 0].set_xlim([-10, 10]) 49 | ax[0, 0].set_ylim([-10, 10]) 50 | ax[0, 0].grid() 51 | ax[0, 0].set_xlabel('X') 52 | ax[0, 0].set_ylabel('Y') 53 | ax[0, 0].set_title('Raw data') 54 | 55 | rs = RobustScaler(quantile_range=(15, 85)) 56 | scaled_data = rs.fit_transform(data) 57 | 58 | ax[0, 1].scatter(scaled_data[:, 0], scaled_data[:, 1]) 59 | ax[0, 1].set_xlim([-10, 10]) 60 | ax[0, 1].set_ylim([-10, 10]) 61 | ax[0, 1].grid() 62 | ax[0, 1].set_xlabel('X') 63 | ax[0, 1].set_ylabel('Y') 64 | ax[0, 1].set_title('Scaled data (15% - 85%)') 65 | 66 | rs1 = RobustScaler(quantile_range=(25, 75)) 67 | scaled_data1 = rs1.fit_transform(data) 68 | 69 | ax[1, 0].scatter(scaled_data1[:, 0], scaled_data1[:, 1]) 70 | ax[1, 0].set_xlim([-10, 10]) 71 | ax[1, 0].set_ylim([-10, 10]) 72 | ax[1, 0].grid() 73 | ax[1, 0].set_xlabel('X') 74 | ax[1, 0].set_ylabel('Y') 75 | ax[1, 0].set_title('Scaled data (25% - 75%)') 76 | 77 | rs2 = RobustScaler(quantile_range=(30, 65)) 78 | scaled_data2 = rs2.fit_transform(data) 79 | 80 | ax[1, 1].scatter(scaled_data2[:, 0], scaled_data2[:, 1]) 81 | ax[1, 1].set_xlim([-10, 10]) 82 | ax[1, 1].set_ylim([-10, 10]) 83 | ax[1, 1].grid() 84 | ax[1, 1].set_xlabel('X') 85 | ax[1, 1].set_ylabel('Y') 86 | ax[1, 1].set_title('Scaled data (30% - 60%)') 87 | 88 | plt.show() 89 | 90 | -------------------------------------------------------------------------------- /Chapter08/voting_classifier.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.datasets import make_classification 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.svm import SVC 9 | from sklearn.tree import DecisionTreeClassifier 10 | from sklearn.ensemble import VotingClassifier 11 | from sklearn.model_selection import cross_val_score 12 | 13 | # For reproducibility 14 | np.random.seed(1000) 15 | 16 | nb_samples = 500 17 | 18 | 19 | def compute_accuracies(lr, dt, svc, vc, X, Y): 20 | accuracies = [] 21 | 22 | accuracies.append(cross_val_score(lr, X, Y, scoring='accuracy', cv=10).mean()) 23 | accuracies.append(cross_val_score(dt, X, Y, scoring='accuracy', cv=10).mean()) 24 | accuracies.append(cross_val_score(svc, X, Y, scoring='accuracy', cv=10).mean()) 25 | 
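# Together with the three scores above, the ensemble score below completes the list of
# mean 10-fold CV accuracies in the order Logistic Regression, Decision Tree, SVM,
# Voting ensemble (the same order used for the x-tick labels in plot_accuracies()).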
accuracies.append(cross_val_score(vc, X, Y, scoring='accuracy', cv=10).mean()) 26 | 27 | print('Accuracies:') 28 | print(np.array(accuracies)) 29 | 30 | return accuracies 31 | 32 | 33 | def plot_accuracies(accuracies): 34 | fig, ax = plt.subplots(figsize=(12, 8)) 35 | positions = np.array([0, 1, 2, 3]) 36 | 37 | ax.bar(positions, accuracies, 0.5) 38 | ax.set_ylabel('Accuracy') 39 | ax.set_xticklabels(('Logistic Regression', 'Decision Tree', 'SVM', 'Ensemble')) 40 | ax.set_xticks(positions + (5.0 / 20)) 41 | plt.ylim([0.80, 0.93]) 42 | plt.show() 43 | 44 | 45 | if __name__ == '__main__': 46 | # Create the dataset 47 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, n_classes=2) 48 | 49 | # Show the dataset 50 | fig, ax = plt.subplots(figsize=(12, 12)) 51 | 52 | for i, x in enumerate(X): 53 | if Y[i] == 0: 54 | ax.scatter(x[0], x[1], marker='s', color='blue') 55 | else: 56 | ax.scatter(x[0], x[1], marker='d', color='red') 57 | 58 | ax.set_xlabel(r'$X_0$') 59 | ax.set_ylabel(r'$X_1$') 60 | plt.show() 61 | 62 | # Create the classifiers 63 | lr = LogisticRegression() 64 | svc = SVC(kernel='poly', probability=True) 65 | dt = DecisionTreeClassifier() 66 | 67 | classifiers = [('lr', lr), 68 | ('dt', dt), 69 | ('svc', svc)] 70 | 71 | # Hard voting 72 | vc = VotingClassifier(estimators=classifiers, voting='hard') 73 | 74 | # Compute and plot accuracies 75 | hard_accuracies = compute_accuracies(lr, dt, svc, vc, X, Y) 76 | plot_accuracies(hard_accuracies) 77 | 78 | # Soft weighted voting 79 | weights = [1.5, 0.5, 0.75] 80 | 81 | vc = VotingClassifier(estimators=classifiers, weights=weights, voting='soft') 82 | 83 | # Compute and plot accuracies 84 | soft_accuracies = compute_accuracies(lr, dt, svc, vc, X, Y) 85 | plot_accuracies(soft_accuracies) 86 | 87 | -------------------------------------------------------------------------------- /Chapter08/decision_tree_regression.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.tree import DecisionTreeRegressor, export_graphviz 8 | from sklearn.model_selection import train_test_split, cross_val_score 9 | 10 | 11 | # Set random seed for reproducibility 12 | np.random.seed(1000) 13 | 14 | 15 | # Download the dataset from: https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength 16 | # Change with the folder where the file is stored 17 | file_path = '/Concrete_Data.xls' 18 | graphviz_path = '/Concrete_Data.dot' 19 | 20 | 21 | if __name__ == '__main__': 22 | # Load the dataset 23 | df = pd.read_excel(file_path, header=0) 24 | X = df.iloc[:, 0:8].values 25 | Y = df.iloc[:, 8].values 26 | 27 | # Print the statistic summary 28 | print(df.describe()) 29 | 30 | # Print the CV scores 31 | print(cross_val_score(DecisionTreeRegressor(criterion='mse', max_depth=11, random_state=1000), X, Y, cv=20)) 32 | 33 | # Create train and test sets 34 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=200, random_state=1000) 35 | 36 | # Train the Decision Tree Regressor 37 | dtr = DecisionTreeRegressor(criterion='mse', max_depth=11, random_state=1000) 38 | dtr.fit(X_train, Y_train) 39 | 40 | # Export the tree in Graphviz format 41 | # You can use http://www.webgraphviz.com to visualize the tree 42 | export_graphviz(dtr, out_file=graphviz_path, 43 | feature_names=['Cement', 'Blast furnace slag', 'Fly ash', 'Water', 44 | 'Superplasticizer', 
'Coarse Aggregate', 'Fine Aggregate', 'Age']) 45 | 46 | print('Training R^2 score: %.3f' % dtr.score(X_train, Y_train)) 47 | print('Validation R^2 score: %.3f' % dtr.score(X_test, Y_test)) 48 | 49 | # Compute the predictions 50 | Y_pred = dtr.predict(X) 51 | 52 | # Show the dataset, predictions and absolute errors 53 | fig, ax = plt.subplots(3, 1, figsize=(18, 15)) 54 | 55 | ax[0].plot(Y) 56 | ax[0].set_title('Original dataset') 57 | ax[0].set_ylabel('Concrete Compressive Strength') 58 | ax[0].grid() 59 | 60 | ax[1].plot(Y_pred) 61 | ax[1].set_title('Predictions') 62 | ax[1].set_xlabel('Sample') 63 | ax[1].set_ylabel('Concrete Compressive Strength') 64 | ax[1].grid() 65 | 66 | ax[2].plot(np.abs(Y_pred - Y)) 67 | ax[2].set_yticks(np.arange(0.0, 81.0, 10.0)) 68 | ax[2].set_xlabel('Sample') 69 | ax[2].set_ylabel('Absolute error') 70 | ax[2].grid() 71 | 72 | plt.show() 73 | 74 | # Show the absolute error histogram 75 | fig, ax = plt.subplots(figsize=(14, 8)) 76 | 77 | ax.hist(np.abs(Y_pred - Y), bins='auto', log=True) 78 | ax.set_xlabel('Absolute error') 79 | ax.set_ylabel('Sample count') 80 | ax.grid() 81 | 82 | plt.show() -------------------------------------------------------------------------------- /Chapter16/mlp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | 7 | from sklearn.datasets import make_classification 8 | from sklearn.model_selection import train_test_split 9 | 10 | from mpl_toolkits.mplot3d import Axes3D 11 | 12 | nb_samples = 1000 13 | nb_features = 3 14 | nb_epochs = 200 15 | batch_size = 50 16 | 17 | # For reproducibility 18 | np.random.seed(1000) 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create the dataset 23 | X, Y = make_classification(n_samples=nb_samples, n_features=nb_features, 24 | n_informative=3, n_redundant=0, n_classes=2, n_clusters_per_class=3) 25 | 26 | # Show the dataset 27 | fig = plt.figure(figsize=(11, 11)) 28 | ax = fig.add_subplot(111, projection='3d') 29 | 30 | for i, x in enumerate(X): 31 | if Y[i] == 0: 32 | ax.scatter(x[0], x[1], x[2], marker='s', color='blue') 33 | elif Y[i] == 1: 34 | ax.scatter(x[0], x[1], x[2], marker='d', color='red') 35 | 36 | ax.set_xlabel(r'$X_0$') 37 | ax.set_ylabel(r'$X_1$') 38 | ax.set_zlabel(r'$X_2$') 39 | plt.show() 40 | 41 | # Create train and test sets 42 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2) 43 | 44 | # Create the graph 45 | graph = tf.Graph() 46 | 47 | with graph.as_default(): 48 | Xt = tf.placeholder(tf.float32, shape=(None, nb_features), name='X') 49 | Yt = tf.placeholder(tf.float32, shape=(None, 1), name='Y') 50 | 51 | layer_1 = tf.layers.dense(inputs=Xt, units=50, activation=tf.nn.tanh) 52 | layer_2 = tf.layers.dense(inputs=layer_1, units=1, activation=tf.nn.sigmoid) 53 | 54 | Yo = tf.round(layer_2) 55 | 56 | loss = tf.nn.l2_loss(layer_2 - Yt) 57 | training_step = tf.train.GradientDescentOptimizer(0.025).minimize(loss) 58 | 59 | session = tf.InteractiveSession(graph=graph) 60 | tf.global_variables_initializer().run() 61 | 62 | # Run the training cycle 63 | for e in range(nb_epochs): 64 | total_loss = 0.0 65 | Xb = np.ndarray(shape=(batch_size, nb_features), dtype=np.float32) 66 | Yb = np.ndarray(shape=(batch_size, 1), dtype=np.float32) 67 | 68 | for i in range(0, X_train.shape[0] - batch_size, batch_size): 69 | Xb[:, :] = X_train[i:i + batch_size, :] 70 | Yb[:, 0] = Y_train[i:i + batch_size] 71 | 72 
| loss_value, _ = session.run([loss, training_step], feed_dict={Xt: Xb, Yt: Yb}) 73 | total_loss += loss_value 74 | 75 | Y_predicted = session.run([Yo], feed_dict={Xt: X_test.reshape((X_test.shape[0], nb_features))}) 76 | accuracy = 1.0 - (np.sum(np.abs(np.array(Y_predicted[0]).squeeze(axis=1) - Y_test)) / float(Y_test.shape[0])) 77 | 78 | print('Epoch %d) Total loss: %.2f - Accuracy: %.2f' % (e, total_loss, accuracy)) 79 | -------------------------------------------------------------------------------- /Chapter11/connectivity_constraints.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | 7 | from sklearn.datasets import make_circles 8 | from sklearn.cluster import AgglomerativeClustering 9 | from sklearn.neighbors import kneighbors_graph 10 | from sklearn.metrics import silhouette_score 11 | 12 | # For reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | nb_samples = 3000 17 | 18 | 19 | if __name__ == '__main__': 20 | # Create the dataset 21 | X, Y = make_circles(n_samples=nb_samples, noise=0.05) 22 | 23 | # Show the dataset 24 | fig, ax = plt.subplots(1, 1, figsize=(10, 8)) 25 | 26 | ax.grid() 27 | ax.set_xlabel('X') 28 | ax.set_ylabel('Y') 29 | 30 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 31 | plt.show() 32 | 33 | # Unstructured clustering with average linkage 34 | print('Unstructured clustering with average linkage') 35 | ac = AgglomerativeClustering(n_clusters=20, linkage='average') 36 | Y_pred = ac.fit_predict(X) 37 | 38 | print('Silhouette score: %.3f' % silhouette_score(X, Y_pred)) 39 | 40 | # Plot the clustered dataset 41 | fig, ax = plt.subplots(1, 1, figsize=(12, 10)) 42 | 43 | ax.grid() 44 | ax.set_xlabel('X') 45 | ax.set_ylabel('Y') 46 | ax.scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=ac.labels_) 47 | plt.show() 48 | 49 | # Connectivity constraints 50 | print('Imposing connectivity constraints') 51 | 52 | acc = [] 53 | k = [50, 100, 200, 500] 54 | 55 | ac = AgglomerativeClustering(n_clusters=20, connectivity=None, linkage='average') 56 | ac.fit(X) 57 | 58 | for i in k: 59 | kng = kneighbors_graph(X, i) 60 | ac1 = AgglomerativeClustering(n_clusters=20, connectivity=kng, linkage='average') 61 | Y_pred = ac1.fit_predict(X) 62 | print('Silhouette score (k=%d): %.3f' % (i, silhouette_score(X, Y_pred))) 63 | acc.append(ac1) 64 | 65 | # Show the four plots 66 | fig, ax = plt.subplots(2, 2, figsize=(14, 10)) 67 | 68 | ax[0, 0].grid() 69 | ax[0, 0].set_title('K = 50') 70 | ax[0, 0].set_xlabel('X') 71 | ax[0, 0].set_ylabel('Y') 72 | ax[0, 0].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[0].labels_) 73 | 74 | ax[0, 1].grid() 75 | ax[0, 1].set_title('K = 100') 76 | ax[0, 1].set_xlabel('X') 77 | ax[0, 1].set_ylabel('Y') 78 | ax[0, 1].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[1].labels_) 79 | 80 | ax[1, 0].grid() 81 | ax[1, 0].set_title('K = 200') 82 | ax[1, 0].set_xlabel('X') 83 | ax[1, 0].set_ylabel('Y') 84 | ax[1, 0].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[2].labels_) 85 | 86 | ax[1, 1].grid() 87 | ax[1, 1].set_title('K = 500') 88 | ax[1, 1].set_xlabel('X') 89 | ax[1, 1].set_ylabel('Y') 90 | ax[1, 1].scatter(X[:, 0], X[:, 1], marker='o', cmap=cm.spectral, c=acc[3].labels_) 91 | plt.show() 92 | 93 | -------------------------------------------------------------------------------- /Chapter16/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | 7 | from sklearn.datasets import make_classification 8 | 9 | # For reproducibility 10 | np.random.seed(1000) 11 | 12 | nb_samples = 500 13 | 14 | if __name__ == '__main__': 15 | # Create the dataset 16 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, n_classes=2) 17 | 18 | # Plot the dataset 19 | fig, ax = plt.subplots(figsize=(9, 7)) 20 | ax.set_xlabel(r'$X_0$') 21 | ax.set_ylabel(r'$X_1$') 22 | 23 | for i, x in enumerate(X): 24 | if Y[i] == 0: 25 | ax.scatter(x[0], x[1], marker='d', color='blue') 26 | else: 27 | ax.scatter(x[0], x[1], marker='s', color='red') 28 | 29 | plt.show() 30 | 31 | # Create the graph 32 | graph = tf.Graph() 33 | 34 | with graph.as_default(): 35 | Xt = tf.placeholder(tf.float32, shape=(None, 2), name='points') 36 | Yt = tf.placeholder(tf.float32, shape=(None, 1), name='classes') 37 | 38 | W = tf.Variable(tf.zeros((2, 1)), name='weights') 39 | bias = tf.Variable(tf.zeros((1, 1)), name='bias') 40 | 41 | Ye = tf.matmul(Xt, W) + bias 42 | Yc = tf.round(tf.sigmoid(Ye)) 43 | 44 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=Ye, labels=Yt)) 45 | training_step = tf.train.GradientDescentOptimizer(0.025).minimize(loss) 46 | 47 | session = tf.InteractiveSession(graph=graph) 48 | tf.global_variables_initializer().run() 49 | 50 | feed_dict = { 51 | Xt: X, 52 | Yt: Y.reshape((nb_samples, 1)) 53 | } 54 | 55 | for i in range(10000): 56 | loss_value, _ = session.run([loss, training_step], feed_dict=feed_dict) 57 | if i % 100 == 0: 58 | print('Step %d, Loss: %.3f' % (i, loss_value)) 59 | 60 | # Retrieve coefficients and intercept 61 | Wc, Wb = W.eval(), bias.eval() 62 | 63 | print('Coefficients:') 64 | print(Wc) 65 | 66 | print('Intercept:') 67 | print(Wb) 68 | 69 | # Plot the dataset with the separating hyperplane 70 | h = .02 71 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 72 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 73 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 74 | 75 | Z = np.array(session.run([Yc], feed_dict={Xt: np.c_[xx.ravel(), yy.ravel()]})) 76 | 77 | # Put the result into a color plot 78 | Z = Z.reshape(xx.shape) 79 | plt.figure(1, figsize=(12, 12)) 80 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1) 81 | 82 | # Plot also the training points 83 | for i, x in enumerate(X): 84 | if Y[i] == 0: 85 | plt.scatter(x[0], x[1], marker='d', color='blue') 86 | else: 87 | plt.scatter(x[0], x[1], marker='s', color='red') 88 | 89 | plt.xlabel(r'$X_0$') 90 | plt.ylabel(r'$X_1$') 91 | 92 | plt.xlim(xx.min(), xx.max()) 93 | plt.ylim(yy.min(), yy.max()) 94 | plt.xticks(()) 95 | plt.yticks(()) 96 | 97 | plt.show() -------------------------------------------------------------------------------- /Chapter16/dcn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from keras.datasets import mnist 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation, Dropout, Conv2D, AveragePooling2D, Flatten 9 | from keras.optimizers import Adam 10 | from keras.utils import to_categorical 11 | 12 | 13 | # Set random seed for reproducibility 14 | np.random.seed(1000) 15 | 16 | 17 | if __name__ == '__main__': 
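# The steps below load MNIST through keras.datasets, rescale the pixel values to the
# [0, 1] range, one-hot encode the labels and train a dropout-regularized convolutional
# network with average pooling and a softmax output.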
18 | # Load the dataset 19 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 20 | 21 | width = height = X_train.shape[1] 22 | 23 | X_train = X_train.reshape((X_train.shape[0], width, height, 1)).astype(np.float32) / 255.0 24 | X_test = X_test.reshape((X_test.shape[0], width, height, 1)).astype(np.float32) / 255.0 25 | 26 | Y_train = to_categorical(Y_train, num_classes=10) 27 | Y_test = to_categorical(Y_test, num_classes=10) 28 | 29 | # Create the model 30 | model = Sequential() 31 | 32 | model.add(Dropout(0.25, input_shape=(width, height, 1), seed=1000)) 33 | 34 | model.add(Conv2D(16, kernel_size=(3, 3), padding='same')) 35 | model.add(Activation('relu')) 36 | model.add(Dropout(0.5, seed=1000)) 37 | 38 | model.add(Conv2D(16, kernel_size=(3, 3), padding='same')) 39 | model.add(Activation('relu')) 40 | model.add(Dropout(0.5, seed=1000)) 41 | 42 | model.add(AveragePooling2D(pool_size=(2, 2), padding='same')) 43 | 44 | model.add(Conv2D(32, kernel_size=(3, 3), padding='same')) 45 | model.add(Activation('relu')) 46 | 47 | model.add(AveragePooling2D(pool_size=(2, 2), padding='same')) 48 | 49 | model.add(Conv2D(64, kernel_size=(3, 3), padding='same')) 50 | model.add(Activation('relu')) 51 | model.add(Dropout(0.5, seed=1000)) 52 | 53 | model.add(AveragePooling2D(pool_size=(2, 2), padding='same')) 54 | 55 | model.add(Flatten()) 56 | 57 | model.add(Dense(512)) 58 | model.add(Activation('relu')) 59 | model.add(Dropout(0.5, seed=1000)) 60 | 61 | model.add(Dense(10)) 62 | model.add(Activation('softmax')) 63 | 64 | # Compile the model 65 | model.compile(optimizer=Adam(lr=0.001, decay=1e-5), 66 | loss='categorical_crossentropy', 67 | metrics=['accuracy']) 68 | 69 | history = model.fit(X_train, Y_train, 70 | epochs=200, 71 | batch_size=256, 72 | validation_data=(X_test, Y_test)) 73 | 74 | # Show the results 75 | fig, ax = plt.subplots(1, 2, figsize=(18, 6)) 76 | 77 | ax[0].plot(history.history['acc'], label='Training accuracy') 78 | ax[0].plot(history.history['val_acc'], label='Validation accuracy') 79 | ax[0].set_xlabel('Epoch') 80 | ax[0].set_ylabel('Accuracy') 81 | ax[0].legend() 82 | ax[0].grid() 83 | 84 | ax[1].plot(history.history['loss'], label='Training loss') 85 | ax[1].plot(history.history['val_loss'], label='Validation loss') 86 | ax[1].set_xlabel('Epoch') 87 | ax[1].set_ylabel('Loss') 88 | ax[1].set_yticks(np.linspace(0.0, 1.0, 10)) 89 | ax[1].legend() 90 | ax[1].grid() 91 | plt.show() -------------------------------------------------------------------------------- /Chapter17/pipeline_2.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import warnings 5 | 6 | from sklearn.datasets import load_digits 7 | from sklearn.decomposition import PCA, NMF 8 | from sklearn.feature_selection import SelectKBest, f_classif 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.svm import SVC 14 | 15 | # For reproducibility 16 | np.random.seed(1000) 17 | 18 | 19 | if __name__ == '__main__': 20 | warnings.simplefilter("ignore") 21 | 22 | # Load the dataset 23 | digits = load_digits() 24 | 25 | # Create the steps for the pipeline 26 | pca = PCA() 27 | nmf = NMF() 28 | scaler = StandardScaler() 29 | kbest = SelectKBest(f_classif) 30 | lr = LogisticRegression() 31 | svc = SVC() 32 | 33 | pipeline_steps = [ 34 | 
('dimensionality_reduction', pca), 35 | ('normalization', scaler), 36 | ('classification', lr) 37 | ] 38 | 39 | # Create the pipeline 40 | pipeline = Pipeline(pipeline_steps) 41 | 42 | # Perform a grid search 43 | pca_nmf_components = [10, 20, 30] 44 | 45 | param_grid = [ 46 | { 47 | 'dimensionality_reduction': [pca], 48 | 'dimensionality_reduction__n_components': pca_nmf_components, 49 | 'classification': [lr], 50 | 'classification__C': [1, 5, 10, 20] 51 | }, 52 | { 53 | 'dimensionality_reduction': [pca], 54 | 'dimensionality_reduction__n_components': pca_nmf_components, 55 | 'classification': [svc], 56 | 'classification__kernel': ['rbf', 'poly'], 57 | 'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0], 58 | 'classification__degree': [2, 3, 5], 59 | 'classification__C': [1, 5, 10, 20] 60 | }, 61 | { 62 | 'dimensionality_reduction': [nmf], 63 | 'dimensionality_reduction__n_components': pca_nmf_components, 64 | 'classification': [lr], 65 | 'classification__C': [1, 5, 10, 20] 66 | }, 67 | { 68 | 'dimensionality_reduction': [nmf], 69 | 'dimensionality_reduction__n_components': pca_nmf_components, 70 | 'classification': [svc], 71 | 'classification__kernel': ['rbf', 'poly'], 72 | 'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0], 73 | 'classification__degree': [2, 3, 5], 74 | 'classification__C': [1, 5, 10, 20] 75 | }, 76 | { 77 | 'dimensionality_reduction': [kbest], 78 | 'classification': [svc], 79 | 'classification__kernel': ['rbf', 'poly'], 80 | 'classification__gamma': [0.05, 0.1, 0.2, 0.5, 1.0], 81 | 'classification__degree': [2, 3, 5], 82 | 'classification__C': [1, 5, 10, 20] 83 | }, 84 | ] 85 | 86 | gs = GridSearchCV(pipeline, param_grid) 87 | gs.fit(digits.data, digits.target) 88 | 89 | print('Best estimator:') 90 | print(gs.best_estimator_) 91 | 92 | print('Best score:') 93 | print(gs.best_score_) 94 | -------------------------------------------------------------------------------- /Chapter14/plsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | from nltk.corpus import brown 6 | 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | 9 | 10 | # For reproducibility 11 | np.random.seed(1000) 12 | 13 | rank = 2 14 | alpha_1 = 1000.0 15 | alpha_2 = 10.0 16 | 17 | # Compose a corpus 18 | sentences_1 = brown.sents(categories=['editorial'])[0:20] 19 | sentences_2 = brown.sents(categories=['fiction'])[0:20] 20 | corpus = [] 21 | 22 | for s in sentences_1 + sentences_2: 23 | corpus.append(' '.join(s)) 24 | 25 | # Vectorize the corpus 26 | cv = CountVectorizer(strip_accents='unicode', stop_words='english') 27 | Xc = np.array(cv.fit_transform(corpus).todense()) 28 | 29 | # Define the probability matrices 30 | Ptd = np.random.uniform(0.0, 1.0, size=(len(corpus), rank)) 31 | Pwt = np.random.uniform(0.0, 1.0, size=(rank, len(cv.vocabulary_))) 32 | Ptdw = np.zeros(shape=(len(cv.vocabulary_), len(corpus), rank)) 33 | 34 | # Normalize the probability matrices 35 | for d in range(len(corpus)): 36 | nf = np.sum(Ptd[d, :]) 37 | for t in range(rank): 38 | Ptd[d, t] /= nf 39 | 40 | for t in range(rank): 41 | nf = np.sum(Pwt[t, :]) 42 | for w in range(len(cv.vocabulary_)): 43 | Pwt[t, w] /= nf 44 | 45 | 46 | def log_likelihood(): 47 | value = 0.0 48 | 49 | for d in range(len(corpus)): 50 | for w in range(len(cv.vocabulary_)): 51 | real_topic_value = 0.0 52 | 53 | for t in range(rank): 54 | real_topic_value += Ptd[d, t] * Pwt[t, w] 55 | 56 | if real_topic_value > 0.0: 57 | value += 
Xc[d, w] * np.log(real_topic_value) 58 | 59 | return value 60 | 61 | 62 | def expectation(): 63 | global Ptd, Pwt, Ptdw 64 | 65 | for d in range(len(corpus)): 66 | for w in range(len(cv.vocabulary_)): 67 | nf = 0.0 68 | 69 | for t in range(rank): 70 | Ptdw[w, d, t] = Ptd[d, t] * Pwt[t, w] 71 | nf += Ptdw[w, d, t] 72 | 73 | Ptdw[w, d, :] = (Ptdw[w, d, :] / nf) if nf != 0.0 else 0.0 74 | 75 | 76 | def maximization(): 77 | global Ptd, Pwt, Ptdw 78 | 79 | for t in range(rank): 80 | nf = 0.0 81 | 82 | for d in range(len(corpus)): 83 | ps = 0.0 84 | 85 | for w in range(len(cv.vocabulary_)): 86 | ps += Xc[d, w] * Ptdw[w, d, t] 87 | 88 | Pwt[t, w] = ps 89 | nf += Pwt[t, w] 90 | 91 | Pwt[:, w] /= nf if nf != 0.0 else alpha_1 92 | 93 | for d in range(len(corpus)): 94 | for t in range(rank): 95 | ps = 0.0 96 | nf = 0.0 97 | 98 | for w in range(len(cv.vocabulary_)): 99 | ps += Xc[d, w] * Ptdw[w, d, t] 100 | nf += Xc[d, w] 101 | 102 | Ptd[d, t] = ps / (nf if nf != 0.0 else alpha_2) 103 | 104 | 105 | if __name__ == '__main__': 106 | print('Initial Log-Likelihood: %f' % log_likelihood()) 107 | 108 | for i in range(30): 109 | expectation() 110 | maximization() 111 | print('Step %d - Log-Likelihood: %f' % (i, log_likelihood())) 112 | 113 | # Show the top 5 words per topic 114 | Pwts = np.argsort(Pwt, axis=1)[::-1] 115 | 116 | for t in range(rank): 117 | print('\nTopic ' + str(t)) 118 | for i in range(5): 119 | print(cv.get_feature_names()[Pwts[t, i]]) -------------------------------------------------------------------------------- /Chapter16/lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from keras.models import Sequential 7 | from keras.layers import LSTM, Dense, Activation 8 | from keras.optimizers import Adam 9 | 10 | # Install with pip -U install datapackage. 
For further information: https://datahub.io/core/global-temp#python 11 | from datapackage import Package 12 | 13 | from sklearn.preprocessing import MinMaxScaler 14 | 15 | 16 | # Set random seed for reproducibility 17 | np.random.seed(1000) 18 | 19 | 20 | nb_samples = 1600 21 | nb_test_samples = 200 22 | sequence_length = 20 23 | 24 | 25 | if __name__ == '__main__': 26 | # Load the dataset 27 | package = Package('https://datahub.io/core/global-temp/datapackage.json') 28 | 29 | for resource in package.resources: 30 | if resource.descriptor['datahub']['type'] == 'derived/csv': 31 | data = resource.read() 32 | 33 | # Extract the time series 34 | data_gcag = data[0:len(data):2][::-1] 35 | 36 | Y = np.zeros(shape=(len(data_gcag), 1), dtype=np.float32) 37 | 38 | for i, y in enumerate(data_gcag): 39 | Y[i - 1, 0] = y[2] 40 | 41 | # Scale between -1.0 and 1.0 42 | mmscaler = MinMaxScaler((-1.0, 1.0)) 43 | Y = mmscaler.fit_transform(Y) 44 | 45 | # Show the time-series 46 | fig, ax = plt.subplots(figsize=(20, 10)) 47 | 48 | ax.plot(Y) 49 | ax.grid() 50 | ax.set_xlabel('Time steps') 51 | ax.set_ylabel('Monthly Avg Temperature Anomaly') 52 | 53 | plt.show() 54 | 55 | # Create the training and test sets 56 | X_ts = np.zeros(shape=(nb_samples - sequence_length, sequence_length, 1), dtype=np.float32) 57 | Y_ts = np.zeros(shape=(nb_samples - sequence_length, 1), dtype=np.float32) 58 | 59 | for i in range(0, nb_samples - sequence_length): 60 | X_ts[i] = Y[i:i + sequence_length] 61 | Y_ts[i] = Y[i + sequence_length] 62 | 63 | X_ts_train = X_ts[0:nb_samples - nb_test_samples, :] 64 | Y_ts_train = Y_ts[0:nb_samples - nb_test_samples] 65 | 66 | X_ts_test = X_ts[nb_samples - nb_test_samples:, :] 67 | Y_ts_test = Y_ts[nb_samples - nb_test_samples:] 68 | 69 | # Create the model 70 | model = Sequential() 71 | 72 | model.add(LSTM(8, stateful=True, batch_input_shape=(20, sequence_length, 1))) 73 | 74 | model.add(Dense(1)) 75 | model.add(Activation('linear')) 76 | 77 | # Compile the model 78 | model.compile(optimizer=Adam(lr=0.001, decay=0.0001), 79 | loss='mse', 80 | metrics=['mse']) 81 | 82 | # Train the model 83 | model.fit(X_ts_train, Y_ts_train, 84 | batch_size=20, 85 | epochs=100, 86 | shuffle=False, 87 | validation_data=(X_ts_test, Y_ts_test)) 88 | 89 | # Show the predictions on the training set 90 | fig, ax = plt.subplots(figsize=(20, 10)) 91 | 92 | ax.plot(Y_ts_train, label='True values') 93 | ax.plot(model.predict(X_ts_train, batch_size=20), label='Predicted values') 94 | ax.grid() 95 | ax.set_xlabel('Time steps') 96 | ax.set_ylabel('Monthly Avg Temperature Anomaly') 97 | ax.legend() 98 | 99 | plt.show() 100 | 101 | # Show the predictions on the test set 102 | fig, ax = plt.subplots(figsize=(20, 10)) 103 | 104 | ax.plot(Y_ts_test, label='True values') 105 | ax.plot(model.predict(X_ts_test, batch_size=20), label='Predicted values') 106 | ax.grid() 107 | ax.set_xlabel('Time steps') 108 | ax.set_ylabel('Monthly Avg Temperature Anomaly') 109 | ax.legend() 110 | 111 | plt.show() 112 | 113 | -------------------------------------------------------------------------------- /Chapter08/dt.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="C <= -0.367\ngini = 0.667\nsamples = 500\nvalue = [168, 165, 167]\nclass = C1"] ; 4 | 1 [label="A <= -0.447\ngini = 0.078\nsamples = 172\nvalue = [0, 7, 165]\nclass = C3"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="gini = 0.0\nsamples = 4\nvalue = [0, 4, 
0]\nclass = C2"] ; 7 | 1 -> 2 ; 8 | 3 [label="A <= -0.171\ngini = 0.035\nsamples = 168\nvalue = [0, 3, 165]\nclass = C3"] ; 9 | 1 -> 3 ; 10 | 4 [label="A <= -0.212\ngini = 0.48\nsamples = 5\nvalue = [0, 2, 3]\nclass = C3"] ; 11 | 3 -> 4 ; 12 | 5 [label="gini = 0.0\nsamples = 3\nvalue = [0, 0, 3]\nclass = C3"] ; 13 | 4 -> 5 ; 14 | 6 [label="gini = 0.0\nsamples = 2\nvalue = [0, 2, 0]\nclass = C2"] ; 15 | 4 -> 6 ; 16 | 7 [label="A <= -0.016\ngini = 0.012\nsamples = 163\nvalue = [0, 1, 162]\nclass = C3"] ; 17 | 3 -> 7 ; 18 | 8 [label="A <= -0.025\ngini = 0.219\nsamples = 8\nvalue = [0, 1, 7]\nclass = C3"] ; 19 | 7 -> 8 ; 20 | 9 [label="gini = 0.0\nsamples = 7\nvalue = [0, 0, 7]\nclass = C3"] ; 21 | 8 -> 9 ; 22 | 10 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 23 | 8 -> 10 ; 24 | 11 [label="gini = 0.0\nsamples = 155\nvalue = [0, 0, 155]\nclass = C3"] ; 25 | 7 -> 11 ; 26 | 12 [label="B <= -0.299\ngini = 0.506\nsamples = 328\nvalue = [168, 158, 2]\nclass = C1"] ; 27 | 0 -> 12 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; 28 | 13 [label="C <= 2.115\ngini = 0.109\nsamples = 173\nvalue = [163, 10, 0]\nclass = C1"] ; 29 | 12 -> 13 ; 30 | 14 [label="A <= -1.63\ngini = 0.047\nsamples = 167\nvalue = [163, 4, 0]\nclass = C1"] ; 31 | 13 -> 14 ; 32 | 15 [label="C <= 0.942\ngini = 0.238\nsamples = 29\nvalue = [25, 4, 0]\nclass = C1"] ; 33 | 14 -> 15 ; 34 | 16 [label="gini = 0.0\nsamples = 24\nvalue = [24, 0, 0]\nclass = C1"] ; 35 | 15 -> 16 ; 36 | 17 [label="A <= -1.7\ngini = 0.32\nsamples = 5\nvalue = [1, 4, 0]\nclass = C2"] ; 37 | 15 -> 17 ; 38 | 18 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0, 0]\nclass = C1"] ; 39 | 17 -> 18 ; 40 | 19 [label="gini = 0.0\nsamples = 4\nvalue = [0, 4, 0]\nclass = C2"] ; 41 | 17 -> 19 ; 42 | 20 [label="gini = 0.0\nsamples = 138\nvalue = [138, 0, 0]\nclass = C1"] ; 43 | 14 -> 20 ; 44 | 21 [label="gini = 0.0\nsamples = 6\nvalue = [0, 6, 0]\nclass = C2"] ; 45 | 13 -> 21 ; 46 | 22 [label="A <= -0.19\ngini = 0.087\nsamples = 155\nvalue = [5, 148, 2]\nclass = C2"] ; 47 | 12 -> 22 ; 48 | 23 [label="B <= -0.154\ngini = 0.052\nsamples = 151\nvalue = [3, 147, 1]\nclass = C2"] ; 49 | 22 -> 23 ; 50 | 24 [label="C <= 0.802\ngini = 0.32\nsamples = 10\nvalue = [2, 8, 0]\nclass = C2"] ; 51 | 23 -> 24 ; 52 | 25 [label="C <= 0.269\ngini = 0.444\nsamples = 3\nvalue = [2, 1, 0]\nclass = C1"] ; 53 | 24 -> 25 ; 54 | 26 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 55 | 25 -> 26 ; 56 | 27 [label="gini = 0.0\nsamples = 2\nvalue = [2, 0, 0]\nclass = C1"] ; 57 | 25 -> 27 ; 58 | 28 [label="gini = 0.0\nsamples = 7\nvalue = [0, 7, 0]\nclass = C2"] ; 59 | 24 -> 28 ; 60 | 29 [label="C <= 0.195\ngini = 0.028\nsamples = 141\nvalue = [1, 139, 1]\nclass = C2"] ; 61 | 23 -> 29 ; 62 | 30 [label="C <= 0.178\ngini = 0.194\nsamples = 19\nvalue = [1, 17, 1]\nclass = C2"] ; 63 | 29 -> 30 ; 64 | 31 [label="C <= 0.135\ngini = 0.105\nsamples = 18\nvalue = [0, 17, 1]\nclass = C2"] ; 65 | 30 -> 31 ; 66 | 32 [label="gini = 0.0\nsamples = 16\nvalue = [0, 16, 0]\nclass = C2"] ; 67 | 31 -> 32 ; 68 | 33 [label="C <= 0.156\ngini = 0.5\nsamples = 2\nvalue = [0, 1, 1]\nclass = C2"] ; 69 | 31 -> 33 ; 70 | 34 [label="gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = C3"] ; 71 | 33 -> 34 ; 72 | 35 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 73 | 33 -> 35 ; 74 | 36 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0, 0]\nclass = C1"] ; 75 | 30 -> 36 ; 76 | 37 [label="gini = 0.0\nsamples = 122\nvalue = [0, 122, 0]\nclass = C2"] ; 77 | 29 -> 37 ; 78 | 38 
[label="C <= 0.8\ngini = 0.625\nsamples = 4\nvalue = [2, 1, 1]\nclass = C1"] ; 79 | 22 -> 38 ; 80 | 39 [label="B <= 1.822\ngini = 0.5\nsamples = 2\nvalue = [0, 1, 1]\nclass = C2"] ; 81 | 38 -> 39 ; 82 | 40 [label="gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]\nclass = C3"] ; 83 | 39 -> 40 ; 84 | 41 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]\nclass = C2"] ; 85 | 39 -> 41 ; 86 | 42 [label="gini = 0.0\nsamples = 2\nvalue = [2, 0, 0]\nclass = C1"] ; 87 | 38 -> 42 ; 88 | } -------------------------------------------------------------------------------- /Chapter09/gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from matplotlib.patches import Ellipse 7 | 8 | from sklearn.datasets import make_blobs 9 | from sklearn.mixture import GaussianMixture 10 | 11 | 12 | # Set random seed for reproducibility 13 | np.random.seed(1000) 14 | 15 | 16 | # Total number of samples 17 | nb_samples = 800 18 | 19 | 20 | if __name__ == '__main__': 21 | # Create the dataset 22 | X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=2.2, random_state=1000) 23 | 24 | # Show the original dataset 25 | fig, ax = plt.subplots(figsize=(15, 8)) 26 | 27 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], c='r', s=20, marker='p', label='Class 0') 28 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], c='g', s=20, marker='d', label='Class 1') 29 | ax.scatter(X[Y == 2, 0], X[Y == 2, 1], c='b', s=20, marker='s', label='Class 2') 30 | ax.set_xlabel(r'$x_0$') 31 | ax.set_ylabel(r'$x_1$') 32 | ax.legend() 33 | ax.grid() 34 | 35 | plt.show() 36 | 37 | # Create a fit a Gaussian Mixture model 38 | gm = GaussianMixture(n_components=3, max_iter=1000, random_state=1000) 39 | gm.fit(X) 40 | 41 | # Print means, covariances, and weights 42 | print('Means:\n') 43 | print(gm.means_) 44 | 45 | print('\nCovariances:\n') 46 | print(gm.covariances_) 47 | 48 | print('\nWeights:\n') 49 | print(gm.weights_) 50 | 51 | # Show the clustered dataset with the final Gaussian distributions 52 | fig, ax = plt.subplots(figsize=(15, 8)) 53 | 54 | c = gm.covariances_ 55 | m = gm.means_ 56 | 57 | g1 = Ellipse(xy=m[0], width=4 * np.sqrt(c[0][0, 0]), height=4 * np.sqrt(c[0][1, 1]), fill=False, linestyle='dashed', 58 | linewidth=2) 59 | g1_1 = Ellipse(xy=m[0], width=3 * np.sqrt(c[0][0, 0]), height=3 * np.sqrt(c[0][1, 1]), fill=False, 60 | linestyle='dashed', linewidth=3) 61 | g1_2 = Ellipse(xy=m[0], width=1.5 * np.sqrt(c[0][0, 0]), height=1.5 * np.sqrt(c[0][1, 1]), fill=False, 62 | linestyle='dashed', linewidth=4) 63 | 64 | g2 = Ellipse(xy=m[1], width=4 * np.sqrt(c[1][0, 0]), height=4 * np.sqrt(c[1][1, 1]), fill=False, linestyle='dashed', 65 | linewidth=2) 66 | g2_1 = Ellipse(xy=m[1], width=3 * np.sqrt(c[1][0, 0]), height=3 * np.sqrt(c[1][1, 1]), fill=False, 67 | linestyle='dashed', linewidth=3) 68 | g2_2 = Ellipse(xy=m[1], width=1.5 * np.sqrt(c[1][0, 0]), height=1.5 * np.sqrt(c[1][1, 1]), fill=False, 69 | linestyle='dashed', linewidth=4) 70 | 71 | g3 = Ellipse(xy=m[2], width=4 * np.sqrt(c[2][0, 0]), height=4 * np.sqrt(c[2][1, 1]), fill=False, linestyle='dashed', 72 | linewidth=2) 73 | g3_1 = Ellipse(xy=m[2], width=3 * np.sqrt(c[2][0, 0]), height=3 * np.sqrt(c[2][1, 1]), fill=False, 74 | linestyle='dashed', linewidth=3) 75 | g3_2 = Ellipse(xy=m[2], width=1.5 * np.sqrt(c[2][0, 0]), height=1.5 * np.sqrt(c[2][1, 1]), fill=False, 76 | linestyle='dashed', linewidth=4) 77 | 78 | ax.add_artist(g1) 79 | 
ax.add_artist(g1_1) 80 | ax.add_artist(g1_2) 81 | ax.add_artist(g2) 82 | ax.add_artist(g2_1) 83 | ax.add_artist(g2_2) 84 | ax.add_artist(g3) 85 | ax.add_artist(g3_1) 86 | ax.add_artist(g3_2) 87 | 88 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], c='r', s=20, marker='p', label='Class 0') 89 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], c='g', s=20, marker='d', label='Class 1') 90 | ax.scatter(X[Y == 2, 0], X[Y == 2, 1], c='b', s=20, marker='s', label='Class 2') 91 | ax.set_xlabel(r'$x_0$') 92 | ax.set_ylabel(r'$x_1$') 93 | ax.legend() 94 | ax.grid() 95 | 96 | plt.show() 97 | 98 | # Compute AICs and BICs 99 | nb_components = [2, 3, 4, 5, 6, 7, 8] 100 | 101 | aics = [] 102 | bics = [] 103 | 104 | for n in nb_components: 105 | gm = GaussianMixture(n_components=n, max_iter=1000, random_state=1000) 106 | gm.fit(X) 107 | aics.append(gm.aic(X)) 108 | bics.append(gm.bic(X)) 109 | 110 | fig, ax = plt.subplots(2, 1, figsize=(15, 8)) 111 | 112 | ax[0].plot(nb_components, aics) 113 | ax[0].set_ylabel('AIC') 114 | ax[0].grid() 115 | 116 | ax[1].plot(nb_components, bics) 117 | ax[1].set_xlabel('Number of components') 118 | ax[1].set_ylabel('BIC') 119 | ax[1].grid() 120 | 121 | plt.show() -------------------------------------------------------------------------------- /Chapter15/mlp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.model_selection import cross_val_score, train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.utils import shuffle 10 | 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Activation 13 | from keras.utils import to_categorical 14 | 15 | 16 | # For reproducibility 17 | np.random.seed(1000) 18 | 19 | nb_samples = 2000 20 | 21 | 22 | if __name__ == '__main__': 23 | # Create the dataset 24 | X = np.zeros(shape=(nb_samples, 2), dtype=np.float32) 25 | Y = np.zeros(shape=(nb_samples,), dtype=np.float32) 26 | 27 | t = 15.0 * np.random.uniform(0.0, 1.0, size=(int(nb_samples / 2), 1)) 28 | 29 | X[0:int(nb_samples / 2), :] = t * np.hstack([-np.cos(t), np.sin(t)]) + \ 30 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 31 | Y[0:int(nb_samples / 2)] = 0 32 | 33 | X[int(nb_samples / 2):, :] = t * np.hstack([np.cos(t), -np.sin(t)]) + \ 34 | np.random.uniform(0.0, 1.8, size=(int(nb_samples / 2), 2)) 35 | Y[int(nb_samples / 2):] = 1 36 | 37 | ss = StandardScaler() 38 | X = ss.fit_transform(X) 39 | 40 | X, Y = shuffle(X, Y, random_state=1000) 41 | 42 | # Show the dataset 43 | fig, ax = plt.subplots(figsize=(8, 8)) 44 | 45 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label='Class 0') 46 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], label='Class 1') 47 | ax.set_xlabel(r'$x_0$') 48 | ax.set_ylabel(r'$x_1$') 49 | ax.legend() 50 | ax.grid() 51 | 52 | plt.show() 53 | 54 | # Perform a Logistic Regression cross-validation 55 | lr = LogisticRegression(penalty='l2', C=0.01, random_state=1000) 56 | print(np.mean(cross_val_score(lr, X, Y, cv=10))) 57 | 58 | # Show the classification result 59 | lr.fit(X, Y) 60 | Y_pred_lr = lr.predict(X) 61 | 62 | fig, ax = plt.subplots(figsize=(8, 8)) 63 | 64 | ax.scatter(X[Y_pred_lr == 0, 0], X[Y_pred_lr == 0, 1], label='Class 0') 65 | ax.scatter(X[Y_pred_lr == 1, 0], X[Y_pred_lr == 1, 1], label='Class 1') 66 | ax.set_xlabel(r'$x_0$') 67 | ax.set_ylabel(r'$x_1$') 68 | ax.legend() 69 | ax.grid() 70 | 71 | 
plt.show() 72 | 73 | # Create a Keras model 74 | model = Sequential() 75 | 76 | model.add(Dense(64, input_dim=2)) 77 | model.add(Activation('relu')) 78 | 79 | model.add(Dense(32)) 80 | model.add(Activation('relu')) 81 | 82 | model.add(Dense(16)) 83 | model.add(Activation('relu')) 84 | 85 | model.add(Dense(2)) 86 | model.add(Activation('softmax')) 87 | 88 | # Compile the model 89 | model.compile(optimizer='adam', 90 | loss='categorical_crossentropy', 91 | metrics=['accuracy']) 92 | 93 | # Split the dataset into train and test sets 94 | X_train, X_test, Y_train, Y_test = \ 95 | train_test_split(X, to_categorical(Y), test_size=0.2, random_state=1000) 96 | 97 | # Train the model 98 | model.fit(X_train, Y_train, 99 | epochs=100, 100 | batch_size=32, 101 | validation_data=(X_test, Y_test)) 102 | 103 | # Show the classification result 104 | Y_pred_mlp = np.argmax(model.predict(X), axis=1) 105 | 106 | fig, ax = plt.subplots(figsize=(8, 8)) 107 | 108 | ax.scatter(X[Y_pred_mlp == 0, 0], X[Y_pred_mlp == 0, 1], label='Class 0') 109 | ax.scatter(X[Y_pred_mlp == 1, 0], X[Y_pred_mlp == 1, 1], label='Class 1') 110 | ax.set_xlabel(r'$x_0$') 111 | ax.set_ylabel(r'$x_1$') 112 | ax.legend() 113 | ax.grid() 114 | 115 | plt.show() 116 | 117 | # Show the decision surfaces 118 | Xm = np.linspace(-2.0, 2.0, 1000) 119 | Ym = np.linspace(-2.0, 2.0, 1000) 120 | Xmg, Ymg = np.meshgrid(Xm, Ym) 121 | X_eval = np.vstack([Xmg.ravel(), Ymg.ravel()]).T 122 | 123 | Y_eval_lr = lr.predict(X_eval) 124 | Y_eval_mlp = np.argmax(model.predict(X_eval), axis=1) 125 | 126 | fig, ax = plt.subplots(1, 2, figsize=(16, 8)) 127 | 128 | ax[0].scatter(X_eval[Y_eval_lr == 0, 0], X_eval[Y_eval_lr == 0, 1]) 129 | ax[0].scatter(X_eval[Y_eval_lr == 1, 0], X_eval[Y_eval_lr == 1, 1]) 130 | ax[0].set_xlabel(r'$x_0$') 131 | ax[0].set_ylabel(r'$x_1$') 132 | ax[0].set_title('Logistic Regression') 133 | 134 | ax[1].scatter(X_eval[Y_eval_mlp == 0, 0], X_eval[Y_eval_mlp == 0, 1]) 135 | ax[1].scatter(X_eval[Y_eval_mlp == 1, 0], X_eval[Y_eval_mlp == 1, 1]) 136 | ax[1].set_xlabel(r'$x_0$') 137 | ax[1].set_ylabel(r'$x_1$') 138 | ax[1].set_title('MLP') 139 | 140 | plt.show() 141 | 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Algorithms Second Edition 2 | 3 | 4 | 5 | This is the code repository for [Machine Learning Algorithms Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms-second-edition?utm_source=github&utm_medium=reposiory), published by Packt. 6 | 7 | **Popular algorithms for data science and machine learning** 8 | 9 | ## What is this book about? 10 | Machine learning has gained tremendous popularity thanks to the fast, accurate predictions it can deliver on large datasets. The real force behind this output, however, is a set of complex algorithms, grounded in statistical analysis, that churn through the data and generate substantial insight.
11 | 12 | This book covers the following exciting features: 13 | * Study feature selection and the feature engineering process 14 | * Assess performance and error trade-offs for linear regression 15 | * Build a data model and understand how it works by using different types of algorithms 16 | * Learn to tune the parameters of Support Vector Machines (SVM) 17 | * Explore the concept of natural language processing (NLP) and recommendation systems 18 | 19 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789347998) today! 20 | 21 | 22 | 23 | 24 | 25 | ## Instructions and Navigations 26 | All of the code is organized into folders. For example, Chapter02. 27 | 28 | The code will look like the following: 29 | ``` 30 | from sklearn.svm import SVC 31 | from sklearn.model_selection import cross_val_score 32 | svc = SVC(kernel='linear') 33 | print(cross_val_score(svc, X, Y, scoring='accuracy', cv=10).mean()) 34 | 0.93191356542617032 35 | ``` 36 | 37 | **Following is what you need for this book:** 38 | Machine Learning Algorithms is for you if you are a machine learning engineer, data engineer, or junior data scientist who wants to advance in the field of predictive analytics and machine learning. Familiarity with R and Python will be an added advantage for getting the best from this book. 39 | 40 | With the following software and hardware list, you can run all of the code files present in the book (Chapters 2-17). 41 | 42 | ### Software and Hardware List 43 | 44 | | Chapter | Software required | OS required | 45 | | -------- | ------------------------------------| -----------------------------------| 46 | | 2-17 | Python 2.7/3.5, SciPy 0.18, | Windows, Mac OS X, and Linux (Any) | 47 | | | NumPy 1.11+, Matplotlib 2.0, | | 48 | | | scikit-learn 0.18+, Crab, | | 49 | | | Apache Spark 2+, NLTK, langdetect, | | 50 | | | Gensim, Keras 2+, CuPy | | 51 | 52 | 53 | 54 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/MachineLearningAlgorithmsSecondEdition_ColorImages.pdf). 55 | 56 | 57 | ### Related products 58 | * Mastering Machine Learning Algorithms [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781788621113) [[Amazon]](https://www.amazon.com/dp/1788621115) 59 | 60 | * Python Deep Learning [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/python-deep-learning?utm_source=github&utm_medium=repository&utm_campaign=9781786464453) [[Amazon]](https://www.amazon.com/dp/1786464454) 61 | 62 | ## Get to Know the Author 63 | **Giuseppe Bonaccorso** 64 | is an experienced team leader/manager in AI, machine/deep learning solution design, management, and delivery. He received his MScEng in electronics in 2005 from the University of Catania, Italy, and continued his studies at the University of Rome Tor Vergata and the University of Essex, UK. His main interests include machine/deep learning, reinforcement learning, big data, bio-inspired adaptive systems, cryptocurrencies, and NLP.
65 | 66 | 67 | 68 | ## Other books by the author 69 | * [Mastering Machine Learning Algorithms](https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781788621113) 70 | * [Machine Learning Algorithms](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781785889622) 71 | 72 | ### Suggestions and Feedback 73 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 74 | -------------------------------------------------------------------------------- /Chapter09/evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | 7 | from sklearn.datasets import make_blobs 8 | from sklearn.cluster import KMeans 9 | from sklearn.metrics import silhouette_score, silhouette_samples, calinski_harabaz_score, \ 10 | homogeneity_score, completeness_score, adjusted_rand_score 11 | from sklearn.metrics.pairwise import pairwise_distances 12 | 13 | 14 | # For reproducibility 15 | np.random.seed(1000) 16 | 17 | 18 | nb_samples = 1000 19 | 20 | 21 | if __name__ == '__main__': 22 | # Create the dataset (keep the ground-truth labels for the external metrics computed at the end) 23 | X, Y_true = make_blobs(n_samples=nb_samples, n_features=2, centers=3, cluster_std=1.5, random_state=1000) 24 | 25 | # Show the dataset 26 | fig, ax = plt.subplots(1, 1, figsize=(30, 25)) 27 | 28 | ax.grid() 29 | ax.set_xlabel('X') 30 | ax.set_ylabel('Y') 31 | 32 | ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') 33 | 34 | plt.show() 35 | 36 | # Analyze the inertia 37 | nb_clusters = [2, 3, 5, 6, 7, 8, 9, 10] 38 | 39 | inertias = [] 40 | 41 | for n in nb_clusters: 42 | km = KMeans(n_clusters=n) 43 | km.fit(X) 44 | inertias.append(km.inertia_) 45 | 46 | fig, ax = plt.subplots(figsize=(15, 8)) 47 | 48 | ax.plot(nb_clusters, inertias) 49 | ax.set_xlabel('Number of clusters') 50 | ax.set_ylabel('Inertia') 51 | ax.grid() 52 | 53 | plt.show() 54 | 55 | # Analyze the silhouette scores 56 | avg_silhouettes = [] 57 | 58 | for n in nb_clusters: 59 | km = KMeans(n_clusters=n) 60 | Y = km.fit_predict(X) 61 | avg_silhouettes.append(silhouette_score(X, Y)) 62 | 63 | fig, ax = plt.subplots(figsize=(15, 8)) 64 | 65 | ax.plot(nb_clusters, avg_silhouettes) 66 | ax.set_xlabel('Number of clusters') 67 | ax.set_ylabel('Average Silhouette score') 68 | ax.grid() 69 | 70 | plt.show() 71 | 72 | # Draw the silhouette plots 73 | fig, ax = plt.subplots(2, 2, figsize=(15, 10)) 74 | 75 | nb_clusters = [2, 3, 4, 8] 76 | mapping = [(0, 0), (0, 1), (1, 0), (1, 1)] 77 | 78 | for i, n in enumerate(nb_clusters): 79 | km = KMeans(n_clusters=n) 80 | Y = km.fit_predict(X) 81 | 82 | silhouette_values = silhouette_samples(X, Y) 83 | 84 | ax[mapping[i]].set_xticks([-0.15, 0.0, 0.25, 0.5, 0.75, 1.0]) 85 | ax[mapping[i]].set_yticks([]) 86 | ax[mapping[i]].set_title('%d clusters' % n) 87 | ax[mapping[i]].set_xlim([-0.15, 1]) 88 | ax[mapping[i]].grid() 89 | y_lower = 20 90 | 91 | for t in range(n): 92 | ct_values = silhouette_values[Y == t] 93 | ct_values.sort() 94 | 95 | y_upper = y_lower + ct_values.shape[0] 96 | 97 | color = cm.Accent(float(t) / n) 98 | ax[mapping[i]].fill_betweenx(np.arange(y_lower, y_upper), 0, 99 | ct_values, facecolor=color, edgecolor=color) 100 | 101 | y_lower = y_upper + 20
102 | 103 | # Analyze the Calinski-Harabasz scores 104 | ch_scores = [] 105 | 106 | km = KMeans(n_clusters=n) 107 | Y = km.fit_predict(X) 108 | 109 | for n in nb_clusters: 110 | km = KMeans(n_clusters=n) 111 | Y = km.fit_predict(X) 112 | ch_scores.append(calinski_harabaz_score(X, Y)) 113 | 114 | fig, ax = plt.subplots(figsize=(15, 8)) 115 | 116 | ax.plot(nb_clusters, ch_scores) 117 | ax.set_xlabel('Number of clusters') 118 | ax.set_ylabel('Calinski-Harabasz scores') 119 | ax.grid() 120 | 121 | plt.show() 122 | 123 | # Analyze the cluster instability 124 | nb_noisy_datasets = 10 125 | 126 | X_noise = [] 127 | 128 | for _ in range(nb_noisy_datasets): 129 | Xn = np.ndarray(shape=(1000, 2)) 130 | 131 | for i, x in enumerate(X): 132 | if np.random.uniform(0, 1) < 0.25: 133 | Xn[i] = X[i] + np.random.uniform(-2.0, 2.0) 134 | else: 135 | Xn[i] = X[i] 136 | 137 | X_noise.append(Xn) 138 | 139 | instabilities = [] 140 | 141 | for n in nb_clusters: 142 | Yn = [] 143 | 144 | for Xn in X_noise: 145 | km = KMeans(n_clusters=n) 146 | Yn.append(km.fit_predict(Xn)) 147 | 148 | distances = [] 149 | 150 | for i in range(len(Yn) - 1): 151 | for j in range(i, len(Yn)): 152 | d = pairwise_distances(Yn[i].reshape(1, -1), Yn[j].reshape(1, -1), 'hamming') 153 | distances.append(d[0, 0]) 154 | 155 | instability = (2.0 * np.sum(distances)) / float(nb_noisy_datasets ** 2) 156 | instabilities.append(instability) 157 | 158 | fig, ax = plt.subplots(figsize=(15, 8)) 159 | 160 | ax.plot(nb_clusters, instabilities) 161 | ax.set_xlabel('Number of clusters') 162 | ax.set_ylabel('Cluster instability') 163 | ax.grid() 164 | 165 | plt.show() 166 | 167 | # Analyze the homogeneity, completeness, and Adjusted Rand score against the ground-truth labels 168 | km = KMeans(n_clusters=3) 169 | Yp = km.fit_predict(X) 170 | 171 | print('Homogeneity: %.3f' % homogeneity_score(Y_true, Yp)) 172 | print('Completeness: %.3f' % completeness_score(Y_true, Yp)) 173 | print('Adjusted Rand score: %.3f' % adjusted_rand_score(Y_true, Yp)) 174 | 175 | 176 | -------------------------------------------------------------------------------- /Chapter07/s3vm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from scipy.optimize import minimize 7 | 8 | from sklearn.datasets import make_classification 9 | 10 | # Set random seed for reproducibility 11 | np.random.seed(1000) 12 | 13 | 14 | nb_samples = 200 15 | nb_unlabeled = 150 16 | 17 | 18 | # Create dataset 19 | X, Y = make_classification(n_samples=nb_samples, n_features=2, n_redundant=0, random_state=1000) 20 | Y[Y == 0] = -1 21 | Y[nb_samples - nb_unlabeled:nb_samples] = 0 22 | 23 | 24 | # Initialize S3VM variables 25 | w = np.random.uniform(-0.1, 0.1, size=X.shape[1]) 26 | eta = np.random.uniform(0.0, 0.1, size=nb_samples - nb_unlabeled) 27 | xi = np.random.uniform(0.0, 0.1, size=nb_unlabeled) 28 | zi = np.random.uniform(0.0, 0.1, size=nb_unlabeled) 29 | b = np.random.uniform(-0.1, 0.1, size=1) 30 | C = 0.5 31 | 32 | 33 | # Stack all variables into a single vector 34 | theta0 = np.hstack((w, eta, xi, zi, b)) 35 | 36 | 37 | # Vectorize the min() function 38 | vmin = np.vectorize(lambda x1, x2: x1 if x1 <= x2 else x2) 39 | 40 | 41 | def svm_target(theta, Xd, Yd): 42 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 43 | 44 | s_eta = np.sum(theta[2:2 + nb_samples - nb_unlabeled]) 45 | s_min_xi_zi = np.sum(vmin(theta[2 + nb_samples - nb_unlabeled:2 + nb_samples], 46 | theta[2 + nb_samples:2 + nb_samples +
nb_unlabeled])) 47 | 48 | return C * (s_eta + s_min_xi_zi) + 0.5 * np.dot(wt.T, wt) 49 | 50 | 51 | def labeled_constraint(theta, Xd, Yd, idx): 52 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 53 | 54 | c = Yd[idx] * (np.dot(Xd[idx], wt) + theta[-1]) + \ 55 | theta[2:2 + nb_samples - nb_unlabeled][idx] - 1.0 56 | 57 | return (c >= 0)[0] 58 | 59 | 60 | def unlabeled_constraint_1(theta, Xd, idx): 61 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 62 | 63 | c = np.dot(Xd[idx], wt) - theta[-1] + \ 64 | theta[2 + nb_samples - nb_unlabeled:2 + nb_samples][idx - nb_samples + nb_unlabeled] - 1.0 65 | 66 | return (c >= 0)[0] 67 | 68 | 69 | def unlabeled_constraint_2(theta, Xd, idx): 70 | wt = theta[0:2].reshape((Xd.shape[1], 1)) 71 | 72 | c = -(np.dot(Xd[idx], wt) - theta[-1]) + \ 73 | theta[2 + nb_samples:2 + nb_samples + nb_unlabeled][idx - nb_samples + nb_unlabeled] - 1.0 74 | 75 | return (c >= 0)[0] 76 | 77 | 78 | def eta_constraint(theta, idx): 79 | return theta[2:2 + nb_samples - nb_unlabeled][idx] >= 0 80 | 81 | 82 | def xi_constraint(theta, idx): 83 | return theta[2 + nb_samples - nb_unlabeled:2 + nb_samples][idx - nb_samples + nb_unlabeled] >= 0 84 | 85 | 86 | def zi_constraint(theta, idx): 87 | return theta[2 + nb_samples:2 + nb_samples+nb_unlabeled ][idx - nb_samples + nb_unlabeled] >= 0 88 | 89 | 90 | if __name__ == '__main__': 91 | # Show the initial dataset 92 | fig, ax = plt.subplots(figsize=(12, 9)) 93 | 94 | ax.scatter(X[Y == -1, 0], X[Y == -1, 1], color='b', marker='s', s=80, label='Class +1') 95 | ax.scatter(X[Y == 1, 0], X[Y == 1, 1], color='g', marker='o', s=80, label='Class -1') 96 | ax.scatter(X[Y == 0, 0], X[Y == 0, 1], color='r', marker='x', s=80, label='Unlabeled') 97 | 98 | ax.set_xlabel(r'$x_0$') 99 | ax.set_ylabel(r'$x_1$') 100 | ax.legend() 101 | ax.grid() 102 | 103 | plt.show() 104 | 105 | # Setup all the constraints 106 | svm_constraints = [] 107 | 108 | for i in range(nb_samples - nb_unlabeled): 109 | svm_constraints.append({ 110 | 'type': 'ineq', 111 | 'fun': labeled_constraint, 112 | 'args': (X, Y, i) 113 | }) 114 | svm_constraints.append({ 115 | 'type': 'ineq', 116 | 'fun': eta_constraint, 117 | 'args': (i,) 118 | }) 119 | 120 | for i in range(nb_samples - nb_unlabeled, nb_samples): 121 | svm_constraints.append({ 122 | 'type': 'ineq', 123 | 'fun': unlabeled_constraint_1, 124 | 'args': (X, i) 125 | }) 126 | svm_constraints.append({ 127 | 'type': 'ineq', 128 | 'fun': unlabeled_constraint_2, 129 | 'args': (X, i) 130 | }) 131 | svm_constraints.append({ 132 | 'type': 'ineq', 133 | 'fun': xi_constraint, 134 | 'args': (i,) 135 | }) 136 | svm_constraints.append({ 137 | 'type': 'ineq', 138 | 'fun': zi_constraint, 139 | 'args': (i,) 140 | }) 141 | 142 | # Optimize the objective 143 | print('Optimizing...') 144 | result = minimize(fun=svm_target, 145 | x0=theta0, 146 | constraints=svm_constraints, 147 | args=(X, Y), 148 | method='SLSQP', 149 | tol=0.0001, 150 | options={'maxiter': 1000}) 151 | 152 | # Extract the last parameters 153 | theta_end = result['x'] 154 | w = theta_end[0:2] 155 | b = theta_end[-1] 156 | 157 | Xu = X[nb_samples - nb_unlabeled:nb_samples] 158 | yu = -np.sign(np.dot(Xu, w) + b) 159 | 160 | # Show the final plots 161 | fig, ax = plt.subplots(1, 2, figsize=(18, 8)) 162 | 163 | ax[0].scatter(X[Y == -1, 0], X[Y == -1, 1], color='b', marker='s', s=80, label='Class +1') 164 | ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1], color='g', marker='o', s=80, label='Class -1') 165 | ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1], color='r', marker='x', s=80, label='Unlabeled') 166 | 
167 | ax[0].set_xlabel(r'$x_0$') 168 | ax[0].set_ylabel(r'$x_1$') 169 | ax[0].legend() 170 | ax[0].grid() 171 | 172 | ax[1].scatter(X[Y == -1, 0], X[Y == -1, 1], color='b', marker='s', s=80, label='Class +1') 173 | ax[1].scatter(X[Y == 1, 0], X[Y == 1, 1], color='g', marker='o', s=80, label='Class -1') 174 | 175 | ax[1].scatter(Xu[yu == -1, 0], Xu[yu == -1, 1], color='b', marker='s', s=80) 176 | ax[1].scatter(Xu[yu == 1, 0], Xu[yu == 1, 1], color='g', marker='o', s=80) 177 | 178 | ax[1].set_xlabel(r'$x_0$') 179 | ax[1].set_ylabel(r'$x_1$') 180 | ax[1].legend() 181 | ax[1].grid() 182 | 183 | plt.show() --------------------------------------------------------------------------------