├── .DS_Store ├── .gitattributes ├── .gitignore ├── Chapter02 ├── .DS_Store ├── exploring_nlp.py ├── getting_exploring_newsgroups.py ├── tSNE.py └── thinking_about_features.py ├── Chapter03 ├── .DS_Store ├── kmeans_elbow.py ├── kmeans_from_scratch.py ├── kmeans_newsgroups.py ├── kmeans_sklearn.py ├── lda_newsgroups.py └── nmf_newsgroups.py ├── Chapter04 ├── .DS_Store └── email_spam.py ├── Chapter05 ├── CTG.xls ├── ctg.py ├── plot_rbf_kernels.py ├── svm_tf.py └── topic_categorization.py ├── Chapter06 ├── avazu_ctr.py ├── avazu_ctr_tf.py └── decision_tree_submit.py ├── Chapter07 ├── encoding.py ├── logistic_function.py ├── logistic_regression_from_scratch.py ├── logistic_regression_tf.py ├── random_forest_feature_selection.py └── scikit_logistic_regression.py ├── Chapter08 ├── ctr.py ├── ctr_hashing.py └── ctr_interaction.py ├── Chapter09 ├── 19880101_20161231.csv ├── 20051201_20051210.csv ├── decision_tree_regression.py ├── get_dji_data.py ├── linear_regression.py ├── neural_network.py ├── regression_evaluation.py ├── stock_prediction.py └── svr.py ├── Chapter10 ├── dimensionality_reduction.py ├── feature_selection.py ├── generic_feature_engineering.py ├── imputation.py ├── save_reuse_model_tf.py ├── save_reuse_monitor_model.py └── word_embedding.py ├── LICENSE └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows thumbnail cache files 2 | Thumbs.db 3 | ehthumbs.db 4 | ehthumbs_vista.db 5 | 6 | # Folder config file 7 | Desktop.ini 8 | 9 | # Recycle Bin used on file shares 10 | $RECYCLE.BIN/ 11 | 12 | # Windows Installer files 13 | *.cab 14 | *.msi 15 | *.msm 16 | *.msp 17 | 18 | # Windows shortcuts 19 | *.lnk 20 | 21 | # ========================= 22 | # Operating System Files 23 | # ========================= 24 | -------------------------------------------------------------------------------- /Chapter02/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter02/.DS_Store -------------------------------------------------------------------------------- /Chapter02/exploring_nlp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from nltk.corpus import names 8 | 9 | 
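# --- Illustrative note (not part of the original repository) ---
# The NLTK corpora and models used in this chapter must be downloaded once
# before the script will run. A minimal, hedged setup sketch (standard NLTK
# resource names, kept as comments so the original script is unchanged):
#
#   import nltk
#   nltk.download('names')                        # the names corpus used here
#   nltk.download('punkt')                        # tokenizers behind word_tokenize/sent_tokenize
#   nltk.download('averaged_perceptron_tagger')   # tagger behind nltk.pos_tag
#   nltk.download('wordnet')                      # lexicon behind WordNetLemmatizer
#
# The spaCy model is installed separately from the command line:
#   python -m spacy download en_core_web_sm
# ----------------------------------------------------------------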
print(names.words()[:10]) 10 | 11 | print(len(names.words())) 12 | 13 | 14 | 15 | from nltk.tokenize import word_tokenize 16 | sent = '''I am reading a book. 17 | It is Python Machine Learning By Example, 18 | 2nd edition.''' 19 | 20 | print(word_tokenize(sent)) 21 | 22 | 23 | sent2 = 'I have been to U.K. and U.S.A.' 24 | print(word_tokenize(sent2)) 25 | 26 | 27 | 28 | import spacy 29 | 30 | nlp = spacy.load('en_core_web_sm') 31 | tokens2 = nlp(sent2) 32 | 33 | print([token.text for token in tokens2]) 34 | 35 | 36 | from nltk.tokenize import sent_tokenize 37 | print(sent_tokenize(sent)) 38 | 39 | 40 | import nltk 41 | tokens = word_tokenize(sent) 42 | print(nltk.pos_tag(tokens)) 43 | nltk.help.upenn_tagset('PRP') 44 | nltk.help.upenn_tagset('VBP') 45 | 46 | 47 | 48 | print([(token.text, token.pos_) for token in tokens2]) 49 | 50 | 51 | 52 | tokens3 = nlp('The book written by Hayden Liu in 2018 was sold at $30 in America') 53 | print([(token_ent.text, token_ent.label_) for token_ent in tokens3.ents]) 54 | 55 | 56 | 57 | from nltk.stem.porter import PorterStemmer 58 | porter_stemmer = PorterStemmer() 59 | porter_stemmer.stem('machines') 60 | porter_stemmer.stem('learning') 61 | 62 | 63 | from nltk.stem import WordNetLemmatizer 64 | lemmatizer = WordNetLemmatizer() 65 | lemmatizer.lemmatize('machines') -------------------------------------------------------------------------------- /Chapter02/getting_exploring_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn.datasets import fetch_20newsgroups 9 | 10 | 11 | groups = fetch_20newsgroups() 12 | groups.keys() 13 | groups['target_names'] 14 | groups.target 15 | 16 | 17 | import numpy as np 18 | np.unique(groups.target) 19 | 20 | 21 | 22 | import seaborn as sns 23 | sns.distplot(groups.target) 24 | import matplotlib.pyplot as plt 25 | plt.show() 26 | 27 | 28 | groups.data[0] 29 | groups.target[0] 30 | groups.target_names[groups.target[0]] 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /Chapter02/tSNE.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | 10 | 11 | categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space'] 12 | 13 | groups_3 = fetch_20newsgroups(categories=categories_3) 14 | 15 | 16 | 17 | def is_letter_only(word): 18 | for char in word: 19 | if not char.isalpha(): 20 | return False 21 | return True 22 | 23 | 24 | 25 | from nltk.corpus import names 26 | all_names = set(names.words()) 27 | 28 | 29 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 30 | 31 | 32 | from nltk.stem import WordNetLemmatizer 33 | lemmatizer = WordNetLemmatizer() 34 | 35 | data_cleaned = [] 36 | 37 | for doc in groups_3.data: 38 | doc = doc.lower() 39 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 40 | data_cleaned.append(doc_cleaned) 41 | 42 | 43 | 
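# --- Illustrative note (not part of the original script) ---
# The fit_transform call below returns a sparse document-term matrix with up
# to 500 columns (max_features=500); the script densifies it with .toarray()
# before passing it to tsne_model.fit_transform further down. A hedged
# inspection sketch, using only names defined in this script:
#
#   print(data_cleaned_count_3.shape)                # (n_documents, 500)
#   print(count_vector_sw.get_feature_names()[:10])  # first few vocabulary terms
# -----------------------------------------------------------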
data_cleaned_count_3 = count_vector_sw.fit_transform(data_cleaned) 44 | 45 | 46 | 47 | 48 | from sklearn.manifold import TSNE 49 | 50 | 51 | tsne_model = TSNE(n_components=2, perplexity=40, random_state=42, learning_rate=500) 52 | 53 | 54 | data_tsne = tsne_model.fit_transform(data_cleaned_count_3.toarray()) 55 | 56 | 57 | import matplotlib.pyplot as plt 58 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_3.target) 59 | 60 | plt.show() 61 | 62 | 63 | 64 | 65 | 66 | 67 | categories_5 = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 68 | 'comp.windows.x'] 69 | groups_5 = fetch_20newsgroups(categories=categories_5) 70 | 71 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 72 | 73 | data_cleaned = [] 74 | 75 | for doc in groups_5.data: 76 | doc = doc.lower() 77 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 78 | data_cleaned.append(doc_cleaned) 79 | 80 | data_cleaned_count_5 = count_vector_sw.fit_transform(data_cleaned) 81 | 82 | data_tsne = tsne_model.fit_transform(data_cleaned_count_5.toarray()) 83 | 84 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_5.target) 85 | 86 | plt.show() -------------------------------------------------------------------------------- /Chapter02/thinking_about_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | 10 | groups = fetch_20newsgroups() 11 | 12 | 13 | 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | count_vector = CountVectorizer(stop_words="english",max_features=500) 17 | data_count = count_vector.fit_transform(groups.data) 18 | 19 | print(count_vector.get_feature_names()) 20 | 21 | data_count.toarray()[0] 22 | 23 | 24 | 25 | def is_letter_only(word): 26 | for char in word: 27 | if not char.isalpha(): 28 | return False 29 | return True 30 | 31 | data_cleaned = [] 32 | for doc in groups.data: 33 | doc_cleaned = ' '.join(word for word in doc.split() if is_letter_only(word) ) 34 | data_cleaned.append(doc_cleaned) 35 | 36 | 37 | from sklearn.feature_extraction import stop_words 38 | print(stop_words.ENGLISH_STOP_WORDS) 39 | 40 | 41 | from nltk.corpus import names 42 | all_names = set(names.words()) 43 | 44 | 45 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 46 | 47 | 48 | from nltk.stem import WordNetLemmatizer 49 | lemmatizer = WordNetLemmatizer() 50 | 51 | data_cleaned = [] 52 | 53 | for doc in groups.data: 54 | doc = doc.lower() 55 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 56 | data_cleaned.append(doc_cleaned) 57 | 58 | 59 | data_cleaned_count = count_vector_sw.fit_transform(data_cleaned) 60 | 61 | print(count_vector_sw.get_feature_names()) 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Chapter03/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter03/.DS_Store 
-------------------------------------------------------------------------------- /Chapter03/kmeans_elbow.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn import datasets 9 | from sklearn.cluster import KMeans 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | 13 | iris = datasets.load_iris() 14 | X = iris.data 15 | y = iris.target 16 | 17 | 18 | k_list = list(range(1, 7)) 19 | sse_list = [0] * len(k_list) 20 | 21 | for k_ind, k in enumerate(k_list): 22 | kmeans = KMeans(n_clusters=k, random_state=42) 23 | kmeans.fit(X) 24 | clusters = kmeans.labels_ 25 | centroids = kmeans.cluster_centers_ 26 | 27 | sse = 0 28 | for i in range(k): 29 | cluster_i = np.where(clusters == i) 30 | 31 | sse += np.linalg.norm(X[cluster_i] - centroids[i]) 32 | 33 | print('k={}, SSE={}'.format(k, sse)) 34 | sse_list[k_ind] = sse 35 | 36 | 37 | 38 | plt.plot(k_list, sse_list) 39 | plt.show() 40 | -------------------------------------------------------------------------------- /Chapter03/kmeans_from_scratch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn import datasets 9 | iris = datasets.load_iris() 10 | X = iris.data[:, 2:4] 11 | y = iris.target 12 | 13 | import numpy as np 14 | from matplotlib import pyplot as plt 15 | y_0 = np.where(y==0) 16 | plt.scatter(X[y_0, 0], X[y_0, 1]) 17 | y_1 = np.where(y==1) 18 | plt.scatter(X[y_1, 0], X[y_1, 1]) 19 | y_2 = np.where(y==2) 20 | plt.scatter(X[y_2, 0], X[y_2, 1]) 21 | plt.show() 22 | 23 | 24 | k = 3 25 | random_index = np.random.choice(range(len(X)), k) 26 | centroids = X[random_index] 27 | 28 | 29 | def visualize_centroids(X, centroids): 30 | plt.scatter(X[:, 0], X[:, 1]) 31 | plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505') 32 | plt.show() 33 | 34 | 35 | visualize_centroids(X, centroids) 36 | 37 | 38 | def dist(a, b): 39 | return np.linalg.norm(a - b, axis=1) 40 | 41 | def assign_cluster(x, centroids): 42 | distances = dist(x, centroids) 43 | cluster = np.argmin(distances) 44 | return cluster 45 | 46 | def update_centroids(X, centroids, clusters): 47 | for i in range(k): 48 | cluster_i = np.where(clusters == i) 49 | centroids[i] = np.mean(X[cluster_i], axis=0) 50 | 51 | 52 | clusters = np.zeros(len(X)) 53 | 54 | tol = 0.0001 55 | max_iter = 100 56 | 57 | iter = 0 58 | centroids_diff = 100000 59 | 60 | from copy import deepcopy 61 | while iter < max_iter and centroids_diff > tol: 62 | for i in range(len(X)): 63 | clusters[i] = assign_cluster(X[i], centroids) 64 | centroids_prev = deepcopy(centroids) 65 | update_centroids(X, centroids, clusters) 66 | iter += 1 67 | centroids_diff = np.linalg.norm(centroids - centroids_prev) 68 | print('Iteration:', str(iter)) 69 | print('Centroids:\n', centroids) 70 | print('Centroids move: {:5.4f}'.format(centroids_diff)) 71 | visualize_centroids(X, centroids) 72 | 73 | 74 | for i in range(k): 75 | cluster_i = np.where(clusters == i) 76 | plt.scatter(X[cluster_i, 0], X[cluster_i, 1]) 77 | plt.scatter(centroids[:, 0], 
centroids[:, 1], marker='*', s=200, c='#050505') 78 | plt.show() 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /Chapter03/kmeans_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | categories = [ 10 | 'alt.atheism', 11 | 'talk.religion.misc', 12 | 'comp.graphics', 13 | 'sci.space', 14 | ] 15 | 16 | 17 | groups = fetch_20newsgroups(subset='all', categories=categories) 18 | 19 | 20 | labels = groups.target 21 | label_names = groups.target_names 22 | 23 | 24 | 25 | def is_letter_only(word): 26 | for char in word: 27 | if not char.isalpha(): 28 | return False 29 | return True 30 | 31 | 32 | 33 | from nltk.corpus import names 34 | all_names = set(names.words()) 35 | 36 | 37 | 38 | 39 | from nltk.stem import WordNetLemmatizer 40 | lemmatizer = WordNetLemmatizer() 41 | 42 | data_cleaned = [] 43 | 44 | for doc in groups.data: 45 | doc = doc.lower() 46 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 47 | data_cleaned.append(doc_cleaned) 48 | 49 | 50 | from sklearn.feature_extraction.text import CountVectorizer 51 | count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 52 | 53 | from sklearn.feature_extraction.text import TfidfVectorizer 54 | tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2) 55 | 56 | data = tfidf_vector.fit_transform(data_cleaned) 57 | 58 | 59 | from sklearn.cluster import KMeans 60 | 61 | k = 4 62 | kmeans = KMeans(n_clusters=k, random_state=42) 63 | 64 | kmeans.fit(data) 65 | 66 | clusters = kmeans.labels_ 67 | 68 | 69 | 70 | from collections import Counter 71 | print(Counter(clusters)) 72 | 73 | import numpy as np 74 | cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)} 75 | 76 | terms = tfidf_vector.get_feature_names() 77 | centroids = kmeans.cluster_centers_ 78 | for cluster, index_list in cluster_label.items(): 79 | counter = Counter(cluster_label[cluster]) 80 | print('cluster_{}: {} samples'.format(cluster, len(index_list))) 81 | for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True): 82 | print('{}: {} samples'.format(label_names[label_index], count)) 83 | print('Top 10 terms:') 84 | for ind in centroids[cluster].argsort()[-10:]: 85 | print(' %s' % terms[ind], end="") 86 | print() 87 | -------------------------------------------------------------------------------- /Chapter03/kmeans_sklearn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn import datasets 9 | iris = datasets.load_iris() 10 | X = iris.data[:, 2:4] 11 | y = iris.target 12 | 13 | import numpy as np 14 | from matplotlib import pyplot as plt 15 | 16 | k = 3 17 | from sklearn.cluster import KMeans 18 | kmeans_sk = KMeans(n_clusters=3, random_state=42) 19 | kmeans_sk.fit(X) 20 | clusters_sk = kmeans_sk.labels_ 21 | centroids_sk = 
kmeans_sk.cluster_centers_ 22 | 23 | for i in range(k): 24 | cluster_i = np.where(clusters_sk == i) 25 | plt.scatter(X[cluster_i, 0], X[cluster_i, 1]) 26 | plt.scatter(centroids_sk[:, 0], centroids_sk[:, 1], marker='*', s=200, c='#050505') 27 | plt.show() 28 | -------------------------------------------------------------------------------- /Chapter03/lda_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | categories = [ 10 | 'alt.atheism', 11 | 'talk.religion.misc', 12 | 'comp.graphics', 13 | 'sci.space', 14 | ] 15 | 16 | 17 | groups = fetch_20newsgroups(subset='all', categories=categories) 18 | 19 | 20 | 21 | def is_letter_only(word): 22 | for char in word: 23 | if not char.isalpha(): 24 | return False 25 | return True 26 | 27 | 28 | 29 | from nltk.corpus import names 30 | all_names = set(names.words()) 31 | 32 | 33 | 34 | from nltk.stem import WordNetLemmatizer 35 | lemmatizer = WordNetLemmatizer() 36 | 37 | data_cleaned = [] 38 | 39 | for doc in groups.data: 40 | doc = doc.lower() 41 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 42 | data_cleaned.append(doc_cleaned) 43 | 44 | 45 | 46 | from sklearn.feature_extraction.text import CountVectorizer 47 | count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 48 | 49 | 50 | data = count_vector.fit_transform(data_cleaned) 51 | 52 | 53 | from sklearn.decomposition import LatentDirichletAllocation 54 | 55 | t = 20 56 | lda = LatentDirichletAllocation(n_components=t, learning_method='batch',random_state=42) 57 | 58 | lda.fit(data) 59 | 60 | print(lda.components_) 61 | 62 | terms = count_vector.get_feature_names() 63 | 64 | 65 | for topic_idx, topic in enumerate(lda.components_): 66 | print("Topic {}:" .format(topic_idx)) 67 | print(" ".join([terms[i] for i in topic.argsort()[-10:]])) 68 | 69 | 70 | -------------------------------------------------------------------------------- /Chapter03/nmf_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | categories = [ 10 | 'alt.atheism', 11 | 'talk.religion.misc', 12 | 'comp.graphics', 13 | 'sci.space', 14 | ] 15 | 16 | 17 | groups = fetch_20newsgroups(subset='all', categories=categories) 18 | 19 | 20 | 21 | def is_letter_only(word): 22 | for char in word: 23 | if not char.isalpha(): 24 | return False 25 | return True 26 | 27 | 28 | 29 | from nltk.corpus import names 30 | all_names = set(names.words()) 31 | 32 | 33 | 34 | from nltk.stem import WordNetLemmatizer 35 | lemmatizer = WordNetLemmatizer() 36 | 37 | data_cleaned = [] 38 | 39 | for doc in groups.data: 40 | doc = doc.lower() 41 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 42 | data_cleaned.append(doc_cleaned) 43 | 44 | 45 | from sklearn.feature_extraction.text import CountVectorizer 46 | 
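# --- Illustrative note (not part of the original script) ---
# NMF (applied further down) approximately factorizes the non-negative
# document-term count matrix V of shape (n_documents, n_terms) into
#     V ~= W . H
# with W of shape (n_documents, t) and H of shape (t, n_terms), t = 20 here.
# scikit-learn exposes H as nmf.components_, so topic.argsort()[-10:] in the
# printing loop below selects the 10 highest-weighted terms of each topic.
# -----------------------------------------------------------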
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 47 | 48 | data = count_vector.fit_transform(data_cleaned) 49 | 50 | 51 | from sklearn.decomposition import NMF 52 | 53 | t = 20 54 | nmf = NMF(n_components=t, random_state=42) 55 | 56 | nmf.fit(data) 57 | 58 | print(nmf.components_) 59 | 60 | terms = count_vector.get_feature_names() 61 | 62 | 63 | for topic_idx, topic in enumerate(nmf.components_): 64 | print("Topic {}:" .format(topic_idx)) 65 | print(" ".join([terms[i] for i in topic.argsort()[-10:]])) 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter04/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter04/.DS_Store -------------------------------------------------------------------------------- /Chapter04/email_spam.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 4: Detecting Spam Email with Naive Bayes 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | # -*- coding: utf-8 -*- 8 | 9 | import glob 10 | import os 11 | import numpy as np 12 | 13 | 14 | file_path = 'enron1/ham/0007.1999-12-14.farmer.ham.txt' 15 | with open(file_path, 'r') as infile: 16 | ham_sample = infile.read() 17 | 18 | print(ham_sample) 19 | 20 | file_path = 'enron1/spam/0058.2003-12-21.GP.spam.txt' 21 | with open(file_path, 'r') as infile: 22 | spam_sample = infile.read() 23 | 24 | print(spam_sample) 25 | 26 | 27 | emails, labels = [], [] 28 | 29 | file_path = 'enron1/spam/' 30 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 31 | with open(filename, 'r', encoding="ISO-8859-1") as infile: 32 | emails.append(infile.read()) 33 | labels.append(1) 34 | 35 | file_path = 'enron1/ham/' 36 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 37 | with open(filename, 'r', encoding="ISO-8859-1") as infile: 38 | emails.append(infile.read()) 39 | labels.append(0) 40 | 41 | print(len(labels)) 42 | 43 | print(len(emails)) 44 | 45 | 46 | 47 | 48 | def is_letter_only(word): 49 | return word.isalpha() 50 | 51 | from nltk.corpus import names 52 | all_names = set(names.words()) 53 | 54 | from nltk.stem import WordNetLemmatizer 55 | lemmatizer = WordNetLemmatizer() 56 | 57 | def clean_text(docs): 58 | docs_cleaned = [] 59 | for doc in docs: 60 | doc = doc.lower() 61 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 62 | docs_cleaned.append(doc_cleaned) 63 | return docs_cleaned 64 | 65 | emails_cleaned = clean_text(emails) 66 | 67 | from sklearn.feature_extraction.text import CountVectorizer 68 | cv = CountVectorizer(stop_words="english", max_features=1000, max_df=0.5, min_df=2) 69 | 70 | docs_cv = cv.fit_transform(emails_cleaned) 71 | print(docs_cv[0]) 72 | 73 | terms = cv.get_feature_names() 74 | print(terms[932]) 75 | print(terms[968]) 76 | print(terms[715]) 77 | 78 | 79 | 80 | def get_label_index(labels): 81 | from collections import defaultdict 82 | label_index = defaultdict(list) 83 | for index, label in enumerate(labels): 84 | label_index[label].append(index) 85 | return label_index 86 | 87 | 88 | def get_prior(label_index): 89 | """ 90 | Compute prior based on training samples 91 | @param label_index: grouped sample 
indices by class 92 | @return: dictionary, with class label as key, corresponding prior as the value 93 | """ 94 | prior = {label: len(index) for label, index in label_index.items()} 95 | total_count = sum(prior.values()) 96 | for label in prior: 97 | prior[label] /= float(total_count) 98 | return prior 99 | 100 | 101 | label_index = get_label_index(labels) 102 | prior = get_prior(label_index) 103 | print('Prior:', prior) 104 | 105 | 106 | def get_likelihood(term_matrix, label_index, smoothing=0): 107 | """ 108 | Compute likelihood based on training samples 109 | @param term_matrix: sparse matrix of the term frequency features 110 | @param label_index: grouped sample indices by class 111 | @param smoothing: integer, additive Laplace smoothing parameter 112 | @return: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value 113 | """ 114 | likelihood = {} 115 | for label, index in label_index.items(): 116 | likelihood[label] = term_matrix[index, :].sum(axis=0) + smoothing 117 | likelihood[label] = np.asarray(likelihood[label])[0] 118 | total_count = likelihood[label].sum() 119 | likelihood[label] = likelihood[label] / float(total_count) 120 | return likelihood 121 | 122 | smoothing = 1 123 | likelihood = get_likelihood(docs_cv, label_index, smoothing) 124 | 125 | print(len(likelihood[0])) 126 | 127 | print(likelihood[0][:5]) 128 | print(likelihood[1][:5]) 129 | 130 | 131 | 132 | def get_posterior(term_matrix, prior, likelihood): 133 | """ 134 | Compute posterior of testing samples, based on prior and likelihood 135 | @param term_matrix: sparse matrix of the term frequency features 136 | @param prior: dictionary, with class label as key, corresponding prior as the value 137 | @param likelihood: dictionary, with class label as key, corresponding conditional probability vector as value 138 | @return: dictionary, with class label as key, corresponding posterior as value 139 | """ 140 | num_docs = term_matrix.shape[0] 141 | posteriors = [] 142 | for i in range(num_docs): 143 | # posterior is proportional to prior * likelihood 144 | # = exp(log(prior * likelihood)) 145 | # = exp(log(prior) + log(likelihood)) 146 | posterior = {key: np.log(prior_label) for key, prior_label in prior.items()} 147 | for label, likelihood_label in likelihood.items(): 148 | term_document_vector = term_matrix.getrow(i) 149 | counts = term_document_vector.data 150 | indices = term_document_vector.indices 151 | for count, index in zip(counts, indices): 152 | posterior[label] += np.log(likelihood_label[index]) * count 153 | # exp(-1000):exp(-999) will cause zero division error, 154 | # however it equates to exp(0):exp(1) 155 | min_log_posterior = min(posterior.values()) 156 | for label in posterior: 157 | try: 158 | posterior[label] = np.exp(posterior[label] - min_log_posterior) 159 | except: 160 | posterior[label] = float('inf') 161 | # normalize so that all sums up to 1 162 | sum_posterior = sum(posterior.values()) 163 | for label in posterior: 164 | if posterior[label] == float('inf'): 165 | posterior[label] = 1.0 166 | else: 167 | posterior[label] /= sum_posterior 168 | posteriors.append(posterior.copy()) 169 | return posteriors 170 | 171 | 172 | 173 | emails_test = [ 174 | '''Subject: flat screens 175 | hello , 176 | please call or contact regarding the other flat screens requested . 177 | trisha tlapek - eb 3132 b 178 | michael sergeev - eb 3132 a 179 | also the sun blocker that was taken away from eb 3131 a . 180 | trisha should two monitors also michael . 
181 | thanks 182 | kevin moore''', 183 | '''Subject: let ' s stop the mlm insanity ! 184 | still believe you can earn $ 100 , 000 fast in mlm ? get real ! 185 | get emm , a brand new system that replaces mlm with something that works ! 186 | start earning 1 , 000 ' s now ! up to $ 10 , 000 per week doing simple online tasks . 187 | free info - breakfree @ luxmail . com - type " send emm info " in the subject box . 188 | this message is sent in compliance of the proposed bill section 301 . per section 301 , paragraph ( a ) ( 2 ) ( c ) of s . 1618 . further transmission to you by the sender of this e - mail may be stopped at no cost to you by sending a reply to : " email address " with the word remove in the subject line . 189 | ''', 190 | ] 191 | 192 | emails_cleaned_test = clean_text(emails_test) 193 | term_docs_test = cv.transform(emails_cleaned_test) 194 | 195 | 196 | posterior = get_posterior(term_docs_test, prior, likelihood) 197 | print(posterior) 198 | 199 | 200 | 201 | from sklearn.model_selection import train_test_split 202 | X_train, X_test, Y_train, Y_test = train_test_split(emails_cleaned, labels, test_size=0.33, random_state=42) 203 | 204 | print(len(X_train), len(Y_train)) 205 | len(X_test), len(Y_test) 206 | 207 | term_docs_train = cv.fit_transform(X_train) 208 | 209 | label_index = get_label_index(Y_train) 210 | prior = get_prior(label_index) 211 | likelihood = get_likelihood(term_docs_train, label_index, smoothing) 212 | 213 | term_docs_test = cv.transform(X_test) 214 | 215 | 216 | posterior = get_posterior(term_docs_test, prior, likelihood) 217 | 218 | correct = 0.0 219 | for pred, actual in zip(posterior, Y_test): 220 | if actual == 1: 221 | if pred[1] >= 0.5: 222 | correct += 1 223 | elif pred[0] > 0.5: 224 | correct += 1 225 | 226 | print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(Y_test), correct/len(Y_test)*100)) 227 | 228 | 229 | 230 | 231 | from sklearn.naive_bayes import MultinomialNB 232 | clf = MultinomialNB(alpha=1.0, fit_prior=True) 233 | clf.fit(term_docs_train, Y_train) 234 | prediction_prob = clf.predict_proba(term_docs_test) 235 | 236 | print(prediction_prob[0:10]) 237 | 238 | prediction = clf.predict(term_docs_test) 239 | 240 | print(prediction[:10]) 241 | 242 | accuracy = clf.score(term_docs_test, Y_test) 243 | 244 | print('The accuracy using MultinomialNB is: {0:.1f}%'.format(accuracy*100)) 245 | 246 | 247 | 248 | 249 | from sklearn.metrics import confusion_matrix 250 | print(confusion_matrix(Y_test, prediction, labels=[0, 1])) 251 | 252 | from sklearn.metrics import precision_score, recall_score, f1_score 253 | precision_score(Y_test, prediction, pos_label=1) 254 | recall_score(Y_test, prediction, pos_label=1) 255 | f1_score(Y_test, prediction, pos_label=1) 256 | 257 | f1_score(Y_test, prediction, pos_label=0) 258 | 259 | from sklearn.metrics import classification_report 260 | report = classification_report(Y_test, prediction) 261 | print(report) 262 | 263 | 264 | 265 | 266 | pos_prob = prediction_prob[:, 1] 267 | thresholds = np.arange(0.0, 1.2, 0.1) 268 | true_pos, false_pos = [0]*len(thresholds), [0]*len(thresholds) 269 | for pred, y in zip(pos_prob, Y_test): 270 | for i, threshold in enumerate(thresholds): 271 | if pred >= threshold: 272 | if y == 1: 273 | true_pos[i] += 1 274 | else: 275 | false_pos[i] += 1 276 | else: 277 | break 278 | 279 | true_pos_rate = [tp / 516.0 for tp in true_pos] 280 | false_pos_rate = [fp / 1191.0 for fp in false_pos] 281 | 282 | 283 | import matplotlib.pyplot as plt 284 | plt.figure() 285 | lw = 2 286 
| plt.plot(false_pos_rate, true_pos_rate, color='darkorange', 287 | lw=lw) 288 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 289 | plt.xlim([0.0, 1.0]) 290 | plt.ylim([0.0, 1.05]) 291 | plt.xlabel('False Positive Rate') 292 | plt.ylabel('True Positive Rate') 293 | plt.title('Receiver Operating Characteristic') 294 | plt.legend(loc="lower right") 295 | plt.show() 296 | 297 | 298 | 299 | 300 | from sklearn.metrics import roc_auc_score 301 | roc_auc_score(Y_test, pos_prob) 302 | 303 | 304 | 305 | from sklearn.model_selection import StratifiedKFold 306 | k = 10 307 | k_fold = StratifiedKFold(n_splits=k, random_state=42) 308 | cleaned_emails_np = np.array(emails_cleaned) 309 | labels_np = np.array(labels) 310 | 311 | max_features_option = [2000, 8000, None] 312 | smoothing_factor_option = [0.5, 1.0, 2.0, 4.0] 313 | fit_prior_option = [True, False] 314 | 315 | max_features_option = [None] 316 | smoothing_factor_option = [4.0, 10, 16, 20, 32] 317 | fit_prior_option = [True, False] 318 | 319 | 320 | auc_record = {} 321 | 322 | for train_indices, test_indices in k_fold.split(emails_cleaned, labels): 323 | X_train, X_test = cleaned_emails_np[train_indices], cleaned_emails_np[test_indices] 324 | Y_train, Y_test = labels_np[train_indices], labels_np[test_indices] 325 | for max_features in max_features_option: 326 | if max_features not in auc_record: 327 | auc_record[max_features] = {} 328 | cv = CountVectorizer(stop_words="english", max_features=max_features, max_df=0.5, min_df=2) 329 | term_docs_train = cv.fit_transform(X_train) 330 | term_docs_test = cv.transform(X_test) 331 | for alpha in smoothing_factor_option: 332 | if alpha not in auc_record[max_features]: 333 | auc_record[max_features][alpha] = {} 334 | for fit_prior in fit_prior_option: 335 | clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior) 336 | clf.fit(term_docs_train, Y_train) 337 | prediction_prob = clf.predict_proba(term_docs_test) 338 | pos_prob = prediction_prob[:, 1] 339 | auc = roc_auc_score(Y_test, pos_prob) 340 | auc_record[max_features][alpha][fit_prior] = auc + auc_record[max_features][alpha].get(fit_prior, 0.0) 341 | 342 | 343 | 344 | print('max features smoothing fit prior auc') 345 | for max_features, max_feature_record in auc_record.items(): 346 | for smoothing, smoothing_record in max_feature_record.items(): 347 | for fit_prior, auc in smoothing_record.items(): 348 | print(' {0} {1} {2} {3:.5f}'.format(max_features, smoothing, fit_prior, auc/k)) 349 | 350 | -------------------------------------------------------------------------------- /Chapter05/CTG.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter05/CTG.xls -------------------------------------------------------------------------------- /Chapter05/ctg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | df = pd.read_excel('CTG.xls', "Raw Data") 9 | 10 | X = df.ix[1:2126, 3:-2].values 11 | Y = df.ix[1:2126, -1].values 12 | 13 | from collections import Counter 14 | Counter(Y) 15 | 16 | from sklearn.model_selection import train_test_split 17 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
test_size=0.2, random_state=42) 18 | 19 | from sklearn.svm import SVC 20 | svc = SVC(kernel='rbf') 21 | 22 | parameters = {'C': (100, 1e3, 1e4, 1e5), 23 | 'gamma': (1e-08, 1e-7, 1e-6, 1e-5) 24 | } 25 | from sklearn.model_selection import GridSearchCV 26 | grid_search = GridSearchCV(svc, parameters, n_jobs=-1, cv=5) 27 | 28 | 29 | import timeit 30 | start_time = timeit.default_timer() 31 | grid_search.fit(X_train, Y_train) 32 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 33 | 34 | print(grid_search.best_params_) 35 | print(grid_search.best_score_) 36 | 37 | svc_best = grid_search.best_estimator_ 38 | 39 | accuracy = svc_best.score(X_test, Y_test) 40 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 41 | 42 | prediction = svc_best.predict(X_test) 43 | from sklearn.metrics import classification_report 44 | report = classification_report(Y_test, prediction) 45 | print(report) -------------------------------------------------------------------------------- /Chapter05/plot_rbf_kernels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from sklearn.svm import SVC 10 | 11 | X = np.c_[# negative class 12 | (.3, -.8), 13 | (-1.5, -1), 14 | (-1.3, -.8), 15 | (-1.1, -1.3), 16 | (-1.2, -.3), 17 | (-1.3, -.5), 18 | (-.6, 1.1), 19 | (-1.4, 2.2), 20 | (1, 1), 21 | # positive class 22 | (1.3, .8), 23 | (1.2, .5), 24 | (.2, -2), 25 | (.5, -2.4), 26 | (.2, -2.3), 27 | (0, -2.7), 28 | (1.3, 2.1)].T 29 | Y = [-1] * 8 + [1] * 8 30 | 31 | gamma_option = [1, 2, 4] 32 | 33 | for i, gamma in enumerate(gamma_option, 1): 34 | svm = SVC(kernel='rbf', gamma=gamma) 35 | svm.fit(X, Y) 36 | plt.scatter(X[:, 0], X[:, 1], c=['b']*8+['r']*8, zorder=10, cmap=plt.cm.Paired) 37 | plt.axis('tight') 38 | XX, YY = np.mgrid[-3:3:200j, -3:3:200j] 39 | Z = svm.decision_function(np.c_[XX.ravel(), YY.ravel()]) 40 | Z = Z.reshape(XX.shape) 41 | plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) 42 | plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) 43 | plt.title('gamma = %d' % gamma) 44 | plt.show() 45 | 46 | -------------------------------------------------------------------------------- /Chapter05/svm_tf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from sklearn import datasets 11 | cancer_data = datasets.load_breast_cancer() 12 | X = cancer_data.data 13 | Y = cancer_data.target 14 | 15 | from collections import Counter 16 | print(Counter(Y)) 17 | 18 | 19 | np.random.seed(42) 20 | train_indices = np.random.choice(len(Y), round(len(Y) * 0.8), replace=False) 21 | test_indices = np.array(list(set(range(len(Y))) - set(train_indices))) 22 | X_train = X[train_indices] 23 | X_test = X[test_indices] 24 | Y_train = Y[train_indices] 25 | Y_test = Y[test_indices] 26 | 27 | 28 | svm_tf = tf.contrib.learn.SVM( 29 | feature_columns=(tf.contrib.layers.real_valued_column(column_name='x'),), 30 | example_id_column='example_id') 31 | 32 | 33 | 34 | 
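# --- Illustrative note (not part of the original script) ---
# tf.contrib.learn.SVM and tf.estimator.inputs.numpy_input_fn are
# TensorFlow 1.x APIs; tf.contrib was removed in TensorFlow 2.x, so this
# script requires a 1.x installation. For comparison only, a minimal,
# hedged sketch of a linear SVM on the same split with scikit-learn
# (assuming it is installed; results will not match exactly):
#
#   from sklearn.svm import LinearSVC
#   clf = LinearSVC(C=1.0, max_iter=10000)
#   clf.fit(X_train, Y_train)
#   print('Test accuracy: {0:.1f}%'.format(clf.score(X_test, Y_test) * 100))
# -----------------------------------------------------------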
input_fn_train = tf.estimator.inputs.numpy_input_fn( 35 | x={'x': X_train, 'example_id': np.array(['%d' % i for i in range(len(Y_train))])}, 36 | y=Y_train, 37 | num_epochs=None, 38 | batch_size=100, 39 | shuffle=True) 40 | 41 | 42 | 43 | svm_tf.fit(input_fn=input_fn_train, max_steps=100) 44 | 45 | 46 | metrics = svm_tf.evaluate(input_fn=input_fn_train, steps=1) 47 | print('The training accuracy is: {0:.1f}%'.format(metrics['accuracy']*100)) 48 | 49 | 50 | 51 | input_fn_test = tf.estimator.inputs.numpy_input_fn( 52 | x={'x': X_test, 'example_id': np.array(['%d' % (i + len(Y_train)) for i in range(len(X_test))])}, 53 | y=Y_test, 54 | num_epochs=None, 55 | shuffle=False) 56 | 57 | 58 | metrics = svm_tf.evaluate(input_fn=input_fn_test, steps=1) 59 | print('The testing accuracy is: {0:.1f}%'.format(metrics['accuracy']*100)) 60 | 61 | -------------------------------------------------------------------------------- /Chapter05/topic_categorization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.datasets import fetch_20newsgroups 9 | from nltk.corpus import names 10 | from nltk.stem import WordNetLemmatizer 11 | 12 | all_names = set(names.words()) 13 | lemmatizer = WordNetLemmatizer() 14 | 15 | def is_letter_only(word): 16 | return word.isalpha() 17 | 18 | from nltk.corpus import stopwords 19 | stop_words = stopwords.words('english') 20 | 21 | def clean_text(docs): 22 | docs_cleaned = [] 23 | for doc in docs: 24 | doc = doc.lower() 25 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() 26 | if is_letter_only(word) and word not in all_names and word not in stop_words) 27 | docs_cleaned.append(doc_cleaned) 28 | return docs_cleaned 29 | 30 | 31 | # Binary classification 32 | categories = ['comp.graphics', 'sci.space'] 33 | 34 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 35 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 36 | 37 | cleaned_train = clean_text(data_train.data) 38 | label_train = data_train.target 39 | cleaned_test = clean_text(data_test.data) 40 | label_test = data_test.target 41 | 42 | from collections import Counter 43 | Counter(label_train) 44 | 45 | tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None) 46 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 47 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 48 | 49 | from sklearn.svm import SVC 50 | svm = SVC(kernel='linear', C=1.0, random_state=42) 51 | svm.fit(term_docs_train, label_train) 52 | accuracy = svm.score(term_docs_test, label_test) 53 | print('The accuracy of binary classification is: {0:.1f}%'.format(accuracy*100)) 54 | 55 | 56 | 57 | # Multiclass classification 58 | categories = [ 59 | 'alt.atheism', 60 | 'talk.religion.misc', 61 | 'comp.graphics', 62 | 'sci.space', 63 | 'rec.sport.hockey' 64 | ] 65 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 66 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 67 | 68 | cleaned_train = clean_text(data_train.data) 69 | label_train = data_train.target 70 | cleaned_test = clean_text(data_test.data) 71 | label_test = data_test.target 
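# --- Illustrative note (not part of the original script) ---
# The tfidf_vectorizer created for the binary task is reused here; calling
# fit_transform again below re-learns the vocabulary on the 5-class training
# set, so no state from the binary experiment is carried over. A hedged
# sanity check on the class balance (Counter is already imported above):
#
#   print(Counter(label_train))   # number of training documents per class
# -----------------------------------------------------------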
72 | 73 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 74 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 75 | 76 | svm = SVC(kernel='linear', C=1.0, random_state=42) 77 | svm.fit(term_docs_train, label_train) 78 | accuracy = svm.score(term_docs_test, label_test) 79 | print('The accuracy of 5-class classification is: {0:.1f}%'.format(accuracy*100)) 80 | 81 | from sklearn.metrics import classification_report 82 | prediction = svm.predict(term_docs_test) 83 | report = classification_report(label_test, prediction) 84 | print(report) 85 | 86 | 87 | 88 | # Grid search 89 | 90 | categories = None 91 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 92 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 93 | 94 | cleaned_train = clean_text(data_train.data) 95 | label_train = data_train.target 96 | cleaned_test = clean_text(data_test.data) 97 | label_test = data_test.target 98 | 99 | tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None) 100 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 101 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 102 | 103 | parameters = {'C': [0.1, 1, 10, 100]} 104 | svc_libsvm = SVC(kernel='linear') 105 | 106 | from sklearn.model_selection import GridSearchCV 107 | grid_search = GridSearchCV(svc_libsvm, parameters, n_jobs=-1, cv=5) 108 | 109 | 110 | import timeit 111 | start_time = timeit.default_timer() 112 | grid_search.fit(term_docs_train, label_train) 113 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 114 | 115 | print(grid_search.best_params_) 116 | print(grid_search.best_score_) 117 | 118 | svc_libsvm_best = grid_search.best_estimator_ 119 | accuracy = svc_libsvm_best.score(term_docs_test, label_test) 120 | print('The accuracy of 20-class classification is: {0:.1f}%'.format(accuracy*100)) 121 | 122 | 123 | 124 | 125 | 126 | from sklearn.svm import LinearSVC 127 | svc_linear = LinearSVC() 128 | grid_search = GridSearchCV(svc_linear, parameters, n_jobs=-1, cv=5) 129 | 130 | start_time = timeit.default_timer() 131 | grid_search.fit(term_docs_train, label_train) 132 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 133 | 134 | print(grid_search.best_params_) 135 | print(grid_search.best_score_) 136 | svc_linear_best = grid_search.best_estimator_ 137 | accuracy = svc_linear_best.score(term_docs_test, label_test) 138 | print('TThe accuracy of 20-class classification is: {0:.1f}%'.format(accuracy*100)) 139 | 140 | 141 | 142 | 143 | # Pipeline 144 | from sklearn.pipeline import Pipeline 145 | 146 | pipeline = Pipeline([ 147 | ('tfidf', TfidfVectorizer(stop_words='english')), 148 | ('svc', LinearSVC()), 149 | ]) 150 | 151 | parameters_pipeline = { 152 | 'tfidf__max_df': (0.25, 0.5, 1.0), 153 | 'tfidf__max_features': (10000, None), 154 | 'tfidf__sublinear_tf': (True, False), 155 | 'tfidf__smooth_idf': (True, False), 156 | 'svc__C': (0.3, 1, 3), 157 | } 158 | 159 | grid_search = GridSearchCV(pipeline, parameters_pipeline, n_jobs=-1, cv=5) 160 | 161 | start_time = timeit.default_timer() 162 | grid_search.fit(cleaned_train, label_train) 163 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 164 | 165 | print(grid_search.best_params_) 166 | print(grid_search.best_score_) 167 | pipeline_best = grid_search.best_estimator_ 168 | accuracy = pipeline_best.score(cleaned_test, label_test) 169 | print('The accuracy of 20-class classification is: 
{0:.1f}%'.format(accuracy*100)) 170 | -------------------------------------------------------------------------------- /Chapter06/avazu_ctr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | n_rows = 300000 9 | df = pd.read_csv("train", nrows=n_rows) 10 | print(df.head(5)) 11 | 12 | 13 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 14 | Y = df['click'].values 15 | 16 | print(X.shape) 17 | 18 | n_train = int(n_rows * 0.9) 19 | X_train = X[:n_train] 20 | Y_train = Y[:n_train] 21 | X_test = X[n_train:] 22 | Y_test = Y[n_train:] 23 | 24 | from sklearn.preprocessing import OneHotEncoder 25 | enc = OneHotEncoder(handle_unknown='ignore') 26 | X_train_enc = enc.fit_transform(X_train) 27 | 28 | X_train_enc[0] 29 | print(X_train_enc[0]) 30 | 31 | 32 | X_test_enc = enc.transform(X_test) 33 | 34 | 35 | 36 | from sklearn.tree import DecisionTreeClassifier 37 | parameters = {'max_depth': [3, 10, None]} 38 | decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30) 39 | 40 | from sklearn.model_selection import GridSearchCV 41 | grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 42 | 43 | grid_search.fit(X_train_enc, Y_train) 44 | print(grid_search.best_params_) 45 | 46 | decision_tree_best = grid_search.best_estimator_ 47 | pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1] 48 | 49 | from sklearn.metrics import roc_auc_score 50 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(Y_test, pos_prob))) 51 | 52 | import numpy as np 53 | pos_prob = np.zeros(len(Y_test)) 54 | click_index = np.random.choice(len(Y_test), int(len(Y_test) * 51211.0/300000), replace=False) 55 | pos_prob[click_index] = 1 56 | 57 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(Y_test, pos_prob))) 58 | 59 | 60 | from sklearn.ensemble import RandomForestClassifier 61 | 62 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 63 | grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 64 | grid_search.fit(X_train_enc, Y_train) 65 | print(grid_search.best_params_) 66 | print(grid_search.best_score_) 67 | 68 | random_forest_best = grid_search.best_estimator_ 69 | pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1] 70 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(Y_test, pos_prob))) 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /Chapter06/avazu_ctr_tf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | n_rows = 300000 9 | df = pd.read_csv("train", nrows=n_rows) 10 | print(df.head(5)) 11 | 12 | 13 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 14 | Y = df['click'].values 15 | 16 | print(X.shape) 17 | 18 | n_train = int(n_rows * 0.9) 19 | X_train = X[:n_train] 20 | Y_train = Y[:n_train] 21 | X_test = X[n_train:] 22 | Y_test = Y[n_train:] 23 | 24 | 
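# --- Illustrative note (not part of the original script) ---
# OneHotEncoder below expands every categorical column into a group of
# binary indicator columns and returns a sparse matrix. With
# handle_unknown='ignore', categories that appear only in the test split are
# encoded as all zeros within that feature's block instead of raising an
# error. A hedged one-liner to check the encoded dimensionality:
#
#   print(X_train_enc.shape)   # (n_train, total number of one-hot columns)
# -----------------------------------------------------------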
from sklearn.preprocessing import OneHotEncoder 25 | enc = OneHotEncoder(handle_unknown='ignore') 26 | X_train_enc = enc.fit_transform(X_train) 27 | 28 | X_train_enc[0] 29 | print(X_train_enc[0]) 30 | 31 | 32 | X_test_enc = enc.transform(X_test) 33 | 34 | 35 | import tensorflow as tf 36 | from tensorflow.contrib.tensor_forest.python import tensor_forest 37 | from tensorflow.python.ops import resources 38 | 39 | 40 | n_iter = 20 41 | n_classes = 2 42 | n_features = int(X_train_enc.toarray().shape[1]) 43 | n_trees = 10 44 | max_nodes = 30000 45 | 46 | 47 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 48 | y = tf.placeholder(tf.int64, shape=[None]) 49 | 50 | 51 | hparams = tensor_forest.ForestHParams(num_classes=n_classes, num_features=n_features, num_trees=n_trees, 52 | max_nodes=max_nodes, split_after_samples=30).fill() 53 | 54 | forest_graph = tensor_forest.RandomForestGraphs(hparams) 55 | 56 | train_op = forest_graph.training_graph(x, y) 57 | loss_op = forest_graph.training_loss(x, y) 58 | 59 | infer_op, _, _ = forest_graph.inference_graph(x) 60 | 61 | auc = tf.metrics.auc(tf.cast(y, tf.int64), infer_op[:, 1])[1] 62 | 63 | 64 | init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), resources.initialize_resources(resources.shared_resources())) 65 | 66 | sess = tf.Session() 67 | 68 | sess.run(init_vars) 69 | 70 | batch_size = 1000 71 | 72 | import numpy as np 73 | indices = list(range(n_train)) 74 | 75 | def gen_batch(indices): 76 | np.random.shuffle(indices) 77 | for batch_i in range(int(n_train / batch_size)): 78 | batch_index = indices[batch_i*batch_size: (batch_i+1)*batch_size] 79 | yield X_train_enc[batch_index], Y_train[batch_index] 80 | 81 | 82 | for i in range(1, n_iter + 1): 83 | for X_batch, Y_batch in gen_batch(indices): 84 | _, l = sess.run([train_op, loss_op], feed_dict={x: X_batch.toarray(), y: Y_batch}) 85 | acc_train = sess.run(auc, feed_dict={x: X_train_enc.toarray(), y: Y_train}) 86 | print('Iteration %i, AUC of ROC on training set: %f' % (i, acc_train)) 87 | acc_test = sess.run(auc, feed_dict={x: X_test_enc.toarray(), y: Y_test}) 88 | print("AUC of ROC on testing set:", acc_test) 89 | 90 | -------------------------------------------------------------------------------- /Chapter06/decision_tree_submit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | 10 | 11 | # Plot Gini Impurity in binary case 12 | pos_fraction = np.linspace(0.00, 1.00, 1000) 13 | gini = 1 - pos_fraction**2 - (1-pos_fraction)**2 14 | plt.plot(pos_fraction, gini) 15 | plt.xlabel('Positive fraction') 16 | plt.ylabel('Gini Impurity') 17 | plt.ylim(0, 1) 18 | plt.show() 19 | 20 | 21 | # Given labels of a data set, the Gini Impurity calculation function 22 | def gini_impurity(labels): 23 | # When the set is empty, it is also pure 24 | if not labels: 25 | return 0 26 | # Count the occurrences of each label 27 | counts = np.unique(labels, return_counts=True)[1] 28 | fractions = counts / float(len(labels)) 29 | return 1 - np.sum(fractions ** 2) 30 | 31 | print('{0:.4f}'.format(gini_impurity([1, 1, 0, 1, 0]))) 32 | print('{0:.4f}'.format(gini_impurity([1, 1, 0, 1, 0, 0]))) 33 | print('{0:.4f}'.format(gini_impurity([1, 1, 1, 1]))) 34 | 35 | 36 | # Plot entropy 
in binary case 37 | pos_fraction = np.linspace(0.00, 1.00, 1000) 38 | ent = - (pos_fraction * np.log2(pos_fraction) + (1 - pos_fraction) * np.log2(1 - pos_fraction)) 39 | plt.plot(pos_fraction, ent) 40 | plt.xlabel('Positive fraction') 41 | plt.ylabel('Entropy') 42 | plt.ylim(0, 1) 43 | plt.show() 44 | 45 | 46 | # Given labels of a data set, the entropy calculation function 47 | def entropy(labels): 48 | if not labels: 49 | return 0 50 | counts = np.unique(labels, return_counts=True)[1] 51 | fractions = counts / float(len(labels)) 52 | return - np.sum(fractions * np.log2(fractions)) 53 | 54 | print('{0:.4f}'.format(entropy([1, 1, 0, 1, 0]))) 55 | print('{0:.4f}'.format(entropy([1, 1, 0, 1, 0, 0]))) 56 | print('{0:.4f}'.format(entropy([1, 1, 1, 1]))) 57 | 58 | 59 | def information_gain(y, mask, func=entropy): 60 | s1 = np.sum(mask) 61 | s2 = mask.size - s1 62 | if (s1 == 0 | s2 == 0): return 0 63 | return func(y) - s1 / float(s1 + s2) * func(y[mask]) - s2 / float(s1 + s2) * func(y[np.logical_not(mask)]) 64 | 65 | 66 | criterion_function = {'gini': gini_impurity, 'entropy': entropy} 67 | def weighted_impurity(groups, criterion='gini'): 68 | """ 69 | Calculate weighted impurity of children after a split 70 | @param groups: list of children, and a child consists a list of class labels 71 | @param criterion: metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain 72 | @return: float, weighted impurity 73 | """ 74 | total = sum(len(group) for group in groups) 75 | weighted_sum = 0.0 76 | for group in groups: 77 | weighted_sum += len(group) / float(total) * criterion_function[criterion](group) 78 | return weighted_sum 79 | 80 | 81 | children_1 = [[1, 0, 1], [0, 1]] 82 | children_2 = [[1, 1], [0, 0, 1]] 83 | print('Entropy of #1 split: {0:.4f}'.format(weighted_impurity(children_1, 'entropy'))) 84 | print('Entropy of #2 split: {0:.4f}'.format(weighted_impurity(children_2, 'entropy'))) 85 | 86 | 87 | 88 | def gini_impurity_np(labels): 89 | # When the set is empty, it is also pure 90 | if labels.size == 0: 91 | return 0 92 | # Count the occurrences of each label 93 | counts = np.unique(labels, return_counts=True)[1] 94 | fractions = counts / float(len(labels)) 95 | return 1 - np.sum(fractions ** 2) 96 | 97 | 98 | def entropy_np(labels): 99 | # When the set is empty, it is also pure 100 | if labels.size == 0: 101 | return 0 102 | counts = np.unique(labels, return_counts=True)[1] 103 | fractions = counts / float(len(labels)) 104 | return - np.sum(fractions * np.log2(fractions)) 105 | 106 | 107 | criterion_function_np = {'gini': gini_impurity_np, 'entropy': entropy_np} 108 | def weighted_impurity(groups, criterion='gini'): 109 | """ 110 | Calculate weighted impurity of children after a split 111 | @param groups: list of children, and a child consists a list of class labels 112 | @param criterion: metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain 113 | @return: float, weighted impurity 114 | """ 115 | total = sum(len(group) for group in groups) 116 | weighted_sum = 0.0 117 | for group in groups: 118 | weighted_sum += len(group) / float(total) * criterion_function_np[criterion](group) 119 | return weighted_sum 120 | 121 | 122 | def split_node(X, y, index, value): 123 | """ 124 | Split data set X, y based on a feature and a value 125 | @param X: numpy.ndarray, dataset feature 126 | @param y: numpy.ndarray, dataset target 127 | @param index: int, index of the feature used for splitting 128 | @param value: value 
of the feature used for splitting 129 | @return: list, list: left and right child, a child is in the format of [X, y] 130 | """ 131 | x_index = X[:, index] 132 | # if this feature is numerical 133 | if X[0, index].dtype.kind in ['i', 'f']: 134 | mask = x_index >= value 135 | # if this feature is categorical 136 | else: 137 | mask = x_index == value 138 | # split into left and right child 139 | left = [X[~mask, :], y[~mask]] 140 | right = [X[mask, :], y[mask]] 141 | return left, right 142 | 143 | 144 | def get_best_split(X, y, criterion): 145 | """ 146 | Obtain the best splitting point and resulting children for the data set X, y 147 | @param X: numpy.ndarray, dataset feature 148 | @param y: numpy.ndarray, dataset target 149 | @param criterion: gini or entropy 150 | @return: dict {index: index of the feature, value: feature value, children: left and right children} 151 | """ 152 | best_index, best_value, best_score, children = None, None, 1, None 153 | for index in range(len(X[0])): 154 | for value in np.sort(np.unique(X[:, index])): 155 | groups = split_node(X, y, index, value) 156 | impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion) 157 | if impurity < best_score: 158 | best_index, best_value, best_score, children = index, value, impurity, groups 159 | return {'index': best_index, 'value': best_value, 'children': children} 160 | 161 | 162 | 163 | def get_leaf(labels): 164 | # Obtain the leaf as the majority of the labels 165 | return np.bincount(labels).argmax() 166 | 167 | 168 | 169 | def split(node, max_depth, min_size, depth, criterion): 170 | """ 171 | Split children of a node to construct new nodes or assign them terminals 172 | @param node: dict, with children info 173 | @param max_depth: int, maximal depth of the tree 174 | @param min_size: int, minimal samples required to further split a child 175 | @param depth: int, current depth of the node 176 | @param criterion: gini or entropy 177 | """ 178 | left, right = node['children'] 179 | del (node['children']) 180 | if left[1].size == 0: 181 | node['right'] = get_leaf(right[1]) 182 | return 183 | if right[1].size == 0: 184 | node['left'] = get_leaf(left[1]) 185 | return 186 | # Check if the current depth exceeds the maximal depth 187 | if depth >= max_depth: 188 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 189 | return 190 | # Check if the left child has enough samples 191 | if left[1].size <= min_size: 192 | node['left'] = get_leaf(left[1]) 193 | else: 194 | # It has enough samples, we further split it 195 | result = get_best_split(left[0], left[1], criterion) 196 | result_left, result_right = result['children'] 197 | if result_left[1].size == 0: 198 | node['left'] = get_leaf(result_right[1]) 199 | elif result_right[1].size == 0: 200 | node['left'] = get_leaf(result_left[1]) 201 | else: 202 | node['left'] = result 203 | split(node['left'], max_depth, min_size, depth + 1, criterion) 204 | # Check if the right child has enough samples 205 | if right[1].size <= min_size: 206 | node['right'] = get_leaf(right[1]) 207 | else: 208 | # It has enough samples, we further split it 209 | result = get_best_split(right[0], right[1], criterion) 210 | result_left, result_right = result['children'] 211 | if result_left[1].size == 0: 212 | node['right'] = get_leaf(result_right[1]) 213 | elif result_right[1].size == 0: 214 | node['right'] = get_leaf(result_left[1]) 215 | else: 216 | node['right'] = result 217 | split(node['right'], max_depth, min_size, depth + 1, criterion) 218 | 219 | 220 | def 
train_tree(X_train, y_train, max_depth, min_size, criterion='gini'): 221 | """ 222 | Construction of a tree starts here 223 | @param X_train: list of training samples (feature) 224 | @param y_train: list of training samples (target) 225 | @param max_depth: int, maximal depth of the tree 226 | @param min_size: int, minimal samples required to further split a child 227 | @param criterion: gini or entropy 228 | """ 229 | X = np.array(X_train) 230 | y = np.array(y_train) 231 | root = get_best_split(X, y, criterion) 232 | split(root, max_depth, min_size, 1, criterion) 233 | return root 234 | 235 | 236 | 237 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 238 | 'categorical': {'yes': 'is', 'no': 'is not'}} 239 | def visualize_tree(node, depth=0): 240 | if isinstance(node, dict): 241 | if node['value'].dtype.kind in ['i', 'f']: 242 | condition = CONDITION['numerical'] 243 | else: 244 | condition = CONDITION['categorical'] 245 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 246 | if 'left' in node: 247 | visualize_tree(node['left'], depth + 1) 248 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 249 | if 'right' in node: 250 | visualize_tree(node['right'], depth + 1) 251 | else: 252 | print('{}[{}]'.format(depth * ' ', node)) 253 | 254 | 255 | X_train = [['tech', 'professional'], 256 | ['fashion', 'student'], 257 | ['fashion', 'professional'], 258 | ['sports', 'student'], 259 | ['tech', 'student'], 260 | ['tech', 'retired'], 261 | ['sports', 'professional']] 262 | 263 | y_train = [1, 264 | 0, 265 | 0, 266 | 0, 267 | 1, 268 | 0, 269 | 1] 270 | 271 | tree = train_tree(X_train, y_train, 2, 2) 272 | visualize_tree(tree) 273 | 274 | 275 | 276 | 277 | X_train_n = [[6, 7], 278 | [2, 4], 279 | [7, 2], 280 | [3, 6], 281 | [4, 7], 282 | [5, 2], 283 | [1, 6], 284 | [2, 0], 285 | [6, 3], 286 | [4, 1]] 287 | 288 | y_train_n = [0, 289 | 0, 290 | 0, 291 | 0, 292 | 0, 293 | 1, 294 | 1, 295 | 1, 296 | 1, 297 | 1] 298 | 299 | tree = train_tree(X_train_n, y_train_n, 2, 2) 300 | visualize_tree(tree) 301 | 302 | 303 | from sklearn.tree import DecisionTreeClassifier 304 | tree_sk = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=2) 305 | tree_sk.fit(X_train_n, y_train_n) 306 | 307 | from sklearn.tree import export_graphviz 308 | export_graphviz(tree_sk, out_file='tree.dot', feature_names=['X1', 'X2'], impurity=False, filled=True, class_names=['0', '1']) 309 | 310 | 311 | -------------------------------------------------------------------------------- /Chapter07/encoding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.feature_extraction import DictVectorizer 8 | 9 | 10 | X_dict = [{'interest': 'tech', 'occupation': 'professional'}, 11 | {'interest': 'fashion', 'occupation': 'student'}, 12 | {'interest': 'fashion', 'occupation': 'professional'}, 13 | {'interest': 'sports', 'occupation': 'student'}, 14 | {'interest': 'tech', 'occupation': 'student'}, 15 | {'interest': 'tech', 'occupation': 'retired'}, 16 | {'interest': 'sports', 'occupation': 'professional'}] 17 | 18 | dict_one_hot_encoder = DictVectorizer(sparse=False) 19 | X_encoded = dict_one_hot_encoder.fit_transform(X_dict) 20 | print(X_encoded) 21 | 22 | 
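# For illustration only: a rough pandas equivalent of the same one-hot encoding
# (df_illustration and the get_dummies call are an added sketch, not part of the
# original DictVectorizer workflow). With DictVectorizer's default alphabetical
# feature ordering, the encoded columns above are: interest=fashion, interest=sports,
# interest=tech, occupation=professional, occupation=retired, occupation=student.
import pandas as pd
df_illustration = pd.DataFrame(X_dict)
print(pd.get_dummies(df_illustration, columns=['interest', 'occupation']))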
print(dict_one_hot_encoder.vocabulary_) 23 | 24 | 25 | new_dict = [{'interest': 'sports', 'occupation': 'retired'}] 26 | new_encoded = dict_one_hot_encoder.transform(new_dict) 27 | print(new_encoded) 28 | 29 | print(dict_one_hot_encoder.inverse_transform(new_encoded)) 30 | 31 | 32 | # new category not encountered before 33 | new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'}, 34 | {'interest': 'tech', 'occupation': 'unseen_occupation'}] 35 | new_encoded = dict_one_hot_encoder.transform(new_dict) 36 | print(new_encoded) 37 | 38 | 39 | 40 | import pandas as pd 41 | df = pd.DataFrame({'score': ['low', 42 | 'high', 43 | 'medium', 44 | 'medium', 45 | 'low']}) 46 | print(df) 47 | 48 | mapping = {'low':1, 'medium':2, 'high':3} 49 | df['score'] = df['score'].replace(mapping) 50 | 51 | print(df) -------------------------------------------------------------------------------- /Chapter07/logistic_function.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | 9 | 10 | def sigmoid(input): 11 | return 1.0 / (1 + np.exp(-input)) 12 | 13 | 14 | import matplotlib.pyplot as plt 15 | z = np.linspace(-8, 8, 1000) 16 | y = sigmoid(z) 17 | plt.plot(z, y) 18 | plt.axhline(y=0, ls='dotted', color='k') 19 | plt.axhline(y=0.5, ls='dotted', color='k') 20 | plt.axhline(y=1, ls='dotted', color='k') 21 | plt.yticks([0.0, 0.25, 0.5, 0.75, 1.0]) 22 | plt.xlabel('z') 23 | plt.ylabel('y(z)') 24 | plt.show() 25 | 26 | 27 | # plot sample cost vs y_hat (prediction), for y (truth) = 1 28 | y_hat = np.linspace(0, 1, 1000) 29 | cost = -np.log(y_hat) 30 | plt.plot(y_hat, cost) 31 | plt.xlabel('Prediction') 32 | plt.ylabel('Cost') 33 | plt.xlim(0, 1) 34 | plt.ylim(0, 7) 35 | plt.show() 36 | 37 | # plot sample cost vs y_hat (prediction), for y (truth) = 0 38 | y_hat = np.linspace(0, 1, 1000) 39 | cost = -np.log(1 - y_hat) 40 | plt.plot(y_hat, cost) 41 | plt.xlabel('Prediction') 42 | plt.ylabel('Cost') 43 | plt.xlim(0, 1) 44 | plt.ylim(0, 7) 45 | plt.show() 46 | 47 | -------------------------------------------------------------------------------- /Chapter07/logistic_regression_from_scratch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | 9 | def sigmoid(input): 10 | return 1.0 / (1 + np.exp(-input)) 11 | 12 | 13 | 14 | # Gradient descent based logistic regression from scratch 15 | def compute_prediction(X, weights): 16 | """ Compute the prediction y_hat based on current weights 17 | Args: 18 | X (numpy.ndarray) 19 | weights (numpy.ndarray) 20 | Returns: 21 | numpy.ndarray, y_hat of X under weights 22 | """ 23 | z = np.dot(X, weights) 24 | predictions = sigmoid(z) 25 | return predictions 26 | 27 | def update_weights_gd(X_train, y_train, weights, learning_rate): 28 | """ Update weights by one step 29 | Args: 30 | X_train, y_train (numpy.ndarray, training data set) 31 | weights (numpy.ndarray) 32 | learning_rate (float) 33 | Returns: 34 | numpy.ndarray, updated weights 35 | """ 36 | predictions = compute_prediction(X_train, weights) 37 | weights_delta = np.dot(X_train.T, y_train - 
predictions) 38 | m = y_train.shape[0] 39 | weights += learning_rate / float(m) * weights_delta 40 | return weights 41 | 42 | def compute_cost(X, y, weights): 43 | """ Compute the cost J(w) 44 | Args: 45 | X, y (numpy.ndarray, data set) 46 | weights (numpy.ndarray) 47 | Returns: 48 | float 49 | """ 50 | predictions = compute_prediction(X, weights) 51 | cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions)) 52 | return cost 53 | 54 | def train_logistic_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 55 | """ Train a logistic regression model 56 | Args: 57 | X_train, y_train (numpy.ndarray, training data set) 58 | max_iter (int, number of iterations) 59 | learning_rate (float) 60 | fit_intercept (bool, with an intercept w0 or not) 61 | Returns: 62 | numpy.ndarray, learned weights 63 | """ 64 | if fit_intercept: 65 | intercept = np.ones((X_train.shape[0], 1)) 66 | X_train = np.hstack((intercept, X_train)) 67 | weights = np.zeros(X_train.shape[1]) 68 | for iteration in range(max_iter): 69 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 70 | # Check the cost for every 100 (for example) iterations 71 | if iteration % 100 == 0: 72 | print(compute_cost(X_train, y_train, weights)) 73 | return weights 74 | 75 | def predict(X, weights): 76 | if X.shape[1] == weights.shape[0] - 1: 77 | intercept = np.ones((X.shape[0], 1)) 78 | X = np.hstack((intercept, X)) 79 | return compute_prediction(X, weights) 80 | 81 | 82 | # A example 83 | X_train = np.array([[6, 7], 84 | [2, 4], 85 | [3, 6], 86 | [4, 7], 87 | [1, 6], 88 | [5, 2], 89 | [2, 0], 90 | [6, 3], 91 | [4, 1], 92 | [7, 2]]) 93 | 94 | y_train = np.array([0, 95 | 0, 96 | 0, 97 | 0, 98 | 0, 99 | 1, 100 | 1, 101 | 1, 102 | 1, 103 | 1]) 104 | 105 | weights = train_logistic_regression(X_train, y_train, max_iter=1000, learning_rate=0.1, fit_intercept=True) 106 | 107 | X_test = np.array([[6, 1], 108 | [1, 3], 109 | [3, 1], 110 | [4, 5]]) 111 | 112 | predictions = predict(X_test, weights) 113 | 114 | import matplotlib.pyplot as plt 115 | plt.scatter(X_train[:,0], X_train[:,1], c=['b']*5+['k']*5, marker='o') 116 | colours = ['k' if prediction >= 0.5 else 'b' for prediction in predictions] 117 | plt.scatter(X_test[:,0], X_test[:,1], marker='*', c=colours) 118 | plt.xlabel('x1') 119 | plt.ylabel('x2') 120 | plt.show() 121 | 122 | 123 | 124 | 125 | import pandas as pd 126 | n_rows = 300000 127 | df = pd.read_csv("train", nrows=n_rows) 128 | 129 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 130 | Y = df['click'].values 131 | 132 | n_train = 100000 133 | X_train = X[:n_train] 134 | Y_train = Y[:n_train] 135 | X_test = X[n_train:] 136 | Y_test = Y[n_train:] 137 | 138 | from sklearn.preprocessing import OneHotEncoder 139 | enc = OneHotEncoder(handle_unknown='ignore') 140 | X_train_enc = enc.fit_transform(X_train) 141 | 142 | X_test_enc = enc.transform(X_test) 143 | 144 | 145 | import timeit 146 | start_time = timeit.default_timer() 147 | weights = train_logistic_regression(X_train_enc.toarray(), Y_train, max_iter=10000, learning_rate=0.01, fit_intercept=True) 148 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 149 | 150 | 151 | pred = predict(X_test_enc.toarray(), weights) 152 | from sklearn.metrics import roc_auc_score 153 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred))) 154 | 155 | 156 | 157 | 158 | 159 | def update_weights_sgd(X_train, y_train, weights, learning_rate): 160 | 
""" One weight update iteration: moving weights by one step based on each individual sample 161 | Args: 162 | X_train, y_train (numpy.ndarray, training data set) 163 | weights (numpy.ndarray) 164 | learning_rate (float) 165 | Returns: 166 | numpy.ndarray, updated weights 167 | """ 168 | for X_each, y_each in zip(X_train, y_train): 169 | prediction = compute_prediction(X_each, weights) 170 | weights_delta = X_each.T * (y_each - prediction) 171 | weights += learning_rate * weights_delta 172 | return weights 173 | 174 | def train_logistic_regression_sgd(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 175 | """ Train a logistic regression model via SGD 176 | Args: 177 | X_train, y_train (numpy.ndarray, training data set) 178 | max_iter (int, number of iterations) 179 | learning_rate (float) 180 | fit_intercept (bool, with an intercept w0 or not) 181 | Returns: 182 | numpy.ndarray, learned weights 183 | """ 184 | if fit_intercept: 185 | intercept = np.ones((X_train.shape[0], 1)) 186 | X_train = np.hstack((intercept, X_train)) 187 | weights = np.zeros(X_train.shape[1]) 188 | for iteration in range(max_iter): 189 | weights = update_weights_sgd(X_train, y_train, weights, learning_rate) 190 | # Check the cost for every 2 (for example) iterations 191 | if iteration % 2 == 0: 192 | print(compute_cost(X_train, y_train, weights)) 193 | return weights 194 | 195 | 196 | # Train the SGD model based on 100000 samples 197 | start_time = timeit.default_timer() 198 | weights = train_logistic_regression_sgd(X_train_enc.toarray(), Y_train, max_iter=10, learning_rate=0.01, fit_intercept=True) 199 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 200 | pred = predict(X_test_enc.toarray(), weights) 201 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred))) 202 | 203 | -------------------------------------------------------------------------------- /Chapter07/logistic_regression_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | import pandas as pd 5 | n_rows = 300000 6 | df = pd.read_csv("train", nrows=n_rows) 7 | 8 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 9 | Y = df['click'].values 10 | 11 | n_train = int(n_rows * 0.9) 12 | X_train = X[:n_train] 13 | Y_train = Y[:n_train] 14 | X_test = X[n_train:] 15 | Y_test = Y[n_train:] 16 | 17 | from sklearn.preprocessing import OneHotEncoder 18 | enc = OneHotEncoder(handle_unknown='ignore') 19 | X_train_enc = enc.fit_transform(X_train) 20 | X_test_enc = enc.transform(X_test) 21 | 22 | 23 | n_features = int(X_train_enc.toarray().shape[1]) 24 | learning_rate = 0.001 25 | n_iter = 20 26 | 27 | 28 | # Input and Target placeholders 29 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 30 | y = tf.placeholder(tf.float32, shape=[None]) 31 | 32 | # Build the logistic regression model 33 | W = tf.Variable(tf.zeros([n_features, 1])) 34 | b = tf.Variable(tf.zeros([1])) 35 | 36 | logits = tf.add(tf.matmul(x, W), b)[:, 0] 37 | pred = tf.nn.sigmoid(logits) 38 | 39 | cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 40 | auc = tf.metrics.auc(tf.cast(y, tf.int64), pred)[1] 41 | 42 | 43 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) 44 | 45 | 46 | 47 | # Initialize the variables (i.e. 
assign their default value) 48 | init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 49 | 50 | 51 | batch_size = 1000 52 | 53 | import numpy as np 54 | indices = list(range(n_train)) 55 | 56 | def gen_batch(indices): 57 | np.random.shuffle(indices) 58 | for batch_i in range(int(n_train / batch_size)): 59 | batch_index = indices[batch_i*batch_size: (batch_i+1)*batch_size] 60 | yield X_train_enc[batch_index], Y_train[batch_index] 61 | 62 | 63 | sess = tf.Session() 64 | 65 | sess.run(init_vars) 66 | 67 | 68 | for i in range(1, n_iter+1): 69 | avg_cost = 0. 70 | for X_batch, Y_batch in gen_batch(indices): 71 | _, c = sess.run([optimizer, cost], feed_dict={x: X_batch.toarray(), y: Y_batch}) 72 | avg_cost += c / int(n_train / batch_size) 73 | print('Iteration %i, training loss: %f' % (i, avg_cost)) 74 | 75 | 76 | auc_test = sess.run(auc, feed_dict={x: X_test_enc.toarray(), y: Y_test}) 77 | print("AUC of ROC on testing set:", auc_test) 78 | -------------------------------------------------------------------------------- /Chapter07/random_forest_feature_selection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.metrics import roc_auc_score 9 | 10 | 11 | import pandas as pd 12 | n_rows = 100000 13 | df = pd.read_csv("train", nrows=n_rows) 14 | 15 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 16 | Y = df['click'].values 17 | 18 | 19 | X_train = X 20 | Y_train = Y 21 | 22 | 23 | from sklearn.preprocessing import OneHotEncoder 24 | enc = OneHotEncoder(handle_unknown='ignore') 25 | X_train_enc = enc.fit_transform(X_train) 26 | 27 | 28 | 29 | # Feature selection with random forest 30 | 31 | from sklearn.ensemble import RandomForestClassifier 32 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 33 | random_forest.fit(X_train_enc.toarray(), Y_train) 34 | 35 | 36 | 37 | 38 | feature_imp = random_forest.feature_importances_ 39 | print(feature_imp) 40 | 41 | # bottom 10 weights and the corresponding 10 least important features 42 | feature_names = enc.get_feature_names() 43 | print(np.sort(feature_imp)[:10]) 44 | bottom_10 = np.argsort(feature_imp)[:10] 45 | print('10 least important features are:\n', feature_names[bottom_10]) 46 | 47 | # top 10 weights and the corresponding 10 most important features 48 | print(np.sort(feature_imp)[-10:]) 49 | top_10 = np.argsort(feature_imp)[-10:] 50 | print('10 most important features are:\n', feature_names[top_10]) 51 | 52 | -------------------------------------------------------------------------------- /Chapter07/scikit_logistic_regression.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.metrics import roc_auc_score 9 | 10 | 11 | import pandas as pd 12 | n_rows = 300000 13 | df = pd.read_csv("train", nrows=n_rows) 14 | 15 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 16 | Y = df['click'].values 17 | 18 | n_train = 100000 19 | X_train = 
X[:n_train] 20 | Y_train = Y[:n_train] 21 | X_test = X[n_train:] 22 | Y_test = Y[n_train:] 23 | 24 | from sklearn.preprocessing import OneHotEncoder 25 | enc = OneHotEncoder(handle_unknown='ignore') 26 | X_train_enc = enc.fit_transform(X_train) 27 | 28 | X_test_enc = enc.transform(X_test) 29 | 30 | # # Use scikit-learn package 31 | from sklearn.linear_model import SGDClassifier 32 | sgd_lr = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=10, learning_rate='constant', eta0=0.01) 33 | sgd_lr.fit(X_train_enc.toarray(), Y_train) 34 | 35 | pred = sgd_lr.predict_proba(X_test_enc.toarray())[:, 1] 36 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred))) 37 | 38 | 39 | 40 | # Feature selection with L1 regularization 41 | 42 | sgd_lr_l1 = SGDClassifier(loss='log', penalty='l1', alpha=0.0001, fit_intercept=True, n_iter=10, learning_rate='constant', eta0=0.01) 43 | sgd_lr_l1.fit(X_train_enc.toarray(), Y_train) 44 | 45 | coef_abs = np.abs(sgd_lr_l1.coef_) 46 | print(coef_abs) 47 | 48 | # bottom 10 weights and the corresponding 10 least important features 49 | print(np.sort(coef_abs)[0][:10]) 50 | 51 | feature_names = enc.get_feature_names() 52 | bottom_10 = np.argsort(coef_abs)[0][:10] 53 | print('10 least important features are:\n', feature_names[bottom_10]) 54 | 55 | # top 10 weights and the corresponding 10 most important features 56 | print(np.sort(coef_abs)[0][-10:]) 57 | top_10 = np.argsort(coef_abs)[0][-10:] 58 | print('10 most important features are:\n', feature_names[top_10]) 59 | 60 | 61 | 62 | # Online learning 63 | 64 | 65 | n_rows = 100000 * 11 66 | df = pd.read_csv("train", nrows=n_rows) 67 | 68 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 69 | Y = df['click'].values 70 | 71 | n_train = 100000 * 10 72 | X_train = X[:n_train] 73 | Y_train = Y[:n_train] 74 | X_test = X[n_train:] 75 | Y_test = Y[n_train:] 76 | 77 | from sklearn.preprocessing import OneHotEncoder 78 | enc = OneHotEncoder(handle_unknown='ignore') 79 | enc.fit(X_train) 80 | 81 | 82 | # The number of iterations is set to 1 if using partial_fit. 
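# (Each call to partial_fit below makes a single pass over the 100,000-sample chunk
# it is given, so the classifier is updated incrementally, chunk by chunk, rather
# than being refit on all one million training samples at once.)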
83 | sgd_lr_online = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=1, learning_rate='constant', eta0=0.01) 84 | 85 | import timeit 86 | start_time = timeit.default_timer() 87 | 88 | 89 | # Use the first 1,000,000 samples for training, and the next 100,000 for testing 90 | for i in range(10): 91 | x_train = X_train[i*100000:(i+1)*100000] 92 | y_train = Y_train[i*100000:(i+1)*100000] 93 | x_train_enc = enc.transform(x_train) 94 | sgd_lr_online.partial_fit(x_train_enc.toarray(), y_train, classes=[0, 1]) 95 | 96 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 97 | 98 | x_test_enc = enc.transform(X_test) 99 | 100 | pred = sgd_lr_online.predict_proba(x_test_enc.toarray())[:, 1] 101 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train * 10, roc_auc_score(Y_test, pred))) 102 | 103 | 104 | 105 | # Multiclass classification with logistic regression 106 | 107 | from sklearn import datasets 108 | digits = datasets.load_digits() 109 | n_samples = len(digits.images) 110 | X = digits.images.reshape((n_samples, -1)) 111 | Y = digits.target 112 | 113 | from sklearn.model_selection import train_test_split 114 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 115 | 116 | from sklearn.model_selection import GridSearchCV 117 | parameters = {'penalty': ['l2', None], 118 | 'alpha': [1e-07, 1e-06, 1e-05, 1e-04], 119 | 'eta0': [0.01, 0.1, 1, 10]} 120 | 121 | sgd_lr = SGDClassifier(loss='log', learning_rate='constant', eta0=0.01, fit_intercept=True, n_iter=10) 122 | 123 | grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=5) 124 | 125 | grid_search.fit(X_train, Y_train) 126 | print(grid_search.best_params_) 127 | 128 | sgd_lr_best = grid_search.best_estimator_ 129 | accuracy = sgd_lr_best.score(X_test, Y_test) 130 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 131 | -------------------------------------------------------------------------------- /Chapter08/ctr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 8: Scaling Up Learning On Massive Click Logs 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from pyspark.sql import SparkSession 8 | 9 | 10 | spark = SparkSession\ 11 | .builder\ 12 | .appName("CTR")\ 13 | .getOrCreate() 14 | 15 | 16 | 17 | from pyspark.sql.types import StructField, StringType, StructType, IntegerType 18 | 19 | schema = StructType([ 20 | StructField("id", StringType(), True), 21 | StructField("click", IntegerType(), True), 22 | StructField("hour", IntegerType(), True), 23 | StructField("C1", StringType(), True), 24 | StructField("banner_pos", StringType(), True), 25 | StructField("site_id", StringType(), True), 26 | StructField("site_domain", StringType(), True), 27 | StructField("site_category", StringType(), True), 28 | StructField("app_id", StringType(), True), 29 | StructField("app_domain", StringType(), True), 30 | StructField("app_category", StringType(), True), 31 | StructField("device_id", StringType(), True), 32 | StructField("device_ip", StringType(), True), 33 | StructField("device_model", StringType(), True), 34 | StructField("device_type", StringType(), True), 35 | StructField("device_conn_type", StringType(), True), 36 | StructField("C14", StringType(), True), 37 | StructField("C15", StringType(), True), 38 | StructField("C16", StringType(), True), 39 | StructField("C17", StringType(), True), 40 
| StructField("C18", StringType(), True), 41 | StructField("C19", StringType(), True), 42 | StructField("C20", StringType(), True), 43 | StructField("C21", StringType(), True), 44 | ]) 45 | 46 | 47 | # Download data in: https://www.kaggle.com/c/avazu-ctr-prediction/data 48 | df = spark.read.csv("filepath/train", schema=schema, header=True) 49 | 50 | 51 | df.printSchema() 52 | 53 | df.count() 54 | 55 | df = df.drop('id').drop('hour').drop('device_id').drop('device_ip') 56 | 57 | df = df.withColumnRenamed("click", "label") 58 | 59 | df.columns 60 | 61 | 62 | df_train, df_test = df.randomSplit([0.7, 0.3], 42) 63 | 64 | df_train.cache() 65 | df_train.count() 66 | 67 | df_test.cache() 68 | df_test.count() 69 | 70 | 71 | 72 | categorical = df_train.columns 73 | categorical.remove('label') 74 | print(categorical) 75 | 76 | 77 | 78 | 79 | 80 | from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator 81 | 82 | 83 | 84 | 85 | indexers = [ 86 | StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("keep") 87 | for c in categorical 88 | ] 89 | 90 | encoder = OneHotEncoderEstimator( 91 | inputCols=[indexer.getOutputCol() for indexer in indexers], 92 | outputCols=[ 93 | "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers] 94 | ) 95 | 96 | assembler = VectorAssembler( 97 | inputCols=encoder.getOutputCols(), 98 | outputCol="features" 99 | ) 100 | 101 | stages = indexers + [encoder, assembler] 102 | 103 | from pyspark.ml import Pipeline 104 | 105 | 106 | pipeline = Pipeline(stages=stages) 107 | 108 | 109 | one_hot_encoder = pipeline.fit(df_train) 110 | 111 | 112 | df_train_encoded = one_hot_encoder.transform(df_train) 113 | 114 | 115 | df_train_encoded.show() 116 | 117 | df_train_encoded = df_train_encoded.select(["label", "features"]) 118 | 119 | df_train_encoded.show() 120 | 121 | df_train_encoded.cache() 122 | 123 | df_train.unpersist() 124 | 125 | 126 | 127 | df_test_encoded = one_hot_encoder.transform(df_test) 128 | 129 | 130 | 131 | df_test_encoded = df_test_encoded.select(["label", "features"]) 132 | 133 | df_test_encoded.show() 134 | 135 | df_test_encoded.cache() 136 | 137 | df_test.unpersist() 138 | 139 | 140 | 141 | from pyspark.ml.classification import LogisticRegression 142 | 143 | classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) 144 | 145 | lr_model = classifier.fit(df_train_encoded) 146 | 147 | 148 | df_train_encoded.unpersist() 149 | 150 | predictions = lr_model.transform(df_test_encoded) 151 | 152 | df_test_encoded.unpersist() 153 | 154 | predictions.cache() 155 | 156 | predictions.show() 157 | 158 | 159 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 160 | 161 | ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC") 162 | print(ev.evaluate(predictions)) 163 | 164 | 165 | spark.stop() 166 | -------------------------------------------------------------------------------- /Chapter08/ctr_hashing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 8: Scaling Up Learning On Massive Click Logs 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from pyspark.sql import SparkSession 8 | 9 | 10 | spark = SparkSession\ 11 | .builder\ 12 | .appName("CTR")\ 13 | .getOrCreate() 14 | 15 | 16 | 17 | 18 | 19 | from pyspark.sql.types import StructField, StringType, StructType, IntegerType 20 | 21 | schema = 
StructType([ 22 | StructField("id", StringType(), True), 23 | StructField("click", IntegerType(), True), 24 | StructField("hour", IntegerType(), True), 25 | StructField("C1", StringType(), True), 26 | StructField("banner_pos", StringType(), True), 27 | StructField("site_id", StringType(), True), 28 | StructField("site_domain", StringType(), True), 29 | StructField("site_category", StringType(), True), 30 | StructField("app_id", StringType(), True), 31 | StructField("app_domain", StringType(), True), 32 | StructField("app_category", StringType(), True), 33 | StructField("device_id", StringType(), True), 34 | StructField("device_ip", StringType(), True), 35 | StructField("device_model", StringType(), True), 36 | StructField("device_type", StringType(), True), 37 | StructField("device_conn_type", StringType(), True), 38 | StructField("C14", StringType(), True), 39 | StructField("C15", StringType(), True), 40 | StructField("C16", StringType(), True), 41 | StructField("C17", StringType(), True), 42 | StructField("C18", StringType(), True), 43 | StructField("C19", StringType(), True), 44 | StructField("C20", StringType(), True), 45 | StructField("C21", StringType(), True), 46 | ]) 47 | 48 | 49 | 50 | df = spark.read.csv("file:///Users/hayden/dev/project/my_python2_book/ch7/train", schema=schema, header=True) 51 | 52 | 53 | df = df.drop('id').drop('hour').drop('device_id').drop('device_ip') 54 | 55 | df = df.withColumnRenamed("click", "label") 56 | 57 | 58 | df_train, df_test = df.randomSplit([0.7, 0.3], 42) 59 | 60 | df_train.cache() 61 | 62 | df_test.cache() 63 | 64 | 65 | 66 | categorical = df_train.columns 67 | categorical.remove('label') 68 | print(categorical) 69 | 70 | 71 | 72 | from pyspark.ml.feature import FeatureHasher 73 | hasher = FeatureHasher(numFeatures=10000, inputCols=categorical, 74 | outputCol="features") 75 | 76 | hasher.transform(df_train).select("features").show() 77 | 78 | from pyspark.ml.classification import LogisticRegression 79 | 80 | classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) 81 | 82 | stages = [hasher, classifier] 83 | 84 | from pyspark.ml import Pipeline 85 | 86 | pipeline = Pipeline(stages=stages) 87 | 88 | 89 | model = pipeline.fit(df_train) 90 | 91 | predictions = model.transform(df_test) 92 | 93 | 94 | predictions.cache() 95 | 96 | 97 | 98 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 99 | 100 | ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC") 101 | print(ev.evaluate(predictions)) 102 | 103 | 104 | spark.stop() 105 | -------------------------------------------------------------------------------- /Chapter08/ctr_interaction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 8: Scaling Up Learning On Massive Click Logs 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from pyspark.sql import SparkSession 8 | 9 | 10 | spark = SparkSession\ 11 | .builder\ 12 | .appName("CTR")\ 13 | .getOrCreate() 14 | 15 | 16 | 17 | 18 | 19 | from pyspark.sql.types import StructField, StringType, StructType, IntegerType 20 | 21 | schema = StructType([ 22 | StructField("id", StringType(), True), 23 | StructField("click", IntegerType(), True), 24 | StructField("hour", IntegerType(), True), 25 | StructField("C1", StringType(), True), 26 | StructField("banner_pos", StringType(), True), 27 | StructField("site_id", StringType(), True), 28 | 
StructField("site_domain", StringType(), True), 29 | StructField("site_category", StringType(), True), 30 | StructField("app_id", StringType(), True), 31 | StructField("app_domain", StringType(), True), 32 | StructField("app_category", StringType(), True), 33 | StructField("device_id", StringType(), True), 34 | StructField("device_ip", StringType(), True), 35 | StructField("device_model", StringType(), True), 36 | StructField("device_type", StringType(), True), 37 | StructField("device_conn_type", StringType(), True), 38 | StructField("C14", StringType(), True), 39 | StructField("C15", StringType(), True), 40 | StructField("C16", StringType(), True), 41 | StructField("C17", StringType(), True), 42 | StructField("C18", StringType(), True), 43 | StructField("C19", StringType(), True), 44 | StructField("C20", StringType(), True), 45 | StructField("C21", StringType(), True), 46 | ]) 47 | 48 | 49 | 50 | df = spark.read.csv("file:///Users/hayden/dev/project/my_python2_book/ch7/train", schema=schema, header=True) 51 | 52 | 53 | df = df.drop('id').drop('hour').drop('device_id').drop('device_ip') 54 | 55 | df = df.withColumnRenamed("click", "label") 56 | 57 | 58 | 59 | 60 | df_train, df_test = df.randomSplit([0.7, 0.3], 42) 61 | 62 | df_train.cache() 63 | 64 | df_test.cache() 65 | 66 | 67 | 68 | categorical = df_train.columns 69 | categorical.remove('label') 70 | print(categorical) 71 | 72 | 73 | 74 | cat_inter = ['C14', 'C15'] 75 | 76 | concat = '+'.join(categorical) 77 | interaction = ':'.join(cat_inter) 78 | formula = "label ~ " + concat + '+' + interaction 79 | 80 | print(formula) 81 | 82 | from pyspark.ml.feature import RFormula 83 | interactor = RFormula( 84 | formula=formula, 85 | featuresCol="features", 86 | labelCol="label").setHandleInvalid("keep") 87 | 88 | interactor.fit(df_train).transform(df_train).select("features").show() 89 | 90 | from pyspark.ml.classification import LogisticRegression 91 | 92 | classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) 93 | 94 | stages = [interactor, classifier] 95 | 96 | from pyspark.ml import Pipeline 97 | 98 | pipeline = Pipeline(stages=stages) 99 | 100 | 101 | model = pipeline.fit(df_train) 102 | 103 | predictions = model.transform(df_test) 104 | 105 | 106 | predictions.cache() 107 | 108 | predictions.show() 109 | 110 | 111 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 112 | 113 | ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC") 114 | print(ev.evaluate(predictions)) 115 | 116 | 117 | spark.stop() 118 | 119 | -------------------------------------------------------------------------------- /Chapter09/20051201_20051210.csv: -------------------------------------------------------------------------------- 1 | Date,Open,High,Low,Close,Adj Close,Volume 2 | 2005-12-01,10806.030273,10934.900391,10806.030273,10912.570313,10912.570313,256980000 3 | 2005-12-02,10912.009766,10921.370117,10861.660156,10877.509766,10877.509766,214900000 4 | 2005-12-05,10876.950195,10876.950195,10810.669922,10835.009766,10835.009766,237340000 5 | 2005-12-06,10835.410156,10936.200195,10835.410156,10856.860352,10856.860352,264630000 6 | 2005-12-07,10856.860352,10868.059570,10764.009766,10810.910156,10810.910156,243490000 7 | 2005-12-08,10808.429688,10847.250000,10729.669922,10755.120117,10755.120117,253290000 8 | 2005-12-09,10751.759766,10805.950195,10729.910156,10778.580078,10778.580078,238930000 9 | -------------------------------------------------------------------------------- 
/Chapter09/decision_tree_regression.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | import numpy as np 7 | 8 | 9 | # Mean squared error calculation function given continuous targets of a data set, 10 | def mse(targets): 11 | # When the set is empty 12 | if targets.size == 0: 13 | return 0 14 | return np.var(targets) 15 | 16 | def weighted_mse(groups): 17 | """ Calculate weighted MSE of children after a split 18 | Args: 19 | groups (list of children, and a child consists a list of targets) 20 | Returns: 21 | float, weighted impurity 22 | """ 23 | total = sum(len(group) for group in groups) 24 | weighted_sum = 0.0 25 | for group in groups: 26 | weighted_sum += len(group) / float(total) * mse(group) 27 | return weighted_sum 28 | 29 | 30 | print('{0:.4f}'.format(mse(np.array([1, 2, 3])))) 31 | print('{0:.4f}'.format(weighted_mse([np.array([1, 2, 3]), np.array([1, 2])]))) 32 | 33 | print('type-semi: {0:.4f}'.format(weighted_mse([np.array([600, 400, 700]), np.array([700, 800])]))) 34 | print('bedroom-2: {0:.4f}'.format(weighted_mse([np.array([700, 400]), np.array([600, 800, 700])]))) 35 | print('bedroom-3: {0:.4f}'.format(weighted_mse([np.array([600, 800]), np.array([700, 400, 700])]))) 36 | print('bedroom-4: {0:.4f}'.format(weighted_mse([np.array([700]), np.array([600, 700, 800, 400])]))) 37 | 38 | 39 | print('bedroom-2: {0:.4f}'.format(weighted_mse([np.array([]), np.array([600, 400, 700])]))) 40 | print('bedroom-3: {0:.4f}'.format(weighted_mse([np.array([400]), np.array([600, 700])]))) 41 | print('bedroom-4: {0:.4f}'.format(weighted_mse([np.array([400, 600]), np.array([700])]))) 42 | 43 | 44 | 45 | 46 | def split_node(X, y, index, value): 47 | """ Split data set X, y based on a feature and a value 48 | Args: 49 | X, y (numpy.ndarray, data set) 50 | index (int, index of the feature used for splitting) 51 | value (value of the feature used for splitting) 52 | Returns: 53 | list, list: left and right child, a child is in the format of [X, y] 54 | """ 55 | x_index = X[:, index] 56 | # if this feature is numerical 57 | if type(X[0, index]) in [int, float]: 58 | mask = x_index >= value 59 | # if this feature is categorical 60 | else: 61 | mask = x_index == value 62 | # split into left and right child 63 | left = [X[~mask, :], y[~mask]] 64 | right = [X[mask, :], y[mask]] 65 | return left, right 66 | 67 | 68 | def get_best_split(X, y): 69 | """ Obtain the best splitting point and resulting children for the data set X, y 70 | Args: 71 | X, y (numpy.ndarray, data set) 72 | criterion (gini or entropy) 73 | Returns: 74 | dict {index: index of the feature, value: feature value, children: left and right children} 75 | """ 76 | best_index, best_value, best_score, children = None, None, 1e10, None 77 | for index in range(len(X[0])): 78 | for value in np.sort(np.unique(X[:, index])): 79 | groups = split_node(X, y, index, value) 80 | impurity = weighted_mse([groups[0][1], groups[1][1]]) 81 | if impurity < best_score: 82 | best_index, best_value, best_score, children = index, value, impurity, groups 83 | return {'index': best_index, 'value': best_value, 'children': children} 84 | 85 | 86 | 87 | def get_leaf(targets): 88 | # Obtain the leaf as the mean of the targets 89 | return np.mean(targets) 90 | 91 | 92 | 93 | def split(node, max_depth, min_size, depth): 94 | """ Split 
children of a node to construct new nodes or assign them terminals 95 | Args: 96 | node (dict, with children info) 97 | max_depth (int, maximal depth of the tree) 98 | min_size (int, minimal samples required to further split a child) 99 | depth (int, current depth of the node) 100 | """ 101 | left, right = node['children'] 102 | del (node['children']) 103 | if left[1].size == 0: 104 | node['right'] = get_leaf(right[1]) 105 | return 106 | if right[1].size == 0: 107 | node['left'] = get_leaf(left[1]) 108 | return 109 | # Check if the current depth exceeds the maximal depth 110 | if depth >= max_depth: 111 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 112 | return 113 | # Check if the left child has enough samples 114 | if left[1].size <= min_size: 115 | node['left'] = get_leaf(left[1]) 116 | else: 117 | # It has enough samples, we further split it 118 | result = get_best_split(left[0], left[1]) 119 | result_left, result_right = result['children'] 120 | if result_left[1].size == 0: 121 | node['left'] = get_leaf(result_right[1]) 122 | elif result_right[1].size == 0: 123 | node['left'] = get_leaf(result_left[1]) 124 | else: 125 | node['left'] = result 126 | split(node['left'], max_depth, min_size, depth + 1) 127 | # Check if the right child has enough samples 128 | if right[1].size <= min_size: 129 | node['right'] = get_leaf(right[1]) 130 | else: 131 | # It has enough samples, we further split it 132 | result = get_best_split(right[0], right[1]) 133 | result_left, result_right = result['children'] 134 | if result_left[1].size == 0: 135 | node['right'] = get_leaf(result_right[1]) 136 | elif result_right[1].size == 0: 137 | node['right'] = get_leaf(result_left[1]) 138 | else: 139 | node['right'] = result 140 | split(node['right'], max_depth, min_size, depth + 1) 141 | 142 | 143 | def train_tree(X_train, y_train, max_depth, min_size): 144 | """ Construction of a tree starts here 145 | Args: 146 | X_train, y_train (list, list, training data) 147 | max_depth (int, maximal depth of the tree) 148 | min_size (int, minimal samples required to further split a child) 149 | """ 150 | root = get_best_split(X_train, y_train) 151 | split(root, max_depth, min_size, 1) 152 | return root 153 | 154 | 155 | 156 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 157 | 'categorical': {'yes': 'is', 'no': 'is not'}} 158 | def visualize_tree(node, depth=0): 159 | if isinstance(node, dict): 160 | if type(node['value']) in [int, float]: 161 | condition = CONDITION['numerical'] 162 | else: 163 | condition = CONDITION['categorical'] 164 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 165 | if 'left' in node: 166 | visualize_tree(node['left'], depth + 1) 167 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 168 | if 'right' in node: 169 | visualize_tree(node['right'], depth + 1) 170 | else: 171 | print('{}[{}]'.format(depth * ' ', node)) 172 | 173 | 174 | X_train = np.array([['semi', 3], 175 | ['detached', 2], 176 | ['detached', 3], 177 | ['semi', 2], 178 | ['semi', 4]], dtype=object) 179 | 180 | y_train = np.array([600, 700, 800, 400, 700]) 181 | 182 | tree = train_tree(X_train, y_train, 2, 2) 183 | visualize_tree(tree) 184 | 185 | 186 | 187 | # Directly use DecisionTreeRegressor from scikit-learn 188 | from sklearn import datasets 189 | boston = datasets.load_boston() 190 | 191 | num_test = 10 # the last 10 samples as testing set 192 | X_train = boston.data[:-num_test, :] 193 | y_train = 
boston.target[:-num_test] 194 | X_test = boston.data[-num_test:, :] 195 | y_test = boston.target[-num_test:] 196 | 197 | from sklearn.tree import DecisionTreeRegressor 198 | regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=3) 199 | 200 | regressor.fit(X_train, y_train) 201 | predictions = regressor.predict(X_test) 202 | print(predictions) 203 | print(y_test) 204 | 205 | 206 | from sklearn.ensemble import RandomForestRegressor 207 | regressor = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=3) 208 | regressor.fit(X_train, y_train) 209 | predictions = regressor.predict(X_test) 210 | print(predictions) 211 | 212 | 213 | 214 | 215 | 216 | import tensorflow as tf 217 | from tensorflow.contrib.tensor_forest.python import tensor_forest 218 | from tensorflow.python.ops import resources 219 | 220 | 221 | n_iter = 20 222 | n_features = int(X_train.shape[1]) 223 | n_trees = 10 224 | max_nodes = 30000 225 | 226 | 227 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 228 | y = tf.placeholder(tf.float32, shape=[None]) 229 | 230 | 231 | hparams = tensor_forest.ForestHParams(num_classes=1, regression=True, num_features=n_features, num_trees=n_trees, 232 | max_nodes=max_nodes, split_after_samples=30).fill() 233 | 234 | 235 | forest_graph = tensor_forest.RandomForestGraphs(hparams) 236 | 237 | 238 | train_op = forest_graph.training_graph(x, y) 239 | loss_op = forest_graph.training_loss(x, y) 240 | 241 | 242 | infer_op, _, _ = forest_graph.inference_graph(x) 243 | 244 | cost = tf.losses.mean_squared_error(labels=y, predictions=infer_op[:, 0]) 245 | 246 | 247 | 248 | init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), resources.initialize_resources(resources.shared_resources())) 249 | 250 | sess = tf.Session() 251 | 252 | sess.run(init_vars) 253 | 254 | 255 | for i in range(1, n_iter + 1): 256 | _, c = sess.run([train_op, cost], feed_dict={x: X_train, y: y_train}) 257 | print('Iteration %i, training loss: %f' % (i, c)) 258 | 259 | 260 | pred = sess.run(infer_op, feed_dict={x: X_test})[:, 0] 261 | print(pred) 262 | -------------------------------------------------------------------------------- /Chapter09/get_dji_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | 9 | 10 | # download from https://finance.yahoo.com/quote/%5EDJI/history?period1=1133413200&period2=1134190800&interval=1d&filter=history&frequency=1d 11 | mydata = pd.read_csv('20051201_20051210.csv', index_col='Date') 12 | 13 | 14 | 15 | 16 | def generate_features(df): 17 | """ 18 | Generate features for a stock/index based on historical price and performance 19 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adjusted Close" 20 | @return: dataframe, data set with new features 21 | """ 22 | df_new = pd.DataFrame() 23 | # 6 original features 24 | df_new['open'] = df['Open'] 25 | df_new['open_1'] = df['Open'].shift(1) 26 | df_new['close_1'] = df['Close'].shift(1) 27 | df_new['high_1'] = df['High'].shift(1) 28 | df_new['low_1'] = df['Low'].shift(1) 29 | df_new['volume_1'] = df['Volume'].shift(1) 30 | # 31 generated features 31 | # average price 32 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 33 | df_new['avg_price_30'] = 
df['Close'].rolling(21).mean().shift(1) 34 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 35 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 36 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 37 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 38 | # average volume 39 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 40 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 41 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 42 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 43 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 44 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 45 | # standard deviation of prices 46 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 47 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 48 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 49 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 50 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 51 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 52 | # standard deviation of volumes 53 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 54 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 55 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 56 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 57 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 58 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 59 | # # return 60 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 61 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 62 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 63 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 64 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 65 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 66 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 67 | # the target 68 | df_new['close'] = df['Close'] 69 | df_new = df_new.dropna(axis=0) 70 | return df_new 71 | 72 | 73 | 74 | data_raw = pd.read_csv('19880101_20161231.csv', index_col='Date') 75 | data = generate_features(data_raw) 76 | print(data.round(decimals=3).head(5)) 77 | -------------------------------------------------------------------------------- /Chapter09/linear_regression.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | import numpy as np 7 | 8 | 9 | def compute_prediction(X, weights): 10 | """ Compute the prediction y_hat based on current weights 11 | Args: 12 | X (numpy.ndarray) 13 | weights (numpy.ndarray) 14 | Returns: 15 | numpy.ndarray, y_hat of X under weights 16 | """ 17 | predictions = np.dot(X, weights) 18 | return predictions 
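# The weight update in update_weights_gd below is batch gradient descent on the
# mean squared error J(w) = (1 / (2m)) * sum((Xw - y)^2), whose gradient is
# (1 / m) * X^T (y_hat - y); the step is therefore
#     w := w + (learning_rate / m) * X^T (y - y_hat)
# A quick sanity check of compute_prediction with made-up values (not from the book):
#     compute_prediction(np.array([[1., 2.]]), np.array([0.5, 0.5]))  # -> array([1.5])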
19 | 20 | 21 | def update_weights_gd(X_train, y_train, weights, learning_rate): 22 | """ Update weights by one step 23 | Args: 24 | X_train, y_train (numpy.ndarray, training data set) 25 | weights (numpy.ndarray) 26 | learning_rate (float) 27 | Returns: 28 | numpy.ndarray, updated weights 29 | """ 30 | predictions = compute_prediction(X_train, weights) 31 | weights_delta = np.dot(X_train.T, y_train - predictions) 32 | m = y_train.shape[0] 33 | weights += learning_rate / float(m) * weights_delta 34 | return weights 35 | 36 | 37 | def compute_cost(X, y, weights): 38 | """ Compute the cost J(w) 39 | Args: 40 | X, y (numpy.ndarray, data set) 41 | weights (numpy.ndarray) 42 | Returns: 43 | float 44 | """ 45 | predictions = compute_prediction(X, weights) 46 | cost = np.mean((predictions - y) ** 2 / 2.0) 47 | return cost 48 | 49 | 50 | def train_linear_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 51 | """ Train a linear regression model with gradient descent 52 | Args: 53 | X_train, y_train (numpy.ndarray, training data set) 54 | max_iter (int, number of iterations) 55 | learning_rate (float) 56 | fit_intercept (bool, with an intercept w0 or not) 57 | Returns: 58 | numpy.ndarray, learned weights 59 | """ 60 | if fit_intercept: 61 | intercept = np.ones((X_train.shape[0], 1)) 62 | X_train = np.hstack((intercept, X_train)) 63 | weights = np.zeros(X_train.shape[1]) 64 | for iteration in range(max_iter): 65 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 66 | # Check the cost for every 100 (for example) iterations 67 | if iteration % 100 == 0: 68 | print(compute_cost(X_train, y_train, weights)) 69 | return weights 70 | 71 | 72 | def predict(X, weights): 73 | if X.shape[1] == weights.shape[0] - 1: 74 | intercept = np.ones((X.shape[0], 1)) 75 | X = np.hstack((intercept, X)) 76 | return compute_prediction(X, weights) 77 | 78 | 79 | # A small example 80 | X_train = np.array([[6], [2], [3], [4], [1], [5], [2], [6], [4], [7]]) 81 | 82 | y_train = np.array([5.5, 1.6, 2.2, 3.7, 0.8, 5.2, 1.5, 5.3, 4.4, 6.8]) 83 | 84 | weights = train_linear_regression(X_train, y_train, max_iter=100, learning_rate=0.01, fit_intercept=True) 85 | 86 | X_test = np.array([[1.3], [3.5], [5.2], [2.8]]) 87 | 88 | predictions = predict(X_test, weights) 89 | 90 | import matplotlib.pyplot as plt 91 | plt.scatter(X_train[:, 0], y_train, marker='o', c='b') 92 | plt.scatter(X_test[:, 0], predictions, marker='*', c='k') 93 | plt.xlabel('x') 94 | plt.ylabel('y') 95 | plt.show() 96 | 97 | 98 | # The diabetes example 99 | from sklearn import datasets 100 | diabetes = datasets.load_diabetes() 101 | print(diabetes.data.shape) 102 | 103 | num_test = 30 104 | X_train = diabetes.data[:-num_test, :] 105 | y_train = diabetes.target[:-num_test] 106 | 107 | weights = train_linear_regression(X_train, y_train, max_iter=5000, learning_rate=1, fit_intercept=True) 108 | 109 | X_test = diabetes.data[-num_test:, :] 110 | y_test = diabetes.target[-num_test:] 111 | 112 | predictions = predict(X_test, weights) 113 | 114 | print(predictions) 115 | print(y_test) 116 | 117 | 118 | 119 | # Directly use SGDRegressor from scikit-learn 120 | from sklearn.linear_model import SGDRegressor 121 | regressor = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, learning_rate='constant', eta0=0.01, n_iter=1000) 122 | regressor.fit(X_train, y_train) 123 | predictions = regressor.predict(X_test) 124 | print(predictions) 125 | 126 | 127 | 128 | # TensorFlow implementation of linear regression 129 | 130 | import 
tensorflow as tf 131 | n_features = int(X_train.shape[1]) 132 | learning_rate = 0.5 133 | n_iter = 1000 134 | 135 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 136 | y = tf.placeholder(tf.float32, shape=[None]) 137 | W = tf.Variable(tf.ones([n_features, 1])) 138 | b = tf.Variable(tf.zeros([1])) 139 | 140 | pred = tf.add(tf.matmul(x, W), b)[:, 0] 141 | 142 | 143 | cost = tf.losses.mean_squared_error(labels=y, predictions=pred) 144 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 145 | 146 | init_vars = tf.initialize_all_variables() 147 | sess = tf.Session() 148 | sess.run(init_vars) 149 | 150 | for i in range(1, n_iter+1): 151 | _, c = sess.run([optimizer, cost], feed_dict={x: X_train, y: y_train}) 152 | if i % 100 == 0: 153 | print('Iteration %i, training loss: %f' % (i, c)) 154 | 155 | predictions = sess.run(pred, feed_dict={x: X_test}) 156 | print(predictions) 157 | 158 | -------------------------------------------------------------------------------- /Chapter09/neural_network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | import numpy as np 7 | 8 | 9 | def sigmoid(z): 10 | return 1.0 / (1 + np.exp(-z)) 11 | 12 | 13 | def sigmoid_derivative(z): 14 | return sigmoid(z) * (1.0 - sigmoid(z)) 15 | 16 | 17 | 18 | def train(X, y, n_hidden, learning_rate, n_iter): 19 | m, n_input = X.shape 20 | W1 = np.random.randn(n_input, n_hidden) 21 | b1 = np.zeros((1, n_hidden)) 22 | W2 = np.random.randn(n_hidden, 1) 23 | b2 = np.zeros((1, 1)) 24 | for i in range(1, n_iter+1): 25 | Z2 = np.matmul(X, W1) + b1 26 | A2 = sigmoid(Z2) 27 | Z3 = np.matmul(A2, W2) + b2 28 | A3 = Z3 29 | 30 | dZ3 = A3 - y 31 | dW2 = np.matmul(A2.T, dZ3) 32 | db2 = np.sum(dZ3, axis=0, keepdims=True) 33 | 34 | dZ2 = np.matmul(dZ3, W2.T) * sigmoid_derivative(Z2) 35 | dW1 = np.matmul(X.T, dZ2) 36 | db1 = np.sum(dZ2, axis=0) 37 | 38 | W2 = W2 - learning_rate * dW2 / m 39 | b2 = b2 - learning_rate * db2 / m 40 | W1 = W1 - learning_rate * dW1 / m 41 | b1 = b1 - learning_rate * db1 / m 42 | 43 | if i % 100 == 0: 44 | cost = np.mean((y - A3) ** 2) 45 | print('Iteration %i, training loss: %f' % (i, cost)) 46 | 47 | model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2} 48 | return model 49 | 50 | 51 | def predict(x, model): 52 | W1 = model['W1'] 53 | b1 = model['b1'] 54 | W2 = model['W2'] 55 | b2 = model['b2'] 56 | A2 = sigmoid(np.matmul(x, W1) + b1) 57 | A3 = np.matmul(A2, W2) + b2 58 | return A3 59 | 60 | 61 | 62 | from sklearn import datasets 63 | boston = datasets.load_boston() 64 | num_test = 10 # the last 10 samples as testing set 65 | 66 | from sklearn import preprocessing 67 | scaler = preprocessing.StandardScaler() 68 | 69 | X_train = boston.data[:-num_test, :] 70 | X_train = scaler.fit_transform(X_train) 71 | y_train = boston.target[:-num_test].reshape(-1, 1) 72 | X_test = boston.data[-num_test:, :] 73 | X_test = scaler.transform(X_test) 74 | y_test = boston.target[-num_test:] 75 | 76 | 77 | n_hidden = 20 78 | learning_rate = 0.1 79 | n_iter = 2000 80 | 81 | model = train(X_train, y_train, n_hidden, learning_rate, n_iter) 82 | predictions = predict(X_test, model) 83 | print(predictions) 84 | print(y_test) 85 | 86 | 87 | 88 | 89 | # Scikit-learn implementation of neural network 90 | 91 | from sklearn.neural_network import MLPRegressor 92 | nn_scikit = 
MLPRegressor(hidden_layer_sizes=(20, 8), activation='logistic', solver='lbfgs', 93 | learning_rate_init=0.1, random_state=42, max_iter=2000) 94 | nn_scikit.fit(X_train, y_train) 95 | predictions = nn_scikit.predict(X_test) 96 | print(predictions) 97 | print(np.mean((y_test - predictions) ** 2)) 98 | 99 | 100 | # TensorFlow implementation of neural network 101 | 102 | import tensorflow as tf 103 | n_features = int(X_train.shape[1]) 104 | n_hidden_1 = 20 105 | n_hidden_2 = 8 106 | 107 | learning_rate = 0.1 108 | n_iter = 2000 109 | 110 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 111 | y = tf.placeholder(tf.float32, shape=[None, 1]) 112 | 113 | layer_1 = tf.nn.sigmoid(tf.layers.dense(x, n_hidden_1)) 114 | layer_2 = tf.nn.sigmoid(tf.layers.dense(layer_1, n_hidden_2)) 115 | pred = tf.layers.dense(layer_2, 1) 116 | 117 | 118 | cost = tf.losses.mean_squared_error(labels=y, predictions=pred) 119 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 120 | 121 | init_vars = tf.initialize_all_variables() 122 | sess = tf.Session() 123 | sess.run(init_vars) 124 | 125 | for i in range(1, n_iter+1): 126 | _, c = sess.run([optimizer, cost], feed_dict={x: X_train, y: y_train}) 127 | if i % 100 == 0: 128 | print('Iteration %i, training loss: %f' % (i, c)) 129 | 130 | predictions = sess.run(pred, feed_dict={x: X_test}) 131 | print(predictions) 132 | 133 | 134 | 135 | 136 | # Keras implementation of neural network 137 | 138 | 139 | from keras import models 140 | from keras import layers 141 | 142 | model = models.Sequential() 143 | model.add(layers.Dense(n_hidden_1, activation="sigmoid", input_shape=(n_features, ))) 144 | model.add(layers.Dense(n_hidden_2, activation="sigmoid")) 145 | model.add(layers.Dense(1)) 146 | 147 | 148 | from keras import optimizers 149 | sgd = optimizers.SGD(lr=0.01) 150 | model.compile(loss='mean_squared_error', optimizer=sgd) 151 | 152 | 153 | model.fit( 154 | X_train, y_train, 155 | epochs=100, 156 | validation_data=(X_test, y_test) 157 | ) 158 | 159 | predictions = model.predict(X_test) 160 | print(predictions) 161 | -------------------------------------------------------------------------------- /Chapter09/regression_evaluation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn import datasets 8 | diabetes = datasets.load_diabetes() 9 | num_test = 30 # the last 30 samples as testing set 10 | X_train = diabetes.data[:-num_test, :] 11 | y_train = diabetes.target[:-num_test] 12 | X_test = diabetes.data[-num_test:, :] 13 | y_test = diabetes.target[-num_test:] 14 | param_grid = { 15 | "alpha": [1e-07, 1e-06, 1e-05], 16 | "penalty": [None, "l2"], 17 | "eta0": [0.001, 0.005, 0.01], 18 | "n_iter": [300, 1000, 3000] 19 | } 20 | 21 | from sklearn.linear_model import SGDRegressor 22 | from sklearn.model_selection import GridSearchCV 23 | regressor = SGDRegressor(loss='squared_loss', 24 | learning_rate='constant') 25 | grid_search = GridSearchCV(regressor, param_grid, cv=3) 26 | 27 | grid_search.fit(X_train, y_train) 28 | print(grid_search.best_params_) 29 | 30 | regressor_best = grid_search.best_estimator_ 31 | 32 | 33 | predictions = regressor_best.predict(X_test) 34 | 35 | 36 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 37 | print(mean_squared_error(y_test, 
predictions)) 38 | 39 | print(mean_absolute_error(y_test, predictions)) 40 | 41 | print(r2_score(y_test, predictions)) 42 | 43 | -------------------------------------------------------------------------------- /Chapter09/stock_prediction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | from sklearn.model_selection import GridSearchCV 9 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | 13 | def generate_features(df): 14 | """ 15 | Generate features for a stock/index based on historical price and performance 16 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adjusted Close" 17 | @return: dataframe, data set with new features 18 | """ 19 | df_new = pd.DataFrame() 20 | # 6 original features 21 | df_new['open'] = df['Open'] 22 | df_new['open_1'] = df['Open'].shift(1) 23 | df_new['close_1'] = df['Close'].shift(1) 24 | df_new['high_1'] = df['High'].shift(1) 25 | df_new['low_1'] = df['Low'].shift(1) 26 | df_new['volume_1'] = df['Volume'].shift(1) 27 | # 31 generated features 28 | # average price 29 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 30 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 31 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 32 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 33 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 34 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 35 | # average volume 36 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 37 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 38 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 39 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 40 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 41 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 42 | # standard deviation of prices 43 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 44 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 45 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 46 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 47 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 48 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 49 | # standard deviation of volumes 50 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 51 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 52 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 53 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 54 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 55 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 56 | # # return 57 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 58 | df_new['return_5'] = ((df['Close'] - 
df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 59 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 60 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 61 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 62 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 63 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 64 | # the target 65 | df_new['close'] = df['Close'] 66 | df_new = df_new.dropna(axis=0) 67 | return df_new 68 | 69 | 70 | data_raw = pd.read_csv('19880101_20161231.csv', index_col='Date') 71 | data = generate_features(data_raw) 72 | 73 | start_train = '1988-01-01' 74 | end_train = '2015-12-31' 75 | 76 | start_test = '2016-01-01' 77 | end_test = '2016-12-31' 78 | 79 | data_train = data.loc[start_train:end_train] 80 | X_train = data_train.drop('close', axis=1).values 81 | y_train = data_train['close'].values 82 | 83 | print(X_train.shape) 84 | print(y_train.shape) 85 | 86 | data_test = data.loc[start_test:end_test] 87 | X_test = data_test.drop('close', axis=1).values 88 | y_test = data_test['close'].values 89 | 90 | print(X_test.shape) 91 | 92 | 93 | # First experiment with linear regression 94 | 95 | scaler = StandardScaler() 96 | 97 | X_scaled_train = scaler.fit_transform(X_train) 98 | X_scaled_test = scaler.transform(X_test) 99 | 100 | param_grid = { 101 | "alpha": [1e-5, 3e-5, 1e-4], 102 | "eta0": [0.01, 0.03, 0.1], 103 | } 104 | 105 | 106 | from sklearn.linear_model import SGDRegressor 107 | lr = SGDRegressor(penalty='l2', n_iter=1000) 108 | grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='r2') 109 | grid_search.fit(X_scaled_train, y_train) 110 | 111 | print(grid_search.best_params_) 112 | 113 | lr_best = grid_search.best_estimator_ 114 | 115 | predictions_lr = lr_best.predict(X_scaled_test) 116 | 117 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_lr))) 118 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_lr))) 119 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_lr))) 120 | 121 | 122 | # Experiment with random forest 123 | 124 | param_grid = { 125 | 'max_depth': [50, 70, 80], 126 | 'min_samples_split': [5, 10], 127 | 'max_features': ['auto', 'sqrt'], 128 | 'min_samples_leaf': [3, 5] 129 | 130 | } 131 | 132 | 133 | from sklearn.ensemble import RandomForestRegressor 134 | 135 | rf = RandomForestRegressor(n_estimators=100, n_jobs=-1) 136 | grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1) 137 | grid_search.fit(X_train, y_train) 138 | 139 | print(grid_search.best_params_) 140 | rf_best = grid_search.best_estimator_ 141 | 142 | predictions_rf = rf_best.predict(X_test) 143 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_rf))) 144 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_rf))) 145 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_rf))) 146 | 147 | 148 | # Experiment with SVR 149 | 150 | param_grid = [ 151 | {'kernel': ['linear'], 'C': [100, 300, 500], 'epsilon': [0.00003, 0.0001]}, 152 | {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [10, 100, 1000], 'epsilon': [0.00003, 0.0001]} 153 | ] 154 | 155 | 156 | from sklearn.svm import SVR 157 | 158 | svr = SVR() 159 | grid_search = GridSearchCV(svr, param_grid, cv=2, scoring='r2') 160 | grid_search.fit(X_scaled_train, y_train) 161 | 162 | print(grid_search.best_params_) 163 | 164 | svr_best = grid_search.best_estimator_ 165 
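# Note: the SVR, like the SGD-based linear model above, is fit on the standardized features
# (X_scaled_train / X_scaled_test) because kernel methods are sensitive to feature scale,
# whereas the tree-based random forest is fit on the raw features; cv=2 (rather than 5)
# likely just keeps the grid search affordable given the slower SVR fits.
# Optionally, the cross-validated R^2 of the winning SVR configuration can be reported
# before scoring on the 2016 hold-out year:
print('Best CV R^2: {0:.3f}'.format(grid_search.best_score_))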
| 166 | predictions_svr = svr_best.predict(X_scaled_test) 167 | 168 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_svr))) 169 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_svr))) 170 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_svr))) 171 | 172 | 173 | # Experiment with neural network 174 | 175 | param_grid = { 176 | 'hidden_layer_sizes': [(50, 10), (30, 30)], 177 | 'activation': ['logistic', 'tanh', 'relu'], 178 | 'solver': ['sgd', 'adam'], 179 | 'learning_rate_init': [0.0001, 0.0003, 0.001, 0.01], 180 | 'alpha': [0.00003, 0.0001, 0.0003], 181 | 'batch_size': [30, 50] 182 | } 183 | 184 | 185 | from sklearn.neural_network import MLPRegressor 186 | 187 | nn = MLPRegressor(random_state=42, max_iter=2000) 188 | grid_search = GridSearchCV(nn, param_grid, cv=2, scoring='r2', n_jobs=-1) 189 | grid_search.fit(X_scaled_train, y_train) 190 | 191 | 192 | print(grid_search.best_params_) 193 | 194 | nn_best = grid_search.best_estimator_ 195 | 196 | predictions_nn = nn_best.predict(X_scaled_test) 197 | 198 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_nn))) 199 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_nn))) 200 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_nn))) 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /Chapter09/svr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn import datasets 8 | boston = datasets.load_boston() 9 | 10 | num_test = 10 # the last 10 samples as testing set 11 | X_train = boston.data[:-num_test, :] 12 | y_train = boston.target[:-num_test] 13 | X_test = boston.data[-num_test:, :] 14 | y_test = boston.target[-num_test:] 15 | 16 | from sklearn.svm import SVR 17 | regressor = SVR(C=0.1, epsilon=0.02, kernel='linear') 18 | 19 | regressor.fit(X_train, y_train) 20 | predictions = regressor.predict(X_test) 21 | print(predictions) 22 | -------------------------------------------------------------------------------- /Chapter10/dimensionality_reduction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | 9 | from sklearn.datasets import load_digits 10 | dataset = load_digits() 11 | X, y = dataset.data, dataset.target 12 | 13 | from sklearn.svm import SVC 14 | from sklearn.model_selection import cross_val_score 15 | 16 | 17 | 18 | from sklearn.decomposition import PCA 19 | 20 | # Keep different number of top components 21 | N = [10, 15, 25, 35, 45] 22 | for n in N: 23 | pca = PCA(n_components=n) 24 | X_n_kept = pca.fit_transform(X) 25 | # Estimate accuracy on the data set with top n components 26 | classifier = SVC(gamma=0.005) 27 | score_n_components = cross_val_score(classifier, X_n_kept, y).mean() 28 | print('Score with the data set of top {0} components: {1:.2f}'.format(n, score_n_components)) -------------------------------------------------------------------------------- /Chapter10/feature_selection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning 
By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | import numpy as np 9 | from sklearn.datasets import load_digits 10 | dataset = load_digits() 11 | X, y = dataset.data, dataset.target 12 | print(X.shape) 13 | 14 | # Estimate accuracy on the original data set 15 | from sklearn.svm import SVC 16 | from sklearn.model_selection import cross_val_score 17 | classifier = SVC(gamma=0.005) 18 | score = cross_val_score(classifier, X, y).mean() 19 | print('Score with the original data set: {0:.2f}'.format(score)) 20 | 21 | 22 | # Feature selection with random forest 23 | from sklearn.ensemble import RandomForestClassifier 24 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1) 25 | random_forest.fit(X, y) 26 | 27 | # Sort features based on their importances 28 | feature_sorted = np.argsort(random_forest.feature_importances_) 29 | 30 | # Select different number of top features 31 | K = [10, 15, 25, 35, 45] 32 | for k in K: 33 | top_K_features = feature_sorted[-k:] 34 | X_k_selected = X[:, top_K_features] 35 | # Estimate accuracy on the data set with k selected features 36 | classifier = SVC(gamma=0.005) 37 | score_k_features = cross_val_score(classifier, X_k_selected, y).mean() 38 | print('Score with the data set of top {0} features: {1:.2f}'.format(k, score_k_features)) 39 | 40 | -------------------------------------------------------------------------------- /Chapter10/generic_feature_engineering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.preprocessing import Binarizer 8 | 9 | X = [[4], [1], [3], [0]] 10 | binarizer = Binarizer(threshold=2.9) 11 | X_new = binarizer.fit_transform(X) 12 | print(X_new) 13 | 14 | 15 | 16 | 17 | from sklearn.preprocessing import PolynomialFeatures 18 | 19 | X = [[2, 4], 20 | [1, 3], 21 | [3, 2], 22 | [0, 3]] 23 | poly = PolynomialFeatures(degree=2) 24 | X_new = poly.fit_transform(X) 25 | print(X_new) 26 | -------------------------------------------------------------------------------- /Chapter10/imputation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | import numpy as np 9 | from sklearn.preprocessing import Imputer 10 | 11 | 12 | data_origin = [[30, 100], 13 | [20, 50], 14 | [35, np.nan], 15 | [25, 80], 16 | [30, 70], 17 | [40, 60]] 18 | 19 | 20 | imp_mean = Imputer(missing_values='NaN', strategy='mean') 21 | imp_mean.fit(data_origin) 22 | data_mean_imp = imp_mean.transform(data_origin) 23 | print(data_mean_imp) 24 | 25 | 26 | imp_median = Imputer(missing_values='NaN', strategy='median') 27 | imp_median.fit(data_origin) 28 | data_median_imp = imp_median.transform(data_origin) 29 | print(data_median_imp) 30 | 31 | # New samples 32 | new = [[20, np.nan], 33 | [30, np.nan], 34 | [np.nan, 70], 35 | [np.nan, np.nan]] 36 | new_mean_imp = imp_mean.transform(new) 37 | print(new_mean_imp) 38 | 39 | 40 | 41 | # Effects of discarding missing values and imputation 42 | from sklearn import datasets 43 | dataset = datasets.load_diabetes() 44 | X_full, y = dataset.data, dataset.target 45 | 46 | 
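# The experiment below simulates incomplete data: roughly 25% of the samples have one
# randomly chosen feature masked with np.nan. Three strategies are then compared using
# the cross-validated R^2 of a random forest regressor: dropping the affected samples,
# imputing with the feature mean, and (as a reference) the untouched full data set.
# Note that in scikit-learn 0.22+ the Imputer class used in this script has moved;
# sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean') is the drop-in
# equivalent of Imputer(missing_values='NaN', strategy='mean').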
47 | 48 | m, n = X_full.shape 49 | m_missing = int(m * 0.25) 50 | print(m, m_missing) 51 | 52 | 53 | np.random.seed(42) 54 | missing_samples = np.array([True] * m_missing + [False] * (m - m_missing)) 55 | np.random.shuffle(missing_samples) 56 | 57 | 58 | missing_features = np.random.randint(low=0, high=n, size=m_missing) 59 | 60 | X_missing = X_full.copy() 61 | X_missing[np.where(missing_samples)[0], missing_features] = np.nan 62 | 63 | 64 | # Discard samples containing missing values 65 | X_rm_missing = X_missing[~missing_samples, :] 66 | y_rm_missing = y[~missing_samples] 67 | 68 | # Estimate R^2 on the data set with missing samples removed 69 | from sklearn.ensemble import RandomForestRegressor 70 | from sklearn.model_selection import cross_val_score 71 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 72 | score_rm_missing = cross_val_score(regressor, X_rm_missing, y_rm_missing).mean() 73 | print('Score with the data set with missing samples removed: {0:.2f}'.format(score_rm_missing)) 74 | 75 | 76 | # Imputation with mean value 77 | imp_mean = Imputer(missing_values='NaN', strategy='mean') 78 | X_mean_imp = imp_mean.fit_transform(X_missing) 79 | # Estimate R^2 on the data set with missing samples removed 80 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 81 | score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean() 82 | print('Score with the data set with missing values replaced by mean: {0:.2f}'.format(score_mean_imp)) 83 | 84 | 85 | # Estimate R^2 on the full data set 86 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=500) 87 | score_full = cross_val_score(regressor, X_full, y).mean() 88 | print('Score with the full data set: {0:.2f}'.format(score_full)) 89 | 90 | 91 | -------------------------------------------------------------------------------- /Chapter10/save_reuse_model_tf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import tensorflow as tf 8 | 9 | 10 | from sklearn import datasets 11 | cancer_data = datasets.load_breast_cancer() 12 | X = cancer_data.data 13 | Y = cancer_data.target 14 | 15 | n_features = int(X.shape[1]) 16 | learning_rate = 0.005 17 | n_iter = 200 18 | 19 | 20 | # Input and Target placeholders 21 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 22 | y = tf.placeholder(tf.float32, shape=[None]) 23 | 24 | # Build the logistic regression model 25 | W = tf.Variable(tf.zeros([n_features, 1]), name='W') 26 | b = tf.Variable(tf.zeros([1]), name='b') 27 | 28 | logits = tf.add(tf.matmul(x, W), b)[:, 0] 29 | cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 30 | 31 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) 32 | 33 | sess = tf.Session() 34 | 35 | sess.run(tf.global_variables_initializer()) 36 | 37 | for i in range(1, n_iter+1): 38 | _, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y}) 39 | if i % 10 == 0: 40 | print('Iteration %i, training loss: %f' % (i, c)) 41 | 42 | # Save the trained model 43 | # create saver object 44 | saver = tf.train.Saver() 45 | 46 | file_path = './model_tf' 47 | saved_path = saver.save(sess, file_path) 48 | print('model saved in path: {}'.format(saved_path)) 49 | 50 | 51 | tf.reset_default_graph() 52 | 53 | # Load the graph from 
the file 54 | imported_graph = tf.train.import_meta_graph(file_path+'.meta') 55 | 56 | 57 | 58 | with tf.Session() as sess: 59 | # restore the saved model 60 | imported_graph.restore(sess, file_path) 61 | # print the loaded weights 62 | W_loaded, b_loaded = sess.run(['W:0','b:0']) 63 | print('Saved W = ', W_loaded) 64 | print('Saved b = ', b_loaded) 65 | 66 | -------------------------------------------------------------------------------- /Chapter10/save_reuse_monitor_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn import datasets 8 | dataset = datasets.load_diabetes() 9 | X, y = dataset.data, dataset.target 10 | 11 | num_new = 30 # the last 30 samples as new data set 12 | X_train = X[:-num_new, :] 13 | y_train = y[:-num_new] 14 | X_new = X[-num_new:, :] 15 | y_new = y[-num_new:] 16 | 17 | 18 | # Data pre-processing 19 | from sklearn.preprocessing import StandardScaler 20 | scaler = StandardScaler() 21 | scaler.fit(X_train) 22 | 23 | import pickle 24 | # Save the scaler 25 | pickle.dump(scaler, open("scaler.p", "wb" )) 26 | 27 | X_scaled_train = scaler.transform(X_train) 28 | 29 | 30 | # Regression model training 31 | from sklearn.svm import SVR 32 | regressor = SVR(C=20) 33 | regressor.fit(X_scaled_train, y_train) 34 | # Save the regressor 35 | pickle.dump(regressor, open("regressor.p", "wb")) 36 | 37 | 38 | # Deployment 39 | my_scaler = pickle.load(open("scaler.p", "rb" )) 40 | my_regressor = pickle.load(open("regressor.p", "rb")) 41 | 42 | X_scaled_new = my_scaler.transform(X_new) 43 | predictions = my_regressor.predict(X_scaled_new) 44 | 45 | 46 | # Monitor 47 | from sklearn.metrics import r2_score 48 | print('Health check on the model, R^2: {0:.3f}'.format(r2_score(y_new, predictions))) 49 | -------------------------------------------------------------------------------- /Chapter10/word_embedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import gensim.downloader as api 8 | 9 | model = api.load("glove-twitter-25") 10 | 11 | 12 | vector = model.wv['computer'] 13 | print('Word computer is embedded into:\n', vector) 14 | 15 | similar_words = model.most_similar("computer") 16 | print('Top ten words most contextually relevant to computer:\n', similar_words) 17 | 18 | 19 | 20 | doc_sample = ['i', 'love', 'reading', 'python', 'machine', 'learning', 'by', 'example'] 21 | 22 | import numpy as np 23 | doc_vector = np.mean([model.wv[word] for word in doc_sample], axis=0) 24 | print('The document sample is embedded into:\n', doc_vector) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to 
whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Machine Learning By Example Second Edition 2 | **Implement machine learning algorithms and techniques to build intelligent systems** 3 | 4 | 5 | 6 | 7 | This is the code repository for my book [Python Machine Learning By Example Second Edition](https://www.amazon.com/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22), published by [Packt](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example-second-edition), in March 2019. It contains all the supporting project files necessary to work through the book from start to finish. 8 | 9 | 10 | 11 | ## What is this book about? 12 | The surge in interest in machine learning (ML) is due to the fact that it revolutionizes automation by learning patterns in data and using them to make predictions and decisions. If you’re interested in ML, this book will serve as your entry point to ML. 13 | 14 | Python Machine Learning By Example begins with an introduction to important ML concepts and implementations using Python libraries. Each chapter of the book walks you through an industry adopted application. You’ll implement ML techniques in areas such as exploratory data analysis, feature engineering, and natural language processing (NLP) in a clear and easy-to-follow way. 15 | 16 | With the help of this extended and updated edition, you’ll understand how to tackle data-driven problems and implement your solutions with the powerful yet simple Python language and popular Python packages and tools such as TensorFlow, scikit-learn, gensim, and Keras. To aid your understanding of popular ML algorithms, the book covers interesting and easy-to-follow examples such as news topic modeling and classification, spam email detection, stock price forecasting, and more. 17 | 18 | By the end of the book, you’ll have put together a broad picture of the ML ecosystem and will be well-versed with the best practices of applying ML techniques to make the most out of new opportunities. 
19 | 20 | This book covers the following exciting features: 21 | * Exploit the power of Python to explore the world of data mining and data analytics 22 | * Discover machine learning algorithms to solve complex challenges faced by data scientists today 23 | * Use Python libraries such as TensorFlow and Keras to create smart cognitive actions for your projects 24 | Table of contents: 25 | ``` 26 | Chapter 1: Getting Started with Machine Learning and Python 27 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 28 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 29 | Chapter 4: Detecting Spam Email with Naive Bayes 30 | Chapter 5: Classifying News Topic with Support Vector Machine 31 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 32 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 33 | Chapter 8: Scaling Up Prediction to Terabyte Click Logs 34 | Chapter 9: Stock Price Prediction with Regression Algorithms 35 | Chapter 10: Machine Learning Best Practices 36 | ``` 37 | 38 | ## Get to Know the Author 39 | **Yuxi (Hayden) Liu** 40 | is the author of a series of machine learning books and an education enthusiast. His first book, the first edition of Python Machine Learning By Example, was a #1 bestseller on Amazon India in 2017 and 2018. His other books include R Deep Learning Projects and Hands-On Deep Learning Architectures with Python, published by Packt. 41 | 42 | He is an experienced data scientist who's focused on developing machine learning and deep learning models and systems. He has worked in a variety of data-driven domains and has applied his machine learning expertise to computational advertising, recommendation, and network anomaly detection. He published five first-authored IEEE transactions and conference papers during his master's research at the University of Toronto. 43 | 44 | ## Get the Book 45 | * [Packt](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example-second-edition) 46 | * [Amazon.com](https://www.amazon.com/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22) 47 | * [Amazon.in](https://www.amazon.in/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22) 48 | * [Amazon.co.uk](https://www.amazon.co.uk/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22) 49 | 50 | 51 | 52 | ## My Other Books 53 | * [Python Machine Learning By Example](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example) 54 | * [Hands-On Deep Learning Architectures with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-deep-learning-architectures-python) 55 | * [Step-by-Step Machine Learning with Python](https://www.packtpub.com/big-data-and-business-intelligence/step-step-machine-learning-python-video) 56 | * [R Deep Learning Projects](https://www.packtpub.com/big-data-and-business-intelligence/r-deep-learning-projects) 57 | 58 | --------------------------------------------------------------------------------