├── .DS_Store ├── .gitattributes ├── .gitignore ├── Chapter02 ├── .DS_Store ├── exploring_nlp.py ├── getting_exploring_newsgroups.py ├── tSNE.py └── thinking_about_features.py ├── Chapter03 ├── .DS_Store ├── kmeans_elbow.py ├── kmeans_from_scratch.py ├── kmeans_newsgroups.py ├── kmeans_sklearn.py ├── lda_newsgroups.py └── nmf_newsgroups.py ├── Chapter04 ├── .DS_Store └── email_spam.py ├── Chapter05 ├── CTG.xls ├── ctg.py ├── plot_rbf_kernels.py ├── svm_tf.py └── topic_categorization.py ├── Chapter06 ├── avazu_ctr.py ├── avazu_ctr_tf.py └── decision_tree_submit.py ├── Chapter07 ├── encoding.py ├── logistic_function.py ├── logistic_regression_from_scratch.py ├── logistic_regression_tf.py ├── random_forest_feature_selection.py └── scikit_logistic_regression.py ├── Chapter08 ├── ctr.py ├── ctr_hashing.py └── ctr_interaction.py ├── Chapter09 ├── 19880101_20161231.csv ├── 20051201_20051210.csv ├── decision_tree_regression.py ├── get_dji_data.py ├── linear_regression.py ├── neural_network.py ├── regression_evaluation.py ├── stock_prediction.py └── svr.py ├── Chapter10 ├── dimensionality_reduction.py ├── feature_selection.py ├── generic_feature_engineering.py ├── imputation.py ├── save_reuse_model_tf.py ├── save_reuse_monitor_model.py └── word_embedding.py ├── LICENSE └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows thumbnail cache files 2 | Thumbs.db 3 | ehthumbs.db 4 | ehthumbs_vista.db 5 | 6 | # Folder config file 7 | Desktop.ini 8 | 9 | # Recycle Bin used on file shares 10 | $RECYCLE.BIN/ 11 | 12 | # Windows Installer files 13 | *.cab 14 | *.msi 15 | *.msm 16 | *.msp 17 | 18 | # Windows shortcuts 19 | *.lnk 20 | 21 | # ========================= 22 | # Operating System Files 23 | # ========================= 24 | -------------------------------------------------------------------------------- /Chapter02/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter02/.DS_Store -------------------------------------------------------------------------------- /Chapter02/exploring_nlp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from nltk.corpus import names 8 | 9 | 
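# --- Illustrative note (not part of the original repository) ---
# The NLTK corpora and models used in this chapter must be downloaded once
# before the script will run. A minimal, hedged setup sketch (standard NLTK
# resource names, kept as comments so the original script is unchanged):
#
#   import nltk
#   nltk.download('names')                        # the names corpus used here
#   nltk.download('punkt')                        # tokenizers behind word_tokenize/sent_tokenize
#   nltk.download('averaged_perceptron_tagger')   # tagger behind nltk.pos_tag
#   nltk.download('wordnet')                      # lexicon behind WordNetLemmatizer
#
# The spaCy model is installed separately from the command line:
#   python -m spacy download en_core_web_sm
# ----------------------------------------------------------------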
print(names.words()[:10]) 10 | 11 | print(len(names.words())) 12 | 13 | 14 | 15 | from nltk.tokenize import word_tokenize 16 | sent = '''I am reading a book. 17 | It is Python Machine Learning By Example, 18 | 2nd edition.''' 19 | 20 | print(word_tokenize(sent)) 21 | 22 | 23 | sent2 = 'I have been to U.K. and U.S.A.' 24 | print(word_tokenize(sent2)) 25 | 26 | 27 | 28 | import spacy 29 | 30 | nlp = spacy.load('en_core_web_sm') 31 | tokens2 = nlp(sent2) 32 | 33 | print([token.text for token in tokens2]) 34 | 35 | 36 | from nltk.tokenize import sent_tokenize 37 | print(sent_tokenize(sent)) 38 | 39 | 40 | import nltk 41 | tokens = word_tokenize(sent) 42 | print(nltk.pos_tag(tokens)) 43 | nltk.help.upenn_tagset('PRP') 44 | nltk.help.upenn_tagset('VBP') 45 | 46 | 47 | 48 | print([(token.text, token.pos_) for token in tokens2]) 49 | 50 | 51 | 52 | tokens3 = nlp('The book written by Hayden Liu in 2018 was sold at $30 in America') 53 | print([(token_ent.text, token_ent.label_) for token_ent in tokens3.ents]) 54 | 55 | 56 | 57 | from nltk.stem.porter import PorterStemmer 58 | porter_stemmer = PorterStemmer() 59 | porter_stemmer.stem('machines') 60 | porter_stemmer.stem('learning') 61 | 62 | 63 | from nltk.stem import WordNetLemmatizer 64 | lemmatizer = WordNetLemmatizer() 65 | lemmatizer.lemmatize('machines') -------------------------------------------------------------------------------- /Chapter02/getting_exploring_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn.datasets import fetch_20newsgroups 9 | 10 | 11 | groups = fetch_20newsgroups() 12 | groups.keys() 13 | groups['target_names'] 14 | groups.target 15 | 16 | 17 | import numpy as np 18 | np.unique(groups.target) 19 | 20 | 21 | 22 | import seaborn as sns 23 | sns.distplot(groups.target) 24 | import matplotlib.pyplot as plt 25 | plt.show() 26 | 27 | 28 | groups.data[0] 29 | groups.target[0] 30 | groups.target_names[groups.target[0]] 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /Chapter02/tSNE.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | 10 | 11 | categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space'] 12 | 13 | groups_3 = fetch_20newsgroups(categories=categories_3) 14 | 15 | 16 | 17 | def is_letter_only(word): 18 | for char in word: 19 | if not char.isalpha(): 20 | return False 21 | return True 22 | 23 | 24 | 25 | from nltk.corpus import names 26 | all_names = set(names.words()) 27 | 28 | 29 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 30 | 31 | 32 | from nltk.stem import WordNetLemmatizer 33 | lemmatizer = WordNetLemmatizer() 34 | 35 | data_cleaned = [] 36 | 37 | for doc in groups_3.data: 38 | doc = doc.lower() 39 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 40 | data_cleaned.append(doc_cleaned) 41 | 42 | 43 | 
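# --- Illustrative note (not part of the original script) ---
# The fit_transform call below returns a sparse document-term matrix with up
# to 500 columns (max_features=500); the script densifies it with .toarray()
# before passing it to tsne_model.fit_transform further down. A hedged
# inspection sketch, using only names defined in this script:
#
#   print(data_cleaned_count_3.shape)                # (n_documents, 500)
#   print(count_vector_sw.get_feature_names()[:10])  # first few vocabulary terms
# -----------------------------------------------------------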
data_cleaned_count_3 = count_vector_sw.fit_transform(data_cleaned) 44 | 45 | 46 | 47 | 48 | from sklearn.manifold import TSNE 49 | 50 | 51 | tsne_model = TSNE(n_components=2, perplexity=40, random_state=42, learning_rate=500) 52 | 53 | 54 | data_tsne = tsne_model.fit_transform(data_cleaned_count_3.toarray()) 55 | 56 | 57 | import matplotlib.pyplot as plt 58 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_3.target) 59 | 60 | plt.show() 61 | 62 | 63 | 64 | 65 | 66 | 67 | categories_5 = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 68 | 'comp.windows.x'] 69 | groups_5 = fetch_20newsgroups(categories=categories_5) 70 | 71 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 72 | 73 | data_cleaned = [] 74 | 75 | for doc in groups_5.data: 76 | doc = doc.lower() 77 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 78 | data_cleaned.append(doc_cleaned) 79 | 80 | data_cleaned_count_5 = count_vector_sw.fit_transform(data_cleaned) 81 | 82 | data_tsne = tsne_model.fit_transform(data_cleaned_count_5.toarray()) 83 | 84 | plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=groups_5.target) 85 | 86 | plt.show() -------------------------------------------------------------------------------- /Chapter02/thinking_about_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | 10 | groups = fetch_20newsgroups() 11 | 12 | 13 | 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | count_vector = CountVectorizer(stop_words="english",max_features=500) 17 | data_count = count_vector.fit_transform(groups.data) 18 | 19 | print(count_vector.get_feature_names()) 20 | 21 | data_count.toarray()[0] 22 | 23 | 24 | 25 | def is_letter_only(word): 26 | for char in word: 27 | if not char.isalpha(): 28 | return False 29 | return True 30 | 31 | data_cleaned = [] 32 | for doc in groups.data: 33 | doc_cleaned = ' '.join(word for word in doc.split() if is_letter_only(word) ) 34 | data_cleaned.append(doc_cleaned) 35 | 36 | 37 | from sklearn.feature_extraction import stop_words 38 | print(stop_words.ENGLISH_STOP_WORDS) 39 | 40 | 41 | from nltk.corpus import names 42 | all_names = set(names.words()) 43 | 44 | 45 | count_vector_sw = CountVectorizer(stop_words="english", max_features=500) 46 | 47 | 48 | from nltk.stem import WordNetLemmatizer 49 | lemmatizer = WordNetLemmatizer() 50 | 51 | data_cleaned = [] 52 | 53 | for doc in groups.data: 54 | doc = doc.lower() 55 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 56 | data_cleaned.append(doc_cleaned) 57 | 58 | 59 | data_cleaned_count = count_vector_sw.fit_transform(data_cleaned) 60 | 61 | print(count_vector_sw.get_feature_names()) 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Chapter03/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter03/.DS_Store 
-------------------------------------------------------------------------------- /Chapter03/kmeans_elbow.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn import datasets 9 | from sklearn.cluster import KMeans 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | 13 | iris = datasets.load_iris() 14 | X = iris.data 15 | y = iris.target 16 | 17 | 18 | k_list = list(range(1, 7)) 19 | sse_list = [0] * len(k_list) 20 | 21 | for k_ind, k in enumerate(k_list): 22 | kmeans = KMeans(n_clusters=k, random_state=42) 23 | kmeans.fit(X) 24 | clusters = kmeans.labels_ 25 | centroids = kmeans.cluster_centers_ 26 | 27 | sse = 0 28 | for i in range(k): 29 | cluster_i = np.where(clusters == i) 30 | 31 | sse += np.linalg.norm(X[cluster_i] - centroids[i]) 32 | 33 | print('k={}, SSE={}'.format(k, sse)) 34 | sse_list[k_ind] = sse 35 | 36 | 37 | 38 | plt.plot(k_list, sse_list) 39 | plt.show() 40 | -------------------------------------------------------------------------------- /Chapter03/kmeans_from_scratch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn import datasets 9 | iris = datasets.load_iris() 10 | X = iris.data[:, 2:4] 11 | y = iris.target 12 | 13 | import numpy as np 14 | from matplotlib import pyplot as plt 15 | y_0 = np.where(y==0) 16 | plt.scatter(X[y_0, 0], X[y_0, 1]) 17 | y_1 = np.where(y==1) 18 | plt.scatter(X[y_1, 0], X[y_1, 1]) 19 | y_2 = np.where(y==2) 20 | plt.scatter(X[y_2, 0], X[y_2, 1]) 21 | plt.show() 22 | 23 | 24 | k = 3 25 | random_index = np.random.choice(range(len(X)), k) 26 | centroids = X[random_index] 27 | 28 | 29 | def visualize_centroids(X, centroids): 30 | plt.scatter(X[:, 0], X[:, 1]) 31 | plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505') 32 | plt.show() 33 | 34 | 35 | visualize_centroids(X, centroids) 36 | 37 | 38 | def dist(a, b): 39 | return np.linalg.norm(a - b, axis=1) 40 | 41 | def assign_cluster(x, centroids): 42 | distances = dist(x, centroids) 43 | cluster = np.argmin(distances) 44 | return cluster 45 | 46 | def update_centroids(X, centroids, clusters): 47 | for i in range(k): 48 | cluster_i = np.where(clusters == i) 49 | centroids[i] = np.mean(X[cluster_i], axis=0) 50 | 51 | 52 | clusters = np.zeros(len(X)) 53 | 54 | tol = 0.0001 55 | max_iter = 100 56 | 57 | iter = 0 58 | centroids_diff = 100000 59 | 60 | from copy import deepcopy 61 | while iter < max_iter and centroids_diff > tol: 62 | for i in range(len(X)): 63 | clusters[i] = assign_cluster(X[i], centroids) 64 | centroids_prev = deepcopy(centroids) 65 | update_centroids(X, centroids, clusters) 66 | iter += 1 67 | centroids_diff = np.linalg.norm(centroids - centroids_prev) 68 | print('Iteration:', str(iter)) 69 | print('Centroids:\n', centroids) 70 | print('Centroids move: {:5.4f}'.format(centroids_diff)) 71 | visualize_centroids(X, centroids) 72 | 73 | 74 | for i in range(k): 75 | cluster_i = np.where(clusters == i) 76 | plt.scatter(X[cluster_i, 0], X[cluster_i, 1]) 77 | plt.scatter(centroids[:, 0], 
centroids[:, 1], marker='*', s=200, c='#050505') 78 | plt.show() 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /Chapter03/kmeans_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | categories = [ 10 | 'alt.atheism', 11 | 'talk.religion.misc', 12 | 'comp.graphics', 13 | 'sci.space', 14 | ] 15 | 16 | 17 | groups = fetch_20newsgroups(subset='all', categories=categories) 18 | 19 | 20 | labels = groups.target 21 | label_names = groups.target_names 22 | 23 | 24 | 25 | def is_letter_only(word): 26 | for char in word: 27 | if not char.isalpha(): 28 | return False 29 | return True 30 | 31 | 32 | 33 | from nltk.corpus import names 34 | all_names = set(names.words()) 35 | 36 | 37 | 38 | 39 | from nltk.stem import WordNetLemmatizer 40 | lemmatizer = WordNetLemmatizer() 41 | 42 | data_cleaned = [] 43 | 44 | for doc in groups.data: 45 | doc = doc.lower() 46 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 47 | data_cleaned.append(doc_cleaned) 48 | 49 | 50 | from sklearn.feature_extraction.text import CountVectorizer 51 | count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 52 | 53 | from sklearn.feature_extraction.text import TfidfVectorizer 54 | tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2) 55 | 56 | data = tfidf_vector.fit_transform(data_cleaned) 57 | 58 | 59 | from sklearn.cluster import KMeans 60 | 61 | k = 4 62 | kmeans = KMeans(n_clusters=k, random_state=42) 63 | 64 | kmeans.fit(data) 65 | 66 | clusters = kmeans.labels_ 67 | 68 | 69 | 70 | from collections import Counter 71 | print(Counter(clusters)) 72 | 73 | import numpy as np 74 | cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)} 75 | 76 | terms = tfidf_vector.get_feature_names() 77 | centroids = kmeans.cluster_centers_ 78 | for cluster, index_list in cluster_label.items(): 79 | counter = Counter(cluster_label[cluster]) 80 | print('cluster_{}: {} samples'.format(cluster, len(index_list))) 81 | for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True): 82 | print('{}: {} samples'.format(label_names[label_index], count)) 83 | print('Top 10 terms:') 84 | for ind in centroids[cluster].argsort()[-10:]: 85 | print(' %s' % terms[ind], end="") 86 | print() 87 | -------------------------------------------------------------------------------- /Chapter03/kmeans_sklearn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | from sklearn import datasets 9 | iris = datasets.load_iris() 10 | X = iris.data[:, 2:4] 11 | y = iris.target 12 | 13 | import numpy as np 14 | from matplotlib import pyplot as plt 15 | 16 | k = 3 17 | from sklearn.cluster import KMeans 18 | kmeans_sk = KMeans(n_clusters=3, random_state=42) 19 | kmeans_sk.fit(X) 20 | clusters_sk = kmeans_sk.labels_ 21 | centroids_sk = 
kmeans_sk.cluster_centers_ 22 | 23 | for i in range(k): 24 | cluster_i = np.where(clusters_sk == i) 25 | plt.scatter(X[cluster_i, 0], X[cluster_i, 1]) 26 | plt.scatter(centroids_sk[:, 0], centroids_sk[:, 1], marker='*', s=200, c='#050505') 27 | plt.show() 28 | -------------------------------------------------------------------------------- /Chapter03/lda_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | categories = [ 10 | 'alt.atheism', 11 | 'talk.religion.misc', 12 | 'comp.graphics', 13 | 'sci.space', 14 | ] 15 | 16 | 17 | groups = fetch_20newsgroups(subset='all', categories=categories) 18 | 19 | 20 | 21 | def is_letter_only(word): 22 | for char in word: 23 | if not char.isalpha(): 24 | return False 25 | return True 26 | 27 | 28 | 29 | from nltk.corpus import names 30 | all_names = set(names.words()) 31 | 32 | 33 | 34 | from nltk.stem import WordNetLemmatizer 35 | lemmatizer = WordNetLemmatizer() 36 | 37 | data_cleaned = [] 38 | 39 | for doc in groups.data: 40 | doc = doc.lower() 41 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 42 | data_cleaned.append(doc_cleaned) 43 | 44 | 45 | 46 | from sklearn.feature_extraction.text import CountVectorizer 47 | count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 48 | 49 | 50 | data = count_vector.fit_transform(data_cleaned) 51 | 52 | 53 | from sklearn.decomposition import LatentDirichletAllocation 54 | 55 | t = 20 56 | lda = LatentDirichletAllocation(n_components=t, learning_method='batch',random_state=42) 57 | 58 | lda.fit(data) 59 | 60 | print(lda.components_) 61 | 62 | terms = count_vector.get_feature_names() 63 | 64 | 65 | for topic_idx, topic in enumerate(lda.components_): 66 | print("Topic {}:" .format(topic_idx)) 67 | print(" ".join([terms[i] for i in topic.argsort()[-10:]])) 68 | 69 | 70 | -------------------------------------------------------------------------------- /Chapter03/nmf_newsgroups.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.datasets import fetch_20newsgroups 8 | 9 | categories = [ 10 | 'alt.atheism', 11 | 'talk.religion.misc', 12 | 'comp.graphics', 13 | 'sci.space', 14 | ] 15 | 16 | 17 | groups = fetch_20newsgroups(subset='all', categories=categories) 18 | 19 | 20 | 21 | def is_letter_only(word): 22 | for char in word: 23 | if not char.isalpha(): 24 | return False 25 | return True 26 | 27 | 28 | 29 | from nltk.corpus import names 30 | all_names = set(names.words()) 31 | 32 | 33 | 34 | from nltk.stem import WordNetLemmatizer 35 | lemmatizer = WordNetLemmatizer() 36 | 37 | data_cleaned = [] 38 | 39 | for doc in groups.data: 40 | doc = doc.lower() 41 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 42 | data_cleaned.append(doc_cleaned) 43 | 44 | 45 | from sklearn.feature_extraction.text import CountVectorizer 46 | 
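# --- Illustrative note (not part of the original script) ---
# NMF (applied further down) approximately factorizes the non-negative
# document-term count matrix V of shape (n_documents, n_terms) into
#     V ~= W . H
# with W of shape (n_documents, t) and H of shape (t, n_terms), t = 20 here.
# scikit-learn exposes H as nmf.components_, so topic.argsort()[-10:] in the
# printing loop below selects the 10 highest-weighted terms of each topic.
# -----------------------------------------------------------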
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) 47 | 48 | data = count_vector.fit_transform(data_cleaned) 49 | 50 | 51 | from sklearn.decomposition import NMF 52 | 53 | t = 20 54 | nmf = NMF(n_components=t, random_state=42) 55 | 56 | nmf.fit(data) 57 | 58 | print(nmf.components_) 59 | 60 | terms = count_vector.get_feature_names() 61 | 62 | 63 | for topic_idx, topic in enumerate(nmf.components_): 64 | print("Topic {}:" .format(topic_idx)) 65 | print(" ".join([terms[i] for i in topic.argsort()[-10:]])) 66 | 67 | 68 | -------------------------------------------------------------------------------- /Chapter04/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter04/.DS_Store -------------------------------------------------------------------------------- /Chapter04/email_spam.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 4: Detecting Spam Email with Naive Bayes 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | # -*- coding: utf-8 -*- 8 | 9 | import glob 10 | import os 11 | import numpy as np 12 | 13 | 14 | file_path = 'enron1/ham/0007.1999-12-14.farmer.ham.txt' 15 | with open(file_path, 'r') as infile: 16 | ham_sample = infile.read() 17 | 18 | print(ham_sample) 19 | 20 | file_path = 'enron1/spam/0058.2003-12-21.GP.spam.txt' 21 | with open(file_path, 'r') as infile: 22 | spam_sample = infile.read() 23 | 24 | print(spam_sample) 25 | 26 | 27 | emails, labels = [], [] 28 | 29 | file_path = 'enron1/spam/' 30 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 31 | with open(filename, 'r', encoding="ISO-8859-1") as infile: 32 | emails.append(infile.read()) 33 | labels.append(1) 34 | 35 | file_path = 'enron1/ham/' 36 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 37 | with open(filename, 'r', encoding="ISO-8859-1") as infile: 38 | emails.append(infile.read()) 39 | labels.append(0) 40 | 41 | print(len(labels)) 42 | 43 | print(len(emails)) 44 | 45 | 46 | 47 | 48 | def is_letter_only(word): 49 | return word.isalpha() 50 | 51 | from nltk.corpus import names 52 | all_names = set(names.words()) 53 | 54 | from nltk.stem import WordNetLemmatizer 55 | lemmatizer = WordNetLemmatizer() 56 | 57 | def clean_text(docs): 58 | docs_cleaned = [] 59 | for doc in docs: 60 | doc = doc.lower() 61 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names) 62 | docs_cleaned.append(doc_cleaned) 63 | return docs_cleaned 64 | 65 | emails_cleaned = clean_text(emails) 66 | 67 | from sklearn.feature_extraction.text import CountVectorizer 68 | cv = CountVectorizer(stop_words="english", max_features=1000, max_df=0.5, min_df=2) 69 | 70 | docs_cv = cv.fit_transform(emails_cleaned) 71 | print(docs_cv[0]) 72 | 73 | terms = cv.get_feature_names() 74 | print(terms[932]) 75 | print(terms[968]) 76 | print(terms[715]) 77 | 78 | 79 | 80 | def get_label_index(labels): 81 | from collections import defaultdict 82 | label_index = defaultdict(list) 83 | for index, label in enumerate(labels): 84 | label_index[label].append(index) 85 | return label_index 86 | 87 | 88 | def get_prior(label_index): 89 | """ 90 | Compute prior based on training samples 91 | @param label_index: grouped sample 
indices by class 92 | @return: dictionary, with class label as key, corresponding prior as the value 93 | """ 94 | prior = {label: len(index) for label, index in label_index.items()} 95 | total_count = sum(prior.values()) 96 | for label in prior: 97 | prior[label] /= float(total_count) 98 | return prior 99 | 100 | 101 | label_index = get_label_index(labels) 102 | prior = get_prior(label_index) 103 | print('Prior:', prior) 104 | 105 | 106 | def get_likelihood(term_matrix, label_index, smoothing=0): 107 | """ 108 | Compute likelihood based on training samples 109 | @param term_matrix: sparse matrix of the term frequency features 110 | @param label_index: grouped sample indices by class 111 | @param smoothing: integer, additive Laplace smoothing parameter 112 | @return: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value 113 | """ 114 | likelihood = {} 115 | for label, index in label_index.items(): 116 | likelihood[label] = term_matrix[index, :].sum(axis=0) + smoothing 117 | likelihood[label] = np.asarray(likelihood[label])[0] 118 | total_count = likelihood[label].sum() 119 | likelihood[label] = likelihood[label] / float(total_count) 120 | return likelihood 121 | 122 | smoothing = 1 123 | likelihood = get_likelihood(docs_cv, label_index, smoothing) 124 | 125 | print(len(likelihood[0])) 126 | 127 | print(likelihood[0][:5]) 128 | print(likelihood[1][:5]) 129 | 130 | 131 | 132 | def get_posterior(term_matrix, prior, likelihood): 133 | """ 134 | Compute posterior of testing samples, based on prior and likelihood 135 | @param term_matrix: sparse matrix of the term frequency features 136 | @param prior: dictionary, with class label as key, corresponding prior as the value 137 | @param likelihood: dictionary, with class label as key, corresponding conditional probability vector as value 138 | @return: dictionary, with class label as key, corresponding posterior as value 139 | """ 140 | num_docs = term_matrix.shape[0] 141 | posteriors = [] 142 | for i in range(num_docs): 143 | # posterior is proportional to prior * likelihood 144 | # = exp(log(prior * likelihood)) 145 | # = exp(log(prior) + log(likelihood)) 146 | posterior = {key: np.log(prior_label) for key, prior_label in prior.items()} 147 | for label, likelihood_label in likelihood.items(): 148 | term_document_vector = term_matrix.getrow(i) 149 | counts = term_document_vector.data 150 | indices = term_document_vector.indices 151 | for count, index in zip(counts, indices): 152 | posterior[label] += np.log(likelihood_label[index]) * count 153 | # exp(-1000):exp(-999) will cause zero division error, 154 | # however it equates to exp(0):exp(1) 155 | min_log_posterior = min(posterior.values()) 156 | for label in posterior: 157 | try: 158 | posterior[label] = np.exp(posterior[label] - min_log_posterior) 159 | except: 160 | posterior[label] = float('inf') 161 | # normalize so that all sums up to 1 162 | sum_posterior = sum(posterior.values()) 163 | for label in posterior: 164 | if posterior[label] == float('inf'): 165 | posterior[label] = 1.0 166 | else: 167 | posterior[label] /= sum_posterior 168 | posteriors.append(posterior.copy()) 169 | return posteriors 170 | 171 | 172 | 173 | emails_test = [ 174 | '''Subject: flat screens 175 | hello , 176 | please call or contact regarding the other flat screens requested . 177 | trisha tlapek - eb 3132 b 178 | michael sergeev - eb 3132 a 179 | also the sun blocker that was taken away from eb 3131 a . 180 | trisha should two monitors also michael . 
181 | thanks 182 | kevin moore''', 183 | '''Subject: let ' s stop the mlm insanity ! 184 | still believe you can earn $ 100 , 000 fast in mlm ? get real ! 185 | get emm , a brand new system that replaces mlm with something that works ! 186 | start earning 1 , 000 ' s now ! up to $ 10 , 000 per week doing simple online tasks . 187 | free info - breakfree @ luxmail . com - type " send emm info " in the subject box . 188 | this message is sent in compliance of the proposed bill section 301 . per section 301 , paragraph ( a ) ( 2 ) ( c ) of s . 1618 . further transmission to you by the sender of this e - mail may be stopped at no cost to you by sending a reply to : " email address " with the word remove in the subject line . 189 | ''', 190 | ] 191 | 192 | emails_cleaned_test = clean_text(emails_test) 193 | term_docs_test = cv.transform(emails_cleaned_test) 194 | 195 | 196 | posterior = get_posterior(term_docs_test, prior, likelihood) 197 | print(posterior) 198 | 199 | 200 | 201 | from sklearn.model_selection import train_test_split 202 | X_train, X_test, Y_train, Y_test = train_test_split(emails_cleaned, labels, test_size=0.33, random_state=42) 203 | 204 | print(len(X_train), len(Y_train)) 205 | len(X_test), len(Y_test) 206 | 207 | term_docs_train = cv.fit_transform(X_train) 208 | 209 | label_index = get_label_index(Y_train) 210 | prior = get_prior(label_index) 211 | likelihood = get_likelihood(term_docs_train, label_index, smoothing) 212 | 213 | term_docs_test = cv.transform(X_test) 214 | 215 | 216 | posterior = get_posterior(term_docs_test, prior, likelihood) 217 | 218 | correct = 0.0 219 | for pred, actual in zip(posterior, Y_test): 220 | if actual == 1: 221 | if pred[1] >= 0.5: 222 | correct += 1 223 | elif pred[0] > 0.5: 224 | correct += 1 225 | 226 | print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(Y_test), correct/len(Y_test)*100)) 227 | 228 | 229 | 230 | 231 | from sklearn.naive_bayes import MultinomialNB 232 | clf = MultinomialNB(alpha=1.0, fit_prior=True) 233 | clf.fit(term_docs_train, Y_train) 234 | prediction_prob = clf.predict_proba(term_docs_test) 235 | 236 | print(prediction_prob[0:10]) 237 | 238 | prediction = clf.predict(term_docs_test) 239 | 240 | print(prediction[:10]) 241 | 242 | accuracy = clf.score(term_docs_test, Y_test) 243 | 244 | print('The accuracy using MultinomialNB is: {0:.1f}%'.format(accuracy*100)) 245 | 246 | 247 | 248 | 249 | from sklearn.metrics import confusion_matrix 250 | print(confusion_matrix(Y_test, prediction, labels=[0, 1])) 251 | 252 | from sklearn.metrics import precision_score, recall_score, f1_score 253 | precision_score(Y_test, prediction, pos_label=1) 254 | recall_score(Y_test, prediction, pos_label=1) 255 | f1_score(Y_test, prediction, pos_label=1) 256 | 257 | f1_score(Y_test, prediction, pos_label=0) 258 | 259 | from sklearn.metrics import classification_report 260 | report = classification_report(Y_test, prediction) 261 | print(report) 262 | 263 | 264 | 265 | 266 | pos_prob = prediction_prob[:, 1] 267 | thresholds = np.arange(0.0, 1.2, 0.1) 268 | true_pos, false_pos = [0]*len(thresholds), [0]*len(thresholds) 269 | for pred, y in zip(pos_prob, Y_test): 270 | for i, threshold in enumerate(thresholds): 271 | if pred >= threshold: 272 | if y == 1: 273 | true_pos[i] += 1 274 | else: 275 | false_pos[i] += 1 276 | else: 277 | break 278 | 279 | true_pos_rate = [tp / 516.0 for tp in true_pos] 280 | false_pos_rate = [fp / 1191.0 for fp in false_pos] 281 | 282 | 283 | import matplotlib.pyplot as plt 284 | plt.figure() 285 | lw = 2 286 
| plt.plot(false_pos_rate, true_pos_rate, color='darkorange', 287 | lw=lw) 288 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 289 | plt.xlim([0.0, 1.0]) 290 | plt.ylim([0.0, 1.05]) 291 | plt.xlabel('False Positive Rate') 292 | plt.ylabel('True Positive Rate') 293 | plt.title('Receiver Operating Characteristic') 294 | plt.legend(loc="lower right") 295 | plt.show() 296 | 297 | 298 | 299 | 300 | from sklearn.metrics import roc_auc_score 301 | roc_auc_score(Y_test, pos_prob) 302 | 303 | 304 | 305 | from sklearn.model_selection import StratifiedKFold 306 | k = 10 307 | k_fold = StratifiedKFold(n_splits=k, random_state=42) 308 | cleaned_emails_np = np.array(emails_cleaned) 309 | labels_np = np.array(labels) 310 | 311 | max_features_option = [2000, 8000, None] 312 | smoothing_factor_option = [0.5, 1.0, 2.0, 4.0] 313 | fit_prior_option = [True, False] 314 | 315 | max_features_option = [None] 316 | smoothing_factor_option = [4.0, 10, 16, 20, 32] 317 | fit_prior_option = [True, False] 318 | 319 | 320 | auc_record = {} 321 | 322 | for train_indices, test_indices in k_fold.split(emails_cleaned, labels): 323 | X_train, X_test = cleaned_emails_np[train_indices], cleaned_emails_np[test_indices] 324 | Y_train, Y_test = labels_np[train_indices], labels_np[test_indices] 325 | for max_features in max_features_option: 326 | if max_features not in auc_record: 327 | auc_record[max_features] = {} 328 | cv = CountVectorizer(stop_words="english", max_features=max_features, max_df=0.5, min_df=2) 329 | term_docs_train = cv.fit_transform(X_train) 330 | term_docs_test = cv.transform(X_test) 331 | for alpha in smoothing_factor_option: 332 | if alpha not in auc_record[max_features]: 333 | auc_record[max_features][alpha] = {} 334 | for fit_prior in fit_prior_option: 335 | clf = MultinomialNB(alpha=alpha, fit_prior=fit_prior) 336 | clf.fit(term_docs_train, Y_train) 337 | prediction_prob = clf.predict_proba(term_docs_test) 338 | pos_prob = prediction_prob[:, 1] 339 | auc = roc_auc_score(Y_test, pos_prob) 340 | auc_record[max_features][alpha][fit_prior] = auc + auc_record[max_features][alpha].get(fit_prior, 0.0) 341 | 342 | 343 | 344 | print('max features smoothing fit prior auc') 345 | for max_features, max_feature_record in auc_record.items(): 346 | for smoothing, smoothing_record in max_feature_record.items(): 347 | for fit_prior, auc in smoothing_record.items(): 348 | print(' {0} {1} {2} {3:.5f}'.format(max_features, smoothing, fit_prior, auc/k)) 349 | 350 | -------------------------------------------------------------------------------- /Chapter05/CTG.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenliu/Python-Machine-Learning-By-Example-Second-Edition/199b861e0158a9bee57cc6f1ceb61b3a7d76dcba/Chapter05/CTG.xls -------------------------------------------------------------------------------- /Chapter05/ctg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | df = pd.read_excel('CTG.xls', "Raw Data") 9 | 10 | X = df.ix[1:2126, 3:-2].values 11 | Y = df.ix[1:2126, -1].values 12 | 13 | from collections import Counter 14 | Counter(Y) 15 | 16 | from sklearn.model_selection import train_test_split 17 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
test_size=0.2, random_state=42) 18 | 19 | from sklearn.svm import SVC 20 | svc = SVC(kernel='rbf') 21 | 22 | parameters = {'C': (100, 1e3, 1e4, 1e5), 23 | 'gamma': (1e-08, 1e-7, 1e-6, 1e-5) 24 | } 25 | from sklearn.model_selection import GridSearchCV 26 | grid_search = GridSearchCV(svc, parameters, n_jobs=-1, cv=5) 27 | 28 | 29 | import timeit 30 | start_time = timeit.default_timer() 31 | grid_search.fit(X_train, Y_train) 32 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 33 | 34 | print(grid_search.best_params_) 35 | print(grid_search.best_score_) 36 | 37 | svc_best = grid_search.best_estimator_ 38 | 39 | accuracy = svc_best.score(X_test, Y_test) 40 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 41 | 42 | prediction = svc_best.predict(X_test) 43 | from sklearn.metrics import classification_report 44 | report = classification_report(Y_test, prediction) 45 | print(report) -------------------------------------------------------------------------------- /Chapter05/plot_rbf_kernels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from sklearn.svm import SVC 10 | 11 | X = np.c_[# negative class 12 | (.3, -.8), 13 | (-1.5, -1), 14 | (-1.3, -.8), 15 | (-1.1, -1.3), 16 | (-1.2, -.3), 17 | (-1.3, -.5), 18 | (-.6, 1.1), 19 | (-1.4, 2.2), 20 | (1, 1), 21 | # positive class 22 | (1.3, .8), 23 | (1.2, .5), 24 | (.2, -2), 25 | (.5, -2.4), 26 | (.2, -2.3), 27 | (0, -2.7), 28 | (1.3, 2.1)].T 29 | Y = [-1] * 8 + [1] * 8 30 | 31 | gamma_option = [1, 2, 4] 32 | 33 | for i, gamma in enumerate(gamma_option, 1): 34 | svm = SVC(kernel='rbf', gamma=gamma) 35 | svm.fit(X, Y) 36 | plt.scatter(X[:, 0], X[:, 1], c=['b']*8+['r']*8, zorder=10, cmap=plt.cm.Paired) 37 | plt.axis('tight') 38 | XX, YY = np.mgrid[-3:3:200j, -3:3:200j] 39 | Z = svm.decision_function(np.c_[XX.ravel(), YY.ravel()]) 40 | Z = Z.reshape(XX.shape) 41 | plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) 42 | plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) 43 | plt.title('gamma = %d' % gamma) 44 | plt.show() 45 | 46 | -------------------------------------------------------------------------------- /Chapter05/svm_tf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from sklearn import datasets 11 | cancer_data = datasets.load_breast_cancer() 12 | X = cancer_data.data 13 | Y = cancer_data.target 14 | 15 | from collections import Counter 16 | print(Counter(Y)) 17 | 18 | 19 | np.random.seed(42) 20 | train_indices = np.random.choice(len(Y), round(len(Y) * 0.8), replace=False) 21 | test_indices = np.array(list(set(range(len(Y))) - set(train_indices))) 22 | X_train = X[train_indices] 23 | X_test = X[test_indices] 24 | Y_train = Y[train_indices] 25 | Y_test = Y[test_indices] 26 | 27 | 28 | svm_tf = tf.contrib.learn.SVM( 29 | feature_columns=(tf.contrib.layers.real_valued_column(column_name='x'),), 30 | example_id_column='example_id') 31 | 32 | 33 | 34 | 
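# --- Illustrative note (not part of the original script) ---
# tf.contrib.learn.SVM and tf.estimator.inputs.numpy_input_fn are
# TensorFlow 1.x APIs; tf.contrib was removed in TensorFlow 2.x, so this
# script requires a 1.x installation. For comparison only, a minimal,
# hedged sketch of a linear SVM on the same split with scikit-learn
# (assuming it is installed; results will not match exactly):
#
#   from sklearn.svm import LinearSVC
#   clf = LinearSVC(C=1.0, max_iter=10000)
#   clf.fit(X_train, Y_train)
#   print('Test accuracy: {0:.1f}%'.format(clf.score(X_test, Y_test) * 100))
# -----------------------------------------------------------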
input_fn_train = tf.estimator.inputs.numpy_input_fn( 35 | x={'x': X_train, 'example_id': np.array(['%d' % i for i in range(len(Y_train))])}, 36 | y=Y_train, 37 | num_epochs=None, 38 | batch_size=100, 39 | shuffle=True) 40 | 41 | 42 | 43 | svm_tf.fit(input_fn=input_fn_train, max_steps=100) 44 | 45 | 46 | metrics = svm_tf.evaluate(input_fn=input_fn_train, steps=1) 47 | print('The training accuracy is: {0:.1f}%'.format(metrics['accuracy']*100)) 48 | 49 | 50 | 51 | input_fn_test = tf.estimator.inputs.numpy_input_fn( 52 | x={'x': X_test, 'example_id': np.array(['%d' % (i + len(Y_train)) for i in range(len(X_test))])}, 53 | y=Y_test, 54 | num_epochs=None, 55 | shuffle=False) 56 | 57 | 58 | metrics = svm_tf.evaluate(input_fn=input_fn_test, steps=1) 59 | print('The testing accuracy is: {0:.1f}%'.format(metrics['accuracy']*100)) 60 | 61 | -------------------------------------------------------------------------------- /Chapter05/topic_categorization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 5: Classifying Newsgroup Topic with Support Vector Machine 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.datasets import fetch_20newsgroups 9 | from nltk.corpus import names 10 | from nltk.stem import WordNetLemmatizer 11 | 12 | all_names = set(names.words()) 13 | lemmatizer = WordNetLemmatizer() 14 | 15 | def is_letter_only(word): 16 | return word.isalpha() 17 | 18 | from nltk.corpus import stopwords 19 | stop_words = stopwords.words('english') 20 | 21 | def clean_text(docs): 22 | docs_cleaned = [] 23 | for doc in docs: 24 | doc = doc.lower() 25 | doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() 26 | if is_letter_only(word) and word not in all_names and word not in stop_words) 27 | docs_cleaned.append(doc_cleaned) 28 | return docs_cleaned 29 | 30 | 31 | # Binary classification 32 | categories = ['comp.graphics', 'sci.space'] 33 | 34 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 35 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 36 | 37 | cleaned_train = clean_text(data_train.data) 38 | label_train = data_train.target 39 | cleaned_test = clean_text(data_test.data) 40 | label_test = data_test.target 41 | 42 | from collections import Counter 43 | Counter(label_train) 44 | 45 | tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None) 46 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 47 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 48 | 49 | from sklearn.svm import SVC 50 | svm = SVC(kernel='linear', C=1.0, random_state=42) 51 | svm.fit(term_docs_train, label_train) 52 | accuracy = svm.score(term_docs_test, label_test) 53 | print('The accuracy of binary classification is: {0:.1f}%'.format(accuracy*100)) 54 | 55 | 56 | 57 | # Multiclass classification 58 | categories = [ 59 | 'alt.atheism', 60 | 'talk.religion.misc', 61 | 'comp.graphics', 62 | 'sci.space', 63 | 'rec.sport.hockey' 64 | ] 65 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 66 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 67 | 68 | cleaned_train = clean_text(data_train.data) 69 | label_train = data_train.target 70 | cleaned_test = clean_text(data_test.data) 71 | label_test = data_test.target 
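# --- Illustrative note (not part of the original script) ---
# The tfidf_vectorizer created for the binary task is reused here; calling
# fit_transform again below re-learns the vocabulary on the 5-class training
# set, so no state from the binary experiment is carried over. A hedged
# sanity check on the class balance (Counter is already imported above):
#
#   print(Counter(label_train))   # number of training documents per class
# -----------------------------------------------------------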
72 | 73 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 74 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 75 | 76 | svm = SVC(kernel='linear', C=1.0, random_state=42) 77 | svm.fit(term_docs_train, label_train) 78 | accuracy = svm.score(term_docs_test, label_test) 79 | print('The accuracy of 5-class classification is: {0:.1f}%'.format(accuracy*100)) 80 | 81 | from sklearn.metrics import classification_report 82 | prediction = svm.predict(term_docs_test) 83 | report = classification_report(label_test, prediction) 84 | print(report) 85 | 86 | 87 | 88 | # Grid search 89 | 90 | categories = None 91 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 92 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 93 | 94 | cleaned_train = clean_text(data_train.data) 95 | label_train = data_train.target 96 | cleaned_test = clean_text(data_test.data) 97 | label_test = data_test.target 98 | 99 | tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None) 100 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 101 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 102 | 103 | parameters = {'C': [0.1, 1, 10, 100]} 104 | svc_libsvm = SVC(kernel='linear') 105 | 106 | from sklearn.model_selection import GridSearchCV 107 | grid_search = GridSearchCV(svc_libsvm, parameters, n_jobs=-1, cv=5) 108 | 109 | 110 | import timeit 111 | start_time = timeit.default_timer() 112 | grid_search.fit(term_docs_train, label_train) 113 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 114 | 115 | print(grid_search.best_params_) 116 | print(grid_search.best_score_) 117 | 118 | svc_libsvm_best = grid_search.best_estimator_ 119 | accuracy = svc_libsvm_best.score(term_docs_test, label_test) 120 | print('The accuracy of 20-class classification is: {0:.1f}%'.format(accuracy*100)) 121 | 122 | 123 | 124 | 125 | 126 | from sklearn.svm import LinearSVC 127 | svc_linear = LinearSVC() 128 | grid_search = GridSearchCV(svc_linear, parameters, n_jobs=-1, cv=5) 129 | 130 | start_time = timeit.default_timer() 131 | grid_search.fit(term_docs_train, label_train) 132 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 133 | 134 | print(grid_search.best_params_) 135 | print(grid_search.best_score_) 136 | svc_linear_best = grid_search.best_estimator_ 137 | accuracy = svc_linear_best.score(term_docs_test, label_test) 138 | print('TThe accuracy of 20-class classification is: {0:.1f}%'.format(accuracy*100)) 139 | 140 | 141 | 142 | 143 | # Pipeline 144 | from sklearn.pipeline import Pipeline 145 | 146 | pipeline = Pipeline([ 147 | ('tfidf', TfidfVectorizer(stop_words='english')), 148 | ('svc', LinearSVC()), 149 | ]) 150 | 151 | parameters_pipeline = { 152 | 'tfidf__max_df': (0.25, 0.5, 1.0), 153 | 'tfidf__max_features': (10000, None), 154 | 'tfidf__sublinear_tf': (True, False), 155 | 'tfidf__smooth_idf': (True, False), 156 | 'svc__C': (0.3, 1, 3), 157 | } 158 | 159 | grid_search = GridSearchCV(pipeline, parameters_pipeline, n_jobs=-1, cv=5) 160 | 161 | start_time = timeit.default_timer() 162 | grid_search.fit(cleaned_train, label_train) 163 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 164 | 165 | print(grid_search.best_params_) 166 | print(grid_search.best_score_) 167 | pipeline_best = grid_search.best_estimator_ 168 | accuracy = pipeline_best.score(cleaned_test, label_test) 169 | print('The accuracy of 20-class classification is: 
{0:.1f}%'.format(accuracy*100)) 170 | -------------------------------------------------------------------------------- /Chapter06/avazu_ctr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | n_rows = 300000 9 | df = pd.read_csv("train", nrows=n_rows) 10 | print(df.head(5)) 11 | 12 | 13 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 14 | Y = df['click'].values 15 | 16 | print(X.shape) 17 | 18 | n_train = int(n_rows * 0.9) 19 | X_train = X[:n_train] 20 | Y_train = Y[:n_train] 21 | X_test = X[n_train:] 22 | Y_test = Y[n_train:] 23 | 24 | from sklearn.preprocessing import OneHotEncoder 25 | enc = OneHotEncoder(handle_unknown='ignore') 26 | X_train_enc = enc.fit_transform(X_train) 27 | 28 | X_train_enc[0] 29 | print(X_train_enc[0]) 30 | 31 | 32 | X_test_enc = enc.transform(X_test) 33 | 34 | 35 | 36 | from sklearn.tree import DecisionTreeClassifier 37 | parameters = {'max_depth': [3, 10, None]} 38 | decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30) 39 | 40 | from sklearn.model_selection import GridSearchCV 41 | grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 42 | 43 | grid_search.fit(X_train_enc, Y_train) 44 | print(grid_search.best_params_) 45 | 46 | decision_tree_best = grid_search.best_estimator_ 47 | pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1] 48 | 49 | from sklearn.metrics import roc_auc_score 50 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(Y_test, pos_prob))) 51 | 52 | import numpy as np 53 | pos_prob = np.zeros(len(Y_test)) 54 | click_index = np.random.choice(len(Y_test), int(len(Y_test) * 51211.0/300000), replace=False) 55 | pos_prob[click_index] = 1 56 | 57 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(Y_test, pos_prob))) 58 | 59 | 60 | from sklearn.ensemble import RandomForestClassifier 61 | 62 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 63 | grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 64 | grid_search.fit(X_train_enc, Y_train) 65 | print(grid_search.best_params_) 66 | print(grid_search.best_score_) 67 | 68 | random_forest_best = grid_search.best_estimator_ 69 | pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1] 70 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(Y_test, pos_prob))) 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /Chapter06/avazu_ctr_tf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | n_rows = 300000 9 | df = pd.read_csv("train", nrows=n_rows) 10 | print(df.head(5)) 11 | 12 | 13 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 14 | Y = df['click'].values 15 | 16 | print(X.shape) 17 | 18 | n_train = int(n_rows * 0.9) 19 | X_train = X[:n_train] 20 | Y_train = Y[:n_train] 21 | X_test = X[n_train:] 22 | Y_test = Y[n_train:] 23 | 24 | 
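# --- Illustrative note (not part of the original script) ---
# OneHotEncoder below expands every categorical column into a group of
# binary indicator columns and returns a sparse matrix. With
# handle_unknown='ignore', categories that appear only in the test split are
# encoded as all zeros within that feature's block instead of raising an
# error. A hedged one-liner to check the encoded dimensionality:
#
#   print(X_train_enc.shape)   # (n_train, total number of one-hot columns)
# -----------------------------------------------------------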
from sklearn.preprocessing import OneHotEncoder 25 | enc = OneHotEncoder(handle_unknown='ignore') 26 | X_train_enc = enc.fit_transform(X_train) 27 | 28 | X_train_enc[0] 29 | print(X_train_enc[0]) 30 | 31 | 32 | X_test_enc = enc.transform(X_test) 33 | 34 | 35 | import tensorflow as tf 36 | from tensorflow.contrib.tensor_forest.python import tensor_forest 37 | from tensorflow.python.ops import resources 38 | 39 | 40 | n_iter = 20 41 | n_classes = 2 42 | n_features = int(X_train_enc.toarray().shape[1]) 43 | n_trees = 10 44 | max_nodes = 30000 45 | 46 | 47 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 48 | y = tf.placeholder(tf.int64, shape=[None]) 49 | 50 | 51 | hparams = tensor_forest.ForestHParams(num_classes=n_classes, num_features=n_features, num_trees=n_trees, 52 | max_nodes=max_nodes, split_after_samples=30).fill() 53 | 54 | forest_graph = tensor_forest.RandomForestGraphs(hparams) 55 | 56 | train_op = forest_graph.training_graph(x, y) 57 | loss_op = forest_graph.training_loss(x, y) 58 | 59 | infer_op, _, _ = forest_graph.inference_graph(x) 60 | 61 | auc = tf.metrics.auc(tf.cast(y, tf.int64), infer_op[:, 1])[1] 62 | 63 | 64 | init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), resources.initialize_resources(resources.shared_resources())) 65 | 66 | sess = tf.Session() 67 | 68 | sess.run(init_vars) 69 | 70 | batch_size = 1000 71 | 72 | import numpy as np 73 | indices = list(range(n_train)) 74 | 75 | def gen_batch(indices): 76 | np.random.shuffle(indices) 77 | for batch_i in range(int(n_train / batch_size)): 78 | batch_index = indices[batch_i*batch_size: (batch_i+1)*batch_size] 79 | yield X_train_enc[batch_index], Y_train[batch_index] 80 | 81 | 82 | for i in range(1, n_iter + 1): 83 | for X_batch, Y_batch in gen_batch(indices): 84 | _, l = sess.run([train_op, loss_op], feed_dict={x: X_batch.toarray(), y: Y_batch}) 85 | acc_train = sess.run(auc, feed_dict={x: X_train_enc.toarray(), y: Y_train}) 86 | print('Iteration %i, AUC of ROC on training set: %f' % (i, acc_train)) 87 | acc_test = sess.run(auc, feed_dict={x: X_test_enc.toarray(), y: Y_test}) 88 | print("AUC of ROC on testing set:", acc_test) 89 | 90 | -------------------------------------------------------------------------------- /Chapter06/decision_tree_submit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | 10 | 11 | # Plot Gini Impurity in binary case 12 | pos_fraction = np.linspace(0.00, 1.00, 1000) 13 | gini = 1 - pos_fraction**2 - (1-pos_fraction)**2 14 | plt.plot(pos_fraction, gini) 15 | plt.xlabel('Positive fraction') 16 | plt.ylabel('Gini Impurity') 17 | plt.ylim(0, 1) 18 | plt.show() 19 | 20 | 21 | # Given labels of a data set, the Gini Impurity calculation function 22 | def gini_impurity(labels): 23 | # When the set is empty, it is also pure 24 | if not labels: 25 | return 0 26 | # Count the occurrences of each label 27 | counts = np.unique(labels, return_counts=True)[1] 28 | fractions = counts / float(len(labels)) 29 | return 1 - np.sum(fractions ** 2) 30 | 31 | print('{0:.4f}'.format(gini_impurity([1, 1, 0, 1, 0]))) 32 | print('{0:.4f}'.format(gini_impurity([1, 1, 0, 1, 0, 0]))) 33 | print('{0:.4f}'.format(gini_impurity([1, 1, 1, 1]))) 34 | 35 | 36 | # Plot entropy 
in binary case 37 | pos_fraction = np.linspace(0.00, 1.00, 1000) 38 | ent = - (pos_fraction * np.log2(pos_fraction) + (1 - pos_fraction) * np.log2(1 - pos_fraction)) 39 | plt.plot(pos_fraction, ent) 40 | plt.xlabel('Positive fraction') 41 | plt.ylabel('Entropy') 42 | plt.ylim(0, 1) 43 | plt.show() 44 | 45 | 46 | # Given labels of a data set, the entropy calculation function 47 | def entropy(labels): 48 | if not labels: 49 | return 0 50 | counts = np.unique(labels, return_counts=True)[1] 51 | fractions = counts / float(len(labels)) 52 | return - np.sum(fractions * np.log2(fractions)) 53 | 54 | print('{0:.4f}'.format(entropy([1, 1, 0, 1, 0]))) 55 | print('{0:.4f}'.format(entropy([1, 1, 0, 1, 0, 0]))) 56 | print('{0:.4f}'.format(entropy([1, 1, 1, 1]))) 57 | 58 | 59 | def information_gain(y, mask, func=entropy): 60 | s1 = np.sum(mask) 61 | s2 = mask.size - s1 62 | if (s1 == 0 | s2 == 0): return 0 63 | return func(y) - s1 / float(s1 + s2) * func(y[mask]) - s2 / float(s1 + s2) * func(y[np.logical_not(mask)]) 64 | 65 | 66 | criterion_function = {'gini': gini_impurity, 'entropy': entropy} 67 | def weighted_impurity(groups, criterion='gini'): 68 | """ 69 | Calculate weighted impurity of children after a split 70 | @param groups: list of children, and a child consists a list of class labels 71 | @param criterion: metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain 72 | @return: float, weighted impurity 73 | """ 74 | total = sum(len(group) for group in groups) 75 | weighted_sum = 0.0 76 | for group in groups: 77 | weighted_sum += len(group) / float(total) * criterion_function[criterion](group) 78 | return weighted_sum 79 | 80 | 81 | children_1 = [[1, 0, 1], [0, 1]] 82 | children_2 = [[1, 1], [0, 0, 1]] 83 | print('Entropy of #1 split: {0:.4f}'.format(weighted_impurity(children_1, 'entropy'))) 84 | print('Entropy of #2 split: {0:.4f}'.format(weighted_impurity(children_2, 'entropy'))) 85 | 86 | 87 | 88 | def gini_impurity_np(labels): 89 | # When the set is empty, it is also pure 90 | if labels.size == 0: 91 | return 0 92 | # Count the occurrences of each label 93 | counts = np.unique(labels, return_counts=True)[1] 94 | fractions = counts / float(len(labels)) 95 | return 1 - np.sum(fractions ** 2) 96 | 97 | 98 | def entropy_np(labels): 99 | # When the set is empty, it is also pure 100 | if labels.size == 0: 101 | return 0 102 | counts = np.unique(labels, return_counts=True)[1] 103 | fractions = counts / float(len(labels)) 104 | return - np.sum(fractions * np.log2(fractions)) 105 | 106 | 107 | criterion_function_np = {'gini': gini_impurity_np, 'entropy': entropy_np} 108 | def weighted_impurity(groups, criterion='gini'): 109 | """ 110 | Calculate weighted impurity of children after a split 111 | @param groups: list of children, and a child consists a list of class labels 112 | @param criterion: metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain 113 | @return: float, weighted impurity 114 | """ 115 | total = sum(len(group) for group in groups) 116 | weighted_sum = 0.0 117 | for group in groups: 118 | weighted_sum += len(group) / float(total) * criterion_function_np[criterion](group) 119 | return weighted_sum 120 | 121 | 122 | def split_node(X, y, index, value): 123 | """ 124 | Split data set X, y based on a feature and a value 125 | @param X: numpy.ndarray, dataset feature 126 | @param y: numpy.ndarray, dataset target 127 | @param index: int, index of the feature used for splitting 128 | @param value: value 
of the feature used for splitting 129 | @return: list, list: left and right child, a child is in the format of [X, y] 130 | """ 131 | x_index = X[:, index] 132 | # if this feature is numerical 133 | if X[0, index].dtype.kind in ['i', 'f']: 134 | mask = x_index >= value 135 | # if this feature is categorical 136 | else: 137 | mask = x_index == value 138 | # split into left and right child 139 | left = [X[~mask, :], y[~mask]] 140 | right = [X[mask, :], y[mask]] 141 | return left, right 142 | 143 | 144 | def get_best_split(X, y, criterion): 145 | """ 146 | Obtain the best splitting point and resulting children for the data set X, y 147 | @param X: numpy.ndarray, dataset feature 148 | @param y: numpy.ndarray, dataset target 149 | @param criterion: gini or entropy 150 | @return: dict {index: index of the feature, value: feature value, children: left and right children} 151 | """ 152 | best_index, best_value, best_score, children = None, None, 1, None 153 | for index in range(len(X[0])): 154 | for value in np.sort(np.unique(X[:, index])): 155 | groups = split_node(X, y, index, value) 156 | impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion) 157 | if impurity < best_score: 158 | best_index, best_value, best_score, children = index, value, impurity, groups 159 | return {'index': best_index, 'value': best_value, 'children': children} 160 | 161 | 162 | 163 | def get_leaf(labels): 164 | # Obtain the leaf as the majority of the labels 165 | return np.bincount(labels).argmax() 166 | 167 | 168 | 169 | def split(node, max_depth, min_size, depth, criterion): 170 | """ 171 | Split children of a node to construct new nodes or assign them terminals 172 | @param node: dict, with children info 173 | @param max_depth: int, maximal depth of the tree 174 | @param min_size: int, minimal samples required to further split a child 175 | @param depth: int, current depth of the node 176 | @param criterion: gini or entropy 177 | """ 178 | left, right = node['children'] 179 | del (node['children']) 180 | if left[1].size == 0: 181 | node['right'] = get_leaf(right[1]) 182 | return 183 | if right[1].size == 0: 184 | node['left'] = get_leaf(left[1]) 185 | return 186 | # Check if the current depth exceeds the maximal depth 187 | if depth >= max_depth: 188 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 189 | return 190 | # Check if the left child has enough samples 191 | if left[1].size <= min_size: 192 | node['left'] = get_leaf(left[1]) 193 | else: 194 | # It has enough samples, we further split it 195 | result = get_best_split(left[0], left[1], criterion) 196 | result_left, result_right = result['children'] 197 | if result_left[1].size == 0: 198 | node['left'] = get_leaf(result_right[1]) 199 | elif result_right[1].size == 0: 200 | node['left'] = get_leaf(result_left[1]) 201 | else: 202 | node['left'] = result 203 | split(node['left'], max_depth, min_size, depth + 1, criterion) 204 | # Check if the right child has enough samples 205 | if right[1].size <= min_size: 206 | node['right'] = get_leaf(right[1]) 207 | else: 208 | # It has enough samples, we further split it 209 | result = get_best_split(right[0], right[1], criterion) 210 | result_left, result_right = result['children'] 211 | if result_left[1].size == 0: 212 | node['right'] = get_leaf(result_right[1]) 213 | elif result_right[1].size == 0: 214 | node['right'] = get_leaf(result_left[1]) 215 | else: 216 | node['right'] = result 217 | split(node['right'], max_depth, min_size, depth + 1, criterion) 218 | 219 | 220 | def 
train_tree(X_train, y_train, max_depth, min_size, criterion='gini'): 221 | """ 222 | Construction of a tree starts here 223 | @param X_train: list of training samples (feature) 224 | @param y_train: list of training samples (target) 225 | @param max_depth: int, maximal depth of the tree 226 | @param min_size: int, minimal samples required to further split a child 227 | @param criterion: gini or entropy 228 | """ 229 | X = np.array(X_train) 230 | y = np.array(y_train) 231 | root = get_best_split(X, y, criterion) 232 | split(root, max_depth, min_size, 1, criterion) 233 | return root 234 | 235 | 236 | 237 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 238 | 'categorical': {'yes': 'is', 'no': 'is not'}} 239 | def visualize_tree(node, depth=0): 240 | if isinstance(node, dict): 241 | if node['value'].dtype.kind in ['i', 'f']: 242 | condition = CONDITION['numerical'] 243 | else: 244 | condition = CONDITION['categorical'] 245 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 246 | if 'left' in node: 247 | visualize_tree(node['left'], depth + 1) 248 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 249 | if 'right' in node: 250 | visualize_tree(node['right'], depth + 1) 251 | else: 252 | print('{}[{}]'.format(depth * ' ', node)) 253 | 254 | 255 | X_train = [['tech', 'professional'], 256 | ['fashion', 'student'], 257 | ['fashion', 'professional'], 258 | ['sports', 'student'], 259 | ['tech', 'student'], 260 | ['tech', 'retired'], 261 | ['sports', 'professional']] 262 | 263 | y_train = [1, 264 | 0, 265 | 0, 266 | 0, 267 | 1, 268 | 0, 269 | 1] 270 | 271 | tree = train_tree(X_train, y_train, 2, 2) 272 | visualize_tree(tree) 273 | 274 | 275 | 276 | 277 | X_train_n = [[6, 7], 278 | [2, 4], 279 | [7, 2], 280 | [3, 6], 281 | [4, 7], 282 | [5, 2], 283 | [1, 6], 284 | [2, 0], 285 | [6, 3], 286 | [4, 1]] 287 | 288 | y_train_n = [0, 289 | 0, 290 | 0, 291 | 0, 292 | 0, 293 | 1, 294 | 1, 295 | 1, 296 | 1, 297 | 1] 298 | 299 | tree = train_tree(X_train_n, y_train_n, 2, 2) 300 | visualize_tree(tree) 301 | 302 | 303 | from sklearn.tree import DecisionTreeClassifier 304 | tree_sk = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=2) 305 | tree_sk.fit(X_train_n, y_train_n) 306 | 307 | from sklearn.tree import export_graphviz 308 | export_graphviz(tree_sk, out_file='tree.dot', feature_names=['X1', 'X2'], impurity=False, filled=True, class_names=['0', '1']) 309 | 310 | 311 | -------------------------------------------------------------------------------- /Chapter07/encoding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.feature_extraction import DictVectorizer 8 | 9 | 10 | X_dict = [{'interest': 'tech', 'occupation': 'professional'}, 11 | {'interest': 'fashion', 'occupation': 'student'}, 12 | {'interest': 'fashion', 'occupation': 'professional'}, 13 | {'interest': 'sports', 'occupation': 'student'}, 14 | {'interest': 'tech', 'occupation': 'student'}, 15 | {'interest': 'tech', 'occupation': 'retired'}, 16 | {'interest': 'sports', 'occupation': 'professional'}] 17 | 18 | dict_one_hot_encoder = DictVectorizer(sparse=False) 19 | X_encoded = dict_one_hot_encoder.fit_transform(X_dict) 20 | print(X_encoded) 21 | 22 | 
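# For illustration only: a rough pandas equivalent of the same one-hot encoding
# (df_illustration and the get_dummies call are an added sketch, not part of the
# original DictVectorizer workflow). With DictVectorizer's default alphabetical
# feature ordering, the encoded columns above are: interest=fashion, interest=sports,
# interest=tech, occupation=professional, occupation=retired, occupation=student.
import pandas as pd
df_illustration = pd.DataFrame(X_dict)
print(pd.get_dummies(df_illustration, columns=['interest', 'occupation']))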
print(dict_one_hot_encoder.vocabulary_) 23 | 24 | 25 | new_dict = [{'interest': 'sports', 'occupation': 'retired'}] 26 | new_encoded = dict_one_hot_encoder.transform(new_dict) 27 | print(new_encoded) 28 | 29 | print(dict_one_hot_encoder.inverse_transform(new_encoded)) 30 | 31 | 32 | # new category not encountered before 33 | new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'}, 34 | {'interest': 'tech', 'occupation': 'unseen_occupation'}] 35 | new_encoded = dict_one_hot_encoder.transform(new_dict) 36 | print(new_encoded) 37 | 38 | 39 | 40 | import pandas as pd 41 | df = pd.DataFrame({'score': ['low', 42 | 'high', 43 | 'medium', 44 | 'medium', 45 | 'low']}) 46 | print(df) 47 | 48 | mapping = {'low':1, 'medium':2, 'high':3} 49 | df['score'] = df['score'].replace(mapping) 50 | 51 | print(df) -------------------------------------------------------------------------------- /Chapter07/logistic_function.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | 9 | 10 | def sigmoid(input): 11 | return 1.0 / (1 + np.exp(-input)) 12 | 13 | 14 | import matplotlib.pyplot as plt 15 | z = np.linspace(-8, 8, 1000) 16 | y = sigmoid(z) 17 | plt.plot(z, y) 18 | plt.axhline(y=0, ls='dotted', color='k') 19 | plt.axhline(y=0.5, ls='dotted', color='k') 20 | plt.axhline(y=1, ls='dotted', color='k') 21 | plt.yticks([0.0, 0.25, 0.5, 0.75, 1.0]) 22 | plt.xlabel('z') 23 | plt.ylabel('y(z)') 24 | plt.show() 25 | 26 | 27 | # plot sample cost vs y_hat (prediction), for y (truth) = 1 28 | y_hat = np.linspace(0, 1, 1000) 29 | cost = -np.log(y_hat) 30 | plt.plot(y_hat, cost) 31 | plt.xlabel('Prediction') 32 | plt.ylabel('Cost') 33 | plt.xlim(0, 1) 34 | plt.ylim(0, 7) 35 | plt.show() 36 | 37 | # plot sample cost vs y_hat (prediction), for y (truth) = 0 38 | y_hat = np.linspace(0, 1, 1000) 39 | cost = -np.log(1 - y_hat) 40 | plt.plot(y_hat, cost) 41 | plt.xlabel('Prediction') 42 | plt.ylabel('Cost') 43 | plt.xlim(0, 1) 44 | plt.ylim(0, 7) 45 | plt.show() 46 | 47 | -------------------------------------------------------------------------------- /Chapter07/logistic_regression_from_scratch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | 9 | def sigmoid(input): 10 | return 1.0 / (1 + np.exp(-input)) 11 | 12 | 13 | 14 | # Gradient descent based logistic regression from scratch 15 | def compute_prediction(X, weights): 16 | """ Compute the prediction y_hat based on current weights 17 | Args: 18 | X (numpy.ndarray) 19 | weights (numpy.ndarray) 20 | Returns: 21 | numpy.ndarray, y_hat of X under weights 22 | """ 23 | z = np.dot(X, weights) 24 | predictions = sigmoid(z) 25 | return predictions 26 | 27 | def update_weights_gd(X_train, y_train, weights, learning_rate): 28 | """ Update weights by one step 29 | Args: 30 | X_train, y_train (numpy.ndarray, training data set) 31 | weights (numpy.ndarray) 32 | learning_rate (float) 33 | Returns: 34 | numpy.ndarray, updated weights 35 | """ 36 | predictions = compute_prediction(X_train, weights) 37 | weights_delta = np.dot(X_train.T, y_train - 
predictions) 38 | m = y_train.shape[0] 39 | weights += learning_rate / float(m) * weights_delta 40 | return weights 41 | 42 | def compute_cost(X, y, weights): 43 | """ Compute the cost J(w) 44 | Args: 45 | X, y (numpy.ndarray, data set) 46 | weights (numpy.ndarray) 47 | Returns: 48 | float 49 | """ 50 | predictions = compute_prediction(X, weights) 51 | cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions)) 52 | return cost 53 | 54 | def train_logistic_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 55 | """ Train a logistic regression model 56 | Args: 57 | X_train, y_train (numpy.ndarray, training data set) 58 | max_iter (int, number of iterations) 59 | learning_rate (float) 60 | fit_intercept (bool, with an intercept w0 or not) 61 | Returns: 62 | numpy.ndarray, learned weights 63 | """ 64 | if fit_intercept: 65 | intercept = np.ones((X_train.shape[0], 1)) 66 | X_train = np.hstack((intercept, X_train)) 67 | weights = np.zeros(X_train.shape[1]) 68 | for iteration in range(max_iter): 69 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 70 | # Check the cost for every 100 (for example) iterations 71 | if iteration % 100 == 0: 72 | print(compute_cost(X_train, y_train, weights)) 73 | return weights 74 | 75 | def predict(X, weights): 76 | if X.shape[1] == weights.shape[0] - 1: 77 | intercept = np.ones((X.shape[0], 1)) 78 | X = np.hstack((intercept, X)) 79 | return compute_prediction(X, weights) 80 | 81 | 82 | # A example 83 | X_train = np.array([[6, 7], 84 | [2, 4], 85 | [3, 6], 86 | [4, 7], 87 | [1, 6], 88 | [5, 2], 89 | [2, 0], 90 | [6, 3], 91 | [4, 1], 92 | [7, 2]]) 93 | 94 | y_train = np.array([0, 95 | 0, 96 | 0, 97 | 0, 98 | 0, 99 | 1, 100 | 1, 101 | 1, 102 | 1, 103 | 1]) 104 | 105 | weights = train_logistic_regression(X_train, y_train, max_iter=1000, learning_rate=0.1, fit_intercept=True) 106 | 107 | X_test = np.array([[6, 1], 108 | [1, 3], 109 | [3, 1], 110 | [4, 5]]) 111 | 112 | predictions = predict(X_test, weights) 113 | 114 | import matplotlib.pyplot as plt 115 | plt.scatter(X_train[:,0], X_train[:,1], c=['b']*5+['k']*5, marker='o') 116 | colours = ['k' if prediction >= 0.5 else 'b' for prediction in predictions] 117 | plt.scatter(X_test[:,0], X_test[:,1], marker='*', c=colours) 118 | plt.xlabel('x1') 119 | plt.ylabel('x2') 120 | plt.show() 121 | 122 | 123 | 124 | 125 | import pandas as pd 126 | n_rows = 300000 127 | df = pd.read_csv("train", nrows=n_rows) 128 | 129 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 130 | Y = df['click'].values 131 | 132 | n_train = 100000 133 | X_train = X[:n_train] 134 | Y_train = Y[:n_train] 135 | X_test = X[n_train:] 136 | Y_test = Y[n_train:] 137 | 138 | from sklearn.preprocessing import OneHotEncoder 139 | enc = OneHotEncoder(handle_unknown='ignore') 140 | X_train_enc = enc.fit_transform(X_train) 141 | 142 | X_test_enc = enc.transform(X_test) 143 | 144 | 145 | import timeit 146 | start_time = timeit.default_timer() 147 | weights = train_logistic_regression(X_train_enc.toarray(), Y_train, max_iter=10000, learning_rate=0.01, fit_intercept=True) 148 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 149 | 150 | 151 | pred = predict(X_test_enc.toarray(), weights) 152 | from sklearn.metrics import roc_auc_score 153 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred))) 154 | 155 | 156 | 157 | 158 | 159 | def update_weights_sgd(X_train, y_train, weights, learning_rate): 160 | 
""" One weight update iteration: moving weights by one step based on each individual sample 161 | Args: 162 | X_train, y_train (numpy.ndarray, training data set) 163 | weights (numpy.ndarray) 164 | learning_rate (float) 165 | Returns: 166 | numpy.ndarray, updated weights 167 | """ 168 | for X_each, y_each in zip(X_train, y_train): 169 | prediction = compute_prediction(X_each, weights) 170 | weights_delta = X_each.T * (y_each - prediction) 171 | weights += learning_rate * weights_delta 172 | return weights 173 | 174 | def train_logistic_regression_sgd(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 175 | """ Train a logistic regression model via SGD 176 | Args: 177 | X_train, y_train (numpy.ndarray, training data set) 178 | max_iter (int, number of iterations) 179 | learning_rate (float) 180 | fit_intercept (bool, with an intercept w0 or not) 181 | Returns: 182 | numpy.ndarray, learned weights 183 | """ 184 | if fit_intercept: 185 | intercept = np.ones((X_train.shape[0], 1)) 186 | X_train = np.hstack((intercept, X_train)) 187 | weights = np.zeros(X_train.shape[1]) 188 | for iteration in range(max_iter): 189 | weights = update_weights_sgd(X_train, y_train, weights, learning_rate) 190 | # Check the cost for every 2 (for example) iterations 191 | if iteration % 2 == 0: 192 | print(compute_cost(X_train, y_train, weights)) 193 | return weights 194 | 195 | 196 | # Train the SGD model based on 100000 samples 197 | start_time = timeit.default_timer() 198 | weights = train_logistic_regression_sgd(X_train_enc.toarray(), Y_train, max_iter=10, learning_rate=0.01, fit_intercept=True) 199 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 200 | pred = predict(X_test_enc.toarray(), weights) 201 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred))) 202 | 203 | -------------------------------------------------------------------------------- /Chapter07/logistic_regression_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | import pandas as pd 5 | n_rows = 300000 6 | df = pd.read_csv("train", nrows=n_rows) 7 | 8 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 9 | Y = df['click'].values 10 | 11 | n_train = int(n_rows * 0.9) 12 | X_train = X[:n_train] 13 | Y_train = Y[:n_train] 14 | X_test = X[n_train:] 15 | Y_test = Y[n_train:] 16 | 17 | from sklearn.preprocessing import OneHotEncoder 18 | enc = OneHotEncoder(handle_unknown='ignore') 19 | X_train_enc = enc.fit_transform(X_train) 20 | X_test_enc = enc.transform(X_test) 21 | 22 | 23 | n_features = int(X_train_enc.toarray().shape[1]) 24 | learning_rate = 0.001 25 | n_iter = 20 26 | 27 | 28 | # Input and Target placeholders 29 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 30 | y = tf.placeholder(tf.float32, shape=[None]) 31 | 32 | # Build the logistic regression model 33 | W = tf.Variable(tf.zeros([n_features, 1])) 34 | b = tf.Variable(tf.zeros([1])) 35 | 36 | logits = tf.add(tf.matmul(x, W), b)[:, 0] 37 | pred = tf.nn.sigmoid(logits) 38 | 39 | cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 40 | auc = tf.metrics.auc(tf.cast(y, tf.int64), pred)[1] 41 | 42 | 43 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) 44 | 45 | 46 | 47 | # Initialize the variables (i.e. 
assign their default value) 48 | init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 49 | 50 | 51 | batch_size = 1000 52 | 53 | import numpy as np 54 | indices = list(range(n_train)) 55 | 56 | def gen_batch(indices): 57 | np.random.shuffle(indices) 58 | for batch_i in range(int(n_train / batch_size)): 59 | batch_index = indices[batch_i*batch_size: (batch_i+1)*batch_size] 60 | yield X_train_enc[batch_index], Y_train[batch_index] 61 | 62 | 63 | sess = tf.Session() 64 | 65 | sess.run(init_vars) 66 | 67 | 68 | for i in range(1, n_iter+1): 69 | avg_cost = 0. 70 | for X_batch, Y_batch in gen_batch(indices): 71 | _, c = sess.run([optimizer, cost], feed_dict={x: X_batch.toarray(), y: Y_batch}) 72 | avg_cost += c / int(n_train / batch_size) 73 | print('Iteration %i, training loss: %f' % (i, avg_cost)) 74 | 75 | 76 | auc_test = sess.run(auc, feed_dict={x: X_test_enc.toarray(), y: Y_test}) 77 | print("AUC of ROC on testing set:", auc_test) 78 | -------------------------------------------------------------------------------- /Chapter07/random_forest_feature_selection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.metrics import roc_auc_score 9 | 10 | 11 | import pandas as pd 12 | n_rows = 100000 13 | df = pd.read_csv("train", nrows=n_rows) 14 | 15 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 16 | Y = df['click'].values 17 | 18 | 19 | X_train = X 20 | Y_train = Y 21 | 22 | 23 | from sklearn.preprocessing import OneHotEncoder 24 | enc = OneHotEncoder(handle_unknown='ignore') 25 | X_train_enc = enc.fit_transform(X_train) 26 | 27 | 28 | 29 | # Feature selection with random forest 30 | 31 | from sklearn.ensemble import RandomForestClassifier 32 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 33 | random_forest.fit(X_train_enc.toarray(), Y_train) 34 | 35 | 36 | 37 | 38 | feature_imp = random_forest.feature_importances_ 39 | print(feature_imp) 40 | 41 | # bottom 10 weights and the corresponding 10 least important features 42 | feature_names = enc.get_feature_names() 43 | print(np.sort(feature_imp)[:10]) 44 | bottom_10 = np.argsort(feature_imp)[:10] 45 | print('10 least important features are:\n', feature_names[bottom_10]) 46 | 47 | # top 10 weights and the corresponding 10 most important features 48 | print(np.sort(feature_imp)[-10:]) 49 | top_10 = np.argsort(feature_imp)[-10:] 50 | print('10 most important features are:\n', feature_names[top_10]) 51 | 52 | -------------------------------------------------------------------------------- /Chapter07/scikit_logistic_regression.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.metrics import roc_auc_score 9 | 10 | 11 | import pandas as pd 12 | n_rows = 300000 13 | df = pd.read_csv("train", nrows=n_rows) 14 | 15 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 16 | Y = df['click'].values 17 | 18 | n_train = 100000 19 | X_train = 
X[:n_train] 20 | Y_train = Y[:n_train] 21 | X_test = X[n_train:] 22 | Y_test = Y[n_train:] 23 | 24 | from sklearn.preprocessing import OneHotEncoder 25 | enc = OneHotEncoder(handle_unknown='ignore') 26 | X_train_enc = enc.fit_transform(X_train) 27 | 28 | X_test_enc = enc.transform(X_test) 29 | 30 | # # Use scikit-learn package 31 | from sklearn.linear_model import SGDClassifier 32 | sgd_lr = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=10, learning_rate='constant', eta0=0.01) 33 | sgd_lr.fit(X_train_enc.toarray(), Y_train) 34 | 35 | pred = sgd_lr.predict_proba(X_test_enc.toarray())[:, 1] 36 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred))) 37 | 38 | 39 | 40 | # Feature selection with L1 regularization 41 | 42 | sgd_lr_l1 = SGDClassifier(loss='log', penalty='l1', alpha=0.0001, fit_intercept=True, n_iter=10, learning_rate='constant', eta0=0.01) 43 | sgd_lr_l1.fit(X_train_enc.toarray(), Y_train) 44 | 45 | coef_abs = np.abs(sgd_lr_l1.coef_) 46 | print(coef_abs) 47 | 48 | # bottom 10 weights and the corresponding 10 least important features 49 | print(np.sort(coef_abs)[0][:10]) 50 | 51 | feature_names = enc.get_feature_names() 52 | bottom_10 = np.argsort(coef_abs)[0][:10] 53 | print('10 least important features are:\n', feature_names[bottom_10]) 54 | 55 | # top 10 weights and the corresponding 10 most important features 56 | print(np.sort(coef_abs)[0][-10:]) 57 | top_10 = np.argsort(coef_abs)[0][-10:] 58 | print('10 most important features are:\n', feature_names[top_10]) 59 | 60 | 61 | 62 | # Online learning 63 | 64 | 65 | n_rows = 100000 * 11 66 | df = pd.read_csv("train", nrows=n_rows) 67 | 68 | X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values 69 | Y = df['click'].values 70 | 71 | n_train = 100000 * 10 72 | X_train = X[:n_train] 73 | Y_train = Y[:n_train] 74 | X_test = X[n_train:] 75 | Y_test = Y[n_train:] 76 | 77 | from sklearn.preprocessing import OneHotEncoder 78 | enc = OneHotEncoder(handle_unknown='ignore') 79 | enc.fit(X_train) 80 | 81 | 82 | # The number of iterations is set to 1 if using partial_fit. 
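# (Each call to partial_fit below makes a single pass over the 100,000-sample chunk
# it is given, so the classifier is updated incrementally, chunk by chunk, rather
# than being refit on all one million training samples at once.)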
83 | sgd_lr_online = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=1, learning_rate='constant', eta0=0.01) 84 | 85 | import timeit 86 | start_time = timeit.default_timer() 87 | 88 | 89 | # Use the first 1,000,000 samples for training, and the next 100,000 for testing 90 | for i in range(10): 91 | x_train = X_train[i*100000:(i+1)*100000] 92 | y_train = Y_train[i*100000:(i+1)*100000] 93 | x_train_enc = enc.transform(x_train) 94 | sgd_lr_online.partial_fit(x_train_enc.toarray(), y_train, classes=[0, 1]) 95 | 96 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 97 | 98 | x_test_enc = enc.transform(X_test) 99 | 100 | pred = sgd_lr_online.predict_proba(x_test_enc.toarray())[:, 1] 101 | print('Training samples: {0}, AUC on testing set: {1:.3f}'.format(n_train * 10, roc_auc_score(Y_test, pred))) 102 | 103 | 104 | 105 | # Multiclass classification with logistic regression 106 | 107 | from sklearn import datasets 108 | digits = datasets.load_digits() 109 | n_samples = len(digits.images) 110 | X = digits.images.reshape((n_samples, -1)) 111 | Y = digits.target 112 | 113 | from sklearn.model_selection import train_test_split 114 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 115 | 116 | from sklearn.model_selection import GridSearchCV 117 | parameters = {'penalty': ['l2', None], 118 | 'alpha': [1e-07, 1e-06, 1e-05, 1e-04], 119 | 'eta0': [0.01, 0.1, 1, 10]} 120 | 121 | sgd_lr = SGDClassifier(loss='log', learning_rate='constant', eta0=0.01, fit_intercept=True, n_iter=10) 122 | 123 | grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=5) 124 | 125 | grid_search.fit(X_train, Y_train) 126 | print(grid_search.best_params_) 127 | 128 | sgd_lr_best = grid_search.best_estimator_ 129 | accuracy = sgd_lr_best.score(X_test, Y_test) 130 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 131 | -------------------------------------------------------------------------------- /Chapter08/ctr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 8: Scaling Up Learning On Massive Click Logs 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from pyspark.sql import SparkSession 8 | 9 | 10 | spark = SparkSession\ 11 | .builder\ 12 | .appName("CTR")\ 13 | .getOrCreate() 14 | 15 | 16 | 17 | from pyspark.sql.types import StructField, StringType, StructType, IntegerType 18 | 19 | schema = StructType([ 20 | StructField("id", StringType(), True), 21 | StructField("click", IntegerType(), True), 22 | StructField("hour", IntegerType(), True), 23 | StructField("C1", StringType(), True), 24 | StructField("banner_pos", StringType(), True), 25 | StructField("site_id", StringType(), True), 26 | StructField("site_domain", StringType(), True), 27 | StructField("site_category", StringType(), True), 28 | StructField("app_id", StringType(), True), 29 | StructField("app_domain", StringType(), True), 30 | StructField("app_category", StringType(), True), 31 | StructField("device_id", StringType(), True), 32 | StructField("device_ip", StringType(), True), 33 | StructField("device_model", StringType(), True), 34 | StructField("device_type", StringType(), True), 35 | StructField("device_conn_type", StringType(), True), 36 | StructField("C14", StringType(), True), 37 | StructField("C15", StringType(), True), 38 | StructField("C16", StringType(), True), 39 | StructField("C17", StringType(), True), 40 
| StructField("C18", StringType(), True), 41 | StructField("C19", StringType(), True), 42 | StructField("C20", StringType(), True), 43 | StructField("C21", StringType(), True), 44 | ]) 45 | 46 | 47 | # Download data in: https://www.kaggle.com/c/avazu-ctr-prediction/data 48 | df = spark.read.csv("filepath/train", schema=schema, header=True) 49 | 50 | 51 | df.printSchema() 52 | 53 | df.count() 54 | 55 | df = df.drop('id').drop('hour').drop('device_id').drop('device_ip') 56 | 57 | df = df.withColumnRenamed("click", "label") 58 | 59 | df.columns 60 | 61 | 62 | df_train, df_test = df.randomSplit([0.7, 0.3], 42) 63 | 64 | df_train.cache() 65 | df_train.count() 66 | 67 | df_test.cache() 68 | df_test.count() 69 | 70 | 71 | 72 | categorical = df_train.columns 73 | categorical.remove('label') 74 | print(categorical) 75 | 76 | 77 | 78 | 79 | 80 | from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator 81 | 82 | 83 | 84 | 85 | indexers = [ 86 | StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("keep") 87 | for c in categorical 88 | ] 89 | 90 | encoder = OneHotEncoderEstimator( 91 | inputCols=[indexer.getOutputCol() for indexer in indexers], 92 | outputCols=[ 93 | "{0}_encoded".format(indexer.getOutputCol()) for indexer in indexers] 94 | ) 95 | 96 | assembler = VectorAssembler( 97 | inputCols=encoder.getOutputCols(), 98 | outputCol="features" 99 | ) 100 | 101 | stages = indexers + [encoder, assembler] 102 | 103 | from pyspark.ml import Pipeline 104 | 105 | 106 | pipeline = Pipeline(stages=stages) 107 | 108 | 109 | one_hot_encoder = pipeline.fit(df_train) 110 | 111 | 112 | df_train_encoded = one_hot_encoder.transform(df_train) 113 | 114 | 115 | df_train_encoded.show() 116 | 117 | df_train_encoded = df_train_encoded.select(["label", "features"]) 118 | 119 | df_train_encoded.show() 120 | 121 | df_train_encoded.cache() 122 | 123 | df_train.unpersist() 124 | 125 | 126 | 127 | df_test_encoded = one_hot_encoder.transform(df_test) 128 | 129 | 130 | 131 | df_test_encoded = df_test_encoded.select(["label", "features"]) 132 | 133 | df_test_encoded.show() 134 | 135 | df_test_encoded.cache() 136 | 137 | df_test.unpersist() 138 | 139 | 140 | 141 | from pyspark.ml.classification import LogisticRegression 142 | 143 | classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) 144 | 145 | lr_model = classifier.fit(df_train_encoded) 146 | 147 | 148 | df_train_encoded.unpersist() 149 | 150 | predictions = lr_model.transform(df_test_encoded) 151 | 152 | df_test_encoded.unpersist() 153 | 154 | predictions.cache() 155 | 156 | predictions.show() 157 | 158 | 159 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 160 | 161 | ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC") 162 | print(ev.evaluate(predictions)) 163 | 164 | 165 | spark.stop() 166 | -------------------------------------------------------------------------------- /Chapter08/ctr_hashing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 8: Scaling Up Learning On Massive Click Logs 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from pyspark.sql import SparkSession 8 | 9 | 10 | spark = SparkSession\ 11 | .builder\ 12 | .appName("CTR")\ 13 | .getOrCreate() 14 | 15 | 16 | 17 | 18 | 19 | from pyspark.sql.types import StructField, StringType, StructType, IntegerType 20 | 21 | schema = 
StructType([ 22 | StructField("id", StringType(), True), 23 | StructField("click", IntegerType(), True), 24 | StructField("hour", IntegerType(), True), 25 | StructField("C1", StringType(), True), 26 | StructField("banner_pos", StringType(), True), 27 | StructField("site_id", StringType(), True), 28 | StructField("site_domain", StringType(), True), 29 | StructField("site_category", StringType(), True), 30 | StructField("app_id", StringType(), True), 31 | StructField("app_domain", StringType(), True), 32 | StructField("app_category", StringType(), True), 33 | StructField("device_id", StringType(), True), 34 | StructField("device_ip", StringType(), True), 35 | StructField("device_model", StringType(), True), 36 | StructField("device_type", StringType(), True), 37 | StructField("device_conn_type", StringType(), True), 38 | StructField("C14", StringType(), True), 39 | StructField("C15", StringType(), True), 40 | StructField("C16", StringType(), True), 41 | StructField("C17", StringType(), True), 42 | StructField("C18", StringType(), True), 43 | StructField("C19", StringType(), True), 44 | StructField("C20", StringType(), True), 45 | StructField("C21", StringType(), True), 46 | ]) 47 | 48 | 49 | 50 | df = spark.read.csv("file:///Users/hayden/dev/project/my_python2_book/ch7/train", schema=schema, header=True) 51 | 52 | 53 | df = df.drop('id').drop('hour').drop('device_id').drop('device_ip') 54 | 55 | df = df.withColumnRenamed("click", "label") 56 | 57 | 58 | df_train, df_test = df.randomSplit([0.7, 0.3], 42) 59 | 60 | df_train.cache() 61 | 62 | df_test.cache() 63 | 64 | 65 | 66 | categorical = df_train.columns 67 | categorical.remove('label') 68 | print(categorical) 69 | 70 | 71 | 72 | from pyspark.ml.feature import FeatureHasher 73 | hasher = FeatureHasher(numFeatures=10000, inputCols=categorical, 74 | outputCol="features") 75 | 76 | hasher.transform(df_train).select("features").show() 77 | 78 | from pyspark.ml.classification import LogisticRegression 79 | 80 | classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) 81 | 82 | stages = [hasher, classifier] 83 | 84 | from pyspark.ml import Pipeline 85 | 86 | pipeline = Pipeline(stages=stages) 87 | 88 | 89 | model = pipeline.fit(df_train) 90 | 91 | predictions = model.transform(df_test) 92 | 93 | 94 | predictions.cache() 95 | 96 | 97 | 98 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 99 | 100 | ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC") 101 | print(ev.evaluate(predictions)) 102 | 103 | 104 | spark.stop() 105 | -------------------------------------------------------------------------------- /Chapter08/ctr_interaction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 8: Scaling Up Learning On Massive Click Logs 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from pyspark.sql import SparkSession 8 | 9 | 10 | spark = SparkSession\ 11 | .builder\ 12 | .appName("CTR")\ 13 | .getOrCreate() 14 | 15 | 16 | 17 | 18 | 19 | from pyspark.sql.types import StructField, StringType, StructType, IntegerType 20 | 21 | schema = StructType([ 22 | StructField("id", StringType(), True), 23 | StructField("click", IntegerType(), True), 24 | StructField("hour", IntegerType(), True), 25 | StructField("C1", StringType(), True), 26 | StructField("banner_pos", StringType(), True), 27 | StructField("site_id", StringType(), True), 28 | 
StructField("site_domain", StringType(), True), 29 | StructField("site_category", StringType(), True), 30 | StructField("app_id", StringType(), True), 31 | StructField("app_domain", StringType(), True), 32 | StructField("app_category", StringType(), True), 33 | StructField("device_id", StringType(), True), 34 | StructField("device_ip", StringType(), True), 35 | StructField("device_model", StringType(), True), 36 | StructField("device_type", StringType(), True), 37 | StructField("device_conn_type", StringType(), True), 38 | StructField("C14", StringType(), True), 39 | StructField("C15", StringType(), True), 40 | StructField("C16", StringType(), True), 41 | StructField("C17", StringType(), True), 42 | StructField("C18", StringType(), True), 43 | StructField("C19", StringType(), True), 44 | StructField("C20", StringType(), True), 45 | StructField("C21", StringType(), True), 46 | ]) 47 | 48 | 49 | 50 | df = spark.read.csv("file:///Users/hayden/dev/project/my_python2_book/ch7/train", schema=schema, header=True) 51 | 52 | 53 | df = df.drop('id').drop('hour').drop('device_id').drop('device_ip') 54 | 55 | df = df.withColumnRenamed("click", "label") 56 | 57 | 58 | 59 | 60 | df_train, df_test = df.randomSplit([0.7, 0.3], 42) 61 | 62 | df_train.cache() 63 | 64 | df_test.cache() 65 | 66 | 67 | 68 | categorical = df_train.columns 69 | categorical.remove('label') 70 | print(categorical) 71 | 72 | 73 | 74 | cat_inter = ['C14', 'C15'] 75 | 76 | concat = '+'.join(categorical) 77 | interaction = ':'.join(cat_inter) 78 | formula = "label ~ " + concat + '+' + interaction 79 | 80 | print(formula) 81 | 82 | from pyspark.ml.feature import RFormula 83 | interactor = RFormula( 84 | formula=formula, 85 | featuresCol="features", 86 | labelCol="label").setHandleInvalid("keep") 87 | 88 | interactor.fit(df_train).transform(df_train).select("features").show() 89 | 90 | from pyspark.ml.classification import LogisticRegression 91 | 92 | classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) 93 | 94 | stages = [interactor, classifier] 95 | 96 | from pyspark.ml import Pipeline 97 | 98 | pipeline = Pipeline(stages=stages) 99 | 100 | 101 | model = pipeline.fit(df_train) 102 | 103 | predictions = model.transform(df_test) 104 | 105 | 106 | predictions.cache() 107 | 108 | predictions.show() 109 | 110 | 111 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 112 | 113 | ev = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", metricName = "areaUnderROC") 114 | print(ev.evaluate(predictions)) 115 | 116 | 117 | spark.stop() 118 | 119 | -------------------------------------------------------------------------------- /Chapter09/20051201_20051210.csv: -------------------------------------------------------------------------------- 1 | Date,Open,High,Low,Close,Adj Close,Volume 2 | 2005-12-01,10806.030273,10934.900391,10806.030273,10912.570313,10912.570313,256980000 3 | 2005-12-02,10912.009766,10921.370117,10861.660156,10877.509766,10877.509766,214900000 4 | 2005-12-05,10876.950195,10876.950195,10810.669922,10835.009766,10835.009766,237340000 5 | 2005-12-06,10835.410156,10936.200195,10835.410156,10856.860352,10856.860352,264630000 6 | 2005-12-07,10856.860352,10868.059570,10764.009766,10810.910156,10810.910156,243490000 7 | 2005-12-08,10808.429688,10847.250000,10729.669922,10755.120117,10755.120117,253290000 8 | 2005-12-09,10751.759766,10805.950195,10729.910156,10778.580078,10778.580078,238930000 9 | -------------------------------------------------------------------------------- 
/Chapter09/decision_tree_regression.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | import numpy as np 7 | 8 | 9 | # Mean squared error calculation function given continuous targets of a data set, 10 | def mse(targets): 11 | # When the set is empty 12 | if targets.size == 0: 13 | return 0 14 | return np.var(targets) 15 | 16 | def weighted_mse(groups): 17 | """ Calculate weighted MSE of children after a split 18 | Args: 19 | groups (list of children, and a child consists a list of targets) 20 | Returns: 21 | float, weighted impurity 22 | """ 23 | total = sum(len(group) for group in groups) 24 | weighted_sum = 0.0 25 | for group in groups: 26 | weighted_sum += len(group) / float(total) * mse(group) 27 | return weighted_sum 28 | 29 | 30 | print('{0:.4f}'.format(mse(np.array([1, 2, 3])))) 31 | print('{0:.4f}'.format(weighted_mse([np.array([1, 2, 3]), np.array([1, 2])]))) 32 | 33 | print('type-semi: {0:.4f}'.format(weighted_mse([np.array([600, 400, 700]), np.array([700, 800])]))) 34 | print('bedroom-2: {0:.4f}'.format(weighted_mse([np.array([700, 400]), np.array([600, 800, 700])]))) 35 | print('bedroom-3: {0:.4f}'.format(weighted_mse([np.array([600, 800]), np.array([700, 400, 700])]))) 36 | print('bedroom-4: {0:.4f}'.format(weighted_mse([np.array([700]), np.array([600, 700, 800, 400])]))) 37 | 38 | 39 | print('bedroom-2: {0:.4f}'.format(weighted_mse([np.array([]), np.array([600, 400, 700])]))) 40 | print('bedroom-3: {0:.4f}'.format(weighted_mse([np.array([400]), np.array([600, 700])]))) 41 | print('bedroom-4: {0:.4f}'.format(weighted_mse([np.array([400, 600]), np.array([700])]))) 42 | 43 | 44 | 45 | 46 | def split_node(X, y, index, value): 47 | """ Split data set X, y based on a feature and a value 48 | Args: 49 | X, y (numpy.ndarray, data set) 50 | index (int, index of the feature used for splitting) 51 | value (value of the feature used for splitting) 52 | Returns: 53 | list, list: left and right child, a child is in the format of [X, y] 54 | """ 55 | x_index = X[:, index] 56 | # if this feature is numerical 57 | if type(X[0, index]) in [int, float]: 58 | mask = x_index >= value 59 | # if this feature is categorical 60 | else: 61 | mask = x_index == value 62 | # split into left and right child 63 | left = [X[~mask, :], y[~mask]] 64 | right = [X[mask, :], y[mask]] 65 | return left, right 66 | 67 | 68 | def get_best_split(X, y): 69 | """ Obtain the best splitting point and resulting children for the data set X, y 70 | Args: 71 | X, y (numpy.ndarray, data set) 72 | criterion (gini or entropy) 73 | Returns: 74 | dict {index: index of the feature, value: feature value, children: left and right children} 75 | """ 76 | best_index, best_value, best_score, children = None, None, 1e10, None 77 | for index in range(len(X[0])): 78 | for value in np.sort(np.unique(X[:, index])): 79 | groups = split_node(X, y, index, value) 80 | impurity = weighted_mse([groups[0][1], groups[1][1]]) 81 | if impurity < best_score: 82 | best_index, best_value, best_score, children = index, value, impurity, groups 83 | return {'index': best_index, 'value': best_value, 'children': children} 84 | 85 | 86 | 87 | def get_leaf(targets): 88 | # Obtain the leaf as the mean of the targets 89 | return np.mean(targets) 90 | 91 | 92 | 93 | def split(node, max_depth, min_size, depth): 94 | """ Split 
children of a node to construct new nodes or assign them terminals 95 | Args: 96 | node (dict, with children info) 97 | max_depth (int, maximal depth of the tree) 98 | min_size (int, minimal samples required to further split a child) 99 | depth (int, current depth of the node) 100 | """ 101 | left, right = node['children'] 102 | del (node['children']) 103 | if left[1].size == 0: 104 | node['right'] = get_leaf(right[1]) 105 | return 106 | if right[1].size == 0: 107 | node['left'] = get_leaf(left[1]) 108 | return 109 | # Check if the current depth exceeds the maximal depth 110 | if depth >= max_depth: 111 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 112 | return 113 | # Check if the left child has enough samples 114 | if left[1].size <= min_size: 115 | node['left'] = get_leaf(left[1]) 116 | else: 117 | # It has enough samples, we further split it 118 | result = get_best_split(left[0], left[1]) 119 | result_left, result_right = result['children'] 120 | if result_left[1].size == 0: 121 | node['left'] = get_leaf(result_right[1]) 122 | elif result_right[1].size == 0: 123 | node['left'] = get_leaf(result_left[1]) 124 | else: 125 | node['left'] = result 126 | split(node['left'], max_depth, min_size, depth + 1) 127 | # Check if the right child has enough samples 128 | if right[1].size <= min_size: 129 | node['right'] = get_leaf(right[1]) 130 | else: 131 | # It has enough samples, we further split it 132 | result = get_best_split(right[0], right[1]) 133 | result_left, result_right = result['children'] 134 | if result_left[1].size == 0: 135 | node['right'] = get_leaf(result_right[1]) 136 | elif result_right[1].size == 0: 137 | node['right'] = get_leaf(result_left[1]) 138 | else: 139 | node['right'] = result 140 | split(node['right'], max_depth, min_size, depth + 1) 141 | 142 | 143 | def train_tree(X_train, y_train, max_depth, min_size): 144 | """ Construction of a tree starts here 145 | Args: 146 | X_train, y_train (list, list, training data) 147 | max_depth (int, maximal depth of the tree) 148 | min_size (int, minimal samples required to further split a child) 149 | """ 150 | root = get_best_split(X_train, y_train) 151 | split(root, max_depth, min_size, 1) 152 | return root 153 | 154 | 155 | 156 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 157 | 'categorical': {'yes': 'is', 'no': 'is not'}} 158 | def visualize_tree(node, depth=0): 159 | if isinstance(node, dict): 160 | if type(node['value']) in [int, float]: 161 | condition = CONDITION['numerical'] 162 | else: 163 | condition = CONDITION['categorical'] 164 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 165 | if 'left' in node: 166 | visualize_tree(node['left'], depth + 1) 167 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 168 | if 'right' in node: 169 | visualize_tree(node['right'], depth + 1) 170 | else: 171 | print('{}[{}]'.format(depth * ' ', node)) 172 | 173 | 174 | X_train = np.array([['semi', 3], 175 | ['detached', 2], 176 | ['detached', 3], 177 | ['semi', 2], 178 | ['semi', 4]], dtype=object) 179 | 180 | y_train = np.array([600, 700, 800, 400, 700]) 181 | 182 | tree = train_tree(X_train, y_train, 2, 2) 183 | visualize_tree(tree) 184 | 185 | 186 | 187 | # Directly use DecisionTreeRegressor from scikit-learn 188 | from sklearn import datasets 189 | boston = datasets.load_boston() 190 | 191 | num_test = 10 # the last 10 samples as testing set 192 | X_train = boston.data[:-num_test, :] 193 | y_train = 
boston.target[:-num_test] 194 | X_test = boston.data[-num_test:, :] 195 | y_test = boston.target[-num_test:] 196 | 197 | from sklearn.tree import DecisionTreeRegressor 198 | regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=3) 199 | 200 | regressor.fit(X_train, y_train) 201 | predictions = regressor.predict(X_test) 202 | print(predictions) 203 | print(y_test) 204 | 205 | 206 | from sklearn.ensemble import RandomForestRegressor 207 | regressor = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=3) 208 | regressor.fit(X_train, y_train) 209 | predictions = regressor.predict(X_test) 210 | print(predictions) 211 | 212 | 213 | 214 | 215 | 216 | import tensorflow as tf 217 | from tensorflow.contrib.tensor_forest.python import tensor_forest 218 | from tensorflow.python.ops import resources 219 | 220 | 221 | n_iter = 20 222 | n_features = int(X_train.shape[1]) 223 | n_trees = 10 224 | max_nodes = 30000 225 | 226 | 227 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 228 | y = tf.placeholder(tf.float32, shape=[None]) 229 | 230 | 231 | hparams = tensor_forest.ForestHParams(num_classes=1, regression=True, num_features=n_features, num_trees=n_trees, 232 | max_nodes=max_nodes, split_after_samples=30).fill() 233 | 234 | 235 | forest_graph = tensor_forest.RandomForestGraphs(hparams) 236 | 237 | 238 | train_op = forest_graph.training_graph(x, y) 239 | loss_op = forest_graph.training_loss(x, y) 240 | 241 | 242 | infer_op, _, _ = forest_graph.inference_graph(x) 243 | 244 | cost = tf.losses.mean_squared_error(labels=y, predictions=infer_op[:, 0]) 245 | 246 | 247 | 248 | init_vars = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), resources.initialize_resources(resources.shared_resources())) 249 | 250 | sess = tf.Session() 251 | 252 | sess.run(init_vars) 253 | 254 | 255 | for i in range(1, n_iter + 1): 256 | _, c = sess.run([train_op, cost], feed_dict={x: X_train, y: y_train}) 257 | print('Iteration %i, training loss: %f' % (i, c)) 258 | 259 | 260 | pred = sess.run(infer_op, feed_dict={x: X_test})[:, 0] 261 | print(pred) 262 | -------------------------------------------------------------------------------- /Chapter09/get_dji_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | 9 | 10 | # download from https://finance.yahoo.com/quote/%5EDJI/history?period1=1133413200&period2=1134190800&interval=1d&filter=history&frequency=1d 11 | mydata = pd.read_csv('20051201_20051210.csv', index_col='Date') 12 | 13 | 14 | 15 | 16 | def generate_features(df): 17 | """ 18 | Generate features for a stock/index based on historical price and performance 19 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adjusted Close" 20 | @return: dataframe, data set with new features 21 | """ 22 | df_new = pd.DataFrame() 23 | # 6 original features 24 | df_new['open'] = df['Open'] 25 | df_new['open_1'] = df['Open'].shift(1) 26 | df_new['close_1'] = df['Close'].shift(1) 27 | df_new['high_1'] = df['High'].shift(1) 28 | df_new['low_1'] = df['Low'].shift(1) 29 | df_new['volume_1'] = df['Volume'].shift(1) 30 | # 31 generated features 31 | # average price 32 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 33 | df_new['avg_price_30'] = 
df['Close'].rolling(21).mean().shift(1) 34 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 35 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 36 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 37 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 38 | # average volume 39 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 40 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 41 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 42 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 43 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 44 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 45 | # standard deviation of prices 46 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 47 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 48 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 49 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 50 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 51 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 52 | # standard deviation of volumes 53 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 54 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 55 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 56 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 57 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 58 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 59 | # # return 60 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 61 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 62 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 63 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 64 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 65 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 66 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 67 | # the target 68 | df_new['close'] = df['Close'] 69 | df_new = df_new.dropna(axis=0) 70 | return df_new 71 | 72 | 73 | 74 | data_raw = pd.read_csv('19880101_20161231.csv', index_col='Date') 75 | data = generate_features(data_raw) 76 | print(data.round(decimals=3).head(5)) 77 | -------------------------------------------------------------------------------- /Chapter09/linear_regression.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | import numpy as np 7 | 8 | 9 | def compute_prediction(X, weights): 10 | """ Compute the prediction y_hat based on current weights 11 | Args: 12 | X (numpy.ndarray) 13 | weights (numpy.ndarray) 14 | Returns: 15 | numpy.ndarray, y_hat of X under weights 16 | """ 17 | predictions = np.dot(X, weights) 18 | return predictions 
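# The weight update in update_weights_gd below is batch gradient descent on the
# mean squared error J(w) = (1 / (2m)) * sum((Xw - y)^2), whose gradient is
# (1 / m) * X^T (y_hat - y); the step is therefore
#     w := w + (learning_rate / m) * X^T (y - y_hat)
# A quick sanity check of compute_prediction with made-up values (not from the book):
#     compute_prediction(np.array([[1., 2.]]), np.array([0.5, 0.5]))  # -> array([1.5])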
19 | 20 | 21 | def update_weights_gd(X_train, y_train, weights, learning_rate): 22 | """ Update weights by one step 23 | Args: 24 | X_train, y_train (numpy.ndarray, training data set) 25 | weights (numpy.ndarray) 26 | learning_rate (float) 27 | Returns: 28 | numpy.ndarray, updated weights 29 | """ 30 | predictions = compute_prediction(X_train, weights) 31 | weights_delta = np.dot(X_train.T, y_train - predictions) 32 | m = y_train.shape[0] 33 | weights += learning_rate / float(m) * weights_delta 34 | return weights 35 | 36 | 37 | def compute_cost(X, y, weights): 38 | """ Compute the cost J(w) 39 | Args: 40 | X, y (numpy.ndarray, data set) 41 | weights (numpy.ndarray) 42 | Returns: 43 | float 44 | """ 45 | predictions = compute_prediction(X, weights) 46 | cost = np.mean((predictions - y) ** 2 / 2.0) 47 | return cost 48 | 49 | 50 | def train_linear_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 51 | """ Train a linear regression model with gradient descent 52 | Args: 53 | X_train, y_train (numpy.ndarray, training data set) 54 | max_iter (int, number of iterations) 55 | learning_rate (float) 56 | fit_intercept (bool, with an intercept w0 or not) 57 | Returns: 58 | numpy.ndarray, learned weights 59 | """ 60 | if fit_intercept: 61 | intercept = np.ones((X_train.shape[0], 1)) 62 | X_train = np.hstack((intercept, X_train)) 63 | weights = np.zeros(X_train.shape[1]) 64 | for iteration in range(max_iter): 65 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 66 | # Check the cost for every 100 (for example) iterations 67 | if iteration % 100 == 0: 68 | print(compute_cost(X_train, y_train, weights)) 69 | return weights 70 | 71 | 72 | def predict(X, weights): 73 | if X.shape[1] == weights.shape[0] - 1: 74 | intercept = np.ones((X.shape[0], 1)) 75 | X = np.hstack((intercept, X)) 76 | return compute_prediction(X, weights) 77 | 78 | 79 | # A small example 80 | X_train = np.array([[6], [2], [3], [4], [1], [5], [2], [6], [4], [7]]) 81 | 82 | y_train = np.array([5.5, 1.6, 2.2, 3.7, 0.8, 5.2, 1.5, 5.3, 4.4, 6.8]) 83 | 84 | weights = train_linear_regression(X_train, y_train, max_iter=100, learning_rate=0.01, fit_intercept=True) 85 | 86 | X_test = np.array([[1.3], [3.5], [5.2], [2.8]]) 87 | 88 | predictions = predict(X_test, weights) 89 | 90 | import matplotlib.pyplot as plt 91 | plt.scatter(X_train[:, 0], y_train, marker='o', c='b') 92 | plt.scatter(X_test[:, 0], predictions, marker='*', c='k') 93 | plt.xlabel('x') 94 | plt.ylabel('y') 95 | plt.show() 96 | 97 | 98 | # The diabetes example 99 | from sklearn import datasets 100 | diabetes = datasets.load_diabetes() 101 | print(diabetes.data.shape) 102 | 103 | num_test = 30 104 | X_train = diabetes.data[:-num_test, :] 105 | y_train = diabetes.target[:-num_test] 106 | 107 | weights = train_linear_regression(X_train, y_train, max_iter=5000, learning_rate=1, fit_intercept=True) 108 | 109 | X_test = diabetes.data[-num_test:, :] 110 | y_test = diabetes.target[-num_test:] 111 | 112 | predictions = predict(X_test, weights) 113 | 114 | print(predictions) 115 | print(y_test) 116 | 117 | 118 | 119 | # Directly use SGDRegressor from scikit-learn 120 | from sklearn.linear_model import SGDRegressor 121 | regressor = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, learning_rate='constant', eta0=0.01, n_iter=1000) 122 | regressor.fit(X_train, y_train) 123 | predictions = regressor.predict(X_test) 124 | print(predictions) 125 | 126 | 127 | 128 | # TensorFlow implementation of linear regression 129 | 130 | import 
tensorflow as tf 131 | n_features = int(X_train.shape[1]) 132 | learning_rate = 0.5 133 | n_iter = 1000 134 | 135 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 136 | y = tf.placeholder(tf.float32, shape=[None]) 137 | W = tf.Variable(tf.ones([n_features, 1])) 138 | b = tf.Variable(tf.zeros([1])) 139 | 140 | pred = tf.add(tf.matmul(x, W), b)[:, 0] 141 | 142 | 143 | cost = tf.losses.mean_squared_error(labels=y, predictions=pred) 144 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 145 | 146 | init_vars = tf.initialize_all_variables() 147 | sess = tf.Session() 148 | sess.run(init_vars) 149 | 150 | for i in range(1, n_iter+1): 151 | _, c = sess.run([optimizer, cost], feed_dict={x: X_train, y: y_train}) 152 | if i % 100 == 0: 153 | print('Iteration %i, training loss: %f' % (i, c)) 154 | 155 | predictions = sess.run(pred, feed_dict={x: X_test}) 156 | print(predictions) 157 | 158 | -------------------------------------------------------------------------------- /Chapter09/neural_network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | import numpy as np 7 | 8 | 9 | def sigmoid(z): 10 | return 1.0 / (1 + np.exp(-z)) 11 | 12 | 13 | def sigmoid_derivative(z): 14 | return sigmoid(z) * (1.0 - sigmoid(z)) 15 | 16 | 17 | 18 | def train(X, y, n_hidden, learning_rate, n_iter): 19 | m, n_input = X.shape 20 | W1 = np.random.randn(n_input, n_hidden) 21 | b1 = np.zeros((1, n_hidden)) 22 | W2 = np.random.randn(n_hidden, 1) 23 | b2 = np.zeros((1, 1)) 24 | for i in range(1, n_iter+1): 25 | Z2 = np.matmul(X, W1) + b1 26 | A2 = sigmoid(Z2) 27 | Z3 = np.matmul(A2, W2) + b2 28 | A3 = Z3 29 | 30 | dZ3 = A3 - y 31 | dW2 = np.matmul(A2.T, dZ3) 32 | db2 = np.sum(dZ3, axis=0, keepdims=True) 33 | 34 | dZ2 = np.matmul(dZ3, W2.T) * sigmoid_derivative(Z2) 35 | dW1 = np.matmul(X.T, dZ2) 36 | db1 = np.sum(dZ2, axis=0) 37 | 38 | W2 = W2 - learning_rate * dW2 / m 39 | b2 = b2 - learning_rate * db2 / m 40 | W1 = W1 - learning_rate * dW1 / m 41 | b1 = b1 - learning_rate * db1 / m 42 | 43 | if i % 100 == 0: 44 | cost = np.mean((y - A3) ** 2) 45 | print('Iteration %i, training loss: %f' % (i, cost)) 46 | 47 | model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2} 48 | return model 49 | 50 | 51 | def predict(x, model): 52 | W1 = model['W1'] 53 | b1 = model['b1'] 54 | W2 = model['W2'] 55 | b2 = model['b2'] 56 | A2 = sigmoid(np.matmul(x, W1) + b1) 57 | A3 = np.matmul(A2, W2) + b2 58 | return A3 59 | 60 | 61 | 62 | from sklearn import datasets 63 | boston = datasets.load_boston() 64 | num_test = 10 # the last 10 samples as testing set 65 | 66 | from sklearn import preprocessing 67 | scaler = preprocessing.StandardScaler() 68 | 69 | X_train = boston.data[:-num_test, :] 70 | X_train = scaler.fit_transform(X_train) 71 | y_train = boston.target[:-num_test].reshape(-1, 1) 72 | X_test = boston.data[-num_test:, :] 73 | X_test = scaler.transform(X_test) 74 | y_test = boston.target[-num_test:] 75 | 76 | 77 | n_hidden = 20 78 | learning_rate = 0.1 79 | n_iter = 2000 80 | 81 | model = train(X_train, y_train, n_hidden, learning_rate, n_iter) 82 | predictions = predict(X_test, model) 83 | print(predictions) 84 | print(y_test) 85 | 86 | 87 | 88 | 89 | # Scikit-learn implementation of neural network 90 | 91 | from sklearn.neural_network import MLPRegressor 92 | nn_scikit = 
MLPRegressor(hidden_layer_sizes=(20, 8), activation='logistic', solver='lbfgs', 93 | learning_rate_init=0.1, random_state=42, max_iter=2000) 94 | nn_scikit.fit(X_train, y_train) 95 | predictions = nn_scikit.predict(X_test) 96 | print(predictions) 97 | print(np.mean((y_test - predictions) ** 2)) 98 | 99 | 100 | # TensorFlow implementation of neural network 101 | 102 | import tensorflow as tf 103 | n_features = int(X_train.shape[1]) 104 | n_hidden_1 = 20 105 | n_hidden_2 = 8 106 | 107 | learning_rate = 0.1 108 | n_iter = 2000 109 | 110 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 111 | y = tf.placeholder(tf.float32, shape=[None, 1]) 112 | 113 | layer_1 = tf.nn.sigmoid(tf.layers.dense(x, n_hidden_1)) 114 | layer_2 = tf.nn.sigmoid(tf.layers.dense(layer_1, n_hidden_2)) 115 | pred = tf.layers.dense(layer_2, 1) 116 | 117 | 118 | cost = tf.losses.mean_squared_error(labels=y, predictions=pred) 119 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 120 | 121 | init_vars = tf.initialize_all_variables() 122 | sess = tf.Session() 123 | sess.run(init_vars) 124 | 125 | for i in range(1, n_iter+1): 126 | _, c = sess.run([optimizer, cost], feed_dict={x: X_train, y: y_train}) 127 | if i % 100 == 0: 128 | print('Iteration %i, training loss: %f' % (i, c)) 129 | 130 | predictions = sess.run(pred, feed_dict={x: X_test}) 131 | print(predictions) 132 | 133 | 134 | 135 | 136 | # Keras implementation of neural network 137 | 138 | 139 | from keras import models 140 | from keras import layers 141 | 142 | model = models.Sequential() 143 | model.add(layers.Dense(n_hidden_1, activation="sigmoid", input_shape=(n_features, ))) 144 | model.add(layers.Dense(n_hidden_2, activation="sigmoid")) 145 | model.add(layers.Dense(1)) 146 | 147 | 148 | from keras import optimizers 149 | sgd = optimizers.SGD(lr=0.01) 150 | model.compile(loss='mean_squared_error', optimizer=sgd) 151 | 152 | 153 | model.fit( 154 | X_train, y_train, 155 | epochs=100, 156 | validation_data=(X_test, y_test) 157 | ) 158 | 159 | predictions = model.predict(X_test) 160 | print(predictions) 161 | -------------------------------------------------------------------------------- /Chapter09/regression_evaluation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn import datasets 8 | diabetes = datasets.load_diabetes() 9 | num_test = 30 # the last 30 samples as testing set 10 | X_train = diabetes.data[:-num_test, :] 11 | y_train = diabetes.target[:-num_test] 12 | X_test = diabetes.data[-num_test:, :] 13 | y_test = diabetes.target[-num_test:] 14 | param_grid = { 15 | "alpha": [1e-07, 1e-06, 1e-05], 16 | "penalty": [None, "l2"], 17 | "eta0": [0.001, 0.005, 0.01], 18 | "n_iter": [300, 1000, 3000] 19 | } 20 | 21 | from sklearn.linear_model import SGDRegressor 22 | from sklearn.model_selection import GridSearchCV 23 | regressor = SGDRegressor(loss='squared_loss', 24 | learning_rate='constant') 25 | grid_search = GridSearchCV(regressor, param_grid, cv=3) 26 | 27 | grid_search.fit(X_train, y_train) 28 | print(grid_search.best_params_) 29 | 30 | regressor_best = grid_search.best_estimator_ 31 | 32 | 33 | predictions = regressor_best.predict(X_test) 34 | 35 | 36 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 37 | print(mean_squared_error(y_test, 
predictions)) 38 | 39 | print(mean_absolute_error(y_test, predictions)) 40 | 41 | print(r2_score(y_test, predictions)) 42 | 43 | -------------------------------------------------------------------------------- /Chapter09/stock_prediction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import pandas as pd 8 | from sklearn.model_selection import GridSearchCV 9 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | 13 | def generate_features(df): 14 | """ 15 | Generate features for a stock/index based on historical price and performance 16 | @param df: dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adjusted Close" 17 | @return: dataframe, data set with new features 18 | """ 19 | df_new = pd.DataFrame() 20 | # 6 original features 21 | df_new['open'] = df['Open'] 22 | df_new['open_1'] = df['Open'].shift(1) 23 | df_new['close_1'] = df['Close'].shift(1) 24 | df_new['high_1'] = df['High'].shift(1) 25 | df_new['low_1'] = df['Low'].shift(1) 26 | df_new['volume_1'] = df['Volume'].shift(1) 27 | # 31 generated features 28 | # average price 29 | df_new['avg_price_5'] = df['Close'].rolling(5).mean().shift(1) 30 | df_new['avg_price_30'] = df['Close'].rolling(21).mean().shift(1) 31 | df_new['avg_price_365'] = df['Close'].rolling(252).mean().shift(1) 32 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 33 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 34 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 35 | # average volume 36 | df_new['avg_volume_5'] = df['Volume'].rolling(5).mean().shift(1) 37 | df_new['avg_volume_30'] = df['Volume'].rolling(21).mean().shift(1) 38 | df_new['avg_volume_365'] = df['Volume'].rolling(252).mean().shift(1) 39 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 40 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 41 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 42 | # standard deviation of prices 43 | df_new['std_price_5'] = df['Close'].rolling(5).std().shift(1) 44 | df_new['std_price_30'] = df['Close'].rolling(21).std().shift(1) 45 | df_new['std_price_365'] = df['Close'].rolling(252).std().shift(1) 46 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 47 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 48 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 49 | # standard deviation of volumes 50 | df_new['std_volume_5'] = df['Volume'].rolling(5).std().shift(1) 51 | df_new['std_volume_30'] = df['Volume'].rolling(21).std().shift(1) 52 | df_new['std_volume_365'] = df['Volume'].rolling(252).std().shift(1) 53 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 54 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 55 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 56 | # # return 57 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 58 | df_new['return_5'] = ((df['Close'] - 
df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 59 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 60 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 61 | df_new['moving_avg_5'] = df_new['return_1'].rolling(5).mean().shift(1) 62 | df_new['moving_avg_30'] = df_new['return_1'].rolling(21).mean().shift(1) 63 | df_new['moving_avg_365'] = df_new['return_1'].rolling(252).mean().shift(1) 64 | # the target 65 | df_new['close'] = df['Close'] 66 | df_new = df_new.dropna(axis=0) 67 | return df_new 68 | 69 | 70 | data_raw = pd.read_csv('19880101_20161231.csv', index_col='Date') 71 | data = generate_features(data_raw) 72 | 73 | start_train = '1988-01-01' 74 | end_train = '2015-12-31' 75 | 76 | start_test = '2016-01-01' 77 | end_test = '2016-12-31' 78 | 79 | data_train = data.loc[start_train:end_train] 80 | X_train = data_train.drop('close', axis=1).values 81 | y_train = data_train['close'].values 82 | 83 | print(X_train.shape) 84 | print(y_train.shape) 85 | 86 | data_test = data.loc[start_test:end_test] 87 | X_test = data_test.drop('close', axis=1).values 88 | y_test = data_test['close'].values 89 | 90 | print(X_test.shape) 91 | 92 | 93 | # First experiment with linear regression 94 | 95 | scaler = StandardScaler() 96 | 97 | X_scaled_train = scaler.fit_transform(X_train) 98 | X_scaled_test = scaler.transform(X_test) 99 | 100 | param_grid = { 101 | "alpha": [1e-5, 3e-5, 1e-4], 102 | "eta0": [0.01, 0.03, 0.1], 103 | } 104 | 105 | 106 | from sklearn.linear_model import SGDRegressor 107 | lr = SGDRegressor(penalty='l2', n_iter=1000) 108 | grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='r2') 109 | grid_search.fit(X_scaled_train, y_train) 110 | 111 | print(grid_search.best_params_) 112 | 113 | lr_best = grid_search.best_estimator_ 114 | 115 | predictions_lr = lr_best.predict(X_scaled_test) 116 | 117 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_lr))) 118 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_lr))) 119 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_lr))) 120 | 121 | 122 | # Experiment with random forest 123 | 124 | param_grid = { 125 | 'max_depth': [50, 70, 80], 126 | 'min_samples_split': [5, 10], 127 | 'max_features': ['auto', 'sqrt'], 128 | 'min_samples_leaf': [3, 5] 129 | 130 | } 131 | 132 | 133 | from sklearn.ensemble import RandomForestRegressor 134 | 135 | rf = RandomForestRegressor(n_estimators=100, n_jobs=-1) 136 | grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1) 137 | grid_search.fit(X_train, y_train) 138 | 139 | print(grid_search.best_params_) 140 | rf_best = grid_search.best_estimator_ 141 | 142 | predictions_rf = rf_best.predict(X_test) 143 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_rf))) 144 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_rf))) 145 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_rf))) 146 | 147 | 148 | # Experiment with SVR 149 | 150 | param_grid = [ 151 | {'kernel': ['linear'], 'C': [100, 300, 500], 'epsilon': [0.00003, 0.0001]}, 152 | {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [10, 100, 1000], 'epsilon': [0.00003, 0.0001]} 153 | ] 154 | 155 | 156 | from sklearn.svm import SVR 157 | 158 | svr = SVR() 159 | grid_search = GridSearchCV(svr, param_grid, cv=2, scoring='r2') 160 | grid_search.fit(X_scaled_train, y_train) 161 | 162 | print(grid_search.best_params_) 163 | 164 | svr_best = grid_search.best_estimator_ 165 
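# Note: the SVR, like the SGD-based linear model above, is fit on the standardized features
# (X_scaled_train / X_scaled_test) because kernel methods are sensitive to feature scale,
# whereas the tree-based random forest is fit on the raw features; cv=2 (rather than 5)
# likely just keeps the grid search affordable given the slower SVR fits.
# Optionally, the cross-validated R^2 of the winning SVR configuration can be reported
# before scoring on the 2016 hold-out year:
print('Best CV R^2: {0:.3f}'.format(grid_search.best_score_))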
| 166 | predictions_svr = svr_best.predict(X_scaled_test) 167 | 168 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_svr))) 169 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_svr))) 170 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_svr))) 171 | 172 | 173 | # Experiment with neural network 174 | 175 | param_grid = { 176 | 'hidden_layer_sizes': [(50, 10), (30, 30)], 177 | 'activation': ['logistic', 'tanh', 'relu'], 178 | 'solver': ['sgd', 'adam'], 179 | 'learning_rate_init': [0.0001, 0.0003, 0.001, 0.01], 180 | 'alpha': [0.00003, 0.0001, 0.0003], 181 | 'batch_size': [30, 50] 182 | } 183 | 184 | 185 | from sklearn.neural_network import MLPRegressor 186 | 187 | nn = MLPRegressor(random_state=42, max_iter=2000) 188 | grid_search = GridSearchCV(nn, param_grid, cv=2, scoring='r2', n_jobs=-1) 189 | grid_search.fit(X_scaled_train, y_train) 190 | 191 | 192 | print(grid_search.best_params_) 193 | 194 | nn_best = grid_search.best_estimator_ 195 | 196 | predictions_nn = nn_best.predict(X_scaled_test) 197 | 198 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_nn))) 199 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_nn))) 200 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_nn))) 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /Chapter09/svr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 9: Stock Price Prediction with Regression Algorithms 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn import datasets 8 | boston = datasets.load_boston() 9 | 10 | num_test = 10 # the last 10 samples as testing set 11 | X_train = boston.data[:-num_test, :] 12 | y_train = boston.target[:-num_test] 13 | X_test = boston.data[-num_test:, :] 14 | y_test = boston.target[-num_test:] 15 | 16 | from sklearn.svm import SVR 17 | regressor = SVR(C=0.1, epsilon=0.02, kernel='linear') 18 | 19 | regressor.fit(X_train, y_train) 20 | predictions = regressor.predict(X_test) 21 | print(predictions) 22 | -------------------------------------------------------------------------------- /Chapter10/dimensionality_reduction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | 9 | from sklearn.datasets import load_digits 10 | dataset = load_digits() 11 | X, y = dataset.data, dataset.target 12 | 13 | from sklearn.svm import SVC 14 | from sklearn.model_selection import cross_val_score 15 | 16 | 17 | 18 | from sklearn.decomposition import PCA 19 | 20 | # Keep different number of top components 21 | N = [10, 15, 25, 35, 45] 22 | for n in N: 23 | pca = PCA(n_components=n) 24 | X_n_kept = pca.fit_transform(X) 25 | # Estimate accuracy on the data set with top n components 26 | classifier = SVC(gamma=0.005) 27 | score_n_components = cross_val_score(classifier, X_n_kept, y).mean() 28 | print('Score with the data set of top {0} components: {1:.2f}'.format(n, score_n_components)) -------------------------------------------------------------------------------- /Chapter10/feature_selection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning 
By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | import numpy as np 9 | from sklearn.datasets import load_digits 10 | dataset = load_digits() 11 | X, y = dataset.data, dataset.target 12 | print(X.shape) 13 | 14 | # Estimate accuracy on the original data set 15 | from sklearn.svm import SVC 16 | from sklearn.model_selection import cross_val_score 17 | classifier = SVC(gamma=0.005) 18 | score = cross_val_score(classifier, X, y).mean() 19 | print('Score with the original data set: {0:.2f}'.format(score)) 20 | 21 | 22 | # Feature selection with random forest 23 | from sklearn.ensemble import RandomForestClassifier 24 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1) 25 | random_forest.fit(X, y) 26 | 27 | # Sort features based on their importances 28 | feature_sorted = np.argsort(random_forest.feature_importances_) 29 | 30 | # Select different number of top features 31 | K = [10, 15, 25, 35, 45] 32 | for k in K: 33 | top_K_features = feature_sorted[-k:] 34 | X_k_selected = X[:, top_K_features] 35 | # Estimate accuracy on the data set with k selected features 36 | classifier = SVC(gamma=0.005) 37 | score_k_features = cross_val_score(classifier, X_k_selected, y).mean() 38 | print('Score with the data set of top {0} features: {1:.2f}'.format(k, score_k_features)) 39 | 40 | -------------------------------------------------------------------------------- /Chapter10/generic_feature_engineering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn.preprocessing import Binarizer 8 | 9 | X = [[4], [1], [3], [0]] 10 | binarizer = Binarizer(threshold=2.9) 11 | X_new = binarizer.fit_transform(X) 12 | print(X_new) 13 | 14 | 15 | 16 | 17 | from sklearn.preprocessing import PolynomialFeatures 18 | 19 | X = [[2, 4], 20 | [1, 3], 21 | [3, 2], 22 | [0, 3]] 23 | poly = PolynomialFeatures(degree=2) 24 | X_new = poly.fit_transform(X) 25 | print(X_new) 26 | -------------------------------------------------------------------------------- /Chapter10/imputation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | 8 | import numpy as np 9 | from sklearn.preprocessing import Imputer 10 | 11 | 12 | data_origin = [[30, 100], 13 | [20, 50], 14 | [35, np.nan], 15 | [25, 80], 16 | [30, 70], 17 | [40, 60]] 18 | 19 | 20 | imp_mean = Imputer(missing_values='NaN', strategy='mean') 21 | imp_mean.fit(data_origin) 22 | data_mean_imp = imp_mean.transform(data_origin) 23 | print(data_mean_imp) 24 | 25 | 26 | imp_median = Imputer(missing_values='NaN', strategy='median') 27 | imp_median.fit(data_origin) 28 | data_median_imp = imp_median.transform(data_origin) 29 | print(data_median_imp) 30 | 31 | # New samples 32 | new = [[20, np.nan], 33 | [30, np.nan], 34 | [np.nan, 70], 35 | [np.nan, np.nan]] 36 | new_mean_imp = imp_mean.transform(new) 37 | print(new_mean_imp) 38 | 39 | 40 | 41 | # Effects of discarding missing values and imputation 42 | from sklearn import datasets 43 | dataset = datasets.load_diabetes() 44 | X_full, y = dataset.data, dataset.target 45 | 46 | 
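# The experiment below simulates incomplete data: roughly 25% of the samples have one
# randomly chosen feature masked with np.nan. Three strategies are then compared using
# the cross-validated R^2 of a random forest regressor: dropping the affected samples,
# imputing with the feature mean, and (as a reference) the untouched full data set.
# Note that in scikit-learn 0.22+ the Imputer class used in this script has moved;
# sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean') is the drop-in
# equivalent of Imputer(missing_values='NaN', strategy='mean').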
47 | 48 | m, n = X_full.shape 49 | m_missing = int(m * 0.25) 50 | print(m, m_missing) 51 | 52 | 53 | np.random.seed(42) 54 | missing_samples = np.array([True] * m_missing + [False] * (m - m_missing)) 55 | np.random.shuffle(missing_samples) 56 | 57 | 58 | missing_features = np.random.randint(low=0, high=n, size=m_missing) 59 | 60 | X_missing = X_full.copy() 61 | X_missing[np.where(missing_samples)[0], missing_features] = np.nan 62 | 63 | 64 | # Discard samples containing missing values 65 | X_rm_missing = X_missing[~missing_samples, :] 66 | y_rm_missing = y[~missing_samples] 67 | 68 | # Estimate R^2 on the data set with missing samples removed 69 | from sklearn.ensemble import RandomForestRegressor 70 | from sklearn.model_selection import cross_val_score 71 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 72 | score_rm_missing = cross_val_score(regressor, X_rm_missing, y_rm_missing).mean() 73 | print('Score with the data set with missing samples removed: {0:.2f}'.format(score_rm_missing)) 74 | 75 | 76 | # Imputation with mean value 77 | imp_mean = Imputer(missing_values='NaN', strategy='mean') 78 | X_mean_imp = imp_mean.fit_transform(X_missing) 79 | # Estimate R^2 on the data set with missing samples removed 80 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 81 | score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean() 82 | print('Score with the data set with missing values replaced by mean: {0:.2f}'.format(score_mean_imp)) 83 | 84 | 85 | # Estimate R^2 on the full data set 86 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=500) 87 | score_full = cross_val_score(regressor, X_full, y).mean() 88 | print('Score with the full data set: {0:.2f}'.format(score_full)) 89 | 90 | 91 | -------------------------------------------------------------------------------- /Chapter10/save_reuse_model_tf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import tensorflow as tf 8 | 9 | 10 | from sklearn import datasets 11 | cancer_data = datasets.load_breast_cancer() 12 | X = cancer_data.data 13 | Y = cancer_data.target 14 | 15 | n_features = int(X.shape[1]) 16 | learning_rate = 0.005 17 | n_iter = 200 18 | 19 | 20 | # Input and Target placeholders 21 | x = tf.placeholder(tf.float32, shape=[None, n_features]) 22 | y = tf.placeholder(tf.float32, shape=[None]) 23 | 24 | # Build the logistic regression model 25 | W = tf.Variable(tf.zeros([n_features, 1]), name='W') 26 | b = tf.Variable(tf.zeros([1]), name='b') 27 | 28 | logits = tf.add(tf.matmul(x, W), b)[:, 0] 29 | cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)) 30 | 31 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) 32 | 33 | sess = tf.Session() 34 | 35 | sess.run(tf.global_variables_initializer()) 36 | 37 | for i in range(1, n_iter+1): 38 | _, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y}) 39 | if i % 10 == 0: 40 | print('Iteration %i, training loss: %f' % (i, c)) 41 | 42 | # Save the trained model 43 | # create saver object 44 | saver = tf.train.Saver() 45 | 46 | file_path = './model_tf' 47 | saved_path = saver.save(sess, file_path) 48 | print('model saved in path: {}'.format(saved_path)) 49 | 50 | 51 | tf.reset_default_graph() 52 | 53 | # Load the graph from 
the file 54 | imported_graph = tf.train.import_meta_graph(file_path+'.meta') 55 | 56 | 57 | 58 | with tf.Session() as sess: 59 | # restore the saved model 60 | imported_graph.restore(sess, file_path) 61 | # print the loaded weights 62 | W_loaded, b_loaded = sess.run(['W:0','b:0']) 63 | print('Saved W = ', W_loaded) 64 | print('Saved b = ', b_loaded) 65 | 66 | -------------------------------------------------------------------------------- /Chapter10/save_reuse_monitor_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | from sklearn import datasets 8 | dataset = datasets.load_diabetes() 9 | X, y = dataset.data, dataset.target 10 | 11 | num_new = 30 # the last 30 samples as new data set 12 | X_train = X[:-num_new, :] 13 | y_train = y[:-num_new] 14 | X_new = X[-num_new:, :] 15 | y_new = y[-num_new:] 16 | 17 | 18 | # Data pre-processing 19 | from sklearn.preprocessing import StandardScaler 20 | scaler = StandardScaler() 21 | scaler.fit(X_train) 22 | 23 | import pickle 24 | # Save the scaler 25 | pickle.dump(scaler, open("scaler.p", "wb" )) 26 | 27 | X_scaled_train = scaler.transform(X_train) 28 | 29 | 30 | # Regression model training 31 | from sklearn.svm import SVR 32 | regressor = SVR(C=20) 33 | regressor.fit(X_scaled_train, y_train) 34 | # Save the regressor 35 | pickle.dump(regressor, open("regressor.p", "wb")) 36 | 37 | 38 | # Deployment 39 | my_scaler = pickle.load(open("scaler.p", "rb" )) 40 | my_regressor = pickle.load(open("regressor.p", "rb")) 41 | 42 | X_scaled_new = my_scaler.transform(X_new) 43 | predictions = my_regressor.predict(X_scaled_new) 44 | 45 | 46 | # Monitor 47 | from sklearn.metrics import r2_score 48 | print('Health check on the model, R^2: {0:.3f}'.format(r2_score(y_new, predictions))) 49 | -------------------------------------------------------------------------------- /Chapter10/word_embedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing) 3 | Chapter 10: Machine Learning Best Practices 4 | Author: Yuxi (Hayden) Liu 5 | ''' 6 | 7 | import gensim.downloader as api 8 | 9 | model = api.load("glove-twitter-25") 10 | 11 | 12 | vector = model.wv['computer'] 13 | print('Word computer is embedded into:\n', vector) 14 | 15 | similar_words = model.most_similar("computer") 16 | print('Top ten words most contextually relevant to computer:\n', similar_words) 17 | 18 | 19 | 20 | doc_sample = ['i', 'love', 'reading', 'python', 'machine', 'learning', 'by', 'example'] 21 | 22 | import numpy as np 23 | doc_vector = np.mean([model.wv[word] for word in doc_sample], axis=0) 24 | print('The document sample is embedded into:\n', doc_vector) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to 
whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Machine Learning By Example Second Edition 2 | **Implement machine learning algorithms and techniques to build intelligent systems** 3 | 4 | 5 | 6 | 7 | This is the code repository for my book [Python Machine Learning By Example Second Edition](https://www.amazon.com/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22), published by [Packt](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example-second-edition), in March 2019. It contains all the supporting project files necessary to work through the book from start to finish. 8 | 9 | 10 | 11 | ## What is this book about? 12 | The surge in interest in machine learning (ML) is due to the fact that it revolutionizes automation by learning patterns in data and using them to make predictions and decisions. If you’re interested in ML, this book will serve as your entry point to ML. 13 | 14 | Python Machine Learning By Example begins with an introduction to important ML concepts and implementations using Python libraries. Each chapter of the book walks you through an industry adopted application. You’ll implement ML techniques in areas such as exploratory data analysis, feature engineering, and natural language processing (NLP) in a clear and easy-to-follow way. 15 | 16 | With the help of this extended and updated edition, you’ll understand how to tackle data-driven problems and implement your solutions with the powerful yet simple Python language and popular Python packages and tools such as TensorFlow, scikit-learn, gensim, and Keras. To aid your understanding of popular ML algorithms, the book covers interesting and easy-to-follow examples such as news topic modeling and classification, spam email detection, stock price forecasting, and more. 17 | 18 | By the end of the book, you’ll have put together a broad picture of the ML ecosystem and will be well-versed with the best practices of applying ML techniques to make the most out of new opportunities. 
19 | 20 | This book covers the following exciting features: 21 | * Exploit the power of Python to explore the world of data mining and data analytics 22 | * Discover machine learning algorithms to solve complex challenges faced by data scientists today 23 | * Use Python libraries such as TensorFlow and Keras to create smart cognitive actions for your projects 24 | Table of contents: 25 | ``` 26 | Chapter 1: Getting Started with Machine Learning and Python 27 | Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques 28 | Chapter 3: Mining the 20 Newsgroups Dataset with Clustering and Topic Modeling Algorithms 29 | Chapter 4: Detecting Spam Email with Naive Bayes 30 | Chapter 5: Classifying News Topic with Support Vector Machine 31 | Chapter 6: Predicting Online Ads Click-through with Tree-Based Algorithms 32 | Chapter 7: Predicting Online Ads Click-through with Logistic Regression 33 | Chapter 8: Scaling Up Prediction to Terabyte Click Logs 34 | Chapter 9: Stock Price Prediction with Regression Algorithms 35 | Chapter 10: Machine Learning Best Practices 36 | ``` 37 | 38 | ## Get to Know the Author 39 | **Yuxi (Hayden) Liu** 40 | is the author of a series of machine learning books and an education enthusiast. His first book, the first edition of Python Machine Learning By Example, was a #1 bestseller on Amazon India in 2017 and 2018. His other books include R Deep Learning Projects and Hands-On Deep Learning Architectures with Python, published by Packt. 41 | 42 | He is an experienced data scientist who's focused on developing machine learning and deep learning models and systems. He has worked in a variety of data-driven domains and has applied his machine learning expertise to computational advertising, recommendation, and network anomaly detection. He published five first-authored IEEE transactions and conference papers during his master's research at the University of Toronto. 43 | 44 | ## Get the Book 45 | * [Packt](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example-second-edition) 46 | * [Amazon.com](https://www.amazon.com/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22) 47 | * [Amazon.in](https://www.amazon.in/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22) 48 | * [Amazon.co.uk](https://www.amazon.co.uk/Python-Machine-Learning-Example-Second-Edition-Hayden/dp/1789616727?utm_source=github&utm_medium=repository&utm_campaign=%22) 49 | 50 | 51 | 52 | ## My Other Books 53 | * [Python Machine Learning By Example](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example) 54 | * [Hands-On Deep Learning Architectures with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-deep-learning-architectures-python) 55 | * [Step-by-Step Machine Learning with Python](https://www.packtpub.com/big-data-and-business-intelligence/step-step-machine-learning-python-video) 56 | * [R Deep Learning Projects](https://www.packtpub.com/big-data-and-business-intelligence/r-deep-learning-projects) 57 | 58 | --------------------------------------------------------------------------------