├── .gitattributes ├── .gitignore ├── Chapter02 ├── .ropeproject │ ├── config.py │ ├── globalnames │ ├── history │ └── objectdb ├── 0_getting.py ├── 1viz_words.py ├── 2clean_words.py ├── 3post_clustering.py └── 4topic_model.py ├── Chapter03 └── email_spam.py ├── Chapter04 ├── 1email_spam_tfidf_submit.py ├── 2topic_categorization.py ├── 3plot_rbf_kernels.py ├── 4ctg.py └── CTG.xls ├── Chapter05 ├── 1decision_tree_submit.py └── 2avazu_ctr.py ├── Chapter06 ├── 1one_hot_encode.py ├── 2logistic_function.py ├── 3logistic_regression_from_scratch.py ├── 4random_forest_feature_selection.py └── 5scikit_logistic_regression.py ├── Chapter07 ├── 1stock_price_prediction.py ├── 2linear_regression.py ├── 3decision_tree_regression.py └── 4support_vector_regression.py ├── Chapter08 ├── 1imputation.py ├── 2feature_selection.py ├── 3dimensionality_reduction.py ├── 4generic_feature_engineering.py ├── 5save_reuse_monitor_model.py ├── regressor.p └── scaler.p ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter02/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | 3 | 4 | def set_prefs(prefs): 5 | """This function is called before opening the project""" 6 | 7 | # Specify which files and folders to ignore in the project. 8 | # Changes to ignored resources are not added to the history and 9 | # VCSs. Also they are not returned in `Project.get_files()`. 10 | # Note that ``?`` and ``*`` match all characters but slashes. 
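# For example, a single '?' stands in for one such character: 'mod?.pyc' would match 'mod1.pyc' but not 'mod12.pyc'.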
11 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 12 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 13 | # '.svn': matches 'pkg/.svn' and all of its children 14 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 15 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 16 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 17 | '.hg', '.svn', '_svn', '.git'] 18 | 19 | # Specifies which files should be considered python files. It is 20 | # useful when you have scripts inside your project. Only files 21 | # ending with ``.py`` are considered to be python files by 22 | # default. 23 | #prefs['python_files'] = ['*.py'] 24 | 25 | # Custom source folders: By default rope searches the project 26 | # for finding source folders (folders that should be searched 27 | # for finding modules). You can add paths to that list. Note 28 | # that rope guesses project source folders correctly most of the 29 | # time; use this if you have any problems. 30 | # The folders should be relative to project root and use '/' for 31 | # separating folders regardless of the platform rope is running on. 32 | # 'src/my_source_folder' for instance. 33 | #prefs.add('source_folders', 'src') 34 | 35 | # You can extend python path for looking up modules 36 | #prefs.add('python_path', '~/python/') 37 | 38 | # Should rope save object information or not. 39 | prefs['save_objectdb'] = True 40 | prefs['compress_objectdb'] = False 41 | 42 | # If `True`, rope analyzes each module when it is being saved. 43 | prefs['automatic_soa'] = True 44 | # The depth of calls to follow in static object analysis 45 | prefs['soa_followed_calls'] = 0 46 | 47 | # If `False` when running modules or unit tests "dynamic object 48 | # analysis" is turned off. This makes them much faster. 49 | prefs['perform_doa'] = True 50 | 51 | # Rope can check the validity of its object DB when running. 52 | prefs['validate_objectdb'] = True 53 | 54 | # How many undos to hold? 55 | prefs['max_history_items'] = 32 56 | 57 | # Shows whether to save history across sessions. 58 | prefs['save_history'] = True 59 | prefs['compress_history'] = False 60 | 61 | # Set the number spaces used for indenting. According to 62 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 63 | # unit-tests use 4 spaces it is more reliable, too. 64 | prefs['indent_size'] = 4 65 | 66 | # Builtin and c-extension modules that are allowed to be imported 67 | # and inspected by rope. 68 | prefs['extension_modules'] = [] 69 | 70 | # Add all standard c-extensions to extension_modules list. 71 | prefs['import_dynload_stdmods'] = True 72 | 73 | # If `True` modules with syntax errors are considered to be empty. 74 | # The default value is `False`; When `False` syntax errors raise 75 | # `rope.base.exceptions.ModuleSyntaxError` exception. 76 | prefs['ignore_syntax_errors'] = False 77 | 78 | # If `True`, rope ignores unresolvable imports. Otherwise, they 79 | # appear in the importing namespace. 80 | prefs['ignore_bad_imports'] = False 81 | 82 | 83 | def project_opened(project): 84 | """This function is called after opening the project""" 85 | # Do whatever you like here! 
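# For example (an illustrative, untested snippet; the exact rope API calls are assumptions),
# you could refresh the cached resource tree each time the project is opened:
#project.validate(project.root)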
86 | -------------------------------------------------------------------------------- /Chapter02/.ropeproject/globalnames: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Machine-Learning-By-Example/6ee2be561e511bd0a1c0b3d481ad3950ea3f1815/Chapter02/.ropeproject/globalnames -------------------------------------------------------------------------------- /Chapter02/.ropeproject/history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Machine-Learning-By-Example/6ee2be561e511bd0a1c0b3d481ad3950ea3f1815/Chapter02/.ropeproject/history -------------------------------------------------------------------------------- /Chapter02/.ropeproject/objectdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Machine-Learning-By-Example/6ee2be561e511bd0a1c0b3d481ad3950ea3f1815/Chapter02/.ropeproject/objectdb -------------------------------------------------------------------------------- /Chapter02/0_getting.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_20newsgroups 2 | groups = fetch_20newsgroups() 3 | 4 | groups.keys() 5 | groups['target_names'] 6 | groups.target 7 | import numpy as np 8 | np.unique(groups.target) 9 | 10 | groups.data[0] 11 | groups.target[0] 12 | 13 | groups.target_names[groups.target[0]] 14 | 15 | len(groups.data[0]) 16 | len(groups.data[1]) 17 | -------------------------------------------------------------------------------- /Chapter02/1viz_words.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from sklearn.datasets import fetch_20newsgroups 6 | 7 | cv = CountVectorizer(stop_words="english", max_features=500) 8 | groups = fetch_20newsgroups() 9 | transformed = cv.fit_transform(groups.data) 10 | print(cv.get_feature_names()) 11 | 12 | sns.distplot(np.log(transformed.toarray().sum(axis=0))) 13 | plt.xlabel('Log Count') 14 | plt.ylabel('Frequency') 15 | plt.title('Distribution Plot of 500 Word Counts') 16 | plt.show() 17 | -------------------------------------------------------------------------------- /Chapter02/2clean_words.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from sklearn.datasets import fetch_20newsgroups 3 | from nltk.corpus import names 4 | from nltk.stem import WordNetLemmatizer 5 | 6 | 7 | def letters_only(astr): 8 | for c in astr: 9 | if not c.isalpha(): 10 | return False 11 | 12 | return True 13 | 14 | cv = CountVectorizer(stop_words="english", max_features=500) 15 | groups = fetch_20newsgroups() 16 | cleaned = [] 17 | all_names = set(names.words()) 18 | lemmatizer = WordNetLemmatizer() 19 | 20 | for post in groups.data: 21 | cleaned.append(' '.join([lemmatizer.lemmatize(word.lower()) 22 | for word in post.split() 23 | if letters_only(word) 24 | and word not in all_names])) 25 | 26 | transformed = cv.fit_transform(cleaned) 27 | print(cv.get_feature_names()) -------------------------------------------------------------------------------- /Chapter02/3post_clustering.py: 
-------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from sklearn.datasets import fetch_20newsgroups 3 | from nltk.corpus import names 4 | from nltk.stem import WordNetLemmatizer 5 | from sklearn.cluster import KMeans 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def letters_only(astr): 10 | for c in astr: 11 | if not c.isalpha(): 12 | return False 13 | 14 | return True 15 | 16 | cv = CountVectorizer(stop_words="english", max_features=500) 17 | groups = fetch_20newsgroups() 18 | cleaned = [] 19 | all_names = set(names.words()) 20 | lemmatizer = WordNetLemmatizer() 21 | 22 | for post in groups.data: 23 | cleaned.append(' '.join([lemmatizer.lemmatize(word.lower()) 24 | for word in post.split() 25 | if letters_only(word) 26 | and word not in all_names])) 27 | 28 | transformed = cv.fit_transform(cleaned) 29 | km = KMeans(n_clusters=20) 30 | km.fit(transformed) 31 | labels = groups.target 32 | plt.scatter(labels, km.labels_) 33 | plt.xlabel('Newsgroup') 34 | plt.ylabel('Cluster') 35 | plt.show() 36 | -------------------------------------------------------------------------------- /Chapter02/4topic_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from sklearn.datasets import fetch_20newsgroups 3 | from nltk.corpus import names 4 | from nltk.stem import WordNetLemmatizer 5 | from sklearn.decomposition import NMF 6 | 7 | 8 | def letters_only(astr): 9 | for c in astr: 10 | if not c.isalpha(): 11 | return False 12 | 13 | return True 14 | 15 | cv = CountVectorizer(stop_words="english", max_features=500) 16 | groups = fetch_20newsgroups() 17 | cleaned = [] 18 | all_names = set(names.words()) 19 | lemmatizer = WordNetLemmatizer() 20 | 21 | for post in groups.data: 22 | cleaned.append(' '.join([lemmatizer.lemmatize(word.lower()) 23 | for word in post.split() 24 | if letters_only(word) 25 | and word not in all_names])) 26 | 27 | transformed = cv.fit_transform(cleaned) 28 | nmf = NMF(n_components=100, random_state=43).fit(transformed) 29 | 30 | for topic_idx, topic in enumerate(nmf.components_): 31 | label = '{}: '.format(topic_idx) 32 | print(label, " ".join([cv.get_feature_names()[i] 33 | for i in topic.argsort()[:-9:-1]])) 34 | -------------------------------------------------------------------------------- /Chapter03/email_spam.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from nltk.corpus import names 3 | from nltk.stem import WordNetLemmatizer 4 | 5 | import glob 6 | import os 7 | import numpy as np 8 | 9 | 10 | file_path = 'enron1/ham/0007.1999-12-14.farmer.ham.txt' 11 | with open(file_path, 'r') as infile: 12 | ham_sample = infile.read() 13 | print(ham_sample) 14 | 15 | file_path = 'enron1/spam/0058.2003-12-21.GP.spam.txt' 16 | with open(file_path, 'r') as infile: 17 | spam_sample = infile.read() 18 | print(spam_sample) 19 | 20 | cv = CountVectorizer(stop_words="english", max_features=500) 21 | 22 | emails, labels = [], [] 23 | 24 | file_path = 'enron1/spam/' 25 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 26 | with open(filename, 'r', encoding = "ISO-8859-1") as infile: 27 | emails.append(infile.read()) 28 | labels.append(1) 29 | 30 | file_path = 'enron1/ham/' 31 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 32 | with open(filename, 'r', encoding = 
"ISO-8859-1") as infile: 33 | emails.append(infile.read()) 34 | labels.append(0) 35 | 36 | 37 | 38 | def letters_only(astr): 39 | return astr.isalpha() 40 | 41 | 42 | all_names = set(names.words()) 43 | lemmatizer = WordNetLemmatizer() 44 | 45 | 46 | def clean_text(docs): 47 | cleaned_docs = [] 48 | for doc in docs: 49 | cleaned_docs.append(' '.join([lemmatizer.lemmatize(word.lower()) 50 | for word in doc.split() 51 | if letters_only(word) 52 | and word not in all_names])) 53 | return cleaned_docs 54 | 55 | 56 | cleaned_emails = clean_text(emails) 57 | term_docs = cv.fit_transform(cleaned_emails) 58 | print(term_docs [0]) 59 | 60 | feature_mapping = cv.vocabulary 61 | feature_names = cv.get_feature_names() 62 | 63 | def get_label_index(labels): 64 | from collections import defaultdict 65 | label_index = defaultdict(list) 66 | for index, label in enumerate(labels): 67 | label_index[label].append(index) 68 | return label_index 69 | 70 | 71 | def get_prior(label_index): 72 | """ Compute prior based on training samples 73 | Args: 74 | label_index (grouped sample indices by class) 75 | Returns: 76 | dictionary, with class label as key, corresponding prior as the value 77 | """ 78 | prior = {label: len(index) for label, index in label_index.items()} 79 | total_count = sum(prior.values()) 80 | for label in prior: 81 | prior[label] /= float(total_count) 82 | return prior 83 | 84 | 85 | def get_likelihood(term_document_matrix, label_index, smoothing=0): 86 | """ Compute likelihood based on training samples 87 | Args: 88 | term_document_matrix (sparse matrix) 89 | label_index (grouped sample indices by class) 90 | smoothing (integer, additive Laplace smoothing parameter) 91 | Returns: 92 | dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value 93 | """ 94 | likelihood = {} 95 | for label, index in label_index.items(): 96 | likelihood[label] = term_document_matrix[index, :].sum(axis=0) + smoothing 97 | likelihood[label] = np.asarray(likelihood[label])[0] 98 | total_count = likelihood[label].sum() 99 | likelihood[label] = likelihood[label] / float(total_count) 100 | return likelihood 101 | 102 | feature_names[:5] 103 | 104 | 105 | def get_posterior(term_document_matrix, prior, likelihood): 106 | """ Compute posterior of testing samples, based on prior and likelihood 107 | Args: 108 | term_document_matrix (sparse matrix) 109 | prior (dictionary, with class label as key, corresponding prior as the value) 110 | likelihood (dictionary, with class label as key, corresponding conditional probability vector as value) 111 | Returns: 112 | dictionary, with class label as key, corresponding posterior as value 113 | """ 114 | num_docs = term_document_matrix.shape[0] 115 | posteriors = [] 116 | for i in range(num_docs): 117 | # posterior is proportional to prior * likelihood 118 | # = exp(log(prior * likelihood)) 119 | # = exp(log(prior) + log(likelihood)) 120 | posterior = {key: np.log(prior_label) for key, prior_label in prior.items()} 121 | for label, likelihood_label in likelihood.items(): 122 | term_document_vector = term_document_matrix.getrow(i) 123 | counts = term_document_vector.data 124 | indices = term_document_vector.indices 125 | for count, index in zip(counts, indices): 126 | posterior[label] += np.log(likelihood_label[index]) * count 127 | # exp(-1000):exp(-999) will cause zero division error, 128 | # however it equates to exp(0):exp(1) 129 | min_log_posterior = min(posterior.values()) 130 | for label in posterior: 131 | try: 132 | posterior[label] = 
np.exp(posterior[label] - min_log_posterior) 133 | except: 134 | # if one's log value is excessively large, assign it infinity 135 | posterior[label] = float('inf') 136 | # normalize so that all sums up to 1 137 | sum_posterior = sum(posterior.values()) 138 | for label in posterior: 139 | if posterior[label] == float('inf'): 140 | posterior[label] = 1.0 141 | else: 142 | posterior[label] /= sum_posterior 143 | posteriors.append(posterior.copy()) 144 | return posteriors 145 | 146 | 147 | label_index = get_label_index(labels) 148 | prior = get_prior(label_index) 149 | 150 | smoothing = 1 151 | likelihood = get_likelihood(term_docs, label_index, smoothing) 152 | 153 | 154 | 155 | emails_test = [ 156 | '''Subject: flat screens 157 | hello , 158 | please call or contact regarding the other flat screens requested . 159 | trisha tlapek - eb 3132 b 160 | michael sergeev - eb 3132 a 161 | also the sun blocker that was taken away from eb 3131 a . 162 | trisha should two monitors also michael . 163 | thanks 164 | kevin moore''', 165 | '''Subject: having problems in bed ? we can help ! 166 | cialis allows men to enjoy a fully normal sex life without having to plan the sexual act . 167 | if we let things terrify us , life will not be worth living . 168 | brevity is the soul of lingerie . 169 | suspicion always haunts the guilty mind .''', 170 | ] 171 | 172 | cleaned_test = clean_text(emails_test) 173 | term_docs_test = cv.transform(cleaned_test) 174 | posterior = get_posterior(term_docs_test, prior, likelihood) 175 | print(posterior) 176 | 177 | 178 | 179 | from sklearn.model_selection import train_test_split 180 | X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=42) 181 | 182 | len(X_train), len(Y_train) 183 | len(X_test), len(Y_test) 184 | 185 | term_docs_train = cv.fit_transform(X_train) 186 | label_index = get_label_index(Y_train) 187 | prior = get_prior(label_index) 188 | likelihood = get_likelihood(term_docs_train, label_index, smoothing) 189 | 190 | term_docs_test = cv.transform(X_test) 191 | posterior = get_posterior(term_docs_test, prior, likelihood) 192 | 193 | correct = 0.0 194 | for pred, actual in zip(posterior, Y_test): 195 | if actual == 1: 196 | if pred[1] >= 0.5: 197 | correct += 1 198 | elif pred[0] > 0.5: 199 | correct += 1 200 | 201 | print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(Y_test), correct/len(Y_test)*100)) 202 | 203 | 204 | 205 | 206 | from sklearn.naive_bayes import MultinomialNB 207 | clf = MultinomialNB(alpha=1.0, fit_prior=True) 208 | clf.fit(term_docs_train, Y_train) 209 | prediction_prob = clf.predict_proba(term_docs_test) 210 | prediction_prob[0:10] 211 | prediction = clf.predict(term_docs_test) 212 | prediction[:10] 213 | accuracy = clf.score(term_docs_test, Y_test) 214 | print('The accuracy using MultinomialNB is: {0:.1f}%'.format(accuracy*100)) 215 | 216 | 217 | 218 | from sklearn.metrics import confusion_matrix 219 | confusion_matrix(Y_test, prediction, labels=[0, 1]) 220 | 221 | from sklearn.metrics import precision_score, recall_score, f1_score 222 | precision_score(Y_test, prediction, pos_label=1) 223 | recall_score(Y_test, prediction, pos_label=1) 224 | f1_score(Y_test, prediction, pos_label=1) 225 | 226 | f1_score(Y_test, prediction, pos_label=0) 227 | 228 | from sklearn.metrics import classification_report 229 | report = classification_report(Y_test, prediction) 230 | print(report) 231 | 232 | 233 | 234 | 235 | pos_prob = prediction_prob[:, 1] 236 | thresholds = np.arange(0.0, 1.2, 
0.1) 237 | true_pos, false_pos = [0]*len(thresholds), [0]*len(thresholds) 238 | for pred, y in zip(pos_prob, Y_test): 239 | for i, threshold in enumerate(thresholds): 240 | if pred >= threshold: 241 | if y == 1: 242 | true_pos[i] += 1 243 | else: 244 | false_pos[i] += 1 245 | else: 246 | break 247 | 248 | true_pos_rate = [tp / 516.0 for tp in true_pos] 249 | false_pos_rate = [fp / 1191.0 for fp in false_pos] 250 | 251 | 252 | import matplotlib.pyplot as plt 253 | plt.figure() 254 | lw = 2 255 | plt.plot(false_pos_rate, true_pos_rate, color='darkorange', 256 | lw=lw) 257 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 258 | plt.xlim([0.0, 1.0]) 259 | plt.ylim([0.0, 1.05]) 260 | plt.xlabel('False Positive Rate') 261 | plt.ylabel('True Positive Rate') 262 | plt.title('Receiver Operating Characteristic') 263 | plt.legend(loc="lower right") 264 | plt.show() 265 | 266 | 267 | 268 | 269 | from sklearn.metrics import roc_auc_score 270 | roc_auc_score(Y_test, pos_prob) 271 | 272 | 273 | 274 | from sklearn.model_selection import StratifiedKFold 275 | k = 10 276 | k_fold = StratifiedKFold(n_splits=k) 277 | # convert to numpy array for more efficient slicing 278 | cleaned_emails_np = np.array(cleaned_emails) 279 | labels_np = np.array(labels) 280 | 281 | max_features_option = [2000, 4000, 8000] 282 | smoothing_factor_option = [0.5, 1.0, 1.5, 2.0] 283 | fit_prior_option = [True, False] 284 | auc_record = {} 285 | 286 | for train_indices, test_indices in k_fold.split(cleaned_emails, labels): 287 | X_train, X_test = cleaned_emails_np[train_indices], cleaned_emails_np[test_indices] 288 | Y_train, Y_test = labels_np[train_indices], labels_np[test_indices] 289 | for max_features in max_features_option: 290 | if max_features not in auc_record: 291 | auc_record[max_features] = {} 292 | cv = CountVectorizer(stop_words="english", max_features=max_features) 293 | term_docs_train = cv.fit_transform(X_train) 294 | term_docs_test = cv.transform(X_test) 295 | for smoothing_factor in smoothing_factor_option: 296 | if smoothing_factor not in auc_record[max_features]: 297 | auc_record[max_features][smoothing_factor] = {} 298 | for fit_prior in fit_prior_option: 299 | clf = MultinomialNB(alpha=smoothing_factor, fit_prior=fit_prior) 300 | clf.fit(term_docs_train, Y_train) 301 | prediction_prob = clf.predict_proba(term_docs_test) 302 | pos_prob = prediction_prob[:, 1] 303 | auc = roc_auc_score(Y_test, pos_prob) 304 | auc_record[max_features][smoothing_factor][fit_prior] \ 305 | = auc + auc_record[max_features][smoothing_factor].get(fit_prior, 0.0) 306 | 307 | print(auc_record) 308 | 309 | print('max features smoothing fit prior auc') 310 | for max_features, max_feature_record in auc_record.items(): 311 | for smoothing, smoothing_record in max_feature_record.items(): 312 | for fit_prior, auc in smoothing_record.items(): 313 | print(' {0} {1} {2} {3:.4f}'.format(max_features, smoothing, fit_prior, auc/k)) 314 | 315 | 316 | -------------------------------------------------------------------------------- /Chapter04/1email_spam_tfidf_submit.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import names 2 | from nltk.stem import WordNetLemmatizer 3 | import glob 4 | import os 5 | import numpy as np 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.metrics import roc_auc_score 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | 10 | 11 | emails, labels = [], [] 12 | 13 | file_path = '../enron1/spam/' 14 | for filename in 
glob.glob(os.path.join(file_path, '*.txt')): 15 | with open(filename, 'r', encoding = "ISO-8859-1") as infile: 16 | emails.append(infile.read()) 17 | labels.append(1) 18 | 19 | file_path = '../enron1/ham/' 20 | for filename in glob.glob(os.path.join(file_path, '*.txt')): 21 | with open(filename, 'r', encoding = "ISO-8859-1") as infile: 22 | emails.append(infile.read()) 23 | labels.append(0) 24 | 25 | def letters_only(astr): 26 | for c in astr: 27 | if not c.isalpha(): 28 | return False 29 | return True 30 | 31 | all_names = set(names.words()) 32 | lemmatizer = WordNetLemmatizer() 33 | 34 | def clean_text(docs): 35 | cleaned_docs = [] 36 | for doc in docs: 37 | cleaned_docs.append(' '.join([lemmatizer.lemmatize(word.lower()) 38 | for word in doc.split() 39 | if letters_only(word) 40 | and word not in all_names])) 41 | return cleaned_docs 42 | 43 | cleaned_emails = clean_text(emails) 44 | 45 | from sklearn.model_selection import StratifiedKFold 46 | k = 10 47 | k_fold = StratifiedKFold(n_splits=k) 48 | # convert to numpy array for more efficient slicing 49 | cleaned_emails_np = np.array(cleaned_emails) 50 | labels_np = np.array(labels) 51 | 52 | smoothing_factor_option = [1.0, 2.0, 3.0, 4.0, 5.0] 53 | from collections import defaultdict 54 | auc_record = defaultdict(float) 55 | 56 | for train_indices, test_indices in k_fold.split(cleaned_emails, labels): 57 | X_train, X_test = cleaned_emails_np[train_indices], cleaned_emails_np[test_indices] 58 | Y_train, Y_test = labels_np[train_indices], labels_np[test_indices] 59 | tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=8000) 60 | term_docs_train = tfidf_vectorizer.fit_transform(X_train) 61 | term_docs_test = tfidf_vectorizer.transform(X_test) 62 | for smoothing_factor in smoothing_factor_option: 63 | clf = MultinomialNB(alpha=smoothing_factor, fit_prior=True) 64 | clf.fit(term_docs_train, Y_train) 65 | prediction_prob = clf.predict_proba(term_docs_test) 66 | pos_prob = prediction_prob[:, 1] 67 | auc = roc_auc_score(Y_test, pos_prob) 68 | auc_record[smoothing_factor] += auc 69 | 70 | print(auc_record) 71 | 72 | print('max features smoothing fit prior auc') 73 | for smoothing, smoothing_record in auc_record.items(): 74 | print(' 8000 {0} true {1:.4f}'.format(smoothing, smoothing_record/k)) 75 | 76 | 77 | -------------------------------------------------------------------------------- /Chapter04/2topic_categorization.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.datasets import fetch_20newsgroups 3 | from nltk.corpus import names 4 | from nltk.stem import WordNetLemmatizer 5 | 6 | all_names = set(names.words()) 7 | lemmatizer = WordNetLemmatizer() 8 | 9 | def letters_only(astr): 10 | for c in astr: 11 | if not c.isalpha(): 12 | return False 13 | return True 14 | 15 | def clean_text(docs): 16 | cleaned_docs = [] 17 | for doc in docs: 18 | cleaned_docs.append(' '.join([lemmatizer.lemmatize(word.lower()) 19 | for word in doc.split() 20 | if letters_only(word) 21 | and word not in all_names])) 22 | return cleaned_docs 23 | 24 | 25 | # Binary classification 26 | categories = ['comp.graphics', 'sci.space'] 27 | 28 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 29 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 30 | 31 | cleaned_train = clean_text(data_train.data) 32 | label_train = data_train.target 33 
| cleaned_test = clean_text(data_test.data) 34 | label_test = data_test.target 35 | 36 | from collections import Counter 37 | Counter(label_train) 38 | 39 | tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=8000) 40 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 41 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 42 | 43 | from sklearn.svm import SVC 44 | svm = SVC(kernel='linear', C=1.0, random_state=42) 45 | svm.fit(term_docs_train, label_train) 46 | accuracy = svm.score(term_docs_test, label_test) 47 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 48 | 49 | 50 | # Multiclass classification 51 | categories = [ 52 | 'alt.atheism', 53 | 'talk.religion.misc', 54 | 'comp.graphics', 55 | 'sci.space', 56 | 'rec.sport.hockey' 57 | ] 58 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 59 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 60 | 61 | cleaned_train = clean_text(data_train.data) 62 | label_train = data_train.target 63 | cleaned_test = clean_text(data_test.data) 64 | label_test = data_test.target 65 | 66 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 67 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 68 | 69 | svm = SVC(kernel='linear', C=1.0, random_state=42) 70 | svm.fit(term_docs_train, label_train) 71 | accuracy = svm.score(term_docs_test, label_test) 72 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 73 | 74 | from sklearn.metrics import classification_report 75 | prediction = svm.predict(term_docs_test) 76 | report = classification_report(label_test, prediction) 77 | print(report) 78 | 79 | 80 | # Grid search 81 | 82 | categories = None 83 | data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42) 84 | data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42) 85 | 86 | cleaned_train = clean_text(data_train.data) 87 | label_train = data_train.target 88 | cleaned_test = clean_text(data_test.data) 89 | label_test = data_test.target 90 | 91 | tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=8000) 92 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 93 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 94 | 95 | parameters = {'C': [0.1, 1, 10, 100]} 96 | svc_libsvm = SVC(kernel='linear') 97 | 98 | from sklearn.model_selection import GridSearchCV 99 | grid_search = GridSearchCV(svc_libsvm, parameters, n_jobs=-1, cv=3) 100 | 101 | 102 | import timeit 103 | start_time = timeit.default_timer() 104 | grid_search.fit(term_docs_train, label_train) 105 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 106 | 107 | print(grid_search.best_params_) 108 | print(grid_search.best_score_) 109 | 110 | svc_libsvm_best = grid_search.best_estimator_ 111 | accuracy = svc_libsvm_best.score(term_docs_test, label_test) 112 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 113 | 114 | 115 | from sklearn.svm import LinearSVC 116 | svc_linear = LinearSVC() 117 | grid_search = GridSearchCV(svc_linear, parameters, n_jobs=-1, cv=3) 118 | 119 | start_time = timeit.default_timer() 120 | grid_search.fit(term_docs_train, label_train) 121 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 122 | 123 | print(grid_search.best_params_) 124 | print(grid_search.best_score_) 125 | 
svc_linear_best = grid_search.best_estimator_ 126 | accuracy = svc_linear_best.score(term_docs_test, label_test) 127 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 128 | 129 | 130 | 131 | # Pipeline 132 | from sklearn.pipeline import Pipeline 133 | 134 | pipeline = Pipeline([ 135 | ('tfidf', TfidfVectorizer(stop_words='english')), 136 | ('svc', LinearSVC()), 137 | ]) 138 | 139 | parameters_pipeline = { 140 | 'tfidf__max_df': (0.25, 0.5), 141 | 'tfidf__max_features': (40000, 50000), 142 | 'tfidf__sublinear_tf': (True, False), 143 | 'tfidf__smooth_idf': (True, False), 144 | 'svc__C': (0.1, 1, 10, 100), 145 | } 146 | 147 | grid_search = GridSearchCV(pipeline, parameters_pipeline, n_jobs=-1, cv=3) 148 | 149 | start_time = timeit.default_timer() 150 | grid_search.fit(cleaned_train, label_train) 151 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 152 | 153 | print(grid_search.best_params_) 154 | print(grid_search.best_score_) 155 | pipeline_best = grid_search.best_estimator_ 156 | accuracy = pipeline_best.score(cleaned_test, label_test) 157 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /Chapter04/3plot_rbf_kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.svm import SVC 4 | 5 | X = np.c_[# negative class 6 | (.3, -.8), 7 | (-1.5, -1), 8 | (-1.3, -.8), 9 | (-1.1, -1.3), 10 | (-1.2, -.3), 11 | (-1.3, -.5), 12 | (-.6, 1.1), 13 | (-1.4, 2.2), 14 | (1, 1), 15 | # positive class 16 | (1.3, .8), 17 | (1.2, .5), 18 | (.2, -2), 19 | (.5, -2.4), 20 | (.2, -2.3), 21 | (0, -2.7), 22 | (1.3, 2.1)].T 23 | Y = [-1] * 8 + [1] * 8 24 | 25 | gamma_option = [1, 2, 4] 26 | plt.figure(1, figsize=(4*len(gamma_option), 4)) 27 | 28 | for i, gamma in enumerate(gamma_option, 1): 29 | svm = SVC(kernel='rbf', gamma=gamma) 30 | svm.fit(X, Y) 31 | plt.subplot(1, len(gamma_option), i) 32 | plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired) 33 | plt.axis('tight') 34 | XX, YY = np.mgrid[-3:3:200j, -3:3:200j] 35 | Z = svm.decision_function(np.c_[XX.ravel(), YY.ravel()]) 36 | Z = Z.reshape(XX.shape) 37 | plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) 38 | plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) 39 | plt.title('gamma = %d' % gamma) 40 | 41 | plt.show() 42 | 43 | -------------------------------------------------------------------------------- /Chapter04/4ctg.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df = pd.read_excel('CTG.xls', "Raw Data") 3 | 4 | X = df.ix[1:2126, 3:-2].values 5 | Y = df.ix[1:2126, -1].values # 3 class classification 6 | # Y = df.ix[2:2126, -2].values 7 | 8 | from collections import Counter 9 | Counter(Y) 10 | 11 | from sklearn.model_selection import train_test_split 12 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 13 | 14 | from sklearn.svm import SVC 15 | svc = SVC(kernel='rbf') 16 | 17 | parameters = {'C': (100, 1e3, 1e4, 1e5), 18 | 'gamma': (1e-08, 1e-7, 1e-6, 1e-5) 19 | } 20 | from sklearn.model_selection import GridSearchCV 21 | grid_search = GridSearchCV(svc, parameters, n_jobs=-1, cv=3) 22 | 23 | 24 | import timeit 25 | start_time = timeit.default_timer() 26 | grid_search.fit(X_train, Y_train) 27 | print("--- 
%0.3fs seconds ---" % (timeit.default_timer() - start_time)) 28 | 29 | print(grid_search.best_params_) 30 | print(grid_search.best_score_) 31 | 32 | svc_best = grid_search.best_estimator_ 33 | 34 | accuracy = svc_best.score(X_test, Y_test) 35 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 36 | 37 | prediction = svc_best.predict(X_test) 38 | from sklearn.metrics import classification_report 39 | report = classification_report(Y_test, prediction) 40 | print(report) 41 | -------------------------------------------------------------------------------- /Chapter04/CTG.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Machine-Learning-By-Example/6ee2be561e511bd0a1c0b3d481ad3950ea3f1815/Chapter04/CTG.xls -------------------------------------------------------------------------------- /Chapter05/1decision_tree_submit.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | # Plot Gini Impurity in binary case 6 | pos_fraction = np.linspace(0.00, 1.00, 1000) 7 | gini = 1 - pos_fraction**2 - (1-pos_fraction)**2 8 | plt.plot(pos_fraction, gini) 9 | plt.xlabel('Positive fraction') 10 | plt.ylabel('Gini Impurity') 11 | plt.ylim(0, 1) 12 | plt.show() 13 | 14 | 15 | # Given labels of a data set, the Gini Impurity calculation function 16 | def gini_impurity(labels): 17 | # When the set is empty, it is also pure 18 | if not labels: 19 | return 0 20 | # Count the occurrences of each label 21 | counts = np.unique(labels, return_counts=True)[1] 22 | fractions = counts / float(len(labels)) 23 | return 1 - np.sum(fractions ** 2) 24 | 25 | print('{0:.4f}'.format(gini_impurity([1, 1, 0, 1, 0]))) 26 | print('{0:.4f}'.format(gini_impurity([1, 1, 0, 1, 0, 0]))) 27 | print('{0:.4f}'.format(gini_impurity([1, 1, 1, 1]))) 28 | 29 | 30 | # Plot entropy in binary case 31 | pos_fraction = np.linspace(0.00, 1.00, 1000) 32 | ent = - (pos_fraction * np.log2(pos_fraction) + (1 - pos_fraction) * np.log2(1 - pos_fraction)) 33 | plt.plot(pos_fraction, ent) 34 | plt.xlabel('Positive fraction') 35 | plt.ylabel('Entropy') 36 | plt.ylim(0, 1) 37 | plt.show() 38 | 39 | 40 | # Given labels of a data set, the entropy calculation function 41 | def entropy(labels): 42 | if not labels: 43 | return 0 44 | counts = np.unique(labels, return_counts=True)[1] 45 | fractions = counts / float(len(labels)) 46 | return - np.sum(fractions * np.log2(fractions)) 47 | 48 | print('{0:.4f}'.format(entropy([1, 1, 0, 1, 0]))) 49 | print('{0:.4f}'.format(entropy([1, 1, 0, 1, 0, 0]))) 50 | print('{0:.4f}'.format(entropy([1, 1, 1, 1]))) 51 | 52 | 53 | def information_gain(y, mask, func=entropy): 54 | s1 = np.sum(mask) 55 | s2 = mask.size - s1 56 | if (s1 == 0 | s2 == 0): return 0 57 | return func(y) - s1 / float(s1 + s2) * func(y[mask]) - s2 / float(s1 + s2) * func(y[np.logical_not(mask)]) 58 | 59 | 60 | criterion_function = {'gini': gini_impurity, 'entropy': entropy} 61 | def weighted_impurity(groups, criterion='gini'): 62 | """ Calculate weighted impurity of children after a split 63 | Args: 64 | groups (list of children, and a child consists a list of class labels) 65 | criterion (metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain) 66 | Returns: 67 | float, weighted impurity 68 | """ 69 | total = sum(len(group) for group in groups) 70 | weighted_sum = 0.0 71 | for group in groups: 72 | 
weighted_sum += len(group) / float(total) * criterion_function[criterion](group) 73 | return weighted_sum 74 | 75 | 76 | children_1 = [[1, 0, 1], [0, 1]] 77 | children_2 = [[1, 1], [0, 0, 1]] 78 | print('Entropy of #1 split: {0:.4f}'.format(weighted_impurity(children_1, 'entropy'))) 79 | print('Entropy of #2 split: {0:.4f}'.format(weighted_impurity(children_2, 'entropy'))) 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | def gini_impurity(labels): 88 | # When the set is empty, it is also pure 89 | if labels.size == 0: 90 | return 0 91 | # Count the occurrences of each label 92 | counts = np.unique(labels, return_counts=True)[1] 93 | fractions = counts / float(len(labels)) 94 | return 1 - np.sum(fractions ** 2) 95 | 96 | def entropy(labels): 97 | # When the set is empty, it is also pure 98 | if labels.size == 0: 99 | return 0 100 | counts = np.unique(labels, return_counts=True)[1] 101 | fractions = counts / float(len(labels)) 102 | return - np.sum(fractions * np.log2(fractions)) 103 | 104 | criterion_function = {'gini': gini_impurity, 'entropy': entropy} 105 | def weighted_impurity(groups, criterion='gini'): 106 | """ Calculate weighted impurity of children after a split 107 | Args: 108 | groups (list of children, and a child consists a list of class labels) 109 | criterion (metric to measure the quality of a split, 'gini' for Gini Impurity or 'entropy' for Information Gain) 110 | Returns: 111 | float, weighted impurity 112 | """ 113 | total = sum(len(group) for group in groups) 114 | weighted_sum = 0.0 115 | for group in groups: 116 | weighted_sum += len(group) / float(total) * criterion_function[criterion](group) 117 | return weighted_sum 118 | 119 | 120 | def split_node(X, y, index, value): 121 | """ Split data set X, y based on a feature and a value 122 | Args: 123 | X, y (numpy.ndarray, data set) 124 | index (int, index of the feature used for splitting) 125 | value (value of the feature used for splitting) 126 | Returns: 127 | list, list: left and right child, a child is in the format of [X, y] 128 | """ 129 | x_index = X[:, index] 130 | # if this feature is numerical 131 | if X[0, index].dtype.kind in ['i', 'f']: 132 | mask = x_index >= value 133 | # if this feature is categorical 134 | else: 135 | mask = x_index == value 136 | # split into left and right child 137 | left = [X[~mask, :], y[~mask]] 138 | right = [X[mask, :], y[mask]] 139 | return left, right 140 | 141 | 142 | def get_best_split(X, y, criterion): 143 | """ Obtain the best splitting point and resulting children for the data set X, y 144 | Args: 145 | X, y (numpy.ndarray, data set) 146 | criterion (gini or entropy) 147 | Returns: 148 | dict {index: index of the feature, value: feature value, children: left and right children} 149 | """ 150 | best_index, best_value, best_score, children = None, None, 1, None 151 | for index in range(len(X[0])): 152 | for value in np.sort(np.unique(X[:, index])): 153 | groups = split_node(X, y, index, value) 154 | impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion) 155 | if impurity < best_score: 156 | best_index, best_value, best_score, children = index, value, impurity, groups 157 | return {'index': best_index, 'value': best_value, 'children': children} 158 | 159 | 160 | 161 | def get_leaf(labels): 162 | # Obtain the leaf as the majority of the labels 163 | return np.bincount(labels).argmax() 164 | 165 | 166 | 167 | def split(node, max_depth, min_size, depth, criterion): 168 | """ Split children of a node to construct new nodes or assign them terminals 169 | Args: 170 | node 
(dict, with children info) 171 | max_depth (int, maximal depth of the tree) 172 | min_size (int, minimal samples required to further split a child) 173 | depth (int, current depth of the node) 174 | criterion (gini or entropy) 175 | """ 176 | left, right = node['children'] 177 | del (node['children']) 178 | if left[1].size == 0: 179 | node['right'] = get_leaf(right[1]) 180 | return 181 | if right[1].size == 0: 182 | node['left'] = get_leaf(left[1]) 183 | return 184 | # Check if the current depth exceeds the maximal depth 185 | if depth >= max_depth: 186 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 187 | return 188 | # Check if the left child has enough samples 189 | if left[1].size <= min_size: 190 | node['left'] = get_leaf(left[1]) 191 | else: 192 | # It has enough samples, we further split it 193 | result = get_best_split(left[0], left[1], criterion) 194 | result_left, result_right = result['children'] 195 | if result_left[1].size == 0: 196 | node['left'] = get_leaf(result_right[1]) 197 | elif result_right[1].size == 0: 198 | node['left'] = get_leaf(result_left[1]) 199 | else: 200 | node['left'] = result 201 | split(node['left'], max_depth, min_size, depth + 1, criterion) 202 | # Check if the right child has enough samples 203 | if right[1].size <= min_size: 204 | node['right'] = get_leaf(right[1]) 205 | else: 206 | # It has enough samples, we further split it 207 | result = get_best_split(right[0], right[1], criterion) 208 | result_left, result_right = result['children'] 209 | if result_left[1].size == 0: 210 | node['right'] = get_leaf(result_right[1]) 211 | elif result_right[1].size == 0: 212 | node['right'] = get_leaf(result_left[1]) 213 | else: 214 | node['right'] = result 215 | split(node['right'], max_depth, min_size, depth + 1, criterion) 216 | 217 | 218 | def train_tree(X_train, y_train, max_depth, min_size, criterion='gini'): 219 | """ Construction of a tree starts here 220 | Args: 221 | X_train, y_train (list, list, training data) 222 | max_depth (int, maximal depth of the tree) 223 | min_size (int, minimal samples required to further split a child) 224 | criterion (gini or entropy) 225 | """ 226 | X = np.array(X_train) 227 | y = np.array(y_train) 228 | root = get_best_split(X, y, criterion) 229 | split(root, max_depth, min_size, 1, criterion) 230 | return root 231 | 232 | 233 | 234 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 235 | 'categorical': {'yes': 'is', 'no': 'is not'}} 236 | def visualize_tree(node, depth=0): 237 | if isinstance(node, dict): 238 | if node['value'].dtype.kind in ['i', 'f']: 239 | condition = CONDITION['numerical'] 240 | else: 241 | condition = CONDITION['categorical'] 242 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 243 | if 'left' in node: 244 | visualize_tree(node['left'], depth + 1) 245 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 246 | if 'right' in node: 247 | visualize_tree(node['right'], depth + 1) 248 | else: 249 | print('{}[{}]'.format(depth * ' ', node)) 250 | 251 | 252 | X_train = [['tech', 'professional'], 253 | ['fashion', 'student'], 254 | ['fashion', 'professional'], 255 | ['sports', 'student'], 256 | ['tech', 'student'], 257 | ['tech', 'retired'], 258 | ['sports', 'professional']] 259 | 260 | y_train = [1, 261 | 0, 262 | 0, 263 | 0, 264 | 1, 265 | 0, 266 | 1] 267 | 268 | tree = train_tree(X_train, y_train, 2, 2) 269 | visualize_tree(tree) 270 | 271 | 272 | 273 | 274 | X_train_n = [[6, 7], 275 | [2, 
4], 276 | [7, 2], 277 | [3, 6], 278 | [4, 7], 279 | [5, 2], 280 | [1, 6], 281 | [2, 0], 282 | [6, 3], 283 | [4, 1]] 284 | 285 | y_train_n = [0, 286 | 0, 287 | 0, 288 | 0, 289 | 0, 290 | 1, 291 | 1, 292 | 1, 293 | 1, 294 | 1] 295 | 296 | tree = train_tree(X_train_n, y_train_n, 2, 2) 297 | visualize_tree(tree) 298 | 299 | 300 | from sklearn.tree import DecisionTreeClassifier 301 | tree_sk = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=2) 302 | tree_sk.fit(X_train_n, y_train_n) 303 | 304 | from sklearn.tree import export_graphviz 305 | export_graphviz(tree_sk, out_file='tree.dot', feature_names=['X1', 'X2'], impurity=False, filled=True, class_names=['0', '1']) 306 | -------------------------------------------------------------------------------- /Chapter05/2avazu_ctr.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | def read_ad_click_data(n, offset=0): 4 | X_dict, y = [], [] 5 | with open('train.csv', 'r') as csvfile: 6 | reader = csv.DictReader(csvfile) 7 | for i in range(offset): 8 | next(reader) 9 | i = 0 10 | for row in reader: 11 | i += 1 12 | y.append(int(row['click'])) 13 | del row['click'], row['id'], row['hour'], row['device_id'], row['device_ip'] 14 | X_dict.append(dict(row)) 15 | if i >= n: 16 | break 17 | return X_dict, y 18 | 19 | n = 100000 20 | X_dict_train, y_train = read_ad_click_data(n) 21 | print(X_dict_train[0]) 22 | print(X_dict_train[1]) 23 | 24 | 25 | from sklearn.feature_extraction import DictVectorizer 26 | dict_one_hot_encoder = DictVectorizer(sparse=False) 27 | X_train = dict_one_hot_encoder.fit_transform(X_dict_train) 28 | print(len(X_train[0])) 29 | 30 | X_dict_test, y_test = read_ad_click_data(n, n) 31 | X_test = dict_one_hot_encoder.transform(X_dict_test) 32 | print(len(X_test[0])) 33 | 34 | 35 | from sklearn.tree import DecisionTreeClassifier 36 | parameters = {'max_depth': [3, 10, None]} 37 | decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30) 38 | 39 | from sklearn.model_selection import GridSearchCV 40 | grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 41 | 42 | grid_search.fit(X_train, y_train) 43 | print(grid_search.best_params_) 44 | 45 | decision_tree_best = grid_search.best_estimator_ 46 | pos_prob = decision_tree_best.predict_proba(X_test)[:, 1] 47 | 48 | from sklearn.metrics import roc_auc_score 49 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test, pos_prob))) 50 | 51 | 52 | 53 | from sklearn.ensemble import RandomForestClassifier 54 | 55 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 56 | grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc') 57 | grid_search.fit(X_train, y_train) 58 | print(grid_search.best_params_) 59 | print(grid_search.best_score_) 60 | 61 | random_forest_best = grid_search.best_estimator_ 62 | pos_prob = random_forest_best.predict_proba(X_test)[:, 1] 63 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test, pos_prob))) 64 | 65 | 66 | -------------------------------------------------------------------------------- /Chapter06/1one_hot_encode.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction import DictVectorizer 2 | 3 | 4 | X_dict = [{'interest': 'tech', 'occupation': 'professional'}, 5 | {'interest': 'fashion', 'occupation': 'student'}, 6 | {'interest': 'fashion', 
'occupation': 'professional'}, 7 | {'interest': 'sports', 'occupation': 'student'}, 8 | {'interest': 'tech', 'occupation': 'student'}, 9 | {'interest': 'tech', 'occupation': 'retired'}, 10 | {'interest': 'sports', 'occupation': 'professional'}] 11 | 12 | dict_one_hot_encoder = DictVectorizer(sparse=False) 13 | X_encoded = dict_one_hot_encoder.fit_transform(X_dict) 14 | print(X_encoded) 15 | 16 | print(dict_one_hot_encoder.vocabulary_) 17 | 18 | new_dict = [{'interest': 'sports', 'occupation': 'retired'}] 19 | new_encoded = dict_one_hot_encoder.transform(new_dict) 20 | print(new_encoded) 21 | 22 | print(dict_one_hot_encoder.inverse_transform(new_encoded)) 23 | 24 | 25 | # new category not encountered before 26 | new_dict = [{'interest': 'unknown_interest', 'occupation': 'retired'}, 27 | {'interest': 'tech', 'occupation': 'unseen_occupation'}] 28 | new_encoded = dict_one_hot_encoder.transform(new_dict) 29 | print(new_encoded) 30 | 31 | 32 | 33 | import numpy as np 34 | X_str = np.array([['tech', 'professional'], 35 | ['fashion', 'student'], 36 | ['fashion', 'professional'], 37 | ['sports', 'student'], 38 | ['tech', 'student'], 39 | ['tech', 'retired'], 40 | ['sports', 'professional']]) 41 | 42 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 43 | label_encoder = LabelEncoder() 44 | X_int = label_encoder.fit_transform(X_str.ravel()).reshape(*X_str.shape) 45 | print(X_int) 46 | 47 | one_hot_encoder = OneHotEncoder() 48 | X_encoded = one_hot_encoder.fit_transform(X_int).toarray() 49 | print(X_encoded) 50 | 51 | 52 | 53 | # new category not encountered before 54 | new_str = np.array([['unknown_interest', 'retired'], 55 | ['tech', 'unseen_occupation'], 56 | ['unknown_interest', 'unseen_occupation']]) 57 | 58 | def string_to_dict(columns, data_str): 59 | data_dict = [] 60 | for sample_str in data_str: 61 | data_dict.append({column: value for column, value in zip(columns, sample_str)}) 62 | return data_dict 63 | 64 | columns = ['interest', 'occupation'] 65 | new_encoded = dict_one_hot_encoder.transform(string_to_dict(columns, new_str)) 66 | print(new_encoded) -------------------------------------------------------------------------------- /Chapter06/2logistic_function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def sigmoid(input): 5 | return 1.0 / (1 + np.exp(-input)) 6 | 7 | 8 | import matplotlib.pyplot as plt 9 | z = np.linspace(-8, 8, 1000) 10 | y = sigmoid(z) 11 | plt.plot(z, y) 12 | plt.axhline(y=0, ls='dotted', color='k') 13 | plt.axhline(y=0.5, ls='dotted', color='k') 14 | plt.axhline(y=1, ls='dotted', color='k') 15 | plt.yticks([0.0, 0.25, 0.5, 0.75, 1.0]) 16 | plt.xlabel('z') 17 | plt.ylabel('y(z)') 18 | plt.show() 19 | 20 | 21 | # plot sample cost vs y_hat (prediction), for y (truth) = 1 22 | y_hat = np.linspace(0, 1, 1000) 23 | cost = -np.log(y_hat) 24 | plt.plot(y_hat, cost) 25 | plt.xlabel('Prediction') 26 | plt.ylabel('Cost') 27 | plt.xlim(0, 1) 28 | plt.ylim(0, 7) 29 | plt.show() 30 | 31 | # plot sample cost vs y_hat (prediction), for y (truth) = 0 32 | y_hat = np.linspace(0, 1, 1000) 33 | cost = -np.log(1 - y_hat) 34 | plt.plot(y_hat, cost) 35 | plt.xlabel('Prediction') 36 | plt.ylabel('Cost') 37 | plt.xlim(0, 1) 38 | plt.ylim(0, 7) 39 | plt.show() 40 | 41 | -------------------------------------------------------------------------------- /Chapter06/3logistic_regression_from_scratch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 
import matplotlib.pyplot as plt 3 | 4 | def sigmoid(input): 5 | return 1.0 / (1 + np.exp(-input)) 6 | 7 | 8 | 9 | # Gradient descent based logistic regression from scratch 10 | def compute_prediction(X, weights): 11 | """ Compute the prediction y_hat based on current weights 12 | Args: 13 | X (numpy.ndarray) 14 | weights (numpy.ndarray) 15 | Returns: 16 | numpy.ndarray, y_hat of X under weights 17 | """ 18 | z = np.dot(X, weights) 19 | predictions = sigmoid(z) 20 | return predictions 21 | 22 | def update_weights_gd(X_train, y_train, weights, learning_rate): 23 | """ Update weights by one step 24 | Args: 25 | X_train, y_train (numpy.ndarray, training data set) 26 | weights (numpy.ndarray) 27 | learning_rate (float) 28 | Returns: 29 | numpy.ndarray, updated weights 30 | """ 31 | predictions = compute_prediction(X_train, weights) 32 | weights_delta = np.dot(X_train.T, y_train - predictions) 33 | m = y_train.shape[0] 34 | weights += learning_rate / float(m) * weights_delta 35 | return weights 36 | 37 | def compute_cost(X, y, weights): 38 | """ Compute the cost J(w) 39 | Args: 40 | X, y (numpy.ndarray, data set) 41 | weights (numpy.ndarray) 42 | Returns: 43 | float 44 | """ 45 | predictions = compute_prediction(X, weights) 46 | cost = np.mean(-y * np.log(predictions) - (1 - y) * np.log(1 - predictions)) 47 | return cost 48 | 49 | def train_logistic_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 50 | """ Train a logistic regression model 51 | Args: 52 | X_train, y_train (numpy.ndarray, training data set) 53 | max_iter (int, number of iterations) 54 | learning_rate (float) 55 | fit_intercept (bool, with an intercept w0 or not) 56 | Returns: 57 | numpy.ndarray, learned weights 58 | """ 59 | if fit_intercept: 60 | intercept = np.ones((X_train.shape[0], 1)) 61 | X_train = np.hstack((intercept, X_train)) 62 | weights = np.zeros(X_train.shape[1]) 63 | for iteration in range(max_iter): 64 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 65 | # Check the cost for every 100 (for example) iterations 66 | if iteration % 1000 == 0: 67 | print(compute_cost(X_train, y_train, weights)) 68 | return weights 69 | 70 | def predict(X, weights): 71 | if X.shape[1] == weights.shape[0] - 1: 72 | intercept = np.ones((X.shape[0], 1)) 73 | X = np.hstack((intercept, X)) 74 | return compute_prediction(X, weights) 75 | 76 | 77 | # A example 78 | X_train = np.array([[6, 7], 79 | [2, 4], 80 | [3, 6], 81 | [4, 7], 82 | [1, 6], 83 | [5, 2], 84 | [2, 0], 85 | [6, 3], 86 | [4, 1], 87 | [7, 2]]) 88 | 89 | y_train = np.array([0, 90 | 0, 91 | 0, 92 | 0, 93 | 0, 94 | 1, 95 | 1, 96 | 1, 97 | 1, 98 | 1]) 99 | 100 | weights = train_logistic_regression(X_train, y_train, max_iter=1000, learning_rate=0.1, fit_intercept=True) 101 | 102 | X_test = np.array([[6, 1], 103 | [1, 3], 104 | [3, 1], 105 | [4, 5]]) 106 | 107 | predictions = predict(X_test, weights) 108 | 109 | plt.scatter(X_train[:,0], X_train[:,1], c=['b']*5+['k']*5, marker='o') 110 | colours = ['k' if prediction >= 0.5 else 'b' for prediction in predictions] 111 | plt.scatter(X_test[:,0], X_test[:,1], marker='*', c=colours) 112 | plt.xlabel('x1') 113 | plt.ylabel('x2') 114 | plt.show() 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | # Deploy logistic regression by gradient descent to click-through prediction 125 | 126 | import csv 127 | 128 | def read_ad_click_data(n, offset=0): 129 | X_dict, y = [], [] 130 | with open('train.csv', 'r') as csvfile: 131 | reader = csv.DictReader(csvfile) 132 | for i in range(offset): 
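# advance past the first `offset` records, so that a later call such as
# read_ad_click_data(n, n) returns a disjoint chunk to use as the test set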
133 | next(reader) 134 | i = 0 135 | for row in reader: 136 | i += 1 137 | y.append(int(row['click'])) 138 | del row['click'], row['id'], row['hour'], row['device_id'], row['device_ip'] 139 | X_dict.append(row) 140 | if i >= n: 141 | break 142 | return X_dict, y 143 | 144 | n = 10000 145 | X_dict_train, y_train = read_ad_click_data(n) 146 | 147 | from sklearn.feature_extraction import DictVectorizer 148 | dict_one_hot_encoder = DictVectorizer(sparse=False) 149 | X_train = dict_one_hot_encoder.fit_transform(X_dict_train) 150 | 151 | X_dict_test, y_test = read_ad_click_data(n, n) 152 | X_test = dict_one_hot_encoder.transform(X_dict_test) 153 | 154 | X_train_10k = X_train 155 | y_train_10k = np.array(y_train) 156 | 157 | import timeit 158 | start_time = timeit.default_timer() 159 | weights = train_logistic_regression(X_train_10k, y_train_10k, max_iter=10000, learning_rate=0.01, fit_intercept=True) 160 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 161 | 162 | X_test_10k = X_test 163 | 164 | predictions = predict(X_test_10k, weights) 165 | from sklearn.metrics import roc_auc_score 166 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test, predictions))) 167 | 168 | 169 | 170 | n = 100000 171 | X_dict_train, y_train = read_ad_click_data(n) 172 | dict_one_hot_encoder = DictVectorizer(sparse=False) 173 | X_train = dict_one_hot_encoder.fit_transform(X_dict_train) 174 | 175 | X_train_100k = X_train 176 | y_train_100k = np.array(y_train) 177 | 178 | start_time = timeit.default_timer() 179 | weights = train_logistic_regression(X_train_100k, y_train_100k, max_iter=10000, learning_rate=0.01, fit_intercept=True) 180 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 181 | 182 | 183 | 184 | 185 | 186 | def update_weights_sgd(X_train, y_train, weights, learning_rate): 187 | """ One weight update iteration: moving weights by one step based on each individual sample 188 | Args: 189 | X_train, y_train (numpy.ndarray, training data set) 190 | weights (numpy.ndarray) 191 | learning_rate (float) 192 | Returns: 193 | numpy.ndarray, updated weights 194 | """ 195 | for X_each, y_each in zip(X_train, y_train): 196 | prediction = compute_prediction(X_each, weights) 197 | weights_delta = X_each.T * (y_each - prediction) 198 | weights += learning_rate * weights_delta 199 | return weights 200 | 201 | def train_logistic_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 202 | """ Train a logistic regression model 203 | Args: 204 | X_train, y_train (numpy.ndarray, training data set) 205 | max_iter (int, number of iterations) 206 | learning_rate (float) 207 | fit_intercept (bool, with an intercept w0 or not) 208 | Returns: 209 | numpy.ndarray, learned weights 210 | """ 211 | if fit_intercept: 212 | intercept = np.ones((X_train.shape[0], 1)) 213 | X_train = np.hstack((intercept, X_train)) 214 | weights = np.zeros(X_train.shape[1]) 215 | for iteration in range(max_iter): 216 | weights = update_weights_sgd(X_train, y_train, weights, learning_rate) 217 | # Check the cost for every 2 (for example) iterations 218 | if iteration % 2 == 0: 219 | print(compute_cost(X_train, y_train, weights)) 220 | return weights 221 | 222 | 223 | # Train the SGD model based on 10000 samples 224 | start_time = timeit.default_timer() 225 | weights = train_logistic_regression(X_train_10k, y_train_10k, max_iter=5, learning_rate=0.01, fit_intercept=True) 226 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 227 | predictions = 
predict(X_test_10k, weights) 228 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test, predictions))) 229 | 230 | 231 | # Train the SGD model based on 100000 samples 232 | start_time = timeit.default_timer() 233 | weights = train_logistic_regression(X_train_100k, y_train_100k, max_iter=5, learning_rate=0.01, fit_intercept=True) 234 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 235 | 236 | # Examine the performance on the next 10000 samples 237 | X_dict_test, y_test_next10k = read_ad_click_data(10000, 100000) 238 | X_test_next10k = dict_one_hot_encoder.transform(X_dict_test) 239 | predictions = predict(X_test_next10k, weights) 240 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test_next10k, predictions))) 241 | -------------------------------------------------------------------------------- /Chapter06/4random_forest_feature_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | import csv 5 | 6 | def read_ad_click_data(n, offset=0): 7 | X_dict, y = [], [] 8 | with open('train.csv', 'r') as csvfile: 9 | reader = csv.DictReader(csvfile) 10 | for i in range(offset): 11 | next(reader) 12 | i = 0 13 | for row in reader: 14 | i += 1 15 | y.append(int(row['click'])) 16 | del row['click'], row['id'], row['hour'], row['device_id'], row['device_ip'] 17 | X_dict.append(row) 18 | if i >= n: 19 | break 20 | return X_dict, y 21 | 22 | n = 10000 23 | X_dict_train, y_train = read_ad_click_data(n) 24 | 25 | from sklearn.feature_extraction import DictVectorizer 26 | dict_one_hot_encoder = DictVectorizer(sparse=False) 27 | X_train = dict_one_hot_encoder.fit_transform(X_dict_train) 28 | 29 | X_dict_test, y_test = read_ad_click_data(n, n) 30 | X_test = dict_one_hot_encoder.transform(X_dict_test) 31 | 32 | X_train_10k = X_train 33 | y_train_10k = np.array(y_train) 34 | 35 | 36 | # Feature selection with random forest 37 | 38 | from sklearn.ensemble import RandomForestClassifier 39 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1) 40 | random_forest.fit(X_train_10k, y_train_10k) 41 | 42 | 43 | # bottom 10 weights and the corresponding 10 least important features 44 | print(np.sort(random_forest.feature_importances_)[:10]) 45 | print(np.argsort(random_forest.feature_importances_)[:10]) 46 | # top 10 weights and the corresponding 10 most important features 47 | print(np.sort(random_forest.feature_importances_)[-10:]) 48 | print(np.argsort(random_forest.feature_importances_)[-10:]) 49 | 50 | print(dict_one_hot_encoder.feature_names_[393]) 51 | 52 | top500_feature = np.argsort(random_forest.feature_importances_)[-500:] 53 | X_train_10k_selected = X_train_10k[:, top500_feature] 54 | print(X_train_10k_selected.shape) 55 | -------------------------------------------------------------------------------- /Chapter06/5scikit_logistic_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | from sklearn.metrics import roc_auc_score 4 | 5 | 6 | def read_ad_click_data(n, offset=0): 7 | X_dict, y = [], [] 8 | with open('train.csv', 'r') as csvfile: 9 | reader = csv.DictReader(csvfile) 10 | for i in range(offset): 11 | next(reader) 12 | i = 0 13 | for row in reader: 14 | i += 1 15 | y.append(int(row['click'])) 16 | del row['click'], row['id'], row['hour'], row['device_id'], row['device_ip'] 17 | X_dict.append(row) 18 | if i >= n: 19 | break 
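    # X_dict holds one dictionary of categorical features per sample; y holds the matching click labels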
20 | return X_dict, y 21 | 22 | n = 10000 23 | X_dict_train, y_train = read_ad_click_data(n) 24 | 25 | from sklearn.feature_extraction import DictVectorizer 26 | dict_one_hot_encoder = DictVectorizer(sparse=False) 27 | X_train = dict_one_hot_encoder.fit_transform(X_dict_train) 28 | 29 | X_dict_test, y_test = read_ad_click_data(n, n) 30 | X_test = dict_one_hot_encoder.transform(X_dict_test) 31 | 32 | X_train_10k = X_train 33 | y_train_10k = np.array(y_train) 34 | 35 | n = 100000 36 | X_dict_train, y_train = read_ad_click_data(n) 37 | dict_one_hot_encoder = DictVectorizer(sparse=False) 38 | X_train = dict_one_hot_encoder.fit_transform(X_dict_train) 39 | 40 | X_train_100k = X_train 41 | y_train_100k = np.array(y_train) 42 | 43 | X_dict_test, y_test_next10k = read_ad_click_data(10000, 100000) 44 | X_test_next10k = dict_one_hot_encoder.transform(X_dict_test) 45 | 46 | # Use scikit-learn package 47 | from sklearn.linear_model import SGDClassifier 48 | sgd_lr = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=5, learning_rate='constant', eta0=0.01) 49 | sgd_lr.fit(X_train_100k, y_train_100k) 50 | 51 | predictions = sgd_lr.predict_proba(X_test_next10k)[:, 1] 52 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test_next10k, predictions))) 53 | 54 | 55 | 56 | # Feature selection with L1 regularization 57 | 58 | l1_feature_selector = SGDClassifier(loss='log', penalty='l1', alpha=0.0001, fit_intercept=True, n_iter=5, learning_rate='constant', eta0=0.01) 59 | l1_feature_selector.fit(X_train_10k, y_train_10k) 60 | X_train_10k_selected = l1_feature_selector.transform(X_train_10k) 61 | print(X_train_10k_selected.shape) 62 | print(X_train_10k.shape) 63 | 64 | # bottom 10 weights and the corresponding 10 least important features 65 | print(np.sort(l1_feature_selector.coef_)[0][:10]) 66 | print(np.argsort(l1_feature_selector.coef_)[0][:10]) 67 | # top 10 weights and the corresponding 10 most important features 68 | print(np.sort(l1_feature_selector.coef_)[0][-10:]) 69 | print(np.argsort(l1_feature_selector.coef_)[0][-10:]) 70 | 71 | 72 | 73 | # Online learning 74 | 75 | # The number of iterations is set to 1 if using partial_fit. 
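# Each call to partial_fit below makes a single pass over the chunk it receives,
# so the model keeps updating as new 100k-sample chunks stream in instead of being refit on all of the data at once.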
76 | sgd_lr = SGDClassifier(loss='log', penalty=None, fit_intercept=True, n_iter=1, learning_rate='constant', eta0=0.01) 77 | 78 | import timeit 79 | start_time = timeit.default_timer() 80 | 81 | # there are 40428968 labelled samples, use the first ten 100k samples for training, and the next 100k for testing 82 | for i in range(20): 83 | X_dict_train, y_train_every_100k = read_ad_click_data(100000, i * 100000) 84 | X_train_every_100k = dict_one_hot_encoder.transform(X_dict_train) 85 | sgd_lr.partial_fit(X_train_every_100k, y_train_every_100k, classes=[0, 1]) 86 | 87 | 88 | print("--- %0.3fs seconds ---" % (timeit.default_timer() - start_time)) 89 | 90 | X_dict_test, y_test_next10k = read_ad_click_data(10000, (i + 1) * 200000) 91 | X_test_next10k = dict_one_hot_encoder.transform(X_dict_test) 92 | 93 | 94 | predictions = sgd_lr.predict_proba(X_test_next10k)[:, 1] 95 | print('The ROC AUC on testing set is: {0:.3f}'.format(roc_auc_score(y_test_next10k, predictions))) 96 | 97 | 98 | # Multiclass classification with logistic regression 99 | 100 | from sklearn.feature_extraction.text import TfidfVectorizer 101 | from sklearn.datasets import fetch_20newsgroups 102 | from sklearn.linear_model import SGDClassifier 103 | from nltk.corpus import names 104 | from nltk.stem import WordNetLemmatizer 105 | 106 | all_names = set(names.words()) 107 | lemmatizer = WordNetLemmatizer() 108 | 109 | def letters_only(astr): 110 | for c in astr: 111 | if not c.isalpha(): 112 | return False 113 | return True 114 | 115 | def clean_text(docs): 116 | cleaned_docs = [] 117 | for doc in docs: 118 | cleaned_docs.append(' '.join([lemmatizer.lemmatize(word.lower()) 119 | for word in doc.split() 120 | if letters_only(word) 121 | and word not in all_names])) 122 | return cleaned_docs 123 | 124 | data_train = fetch_20newsgroups(subset='train', categories=None, random_state=42) 125 | data_test = fetch_20newsgroups(subset='test', categories=None, random_state=42) 126 | 127 | cleaned_train = clean_text(data_train.data) 128 | label_train = data_train.target 129 | cleaned_test = clean_text(data_test.data) 130 | label_test = data_test.target 131 | 132 | tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=40000) 133 | term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train) 134 | term_docs_test = tfidf_vectorizer.transform(cleaned_test) 135 | 136 | # combined with grid search 137 | from sklearn.model_selection import GridSearchCV 138 | parameters = {'penalty': ['l2', None], 139 | 'alpha': [1e-07, 1e-06, 1e-05, 1e-04], 140 | 'eta0': [0.01, 0.1, 1, 10]} 141 | 142 | sgd_lr = SGDClassifier(loss='log', learning_rate='constant', eta0=0.01, fit_intercept=True, n_iter=10) 143 | 144 | grid_search = GridSearchCV(sgd_lr, parameters, n_jobs=-1, cv=3) 145 | 146 | grid_search.fit(term_docs_train, label_train) 147 | print(grid_search.best_params_) 148 | 149 | sgd_lr_best = grid_search.best_estimator_ 150 | accuracy = sgd_lr_best.score(term_docs_test, label_test) 151 | print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100)) 152 | -------------------------------------------------------------------------------- /Chapter07/1stock_price_prediction.py: -------------------------------------------------------------------------------- 1 | import quandl 2 | 3 | 4 | mydata = quandl.get("YAHOO/INDEX_DJI", start_date="2005-12-01", end_date="2005-12-05") 5 | 6 | 7 | 8 | 9 | import pandas as pd 10 | 11 | 12 | authtoken = 'DrxQ6jniVGwDnrDrrb_Y' 13 | 14 | def get_data_quandl(symbol, start_date, 
end_date): 15 | data = quandl.get(symbol, start_date=start_date, end_date=end_date, authtoken=authtoken) 16 | return data 17 | 18 | 19 | def generate_features(df): 20 | """ Generate features for a stock/index based on historical price and performance 21 | Args: 22 | df (dataframe with columns "Open", "Close", "High", "Low", "Volume", "Adjusted Close") 23 | Returns: 24 | dataframe, data set with new features 25 | """ 26 | df_new = pd.DataFrame() 27 | # 6 original features 28 | df_new['open'] = df['Open'] 29 | df_new['open_1'] = df['Open'].shift(1) 30 | df_new['close_1'] = df['Close'].shift(1) 31 | df_new['high_1'] = df['High'].shift(1) 32 | df_new['low_1'] = df['Low'].shift(1) 33 | df_new['volume_1'] = df['Volume'].shift(1) 34 | # 31 original features 35 | # average price 36 | df_new['avg_price_5'] = pd.rolling_mean(df['Close'], window=5).shift(1) 37 | df_new['avg_price_30'] = pd.rolling_mean(df['Close'], window=21).shift(1) 38 | df_new['avg_price_365'] = pd.rolling_mean(df['Close'], window=252).shift(1) 39 | df_new['ratio_avg_price_5_30'] = df_new['avg_price_5'] / df_new['avg_price_30'] 40 | df_new['ratio_avg_price_5_365'] = df_new['avg_price_5'] / df_new['avg_price_365'] 41 | df_new['ratio_avg_price_30_365'] = df_new['avg_price_30'] / df_new['avg_price_365'] 42 | # average volume 43 | df_new['avg_volume_5'] = pd.rolling_mean(df['Volume'], window=5).shift(1) 44 | df_new['avg_volume_30'] = pd.rolling_mean(df['Volume'], window=21).shift(1) 45 | df_new['avg_volume_365'] = pd.rolling_mean(df['Volume'], window=252).shift(1) 46 | df_new['ratio_avg_volume_5_30'] = df_new['avg_volume_5'] / df_new['avg_volume_30'] 47 | df_new['ratio_avg_volume_5_365'] = df_new['avg_volume_5'] / df_new['avg_volume_365'] 48 | df_new['ratio_avg_volume_30_365'] = df_new['avg_volume_30'] / df_new['avg_volume_365'] 49 | # standard deviation of prices 50 | df_new['std_price_5'] = pd.rolling_std(df['Close'], window=5).shift(1) 51 | df_new['std_price_30'] = pd.rolling_std(df['Close'], window=21).shift(1) 52 | df_new['std_price_365'] = pd.rolling_std(df['Close'], window=252).shift(1) 53 | df_new['ratio_std_price_5_30'] = df_new['std_price_5'] / df_new['std_price_30'] 54 | df_new['ratio_std_price_5_365'] = df_new['std_price_5'] / df_new['std_price_365'] 55 | df_new['ratio_std_price_30_365'] = df_new['std_price_30'] / df_new['std_price_365'] 56 | # standard deviation of volumes 57 | df_new['std_volume_5'] = pd.rolling_std(df['Volume'], window=5).shift(1) 58 | df_new['std_volume_30'] = pd.rolling_std(df['Volume'], window=21).shift(1) 59 | df_new['std_volume_365'] = pd.rolling_std(df['Volume'], window=252).shift(1) 60 | df_new['ratio_std_volume_5_30'] = df_new['std_volume_5'] / df_new['std_volume_30'] 61 | df_new['ratio_std_volume_5_365'] = df_new['std_volume_5'] / df_new['std_volume_365'] 62 | df_new['ratio_std_volume_30_365'] = df_new['std_volume_30'] / df_new['std_volume_365'] 63 | # # return 64 | df_new['return_1'] = ((df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1) 65 | df_new['return_5'] = ((df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1) 66 | df_new['return_30'] = ((df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1) 67 | df_new['return_365'] = ((df['Close'] - df['Close'].shift(252)) / df['Close'].shift(252)).shift(1) 68 | df_new['moving_avg_5'] = pd.rolling_mean(df_new['return_1'], window=5) 69 | df_new['moving_avg_30'] = pd.rolling_mean(df_new['return_1'], window=21) 70 | df_new['moving_avg_365'] = pd.rolling_mean(df_new['return_1'], window=252) 71 | # the 
target 72 | df_new['close'] = df['Close'] 73 | df_new = df_new.dropna(axis=0) 74 | return df_new 75 | 76 | 77 | symbol = 'YAHOO/INDEX_DJI' 78 | start = '2001-01-01' 79 | end = '2014-12-31' 80 | data_raw = get_data_quandl(symbol, start, end) 81 | data = generate_features(data_raw) 82 | data.round(decimals=3).head(3) 83 | 84 | 85 | symbol = 'YAHOO/INDEX_DJI' 86 | start = '1988-01-01' 87 | end = '2015-12-31' 88 | data_raw = get_data_quandl(symbol, start, end) 89 | data = generate_features(data_raw) 90 | 91 | # next day prediction 92 | import datetime 93 | start_train = datetime.datetime(1988, 1, 1, 0, 0) 94 | end_train = datetime.datetime(2014, 12, 31, 0, 0) 95 | 96 | data_train = data.ix[start_train:end_train] 97 | X_columns = list(data.drop(['close'], axis=1).columns) 98 | y_column = 'close' 99 | X_train = data_train[X_columns] 100 | y_train = data_train[y_column] 101 | 102 | start_test = datetime.datetime(2015, 1, 1, 0, 0) 103 | end_test = datetime.datetime(2015, 12, 31, 0, 0) 104 | data_test = data.ix[start_test:end_test] 105 | X_test = data_test[X_columns] 106 | y_test = data_test[y_column] 107 | 108 | 109 | from sklearn.model_selection import GridSearchCV 110 | # First experiment with linear regression 111 | 112 | # SGD is very sensitive to data with features at different scales. Hence we need to do feature scaling before training. 113 | from sklearn.preprocessing import StandardScaler 114 | scaler = StandardScaler() 115 | scaler.fit(X_train) 116 | X_scaled_train = scaler.transform(X_train) 117 | X_scaled_test = scaler.transform(X_test) 118 | 119 | param_grid = { 120 | "alpha": [1e-5, 3e-5, 1e-4], 121 | "eta0": [0.01, 0.03, 0.1], 122 | } 123 | 124 | from sklearn.linear_model import SGDRegressor 125 | lr = SGDRegressor(penalty='l2', n_iter=1000) 126 | grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='neg_mean_absolute_error') 127 | grid_search.fit(X_scaled_train, y_train) 128 | 129 | print(grid_search.best_params_) 130 | 131 | lr_best = grid_search.best_estimator_ 132 | # print(grid_search.best_score_) 133 | 134 | predictions_lr = lr_best.predict(X_scaled_test) 135 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 136 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_lr))) 137 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_lr))) 138 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_lr))) 139 | 140 | 141 | 142 | # Next experiment with random forest 143 | 144 | param_grid = { 145 | "max_depth": [30, 50], 146 | "min_samples_split": [5, 10, 20], 147 | 148 | } 149 | 150 | from sklearn.ensemble import RandomForestRegressor 151 | rf = RandomForestRegressor(n_estimators=1000) 152 | grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_absolute_error') 153 | grid_search.fit(X_train, y_train) 154 | 155 | print(grid_search.best_params_) 156 | # print(grid_search.best_score_) 157 | 158 | rf_best = grid_search.best_estimator_ 159 | predictions_rf = rf_best.predict(X_test) 160 | 161 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_rf))) 162 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_rf))) 163 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_rf))) 164 | 165 | 166 | 167 | 168 | # Finally experiment with SVR 169 | param_grid = { 170 | "C": [1000, 3000, 10000], 171 | "epsilon": [0.00001, 0.00003, 0.0001], 172 | } 173 | 174 | from sklearn.svm import SVR 175 | svr = SVR(kernel='linear') 176 | grid_search = GridSearchCV(svr, param_grid, cv=5, 
scoring='neg_mean_absolute_error') 177 | grid_search.fit(X_scaled_train, y_train) 178 | 179 | print(grid_search.best_params_) 180 | 181 | svr_best = grid_search.best_estimator_ 182 | # print grid_search.best_score_ 183 | 184 | predictions_svr = svr_best.predict(X_scaled_test) 185 | 186 | print('MSE: {0:.3f}'.format(mean_squared_error(y_test, predictions_svr))) 187 | print('MAE: {0:.3f}'.format(mean_absolute_error(y_test, predictions_svr))) 188 | print('R^2: {0:.3f}'.format(r2_score(y_test, predictions_svr))) 189 | 190 | 191 | 192 | 193 | import matplotlib.pyplot as plt 194 | 195 | dates = data_test.index.values 196 | plot_truth, = plt.plot(dates, y_test, 'k') 197 | plot_lr, = plt.plot(dates, predictions_lr, 'r') 198 | plot_rf, = plt.plot(dates, predictions_rf, 'b') 199 | plot_svr, = plt.plot(dates, predictions_svr, 'g') 200 | plt.legend([plot_truth, plot_lr, plot_rf, plot_svr], ['Truth', 'Linear regression', 'Random forest', 'SVR']) 201 | plt.title('Stock price prediction vs truth') 202 | plt.show() 203 | -------------------------------------------------------------------------------- /Chapter07/2linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # Gradient descent based linear regression from scratch 5 | def compute_prediction(X, weights): 6 | """ Compute the prediction y_hat based on current weights 7 | Args: 8 | X (numpy.ndarray) 9 | weights (numpy.ndarray) 10 | Returns: 11 | numpy.ndarray, y_hat of X under weights 12 | """ 13 | predictions = np.dot(X, weights) 14 | return predictions 15 | 16 | def update_weights_gd(X_train, y_train, weights, learning_rate): 17 | """ Update weights by one step 18 | Args: 19 | X_train, y_train (numpy.ndarray, training data set) 20 | weights (numpy.ndarray) 21 | learning_rate (float) 22 | Returns: 23 | numpy.ndarray, updated weights 24 | """ 25 | predictions = compute_prediction(X_train, weights) 26 | weights_delta = np.dot(X_train.T, y_train - predictions) 27 | m = y_train.shape[0] 28 | weights += learning_rate / float(m) * weights_delta 29 | return weights 30 | 31 | def compute_cost(X, y, weights): 32 | """ Compute the cost J(w) 33 | Args: 34 | X, y (numpy.ndarray, data set) 35 | weights (numpy.ndarray) 36 | Returns: 37 | float 38 | """ 39 | predictions = compute_prediction(X, weights) 40 | cost = np.mean((predictions - y) ** 2 / 2.0) 41 | return cost 42 | 43 | def train_linear_regression(X_train, y_train, max_iter, learning_rate, fit_intercept=False): 44 | """ Train a linear regression model with gradient descent 45 | Args: 46 | X_train, y_train (numpy.ndarray, training data set) 47 | max_iter (int, number of iterations) 48 | learning_rate (float) 49 | fit_intercept (bool, with an intercept w0 or not) 50 | Returns: 51 | numpy.ndarray, learned weights 52 | """ 53 | if fit_intercept: 54 | intercept = np.ones((X_train.shape[0], 1)) 55 | X_train = np.hstack((intercept, X_train)) 56 | weights = np.zeros(X_train.shape[1]) 57 | for iteration in range(max_iter): 58 | weights = update_weights_gd(X_train, y_train, weights, learning_rate) 59 | # Check the cost for every 100 (for example) iterations 60 | if iteration % 100 == 0: 61 | print(compute_cost(X_train, y_train, weights)) 62 | return weights 63 | 64 | def predict(X, weights): 65 | if X.shape[1] == weights.shape[0] - 1: 66 | intercept = np.ones((X.shape[0], 1)) 67 | X = np.hstack((intercept, X)) 68 | return compute_prediction(X, weights) 69 | 70 | 71 | # A small example 72 | X_train = np.array([[6], [2], [3], [4], [1], 
[5], [2], [6], [4], [7]]) 73 | 74 | y_train = np.array([5.5, 1.6, 2.2, 3.7, 0.8, 5.2, 1.5, 5.3, 4.4, 6.8]) 75 | 76 | weights = train_linear_regression(X_train, y_train, max_iter=100, learning_rate=0.01, fit_intercept=True) 77 | 78 | X_test = np.array([[1.3], [3.5], [5.2], [2.8]]) 79 | 80 | predictions = predict(X_test, weights) 81 | 82 | import matplotlib.pyplot as plt 83 | plt.scatter(X_train[:, 0], y_train, marker='o', c='b') 84 | plt.scatter(X_test[:, 0], predictions, marker='*', c='k') 85 | plt.xlabel('x') 86 | plt.ylabel('y') 87 | plt.show() 88 | 89 | 90 | # The diabetes example 91 | from sklearn import datasets 92 | diabetes = datasets.load_diabetes() 93 | print(diabetes.data.shape) 94 | 95 | num_test = 30 # the last 30 samples as testing set 96 | X_train = diabetes.data[:-num_test, :] 97 | y_train = diabetes.target[:-num_test] 98 | 99 | weights = train_linear_regression(X_train, y_train, max_iter=5000, learning_rate=1, fit_intercept=True) 100 | 101 | X_test = diabetes.data[-num_test:, :] 102 | y_test = diabetes.target[-num_test:] 103 | 104 | predictions = predict(X_test, weights) 105 | 106 | print(predictions) 107 | print(y_test) 108 | 109 | 110 | 111 | 112 | 113 | 114 | # Directly use SGDRegressor from scikit-learn 115 | from sklearn.linear_model import SGDRegressor 116 | regressor = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, learning_rate='constant', eta0=0.01, n_iter=1000) 117 | regressor.fit(X_train, y_train) 118 | predictions = regressor.predict(X_test) 119 | print(predictions) 120 | print(regressor.score(X_test, y_test)) 121 | 122 | 123 | # Measuring model performance after hyperparameter tuning with grid search 124 | diabetes = datasets.load_diabetes() 125 | num_test = 30 # the last 30 samples as testing set 126 | X_train = diabetes.data[:-num_test, :] 127 | y_train = diabetes.target[:-num_test] 128 | X_test = diabetes.data[-num_test:, :] 129 | y_test = diabetes.target[-num_test:] 130 | 131 | param_grid = { 132 | "alpha": [1e-07, 1e-06, 1e-05], 133 | "penalty": [None, "l2"], 134 | "eta0": [0.001, 0.005, 0.01], 135 | "n_iter": [300, 1000, 3000] 136 | } 137 | 138 | from sklearn.model_selection import GridSearchCV 139 | regressor = SGDRegressor(loss='squared_loss', learning_rate='constant') 140 | grid_search = GridSearchCV(regressor, param_grid, cv=3) 141 | grid_search.fit(X_train, y_train) 142 | 143 | print(grid_search.best_params_) 144 | regressor_best = grid_search.best_estimator_ 145 | # regressor_best.score(X_test, y_test) 146 | 147 | predictions = regressor_best.predict(X_test) 148 | 149 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 150 | mean_squared_error(y_test, predictions) 151 | mean_absolute_error(y_test, predictions) 152 | r2_score(y_test, predictions) 153 | 154 | -------------------------------------------------------------------------------- /Chapter07/3decision_tree_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # Mean squared error calculation function given continuous targets of a data set, 5 | def mse(targets): 6 | # When the set is empty 7 | if targets.size == 0: 8 | return 0 9 | return np.var(targets) 10 | 11 | def weighted_mse(groups): 12 | """ Calculate weighted MSE of children after a split 13 | Args: 14 | groups (list of children, and a child consists a list of targets) 15 | Returns: 16 | float, weighted impurity 17 | """ 18 | total = sum(len(group) for group in groups) 19 | weighted_sum = 0.0 20 | for group in groups: 
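        # Weight each child's MSE by the fraction of samples that fall into that child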
21 | weighted_sum += len(group) / float(total) * mse(group) 22 | return weighted_sum 23 | 24 | print('{0:.4f}'.format(mse(np.array([1, 2, 3])))) 25 | print('{0:.4f}'.format(weighted_mse([np.array([1, 2, 3]), np.array([1, 2])]))) 26 | 27 | print('type-semi: {0:.4f}'.format(weighted_mse([np.array([600, 400, 700]), np.array([700, 800])]))) 28 | print('bedroom-2: {0:.4f}'.format(weighted_mse([np.array([700, 400]), np.array([600, 800, 700])]))) 29 | print('bedroom-3: {0:.4f}'.format(weighted_mse([np.array([600, 800]), np.array([700, 400, 700])]))) 30 | print('bedroom-4: {0:.4f}'.format(weighted_mse([np.array([700]), np.array([600, 700, 800, 400])]))) 31 | 32 | 33 | print('bedroom-2: {0:.4f}'.format(weighted_mse([np.array([]), np.array([600, 400, 700])]))) 34 | print('bedroom-3: {0:.4f}'.format(weighted_mse([np.array([400]), np.array([600, 700])]))) 35 | print('bedroom-4: {0:.4f}'.format(weighted_mse([np.array([400, 600]), np.array([700])]))) 36 | 37 | 38 | 39 | 40 | def split_node(X, y, index, value): 41 | """ Split data set X, y based on a feature and a value 42 | Args: 43 | X, y (numpy.ndarray, data set) 44 | index (int, index of the feature used for splitting) 45 | value (value of the feature used for splitting) 46 | Returns: 47 | list, list: left and right child, a child is in the format of [X, y] 48 | """ 49 | x_index = X[:, index] 50 | # if this feature is numerical 51 | if type(X[0, index]) in [int, float]: 52 | mask = x_index >= value 53 | # if this feature is categorical 54 | else: 55 | mask = x_index == value 56 | # split into left and right child 57 | left = [X[~mask, :], y[~mask]] 58 | right = [X[mask, :], y[mask]] 59 | return left, right 60 | 61 | 62 | def get_best_split(X, y): 63 | """ Obtain the best splitting point and resulting children for the data set X, y 64 | Args: 65 | X, y (numpy.ndarray, data set) 66 | criterion (gini or entropy) 67 | Returns: 68 | dict {index: index of the feature, value: feature value, children: left and right children} 69 | """ 70 | best_index, best_value, best_score, children = None, None, 1e10, None 71 | for index in range(len(X[0])): 72 | for value in np.sort(np.unique(X[:, index])): 73 | groups = split_node(X, y, index, value) 74 | impurity = weighted_mse([groups[0][1], groups[1][1]]) 75 | if impurity < best_score: 76 | best_index, best_value, best_score, children = index, value, impurity, groups 77 | return {'index': best_index, 'value': best_value, 'children': children} 78 | 79 | 80 | 81 | def get_leaf(targets): 82 | # Obtain the leaf as the mean of the targets 83 | return np.mean(targets) 84 | 85 | 86 | 87 | def split(node, max_depth, min_size, depth): 88 | """ Split children of a node to construct new nodes or assign them terminals 89 | Args: 90 | node (dict, with children info) 91 | max_depth (int, maximal depth of the tree) 92 | min_size (int, minimal samples required to further split a child) 93 | depth (int, current depth of the node) 94 | """ 95 | left, right = node['children'] 96 | del (node['children']) 97 | if left[1].size == 0: 98 | node['right'] = get_leaf(right[1]) 99 | return 100 | if right[1].size == 0: 101 | node['left'] = get_leaf(left[1]) 102 | return 103 | # Check if the current depth exceeds the maximal depth 104 | if depth >= max_depth: 105 | node['left'], node['right'] = get_leaf(left[1]), get_leaf(right[1]) 106 | return 107 | # Check if the left child has enough samples 108 | if left[1].size <= min_size: 109 | node['left'] = get_leaf(left[1]) 110 | else: 111 | # It has enough samples, we further split it 112 | result = 
get_best_split(left[0], left[1]) 113 | result_left, result_right = result['children'] 114 | if result_left[1].size == 0: 115 | node['left'] = get_leaf(result_right[1]) 116 | elif result_right[1].size == 0: 117 | node['left'] = get_leaf(result_left[1]) 118 | else: 119 | node['left'] = result 120 | split(node['left'], max_depth, min_size, depth + 1) 121 | # Check if the right child has enough samples 122 | if right[1].size <= min_size: 123 | node['right'] = get_leaf(right[1]) 124 | else: 125 | # It has enough samples, we further split it 126 | result = get_best_split(right[0], right[1]) 127 | result_left, result_right = result['children'] 128 | if result_left[1].size == 0: 129 | node['right'] = get_leaf(result_right[1]) 130 | elif result_right[1].size == 0: 131 | node['right'] = get_leaf(result_left[1]) 132 | else: 133 | node['right'] = result 134 | split(node['right'], max_depth, min_size, depth + 1) 135 | 136 | 137 | def train_tree(X_train, y_train, max_depth, min_size): 138 | """ Construction of a tree starts here 139 | Args: 140 | X_train, y_train (list, list, training data) 141 | max_depth (int, maximal depth of the tree) 142 | min_size (int, minimal samples required to further split a child) 143 | """ 144 | root = get_best_split(X_train, y_train) 145 | split(root, max_depth, min_size, 1) 146 | return root 147 | 148 | 149 | 150 | CONDITION = {'numerical': {'yes': '>=', 'no': '<'}, 151 | 'categorical': {'yes': 'is', 'no': 'is not'}} 152 | def visualize_tree(node, depth=0): 153 | if isinstance(node, dict): 154 | if type(node['value']) in [int, float]: 155 | condition = CONDITION['numerical'] 156 | else: 157 | condition = CONDITION['categorical'] 158 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['no'], node['value'])) 159 | if 'left' in node: 160 | visualize_tree(node['left'], depth + 1) 161 | print('{}|- X{} {} {}'.format(depth * ' ', node['index'] + 1, condition['yes'], node['value'])) 162 | if 'right' in node: 163 | visualize_tree(node['right'], depth + 1) 164 | else: 165 | print('{}[{}]'.format(depth * ' ', node)) 166 | 167 | 168 | X_train = np.array([['semi', 3], 169 | ['detached', 2], 170 | ['detached', 3], 171 | ['semi', 2], 172 | ['semi', 4]], dtype=object) 173 | 174 | y_train = np.array([600, 700, 800, 400, 700]) 175 | 176 | tree = train_tree(X_train, y_train, 2, 2) 177 | visualize_tree(tree) 178 | 179 | 180 | 181 | # Directly use DecisionTreeRegressor from scikit-learn 182 | from sklearn import datasets 183 | boston = datasets.load_boston() 184 | 185 | num_test = 10 # the last 10 samples as testing set 186 | X_train = boston.data[:-num_test, :] 187 | y_train = boston.target[:-num_test] 188 | X_test = boston.data[-num_test:, :] 189 | y_test = boston.target[-num_test:] 190 | 191 | from sklearn.tree import DecisionTreeRegressor 192 | regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=3) 193 | 194 | regressor.fit(X_train, y_train) 195 | predictions = regressor.predict(X_test) 196 | print(predictions) 197 | print(y_test) 198 | 199 | 200 | from sklearn.ensemble import RandomForestRegressor 201 | regressor = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=3) 202 | regressor.fit(X_train, y_train) 203 | predictions = regressor.predict(X_test) 204 | print(predictions) 205 | print(y_test) 206 | 207 | -------------------------------------------------------------------------------- /Chapter07/4support_vector_regression.py: -------------------------------------------------------------------------------- 1 | from sklearn import 
datasets 2 | boston = datasets.load_boston() 3 | 4 | num_test = 10 # the last 10 samples as testing set 5 | X_train = boston.data[:-num_test, :] 6 | y_train = boston.target[:-num_test] 7 | X_test = boston.data[-num_test:, :] 8 | y_test = boston.target[-num_test:] 9 | 10 | from sklearn.svm import SVR 11 | regressor = SVR(C=0.1, epsilon=0.02, kernel='linear') 12 | 13 | regressor.fit(X_train, y_train) 14 | predictions = regressor.predict(X_test) 15 | print(predictions) 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /Chapter08/1imputation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import Imputer 3 | 4 | # Represent the unknown value by np.nan in numpy 5 | data_origin = [[30, 100], 6 | [20, 50], 7 | [35, np.nan], 8 | [25, 80], 9 | [30, 70], 10 | [40, 60]] 11 | 12 | # Imputation with the mean value 13 | imp_mean = Imputer(missing_values='NaN', strategy='mean') 14 | imp_mean.fit(data_origin) 15 | data_mean_imp = imp_mean.transform(data_origin) 16 | print(data_mean_imp) 17 | 18 | # Imputation with the median value 19 | imp_median = Imputer(missing_values='NaN', strategy='median') 20 | imp_median.fit(data_origin) 21 | data_median_imp = imp_median.transform(data_origin) 22 | print(data_median_imp) 23 | 24 | # New samples 25 | new = [[20, np.nan], 26 | [30, np.nan], 27 | [np.nan, 70], 28 | [np.nan, np.nan]] 29 | new_mean_imp = imp_mean.transform(new) 30 | print(new_mean_imp) 31 | 32 | 33 | 34 | # Effects of discarding missing values and imputation 35 | from sklearn import datasets 36 | dataset = datasets.load_diabetes() 37 | X_full, y = dataset.data, dataset.target 38 | 39 | 40 | # Simulate a corrupted data set by adding 25% missing values 41 | m, n = X_full.shape 42 | m_missing = int(m * 0.25) 43 | print(m, m_missing) 44 | 45 | # Randomly select m_missing samples 46 | np.random.seed(42) 47 | missing_samples = np.array([True] * m_missing + [False] * (m - m_missing)) 48 | np.random.shuffle(missing_samples) 49 | 50 | # For each missing sample, randomly select 1 out of n features 51 | missing_features = np.random.randint(low=0, high=n, size=m_missing) 52 | # Represent missing values by nan 53 | X_missing = X_full.copy() 54 | X_missing[np.where(missing_samples)[0], missing_features] = np.nan 55 | 56 | 57 | # Discard samples containing missing values 58 | X_rm_missing = X_missing[~missing_samples, :] 59 | y_rm_missing = y[~missing_samples] 60 | 61 | # Estimate R^2 on the data set with missing samples removed 62 | from sklearn.ensemble import RandomForestRegressor 63 | from sklearn.model_selection import cross_val_score 64 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 65 | score_rm_missing = cross_val_score(regressor, X_rm_missing, y_rm_missing).mean() 66 | print('Score with the data set with missing samples removed: {0:.2f}'.format(score_rm_missing)) 67 | 68 | 69 | # Imputation with mean value 70 | imp_mean = Imputer(missing_values='NaN', strategy='mean') 71 | X_mean_imp = imp_mean.fit_transform(X_missing) 72 | # Estimate R^2 on the data set with missing samples removed 73 | regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 74 | score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean() 75 | print('Score with the data set with missing values replaced by mean: {0:.2f}'.format(score_mean_imp)) 76 | 77 | 78 | # Estimate R^2 on the full data set 79 | regressor = 
RandomForestRegressor(random_state=42, max_depth=10, n_estimators=500) 80 | score_full = cross_val_score(regressor, X_full, y).mean() 81 | print('Score with the full data set: {0:.2f}'.format(score_full)) 82 | 83 | 84 | # # Imputation with median value 85 | # imp_mean = Imputer(missing_values='NaN', strategy='median') 86 | # X_mean_imp = imp_mean.fit_transform(X_missing) 87 | # # Estimate R^2 on the data set with missing samples removed 88 | # regressor = RandomForestRegressor(random_state=42, max_depth=10, n_estimators=100) 89 | # score_mean_imp = cross_val_score(regressor, X_mean_imp, y).mean() 90 | # print('Score with the data set with missing values replaced by mean: {0:.2f}'.format(score_mean_imp)) 91 | 92 | -------------------------------------------------------------------------------- /Chapter08/2feature_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | dataset = load_digits() 4 | X, y = dataset.data, dataset.target 5 | print(X.shape) 6 | 7 | # Estimate accuracy on the original data set 8 | from sklearn.svm import SVC 9 | from sklearn.model_selection import cross_val_score 10 | classifier = SVC(gamma=0.005) 11 | score = cross_val_score(classifier, X, y).mean() 12 | print('Score with the original data set: {0:.2f}'.format(score)) 13 | 14 | 15 | # Feature selection with random forest 16 | from sklearn.ensemble import RandomForestClassifier 17 | random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1) 18 | random_forest.fit(X, y) 19 | 20 | # Sort features based on their importancies 21 | feature_sorted = np.argsort(random_forest.feature_importances_) 22 | 23 | # Select different number of top features 24 | K = [10, 15, 25, 35, 45] 25 | for k in K: 26 | top_K_features = feature_sorted[-k:] 27 | X_k_selected = X[:, top_K_features] 28 | # Estimate accuracy on the data set with k selected features 29 | classifier = SVC(gamma=0.005) 30 | score_k_features = cross_val_score(classifier, X_k_selected, y).mean() 31 | print('Score with the data set of top {0} features: {1:.2f}'.format(k, score_k_features)) 32 | 33 | -------------------------------------------------------------------------------- /Chapter08/3dimensionality_reduction.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_digits 2 | dataset = load_digits() 3 | X, y = dataset.data, dataset.target 4 | 5 | from sklearn.svm import SVC 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | 10 | from sklearn.decomposition import PCA 11 | 12 | # Keep different number of top components 13 | N = [10, 15, 25, 35, 45] 14 | for n in N: 15 | pca = PCA(n_components=n) 16 | X_n_kept = pca.fit_transform(X) 17 | # Estimate accuracy on the data set with top n components 18 | classifier = SVC(gamma=0.005) 19 | score_n_components = cross_val_score(classifier, X_n_kept, y).mean() 20 | print('Score with the data set of top {0} components: {1:.2f}'.format(n, score_n_components)) -------------------------------------------------------------------------------- /Chapter08/4generic_feature_engineering.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import Binarizer 2 | 3 | X = [[4], [1], [3], [0]] 4 | binarizer = Binarizer(threshold=2.9) 5 | X_new = binarizer.fit_transform(X) 6 | print(X_new) 7 | 8 | 9 | 10 | 11 | from sklearn.preprocessing import PolynomialFeatures 12 
| 13 | X = [[2, 4], 14 | [1, 3], 15 | [3, 2], 16 | [0, 3]] 17 | poly = PolynomialFeatures(degree=2) 18 | X_new = poly.fit_transform(X) 19 | print(X_new) 20 | -------------------------------------------------------------------------------- /Chapter08/5save_reuse_monitor_model.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | dataset = datasets.load_diabetes() 3 | X, y = dataset.data, dataset.target 4 | 5 | num_new = 30 # the last 30 samples as new data set 6 | X_train = X[:-num_new, :] 7 | y_train = y[:-num_new] 8 | X_new = X[-num_new:, :] 9 | y_new = y[-num_new:] 10 | 11 | 12 | # Data pre-processing 13 | from sklearn.preprocessing import StandardScaler 14 | scaler = StandardScaler() 15 | scaler.fit(X_train) 16 | 17 | import pickle 18 | # Save the scaler 19 | pickle.dump(scaler, open("scaler.p", "wb" )) 20 | 21 | X_scaled_train = scaler.transform(X_train) 22 | 23 | 24 | # Regression model training 25 | from sklearn.svm import SVR 26 | regressor = SVR(C=20) 27 | regressor.fit(X_scaled_train, y_train) 28 | # Save the regressor 29 | pickle.dump(regressor, open("regressor.p", "wb")) 30 | 31 | 32 | # Deployment 33 | my_scaler = pickle.load(open("scaler.p", "rb" )) 34 | my_regressor = pickle.load(open("regressor.p", "rb")) 35 | 36 | X_scaled_new = my_scaler.transform(X_new) 37 | predictions = my_regressor.predict(X_scaled_new) 38 | 39 | 40 | # Monitor 41 | from sklearn.metrics import r2_score 42 | print('Health check on the model, R^2: {0:.3f}'.format(r2_score(y_new, predictions))) 43 | -------------------------------------------------------------------------------- /Chapter08/regressor.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Machine-Learning-By-Example/6ee2be561e511bd0a1c0b3d481ad3950ea3f1815/Chapter08/regressor.p -------------------------------------------------------------------------------- /Chapter08/scaler.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Machine-Learning-By-Example/6ee2be561e511bd0a1c0b3d481ad3950ea3f1815/Chapter08/scaler.p -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## $5 Tech Unlocked 2021! 5 | [Buy and download this product for only $5 on PacktPub.com](https://www.packtpub.com/) 6 | ----- 7 | *The $5 campaign runs from __December 15th 2020__ to __January 13th 2021.__* 8 | 9 | # Python Machine Learning By Example 10 | This is the code repository for [Python Machine Learning By Example](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-example?utm_source=github&utm_medium=repository&utm_campaign=9781783553112), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 11 | ## About the Book 12 | This book starts with an introduction to machine learning and the Python language and shows you how to complete the setup. Moving ahead, you will learn all the important concepts, such as exploratory data analysis, data preprocessing, feature extraction, data visualization and clustering, classification, regression and model performance evaluation. With the help of the various projects included, you will find it intriguing to acquire the mechanics of several important machine learning algorithms – they are no longer as obscure as they might seem. You will also be guided step by step to build your own models from scratch. Toward the end, you will gather a broad picture of the machine learning ecosystem and best practices for applying machine learning techniques. 13 | 14 | 15 | ## Instructions and Navigation 16 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 17 | 18 | Chapter 1 is introductory and does not contain any code. 19 | All other chapters contain code. 20 | Some data files can be found in the folders; others can be downloaded from the links provided in the chapters. 21 | 22 | The code will look like the following: 23 | ``` 24 | >>> from nltk.corpus import names 25 | >>> from nltk.stem import WordNetLemmatizer 26 | >>> def letters_only(astr): 27 | ``` 28 | 29 | The following are required for you to utilize this book: 30 | scikit-learn 0.18.0 31 | Numpy 1.1 32 | Matplotlib 1.5.1 33 | NLTK 3.2.2 34 | pandas 0.19.2 35 | GraphViz 36 | Quandl Python API 37 | You can use a 64-bit architecture, 2GHz CPU, and 8GB RAM to perform all the steps in this book. You will require at least 8GB of hard disk space.
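To quickly confirm that your environment matches the versions listed above, you can print the installed versions of the main Python packages. This is a minimal, optional check and is not part of the book's code; GraphViz is a system tool rather than a Python package, and the Quandl API exposes its version differently, so both are left out here:
```
>>> import sklearn, numpy, matplotlib, nltk, pandas
>>> for package in (sklearn, numpy, matplotlib, nltk, pandas):
...     print(package.__name__, package.__version__)
```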
38 | 39 | ## Related Products 40 | * [Python Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781783555130) 41 | 42 | * [Python Machine Learning Blueprints: Intuitive data projects you can relate to](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning-blueprints-intuitive-data-projects-you-ca?utm_source=github&utm_medium=repository&utm_campaign=9781784394752) 43 | 44 | * [Learning Predictive Analytics with Python](https://www.packtpub.com/big-data-and-business-intelligence/learning-predictive-analytics-python?utm_source=github&utm_medium=repository&utm_campaign=9781783983261) 45 | 46 | ### Download a free PDF 47 | 48 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
49 |

https://packt.link/free-ebook/9781783553112

--------------------------------------------------------------------------------