├── .gitattributes
├── README.md
├── .gitignore
├── inspect_LSA.py
├── runClassification_LSA.py
└── getReutersTextArticles.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This is a simple text classification example using Latent Semantic Analysis (LSA), written in Python and using the scikit-learn library.

This code goes along with an LSA tutorial blog post I wrote [here](http://mccormickml.com/2016/03/25/lsa-for-text-classification-tutorial/).

Steps:

1. [Optional]: Run `getReutersTextArticles.py` to download the Reuters dataset and extract the raw text. This step has already been performed for you, and the dataset is stored in the 'data' folder.
2. Run `runClassification_LSA.py` to apply LSA to the dataset and then test classification accuracy.
3. Run `inspect_LSA.py` to gain some insight into what LSA is doing.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

# OSX
# =========================

.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear on external disk
.Spotlight-V100
.Trashes

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

--------------------------------------------------------------------------------
/inspect_LSA.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Perform some analysis on the top components of SVD.

This script takes articles from the Reuters classification dataset, then
applies LSA to them to create compact feature vectors.

We look at some properties of these vectors and the SVD matrix in order to gain
some insight into how they work.

@author: Chris McCormick
"""

import pickle
import time
import numpy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from pylab import *

import random

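# This script expects the pickled Reuters text at data/raw_text_dataset.pickle,
# which ships with the repo and can be regenerated with getReutersTextArticles.py.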
###############################################################################
# Load the raw text dataset.
###############################################################################

print("Loading dataset...")

# The raw text dataset is stored as a tuple in the form:
# (X_train_raw, y_train_raw, X_test_raw, y_test)
# The 'filtered' dataset excludes any articles that we failed to retrieve
# fingerprints for.
raw_text_dataset = pickle.load( open( "data/raw_text_dataset.pickle", "rb" ) )
X_train_raw = raw_text_dataset[0]

print(" %d training examples" % (len(X_train_raw)))

###############################################################################
# Use LSA to vectorize the articles.
###############################################################################

# Tfidf vectorizer:
# - Strips out "stop words"
# - Filters out terms that occur in more than half of the docs (max_df=0.5)
# - Filters out terms that occur in only one document (min_df=2).
# - Selects the 10,000 most frequently occurring words in the corpus.
# - Normalizes the vector (L2 norm of 1.0) to normalize the effect of
#   document length on the tf-idf values.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

# Build the tfidf vectorizer from the training data ("fit"), and apply it
# ("transform").
X_train_tfidf = vectorizer.fit_transform(X_train_raw)

print(" Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])

# Get the words that correspond to each of the features.
feat_names = vectorizer.get_feature_names()

# Print ten random terms from the vocabulary.
print("Some random words in the vocabulary:")
for i in range(0, 10):
    featNum = random.randint(0, len(feat_names) - 1)
    print(" %s" % feat_names[featNum])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

# Project the tfidf vectors onto the first N principal components.
# Though this is significantly fewer features than the original tfidf vector,
# they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

# The SVD matrix will have one row per component, and one column per feature
# of the original data.

#for compNum in range(0, 100, 10):
for compNum in range(0, 10):

    comp = svd.components_[compNum]

    # Sort the weights in this component and get the indices.
    indices = numpy.argsort(comp).tolist()

    # Reverse the indices, so we have the largest weights first.
    indices.reverse()

    # Grab the top 10 terms which have the highest weight in this component.
    terms = [feat_names[weightIndex] for weightIndex in indices[0:10]]
    weights = [comp[weightIndex] for weightIndex in indices[0:10]]

    # Display these terms and their weights as a horizontal bar graph.
    # The horizontal bar graph displays the first item on the bottom; reverse
    # the order of the terms so the biggest one is on top.
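    # (Only the largest positive weights are shown here; a component can also
    # have strongly negative term weights, which are just as informative, since
    # the overall sign of a component is arbitrary.)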
    terms.reverse()
    weights.reverse()
    positions = arange(10) + .5    # the bar centers on the y axis

    figure(compNum)
    barh(positions, weights, align='center')
    yticks(positions, terms)
    xlabel('Weight')
    title('Strongest terms for component %d' % (compNum))
    grid(True)
    show()


--------------------------------------------------------------------------------
/runClassification_LSA.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Run k-NN classification on the Reuters text dataset using LSA.

This script leverages modules in scikit-learn for performing tf-idf and SVD.

Classification is performed using k-NN with k=5 (majority wins).

The script measures the accuracy of plain tf-idf as a baseline, then applies
LSA to show the improvement.

@author: Chris McCormick
"""

import pickle
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier


###############################################################################
# Load the raw text dataset.
###############################################################################

print("Loading dataset...")

# The raw text dataset is stored as a tuple in the form:
# (X_train_raw, y_train_raw, X_test_raw, y_test)
# The 'filtered' dataset excludes any articles that we failed to retrieve
# fingerprints for.
raw_text_dataset = pickle.load( open( "data/raw_text_dataset.pickle", "rb" ) )
X_train_raw = raw_text_dataset[0]
y_train_labels = raw_text_dataset[1]
X_test_raw = raw_text_dataset[2]
y_test_labels = raw_text_dataset[3]

# The Reuters dataset consists of ~100 categories. However, we are going to
# simplify this to a binary classification problem. The 'positive class' will
# be the articles related to "acquisitions" (or "acq" in the dataset). All
# other articles will be negative.
y_train = ["acq" in y for y in y_train_labels]
y_test = ["acq" in y for y in y_test_labels]

print(" %d training examples (%d positive)" % (len(y_train), sum(y_train)))
print(" %d test examples (%d positive)" % (len(y_test), sum(y_test)))


###############################################################################
# Use LSA to vectorize the articles.
###############################################################################

# Tfidf vectorizer:
# - Strips out "stop words"
# - Filters out terms that occur in more than half of the docs (max_df=0.5)
# - Filters out terms that occur in only one document (min_df=2).
# - Selects the 10,000 most frequently occurring words in the corpus.
# - Normalizes the vector (L2 norm of 1.0) to normalize the effect of
#   document length on the tf-idf values.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

# Build the tfidf vectorizer from the training data ("fit"), and apply it
# ("transform").
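# fit_transform returns a sparse matrix with one row per training article and
# one column per vocabulary term (at most 10,000 columns here).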
X_train_tfidf = vectorizer.fit_transform(X_train_raw)

print(" Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

# Project the tfidf vectors onto the first N principal components.
# Though this is significantly fewer features than the original tfidf vector,
# they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print(" done in %.3fsec" % (time.time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print(" Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))


# Now apply the transformations to the test data as well.
X_test_tfidf = vectorizer.transform(X_test_raw)
X_test_lsa = lsa.transform(X_test_tfidf)


###############################################################################
# Run classification of the test articles.
###############################################################################

print("\nClassifying tfidf vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance,
# and brute-force calculation of distances.
knn_tfidf = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_tfidf.fit(X_train_tfidf, y_train)

# Classify the test vectors.
p = knn_tfidf.predict(X_test_tfidf)

# Measure accuracy.
numRight = 0
for i in range(0, len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print(" (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds).
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)


print("\nClassifying LSA vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance,
# and brute-force calculation of distances.
knn_lsa = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_lsa.fit(X_train_lsa, y_train)

# Classify the test vectors.
p = knn_lsa.predict(X_test_lsa)

# Measure accuracy.
numRight = 0
for i in range(0, len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print(" (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds).
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)


--------------------------------------------------------------------------------
/getReutersTextArticles.py:
--------------------------------------------------------------------------------
"""
======================================================
Create raw text dataset from Reuters
======================================================

This script uses the code from the scikit-learn example
plot_out_of_core_classification.py for retrieving the Reuters dataset.
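The parsed article text and topic labels are saved to data/raw_text_dataset.pickle
so that inspect_LSA.py and runClassification_LSA.py can load them directly.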

The dataset used in this example is Reuters-21578 as provided by the UCI ML
repository. It will be automatically downloaded and uncompressed on first run.
"""

# Authors: Eustache Diemert
#          @FedericoV
# License: BSD 3 clause

from __future__ import print_function

from glob import glob
import itertools
import os.path
import re
import tarfile

import numpy as np

from sklearn.externals.six.moves import html_parser
from sklearn.externals.six.moves import urllib
from sklearn.datasets import get_data_home

import pickle


###############################################################################
# Reuters Dataset related routines
###############################################################################

def _not_in_sphinx():
    # Hack to detect whether we are being run by the sphinx builder
    return '__file__' in globals()

class ReutersParser(html_parser.HTMLParser):
    """Utility class to parse an SGML file and yield documents one at a time."""

    def __init__(self, encoding='latin-1'):
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def handle_starttag(self, tag, attrs):
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data

    def start_reuters(self, attributes):
        pass

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': self.title,
                          'body': self.body,
                          'topics': self.topics})
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        self.in_topic_d = 0
        self.topics.append(self.topic_d)
        self.topic_d = ""


def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
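    For example, a single yielded document might look like this (the values
    here are illustrative only):
    {'title': 'ACME TO BUY WIDGET CORP', 'body': '...', 'topics': ['acq']}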

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc

def get_minibatch(doc_iter, size, pos_class=None):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.
    Note: pos_class is not used; the raw topic lists are returned as y.

    """
    data = [(u'{title}\n\n{body}'.format(**doc), doc['topics'])
            for doc in itertools.islice(doc_iter, size)
            if doc['topics']]

    # If there's no data, just return empty lists.
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int).tolist()

    # Otherwise, retrieve the articles and class labels. zip(*data) splits the
    # (text, topics) pairs into two parallel tuples.
    X_text, y = zip(*data)

    # Convert X_text and y from tuples to lists.
    X_text = list(X_text)
    y = list(y)

    # Convert the class labels to a list.
    #y = np.asarray(y, dtype=int).tolist()

    # For some reason, some of these articles are just whitespace. Look for
    # these and remove them.
    toRemove = []
    docNum = 0

    # For each article...
    for article in X_text:
        # If the article is just whitespace, or is empty, we'll remove it.
        if article.isspace() or (article == ""):
            toRemove.append(docNum)

        docNum += 1

    # Remove the empty articles. Do this in reverse order so as not to corrupt
    # the indices as we go.
    toRemove.reverse()
    for i in toRemove:
        del X_text[i]
        del y[i]

    return X_text, y



def iter_minibatches(doc_iter, minibatch_size):
    """Generator of minibatches."""
    X_text, y = get_minibatch(doc_iter, minibatch_size)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size)

###############################################################################
# Main
###############################################################################

# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

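# Note: the first time this stream is consumed (by get_minibatch below), the
# Reuters-21578 archive is downloaded from the UCI ML repository and unpacked
# into scikit-learn's data home directory.
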
# The Reuters dataset includes many different classes, but we're just going to
# do binary classification. We'll use 'acq' (articles related to
# "acquisitions"--one of the most prevalent classes in the dataset) as the
# positive class, and all other article topics will be used as negative
# examples.
positive_class = 'acq'

# Retrieve a set of examples from the dataset to use as the training set, then
# another set of examples to use as the test set. The actual number will
# be smaller because it will exclude "invalid docs with no topics assigned".
X_train_raw, y_train_raw = get_minibatch(data_stream, 5000, positive_class)
X_test_raw, y_test_raw = get_minibatch(data_stream, 5000, positive_class)

print("Train set is %d documents" % (len(y_train_raw)))
print("Test set is %d documents" % (len(y_test_raw)))

# Dump the dataset to a pickle file.
pickle.dump((X_train_raw, y_train_raw, X_test_raw, y_test_raw), open("data/raw_text_dataset.pickle", "wb"))

--------------------------------------------------------------------------------
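The scripts above fit and evaluate only on the pickled Reuters articles. As a
quick illustration of how the same tf-idf, LSA, and k-NN settings from
runClassification_LSA.py could be used to label a single unseen article, here
is a minimal sketch. It is not part of the original repository: it assumes
data/raw_text_dataset.pickle is present, and the example article text is made up.

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

# Load the pickled Reuters text produced by getReutersTextArticles.py.
X_train_raw, y_train_labels, _, _ = pickle.load(open("data/raw_text_dataset.pickle", "rb"))
y_train = ["acq" in y for y in y_train_labels]

# Fit the same tf-idf / LSA / k-NN stack used in runClassification_LSA.py.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2,
                             stop_words='english', use_idf=True)
lsa = make_pipeline(TruncatedSVD(100), Normalizer(copy=False))
knn = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')

X_train_lsa = lsa.fit_transform(vectorizer.fit_transform(X_train_raw))
knn.fit(X_train_lsa, y_train)

# Classify one made-up article by projecting it into the same LSA space.
new_article = ["The company said it has agreed to acquire its smaller rival "
               "in an all-cash takeover deal."]
new_lsa = lsa.transform(vectorizer.transform(new_article))
print("Related to acquisitions?", bool(knn.predict(new_lsa)[0]))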