├── .gitattributes
├── README.md
├── .gitignore
├── inspect_LSA.py
├── runClassification_LSA.py
└── getReutersTextArticles.py

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This is a simple text classification example using Latent Semantic Analysis (LSA), written in Python and using the scikit-learn library.

This code goes along with an LSA tutorial blog post I wrote [here](http://mccormickml.com/2016/03/25/lsa-for-text-classification-tutorial/).

Steps:

1. [Optional]: Run `getReutersTextArticles.py` to download the Reuters dataset and extract the raw text. This step has already been performed for you, and the dataset is stored in the 'data' folder.
2. Run `runClassification_LSA.py` to apply LSA to the dataset and then test classification accuracy.
3. Run `inspect_LSA.py` to gain some insight into what LSA is doing.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

# OSX
# =========================

.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear on external disk
.Spotlight-V100
.Trashes

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

--------------------------------------------------------------------------------
/inspect_LSA.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Perform some analysis on the top components of SVD.

This script takes articles from the Reuters classification dataset, then
applies LSA to them to create compact feature vectors.

We look at some properties of these vectors and the SVD matrix in order to gain
some insight into how they work.

@author: Chris McCormick
"""

import pickle
import time
import numpy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from pylab import *

import random

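# This script expects the pickled Reuters text at data/raw_text_dataset.pickle,
# which ships with the repo and can be regenerated with getReutersTextArticles.py.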
###############################################################################
# Load the raw text dataset.
###############################################################################

print("Loading dataset...")

# The raw text dataset is stored as a tuple in the form:
# (X_train_raw, y_train_raw, X_test_raw, y_test)
# The 'filtered' dataset excludes any articles that we failed to retrieve
# fingerprints for.
raw_text_dataset = pickle.load( open( "data/raw_text_dataset.pickle", "rb" ) )
X_train_raw = raw_text_dataset[0]

print(" %d training examples" % (len(X_train_raw)))

###############################################################################
# Use LSA to vectorize the articles.
###############################################################################

# Tfidf vectorizer:
# - Strips out "stop words"
# - Filters out terms that occur in more than half of the docs (max_df=0.5)
# - Filters out terms that occur in only one document (min_df=2).
# - Selects the 10,000 most frequently occurring words in the corpus.
# - Normalizes the vector (L2 norm of 1.0) to normalize the effect of
#   document length on the tf-idf values.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

# Build the tfidf vectorizer from the training data ("fit"), and apply it
# ("transform").
X_train_tfidf = vectorizer.fit_transform(X_train_raw)

print(" Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])

# Get the words that correspond to each of the features.
feat_names = vectorizer.get_feature_names()

# Print ten random terms from the vocabulary.
print("Some random words in the vocabulary:")
for i in range(0, 10):
    featNum = random.randint(0, len(feat_names) - 1)
    print(" %s" % feat_names[featNum])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

# Project the tfidf vectors onto the first N principal components.
# Though this is significantly fewer features than the original tfidf vector,
# they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

# The SVD matrix will have one row per component, and one column per feature
# of the original data.

#for compNum in range(0, 100, 10):
for compNum in range(0, 10):

    comp = svd.components_[compNum]

    # Sort the weights in this component and get the indices.
    indices = numpy.argsort(comp).tolist()

    # Reverse the indices, so we have the largest weights first.
    indices.reverse()

    # Grab the top 10 terms which have the highest weight in this component.
    terms = [feat_names[weightIndex] for weightIndex in indices[0:10]]
    weights = [comp[weightIndex] for weightIndex in indices[0:10]]

    # Display these terms and their weights as a horizontal bar graph.
    # The horizontal bar graph displays the first item on the bottom; reverse
    # the order of the terms so the biggest one is on top.
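    # (Only the largest positive weights are shown here; a component can also
    # have strongly negative term weights, which are just as informative, since
    # the overall sign of a component is arbitrary.)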
    terms.reverse()
    weights.reverse()
    positions = arange(10) + .5    # the bar centers on the y axis

    figure(compNum)
    barh(positions, weights, align='center')
    yticks(positions, terms)
    xlabel('Weight')
    title('Strongest terms for component %d' % (compNum))
    grid(True)
    show()


--------------------------------------------------------------------------------
/runClassification_LSA.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Run k-NN classification on the Reuters text dataset using LSA.

This script leverages modules in scikit-learn for performing tf-idf and SVD.

Classification is performed using k-NN with k=5 (majority wins).

The script measures the accuracy of plain tf-idf as a baseline, then applies
LSA to show the improvement.

@author: Chris McCormick
"""

import pickle
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier


###############################################################################
# Load the raw text dataset.
###############################################################################

print("Loading dataset...")

# The raw text dataset is stored as a tuple in the form:
# (X_train_raw, y_train_raw, X_test_raw, y_test)
# The 'filtered' dataset excludes any articles that we failed to retrieve
# fingerprints for.
raw_text_dataset = pickle.load( open( "data/raw_text_dataset.pickle", "rb" ) )
X_train_raw = raw_text_dataset[0]
y_train_labels = raw_text_dataset[1]
X_test_raw = raw_text_dataset[2]
y_test_labels = raw_text_dataset[3]

# The Reuters dataset consists of ~100 categories. However, we are going to
# simplify this to a binary classification problem. The 'positive class' will
# be the articles related to "acquisitions" (or "acq" in the dataset). All
# other articles will be negative.
y_train = ["acq" in y for y in y_train_labels]
y_test = ["acq" in y for y in y_test_labels]

print(" %d training examples (%d positive)" % (len(y_train), sum(y_train)))
print(" %d test examples (%d positive)" % (len(y_test), sum(y_test)))


###############################################################################
# Use LSA to vectorize the articles.
###############################################################################

# Tfidf vectorizer:
# - Strips out "stop words"
# - Filters out terms that occur in more than half of the docs (max_df=0.5)
# - Filters out terms that occur in only one document (min_df=2).
# - Selects the 10,000 most frequently occurring words in the corpus.
# - Normalizes the vector (L2 norm of 1.0) to normalize the effect of
#   document length on the tf-idf values.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

# Build the tfidf vectorizer from the training data ("fit"), and apply it
# ("transform").
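# fit_transform returns a sparse matrix with one row per training article and
# one column per vocabulary term (at most 10,000 columns here).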
X_train_tfidf = vectorizer.fit_transform(X_train_raw)

print(" Actual number of tfidf features: %d" % X_train_tfidf.get_shape()[1])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

# Project the tfidf vectors onto the first N principal components.
# Though this is significantly fewer features than the original tfidf vector,
# they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print(" done in %.3fsec" % (time.time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print(" Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))


# Now apply the transformations to the test data as well.
X_test_tfidf = vectorizer.transform(X_test_raw)
X_test_lsa = lsa.transform(X_test_tfidf)


###############################################################################
# Run classification of the test articles.
###############################################################################

print("\nClassifying tfidf vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance,
# and brute-force calculation of distances.
knn_tfidf = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_tfidf.fit(X_train_tfidf, y_train)

# Classify the test vectors.
p = knn_tfidf.predict(X_test_tfidf)

# Measure accuracy.
numRight = 0
for i in range(0, len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print(" (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds).
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)


print("\nClassifying LSA vectors...")

# Time this step.
t0 = time.time()

# Build a k-NN classifier. Use k = 5 (majority wins), the cosine distance,
# and brute-force calculation of distances.
knn_lsa = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
knn_lsa.fit(X_train_lsa, y_train)

# Classify the test vectors.
p = knn_lsa.predict(X_test_lsa)

# Measure accuracy.
numRight = 0
for i in range(0, len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print(" (%d / %d) correct - %.2f%%" % (numRight, len(y_test), float(numRight) / float(len(y_test)) * 100.0))

# Calculate the elapsed time (in seconds).
elapsed = (time.time() - t0)
print(" done in %.3fsec" % elapsed)


--------------------------------------------------------------------------------
/getReutersTextArticles.py:
--------------------------------------------------------------------------------
"""
======================================================
Create raw text dataset from Reuters
======================================================

This script uses the code from the scikit-learn example
plot_out_of_core_classification.py for retrieving the Reuters dataset.
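The parsed article text and topic labels are saved to data/raw_text_dataset.pickle
so that inspect_LSA.py and runClassification_LSA.py can load them directly.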

The dataset used in this example is Reuters-21578 as provided by the UCI ML
repository. It will be automatically downloaded and uncompressed on first run.
"""

# Authors: Eustache Diemert
#          @FedericoV
# License: BSD 3 clause

from __future__ import print_function

from glob import glob
import itertools
import os.path
import re
import tarfile

import numpy as np

from sklearn.externals.six.moves import html_parser
from sklearn.externals.six.moves import urllib
from sklearn.datasets import get_data_home

import pickle


###############################################################################
# Reuters Dataset related routines
###############################################################################

def _not_in_sphinx():
    # Hack to detect whether we are being run by the sphinx builder
    return '__file__' in globals()

class ReutersParser(html_parser.HTMLParser):
    """Utility class to parse an SGML file and yield documents one at a time."""

    def __init__(self, encoding='latin-1'):
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def handle_starttag(self, tag, attrs):
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data

    def start_reuters(self, attributes):
        pass

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': self.title,
                          'body': self.body,
                          'topics': self.topics})
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        self.in_topic_d = 0
        self.topics.append(self.topic_d)
        self.topic_d = ""


def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
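    For example, a single yielded document might look like this (the values
    here are illustrative only):
    {'title': 'ACME TO BUY WIDGET CORP', 'body': '...', 'topics': ['acq']}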

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc

def get_minibatch(doc_iter, size, pos_class=None):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.
    Note: pos_class is not used; the raw topic lists are returned as y.

    """
    data = [(u'{title}\n\n{body}'.format(**doc), doc['topics'])
            for doc in itertools.islice(doc_iter, size)
            if doc['topics']]

    # If there's no data, just return empty lists.
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int).tolist()

    # Otherwise, retrieve the articles and class labels. zip(*data) splits the
    # (text, topics) pairs into two parallel tuples.
    X_text, y = zip(*data)

    # Convert X_text and y from tuples to lists.
    X_text = list(X_text)
    y = list(y)

    # Convert the class labels to a list.
    #y = np.asarray(y, dtype=int).tolist()

    # For some reason, some of these articles are just whitespace. Look for
    # these and remove them.
    toRemove = []
    docNum = 0

    # For each article...
    for article in X_text:
        # If the article is just whitespace, or is empty, we'll remove it.
        if article.isspace() or (article == ""):
            toRemove.append(docNum)

        docNum += 1

    # Remove the empty articles. Do this in reverse order so as not to corrupt
    # the indices as we go.
    toRemove.reverse()
    for i in toRemove:
        del X_text[i]
        del y[i]

    return X_text, y



def iter_minibatches(doc_iter, minibatch_size):
    """Generator of minibatches."""
    X_text, y = get_minibatch(doc_iter, minibatch_size)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size)

###############################################################################
# Main
###############################################################################

# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

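# Note: the first time this stream is consumed (by get_minibatch below), the
# Reuters-21578 archive is downloaded from the UCI ML repository and unpacked
# into scikit-learn's data home directory.
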
# The Reuters dataset includes many different classes, but we're just going to
# do binary classification. We'll use 'acq' (articles related to
# "acquisitions"--one of the most prevalent classes in the dataset) as the
# positive class, and all other article topics will be used as negative
# examples.
positive_class = 'acq'

# Retrieve a set of examples from the dataset to use as the training set, then
# another set of examples to use as the test set. The actual number will
# be smaller because it will exclude "invalid docs with no topics assigned".
X_train_raw, y_train_raw = get_minibatch(data_stream, 5000, positive_class)
X_test_raw, y_test_raw = get_minibatch(data_stream, 5000, positive_class)

print("Train set is %d documents" % (len(y_train_raw)))
print("Test set is %d documents" % (len(y_test_raw)))

# Dump the dataset to a pickle file.
pickle.dump((X_train_raw, y_train_raw, X_test_raw, y_test_raw), open("data/raw_text_dataset.pickle", "wb"))

--------------------------------------------------------------------------------
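The scripts above fit and evaluate only on the pickled Reuters articles. As a
quick illustration of how the same tf-idf, LSA, and k-NN settings from
runClassification_LSA.py could be used to label a single unseen article, here
is a minimal sketch. It is not part of the original repository: it assumes
data/raw_text_dataset.pickle is present, and the example article text is made up.

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

# Load the pickled Reuters text produced by getReutersTextArticles.py.
X_train_raw, y_train_labels, _, _ = pickle.load(open("data/raw_text_dataset.pickle", "rb"))
y_train = ["acq" in y for y in y_train_labels]

# Fit the same tf-idf / LSA / k-NN stack used in runClassification_LSA.py.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2,
                             stop_words='english', use_idf=True)
lsa = make_pipeline(TruncatedSVD(100), Normalizer(copy=False))
knn = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')

X_train_lsa = lsa.fit_transform(vectorizer.fit_transform(X_train_raw))
knn.fit(X_train_lsa, y_train)

# Classify one made-up article by projecting it into the same LSA space.
new_article = ["The company said it has agreed to acquire its smaller rival "
               "in an all-cash takeover deal."]
new_lsa = lsa.transform(vectorizer.transform(new_article))
print("Related to acquisitions?", bool(knn.predict(new_lsa)[0]))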