├── tfidf_teststopwords.txt
├── tfidf_testcorpus.txt
├── README.txt
├── tfidf_test.py
└── tfidf.py

/tfidf_teststopwords.txt:
--------------------------------------------------------------------------------
moon
--------------------------------------------------------------------------------
/tfidf_testcorpus.txt:
--------------------------------------------------------------------------------
50
the:23
a:17
girl:1
moon:1
said:5
phone:2
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
Forked from http://code.google.com/p/tfidf/ on 15 August 2012.

Licensed under the LGPL; for details, see the individual source files.
--------------------------------------------------------------------------------
/tfidf_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Copyright (C) 2009. All rights reserved.

__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"

import math
import tfidf
import unittest

DEFAULT_IDF_UNITTEST = 1.0

def get_expected_idf(num_docs_total, num_docs_term):
    return math.log(float(1 + num_docs_total) / (1 + num_docs_term))

class TfIdfTest(unittest.TestCase):
    def testGetIdf(self):
        """Test querying the IDF for existent and nonexistent terms."""
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                               DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        # Querying a nonexistent term (or a differently cased term) returns
        # the default IDF.
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("nonexistent"))
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("THE"))

        self.assertTrue(my_tfidf.get_idf("a") > my_tfidf.get_idf("the"))
        self.assertAlmostEqual(my_tfidf.get_idf("girl"), my_tfidf.get_idf("moon"))

    def testKeywords(self):
        """Test retrieving keywords from a document, ordered by tf-idf."""
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", DEFAULT_IDF=0.01)

        # Test retrieving keywords when there is only one keyword.
        keywords = my_tfidf.get_doc_keywords("the spoon and the fork")
        self.assertEqual("the", keywords[0][0])

        # Test retrieving multiple keywords.
        keywords = my_tfidf.get_doc_keywords("the girl said hello over the phone")
        self.assertEqual("girl", keywords[0][0])
        self.assertEqual("phone", keywords[1][0])
        self.assertEqual("said", keywords[2][0])
        self.assertEqual("the", keywords[3][0])

    def testAddCorpus(self):
        """Test adding input documents to the corpus."""
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                               DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

        my_tfidf.add_input_document("water, moon")

        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("water"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 2),
                               my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

    def testNoCorpusFiles(self):
        my_tfidf = tfidf.TfIdf(DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("moon"))
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("said"))

        my_tfidf.add_input_document("moon")
        my_tfidf.add_input_document("moon said hello")

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("said"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 2),
                               my_tfidf.get_idf("moon"))

    def testStopwordFile(self):
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", "tfidf_teststopwords.txt",
                               DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertEqual(0, my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

        my_tfidf.add_input_document("moon")
        my_tfidf.add_input_document("moon and water")

        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("water"))
        self.assertEqual(0, my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

# TODO: add some UTF-8 handling tests.

def main():
    unittest.main()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/tfidf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Copyright 2009 Niniane Wang (niniane@gmail.com)
# Reviewed by Alex Mendes da Costa.
#
# Modified in 2012 by Benjamin Fields (me@benfields.net)
#
# This is a simple Tf-idf library. The algorithm is described in
# http://en.wikipedia.org/wiki/Tf-idf
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# Tfidf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details:
#
# http://www.gnu.org/licenses/lgpl.txt

__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"

import math
import re
import codecs
from operator import itemgetter

class TfIdf:
    """Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.

    The library constructs an IDF corpus and stopword list either from
    documents specified by the client, or by reading from input files. It
    computes IDF for a specified term based on the corpus, or generates
    keywords ordered by tf-idf for a specified document.
    """

    def __init__(self, corpus_filename=None, stopword_filename=None,
                 DEFAULT_IDF=1.5):
        """Initialize the idf dictionary.

        If a corpus file is supplied, read the idf dictionary from it, in the
        format:
          <number of total documents>
          term: <number of documents containing the term>

        If a stopword file is supplied, read the stopword list from it, one
        stopword per line.

        The DEFAULT_IDF value is returned when a query term is not found in
        the idf corpus.
        """
        self.num_docs = 0
        self.term_num_docs = {}  # term : num_docs_containing_term
        self.stopwords = set()
        self.idf_default = DEFAULT_IDF

        if corpus_filename:
            self.merge_corpus_document(corpus_filename)

        if stopword_filename:
            stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
            self.stopwords = set(line.strip() for line in stopword_file)

    def get_tokens(self, text):
        """Break a string into tokens, preserving URL tags as an entire token.

        This implementation does not preserve case.
        Clients may wish to override this behavior with their own tokenization.
        """
        return re.findall(r"<a.*?</a>|<[^\>]*>|[\w'@#]+", text.lower())

    def merge_corpus_document(self, corpus_filename):
        """Slurp in a corpus document, adding it to the existing corpus model."""
        corpus_file = codecs.open(corpus_filename, "r", encoding='utf-8')

        # Load the number of documents.
        line = corpus_file.readline()
        self.num_docs += int(line.strip())

        # Read "term:frequency" from each subsequent line in the file.
        for line in corpus_file:
            tokens = line.rsplit(":", 1)
            term = tokens[0].strip()
            try:
                frequency = int(tokens[1].strip())
            except IndexError:
                if not line.strip():
                    # Catch blank lines.
                    print("line is blank")
                    continue
                else:
                    raise
            if term in self.term_num_docs:
                self.term_num_docs[term] += frequency
            else:
                self.term_num_docs[term] = frequency

    def add_input_document(self, input_doc):
        """Add terms in the specified document to the idf dictionary."""
        self.num_docs += 1
        words = set(self.get_tokens(input_doc))
        for word in words:
            if word in self.term_num_docs:
                self.term_num_docs[word] += 1
            else:
                self.term_num_docs[word] = 1

    def save_corpus_to_file(self, idf_filename, stopword_filename,
                            STOPWORD_PERCENTAGE_THRESHOLD=0.01):
        """Save the idf dictionary and stopword list to the specified files."""
        output_file = codecs.open(idf_filename, "w", encoding='utf-8')

        output_file.write(str(self.num_docs) + "\n")
        for term, num_docs in self.term_num_docs.items():
            output_file.write(term + ": " + str(num_docs) + "\n")

        sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                              reverse=True)
        stopword_file = codecs.open(stopword_filename, "w", encoding='utf-8')
        for term, num_docs in sorted_terms:
            if num_docs < STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs:
                break

            stopword_file.write(term + "\n")

    def get_num_docs(self):
        """Return the total number of documents in the IDF corpus."""
        return self.num_docs

    def get_idf(self, term):
        """Retrieve the IDF for the specified term.

        This is computed as log((1 + number of documents in the corpus) /
        (1 + number of documents containing the term)). Stopwords always
        return an IDF of 0.
        """
        if term in self.stopwords:
            return 0

        if term not in self.term_num_docs:
            return self.idf_default

        return math.log(float(1 + self.get_num_docs()) /
                        (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        """Retrieve terms and corresponding tf-idf for the specified document.

        The returned terms are ordered by decreasing tf-idf.
        """
        tfidf = {}
        tokens = self.get_tokens(curr_doc)
        tokens_set = set(tokens)
        for word in tokens_set:
            # Note: term frequency is normalized by the number of distinct
            # tokens in the document, not by the total token count.
            mytf = float(tokens.count(word)) / len(tokens_set)
            myidf = self.get_idf(word)
            tfidf[word] = mytf * myidf

        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
--------------------------------------------------------------------------------
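
The sources above never show the library being driven end to end, so here is a minimal usage sketch. It is an editor's addition, not a file in the repository; it assumes tfidf.py, tfidf_testcorpus.txt, and tfidf_teststopwords.txt sit in the working directory, the file names and DEFAULT_IDF values mirror the unit tests, and everything else is illustrative.

#!/usr/bin/env python
# Hypothetical usage sketch for the TfIdf class defined above.
import tfidf

# Build a model from the shipped test corpus (50 documents) and stopword list.
model = tfidf.TfIdf("tfidf_testcorpus.txt", "tfidf_teststopwords.txt",
                    DEFAULT_IDF=1.5)

# IDF queries: known terms use log((1 + N) / (1 + df)), unknown terms fall
# back to DEFAULT_IDF, and stopwords ("moon") return 0.
print(model.get_idf("the"))      # log(51 / 24), roughly 0.75
print(model.get_idf("unknown"))  # 1.5 (DEFAULT_IDF)
print(model.get_idf("moon"))     # 0 (stopword)

# Keyword extraction: terms of a new document, ordered by descending tf-idf.
for term, score in model.get_doc_keywords("the girl said hello over the phone"):
    print(term, score)

# Growing the corpus: each added document bumps the document counts used by
# subsequent IDF queries.
model.add_input_document("the girl picked up the phone")
print(model.get_num_docs())  # 51

Because get_tokens lowercases its input and add_input_document stores a set of tokens, queries are effectively case-insensitive and repeated words in a single added document increment each term's document count only once.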