├── tfidf_teststopwords.txt
├── tfidf_testcorpus.txt
├── README.txt
├── tfidf_test.py
└── tfidf.py

/tfidf_teststopwords.txt:
--------------------------------------------------------------------------------
moon
--------------------------------------------------------------------------------
/tfidf_testcorpus.txt:
--------------------------------------------------------------------------------
50
the:23
a:17
girl:1
moon:1
said:5
phone:2
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
Forked from http://code.google.com/p/tfidf/ on 15 August 2012.

Licensed under the LGPL; for details, see the individual source files.
--------------------------------------------------------------------------------
/tfidf_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Copyright (C) 2009. All rights reserved.

__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"

import math
import tfidf
import unittest

DEFAULT_IDF_UNITTEST = 1.0

def get_expected_idf(num_docs_total, num_docs_term):
    return math.log(float(1 + num_docs_total) / (1 + num_docs_term))

class TfIdfTest(unittest.TestCase):
    def testGetIdf(self):
        """Test querying the IDF for existent and nonexistent terms."""
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                               DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        # Querying a nonexistent term (or a differently cased term) returns
        # the default IDF.
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("nonexistent"))
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("THE"))

        self.assertTrue(my_tfidf.get_idf("a") > my_tfidf.get_idf("the"))
        self.assertAlmostEqual(my_tfidf.get_idf("girl"), my_tfidf.get_idf("moon"))

    def testKeywords(self):
        """Test retrieving keywords from a document, ordered by tf-idf."""
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", DEFAULT_IDF=0.01)

        # Test retrieving keywords when there is only one keyword.
        keywords = my_tfidf.get_doc_keywords("the spoon and the fork")
        self.assertEqual("the", keywords[0][0])

        # Test retrieving multiple keywords.
        keywords = my_tfidf.get_doc_keywords("the girl said hello over the phone")
        self.assertEqual("girl", keywords[0][0])
        self.assertEqual("phone", keywords[1][0])
        self.assertEqual("said", keywords[2][0])
        self.assertEqual("the", keywords[3][0])

    def testAddCorpus(self):
        """Test adding input documents to the corpus."""
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt",
                               DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

        my_tfidf.add_input_document("water, moon")

        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("water"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 2),
                               my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

    def testNoCorpusFiles(self):
        my_tfidf = tfidf.TfIdf(DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("moon"))
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("said"))

        my_tfidf.add_input_document("moon")
        my_tfidf.add_input_document("moon said hello")

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("said"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 2),
                               my_tfidf.get_idf("moon"))

    def testStopwordFile(self):
        my_tfidf = tfidf.TfIdf("tfidf_testcorpus.txt", "tfidf_teststopwords.txt",
                               DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

        self.assertEqual(DEFAULT_IDF_UNITTEST, my_tfidf.get_idf("water"))
        self.assertEqual(0, my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

        my_tfidf.add_input_document("moon")
        my_tfidf.add_input_document("moon and water")

        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 1),
                               my_tfidf.get_idf("water"))
        self.assertEqual(0, my_tfidf.get_idf("moon"))
        self.assertAlmostEqual(get_expected_idf(my_tfidf.get_num_docs(), 5),
                               my_tfidf.get_idf("said"))

# TODO: add some UTF-8 handling tests.

def main():
    unittest.main()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/tfidf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Copyright 2009 Niniane Wang (niniane@gmail.com)
# Reviewed by Alex Mendes da Costa.
#
# Modified in 2012 by Benjamin Fields (me@benfields.net)
#
# This is a simple Tf-idf library. The algorithm is described in
# http://en.wikipedia.org/wiki/Tf-idf
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# Tfidf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details:
#
# http://www.gnu.org/licenses/lgpl.txt

__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"

import math
import re
import codecs
from operator import itemgetter

class TfIdf:
    """Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.

    The library constructs an IDF corpus and stopword list either from
    documents specified by the client, or by reading from input files. It
    computes IDF for a specified term based on the corpus, or generates
    keywords ordered by tf-idf for a specified document.
    """

    def __init__(self, corpus_filename=None, stopword_filename=None,
                 DEFAULT_IDF=1.5):
        """Initialize the idf dictionary.

        If a corpus file is supplied, read the idf dictionary from it, in the
        format:
          <number of total documents>
          term: <number of documents containing the term>

        If a stopword file is supplied, read the stopword list from it, one
        stopword per line.

        The DEFAULT_IDF value is returned when a query term is not found in
        the idf corpus.
        """
        self.num_docs = 0
        self.term_num_docs = {}  # term : num_docs_containing_term
        self.stopwords = set()
        self.idf_default = DEFAULT_IDF

        if corpus_filename:
            self.merge_corpus_document(corpus_filename)

        if stopword_filename:
            stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
            self.stopwords = set(line.strip() for line in stopword_file)

    def get_tokens(self, text):
        """Break a string into tokens, preserving URL tags as an entire token.

        This implementation does not preserve case.
        Clients may wish to override this behavior with their own tokenization.
        """
        return re.findall(r"<a.*?</a>|<[^\>]*>|[\w'@#]+", text.lower())

    def merge_corpus_document(self, corpus_filename):
        """Slurp in a corpus document, adding it to the existing corpus model."""
        corpus_file = codecs.open(corpus_filename, "r", encoding='utf-8')

        # Load the number of documents.
        line = corpus_file.readline()
        self.num_docs += int(line.strip())

        # Read "term:frequency" from each subsequent line in the file.
        for line in corpus_file:
            tokens = line.rsplit(":", 1)
            term = tokens[0].strip()
            try:
                frequency = int(tokens[1].strip())
            except IndexError:
                if not line.strip():
                    # Catch blank lines.
                    print("line is blank")
                    continue
                else:
                    raise
            if term in self.term_num_docs:
                self.term_num_docs[term] += frequency
            else:
                self.term_num_docs[term] = frequency

    def add_input_document(self, input_doc):
        """Add terms in the specified document to the idf dictionary."""
        self.num_docs += 1
        words = set(self.get_tokens(input_doc))
        for word in words:
            if word in self.term_num_docs:
                self.term_num_docs[word] += 1
            else:
                self.term_num_docs[word] = 1

    def save_corpus_to_file(self, idf_filename, stopword_filename,
                            STOPWORD_PERCENTAGE_THRESHOLD=0.01):
        """Save the idf dictionary and stopword list to the specified files."""
        output_file = codecs.open(idf_filename, "w", encoding='utf-8')

        output_file.write(str(self.num_docs) + "\n")
        for term, num_docs in self.term_num_docs.items():
            output_file.write(term + ": " + str(num_docs) + "\n")

        sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                              reverse=True)
        stopword_file = codecs.open(stopword_filename, "w", encoding='utf-8')
        for term, num_docs in sorted_terms:
            if num_docs < STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs:
                break

            stopword_file.write(term + "\n")

    def get_num_docs(self):
        """Return the total number of documents in the IDF corpus."""
        return self.num_docs

    def get_idf(self, term):
        """Retrieve the IDF for the specified term.

        This is computed as log((1 + number of documents in the corpus) /
        (1 + number of documents containing the term)). Stopwords always
        return an IDF of 0.
        """
        if term in self.stopwords:
            return 0

        if term not in self.term_num_docs:
            return self.idf_default

        return math.log(float(1 + self.get_num_docs()) /
                        (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        """Retrieve terms and corresponding tf-idf for the specified document.

        The returned terms are ordered by decreasing tf-idf.
        """
        tfidf = {}
        tokens = self.get_tokens(curr_doc)
        tokens_set = set(tokens)
        for word in tokens_set:
            # Note: term frequency is normalized by the number of distinct
            # tokens in the document, not by the total token count.
            mytf = float(tokens.count(word)) / len(tokens_set)
            myidf = self.get_idf(word)
            tfidf[word] = mytf * myidf

        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
--------------------------------------------------------------------------------
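
The sources above never show the library being driven end to end, so here is a minimal usage sketch. It is an editor's addition, not a file in the repository; it assumes tfidf.py, tfidf_testcorpus.txt, and tfidf_teststopwords.txt sit in the working directory, the file names and DEFAULT_IDF values mirror the unit tests, and everything else is illustrative.

#!/usr/bin/env python
# Hypothetical usage sketch for the TfIdf class defined above.
import tfidf

# Build a model from the shipped test corpus (50 documents) and stopword list.
model = tfidf.TfIdf("tfidf_testcorpus.txt", "tfidf_teststopwords.txt",
                    DEFAULT_IDF=1.5)

# IDF queries: known terms use log((1 + N) / (1 + df)), unknown terms fall
# back to DEFAULT_IDF, and stopwords ("moon") return 0.
print(model.get_idf("the"))      # log(51 / 24), roughly 0.75
print(model.get_idf("unknown"))  # 1.5 (DEFAULT_IDF)
print(model.get_idf("moon"))     # 0 (stopword)

# Keyword extraction: terms of a new document, ordered by descending tf-idf.
for term, score in model.get_doc_keywords("the girl said hello over the phone"):
    print(term, score)

# Growing the corpus: each added document bumps the document counts used by
# subsequent IDF queries.
model.add_input_document("the girl picked up the phone")
print(model.get_num_docs())  # 51

Because get_tokens lowercases its input and add_input_document stores a set of tokens, queries are effectively case-insensitive and repeated words in a single added document increment each term's document count only once.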