├── .gitignore
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── qdr
│   ├── __init__.py
│   ├── _ranker.cc
│   ├── ranker.pxd
│   ├── ranker.pyx
│   └── trainer.py
├── requirements.txt
├── setup.py
└── test
    ├── common.py
    ├── test_ranker.py
    └── test_trainer.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc

# vim temp files
*.swp

# nose coverage info
.coverage

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build

# cython generated in the build process
qdr/ranker.cpp

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - 2.7
script: make test
virtualenv:
  system_site_packages: true
cache:
  - apt
  - pip
install:
  - sudo apt-get -y install python-numpy
  - pip install -r requirements.txt --use-mirrors
  - python setup.py build_ext --inplace

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Moz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
clean:
	# Remove the build
	rm -rf build dist
	# And all of our pyc files
	rm -f qdr/*.pyc test/*.pyc
	# All compiled files
	rm -f qdr/*.so qdr/ranker.cpp
	# And lastly, .coverage files
	rm -f .coverage

test: nose

nose:
	rm -rf .coverage
	nosetests --exe --cover-package=qdr --with-coverage --cover-branches -v --cover-erase

unittest:
	python -m unittest discover -s test

build: clean
	python setup.py build_ext --inplace

install: build
	python setup.py install

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
qdr
===

[![Build Status](https://api.travis-ci.org/seomoz/qdr.png)](https://travis-ci.org/seomoz/qdr)

Query-Document Relevance ranking functions

This repository implements a few query-document similarity functions
commonly used in information retrieval applications. It supports:

* TF-IDF
* Okapi BM25
* Language Model

This implementation includes pure Python code for iteratively training
models from a large corpus, and a C++ implementation of the scoring
functions with Cython wrappers for fast evaluation.

Each of these ranking functions has a few "magic" constants. Currently
these are hard coded to the values recommended in the literature, but they
can be made configurable if the need arises. Relevant references:

* For TF-IDF, see [Salton and Buckley, "Term-weighting approaches in automatic text retrieval"](http://scholar.google.com/scholar?hl=en&as_sdt=0,48&q=salton+and+buckley+%22Term-weighting+approaches+in+automatic+text+retrieval%22+Information+Processing+%26+Management,+vol+24,+1988) (the "best fully weighted system tfc * nfx", Table 2, first line).
* For Okapi BM25, see ["An Introduction to Information Retrieval" by Manning et al.](http://nlp.stanford.edu/IR-book/), Section 11.4.3 (page 233), eqn 11.32.
* For the Language Model approach, see [Zhai and Lafferty, "A Study of Smoothing Methods for Language Models Applied to Ad Hoc Information Retrieval"](http://scholar.google.com/scholar?q=Zhai+and+Lafferty+"A+Study+of+Smoothing+Methods+for+Language+Models+Applied+to+Ad+Hoc+Information+Retrieval")
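For reference, the current hard-coded values, taken directly from the
constants defined in `qdr/_ranker.cc`, are:

```python
# constants as defined in qdr/_ranker.cc
BM25_K1 = 1.6    # Okapi BM25 k1 (the literature suggests 1.2 - 2.0)
BM25_B = 0.75    # Okapi BM25 b
LAM = 0.1        # Jelinek-Mercer lambda, tuned for short "title" queries
MU = 2000.0      # Dirichlet mu
DELTA = 0.7      # absolute discount delta
```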
Usage
=====

All tokenization and word normalization is handled client side, and all methods
that accept queries or documents assume they are lists of byte strings,
not unicode.

There are two separate steps to using the ranking functions: training
and scoring.

## Training

The `Trainer` class supports incremental training from a large corpus,
combining separately trained models for map-reduce type data flows,
pruning of infrequent tokens from large models, and serialization. Typical
usage:

```python
from qdr import Trainer

# load corpus -- it's an iterable of documents, each document is a
# list of byte strings
model = Trainer()
model.train(corpus)

# the train method adds documents incrementally, so the model can be
# updated with additional documents by calling train again
model.train(another_corpus)

# write to a file
model.serialize_to_file(outputfile)
```

For map-reduce type work, the method `update_counts_from_trained` will
merge the contents of two `Trainer` instances:

```python
# map step -- typically this is parallelized
for k, corpus in enumerate(corpus_chunks):
    model = Trainer()
    model.train(corpus)
    model.serialize_to_file("file%s.gz" % k)

# reduce step
model = Trainer.load_from_file("file0.gz")
for k in xrange(1, len(corpus_chunks)):
    model2 = Trainer.load_from_file("file%s.gz" % k)
    model.update_counts_from_trained(model2)

# prune the final model if needed
model.prune(min_word_count, min_doc_count)
```

## Scoring

Typical usage:

```python
from qdr import QueryDocumentRelevance

scorer = QueryDocumentRelevance.load_from_file('trained_model.gz')
# document, query are lists of byte strings
relevance_scores = scorer.score(document, query)
```
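The result is a plain dict with one score per ranking function; the key
names follow the `scores_t` struct in `qdr/_ranker.cc`:

```python
print(relevance_scores)
# {'tfidf': ..., 'bm25': ..., 'lm_jm': ..., 'lm_dirichlet': ..., 'lm_ad': ...}
```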
For scoring batches of queries against a single document, the `score_batch`
method is more efficient than calling `score` repeatedly:

```python
# queries is a list of queries, each query is a list of tokens:
relevance_scores = scorer.score_batch(document, queries)
```

# Installing

```
sudo pip install -r requirements.txt
sudo make install
```
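For development, the `Makefile` also supports an in-place build and running
the test suite without installing:

```
make build   # cleans, then runs python setup.py build_ext --inplace
make test    # runs the test suite with nose
```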
# Contributing
Contributions welcome! Fork, commit, then open a pull request.

--------------------------------------------------------------------------------
/qdr/__init__.py:
--------------------------------------------------------------------------------

from .ranker import QueryDocumentRelevance
from .trainer import Trainer

--------------------------------------------------------------------------------
/qdr/_ranker.cc:
--------------------------------------------------------------------------------

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>


typedef std::unordered_map<std::string, std::pair<uint64_t, uint64_t> >
    counts_t;


// count of word occurrences in a doc
// (counts are stored as doubles since the scoring code does floating
// point arithmetic with them)
typedef std::unordered_map<std::string, double> word_counts_t;

// a tokenized document
typedef std::vector<std::string> doc_t;

word_counts_t count_words(const doc_t& document)
{
    /// Given a vector of tokens, return a map token -> count in vector
    // heuristic to set the final hash sparsity; goal = 10%:
    // the number of keys in ret is the number of unique tokens in the
    // document. In an average web page this is about 1/3 the number of
    // total tokens, so sizing the table at 3x the document length gives
    // a hash sparsity of about 1/9, or about 10%.
    word_counts_t ret(document.size() * 3);
    for (doc_t::const_iterator it = document.begin();
            it != document.end(); ++it)
    {
        word_counts_t::iterator got = ret.find(*it);
        if (got == ret.end())
        {
            // first time we've seen this word
            ret[*it] = 1;
        }
        else
            got->second += 1;
    }
    return ret;
}

struct scores_t {
    double tfidf;
    double bm25;
    double lm_jm;
    double lm_dirichlet;
    double lm_ad;
};

struct lm_scores_t {
    double jm;
    double dirichlet;
    double ad;
};

class QDR
{
    public:
        QDR(counts_t& counts, uint64_t total_docs);
        ~QDR();

        // compute the similarity scores
        scores_t score(doc_t& document, doc_t& query);

        // compute similarity scores for a single doc but a list of queries
        std::vector<scores_t> score_batch(doc_t& document,
            std::vector<doc_t>& queries);

        // get the IDF for a given word
        double get_idf(const std::string& word);

    private:
        counts_t counts;
        uint64_t total_docs;
        uint64_t nwords_vocab;
        uint64_t nwords;

        // useful constant for the language model
        double n2p1;

        // compute the TF-IDF similarity
        double tfidf(const word_counts_t& doc_counts,
            const word_counts_t& query_counts);

        // BM25
        double okapi_bm25(const word_counts_t& doc_counts,
            const word_counts_t& query_counts,
            const double& Ld);

        // language model
        lm_scores_t lm(const word_counts_t& doc_counts,
            const word_counts_t& query_counts,
            const double& sum_count_w_given_doc);

        // low level function used by the public methods
        scores_t score_single(const word_counts_t& doc_counts,
            const word_counts_t& query_counts,
            const double& nwords_document);

        // disable some default constructors
        QDR();
        QDR& operator= (const QDR& other);
        QDR(const QDR& other);
};

QDR::QDR(counts_t& counts_in, uint64_t total_docs) : total_docs(total_docs)
{
    // copy the counts data to the private member and add up the total
    // number of words and the number of words in the vocabulary
    nwords_vocab = 0;
    nwords = 0;

    for (counts_t::iterator it = counts_in.begin(); it != counts_in.end();
            ++it)
    {
        counts[it->first] = it->second;
        ++nwords_vocab;
        nwords += it->second.first;
    }
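    // n2p1 is the denominator of the add-one smoothed corpus unigram
    // model used in lm():
    //   P(w | C) = (count of w in corpus + 1) / n2p1
    // where n2p1 = (total words in corpus) + (vocabulary size) + 1;
    // the extra terms account for one smoothing count per vocabulary
    // word plus one "unknown" word.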
    n2p1 = double(nwords) + double(nwords_vocab) + 1.0;
}

QDR::~QDR() {}

double QDR::get_idf(const std::string& word)
{
    /// Get log(total docs / doc count for word) == IDF
    double doc_freq;
    counts_t::const_iterator got = counts.find(word);
    if (got != counts.end())
    {
        // this word is in the corpus
        doc_freq = got->second.second;
    }
    else
    {
        // out-of-vocabulary words get a document frequency of 1,
        // i.e. the maximum possible IDF of log(total_docs)
        doc_freq = 1.0;
    }
    return log(double(total_docs) / doc_freq);
}

scores_t QDR::score(doc_t& document, doc_t& query)
{
    if (document.empty() || query.empty())
        throw std::invalid_argument(
            "Document and query both need to be non-empty");

    // compute counts, doc length, etc now since they are used
    // in multiple components
    word_counts_t query_counts = count_words(query);
    word_counts_t doc_counts = count_words(document);

    double nwords_document = 0.0;
    for (word_counts_t::const_iterator it_doc = doc_counts.begin();
            it_doc != doc_counts.end(); ++it_doc)
        nwords_document += it_doc->second;

    // now the scores
    return score_single(doc_counts, query_counts, nwords_document);
}

std::vector<scores_t> QDR::score_batch(
    doc_t& document, std::vector<doc_t>& queries)
{
    if (document.empty())
        throw std::invalid_argument(
            "Document and query both need to be non-empty");

    // compute document level counts once, then iterate through the queries
    word_counts_t doc_counts = count_words(document);

    double nwords_document = 0.0;
    for (word_counts_t::const_iterator it_doc = doc_counts.begin();
            it_doc != doc_counts.end(); ++it_doc)
        nwords_document += it_doc->second;

    // iterate through the queries and compute scores
    std::vector<scores_t> ret;
    ret.reserve(queries.size());
    for (std::vector<doc_t>::iterator it = queries.begin();
            it != queries.end(); ++it)
    {
        if (it->size() == 0)
            throw std::invalid_argument(
                "Document and query both need to be non-empty");
        word_counts_t query_counts = count_words(*it);
        ret.push_back(score_single(doc_counts, query_counts, nwords_document));
    }

    return ret;
}

scores_t QDR::score_single(const word_counts_t& doc_counts,
    const word_counts_t& query_counts,
    const double& nwords_document)
{
    scores_t scores;
    scores.tfidf = tfidf(doc_counts, query_counts);
    scores.bm25 = okapi_bm25(doc_counts, query_counts, nwords_document);

    lm_scores_t lm_scores = lm(doc_counts, query_counts, nwords_document);
    scores.lm_jm = lm_scores.jm;
    scores.lm_dirichlet = lm_scores.dirichlet;
    scores.lm_ad = lm_scores.ad;

    return scores;
}

double QDR::tfidf(
    const word_counts_t& doc_counts, const word_counts_t& query_counts)
{
    /// Compute the TF-IDF similarity between the document and query
    // Computes the cosine similarity score of the query and document
    // using the "best fully weighted system tfc * nfx" (Table 2, first line)
    // from Salton and Buckley, "Term-weighting approaches in automatic text
    // retrieval", Information Processing & Management, vol 24, 1988

    // strategy: make document and query vectors, then take the inner
    // product. we only need document vector entries for the query words,
    // since all other entries are zeroed out in the inner product
    std::vector<double> doc_vector;
    std::vector<double> query_vector;
    doc_vector.reserve(query_counts.size());
    query_vector.reserve(query_counts.size());

    // need the maximum count of query word occurrences
    double max_query = 0.0;
    for (word_counts_t::const_iterator it_query = query_counts.begin();
            it_query != query_counts.end(); ++it_query)
    {
        if (it_query->second > max_query)
            max_query = it_query->second;
    }

    // now make the vectors
    for (word_counts_t::const_iterator it_query = query_counts.begin();
            it_query != query_counts.end(); ++it_query)
    {
        // compute the idf
        double idf = get_idf(it_query->first);

        // query vector
        query_vector.push_back(
            (0.5 + 0.5 * it_query->second / max_query) * idf);

        // doc vector - if the word is in the doc then compute it,
        // otherwise it's 0
        word_counts_t::const_iterator got = doc_counts.find(it_query->first);
        if (got != doc_counts.end())
            doc_vector.push_back(idf * got->second);
        else
            doc_vector.push_back(0.0);
    }

    // need to normalize the document vector by its Euclidean length.
    // the doc_vector variable only includes the elements of the document
    // vector corresponding to query words, but we need the length of the
    // full vector here, so iterate through doc_counts
    double doc_vector_len = 0.0;
    for (word_counts_t::const_iterator it_doc = doc_counts.begin();
            it_doc != doc_counts.end(); ++it_doc)
    {
        double p = it_doc->second * get_idf(it_doc->first);
        doc_vector_len += p * p;
    }
    doc_vector_len = sqrt(doc_vector_len);

    // finally compute the score
    double score = 0.0;
    for (std::size_t k = 0; k < doc_vector.size(); ++k)
        score += query_vector[k] * doc_vector[k];
    return score / doc_vector_len;
}


// magic constants for the okapi_bm25 function. we can allow these to
// be configurable down the road if needed.
const double BM25_K1 = 1.6;
const double BM25_B = 0.75;

double QDR::okapi_bm25(
    const word_counts_t& doc_counts, const word_counts_t& query_counts,
    const double& Ld)
{
    /// Okapi BM25 ranking function
    // See "An Introduction to Information Retrieval" by Manning,
    // Raghavan and Schütze, Section 11.4.3 (page 233), eqn 11.32.
    // the ranking function is:
    //   SUM_{t in query} log(N / df[t]) * (k1 + 1) * tf[td] /
    //       (k1 * ((1 - b) + b * (Ld / Lave)) + tf[td])
    //
    // where N = number of docs in the corpus
    //   df[t] = number of docs with term t
    //   tf[td] = number of occurrences of term t in this document
    //   Ld = length of this document (# words)
    //   Lave = average length of documents in the corpus
    //   k1, b = free parameters, empirically set to about
    //     k1 = 1.2 - 2.0
    //     b = 0.75

    double Lave = double(nwords) / double(total_docs);
    double Ld_Lave = Ld / Lave;

    double score = 0.0;
    for (word_counts_t::const_iterator it_query = query_counts.begin();
            it_query != query_counts.end(); ++it_query)
    {
        word_counts_t::const_iterator got = doc_counts.find(it_query->first);
        if (got != doc_counts.end())
        {
            double idf = get_idf(it_query->first);
            double tf_doc = got->second;
            score += it_query->second * idf * (BM25_K1 + 1.0) *
                tf_doc /
                (BM25_K1 * ((1.0 - BM25_B) + BM25_B * Ld_Lave) + tf_doc);
        }
        // else the query word is not in the document; in this case the
        // numerator == 0 so its contribution to the score is 0
    }

    return score;
}


// magic constants for the language model function.
// see Zhai and Lafferty, "A Study of Smoothing Methods for Language
// Models Applied to Ad Hoc Information Retrieval". In this paper we are
// interested in the parameter values for "title queries" (short queries);
// see the conclusion, 3rd to last paragraph:
// Jelinek-Mercer
const double LAM = 0.1;
// Dirichlet
const double MU = 2000.0;
// absolute discount
const double DELTA = 0.7;

lm_scores_t QDR::lm(const word_counts_t& doc_counts,
    const word_counts_t& query_counts, const double& sum_count_w_given_doc)
{
    /// Language model relevance
    // Returns:
    //   jm: Jelinek-Mercer score
    //   dirichlet: Dirichlet score
    //   ad: Absolute Discount score

    // Uses the formulas in Zhai and Lafferty, "A Study of Smoothing
    // Methods for Language Models Applied to Ad Hoc Information Retrieval",
    // SIGIR 2001. See Table 1.

    // NOTE: the original paper ran a Porter stemmer first and used no
    // stop words

    // relevance = p(w | d) = ps(w | d)          if w in document
    //                        alpha_d * p(w | C) if w not in document
    //
    // ps(w | d) given in Table 1
    // alpha_d given in Table 1
    // p(w | C) == corpus unigram probability

    // sum_w c(w; d) = total words in document = |d| (for absolute discount)

    // |d|_u = number of unique terms in the document, for absolute discount
    double unique_terms = doc_counts.size();
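    // alpha_d for each smoothing method, per Table 1 of Zhai & Lafferty:
    //   Jelinek-Mercer:     alpha_d = lambda
    //   Dirichlet:          alpha_d = mu / (|d| + mu)
    //   absolute discount:  alpha_d = delta * |d|_u / |d|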
    lm_scores_t alpha_d = {
        LAM,
        MU / (sum_count_w_given_doc + MU),
        DELTA * unique_terms / sum_count_w_given_doc
    };

    // we'll return log(probability)
    lm_scores_t ret = {0.0, 0.0, 0.0};
    for (word_counts_t::const_iterator it_query = query_counts.begin();
            it_query != query_counts.end(); ++it_query)
    {
        // need to compute the probability of the word given the corpus
        // unigram language model. using add one smoothing,
        //   P(word) = (count + 1) / (N words corpus + N words vocab + 1)
        // (denominator: without smoothing it is N words corpus, but the
        // +1 smoothing adds in the N words vocab + an "unknown" word)
        counts_t::const_iterator got_corpus = counts.find(it_query->first);
        uint64_t word_count_corpus = 0;
        if (got_corpus != counts.end())
            word_count_corpus = got_corpus->second.first;
        double p_w_given_C = (word_count_corpus + 1.0) / n2p1;

        // count of the word in the document
        word_counts_t::const_iterator got_doc = doc_counts.find(
            it_query->first);

        double query_count = it_query->second;

        if (got_doc != doc_counts.end())
        {
            // the word is in the document
            double count_w_given_d = got_doc->second;
            double p_ml_w_given_d = count_w_given_d / sum_count_w_given_doc;
            ret.jm += query_count * log(
                (1.0 - LAM) * p_ml_w_given_d + LAM * p_w_given_C);
            ret.dirichlet += query_count * log(
                (count_w_given_d + MU * p_w_given_C) /
                (sum_count_w_given_doc + MU));
            ret.ad += query_count * log(
                std::max(count_w_given_d - DELTA, 0.0) /
                sum_count_w_given_doc + alpha_d.ad * p_w_given_C);
        }
        else
        {
            // the word is not in the doc, so the score = alpha_d * p(w | C)
            ret.jm += query_count * log(alpha_d.jm * p_w_given_C);
            ret.dirichlet += query_count * log(alpha_d.dirichlet * p_w_given_C);
            ret.ad += query_count * log(alpha_d.ad * p_w_given_C);
        }
    }

    return ret;
}

--------------------------------------------------------------------------------
/qdr/ranker.pxd:
--------------------------------------------------------------------------------
cimport cython

from libcpp.vector cimport vector
from libcpp.pair cimport pair
from libcpp.string cimport string
from libcpp.unordered_map cimport unordered_map
from libc.stdint cimport uint64_t

# wrappers for the C++ classes we'll use
cdef extern from "_ranker.cc":
    ctypedef unordered_map[string, pair[uint64_t, uint64_t]] counts_t
    ctypedef vector[string] doc_t
    ctypedef struct scores_t:
        double tfidf
        double bm25
        double lm_jm
        double lm_dirichlet
        double lm_ad
    cdef cppclass QDR:
        QDR(counts_t& counts_in, uint64_t total_docs)
        scores_t score(doc_t& document, doc_t& query) except +
        vector[scores_t] score_batch(
            doc_t& document, vector[doc_t]& queries) except +
        double get_idf(string)

# only need to define C attributes and methods here
cdef class QueryDocumentRelevance:
    cdef QDR *_qdr_ptr

--------------------------------------------------------------------------------
/qdr/ranker.pyx:
--------------------------------------------------------------------------------

# c imports
cimport cython
from ranker cimport *
from .trainer import load_model

cdef class QueryDocumentRelevance:
    def __cinit__(self, counts, total_docs):
        '''
        Load the model and construct the C++ class

        counts: the token -> (word count, document count) map from the corpus
        total_docs: total documents in the corpus
        '''
        self._qdr_ptr = new QDR(counts, total_docs)

    def __dealloc__(self):
        del self._qdr_ptr

    def score(self, document, query):
        '''
        Compute the query-document relevance scores

        document and query are tokenized lists of words
        '''
        # cython will handle the conversion for us...
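        # (documents and queries must be lists of byte strings: Cython
        # converts them to vector[string] for the C++ scorer and converts
        # the returned scores_t struct into a Python dict)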
        return self._qdr_ptr.score(document, query)

    def score_batch(self, document, queries):
        '''
        Compute the query-document relevance scores for a group of queries
        against a single document

        document is a list of tokenized words
        queries is a list of queries, each query is a list of tokenized words
        '''
        return self._qdr_ptr.score_batch(document, queries)

    def get_idf(self, word):
        return self._qdr_ptr.get_idf(word)

    @classmethod
    def load_from_file(cls, inputfile):
        ndocs, counts = load_model(inputfile)
        ret = cls(counts, ndocs)
        return ret

--------------------------------------------------------------------------------
/qdr/trainer.py:
--------------------------------------------------------------------------------
'''
"Train" the query-document relevance model (compute document counts from
a corpus)
'''

import gzip

from contextlib import closing

# the model is written to a gzipped flat text file with the following format:
#
# the first line is the total number of documents
# subsequent lines contain the token, total count and total document count,
# separated by tabs
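# for example (using the counts from test/common.py), a model over three
# documents might look like this uncompressed:
#
#   3
#   the<TAB>4<TAB>3
#   snow<TAB>2<TAB>2
#   ...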

def load_model(inputfile):
    '''
    Return total docs, counts dict
    '''
    with closing(gzip.GzipFile(inputfile, 'r')) as f:
        ndocs = int(f.readline().strip())
        counts = {}
        for line in f:
            word, count1, count2 = line.strip().split('\t')
            counts[word] = [int(count1), int(count2)]
    return ndocs, counts

def write_model(ndocs, counts, outputfile):
    '''Write the model to outputfile'''
    with closing(gzip.GzipFile(outputfile, 'w')) as f:
        f.write("%s\n" % ndocs)
        for word, count in counts.iteritems():
            f.write("%s\t%s\t%s\n" % (word, count[0], count[1]))


class Trainer(object):
    '''
    Compute document counts from a corpus

    To support the TF-IDF, BM25 and language model ranking functions we need:

        a unigram language model (token -> total count in the corpus)
        corpus document counts (token -> total documents in the corpus)
        the total number of docs in the corpus, for TF-IDF
        the average document length (= total words / total docs), for BM25
    '''
    def __init__(self):
        # _counts: word -> [total count, total document count]
        self._counts = {}
        self._total_docs = 0

    def train(self, corpus):
        '''
        Add the documents in corpus to the current model

        corpus = iterator of documents, each document is a list of tokens, e.g.
        corpus = [['words', 'in', 'doc1'], ['document', 'two', 'here']]
        '''
        for doc in corpus:
            if len(doc) > 0:
                self._total_docs += 1
                doc_words = set()
                for word in doc:
                    # first the total count
                    if word not in self._counts:
                        self._counts[word] = [1, 0]
                    else:
                        self._counts[word][0] += 1

                    # now the document count
                    if word not in doc_words:
                        self._counts[word][1] += 1
                        doc_words.add(word)

    def update_counts_from_trained(self, qd):
        '''
        Update the current model with the counts from another model

        qd is another Trainer instance
        '''
        for word, count in qd._counts.iteritems():
            try:
                self._counts[word][0] += count[0]
                self._counts[word][1] += count[1]
            except KeyError:
                self._counts[word] = count
        self._total_docs += qd._total_docs

    def prune(self, min_count, min_doc_count):
        '''
        Remove all words with total count less than min_count
        or document count less than min_doc_count
        '''
        words_to_remove = []
        for word, count in self._counts.iteritems():
            if count[0] < min_count or count[1] < min_doc_count:
                words_to_remove.append(word)

        for word in words_to_remove:
            del self._counts[word]

    @classmethod
    def load_from_file(cls, inputfile):
        ndocs, counts = load_model(inputfile)
        ret = cls()
        ret._total_docs = ndocs
        ret._counts = counts
        return ret

    def serialize_to_file(self, outputfile):
        write_model(self._total_docs, self._counts, outputfile)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nose
coverage
Cython>=0.21

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

ext_modules = [
    Extension(
        "qdr.ranker",
        sources=['qdr/ranker.pyx'],
        extra_compile_args=['-std=c++0x'],
        language="c++"),
]


setup(name='qdr',
    version='0.0',
    description='Query-Document Relevance',
    author='Moz Data Science',
    packages=['qdr'],
    package_dir={'qdr': 'qdr'},
    cmdclass={'build_ext': build_ext},
    ext_modules=ext_modules
)

--------------------------------------------------------------------------------
/test/common.py:
--------------------------------------------------------------------------------

# some test data
corpus = ["he went down to the store".split(),
          "he needed a shovel from the store to shovel the snow".split()]
corpus_update = ["the snow was five feet deep".split()]
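# corpus_unigrams maps token -> [total count, document count] over the
# three documents in corpus + corpus_update; e.g. 'the' appears 4 times
# total, across all 3 documents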
corpus_unigrams = {
    'a': [1, 1],
    'deep': [1, 1],
    'down': [1, 1],
    'feet': [1, 1],
    'five': [1, 1],
    'from': [1, 1],
    'he': [2, 2],
    'needed': [1, 1],
    'shovel': [2, 1],
    'snow': [2, 2],
    'store': [2, 2],
    'the': [4, 3],
    'to': [2, 2],
    'was': [1, 1],
    'went': [1, 1],
}
corpus_ndocs = 3

query = ["buy", "snow", "shovel", "shovel"]
document = ["the", "store", "sells", "snow", "shovel", "snow"]

--------------------------------------------------------------------------------
/test/test_ranker.py:
--------------------------------------------------------------------------------

import unittest
import numpy as np

from qdr import ranker

# some test data
from common import *


class TestRanker(unittest.TestCase):
    def _get_qd(self):
        return ranker.QueryDocumentRelevance(corpus_unigrams, corpus_ndocs)

    def test_idf(self):
        qd = self._get_qd()
        self.assertAlmostEqual(qd.get_idf('deep'), np.log(corpus_ndocs / 1.0))
        self.assertAlmostEqual(qd.get_idf('the'), np.log(corpus_ndocs / 3.0))
        self.assertAlmostEqual(qd.get_idf('not_in_corpus'),
            np.log(corpus_ndocs / 1.0))

    def test_empty(self):
        '''
        Empty queries or documents raise an exception
        '''
        qd = self._get_qd()
        self.assertRaises(ValueError, qd.score, [], query)
        self.assertRaises(ValueError, qd.score, document, [])
        self.assertRaises(ValueError, qd.score, [], [])
        self.assertRaises(ValueError, qd.score_batch, [], [query])
        self.assertRaises(ValueError, qd.score_batch, document, [query, []])
        # but an empty list of queries passed to score_batch just returns
        # an empty list
        self.assertEqual(qd.score_batch(document, []), [])

    def test_tfidf(self):
        qd = self._get_qd()
        computed_score = qd.score(document, query)['tfidf']

        max_query_tf = 2.0
        query_vector = np.array([
            (0.5 + 0.5 / max_query_tf) * qd.get_idf("buy"),
            (0.5 + 0.5 / max_query_tf) * qd.get_idf("snow"),
            (0.5 + 0.5 * 2.0 / max_query_tf) * qd.get_idf("shovel")])

        doc_vector = np.array([0.0,
            2.0 * qd.get_idf("snow"),
            1.0 * qd.get_idf("shovel")])
        doc_length = np.sqrt(np.sum(np.array([
            1.0 * qd.get_idf("the"),
            1.0 * qd.get_idf("store"),
            1.0 * qd.get_idf("sells"),
            2.0 * qd.get_idf("snow"),
            1.0 * qd.get_idf("shovel")]) ** 2))

        expected_score = np.sum(query_vector * doc_vector) / doc_length
        self.assertAlmostEqual(computed_score, expected_score)

    def test_bm25(self):
        qd = self._get_qd()
        computed_score = qd.score(document, query)['bm25']

        # SUM_{t in query} log(N / df[t]) * (k1 + 1) * tf[td] /
        #     (k1 * ((1 - b) + b * (Ld / Lave)) + tf[td])

        k1 = 1.6
        b = 0.75
        Lave = sum([len(ele) for ele in corpus]
            + [len(ele) for ele in corpus_update]) / float(corpus_ndocs)

        score_buy = 0.0  # not in document
        score_snow = np.log(float(corpus_ndocs) / corpus_unigrams['snow'][1]) \
            * (k1 + 1.0) * 2.0 / (k1 * ((1.0 - b) + b * (6.0 / Lave)) + 2.0)
        score_shovel = \
            np.log(float(corpus_ndocs) / corpus_unigrams['shovel'][1]) \
            * (k1 + 1.0) * 1.0 / (k1 * ((1.0 - b) + b * (6.0 / Lave)) + 1.0)
        actual_score = score_buy + score_snow + 2.0 * score_shovel
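        # 'shovel' appears twice in the query, hence the 2.0 * score_shovel
        # factor; its document tf is 1, while 'snow' has document tf 2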
        self.assertAlmostEqual(computed_score, actual_score)

    def test_lm(self):
        qd = self._get_qd()
        computed_scores = qd.score(document, query)

        lam = 0.1
        mu = 2000.0
        delta = 0.7

        jm = 0.0
        dirichlet = 0.0
        ad = 0.0
        sum_w_cwd_doc = float(len(document))
        nwords_corpus = sum(v[0] for v in corpus_unigrams.itervalues())
        n2p1 = len(corpus_unigrams) + nwords_corpus + 1
        for word in query:
            try:
                word_count_corpus = corpus_unigrams[word][0]
            except KeyError:
                word_count_corpus = 0
            corpus_prob = (word_count_corpus + 1.0) / n2p1

            cwd = 0
            for doc_word in document:
                if doc_word == word:
                    cwd += 1

            if cwd == 0:
                # not in the document
                jm += np.log(lam * corpus_prob)
                dirichlet += np.log(mu / (sum_w_cwd_doc + mu) * corpus_prob)
                ad += np.log(
                    delta * len(set(document)) / sum_w_cwd_doc * corpus_prob)
            else:
                jm += np.log(
                    (1.0 - lam) * cwd / sum_w_cwd_doc + lam * corpus_prob)
                dirichlet += np.log(
                    (cwd + mu * corpus_prob) / (sum_w_cwd_doc + mu))
                ad += np.log(
                    max(cwd - delta, 0.0) / sum_w_cwd_doc +
                    delta * len(set(document)) / sum_w_cwd_doc * corpus_prob)

        self.assertAlmostEqual(computed_scores['lm_jm'], jm)
        self.assertAlmostEqual(computed_scores['lm_dirichlet'], dirichlet)
        self.assertAlmostEqual(computed_scores['lm_ad'], ad)

    def test_load_from_file(self):
        import os
        from tempfile import mkstemp
        from qdr.trainer import write_model

        t = mkstemp()
        write_model(corpus_ndocs, corpus_unigrams, t[1])
        qd = ranker.QueryDocumentRelevance.load_from_file(t[1])

        # we'll just check that one of the word counts is correct
        self.assertAlmostEqual(qd.get_idf('the'), np.log(corpus_ndocs / 3.0))

        os.remove(t[1])

    def test_score_batch(self):
        qd = self._get_qd()
        queries = [query, ['buy', 'shovel']]
        scores = qd.score_batch(document, queries)
        # we'll assume that single-query scoring works (it's tested above)...
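        # ...and sanity check that batch scoring agrees with it
        self.assertAlmostEqual(scores[0]['tfidf'],
            qd.score(document, query)['tfidf'])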
        self.assertEqual(len(scores), 2)


if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/test/test_trainer.py:
--------------------------------------------------------------------------------

import os
import unittest

from tempfile import mkstemp

from qdr import trainer

# some test data
from common import *

class TestTrainer(unittest.TestCase):
    def _get_qd(self):
        qd = trainer.Trainer()
        qd.train(corpus)
        qd.train(corpus_update)
        return qd

    def test_train(self):
        qd = self._get_qd()
        self.assertEqual(qd._counts, corpus_unigrams)
        self.assertEqual(qd._total_docs, corpus_ndocs)

    def test_update_from_trained(self):
        qd = trainer.Trainer()
        qd.train(corpus)
        qd2 = trainer.Trainer()
        qd2.train(corpus_update)
        qd.update_counts_from_trained(qd2)

        self.assertEqual(qd._counts, corpus_unigrams)
        self.assertEqual(qd._total_docs, corpus_ndocs)

    def test_serialize(self):
        '''
        We should be able to write out the model then read it back in
        '''
        qd = self._get_qd()
        t = mkstemp()
        qd.serialize_to_file(t[1])

        # load from file and check it
        qd2 = trainer.Trainer.load_from_file(t[1])
        self.assertEqual(qd2._counts, corpus_unigrams)
        self.assertEqual(qd2._total_docs, corpus_ndocs)

        os.unlink(t[1])

    def test_prune(self):
        qd = self._get_qd()
        qd.prune(2, 0)

        self.assertEqual(qd._counts,
            {'he': [2, 2], 'shovel': [2, 1], 'snow': [2, 2], 'store': [2, 2],
             'the': [4, 3], 'to': [2, 2]})

        qd.prune(2, 3)
        self.assertEqual(qd._counts, {'the': [4, 3]})

if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------