├── .gitignore
├── src
│   ├── settings.py
│   ├── util.py
│   ├── api.py
│   ├── rd-plugin.user.js
│   └── recommendations.py
├── nginx.conf
└── README.markdown

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*~
*.pyc
*#
#*
.#*

src/keys

--------------------------------------------------------------------------------
/src/settings.py:
--------------------------------------------------------------------------------
recommendations_per_doi = 5

minhash_rounds = 100 # more rounds mean better recommendations but a longer processing time

data_dir = "/mnt/var/springer-recommendations/"

--------------------------------------------------------------------------------
/nginx.conf:
--------------------------------------------------------------------------------
server {
    listen 80;

    # access_log and error_log take file paths; nginx would treat a bare 'on'
    # as a literal filename, so point them at real log files
    access_log /var/log/nginx/access.log;
    error_log /var/log/nginx/error.log;

    location /api {
        proxy_pass http://127.0.0.1:8000/;
        proxy_redirect off;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
import sys
from datetime import datetime
import functools

def log(name, event):
    sys.stderr.write("%s %s - %s\n" % (datetime.now(), name, event))
    sys.stderr.flush()

def timed(fn):
    """Log when fn starts and finishes"""
    @functools.wraps(fn)
    def wrapped(*args):
        log(fn.func_name, 'started')
        result = fn(*args)
        log(fn.func_name, 'finished')
        return result
    return wrapped

--------------------------------------------------------------------------------
/src/api.py:
--------------------------------------------------------------------------------
"""Quick way to preview results"""

import os

import flask
import ujson
import plyvel

import settings

app = flask.Flask('Springer Recommendations')

recs_db = plyvel.DB(os.path.join(settings.data_dir, 'recs_db'), create_if_missing=True)

def load_recs(filename):
    for line in open(filename, 'r'):
        doi, recs = ujson.loads(line)
        recs_db.put(doi.encode('utf8'), ujson.dumps(recs))

@app.route('/recommendations/<path:doi>') # <path:...> because dois contain slashes
def get_recommendations(doi):
    try:
        recs = ujson.loads(recs_db.get(doi.encode('utf8')))
    except (TypeError, ValueError): # recs_db.get returns None for unknown dois
        recs = []
    return flask.jsonify(recommendations=recs)

if __name__ == '__main__':
    load_recs(os.path.join(settings.data_dir, 'raw_recs'))
    app.run(port=8000)

--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
Generates 'people who read this also read...'-style recommendations for documents based on the [Jaccard similarity](http://en.wikipedia.org/wiki/Jaccard_index) between their readership sets. This code scales to large datasets (around 1B rows) in limited memory by using external sorting and [locality-sensitive hashing](http://en.wikipedia.org/wiki/Locality_sensitive_hashing). Built for [Springer](http://link.springer.com).
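
The Jaccard similarity of two readership sets A and B is `|A ∩ B| / |A ∪ B|`: 1.0 when exactly the same users read both documents, 0.0 when no user read both.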

# Installation (on Ubuntu 11.04)

Install dependencies:

``` bash
sudo apt-get install build-essential python python-dev python-pip ipython git-core
sudo pip install ujson
```

(The optional results-preview API in `src/api.py` additionally requires `flask` and `plyvel`.)

Set up springer-recommendations:

``` bash
git clone git://github.com/jamii/springer-recommendations.git
sudo mkdir -p /mnt/var/springer-recommendations
sudo chown $USER:$USER /mnt/var/springer-recommendations
```

# Operation

The recommendations engine reads a newline-separated list of input filenames on stdin and prints a list of recommendations on stdout.

Each input file should contain a newline-separated list of json-encoded [user, doi] pairs. The user field may be any unique string, e.g. an IP address or a session id.

``` json
["1yud2mlgpalm2cqeyyz0o44n","10.1007\/s10526-004-6592-1"]
["q4lprrkmbr3gpvosjao0dzwm","10.1007\/978-3-540-69934-7_13"]
["3jc2hnohgreyhvlurpg3m1sn","10.1007\/978-3-8348-8229-5_14"]
["uigkldnerjvgghxvjp2ptm0i","10.1007\/s00125-009-1355-2"]
["mmnqkjwawkcz4tqjcxfam4jz","10.1007\/978-3-8274-2313-9_4"]
["e3ie31mmad2epuxno1gpidmx","10.1007\/s10549-012-2108-3"]
["1adokad3mbbg0aaexcl1yb3a","10.1007\/978-3-8349-6622-3_5"]
["fzfrjqgnizgprfxstcal12fu","10.1007\/978-3-7643-8777-8_1"]
["ihcnriijo040rchrgbytvlpg","10.1007\/BF00309663"]
```

For each DOI in the logs, the output contains one line listing related DOIs together with their Jaccard similarity to that DOI.

``` json
["10.2478\/s11532-009-0129-5",[["10.1007\/978-1-61737-985-7_11",0.24],["10.1007\/BF01011432",0.56],["10.1007\/BF01524716",0.11],["10.1007\/BF02458601",0.87],["10.1007\/s002140050205",0.97]]]
["10.2478\/s11532-010-0087-y",[["10.1007\/BF02660070",1.0],["10.1007\/BF02988680",1.0],["10.1007\/s00709-010-0225-6",1.0],["10.1007\/s00709-010-0233-6",1.0],["10.1023\/A:1022137619834",1.0]]]
["10.2478\/s11534-010-0072-2",[["10.2478\/s11534-011-0014-7",1.0]]]
["10.2478\/s11534-011-0014-7",[["10.2478\/s11534-010-0072-2",1.0]]]
["10.2478\/s11535-011-0006-z",[["10.1007\/BF02532915",1.0],["10.1023\/A:1013623806248",1.0],["10.1134\/S1019331608020019",1.0]]]
```

Example usage:

``` bash
find /mnt/var/springer-recommendations/logs-*.json | nohup python springer-recommendations/src/recommendations.py > recommendations.json 2> recommendations.log &
```
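
If your raw access logs are not already in this shape, a few lines of Python are enough to convert them. A minimal sketch, assuming tab-separated "user, doi" lines on stdin (that input format is an assumption, not something this repository ships; adapt the parsing to your logs):

``` python
import sys

import ujson

# Read hypothetical "user<TAB>doi" lines on stdin and emit the
# newline-separated json-encoded [user, doi] pairs the engine expects.
for line in sys.stdin:
    user, doi = line.rstrip('\n').split('\t')
    sys.stdout.write('%s\n' % ujson.dumps([user, doi]))
```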
--------------------------------------------------------------------------------
/src/rd-plugin.user.js:
--------------------------------------------------------------------------------
// ==UserScript==
// @name          Recommendations plugin for rd.springer.com
// @namespace     http://scattered-thoughts.net
// @require       http://ajax.googleapis.com/ajax/libs/jquery/1.5.1/jquery.min.js
// @include       http*://rd.springer.com/article/*
// ==/UserScript==

// This is just a quick hack for previewing results. Not well tested.

$(
    function () {
        var api_key = null; // get your own :-P

        var expander = $('#abstract-related').clone(true, true);
        expander[0].id = 'abstract-recommendations';
        expander.find('h2')[0].textContent = 'Recommendations (loading)';
        expander.find('.expander-content').empty();
        expander.prependTo($('.document-aside'));

        // copied from minimized springer source
        var toggleExpander = function (_) {
            var b = $(expander).closest(".expander");
            if ($(".expander-content", b).is(":visible")) {
                b.removeClass("expander-open").find(".expander-content").slideUp();
            } else {
                b.addClass("expander-open").find(".expander-content").slideDown();
                if ($("#pub-date-graph").length) {
                    var d = new SearchResultsGraph();
                    d.init();
                }
            }
        };
        // end of copy

        expander.find('.expander-title').click(toggleExpander);

        var getTitle = function (doi, cont) {
            var url = 'http://springer.api.mashery.com/metadata/json?api_key=' + api_key + '&q=' + encodeURIComponent('doi:' + doi);
            GM_xmlhttpRequest({
                method: 'GET',
                url: url,
                onload: function (response) {
                    var title = $.parseJSON(response.responseText).records[0].title;
                    cont(title);
                }
            });
        };

        expander.append('<ol></ol>');
        var ol = expander.find('ol');
        var addRecommendation = function (doi, score) {
            var url = 'http://rd.springer.com/article/' + doi;
            var a = '<a href="' + url + '"></a>'; // anchor text is filled in by getTitle below
            var span = ' (score ' + score.toFixed(3) + ')';
            var li = $('<li>' + a + span + '</li>');
            ol.append(li);
            getTitle(doi,
                function (title) {
                    li.find('a')[0].textContent = title;
                });
        };

        var doi = document.getElementById('abstract-about-doi').textContent;
        var url = 'http://ec2-107-20-105-237.compute-1.amazonaws.com/api/recommendations/' + doi;
        GM_xmlhttpRequest({
            method: 'GET',
            url: url,
            onload: function (response) {
                var recommendations = $.parseJSON(response.responseText).recommendations;
                // sort by score, highest first (the default array sort would
                // compare the stringified [doi, score] rows instead)
                recommendations.sort(function (row1, row2) { return row2[1] - row1[1]; });

                $.each(recommendations,
                    function (_, row) {
                        addRecommendation(row[0], row[1]);
                    });

                if (recommendations.length > 0) {
                    expander.removeClass("expander-empty");
                } else {
                    expander.addClass("expander-empty");
                }

                expander.find('h2')[0].textContent = 'Recommendations (' + recommendations.length + ')';
            }
        });
    }
);

--------------------------------------------------------------------------------
/src/recommendations.py:
--------------------------------------------------------------------------------
"""Fast, scalable item-item recommendations based on Das, Abhinandan S., et al. "Google news personalization: scalable online collaborative filtering." Proceedings of the 16th international conference on World Wide Web. ACM, 2007."""

import os
import sys
import shutil
import subprocess
import tempfile
import itertools
import random
import operator
from array import array

import ujson

import util
import settings

# Keep an explicit reference to every stash so its temporary file isn't deleted
# (NamedTemporaryFile unlinks the file when the object is garbage-collected)
# while a lazy itertools pipeline may still need to read from it.
stashes = []

class stash(object):
    """On-disk cache of a list of rows"""
    def __init__(self, rows=[]):
        stashes.append(self)
        self.file = tempfile.NamedTemporaryFile(dir=settings.data_dir)
        self.name = self.file.name
        dumps = ujson.dumps # don't want to do this lookup inside the loop below
        self.file.writelines(("%s\n" % dumps(row) for row in rows))
        self.file.flush()

    @staticmethod
    def sorted(rows):
        """A sorted, de-duped stash"""
        if isinstance(rows, stash):
            in_stash = rows
        else:
            in_stash = stash(rows)
        out_stash = stash()
        subprocess.check_call(['sort', '-T', settings.data_dir, '-S', '80%', '-u', in_stash.name, '-o', out_stash.name])
        return out_stash

    @staticmethod
    def from_file(file):
        out_stash = stash()
        out_stash.file = file
        out_stash.name = file.name # keep name in sync so stash.sorted works on this stash too
        return out_stash

    def __iter__(self):
        self.file.seek(0) # always iterate from the start
        return itertools.imap(ujson.loads, self.file)

    def __len__(self):
        result = subprocess.check_output(['wc', '-l', self.file.name])
        count, _ = result.split()
        return int(count)

    def save_as(self, name):
        shutil.copy(self.file.name, os.path.join(settings.data_dir, name))
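
# A quick illustration of stash (the values here are made up, not part of the
# pipeline): rows survive a round-trip through json, and stash.sorted dedupes.
#
#   s = stash([['b', 2], ['a', 1], ['a', 1]])
#   list(stash.sorted(s)) # => [['a', 1], ['b', 2]]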

class priority_queues(object):
    """A number of fixed-size priority queues packed together for low memory usage"""
    def __init__(self, num_queues, max_size):
        self.num_queues = num_queues
        self.max_size = max_size
        # entries are doi indices, so store them in an integer array (a float
        # array would silently lose precision for indices above 2**24);
        # -1 marks an empty slot
        self.entries = array('i', itertools.repeat(-1, num_queues * max_size))
        self.priorities = array('f', itertools.repeat(0.0, num_queues * max_size))

    def insert(self, queue_index, entry, priority):
        """Insert an entry into the queue at the given index. Requires entry >= 0, priority >= 0.0"""
        for i in xrange(queue_index * self.max_size, (queue_index + 1) * self.max_size):
            if self.entries[i] == entry:
                break
            elif priority > self.priorities[i]:
                # bubble the new entry in, pushing lower-priority entries down
                self.entries[i], entry = entry, self.entries[i]
                self.priorities[i], priority = priority, self.priorities[i]

    def __iter__(self):
        for queue_index in xrange(0, self.num_queues):
            for entry_index in xrange(0, self.max_size):
                i = (queue_index * self.max_size) + entry_index
                entry = self.entries[i]
                priority = self.priorities[i]
                if entry >= 0 and priority > 0:
                    yield [queue_index, entry, priority]

def grouped(rows):
    """Group rows by their first column"""
    return itertools.groupby(rows, operator.itemgetter(0))

def numbered(rows, labels):
    """For each row, replace the first column by its index in labels. Assumes both rows and labels are sorted. Modifies each row in place and yields it."""
    labels = iter(labels)
    label = labels.next()
    index = 0
    for row in rows:
        while label != row[0]:
            label = labels.next()
            index += 1
        row[0] = index
        yield row

def unnumber(rows, labels, column=0):
    """For each row, look up column as an index in labels. Assumes both rows and labels are sorted. Modifies rows in place."""
    labels = iter(labels)
    label = labels.next()
    index = 0
    for row in rows:
        while index != row[column]:
            label = labels.next()
            index += 1
        row[column] = label

@util.timed
def preprocess(raw_edges):
    """Replace string dois and users by integer indices for a more compact representation later"""
    util.log('preprocess', 'copying input')
    raw_edges = stash(raw_edges)

    util.log('preprocess', 'collating')
    raw_users = stash.sorted((user for user, doi in raw_edges))
    raw_dois = stash.sorted((doi for user, doi in raw_edges))

    util.log('preprocess', 'labelling')
    edges = raw_edges
    edges = numbered(stash.sorted(edges), raw_users)
    edges = ((doi, user) for user, doi in edges)
    edges = numbered(stash.sorted(edges), raw_dois)
    edges = stash(edges)

    return raw_dois, edges

def jaccard_similarity(users1, users2):
    """Jaccard similarity between two sets represented as sorted arrays of integers. See http://en.wikipedia.org/wiki/Jaccard_index"""
    intersection = 0
    difference = 0
    i = 0
    j = 0
    while (i < len(users1)) and (j < len(users2)):
        if users1[i] < users2[j]:
            difference += 1
            i += 1
        elif users1[i] > users2[j]:
            difference += 1
            j += 1
        else:
            intersection += 1
            i += 1
            j += 1
    difference += (len(users1) - i) + (len(users2) - j)
    return float(intersection) / (float(intersection) + float(difference))
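
# Worked example: users1 = [1, 2, 3] and users2 = [2, 3, 4] share 2 users and
# differ in 2, so jaccard_similarity returns 2.0 / (2 + 2) = 0.5.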

@util.timed
def minhash_round(buckets):
    """Probabilistic algorithm for finding edges with high jaccard scores (see http://en.wikipedia.org/wiki/MinHash). Modifies buckets in-place."""
    seed = random.getrandbits(64)
    util.log('minhash_round', 'hashing into buckets')
    for bucket in buckets:
        users = bucket[3]
        bucket[0] = min((hash((seed, user, seed)) for user in users)) # minhash
        bucket[1] = random.random() # prevents bias towards adjacent dois caused by sorting
    util.log('minhash_round', 'sorting buckets')
    buckets.sort()
    util.log('minhash_round', 'checking scores')
    # the probability that two dois share a minhash in a given round equals
    # their jaccard similarity, so similar dois tend to land adjacent after the
    # sort and only neighbouring pairs need their exact score checked
    for (_, _, doi1, users1), (_, _, doi2, users2) in itertools.izip(buckets, buckets[1:]):
        score = jaccard_similarity(users1, users2)
        yield doi1, doi2, score

@util.timed
def recommendations(edges, num_dois):
    """For each doi in edges, try to find the nearest settings.recommendations_per_doi dois by jaccard similarity using minhashing"""
    # list of [minhash, random, doi, users]
    buckets = [[0, 0, doi, array('I', sorted(user for _, user in group))] for doi, group in grouped(edges)]

    # store a priority queue of recommendations per doi
    recs = priority_queues(num_queues = num_dois, max_size = settings.recommendations_per_doi)

    for round in xrange(0, settings.minhash_rounds):
        for doi1, doi2, score in minhash_round(buckets):
            recs.insert(doi1, doi2, score)
            recs.insert(doi2, doi1, score)

    return recs

@util.timed
def postprocess(raw_dois, recs):
    """Turn integer doi indices back into strings"""
    recs.sort(key=operator.itemgetter(1))
    unnumber(recs, raw_dois, column=1)
    recs.sort(key=operator.itemgetter(0))
    unnumber(recs, raw_dois, column=0)
    return stash(((doi, [(rec, score) for (_, rec, score) in group]) for doi, group in grouped(recs)))

def main():
    raw_edges = itertools.chain.from_iterable((stash.from_file(open(dump_filename.rstrip())) for dump_filename in sys.stdin.readlines()))
    # raw_edges = itertools.islice(raw_edges, 1000) # for quick testing
    raw_dois, edges = preprocess(raw_edges)
    util.log('main', '%i unique edges' % len(edges))
    recs = list(recommendations(edges, len(raw_dois)))
    raw_recs = postprocess(raw_dois, recs)
    sys.stdout.writelines(("%s\n" % ujson.dumps(row) for row in raw_recs))
    sys.stdout.flush()

if __name__ == '__main__':
    # import cProfile
    # cProfile.run('main()', 'prof')
    main()
--------------------------------------------------------------------------------