├── .gitignore ├── comments.pyc ├── requirements.txt ├── comments.py ├── README.md ├── mongo.py ├── kNN.py └── dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/* 2 | *.pyc 3 | -------------------------------------------------------------------------------- /comments.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicx24/SubredditRecommendationEngine/HEAD/comments.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | multiprocessing==2.6.2.1 2 | praw==2.1.20 3 | pymongo==2.8 4 | requests==2.5.3 5 | six==1.9.0 6 | update-checker==0.11 7 | wsgiref==0.1.2 8 | -------------------------------------------------------------------------------- /comments.py: -------------------------------------------------------------------------------- 1 | #Pass this file a username, and it'll return a list of subreddits that user has commented in. 2 | 3 | import praw 4 | from praw.handlers import MultiprocessHandler 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A recommendation system for subreddits using the K Nearest Neighbors algorithm between users and subscribed subreddits. 2 | 3 | Coming soon: 4 | 5 | A load balancing mechanism using a Gearman Job Server to distribute jobs among a group of nodes. 6 | 7 | The kNN algorithm, but multithreaded. 8 | 9 | A web API. 10 | 11 | A second option to get recommended subreddits using Jaccard Similarity. 12 | 13 | Blog Posts. 
# ------------------------------------------------------------------------------
# /mongo.py:
# ------------------------------------------------------------------------------
"""MongoDB persistence helpers for the subreddit recommendation engine.

Every function receives an already-connected client object and works on
the ``Reddit`` database, which holds three collections:

* ``users``      -- ``{'username', 'subreddits', 'updated'}``
* ``subreddits`` -- ``{'name', 'updated'}``
* ``temp``       -- scratch documents written by ``tempBulkInsert``

The calls used here (``Collection.update`` / ``Collection.insert``) are the
PyMongo 2.x API that requirements.txt pins (pymongo==2.8).
"""
from pymongo import *
import datetime


def insertUser(username, subreddits, client):
    """Create or replace the stored document for ``username``.

    Args:
        username: name used as the lookup key.
        subreddits: list of subreddit names to store for the user.
        client: connected Mongo client.

    Returns:
        Whatever ``Collection.update`` returns (the write result, not a
        document id).
    """
    user = {
        'username': username,
        'subreddits': subreddits,
        'updated': datetime.datetime.utcnow(),
    }
    # Full-document replacement with upsert: inserts on first sight,
    # overwrites every field afterwards.
    return client.Reddit.users.update({'username': username}, user,
                                      upsert=True)


def insertSub(sub, client):
    """Create or replace the stored document for the subreddit ``sub``.

    Args:
        sub: subreddit name used as the lookup key.
        client: connected Mongo client.

    Returns:
        Whatever ``Collection.update`` returns (the write result).
    """
    document = {
        'name': sub,
        'updated': datetime.datetime.utcnow(),
    }
    return client.Reddit.subreddits.update({'name': sub}, document,
                                           upsert=True)


def queryUser(username, client):
    """Return the ``users`` document for ``username``, or None if absent."""
    return client.Reddit.users.find_one({'username': username})


def update(username, subreddits, client):
    """Overwrite only the ``subreddits`` field of an existing user.

    The ``updated`` timestamp is left untouched and no document is created
    when the user does not exist (no upsert).
    """
    client.Reddit.users.update({'username': username},
                               {"$set": {'subreddits': subreddits}})


def subreddits(client):
    """Return a cursor over every document in the ``subreddits`` collection."""
    return client.Reddit.subreddits.find()


def allUsersInArray(userArray, client):
    """Return a cursor over the users whose username is in ``userArray``."""
    return client.Reddit.users.find({'username': {"$in": userArray}})


def tempUserList(client):
    """Return a cursor over every document in the ``temp`` collection."""
    return client.Reddit.temp.find()


def allUsers(client):
    """Return a cursor over every document in the ``users`` collection."""
    return client.Reddit.users.find()


def tempBulkInsert(users, client):
    """Insert the documents in ``users`` into the ``temp`` collection.

    Args:
        users: list of documents to insert.
        client: connected Mongo client.

    Returns:
        The result of ``Collection.insert``, or an empty list when there
        is nothing to insert.
    """
    # PyMongo refuses an empty bulk insert (InvalidOperation), so an empty
    # batch is treated as a no-op instead of crashing the caller.
    if not users:
        return []
    return client.Reddit.temp.insert(users)
# ------------------------------------------------------------------------------
# /kNN.py:
# ------------------------------------------------------------------------------
#! /usr/bin/env python
"""k-nearest-neighbours subreddit recommendation.

Each user is represented as a vector with one slot per known subreddit
(10 when the user has commented there, 0 otherwise).  Users are compared
by Euclidean distance, and the subreddit most common among the nearest
users -- that the target user has not already commented in -- is
recommended.
"""

import heapq
import math
import operator
from collections import Counter

from mongo import *
from pymongo import *
import dataset
import numpy as np

# Value stored in a vector slot when the user has commented in that subreddit.
_MEMBERSHIP_WEIGHT = 10


def _requireUser(username, client):
    """Return the stored document for ``username`` or raise ValueError."""
    user = queryUser(username, client)
    if user is None:
        raise ValueError("no stored data for user %r" % (username,))
    return user


def _subredditNames(client):
    """Return the names of all known subreddits, in cursor order."""
    return [sub['name'] for sub in subreddits(client)]


def _vectorFromSubs(user_subs, sub_names):
    """Build the membership vector of ``user_subs`` over ``sub_names``."""
    member = set(user_subs)
    return [_MEMBERSHIP_WEIGHT if name in member else 0 for name in sub_names]


def createUserVector(username, client=None):
    """Return the membership vector of ``username`` over all known subreddits.

    Args:
        username: user to look up.
        client: optional connected Mongo client; a new one is created when
            omitted (the original behaviour).

    Returns:
        A list with one entry per known subreddit: 10 if the user has
        commented there, else 0.

    Raises:
        ValueError: if the user is not stored.
    """
    if client is None:
        client = MongoClient()
    user = _requireUser(username, client)
    return _vectorFromSubs(user['subreddits'], _subredditNames(client))


def vectorDistance(user1, user2):
    """Return the Euclidean distance between the vectors of two users."""
    # One connection for both lookups instead of one per vector.
    client = MongoClient()
    vector1 = np.array(createUserVector(user1, client))
    vector2 = np.array(createUserVector(user2, client))
    return np.linalg.norm(vector1 - vector2)


def getNeighbors(username, k):
    """Return the ``k`` users closest to ``username``.

    Every stored user is ranked; the target user is excluded so that it
    does not occupy a neighbour slot with a distance of zero.

    Args:
        username: user whose neighbours are wanted.
        k: maximum number of neighbours to return.

    Returns:
        A list of ``(username, distance)`` tuples sorted by ascending
        distance, at most ``k`` long.

    Raises:
        ValueError: if ``username`` is not stored.
    """
    client = MongoClient()
    target = _requireUser(username, client)
    # The subreddit list is loop-invariant: read it once, not once per user.
    sub_names = _subredditNames(client)
    target_vector = np.array(_vectorFromSubs(target['subreddits'], sub_names))

    distances = []
    for user in allUsers(client):
        if user['username'] == username:
            continue
        vector = np.array(_vectorFromSubs(user['subreddits'], sub_names))
        distances.append((user['username'],
                          np.linalg.norm(target_vector - vector)))
    return heapq.nsmallest(k, distances, key=operator.itemgetter(1))


def getRecommendedSubreddit(username, k=70):
    """Return the subreddit most common among the ``k`` nearest users.

    Subreddits the target user has already commented in are never
    recommended.

    Args:
        username: user to recommend for.
        k: number of neighbours to consult (defaults to 70).

    Returns:
        A subreddit name, or None when the neighbours offer nothing new.

    Raises:
        ValueError: if ``username`` is not stored.
    """
    client = MongoClient()
    banned = set(_requireUser(username, client)['subreddits'])
    neighbors = getNeighbors(username, k)
    users = allUsersInArray([name for name, _ in neighbors], client)
    frequency = Counter(
        sub
        for user in users
        for sub in user['subreddits']
        if sub not in banned
    )
    if not frequency:
        return None
    return frequency.most_common(1)[0][0]


def main(username):
    """Refresh the stored data for ``username`` and return a recommendation."""
    dataset.getComments(username)
    return getRecommendedSubreddit(username)


if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    print(main(read_line()))
# ------------------------------------------------------------------------------
# /dataset.py:
# ------------------------------------------------------------------------------
"""Crawler that fills MongoDB with users and the subreddits they comment in.

``main`` first collects recent commenters from the seed subreddits into the
``temp`` collection, then fetches each of those users' recent comments and
stores the distinct subreddits per user.
"""
from multiprocessing import *
import praw
from praw.handlers import MultiprocessHandler
from mongo import *
from pymongo import *
import datetime
import sys
import requests

USER_AGENT = "kNN Subreddit Recommendation Engine"
# Number of recent comments requested per subreddit / per user.
COMMENT_LIMIT = 250


def _makeReddit():
    """Return a PRAW session that uses the multiprocess request handler."""
    return praw.Reddit(user_agent=USER_AGENT, handler=MultiprocessHandler())


def getSubredditUsers(subreddit):
    """
    Get the commentors in a subreddit.

    Authors who are not yet in the ``users`` collection are written to the
    ``temp`` collection as ``{'user': name}``, one document per author.

    Returns the result of ``tempBulkInsert``, or an empty list when no new
    author was found.
    """
    client = MongoClient()
    comments = _makeReddit().get_subreddit(subreddit).get_comments(
        limit=COMMENT_LIMIT)
    # A set gives O(1) membership tests and lets us skip authors already
    # queued earlier in this same batch.
    known = set(user['username'] for user in allUsers(client))
    users = []
    for comment in comments:
        author = comment.author
        if author is None:
            # Comments from deleted accounts carry no author object.
            continue
        if author.name in known:
            continue
        known.add(author.name)
        users.append({'user': author.name})
    if not users:
        # An empty bulk insert is rejected by PyMongo; nothing to do.
        return []
    return tempBulkInsert(users, client)


def getComments(username):
    """
    Store the subreddits a user has commented in.

    Every distinct subreddit is upserted into ``subreddits`` and the user
    document is created or replaced.

    Returns the result of ``insertUser``, or None when reddit answered with
    an HTTP error (the error is printed and the user is skipped so one bad
    account does not abort a whole pool run).
    """
    try:
        client = MongoClient()
        user = _makeReddit().get_redditor(username)
        subs = []
        for comment in user.get_comments(limit=COMMENT_LIMIT):
            name = comment.subreddit.display_name
            if name not in subs:
                subs.append(name)
                insertSub(name, client)
        return insertUser(username, subs, client)
    except requests.exceptions.HTTPError as e:
        print(e)
        return None


def getSubreddits():
    """Return the seed subreddits whose commenters are crawled."""
    # TODO: eventually use every subreddit with over 10,000 users
    # (e.g. read them back with subreddits(MongoClient())).
    return ['all']


def cron(user):
    """Re-fetch ``user``'s comments when the stored data is a day old or more.

    ``user`` is a document from the ``users`` collection.  Returns the
    result of ``getComments`` when a refresh happened, otherwise None.
    """
    if abs(datetime.datetime.utcnow() - user['updated']).days >= 1:
        return getComments(user['username'])
    return None


def main():
    """Crawl the seed subreddits, then every newly discovered user."""
    # Created outside the try block so the handler below can always see it.
    pool = Pool(processes=(cpu_count() * 6))
    try:
        subs = getSubreddits()
        pool.map(getSubredditUsers, subs)
        users = [user['user'] for user in tempUserList(MongoClient())]
        pool.map(getComments, users)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit()


if __name__ == "__main__":
    main()
# ------------------------------------------------------------------------------