├── socialcrawl ├── __init__.py ├── clients │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_facebook.py │ │ ├── test_twitter.py │ │ ├── test_cache.py │ │ └── test_crawler.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ ├── twitter.py │ │ │ └── facebook.py │ ├── exceptions.py │ ├── crawler.py │ ├── twitter.py │ ├── facebook.py │ └── cache.py ├── networks │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_models.py │ │ └── test_views.py │ ├── urls.py │ ├── tasks.py │ ├── api.py │ ├── views.py │ └── models.py ├── auth_sample.py ├── urls.py └── settings.py ├── test_requirements.txt ├── .gitignore ├── requirements.txt ├── manage.py ├── LICENSE └── README.md /socialcrawl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialcrawl/clients/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialcrawl/networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialcrawl/clients/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialcrawl/networks/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialcrawl/clients/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | nose 2 | django-nose 3 | -------------------------------------------------------------------------------- /socialcrawl/clients/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | auth.py 3 | <<<<<<< HEAD 4 | data.db 5 | ======= 6 | >>>>>>> master 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | twython==3.0 2 | facepy==0.8 3 | django==1.5 4 | django-celery==3.0.21 5 | -------------------------------------------------------------------------------- /socialcrawl/auth_sample.py: -------------------------------------------------------------------------------- 1 | """Authentication tokens for twitter and facebook""" 2 | TWITTER_ACCESS_TOKEN = "MyTwitterAccessToken" 3 | FACEBOOK_ACCESS_TOKEN = "MyFacebookAccessToken" 4 | -------------------------------------------------------------------------------- /socialcrawl/clients/exceptions.py: -------------------------------------------------------------------------------- 1 | """Social APIs exception classes""" 2 | 3 | 4 | class AuthError(Exception): 5 | pass 6 | 7 | 8 | class ProfileNotFound(Exception): 9 | pass 10 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "socialcrawl.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /socialcrawl/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import patterns, include, url 2 | 3 | from django.contrib import admin 4 | admin.autodiscover() 5 | 6 | urlpatterns = patterns('', 7 | url(r'^api/', include('socialcrawl.networks.urls')), 8 | url(r'^admin/', include(admin.site.urls)), 9 | ) 10 | 11 | handler404 = 'socialcrawl.networks.views.notfound' 12 | -------------------------------------------------------------------------------- /socialcrawl/clients/crawler.py: -------------------------------------------------------------------------------- 1 | from socialcrawl.clients.cache import CachedClient 2 | from socialcrawl.clients.twitter import TwitterClient 3 | from socialcrawl.clients.facebook import FacebookClient 4 | 5 | 6 | class CachedTwitterClient(TwitterClient, CachedClient): 7 | pass 8 | 9 | 10 | class CachedFacebookClient(FacebookClient, CachedClient): 11 | pass 12 | -------------------------------------------------------------------------------- /socialcrawl/networks/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import patterns, url 2 | 3 | from socialcrawl.networks import views 4 | 5 | urlpatterns = patterns('', 6 | url(r'^v1/networks$', views.networks, name='networks'), 7 | url(r'^v1/profiles/(?P\w+)$', views.profiles, name='profiles'), 8 | url(r'^v1/profiles/(?P\w+)/(?P[\w.]+)$', views.profiles, name='profiles'), 9 | ) 10 | -------------------------------------------------------------------------------- /socialcrawl/networks/tasks.py: -------------------------------------------------------------------------------- 1 | from celery import task 2 | 3 | from socialcrawl.clients.crawler import CachedTwitterClient, CachedFacebookClient 4 | 5 | CLIENTS = { 6 | 'twitter': CachedTwitterClient(), 7 | 'facebook': CachedFacebookClient(), 8 | } 9 | 10 | 11 | @task() 12 | def fetch(username, network): 13 | data = CLIENTS[network].fetch_profile(username) 14 | CLIENTS[network].save_profile(username, data) 15 | -------------------------------------------------------------------------------- /socialcrawl/clients/management/commands/twitter.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand, CommandError 2 | from socialcrawl.clients.crawler import CachedTwitterClient 3 | 4 | 5 | class Command(BaseCommand): 6 | help = 'runs your code in the django environment' 7 | 8 | def handle(self, *args, **options): 9 | if len(args) != 1: 10 | raise CommandError('You need to specify a username') 11 | client = CachedTwitterClient() 12 | self.stdout.write(str(client.get_profile(args[0]))) 13 | -------------------------------------------------------------------------------- /socialcrawl/clients/management/commands/facebook.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand, CommandError 2 | from socialcrawl.clients.crawler import CachedFacebookClient 3 | 4 | 5 | class Command(BaseCommand): 6 | help = 'runs your code in the django environment' 7 | 8 | def handle(self, *args, **options): 9 | if len(args) != 1: 10 | raise CommandError('You need to specify a username') 11 | client = CachedFacebookClient() 12 | self.stdout.write(str(client.get_profile(args[0]))) 13 | -------------------------------------------------------------------------------- /socialcrawl/networks/api.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from django.db import models 4 | from django.http import HttpResponse 5 | 6 | CONTENT_TYPE = 'application/json' 7 | 8 | 9 | class APIResponse(HttpResponse): 10 | 11 | def __init__(self, data, status=200): 12 | if isinstance(data, models.query.QuerySet): 13 | blob = json.dumps([obj.hydrate() for obj in data]) 14 | elif isinstance(data, models.Model): 15 | blob = json.dumps(data.hydrate()) 16 | else: 17 | blob = json.dumps(data) 18 | super(APIResponse, self).__init__(blob, 19 | status=status, 20 | content_type=CONTENT_TYPE) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Miquel Torres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /socialcrawl/networks/tests/test_models.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | from django.core.exceptions import ValidationError 3 | 4 | from socialcrawl.networks.models import Profile 5 | 6 | 7 | class TestModels(TestCase): 8 | 9 | def test_required_fields(self): 10 | """Should create a Profile entry when all required fields are given""" 11 | p = Profile(username='twitter', network='T') 12 | p.save() 13 | p = Profile.objects.get(username='twitter') 14 | self.assertEqual(p.username, 'twitter') 15 | self.assertEqual(p.network, 'T') 16 | self.assertEqual(p.name, u'') 17 | 18 | def test_missing_username(self): 19 | """Should raise ValidationError when creating a profile without username""" 20 | p = Profile(network='T') 21 | self.assertRaises(ValidationError, p.save) 22 | self.assertEqual(0, len(Profile.objects.all())) 23 | 24 | def test_missing_network(self): 25 | """Should raise ValidationError when creating a profile without network""" 26 | p = Profile(username='twitter') 27 | self.assertRaises(ValidationError, p.save) 28 | self.assertEqual(0, len(Profile.objects.all())) 29 | -------------------------------------------------------------------------------- /socialcrawl/clients/twitter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import twython 4 | 5 | from socialcrawl.auth import TWITTER_ACCESS_TOKEN 6 | from socialcrawl.clients.exceptions import AuthError, ProfileNotFound 7 | 8 | 9 | class TwitterClient(object): 10 | 11 | def __init__(self, access_token=TWITTER_ACCESS_TOKEN): 12 | self.client = twython.Twython(access_token=access_token) 13 | self.network = 'T' 14 | 15 | def fetch_profile(self, username): 16 | """Fetches given profile from Twitter""" 17 | try: 18 | user = self.client.show_user(screen_name=username) 19 | except twython.exceptions.TwythonError as e: 20 | if e.error_code == 401: 21 | raise AuthError(e) 22 | elif e.error_code == 404: 23 | raise ProfileNotFound 24 | else: 25 | raise e 26 | return { 27 | 'name': user['name'], 28 | 'description': user['description'] or '', 29 | 'popularity': user['followers_count'] 30 | } 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("username") 36 | args = parser.parse_args() 37 | client = TwitterClient() 38 | print client.fetch_profile(args.username) 39 | -------------------------------------------------------------------------------- /socialcrawl/networks/views.py: -------------------------------------------------------------------------------- 1 | from socialcrawl.networks.tasks import fetch 2 | from socialcrawl.networks.models import Profile 3 | from socialcrawl.networks.api import APIResponse 4 | 5 | SUPPORTED_NETWORKS = [n[1].lower() for n in Profile.SOCIAL_NETWORKS] 6 | 7 | 8 | def notfound(request): 9 | """JSONified 404""" 10 | return APIResponse({'error': 'Not Found'}, status=404) 11 | 12 | 13 | def networks(request): 14 | """Returns list of supported social networks""" 15 | data = [{'name': n} for n in SUPPORTED_NETWORKS] 16 | return APIResponse(data) 17 | 18 | 19 | def profiles(request, network, username=None): 20 | """Returns list or detail view for social profiles""" 21 | if network not in SUPPORTED_NETWORKS: 22 | return APIResponse({'error': 'Network Not Supported'}, status=404) 23 | else: 24 | network_key = network[0].upper() 25 | if username: 26 | try: 27 | data = Profile.objects.get(username=username, network=network_key) 28 | except Profile.DoesNotExist: 29 | fetch.delay(username, network) 30 | return APIResponse({'status': 'processing'}, 202) 31 | return APIResponse(data) 32 | else: 33 | data = Profile.objects.filter(network=network_key) 34 | return APIResponse(data) 35 | -------------------------------------------------------------------------------- /socialcrawl/networks/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.core.exceptions import ValidationError 3 | 4 | 5 | class Profile(models.Model): 6 | SOCIAL_NETWORKS = ( 7 | ('T', 'Twitter'), 8 | ('F', 'Facebook'), 9 | ) 10 | username = models.CharField(max_length=100, blank=False) 11 | name = models.CharField(max_length=100) 12 | description = models.CharField(max_length=200) 13 | popularity = models.IntegerField(null=True) 14 | updated = models.DateTimeField(auto_now=True) 15 | network = models.CharField(max_length=1, 16 | blank=False, 17 | choices=SOCIAL_NETWORKS) 18 | 19 | def clean(self): 20 | if not self.username: 21 | raise ValidationError('No username defined') 22 | elif not self.network: 23 | raise ValidationError('No network defined') 24 | 25 | def save(self, *args, **kwargs): 26 | self.clean() 27 | super(Profile, self).save(*args, **kwargs) 28 | 29 | class Meta: 30 | unique_together = ('username', 'network') 31 | 32 | def hydrate(self): 33 | return { 34 | 'username': self.username, 35 | 'name': self.name, 36 | 'description': self.description, 37 | 'popularity': self.popularity, 38 | } 39 | -------------------------------------------------------------------------------- /socialcrawl/clients/tests/test_facebook.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from socialcrawl.auth import FACEBOOK_ACCESS_TOKEN 4 | from socialcrawl.clients.exceptions import AuthError, ProfileNotFound 5 | from socialcrawl.clients.facebook import FacebookClient 6 | 7 | 8 | class TestFacebook(TestCase): 9 | 10 | def setUp(self): 11 | """Sets up the facebook client""" 12 | self.client = FacebookClient(FACEBOOK_ACCESS_TOKEN) 13 | 14 | def test_bad_access_token(self): 15 | """Should raise AuthError when facebook access token is wrong""" 16 | client = FacebookClient('bad_access_token') 17 | self.assertRaises(AuthError, client.fetch_profile, 'zuck') 18 | 19 | def test_profile_not_found(self): 20 | """Should raise ProfileNotFound when profile does not exist on facebook 21 | """ 22 | self.assertRaises(ProfileNotFound, self.client.fetch_profile, 'inexistentuser5555') 23 | 24 | def test_profile_found(self): 25 | """Should return dictionary with profile info when profile exists on facebook""" 26 | profile = self.client.fetch_profile('zuck') 27 | expected_keys = ['popularity', 'name', 'description'] 28 | self.assertTrue(all(key in profile for key in expected_keys)) 29 | self.assertEqual(profile['name'], u'Mark Zuckerberg') 30 | self.assertEqual(profile['popularity'], None) 31 | -------------------------------------------------------------------------------- /socialcrawl/clients/tests/test_twitter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from socialcrawl.auth import TWITTER_ACCESS_TOKEN 4 | from socialcrawl.clients.exceptions import AuthError, ProfileNotFound 5 | from socialcrawl.clients.twitter import TwitterClient 6 | 7 | 8 | class TestTwitterClient(unittest.TestCase): 9 | 10 | def setUp(self): 11 | """Sets up the twitter client""" 12 | super(TestTwitterClient, self).setUp() 13 | self.client = TwitterClient(TWITTER_ACCESS_TOKEN) 14 | 15 | def test_bad_access_token(self): 16 | """Should raise AuthError when twitter access token is wrong""" 17 | client = TwitterClient('bad_access_token') 18 | self.assertRaises(AuthError, client.fetch_profile, 'twitter') 19 | 20 | def test_profile_not_found(self): 21 | """Should raise ProfileNotFound when profile does not exist on twitter 22 | """ 23 | self.assertRaises(ProfileNotFound, self.client.fetch_profile, 'inexistentuser5555') 24 | 25 | def test_profile_found(self): 26 | """Should return dictionary with profile info when profile exists on twitter""" 27 | profile = self.client.fetch_profile('twitter') 28 | expected_keys = ['popularity', 'name', 'description'] 29 | self.assertTrue(all(key in profile for key in expected_keys)) 30 | self.assertEqual(profile['name'], u'Twitter') 31 | self.assertTrue(profile['popularity'] > 20000000) 32 | -------------------------------------------------------------------------------- /socialcrawl/clients/facebook.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import facepy 4 | 5 | from socialcrawl.auth import FACEBOOK_ACCESS_TOKEN 6 | from socialcrawl.clients.exceptions import AuthError, ProfileNotFound 7 | 8 | 9 | class FacebookClient(object): 10 | 11 | def __init__(self, access_token=FACEBOOK_ACCESS_TOKEN): 12 | self.client = facepy.GraphAPI(access_token) 13 | self.network = 'F' 14 | 15 | def fetch_profile(self, username): 16 | """Returns basic profile info for the given username""" 17 | query = ("SELECT name,about_me,friend_count FROM user WHERE " 18 | "username='{0}'".format(username)) 19 | try: 20 | user = self.client.fql(query) 21 | except facepy.exceptions.FacepyError as e: 22 | if 'access token' in e.message: 23 | raise AuthError(e.message) 24 | else: 25 | print e.message 26 | raise e 27 | if len(user['data']) == 1: 28 | return { 29 | 'name': user['data'][0]['name'], 30 | 'description': user['data'][0]['about_me'] or '', 31 | 'popularity': user['data'][0]['friend_count'] 32 | } 33 | else: 34 | raise ProfileNotFound 35 | 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("username") 40 | args = parser.parse_args() 41 | client = FacebookClient() 42 | print client.fetch_profile(args.username) 43 | -------------------------------------------------------------------------------- /socialcrawl/clients/cache.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from django.utils import timezone 4 | 5 | from socialcrawl.settings import CACHE_MAX_AGE 6 | from socialcrawl.networks.models import Profile 7 | 8 | 9 | class CachedClient(object): 10 | 11 | def __init__(self): 12 | super(CachedClient, self).__init__() 13 | self.network = None 14 | 15 | def _uptodate(self, timestamp): 16 | """Returns True if timestamp has not expired, False otherwise""" 17 | if timestamp < timezone.now() - timedelta(minutes=CACHE_MAX_AGE): 18 | return False 19 | else: 20 | return True 21 | 22 | def load_profile_from_db(self, username): 23 | """Loads profile from DB, returns None is there is no entry""" 24 | try: 25 | p = Profile.objects.get(username=username, network=self.network) 26 | except Profile.DoesNotExist: 27 | return None, None 28 | data = { 29 | 'name': p.name, 30 | 'description': p.description, 31 | 'popularity': p.popularity 32 | } 33 | return data, p 34 | 35 | def save_profile(self, username, data, cached=None): 36 | """Saves profile to the DB with given username and data""" 37 | p = Profile( 38 | username=username, 39 | network=self.network, 40 | name=data['name'], 41 | description=data['description'], 42 | popularity=data['popularity'] 43 | ) 44 | if cached: 45 | p.id = cached.id 46 | p.save() 47 | 48 | def get_profile(self, username): 49 | """Returns basic profile info for the given username 50 | First tries from cache, if not present or old it fetches data 51 | directly from twitter and caches it 52 | 53 | """ 54 | data, cached = self.load_profile_from_db(username) 55 | if data and self._uptodate(cached.updated): 56 | return data 57 | else: 58 | data = self.fetch_profile(username) 59 | self.save_profile(username, data, cached) 60 | return data 61 | 62 | def fetch_profile(self, username): 63 | """To be implemented by the social network client""" 64 | raise NotImplementedError("You need to subclass CachedClient") 65 | -------------------------------------------------------------------------------- /socialcrawl/networks/tests/test_views.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from django.test import TestCase 4 | from django.test.client import Client 5 | 6 | from socialcrawl.networks.models import Profile 7 | 8 | 9 | class BaseTest(TestCase): 10 | 11 | def setUp(self): 12 | super(BaseTest, self).setUp() 13 | self.client = Client() 14 | 15 | 16 | class TestNotFound(BaseTest): 17 | 18 | def test_not_found(self): 19 | """Should return a JSON error message when URL not found""" 20 | resp = self.client.get('/api/badurl/') 21 | self.assertEqual(resp.status_code, 404) 22 | self.assertEqual(resp.content, json.dumps({'error': 'Not Found'})) 23 | 24 | 25 | class TestNetworks(BaseTest): 26 | 27 | def test_network_list(self): 28 | """Should return a list of supported networks when calling /networks""" 29 | resp = self.client.get('/api/v1/networks') 30 | self.assertEqual(resp.status_code, 200) 31 | data = json.loads(resp.content) 32 | self.assertEqual(len(data), len(Profile.SOCIAL_NETWORKS)) 33 | 34 | 35 | class TestProfiles(BaseTest): 36 | 37 | def setUp(self): 38 | super(TestProfiles, self).setUp() 39 | Profile(username='twitter', network='T', description='Cached for T').save() 40 | Profile(username='twitter', network='F', description='Cached for F').save() 41 | 42 | def test_profiles_list(self): 43 | """Should return a list of profiles when present in DB for a given network""" 44 | resp = self.client.get('/api/v1/profiles/twitter') 45 | self.assertEqual(resp.status_code, 200) 46 | data = json.loads(resp.content) 47 | self.assertEqual(len(data), 1) 48 | self.assertEqual(data[0]['description'], 'Cached for T') 49 | 50 | def test_profiles_cached(self): 51 | """Should return profile when found in DB""" 52 | resp = self.client.get('/api/v1/profiles/twitter/twitter') 53 | self.assertEqual(resp.status_code, 200) 54 | data = json.loads(resp.content) 55 | self.assertEqual(data['description'], 'Cached for T') 56 | 57 | def test_profiles_not_cached(self): 58 | """Should return 202 and enqueue job when profile not found in DB""" 59 | resp = self.client.get('/api/v1/profiles/facebook/zuck') 60 | self.assertEqual(resp.status_code, 202) 61 | data = json.loads(resp.content) 62 | self.assertEqual(data['status'], 'processing') 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | socialcrawl 2 | ========= 3 | 4 | Script for fetching profile information from social networks 5 | 6 | ## Installation 7 | 8 | Python 2.7.x is required, plus some python packages which can be installed by typing: 9 | 10 | pip install -r requirements.txt 11 | 12 | For the job queue, you need to install a broker. Per default RabbitMQ on localhost is used: 13 | 14 | sudo apt-get install rabbitmq-server 15 | 16 | ## Configuration 17 | 18 | Create a file for social network authentication at `socialcrawl/auth.py`. A sample file can be 19 | found at `socialcrawl/auth_sample.py`. 20 | 21 | ### Twitter 22 | 23 | You will need to register an application, generate an OAuth access token for it and define `TWITTER_ACCESS_TOKEN` 24 | 25 | ### Facebook 26 | 27 | You will need to register an app, generate an OAuth access token and define `FACEBOOK_ACCESS_TOKEN` 28 | 29 | ### Cache 30 | 31 | The maximum staleness of the cached data is defined in `settings.CACHE_MAX_AGE`, in minutes. 32 | 33 | Per default a local `data.db` sqlite DB is used for caching, which can be installed in debian based systems by typing: 34 | 35 | sudo apt-get install sqlite 36 | 37 | You will need to create the DB: 38 | 39 | python manage.py syncdb 40 | 41 | If you want to use another SQL DB or otherwise change the defaults edit `settings.DATABASES`. 42 | 43 | ## Running the scripts 44 | 45 | ### Client scripts 46 | 47 | To get twitter profile info for the user 'twitter': 48 | 49 | PYTHONPATH=. python socialcrawl/clients/twitter.py twitter 50 | 51 | For the Facebook 'zuck' user: 52 | 53 | PYTHONPATH=. python socialcrawl/clients/facebook.py zuck 54 | 55 | ### Cached scripts 56 | 57 | You can use a cached version of the scripts that use Django's ORM to cache data locally: 58 | 59 | python manage.py twitter twitter 60 | python manage.py facebook zuck 61 | 62 | ## API 63 | 64 | To start the web server locally type: 65 | 66 | python manage.py runserver 67 | 68 | Resources are: 69 | 70 | - `/api/v1/networks` 71 | - `/api/v1/profiles/` 72 | - `/api/v1/profiles//` 73 | 74 | ## Job Queue 75 | 76 | To start locally the job queue for asynchronous profile fetching: 77 | 78 | python manage.py celery worker --loglevel=info 79 | 80 | ## Tests 81 | 82 | Test dependencies can be installed by typing: 83 | 84 | pip install -r test_requirements.txt 85 | 86 | To run the test suite: 87 | 88 | python manage.py test -v 2 -s socialcrawl 89 | -------------------------------------------------------------------------------- /socialcrawl/clients/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from django.test import TestCase 4 | from django.utils import timezone 5 | 6 | from socialcrawl.networks.models import Profile 7 | from socialcrawl.clients.cache import CachedClient 8 | 9 | 10 | class TestCachedClient(TestCase): 11 | 12 | def setUp(self): 13 | """Sets up the cached client""" 14 | super(TestCachedClient, self).setUp() 15 | self.client = CachedClient() 16 | self.client.network = 'T' 17 | 18 | def test_get_profile_not_cached(self): 19 | """Should raise NotImplementedError when profile not found in DB""" 20 | self.assertRaises(NotImplementedError, self.client.get_profile, 'twitter') 21 | 22 | def test_get_profile_already_cached(self): 23 | """Should load from DB and not fetch from network when when profile is found in DB""" 24 | p = Profile(username='twitter', network='T', description='Cached in DB') 25 | p.save() 26 | profile = self.client.get_profile('twitter') 27 | self.assertEqual(profile['description'], u'Cached in DB') 28 | 29 | def test_load_profile(self): 30 | """Should load profile from DB when it can be found""" 31 | p = Profile(username='twitter', network='T', description='Cached in DB') 32 | p.save() 33 | data, timestamp = self.client.load_profile_from_db('twitter') 34 | self.assertEqual(data['description'], 'Cached in DB') 35 | 36 | def test_load_profile_not_found(self): 37 | """Should return None when profile not found""" 38 | p = Profile(username='twitter', network='F', description='Cached in DB') 39 | p.save() 40 | data, timestamp = self.client.load_profile_from_db('twitter') 41 | self.assertEqual(data, None) 42 | 43 | def test_save_profile(self): 44 | """Should save profile to DB when passing required arguments and fields""" 45 | profile_data = {'name': 'Twitter Inc.', 'description': u'foobar', 'popularity': 1} 46 | self.client.save_profile('twitter', profile_data) 47 | p = Profile.objects.get(username='twitter') 48 | self.assertEqual(p.description, u'foobar') 49 | 50 | def test_uptodate_current(self): 51 | """Should return True when timestamp is up-to-date""" 52 | self.assertTrue(self.client._uptodate(timezone.now())) 53 | 54 | def test_uptodate_old(self): 55 | """Should return False when timestamp is too old""" 56 | old_timestamp = timezone.make_aware(datetime(year=2013, month=5, day=30), timezone.get_default_timezone()) 57 | self.assertFalse(self.client._uptodate(old_timestamp)) 58 | -------------------------------------------------------------------------------- /socialcrawl/clients/tests/test_crawler.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | from socialcrawl.networks.models import Profile 4 | from socialcrawl.clients.crawler import CachedTwitterClient, CachedFacebookClient 5 | 6 | 7 | class TestCachedTwitterClient(TestCase): 8 | 9 | def setUp(self): 10 | """Sets up a cached twitter client""" 11 | super(TestCachedTwitterClient, self).setUp() 12 | self.client = CachedTwitterClient() 13 | 14 | def test_profile_not_cached(self): 15 | """Should fetch from twitter and save to DB when profile not found in DB""" 16 | # First Cache an object from a different network 17 | p = Profile(username='twitter', network='F', description='Cached in DB') 18 | p.save() 19 | profile = self.client.get_profile('twitter') 20 | expected_description = u'Your official source for news, updates and tips from Twitter, Inc.' 21 | self.assertEqual(profile['description'], expected_description) 22 | p = Profile.objects.get(username='twitter', network='T') 23 | self.assertEqual(p.name, 'Twitter') 24 | self.assertEqual(p.network, 'T') 25 | self.assertEqual(p.description, expected_description) 26 | 27 | def test_profile_already_cached(self): 28 | """Should load from DB and not fetch from Twitter when when profile is found in DB""" 29 | p = Profile(username='twitter', network='T', description='Cached in DB') 30 | p.save() 31 | profile = self.client.get_profile('twitter') 32 | self.assertEqual(profile['description'], u'Cached in DB') 33 | 34 | 35 | class TestCachedFacebookClient(TestCase): 36 | 37 | def setUp(self): 38 | """Sets up a cached facebook client""" 39 | super(TestCachedFacebookClient, self).setUp() 40 | self.client = CachedFacebookClient() 41 | 42 | def test_profile_not_cached(self): 43 | """Should fetch from Facebook and save to DB when profile not found in DB""" 44 | profile = self.client.get_profile('zuck') 45 | expected_description = u'' 46 | self.assertEqual(profile['description'], expected_description) 47 | p = Profile.objects.get(username='zuck') 48 | self.assertEqual(p.name, u'Mark Zuckerberg') 49 | self.assertEqual(p.network, 'F') 50 | self.assertEqual(p.description, expected_description) 51 | 52 | def test_profile_already_cached(self): 53 | """Should load from DB and not fetch from Facebook when when profile is found in DB""" 54 | p = Profile(username='zuck', network='F', description='Cached in DB') 55 | p.save() 56 | profile = self.client.get_profile('zuck') 57 | self.assertEqual(profile['description'], u'Cached in DB') 58 | -------------------------------------------------------------------------------- /socialcrawl/settings.py: -------------------------------------------------------------------------------- 1 | # Django settings for socialcrawl project. 2 | import djcelery 3 | djcelery.setup_loader() 4 | 5 | BROKER_URL = 'amqp://guest:guest@localhost:5672/' 6 | 7 | DEBUG = True 8 | TEMPLATE_DEBUG = DEBUG 9 | 10 | ADMINS = ( 11 | # ('Your Name', 'your_email@example.com'), 12 | ) 13 | 14 | MANAGERS = ADMINS 15 | 16 | DATABASES = { 17 | 'default': { 18 | 'ENGINE': 'django.db.backends.sqlite3', # Add 'postgresql_psycopg2', 'mysql', 'sqlite3' or 'oracle'. 19 | 'NAME': 'data.db', # Or path to database file if using sqlite3. 20 | # The following settings are not used with sqlite3: 21 | 'USER': '', 22 | 'PASSWORD': '', 23 | 'HOST': '', # Empty for localhost through domain sockets or '127.0.0.1' for localhost through TCP. 24 | 'PORT': '', # Set to empty string for default. 25 | } 26 | } 27 | 28 | # Hosts/domain names that are valid for this site; required if DEBUG is False 29 | # See https://docs.djangoproject.com/en/1.5/ref/settings/#allowed-hosts 30 | ALLOWED_HOSTS = [] 31 | 32 | # Local time zone for this installation. Choices can be found here: 33 | # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name 34 | # although not all choices may be available on all operating systems. 35 | # In a Windows environment this must be set to your system time zone. 36 | TIME_ZONE = 'America/Chicago' 37 | 38 | # Language code for this installation. All choices can be found here: 39 | # http://www.i18nguy.com/unicode/language-identifiers.html 40 | LANGUAGE_CODE = 'en-us' 41 | 42 | SITE_ID = 1 43 | 44 | # If you set this to False, Django will make some optimizations so as not 45 | # to load the internationalization machinery. 46 | USE_I18N = True 47 | 48 | # If you set this to False, Django will not format dates, numbers and 49 | # calendars according to the current locale. 50 | USE_L10N = True 51 | 52 | # If you set this to False, Django will not use timezone-aware datetimes. 53 | USE_TZ = True 54 | 55 | # Absolute filesystem path to the directory that will hold user-uploaded files. 56 | # Example: "/var/www/example.com/media/" 57 | MEDIA_ROOT = '' 58 | 59 | # URL that handles the media served from MEDIA_ROOT. Make sure to use a 60 | # trailing slash. 61 | # Examples: "http://example.com/media/", "http://media.example.com/" 62 | MEDIA_URL = '' 63 | 64 | # Absolute path to the directory static files should be collected to. 65 | # Don't put anything in this directory yourself; store your static files 66 | # in apps' "static/" subdirectories and in STATICFILES_DIRS. 67 | # Example: "/var/www/example.com/static/" 68 | STATIC_ROOT = '' 69 | 70 | # URL prefix for static files. 71 | # Example: "http://example.com/static/", "http://static.example.com/" 72 | STATIC_URL = '/static/' 73 | 74 | # Additional locations of static files 75 | STATICFILES_DIRS = ( 76 | # Put strings here, like "/home/html/static" or "C:/www/django/static". 77 | # Always use forward slashes, even on Windows. 78 | # Don't forget to use absolute paths, not relative paths. 79 | ) 80 | 81 | # List of finder classes that know how to find static files in 82 | # various locations. 83 | STATICFILES_FINDERS = ( 84 | 'django.contrib.staticfiles.finders.FileSystemFinder', 85 | 'django.contrib.staticfiles.finders.AppDirectoriesFinder', 86 | # 'django.contrib.staticfiles.finders.DefaultStorageFinder', 87 | ) 88 | 89 | # Make this unique, and don't share it with anybody. 90 | SECRET_KEY = '6$ajl4%_m46p2j%azgjtf-ca2l1ttlrya^2h5g%7a2dx$px&fp' 91 | 92 | # List of callables that know how to import templates from various sources. 93 | TEMPLATE_LOADERS = ( 94 | 'django.template.loaders.filesystem.Loader', 95 | 'django.template.loaders.app_directories.Loader', 96 | # 'django.template.loaders.eggs.Loader', 97 | ) 98 | 99 | MIDDLEWARE_CLASSES = ( 100 | 'django.middleware.common.CommonMiddleware', 101 | 'django.contrib.sessions.middleware.SessionMiddleware', 102 | 'django.middleware.csrf.CsrfViewMiddleware', 103 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 104 | 'django.contrib.messages.middleware.MessageMiddleware', 105 | # Uncomment the next line for simple clickjacking protection: 106 | # 'django.middleware.clickjacking.XFrameOptionsMiddleware', 107 | ) 108 | 109 | ROOT_URLCONF = 'socialcrawl.urls' 110 | 111 | # Python dotted path to the WSGI application used by Django's runserver. 112 | WSGI_APPLICATION = 'socialcrawl.wsgi.application' 113 | 114 | TEMPLATE_DIRS = ( 115 | # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates". 116 | # Always use forward slashes, even on Windows. 117 | # Don't forget to use absolute paths, not relative paths. 118 | ) 119 | 120 | INSTALLED_APPS = ( 121 | 'django.contrib.auth', 122 | 'django.contrib.contenttypes', 123 | 'django.contrib.sessions', 124 | 'django.contrib.messages', 125 | 'django.contrib.staticfiles', 126 | 'django.contrib.admin', 127 | "djcelery", 128 | 'django_nose', 129 | 'socialcrawl.clients', 130 | 'socialcrawl.networks', 131 | ) 132 | 133 | TEST_RUNNER = 'django_nose.NoseTestSuiteRunner' 134 | 135 | CACHE_MAX_AGE = 10 # in Minutes 136 | --------------------------------------------------------------------------------