├── socialscraper ├── __init__.py ├── tests │ ├── __init__.py │ ├── unit │ │ └── __init__.py │ ├── integration │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── twitter.py │ │ └── facebook.py │ └── __main__.py ├── adapters │ ├── __init__.py │ └── adapter_sqlalchemy.py ├── facebook │ ├── __init__.py │ ├── nograph │ │ ├── __init__.py │ │ ├── members.py │ │ ├── feed.py │ │ ├── feed2.py │ │ ├── friends.py │ │ ├── likes.py │ │ └── about.py │ ├── graphapi │ │ ├── likes.py │ │ ├── members.py │ │ ├── groups.py │ │ ├── __init__.py │ │ ├── about.py │ │ └── feed.py │ ├── public │ │ └── __init__.py │ ├── models.py │ ├── auth.py │ ├── scraper.py │ └── graphsearch │ │ └── __init__.py ├── twitter │ ├── __init__.py │ ├── models.py │ └── scraper.py └── base.py ├── Makefile ├── .gitignore ├── requirements.txt ├── .secret.example ├── README.md └── setup.py /socialscraper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialscraper/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialscraper/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /socialscraper/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [] -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | find . -name '*.pyc' -delete 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | .secret 4 | venv 5 | *.pickle -------------------------------------------------------------------------------- /socialscraper/tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['facebook', 'twitter'] -------------------------------------------------------------------------------- /socialscraper/facebook/__init__.py: -------------------------------------------------------------------------------- 1 | from .scraper import FacebookScraper 2 | from . import models -------------------------------------------------------------------------------- /socialscraper/twitter/__init__.py: -------------------------------------------------------------------------------- 1 | from .scraper import TwitterScraper 2 | from . import models 3 | -------------------------------------------------------------------------------- /socialscraper/tests/__main__.py: -------------------------------------------------------------------------------- 1 | import unittest, os 2 | 3 | from . import unit 4 | from . 
import integration 5 | 6 | unittest.main() -------------------------------------------------------------------------------- /socialscraper/tests/integration/__main__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from .facebook import TestFacebookScraper 4 | from .twitter import TestTwitterScraper 5 | 6 | unittest.main() -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/__init__.py: -------------------------------------------------------------------------------- 1 | from .about import get_about 2 | from .feed import get_feed 3 | from .feed2 import get_feed2 4 | from .likes import get_likes 5 | from .friends import get_friends 6 | from .members import get_members -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.3.2 2 | cssselect==0.9.1 3 | enum34==0.9.23 4 | facebook-sdk==0.4.0 5 | lxml==3.3.5 6 | mechanize==0.2.5 7 | python-dateutil==2.2 8 | requests==2.3.0 9 | six==1.6.1 10 | wsgiref==0.1.2 11 | SQLAlchemy==0.9.0 12 | celery==3.1.7 -------------------------------------------------------------------------------- /.secret.example: -------------------------------------------------------------------------------- 1 | export TWITTER_EMAIL="" 2 | export TWITTER_USERNAME="" 3 | export TWITTER_PASSWORD="" 4 | 5 | export FACEBOOK_EMAIL="" 6 | export FACEBOOK_USERNAME="" 7 | export FACEBOOK_PASSWORD="" 8 | 9 | export TWITTER_CONSUMER_KEY='' 10 | export TWITTER_CONSUMER_SECRET='' 11 | export TWITTER_ACCESS_TOKEN_KEY='' 12 | export TWITTER_ACCESS_TOKEN_SECRET='' 13 | 14 | export FACEBOOK_APP_ID='' 15 | export FACEBOOK_APP_SECRET='' 16 | export FACEBOOK_APP_TOKEN='' 17 | 18 | # https://developers.facebook.com/tools/explorer/ 19 | export FACEBOOK_USER_TOKEN='' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | socialscraper 2 | ========= 3 | 4 | pip install -e git+git://github.com/alpaca/socialscraper.git#egg=socialscraper --upgrade 5 | 6 | Facebook Tests 7 | ``` 8 | python -m socialscraper.tests.integration.facebook 9 | ``` 10 | 11 | ``` 12 | # see methods on the FacebookScraper class here: 13 | # https://github.com/alpaca/socialscraper/blob/master/socialscraper/facebook/scraper.py 14 | # such as get_about, get_feed, get_likes, get_fans 15 | 16 | # or use the scraper's authenticated browser to scrape your own content 17 | # scraper.browser.get("xxxx") 18 | ``` 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='socialscraper', 4 | version='0.0.1', 5 | description='Scrapers for Social Networks', 6 | author='Moritz Gellner', 7 | author_email='moritz.gellner@gmail.com', 8 | url='http://dev.alpaca.io/', 9 | packages=['socialscraper', 'socialscraper.adapters', 'socialscraper.facebook', 'socialscraper.facebook.nograph', 'socialscraper.facebook.graphapi', 'socialscraper.facebook.public', 'socialscraper.facebook.graphsearch', 'socialscraper.twitter'], 10 | install_requires=[ 11 | 'beautifulsoup4==4.3.2', 12 | 'cssselect==0.9.1', 13 | 'enum34==0.9.23', 14 | 'facebook-sdk==0.4.0', 15 | 'lxml==3.3.4', 16 | 'mechanize==0.2.5', 17 | 'python-dateutil==2.2', 18 | 'requests==2.2.1', 19 | 'six==1.6.1', 20 | 'wsgiref==0.1.2' 21 | ] 22 | ) -------------------------------------------------------------------------------- /socialscraper/twitter/models.py: 
-------------------------------------------------------------------------------- 1 | from ..base import BaseModel, Column 2 | 3 | class TwitterTweet(BaseModel): 4 | __tablename__ = 'twitter_tweets' 5 | __attrs__ = ['id', 'timestamp', 'user', 'content'] 6 | id = Column('id', 'BigInteger', primary_key=True) 7 | timestamp = Column('timestamp', 'BigInteger') 8 | user = Column('user', 'String', foreign_key=True, foreign_key_reference="twitter_users.screen_name") 9 | content = Column('content', 'Text') 10 | 11 | class TwitterUser(BaseModel): 12 | __tablename__ = 'twitter_users' 13 | __attrs__ = ['id', 'screen_name'] 14 | id = Column('id', 'BigInteger', primary_key=True) 15 | screen_name = Column('screen_name', 'String', unique=True) -------------------------------------------------------------------------------- /socialscraper/facebook/graphapi/likes.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import time 3 | from datetime import datetime 4 | from ...base import ScrapingError 5 | from ..models import FacebookPage 6 | 7 | from . import get_object 8 | from . import get_connections 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def get_likes(api, username): 13 | after = '' 14 | while True: 15 | profile = api.get_object(str(username) + "/likes", after=after, fields="category,id,name,username") 16 | if profile['data'] == []: break 17 | after = profile['paging']['cursors']['after'] 18 | for item in profile['data']: 19 | yield FacebookPage(page_id=int(item.get('id')), username=item.get('username') ,type=item.get('category'), name=item.get('name')) 20 | # print item.get('type') + ": " + item.get('story', '') + item.get('message', '') -------------------------------------------------------------------------------- /socialscraper/facebook/graphapi/members.py: -------------------------------------------------------------------------------- 1 | import logging, requests 2 | from time import time 3 | from datetime import datetime 4 | from ...base import ScrapingError 5 | from ..models import FacebookUser 6 | 7 | from . import get_object 8 | from . import get_connections 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def get_members(api, graph_name): 13 | # after = '' 14 | # while True: 15 | # profile = api.get_object(str(username) + "/members", after=after, fields="id,username,name") 16 | # if profile['data'] == []: break 17 | # after = profile['paging'].get('cursors', {}).get('after', "shit") 18 | # for item in profile['data']: 19 | # 20 | 21 | members = api.get_connections(str(graph_name), "members") 22 | while members.get('data', False): 23 | for item in members['data']: 24 | yield FacebookUser(uid=int(item.get('id')), username=item.get('username'), name=item.get('name')) 25 | members = requests.get(members['paging']['next']).json() -------------------------------------------------------------------------------- /socialscraper/facebook/graphapi/groups.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import time 3 | from datetime import datetime 4 | from ...base import ScrapingError 5 | from ..models import FacebookGroup 6 | 7 | from . import get_object 8 | from . 
import get_connections 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def get_group(api, username): 13 | item = api.get_object(str(username)) 14 | return FacebookGroup(group_id=int(item.get('id')), username=int(item.get('id')), name=item.get('name'), icon=item.get('icon'), privacy=item.get('privacy'), description=item.get('description')) 15 | 16 | def get_groups(api, username): 17 | after = '' 18 | while True: 19 | profile = api.get_object(str(username) + "/groups", after=after, fields="id,name") 20 | if profile['data'] == []: break 21 | after = profile['paging']['cursors']['after'] 22 | for result in profile['data']: 23 | item = api.get_object(result.get('id')) 24 | yield FacebookGroup(group_id=int(item.get('id')), username=int(item.get('id')), name=item.get('name'), icon=item.get('icon'), privacy=item.get('privacy'), description=item.get('description')) -------------------------------------------------------------------------------- /socialscraper/facebook/graphapi/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from facebook import GraphAPIError 3 | 4 | from ...base import ScrapingError 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def get_object(api, username): 9 | try: 10 | profile = api.get_object(username) 11 | except GraphAPIError: 12 | raise ValueError("Can't get object %s" % username) 13 | 14 | return profile 15 | 16 | def get_connections(api, username, connection): 17 | try: 18 | profile = api.get_connections(username, connection) 19 | except GraphAPIError: 20 | raise ValueError("Can't get connection %s, %s" % (username, connection)) 21 | 22 | return profile 23 | 24 | def get_attributes(api,graph_obj,attributes): 25 | """Get multiple attributes given a graph_name or graph_id.""" 26 | ret_attributes = [] 27 | data = get_object(api,graph_obj) 28 | for attribute in attributes: 29 | ret_attributes.append(data.get(attribute, None)) 30 | return ret_attributes 31 | 32 | def get_username(api,graph_obj): 33 | ret_attributes = [] 34 | data = get_object(api,graph_obj) 35 | return data.get('username', None) 36 | 37 | from .about import get_about 38 | from .feed import get_feed 39 | from .likes import get_likes 40 | from .groups import get_groups, get_group 41 | from .members import get_members -------------------------------------------------------------------------------- /socialscraper/tests/integration/twitter.py: -------------------------------------------------------------------------------- 1 | import unittest, os 2 | from ...twitter import TwitterScraper 3 | 4 | # from mock import patch 5 | # @mock.patch('requests.get', mock.Mock(side_effect = lambda k:{'aurl': 'a response', 'burl' : 'b response'}.get(k, 'unhandled request %s'%k) )) 6 | 7 | class TestTwitterScraper(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.email = os.getenv("TWITTER_EMAIL") 11 | self.username = os.getenv("TWITTER_USERNAME") 12 | self.password = os.getenv('TWITTER_PASSWORD') 13 | 14 | self.test_username = "MaloneJena" 15 | self.test_userid = 2304205154 16 | 17 | self.scraper = TwitterScraper() 18 | self.scraper.add_user(username=self.username, password=self.password) 19 | 20 | def test_with_id_and_screenname(self): 21 | # user = self.test_username 22 | # id = self.test_userid 23 | # followers_from_user = self.scraper.get_followers(user) 24 | # followers_from_id = self.scraper.get_followers(id) 25 | # self.assertEqual([f.username for f in followers_from_user].sort(),[f.username for f in followers_from_id].sort()) 26 | 
27 | # for follower in self.scraper.get_followers('aljohri'): 28 | # print follower 29 | 30 | for tweet in self.scraper.get_feed_by_screen_name('aljohri'): 31 | print tweet 32 | 33 | self.assertTrue(True) 34 | 35 | if __name__ == "__main__": 36 | unittest.main() -------------------------------------------------------------------------------- /socialscraper/facebook/graphapi/about.py: -------------------------------------------------------------------------------- 1 | import logging, json 2 | from ...base import ScrapingError 3 | from ..models import FacebookUser 4 | 5 | from . import get_object 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | """ 10 | Ignoring the idea of tiered permissions, we consider any user for whom 11 | we find "non-public" data to be a public user. 12 | 13 | Non-public data is defined as any key not defined in PUBLIC_KEYS. 14 | """ 15 | 16 | PUBLIC_KEYS = [ 17 | 'id', 18 | 'name', 19 | 'first_name', 20 | 'middle_name', 21 | 'last_name', 22 | 'gender', 23 | 'link', 24 | 'locale', 25 | 'updated_time', 26 | 'username' 27 | ] 28 | 29 | def get_about(api, username): 30 | 31 | def check_public_profile(profile): 32 | for key in profile.keys(): 33 | if key not in PUBLIC_KEYS: 34 | return True 35 | 36 | return False 37 | 38 | profile = get_object(api, username) 39 | 40 | employer = json.dumps(profile.get('work')) if profile.get('work') else None 41 | data = json.dumps(profile) if profile else None 42 | hometown = json.dumps(profile.get('hometown')) if profile.get('hometown') else None 43 | currentcity = json.dumps(profile.get('currentcity')) if profile.get('currentcity') else None 44 | 45 | user = FacebookUser( 46 | uid=int(profile.get('id')), 47 | username=username, 48 | email=profile.get('email'), 49 | birthday=profile.get('birthday'), 50 | sex=profile.get('gender'), 51 | college=None, 52 | employer=employer, 53 | highschool=None, 54 | currentcity=currentcity, 55 | hometown=hometown, 56 | locale=profile.get('locale'), 57 | data=data 58 | ) 59 | 60 | return user -------------------------------------------------------------------------------- /socialscraper/facebook/graphapi/feed.py: -------------------------------------------------------------------------------- 1 | import logging, requests 2 | from time import time 3 | from datetime import datetime 4 | from urlparse import urlparse, parse_qs 5 | from facebook import GraphAPIError 6 | 7 | from . import get_object 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | def get_feed(api, graph_name, start="", end=datetime.now()): 12 | """ 13 | 14 | Returns: 15 | 16 | feed from start date to end date. 17 | 18 | If dates are not specified, starts from the present and continues 19 | reverse chronologically. 20 | 21 | Input: 22 | api: GraphAPI 23 | graph_name: string 24 | start: datetime 25 | end: datetime 26 | 27 | Caveats: 28 | The Facebook GraphAPI ['paging']['next'] and ['paging']['previous'] urls 29 | ignore end limits. 30 | 31 | For example, if I search for posts between Date X and Y, the first result 32 | will be between these dates. 33 | 34 | If I blindly go to the next url, it will 35 | keep until=Y and traverse reverse chronologically indefinitely. 36 | 37 | Similarly, if I blindly go to the previous url, it will keep since=X and 38 | traverse forward chronologically indefinitely. 39 | 40 | For this reason, I re-append the since parameter to the next url 41 | and reappend the until parameter to the previous url. 
42 | 43 | """ 44 | 45 | # if start, end are passed in as None 46 | if start == None: start = "" 47 | if end == None: end = datetime.now() 48 | 49 | logger.info("Getting feed since %s until %s" % 50 | ( 51 | start.strftime('%Y-%m-%d %H:%M:%S') if isinstance(start, datetime) else "indefinite", 52 | end.strftime('%Y-%m-%d %H:%M:%S') 53 | ) 54 | ) 55 | 56 | feed = api.get_connections(graph_name, "feed", 57 | since=int((start-datetime(1970,1,1)).total_seconds()) if isinstance(start, datetime) else "", 58 | until=int((end-datetime(1970,1,1)).total_seconds()) 59 | ) 60 | 61 | 62 | while feed['data']: 63 | for item in feed['data']: yield item 64 | 65 | # Hacky fix for Facebook GraphAPI. See method docstring. 66 | feed['paging']['next'] += "&since=%d" % int((start-datetime(1970,1,1)).total_seconds()) if isinstance(start, datetime) else "" 67 | feed['paging']['previous'] += "&until=%d" % int((end-datetime(1970,1,1)).total_seconds()) 68 | 69 | feed = requests.get(feed['paging']['next']).json() 70 | -------------------------------------------------------------------------------- /socialscraper/tests/integration/facebook.py: -------------------------------------------------------------------------------- 1 | import unittest, os, pprint, logging, pickle, itertools 2 | from ...facebook import FacebookScraper 3 | 4 | logging.basicConfig(level=logging.WARN) 5 | pp = pprint.PrettyPrinter(indent=4) 6 | 7 | def enumerate_and_run_twice(gen): 8 | return itertools.takewhile(lambda (i,x): i < 2, enumerate(gen)) 9 | 10 | class TestFacebookScraper(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.email = os.getenv("FACEBOOK_EMAIL") 14 | self.username = os.getenv("FACEBOOK_USERNAME") 15 | self.password = os.getenv("FACEBOOK_PASSWORD") 16 | self.app_token = os.getenv('FACEBOOK_APP_TOKEN') 17 | self.user_token = os.getenv('FACEBOOK_USER_TOKEN') 18 | 19 | self.test_username = "todd.warren.seattle" 20 | self.test_pagename = "mightynest" 21 | 22 | self.scraper_type = "nograph" 23 | 24 | if not os.path.isfile('facebook_scraper.pickle'): 25 | self.scraper = FacebookScraper(scraper_type=self.scraper_type) 26 | self.scraper.add_user(email=os.getenv('FACEBOOK_EMAIL'), password=os.getenv('FACEBOOK_PASSWORD')) 27 | self.scraper.login() 28 | pickle.dump(self.scraper, open('facebook_scraper.pickle', 'wb')) 29 | else: 30 | self.scraper = pickle.load(open('facebook_scraper.pickle', 'rb')) 31 | self.scraper.scraper_type = self.scraper_type 32 | 33 | # @unittest.skip("testing skipping") 34 | def test_graphsearch_pages_liked(self): 35 | gen = self.scraper.graph_search(self.test_username, "pages-liked") 36 | for i,item in enumerate_and_run_twice(gen): 37 | print item 38 | 39 | # @unittest.skip("testing skipping") 40 | def test_graphsearch_likers(self): 41 | gen = self.scraper.graph_search(self.test_pagename, "likers") 42 | for i,item in enumerate_and_run_twice(gen): 43 | print item 44 | 45 | # @unittest.skip("testing skipping") 46 | def test_graphsearch_friends(self): 47 | gen = self.scraper.graph_search(self.test_username, "friends") 48 | for i,item in enumerate_and_run_twice(gen): 49 | print item 50 | 51 | # @unittest.skip("testing skipping") 52 | def test_nograph_likes(self): 53 | gen = self.scraper.get_likes_nograph(self.test_username) 54 | for i,item in enumerate_and_run_twice(gen): 55 | print item 56 | 57 | # @unittest.skip("testing skipping") 58 | def test_nograph_friends(self): 59 | gen = self.scraper.get_friends_nograph(self.test_username) 60 | for i,item in enumerate_and_run_twice(gen): 61 | print item 62 | 63 | if 
__name__ == "__main__": 64 | unittest.main() -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/members.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re, json, lxml, urllib 4 | from bs4 import BeautifulSoup 5 | from ...base import ScrapingError 6 | from ..models import FacebookUser 7 | 8 | from .. import graphapi, public 9 | 10 | 11 | def get_members(browser, current_user, graph_name, graph_id = None, api = None): 12 | 13 | 14 | def _result_to_model(result): 15 | 16 | url = result[0] 17 | name = result[1] 18 | uid = result[2] 19 | 20 | # print url, name 21 | 22 | username = public.parse_url(url) 23 | 24 | # if api: 25 | # print username 26 | # try: 27 | # uid, category = graphapi.get_attributes(api, username, ["id", "category"]) 28 | # except ValueError as e: 29 | # print e 30 | # uid = category = None 31 | # else: 32 | # uid, category = public.get_attributes(username, ["id", "category"]) 33 | 34 | if uid == None: 35 | print "Couldn't find UID of %s" % username 36 | return None 37 | # raise ValueError("Couldn't find uid of %s" % username) 38 | 39 | uid = int(uid) if uid else None 40 | 41 | return FacebookUser(uid=uid, username=username, url=url, name=name) 42 | 43 | 44 | response = browser.get("https://www.facebook.com/groups/%s/" % graph_id) 45 | soup = BeautifulSoup(response.content.replace('','')) 46 | num_members_text = soup.find(text=re.compile("Members\s\([\d,]+\)")) 47 | if num_members_text: 48 | num_members = int(num_members_text.replace("Members (", "").replace(")", "").replace(",", "")) 49 | else: 50 | num_members_text = soup.find(text=re.compile("\d+\smembers")) # groups i am part of 51 | if num_members_text: 52 | num_members = int(num_members_text.replace(" members", "").replace(",", "")) 53 | 54 | step = 97 55 | for page in range(1,num_members,step): 56 | 57 | response = browser.get("https://www.facebook.com/ajax/browser/list/group_members/?id=%s&gid=%s&edge=groups%%3Amembers&order=default&view=list&start=%d&__a=1" % (graph_id, graph_id, page)) 58 | data = json.loads(response.content[9:]) 59 | 60 | try: 61 | doc = lxml.html.fromstring(data['domops'][0][3]['__html']) 62 | except lxml.etree.XMLSyntaxError as e: 63 | continue 64 | 65 | for link in doc.cssselect("a[data-gt*=eng_type]"): 66 | url = link.get('href') 67 | name = link.text_content() 68 | uid = re.search(r'\/ajax\/hovercard\/user.php\?id=(\d+)&', link.get('data-hovercard')).groups()[0] 69 | result = (url, name, uid) 70 | result2ield = _result_to_model(result) 71 | if result2ield: yield result2ield 72 | 73 | # current_results = filter(lambda (url,name): name != '' and name != 'See More' and 'FriendFriends' not in name, map(lambda x: (x.get('href'), unicode(x.text_content())) , doc.cssselect('a'))) 74 | -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/feed.py: -------------------------------------------------------------------------------- 1 | import logging, lxml.html, json, urllib, re, datetime, dateutil.relativedelta 2 | # from .models import FacebookUser, FacebookStatus 3 | 4 | from .. 
import public 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | BASE_URL = 'https://www.facebook.com/%s' 9 | AJAX_URL = "https://www.facebook.com/ajax/pagelet/generic.php/ProfileTimelineSectionPagelet" 10 | regex_4real = re.compile("if \(self != top\) {parent\.require\(\"JSONPTransport\"\)\.respond\(\d+, ({.*}),\"jsmods\"", re.MULTILINE|re.DOTALL) 11 | 12 | from enum import Enum 13 | class QueryType(Enum): 14 | everything = 25 15 | highlights = 8 16 | recent = 36 17 | 18 | import pprint 19 | pp = pprint.PrettyPrinter(indent=4) 20 | 21 | def get_feed(browser, current_user, graph_name, graph_id = None, api = None): 22 | 23 | if not graph_id: 24 | graph_id = public.get_id(graph_name) 25 | 26 | def _find_script_tag(raw_html, phrase): 27 | doc = lxml.html.fromstring(raw_html) 28 | script_tag = filter(lambda x: x.text_content().find(phrase) != -1, doc.cssselect('script')) 29 | if not script_tag: return None 30 | return json.loads(script_tag[0].text_content()[24:-1]) 31 | 32 | def _get_payload(ajax_data, uid, ajaxpipe_token, page): 33 | return { 34 | "ajaxpipe": 1, 35 | "ajaxpipe_token": ajaxpipe_token, 36 | "data": json.dumps(ajax_data), 37 | "__user": current_user.id, 38 | "__dyn": "7n8ajEAMCBynzpQ9UoHaEWy6zECiq78hAKGgyiGGeqheCu6popG", 39 | } 40 | 41 | response = browser.get(BASE_URL % graph_name) 42 | cursor_tag = _find_script_tag(response.text, "section_container_id") 43 | if not cursor_tag: return None 44 | 45 | regex = re.compile("{\"ajaxpipe_token\":\"(.*)\",\"lhsh\":\"(.*)\"}") 46 | r = regex.search(response.text) 47 | 48 | ajax_data = json.loads(str(cursor_tag['jscc_map'])[105:-93]) 49 | 50 | del ajax_data['section_container_id'] 51 | del ajax_data['section_pagelet_id'] 52 | del ajax_data['unit_container_id'] 53 | del ajax_data['current_scrubber_key'] 54 | del ajax_data['require_click'] 55 | del ajax_data['buffer'] 56 | del ajax_data['adjust_buffer'] 57 | del ajax_data['showing_esc'] 58 | del ajax_data['remove_dupes'] 59 | del ajax_data['num_visible_units'] 60 | del ajax_data['tipld'] 61 | 62 | ajax_data['query_type'] = QueryType.everything.value 63 | 64 | # datetime.datetime.fromtimestamp(1398927599) 65 | # datetime.datetime(2012,04,01,0,0).strftime('%s') 66 | tNow = datetime.datetime.now() 67 | start = datetime.date(tNow.year, tNow.month, 1) 68 | end = start + dateutil.relativedelta.relativedelta(months=1) 69 | 70 | while True: 71 | start += dateutil.relativedelta.relativedelta(months=-1) 72 | end += dateutil.relativedelta.relativedelta(months=-1) 73 | 74 | logger.info(start.strftime("%A %d %B %Y") + " to " + end.strftime("%A %d %B %Y")) 75 | 76 | ajax_data['start'] = start.strftime('%s') 77 | ajax_data['end'] = end.strftime('%s') 78 | 79 | page_counter = 0 80 | 81 | while True: 82 | ajax_data['page_index'] = page_counter 83 | payload = _get_payload(ajax_data, current_user.id, r.groups()[0], page_counter) 84 | response = browser.get(AJAX_URL + "?%s" % urllib.urlencode(payload)) 85 | doc = lxml.html.fromstring(response.text) 86 | 87 | test = doc.cssselect('script')[2].text_content() 88 | blah = regex_4real.findall(test)[0] 89 | blah = blah + "}}" 90 | yay = json.loads(blah) 91 | da_html = yay['payload']['content'].get('_segment_' + str(page_counter) + '_0_left', None) 92 | if not da_html: da_html = yay['payload']['content'].get('_segment_0_0', None) 93 | 94 | test2 = doc.cssselect('script')[4].text_content() 95 | if len(test2) > 750: 96 | blah2 = regex_4real.findall(test2)[0] 97 | blah2 = blah2 + "}}" 98 | yay2 = json.loads(blah2) 99 | da_html2 = yay2['payload']['content'].get('_segment_' + 
str(page_counter) + '_1_left', None) 100 | if not da_html2: da_html2 = yay2['payload']['content'].get('_segment_0_0', None) 101 | else: 102 | da_html2 = None 103 | 104 | if da_html: 105 | uh = lxml.html.fromstring(da_html) 106 | for el in uh.cssselect('div[role]'): 107 | print el.text_content() 108 | print "" 109 | if da_html2: 110 | uh2 = lxml.html.fromstring(da_html2) 111 | for el2 in uh2.cssselect('div[role]'): 112 | print el2.text_content() 113 | print "" 114 | 115 | if not da_html and not da_html2: 116 | break 117 | 118 | page_counter += 1 119 | 120 | # if not da_html and page_counter == 0: 121 | # pp.pprint(payload) 122 | # break 123 | 124 | if end < datetime.date(2004,1,1): 125 | break -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/feed2.py: -------------------------------------------------------------------------------- 1 | import logging, lxml.html, json, urllib, re, datetime, dateutil.relativedelta, collections, urlparse 2 | # from .models import FacebookUser, FacebookStatus 3 | 4 | from .. import public 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | BASE_URL = 'https://www.facebook.com/%s' 9 | AJAX_URL = "https://www.facebook.com/ajax/pagelet/generic.php/PagePostsSectionPagelet" 10 | regex_4real = re.compile("if \(self != top\) {parent\.require\(\"JSONPTransport\"\)\.respond\(\d+, ({.*}),\"jsmods\"", re.MULTILINE|re.DOTALL) 11 | 12 | from enum import Enum 13 | class QueryType(Enum): 14 | everything = 25 15 | highlights = 8 16 | recent = 36 17 | 18 | import pprint 19 | pp = pprint.PrettyPrinter(indent=4) 20 | 21 | def get_feed2(browser, current_user, graph_name, graph_id = None, api = None): 22 | 23 | if not graph_id: 24 | graph_id = public.get_id(graph_name) 25 | 26 | def _find_script_tag(raw_html, phrase, index): 27 | doc = lxml.html.fromstring(raw_html) 28 | script_tag = filter(lambda x: x.text_content().find(phrase) != -1, doc.cssselect('script')) 29 | if not script_tag: return None 30 | return json.loads(script_tag[index].text_content()[24:-1]) 31 | 32 | def _get_payload(ajax_data, uid, ajaxpipe_token, page): 33 | payload = collections.OrderedDict() 34 | payload['data'] = json.dumps(ajax_data) 35 | payload['__user'] = current_user.id 36 | payload['__a'] = 1 37 | payload['__dyn'] = "7n8anEAMCBynzpQ9UoHFaeExEW9J6yUgByV9GiyGGEVFLO0xBxC9V8CdBUgDyQqVayahk" 38 | payload['__req'] = "1f" 39 | payload['__rev'] = 1377599 40 | # payload['ajaxpipe'] = 1 41 | # payload['ajaxpipe_token'] = ajaxpipe_token 42 | return payload 43 | 44 | response = browser.get(BASE_URL % graph_name) 45 | cursor_tag = _find_script_tag(response.text, "PagesPostsSection", 1) 46 | if not cursor_tag: raise ValueError("couldn't find PagesPostsSection") 47 | 48 | # ajax_data = cursor_tag['jsmods']['instances'][4][2][2] 49 | # del ajax_data['post_section']['filter_after_timestamp'] 50 | 51 | regex = re.compile("{\"ajaxpipe_token\":\"(.*)\",\"lhsh\":\"(.*)\"}") 52 | r = regex.search(response.text) 53 | 54 | # datetime.datetime.fromtimestamp(1398927599) 55 | # datetime.datetime(2012,04,01,0,0).strftime('%s') 56 | tNow = datetime.datetime.now() 57 | start = datetime.date(tNow.year, tNow.month, 1) 58 | end = start + dateutil.relativedelta.relativedelta(months=1) 59 | 60 | while True: 61 | 62 | print start.strftime("%A %d %B %Y") + " to " + end.strftime("%A %d %B %Y") 63 | 64 | page_counter = 0 65 | 66 | while True: 67 | 68 | segment_counter = 0 69 | 70 | while True: 71 | 72 | ajax_data = collections.OrderedDict() 73 | ajax_data['segment_index'] = segment_counter 74 | 
ajax_data['page_index'] = page_counter 75 | ajax_data['page'] = graph_id 76 | ajax_data['column'] = "main" 77 | ajax_data['post_section'] = collections.OrderedDict() 78 | ajax_data['post_section']['profile_id'] = graph_id 79 | ajax_data['post_section']['start'] = start.strftime('%s') 80 | ajax_data['post_section']['end'] = end.strftime('%s') 81 | ajax_data['post_section']['query_type'] = QueryType.everything.value 82 | ajax_data['post_section']['filter'] = 1 83 | ajax_data['post_section']['is_pages_redesign'] = True 84 | ajax_data['section_index'] = 0 85 | ajax_data['hidden'] = False 86 | # ajax_data['posts_loaded'] = posts_counter 87 | ajax_data['show_all_posts'] = True 88 | 89 | payload = _get_payload(ajax_data, current_user.id, r.groups()[0], page_counter) 90 | response = browser.get(AJAX_URL + "?%s" % urllib.urlencode(payload)) 91 | 92 | data = json.loads(response.content[9:]) 93 | if not data['payload']: 94 | # print page_counter, segment_counter, "No Results" 95 | break 96 | doc = lxml.html.fromstring(data['payload']) 97 | # pp.pprint(dict(ajax_data.items())) 98 | for article in doc.cssselect("div[role]"): 99 | 100 | heading = article.cssselect('h5')[0].text_content().strip() 101 | if not heading == "Killed By Police shared a link.": continue 102 | 103 | text = article.cssselect('.userContent')[0].text_content() 104 | relative_time_posted = article.cssselect('.uiLinkSubtle')[0].text_content() 105 | fb_url = article.cssselect('.uiLinkSubtle')[0].get('href') 106 | raw_url = article.cssselect('h5 a[onmouseover]')[0].get('href') 107 | real_url = urlparse.parse_qs(urlparse.urlparse(raw_url).query)['u'][0] 108 | 109 | print text + "\t" + fb_url + "\t" + real_url 110 | 111 | 112 | segment_counter += 1 113 | 114 | start += dateutil.relativedelta.relativedelta(months=-1) 115 | end += dateutil.relativedelta.relativedelta(months=-1) 116 | 117 | if end < datetime.date(2004,1,1): 118 | break 119 | -------------------------------------------------------------------------------- /socialscraper/facebook/public/__init__.py: -------------------------------------------------------------------------------- 1 | import logging, requests, json, re, pdb 2 | from bs4 import BeautifulSoup 3 | from ..models import FacebookUser, FacebookPage 4 | from ...base import ScrapingError 5 | 6 | regex = re.compile("https:\/\/www.facebook.com\/(.*)") 7 | regex2 = re.compile("https:\/\/www.facebook.com\/profile.php\?id=(.*)\&ref") 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | """ 12 | Getting the id using the public method can get less data than ideal. 13 | 14 | Pages dealing with alcohol cannot be retrieved via the public method. 15 | For example: https://www.facebook.com/zeitgeistusn 16 | 17 | The graphapi can be helpful here. Using a user access token (or perhaps even an app 18 | access token) we can get the id, name, and other attributes (in the same format). 
19 | 20 | """ 21 | 22 | def get_id(graph_name): 23 | """Get the graph ID given a name.""" 24 | get_response = lambda : requests.get('https://graph.facebook.com/' + graph_name) 25 | response = get_response() 26 | counter = 0 27 | while response.status_code == 400 and counter < 3: 28 | response = get_response() 29 | counter += 1 30 | id = json.loads(response.text).get('id', None) 31 | return int(id) if id else None 32 | 33 | def get_name(graph_id): 34 | """Get the graph name given a graph ID.""" 35 | response = requests.get('https://graph.facebook.com/' + graph_id) 36 | name = json.loads(response.text).get('name', None) 37 | return name 38 | 39 | def get_attribute(graph_obj,attribute): 40 | """Get an attribute given a graph_name or graph_id.""" 41 | response = requests.get('https://graph.facebook.com/' + graph_obj) 42 | name = json.loads(response.text).get(attribute, None) 43 | return name 44 | 45 | def get_attributes(graph_obj,attributes): 46 | """Get multiple attributes given a graph_name or graph_id.""" 47 | ret_attributes = [] 48 | response = requests.get('https://graph.facebook.com/' + graph_obj) 49 | data = json.loads(response.text) 50 | for attribute in attributes: 51 | ret_attributes.append(data.get(attribute, None)) 52 | return ret_attributes 53 | 54 | regex1 = re.compile("^https:\/\/www.facebook.com\/([^?\n]+)(?:\?(?:f)?ref.*)?$") 55 | regex2 = re.compile("https:\/\/www.facebook.com\/profile.php\?id=(.*)\&(?:f)?ref") 56 | regex3 = re.compile("\/groups\/(.*)\/.*") 57 | def parse_url(url): 58 | # fix this via regex 59 | # url = url.replace("?fref=pb&hc_location=profile_browser", "") 60 | # url = url.replace("?fref=pb&hc_location=friends_tab", "") 61 | url = url.replace("=pb&hc_location=profile_browser", "") 62 | url = url.replace("=pb&hc_location=friends_tab", "") 63 | regex_result = regex1.findall(url) 64 | if not regex_result: 65 | regex_result = regex3.findall(url) 66 | if regex_result: 67 | username = regex_result[0] 68 | if username == None: raise ValueError("No username was parsed %s" % url) 69 | if 'pages/' in username: 70 | username = username.split('/')[-1] 71 | else: # old style user that doesn't have username, only uid 72 | regex_result2 = regex2.findall(url) 73 | if not regex_result2: raise ValueError("URL not parseable %s" % url) 74 | username = regex_result2[0] 75 | 76 | return username 77 | 78 | def get_pages_liked(username): 79 | url = "https://www.facebook.com/%s/likes" % username 80 | headers = { 81 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36', 82 | 'Accept': 'accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 83 | 'Accept-Encoding': 'gzip,deflate,sdch', 84 | 'Accept-Language': 'en-US,en;q=0.8,nb;q=0.6', 85 | 'Cache-Control': 'max-age=0' 86 | } 87 | resp = requests.get(url, headers = headers) 88 | 89 | if "Security Check" in resp.text: 90 | # pdb.set_trace() 91 | raise ScrapingError("Security Check") 92 | 93 | html = re.sub(r'()',' ',resp.text) 94 | soup = BeautifulSoup(html) 95 | 96 | container = soup.findAll("div",["timelineFavorites"]) 97 | if container: 98 | container = container[0] 99 | 100 | for link in container.findAll('a','mediaRowItem'): 101 | print "link: %s" % link 102 | username = parse_url(link['href']) 103 | try: 104 | link['class'] 105 | # @TODO: return facebook page instead 106 | yield { 'link': link['href'], 107 | 'name': link.text, 108 | 'username': username, 109 | 'uid': get_attribute(username,'id'), 110 | 'num_likes': 
get_attribute(username,'likes'), 111 | 'talking_about_count': get_attribute(username,'talking_about_count'), 112 | 'hometown': get_attribute(username,'hometown') } 113 | except KeyError: 114 | pass 115 | 116 | for link in container.findAll('a'): 117 | print "link: %s" % link 118 | try: 119 | link['class'] 120 | except KeyError: 121 | try: 122 | username = parse_url(link['href']) 123 | yield { 'link': link['href'], 124 | 'name': link.text, 125 | 'username': username, 126 | 'uid': get_attribute(username,'id'), 127 | 'num_likes': get_attribute(username,'likes'), 128 | 'talking_about_count': get_attribute(username,'talking_about_count'), 129 | 'hometown': get_attribute(username,'hometown') } 130 | except ValueError: 131 | continue 132 | else: 133 | # pdb.set_trace() 134 | raise ScrapingError("User %s has no likes or tight privacy settings." % username) 135 | -------------------------------------------------------------------------------- /socialscraper/facebook/models.py: -------------------------------------------------------------------------------- 1 | from ..base import BaseModel, Column 2 | 3 | class FacebookUser(BaseModel): 4 | __tablename__ = "facebook_users" 5 | __attrs__ = [ 6 | 'uid', 'username', 'email', 'birthday', 7 | 'name', 'locale', 'profile_url', 'sex', 8 | 'college', 'employer', 'highschool', 'currentcity', 9 | 'hometown', 'misc', 'data', 'donor', 'contact_time', 10 | 'scrape_status', 'nu' 11 | ] 12 | 13 | uid = Column("uid", "BigInteger", primary_key=True) 14 | username = Column("username") 15 | email = Column("email") 16 | birthday = Column("birthday", "Date") 17 | name = Column("name") 18 | locale = Column("locale") 19 | profile_url = Column("profile_url") 20 | sex = Column("sex") 21 | 22 | college = Column("college") 23 | employer = Column("employer") 24 | highschool = Column("highschool") 25 | currentcity = Column("currentcity") 26 | hometown = Column("hometown") 27 | misc = Column("misc") 28 | data = Column("data") 29 | donor = Column("donor", "String") 30 | contact_time = Column("contact_time") 31 | scrape_status = Column("scrape_status", "Integer") # empty = not attempted, 0 = can't get likes, 1 = scrape in progress, 2 = scrape finished 32 | nu = Column("nu", "Integer") 33 | 34 | class FacebookFamily(BaseModel): 35 | __tablename__ = "facebook_families" 36 | __attrs__ = ['profile_id', 'relationship', 'uid', 'name'] 37 | 38 | profile_id = Column("profile_id", "BigInteger", primary_key=True, foreign_key=True, foreign_key_reference="facebook_users.uid") 39 | relationship = Column("relationship") 40 | uid = Column("uid","BigInteger", primary_key=True, foreign_key=True, foreign_key_reference="facebook_users.uid") # foreign key 41 | name = Column("name") 42 | 43 | class FacebookPage(BaseModel): 44 | __tablename__ = "facebook_pages" 45 | __attrs__ = [ 46 | 'about', 'username', 'page_id', 'is_verified', 47 | 'keywords', 'name', 'url', 'type', 'num_likes', 48 | 'talking_about_count', 'hometown', 'misc', 'data' 49 | ] 50 | 51 | about = Column("about","Text") 52 | username = Column("username") 53 | page_id = Column("page_id","BigInteger", primary_key=True) # primary key 54 | is_verified = Column("is_verified","Boolean") 55 | keywords = Column("keywords") 56 | # location = Column("location","BigInteger", foreign_key=True, foreign_key_reference="facebook_locations.loc_id") # foreign key 57 | name = Column("name") 58 | url = Column("url") 59 | type = Column("type") 60 | num_likes = Column("num_likes","BigInteger") 61 | talking_about_count = Column("talking_about_count", "BigInteger") 62 | hometown = 
Column("hometown") 63 | misc = Column("misc") 64 | data = Column("data") 65 | 66 | class FacebookStatus(BaseModel): 67 | __tablename__ = "facebook_statuses" 68 | __attrs__ = ['like_count', 'message', 'status_id', 'uid', 'time'] 69 | 70 | like_count = Column("like_count","Integer") 71 | message = Column("message","Text") 72 | status_id = Column("status_id","BigInteger", primary_key=True) 73 | uid = Column("uid","BigInteger") 74 | time = Column("time","Date") 75 | 76 | class FacebookLocation(BaseModel): 77 | __tablename__ = "facebook_locations" 78 | __attrs__ = [ 79 | 'gid', 'loc_id', 'street', 'city', 80 | 'state', 'country', 'zip', 'address', 81 | 'latitude', 'longitude', 'name' 82 | ] 83 | 84 | gid = Column("gid", "BigInteger") 85 | loc_id = Column("loc_id", "BigInteger", primary_key=True) 86 | street = Column("street") 87 | city = Column("city") 88 | state = Column("state") 89 | country = Column("country") 90 | zip = Column("zip") 91 | address = Column("address") 92 | latitude = Column("latitude") 93 | longitude = Column("longitude") 94 | name = Column("name") 95 | 96 | class FacebookGroup(BaseModel): 97 | __tablename__ = "facebook_groups" 98 | __attrs__ = ['group_id', 'username', 'url', 'name', 'description', 'icon', 'privacy', 'status'] 99 | 100 | group_id = Column("group_id", "BigInteger", primary_key=True) 101 | username = Column("username") 102 | url = Column("url") 103 | name = Column("name") 104 | description = Column("description") 105 | icon = Column("icon") 106 | privacy = Column("privacy") 107 | status = Column("status") 108 | 109 | ######################################## Join Tables ######################################## 110 | 111 | class FacebookFriend(BaseModel): 112 | __tablename__ = "facebook_friends" 113 | __attrs__ = ['uid1', 'uid2'] 114 | 115 | uid1 = Column("uid1", "BigInteger", primary_key=True, unique=False, foreign_key=True, foreign_key_reference="facebook_users.uid") 116 | uid2 = Column("uid2", "BigInteger", primary_key=True, unique=False, foreign_key=True, foreign_key_reference="facebook_users.uid") 117 | 118 | 119 | class FacebookPagesUsers(BaseModel): 120 | __tablename__ = "facebook_pages_users" 121 | __attrs__ = ['uid', 'page_id', 'type'] 122 | 123 | uid = Column("uid", "BigInteger", primary_key=True, unique=False, foreign_key=True, foreign_key_reference="facebook_users.uid") 124 | page_id = Column("page_id", "BigInteger", primary_key=True, unique=False, foreign_key=True, foreign_key_reference="facebook_pages.page_id") 125 | type = Column("type") 126 | 127 | class FacebookGroupsUsers(BaseModel): 128 | __tablename__ = "facebook_groups_users" 129 | __attrs__ = ['uid', 'group_id'] 130 | 131 | uid = Column("uid", "BigInteger", primary_key=True, unique=False, foreign_key=True, foreign_key_reference="facebook_users.uid") 132 | group_id = Column("group_id", "BigInteger", primary_key=True, unique=False, foreign_key=True, foreign_key_reference="facebook_groups.group_id") 133 | 134 | __all__ = [ 135 | 'FacebookUser', 136 | 'FacebookFamily', 137 | 'FacebookPage', 138 | 'FacebookStatus', 139 | 'FacebookLocation', 140 | 'FacebookFriend', 141 | 'FacebookPagesUsers', 142 | 'FacebookGroup', 143 | 'FacebookGroupsUsers' 144 | ] 145 | 146 | -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/friends.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re, json, lxml, urllib 4 | from bs4 import BeautifulSoup 5 | from ...base import 
ScrapingError 6 | from ..models import FacebookUser 7 | 8 | from .. import graphapi, public 9 | 10 | # AJAX_URL = "https://www.facebook.com/ajax/pagelet/generic.php/ManualCurationOGGridCollectionPagelet" 11 | AJAX_URL = "https://www.facebook.com/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet" 12 | FRIENDS_URL = "https://www.facebook.com/%s/%s" 13 | FRIENDS_URL2 = "https://www.facebook.com/profile.php?id=%s&sk=%s" 14 | 15 | def get_id(browser, current_user, graph_name, graph_id=None, api=None): 16 | response = browser.get("https://www.facebook.com/%s" % graph_name) 17 | return re.search(r'{"profile_owner":"(\d+)","ref":"timeline:timeline"}', response.text).groups()[0] 18 | 19 | def get_friends(browser, current_user, graph_name, graph_id = None, api = None): 20 | 21 | def _find_script_tag(raw_html, phrase): 22 | doc = lxml.html.fromstring(raw_html) 23 | script_tag = filter(lambda x: x.text_content().find(phrase) != -1, doc.cssselect('script')) 24 | if not script_tag: return None 25 | return json.loads(script_tag[0].text_content()[24:-1]) 26 | 27 | def _parse_cursor_data(raw_json): 28 | if raw_json.get('error'): raise ScrapingError(raw_json.get('errorDescription')) 29 | require = raw_json['jsmods']['require'] 30 | tester = lambda x: x[0] == "TimelineAppCollection" and x[1] == "enableContentLoader" 31 | cursor_parameter = map(lambda x: [x[3][0].replace("pagelet_timeline_app_collection_", ""), x[3][2]], filter(tester, require)) 32 | return cursor_parameter 33 | 34 | def _get_payload(ajax_data, uid): 35 | return { 36 | 'data': json.dumps(ajax_data), 37 | '__user': uid, 38 | '__a': 1, 39 | '__req': 'h', 40 | '__dyn': '7n8ajEyl2qmumdDgDxyKBgWDxi9ACxO4oKA8ABGeqrWo8popyUWdDx24QqUkBBzEy78S8zU', 41 | '__rev': 1505336 42 | } 43 | 44 | def _result_to_model(result): 45 | 46 | url = result[0] 47 | name = result[1] 48 | uid = result[2] 49 | 50 | username = public.parse_url(url) 51 | 52 | # import pdb; pdb.set_trace() 53 | 54 | # if api: 55 | # uid, category = graphapi.get_attributes(api, username, ["id", "category"]) 56 | # else: 57 | # uid = get_id(browser, current_user, username) 58 | # # uid, category = public.get_attributes(username, ["id", "category"]) 59 | 60 | if uid == None: 61 | print "Couldn't find UID of %s" % username 62 | raise ValueError("Couldn't find uid of %s" % username) 63 | 64 | uid = int(uid) if uid else None 65 | 66 | return FacebookUser(uid=uid, username=username, url=url, name=name) 67 | 68 | if re.match(r'^\d+$', graph_name): 69 | response = browser.get(FRIENDS_URL2 % (graph_name, "friends_all")) 70 | else: 71 | response = browser.get(FRIENDS_URL % (graph_name, "friends_all")) 72 | 73 | soup = BeautifulSoup(response.content.replace('','')) 74 | # print response.content 75 | 76 | for link in soup.findAll('a'): 77 | try: 78 | if 'eng_type' in link['data-gt']: 79 | 80 | url = link['href'] 81 | name = link.text 82 | uid = re.search(r'\/ajax\/hovercard\/user.php\?id=(\d+)&', link['data-hovercard']).groups()[0] 83 | result = (url, name, uid) 84 | 85 | # print result 86 | 87 | try: 88 | yield _result_to_model(result) 89 | except ValueError: 90 | continue 91 | 92 | except KeyError: 93 | continue 94 | 95 | cursor_tag = _find_script_tag(response.text, "enableContentLoader") 96 | cursor_data = _parse_cursor_data(cursor_tag) if cursor_tag else None 97 | 98 | if not cursor_data: return 99 | 100 | ajax_data = { 101 | 'collection_token': cursor_data[0][0], 102 | 'cursor': cursor_data[0][1], 103 | 'profile_id': int(cursor_data[0][0].split(':')[0]), 104 | 'tab_key': "friends_all", 105 | 'overview': 
'false', 106 | 'sk': 'likes', 107 | 'ftid': 'null', 108 | 'order': 'null', 109 | 'importer_state': 'null' 110 | } 111 | 112 | while True: 113 | 114 | # print ajax_data 115 | 116 | payload = _get_payload(ajax_data, current_user.id) 117 | response = browser.get(AJAX_URL + "?%s" % urllib.urlencode(payload)) 118 | 119 | # PARSE PAGE 120 | 121 | data = json.loads(response.content[9:]) 122 | soup = BeautifulSoup(data['payload']) 123 | for link in soup.findAll('a'): 124 | try: 125 | if 'eng_type' in link['data-gt']: 126 | url = link['href'] 127 | name = link.text 128 | uid = re.search(r'\/ajax\/hovercard\/user.php\?id=(\d+)&', link['data-hovercard']).groups()[0] 129 | result = (url, name, uid) 130 | 131 | # print result 132 | 133 | try: 134 | yield _result_to_model(result) 135 | except ValueError: 136 | continue 137 | 138 | except KeyError: 139 | continue 140 | 141 | # import pdb; pdb.set_trace() 142 | 143 | # FIND NEXT CURSOR 144 | 145 | regex = re.compile("href=\\\\\"(.*?)\"") 146 | 147 | tester = lambda x: x.find('next_cursor') != -1 148 | thing = regex.findall(response.text) 149 | thing2 = filter(tester, thing) 150 | 151 | # NO NEXT CURSOR FOUND 152 | 153 | if not thing2: break 154 | 155 | regex2 = re.compile("next_cursor=(.*)") 156 | new_cursor = regex2.findall(thing2[0])[0].replace("\\u00253D\\", "=").replace("u00253D\\", "=") 157 | ajax_data['cursor'] = new_cursor 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/likes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re, json, lxml, urllib 4 | from bs4 import BeautifulSoup 5 | from ...base import ScrapingError 6 | from ..models import FacebookPage 7 | 8 | from .. import graphapi, public 9 | 10 | # AJAX_URL = "https://www.facebook.com/ajax/pagelet/generic.php/ManualCurationOGGridCollectionPagelet" 11 | AJAX_URL = "https://www.facebook.com/ajax/pagelet/generic.php/LikesWithFollowCollectionPagelet" 12 | LIKES_URL = "https://www.facebook.com/%s/%s" 13 | 14 | def get_likes(browser, current_user, graph_name, graph_id = None, api = None): 15 | 16 | def _find_script_tag(raw_html, phrase): 17 | doc = lxml.html.fromstring(raw_html) 18 | script_tag = filter(lambda x: x.text_content().find(phrase) != -1, doc.cssselect('script')) 19 | if not script_tag: return None 20 | return json.loads(script_tag[0].text_content()[24:-1]) 21 | 22 | def _parse_cursor_data(raw_json): 23 | if raw_json.get('error'): raise ScrapingError(raw_json.get('errorDescription')) 24 | require = raw_json['jsmods']['require'] 25 | tester = lambda x: x[0] == "TimelineAppCollection" and x[1] == "enableContentLoader" 26 | cursor_parameter = map(lambda x: [x[3][0].replace("pagelet_timeline_app_collection_", ""), x[3][2]], filter(tester, require)) 27 | return cursor_parameter 28 | 29 | def _get_payload(ajax_data, uid): 30 | return { 31 | 'data': json.dumps(ajax_data), 32 | '__user': uid, 33 | '__a': 1, 34 | '__req': 'o', 35 | '__dyn': '7n8ajEyl2qmumdDgDxyKBgWDxi9ACxO4oKA8ABGeqrWo8popyUWdDx24QqUkBBzEy78S8zU', 36 | '__rev': 1505336 37 | } 38 | 39 | def _result_to_model(result): 40 | 41 | url = result[0] 42 | name = result[1] 43 | 44 | username = public.parse_url(url) 45 | 46 | if api: 47 | page_id, category = graphapi.get_attributes(api, username, ["id", "category"]) 48 | else: 49 | page_id, category = public.get_attributes(username, ["id", "category"]) 50 | 51 | if page_id == None: 52 | print "Couldn't find page_id of %s" % username 53 | 
raise ValueError("Couldn't find page_id of %s" % username) 54 | 55 | page_id = int(page_id) if page_id else None 56 | 57 | return FacebookPage(page_id=page_id, username=username, url=url, name=name, type=category) 58 | 59 | response = browser.get(LIKES_URL % (graph_name, 'likes')) 60 | soup = BeautifulSoup(response.content.replace('','')) 61 | 62 | CURRENT_LIKES_TYPES = [] 63 | 64 | try: 65 | for x in soup.findAll('div', {'aria-role': 'tablist'})[0]: 66 | if 'People' in x.text: CURRENT_LIKES_TYPES.append('likes_people') 67 | elif 'Restaurants' in x.text: CURRENT_LIKES_TYPES.append('likes_restaurants') 68 | elif 'Sports' in x.text: CURRENT_LIKES_TYPES.append('likes_sports') 69 | elif 'Clothing' in x.text: CURRENT_LIKES_TYPES.append('likes_clothing') 70 | elif 'Other' in x.text: CURRENT_LIKES_TYPES.append('likes_other') 71 | except IndexError: 72 | raise ScrapingError("No likes for username %s" % graph_name) 73 | 74 | for likes_type in CURRENT_LIKES_TYPES: 75 | response = browser.get(LIKES_URL % (graph_name, likes_type)) 76 | 77 | soup = BeautifulSoup(response.content.replace('','')) 78 | # print response.content 79 | 80 | for link in soup.findAll('a'): 81 | try: 82 | if 'eng_type' in link['data-gt']: 83 | 84 | url = link['href'] 85 | name = link.text 86 | result = (url, name) 87 | 88 | try: 89 | yield _result_to_model(result) 90 | except ValueError: 91 | continue 92 | 93 | except KeyError: 94 | continue 95 | 96 | cursor_tag = _find_script_tag(response.text, "enableContentLoader") 97 | cursor_data = _parse_cursor_data(cursor_tag) if cursor_tag else None 98 | 99 | if not cursor_data: continue 100 | 101 | ajax_data = { 102 | 'collection_token': cursor_data[0][0], 103 | 'cursor': cursor_data[0][1], 104 | 'profile_id': int(cursor_data[0][0].split(':')[0]), 105 | 'tab_key': likes_type, 106 | 'overview': 'false', 107 | 'sk': 'likes', 108 | 'ftid': 'null', 109 | 'order': 'null', 110 | 'importer_state': 'null' 111 | } 112 | 113 | while True: 114 | 115 | # print ajax_data 116 | 117 | payload = _get_payload(ajax_data, current_user.id) 118 | response = browser.get(AJAX_URL + "?%s" % urllib.urlencode(payload)) 119 | 120 | # PARSE PAGE 121 | 122 | data = json.loads(response.content[9:]) 123 | soup = BeautifulSoup(data['payload']) 124 | for link in soup.findAll('a'): 125 | try: 126 | if 'eng_type' in link['data-gt']: 127 | url = link['href'] 128 | name = link.text 129 | result = (url, name) 130 | 131 | try: 132 | yield _result_to_model(result) 133 | except ValueError: 134 | continue 135 | 136 | except KeyError: 137 | continue 138 | 139 | # FIND NEXT CURSOR 140 | 141 | regex = re.compile("href=\\\\\"(.*?)\"") 142 | 143 | tester = lambda x: x.find('next_cursor') != -1 144 | thing = regex.findall(response.text) 145 | thing2 = filter(tester, thing) 146 | 147 | # NO NEXT CURSOR FOUND 148 | 149 | if not thing2: break 150 | 151 | regex2 = re.compile("next_cursor=(.*)") 152 | new_cursor = regex2.findall(thing2[0])[0].replace("\\u00253D\\", "=").replace("u00253D\\", "=") 153 | 154 | ajax_data['cursor'] = new_cursor 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /socialscraper/adapters/adapter_sqlalchemy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | from datetime import datetime 5 | from .. 
import facebook, twitter 6 | 7 | import sqlalchemy 8 | 9 | # try: 10 | # import sqlalchemy 11 | # except ImportError: 12 | # raise Exception("You can't use the sqlalchemy adapter without installing sqlalchemy!") 13 | 14 | from sqlalchemy import Table, MetaData, Column, ForeignKey, Integer, String, BigInteger, Date, Text, Boolean, Float 15 | from sqlalchemy.orm import relationship, backref 16 | from sqlalchemy import select 17 | 18 | class BaseSQLModel(object): 19 | 20 | def to_json(self): 21 | d = {} 22 | for column in self.__table__.columns: 23 | val = getattr(self, column.name) 24 | d[column.name] = val 25 | return d 26 | 27 | def make_models(db, base_classes): 28 | 29 | """ 30 | base_classes = (db.Model, BaseModel) 31 | make_models(base_classes) 32 | """ 33 | 34 | def get_model_properties(model): 35 | properties = {} 36 | pkeys = [] 37 | for col in model.get_columns(): 38 | if col.foreign_key: 39 | properties[col.name] = Column(col.name, eval(col.type), ForeignKey(col.foreign_key_reference), primary_key=col.primary_key, unique=col.unique) 40 | else: 41 | properties[col.name] = Column(col.name, eval(col.type), primary_key=col.primary_key, unique=col.unique) 42 | if col.primary_key: 43 | pkeys.append(col.name) 44 | properties['__tablename__'] = model.__tablename__ 45 | 46 | return properties 47 | 48 | base_classes = base_classes + (BaseSQLModel,) 49 | 50 | FacebookUser = type('FacebookUser', base_classes, get_model_properties(facebook.models.FacebookUser)) 51 | FacebookFamily = type('FacebookFamily', base_classes, get_model_properties(facebook.models.FacebookFamily)) 52 | FacebookLocation = type('FacebookLocation', base_classes, get_model_properties(facebook.models.FacebookLocation)) 53 | FacebookFriend = type('FacebookFriend', base_classes, get_model_properties(facebook.models.FacebookFriend)) 54 | FacebookPage = type('FacebookPage', base_classes, get_model_properties(facebook.models.FacebookPage)) 55 | FacebookStatus = type('FacebookStatus', base_classes, get_model_properties(facebook.models.FacebookStatus)) 56 | FacebookGroup = type('FacebookGroup', base_classes, get_model_properties(facebook.models.FacebookGroup)) 57 | FacebookPagesUsers = type('FacebookPagesUsers', base_classes, get_model_properties(facebook.models.FacebookPagesUsers)) 58 | FacebookGroupsUsers = type('FacebookGroupsUsers', base_classes, get_model_properties(facebook.models.FacebookGroupsUsers)) 59 | 60 | TwitterUser = type('TwitterUser', base_classes, get_model_properties(twitter.models.TwitterUser)) 61 | TwitterTweet = type('TwitterTweet', base_classes, get_model_properties(twitter.models.TwitterTweet)) 62 | 63 | FacebookUser.pages = relationship('FacebookPage', secondary=FacebookPagesUsers.__table__) 64 | FacebookPage.users = relationship('FacebookUser', secondary=FacebookPagesUsers.__table__) 65 | 66 | FacebookUser.groups = relationship('FacebookGroup', secondary=FacebookGroupsUsers.__table__) 67 | FacebookGroup.users = relationship('FacebookUser', secondary=FacebookGroupsUsers.__table__) 68 | 69 | # http://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-viii-followers-contacts-and-friends 70 | FacebookUser.friends = relationship('FacebookUser', 71 | secondary = FacebookFriend.__table__, 72 | primaryjoin = (FacebookFriend.__table__.c.uid1 == FacebookUser.uid), 73 | secondaryjoin = (FacebookFriend.__table__.c.uid2 == FacebookUser.uid), 74 | backref = backref('_friends', lazy = 'dynamic'), 75 | lazy = 'dynamic' 76 | ) 77 | 78 | # 
http://stackoverflow.com/questions/9116924/how-can-i-achieve-a-self-referencing-many-to-many-relationship-on-the-sqlalchemy 79 | friendship_union = select([FacebookFriend.__table__.c.uid1, FacebookFriend.__table__.c.uid2]). \ 80 | union(select([FacebookFriend.__table__.c.uid2, FacebookFriend.__table__.c.uid1])).alias() 81 | 82 | FacebookUser.all_friends = relationship('FacebookUser', 83 | secondary=friendship_union, 84 | primaryjoin=FacebookUser.uid==friendship_union.c.uid1, 85 | secondaryjoin=FacebookUser.uid==friendship_union.c.uid2, 86 | viewonly=True, 87 | lazy = 'dynamic' 88 | ) 89 | 90 | def friend(self, user): 91 | if not self.is_friend(user): 92 | self.friends.append(user) 93 | return self 94 | 95 | def unfriend(self, user): 96 | if self.is_friend(user): 97 | self.friends.remove(user) 98 | return self 99 | 100 | def is_friend(self, user): 101 | return self.friends.filter(FacebookFriend.__table__.c.uid2 == user.uid).count() > 0 102 | 103 | FacebookUser.friend = friend 104 | FacebookUser.unfriend = unfriend 105 | FacebookUser.is_friend = is_friend 106 | 107 | # FacebookUser.locations = relationship('FacebookLocation') uid -> gid 108 | # FacebookPage.locations = relationship('FacebookLocation') page_id -> gid 109 | 110 | def to_json(self): 111 | dic = super(FacebookUser,self).to_json() 112 | dic['pages'] = [pg.to_json() for pg in self.pages] 113 | dic['locations'] = [loc.to_json() for loc in self.locations] 114 | return dic 115 | 116 | FacebookUser.to_json = to_json 117 | 118 | return { 119 | 'FacebookUser': FacebookUser, 120 | 'FacebookFamily': FacebookFamily, 121 | 'FacebookLocation': FacebookLocation, 122 | 'FacebookFriend': FacebookFriend, 123 | 'FacebookPage': FacebookPage, 124 | 'FacebookGroup': FacebookGroup, 125 | 'FacebookStatus': FacebookStatus, 126 | 'FacebookPagesUsers': FacebookPagesUsers, 127 | 'FacebookGroupsUsers': FacebookGroupsUsers, 128 | 'TwitterUser': TwitterUser, 129 | 'TwitterTweet': TwitterTweet 130 | } 131 | 132 | def convert_result(sqlalchemymodel, socialscrapermodel): 133 | for col in socialscrapermodel.get_columns(): 134 | if not getattr(sqlalchemymodel, col.name): 135 | setattr(sqlalchemymodel, col.name, getattr(socialscrapermodel, col.name)) 136 | 137 | -------------------------------------------------------------------------------- /socialscraper/facebook/auth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging, lxml.html, re, sys 4 | from ..base import ScrapingError 5 | 6 | # logging.basicConfig(level=logging.DEBUG) 7 | logger = logging.getLogger(__name__) 8 | 9 | FACEBOOK_MOBILE_URL = 'https://m.facebook.com' 10 | MOBILE_LOGIN_URL = FACEBOOK_MOBILE_URL + '/login.php' 11 | MOBILE_PROFILE_URL = FACEBOOK_MOBILE_URL + '/profile.php' 12 | MOBILE_CHECKPOINT_URL = FACEBOOK_MOBILE_URL + '/login/checkpoint/' 13 | 14 | FACEBOOK_DESKTOP_URL = 'https://www.facebook.com' 15 | DESKTOP_PROFILE_URL = FACEBOOK_DESKTOP_URL + '/profile.php' 16 | 17 | INPUT_ERROR = ["We didn't recognize your email address or phone number."] 18 | 19 | REVIEW_RECENT_LOGIN_CONTINUE = [ 20 | "Review Recent Login", 21 | "Someone recently tried to log into your account from an unknown browser. " + 22 | "Please review this login." 
23 | ] 24 | 25 | REVIEW_RECENT_LOGIN_OKAY = [ 26 | "Review Recent Login", 27 | "Login near", 28 | "from", 29 | "This is Okay", 30 | "I don't recognize" 31 | ] 32 | 33 | REMEMBER_BROWSER = [ 34 | "Remember Browser", 35 | "You have already saved the maximum number of computers for your account. " + 36 | "To remove existing computers, please visit your Account Settings after you login. " + 37 | "For now, please save this browser." 38 | ] 39 | 40 | REMEMBER_BROWSER2 = [ 41 | "Remember Browser", 42 | "Please save the browser you just verified." 43 | ] 44 | 45 | REMEMBER_BROWSER3 = [ 46 | "Remember Browser", 47 | "Please save this", 48 | "browser if you use it often." 49 | ] 50 | 51 | LOGGED_IN = [ 52 | "Home", 53 | "Messages", 54 | "Notifications", 55 | "Chat", 56 | "Friends", 57 | "logout.php" 58 | ] 59 | 60 | LOCKED = [ 61 | "Your Account is Temporarily Locked", 62 | "detected suspicious activity coming from your IP address" 63 | ] 64 | 65 | SECURITY_CHECK = [ 66 | "Security Check", 67 | "Please enter the text below" 68 | ] 69 | 70 | PHONE_CONFIRMATION = [ 71 | "Phone Number Confirmation" 72 | ] 73 | 74 | def state(response_text, test_strings): 75 | return all(s in response_text for s in test_strings) 76 | 77 | def login(browser, email, password, username=None): 78 | """ 79 | 80 | Facebook Login 81 | 82 | browser: non-authenticated requests session 83 | email: email used to log in to Facebook 84 | password: password used to log into Facebook 85 | username: (optional) if not supplied, it will be found from the PROFILE_URL 86 | 87 | Given a requests session, email, and password, authenticate the session. 88 | Returns authenticated user's username if not given. 89 | 90 | Because logging into Facebook can be relatively non-deterministic based on 91 | how often the account has been used, how many friends it has, how recently 92 | it was created, etc. I created a simple state machine and listed the states 93 | above. 94 | 95 | It's very easy to add new states, they are based on strings that are found 96 | on the resulting page. 97 | 98 | """ 99 | 100 | logger.info("Begin Facebook Authentication") 101 | response = browser.get(FACEBOOK_MOBILE_URL, timeout=1) 102 | logger.debug('Loaded Facebook Mobile Browser') 103 | payload = {'email': email, 'pass': password} 104 | response = browser.post(MOBILE_LOGIN_URL , data=payload) 105 | logger.debug('Initial Login') 106 | 107 | if "Your password was incorrect." 
in response.text: 108 | raise ScrapingError("Your password was incorrect.") 109 | 110 | def get_base_payload(response_content): 111 | doc = lxml.html.fromstring(response_content) 112 | return { 113 | 'lsd': doc.cssselect("input[name=lsd]")[0].get('value'), 114 | 'charset_test': doc.cssselect("input[name=charset_test]")[0].get('value'), 115 | 'nh': doc.cssselect("input[name=nh]")[0].get('value') 116 | } 117 | 118 | if not state(response.text, LOGGED_IN) and not state(response.text, PHONE_CONFIRMATION): 119 | base_payload = get_base_payload(response.content) 120 | 121 | while not state(response.text, LOGGED_IN): 122 | 123 | if state(response.text, INPUT_ERROR): 124 | raise ScrapingError("We didn't recognize your email address or phone number.") 125 | elif state(response.text, REVIEW_RECENT_LOGIN_CONTINUE): 126 | payload = { 'submit[Continue]': 'Continue' } 127 | payload.update(base_payload) 128 | response = browser.post(MOBILE_CHECKPOINT_URL, data=payload) 129 | logger.debug('Review Recent Login -- Click Continue') 130 | elif state(response.text, REVIEW_RECENT_LOGIN_OKAY): 131 | payload = { 'submit[This is Okay]': 'This is Okay' } 132 | payload.update(base_payload) 133 | response = browser.post(MOBILE_CHECKPOINT_URL, data=payload) 134 | logger.debug('Review Recent Login -- Click Okay') 135 | elif state(response.text, REMEMBER_BROWSER) or state(response.text, REMEMBER_BROWSER2) or state(response.text, REMEMBER_BROWSER3): 136 | payload = { 137 | 'submit[Continue]': 'Continue', 138 | 'name_action_selected': 'dont_save' 139 | } 140 | payload.update(base_payload) 141 | response = browser.post(MOBILE_CHECKPOINT_URL, data=payload) 142 | logger.debug('Remember Browser -- Click Don\'t Save') 143 | elif state(response.text, LOCKED): 144 | raise ScrapingError("Account is locked.") 145 | elif state(response.text, PHONE_CONFIRMATION): 146 | response = browser.get("https://m.facebook.com/phoneacqwrite/?s=1&source=m_mobile_mirror_interstitial") 147 | logger.debug('Phone Number Confirmation -- Click Skip') 148 | else: 149 | print response.text 150 | import pdb; pdb.set_trace() 151 | 152 | logger.info("Facebook Authentication Complete") 153 | 154 | def get_auth_username(): 155 | """Get username of logged in user.""" 156 | response = browser.get(DESKTOP_PROFILE_URL) 157 | doc = lxml.html.fromstring(response.content) 158 | username = doc.cssselect('noscript meta')[0].get('content').replace('0; URL=/', '').replace('?_fb_noscript=1', '') 159 | logger.debug('Retrieve username from profile') 160 | return username 161 | 162 | if not username: username = get_auth_username() 163 | 164 | if username == "profile.php": 165 | raise ScrapingError("You need to pass in the id as the username to the auth method because this user only has an id.") 166 | 167 | return username 168 | 169 | def logout(browser): 170 | browser.post('http://www.facebook.com/logout.php') 171 | -------------------------------------------------------------------------------- /socialscraper/twitter/scraper.py: -------------------------------------------------------------------------------- 1 | from ..base import BaseScraper, BaseUser, UsageError, FeedItem 2 | import requests, json, bs4 3 | 4 | from .models import TwitterUser, TwitterTweet 5 | 6 | # class TwitterUser(BaseUser): 7 | # """Container for the info associated w/ a Twitter user""" 8 | # def __init__(self, screen_name=None, id=None): 9 | # super(TwitterUser, self).__init__(id=id, username=screen_name) 10 | 11 | # @property 12 | # def screen_name(self): 13 | # return self.username 14 | 15 | # 
@screen_name.setter 16 | # def screen_name(self, value): 17 | # self.username = value 18 | 19 | class Tweet(FeedItem): 20 | """Container for a tweet on a timeline.""" 21 | def __init__(self, id, content_timestamp=None,content=None,item_type=None): 22 | FeedItem.__init__(self,id, timestamp=int(content_timestamp), content=content.decode('utf8','ignore').encode('utf-8','ignore'), type=item_type) 23 | 24 | class TwitterScraper(BaseScraper): 25 | def __init__(self,user_agents = None): 26 | """Initialize the twitter scraper.""" 27 | BaseScraper.__init__(self,user_agents) 28 | 29 | def get_feed_by_screen_name(self,screen_name): 30 | """Get a twitter user's feed given their screen name.""" 31 | user = TwitterUser(screen_name, self.id_from_screen_name(screen_name)) 32 | cursor = str(999999999999999999) 33 | tweets = [] 34 | 35 | while True: 36 | tweet_json = self._get_json("tweets",user.screen_name,cursor) 37 | 38 | html = tweet_json["items_html"] 39 | soup = bs4.BeautifulSoup(html) 40 | root_containers = soup.select(".ProfileTweet") 41 | 42 | # old style twitter profile 43 | if not root_containers: 44 | root_containers = soup.select(".js-stream-tweet") 45 | 46 | if not root_containers: 47 | break 48 | 49 | for container in root_containers: 50 | 51 | # ignore retweets 52 | if container.get('data-retweet-id'): continue 53 | 54 | tweet_id = container.get('data-tweet-id') 55 | text_container = container.select('.js-tweet-text') 56 | timestamp_container = container.select('.js-short-timestamp') 57 | 58 | if not tweet_id and not text_container and not timestamp_container: 59 | continue 60 | 61 | text_container = text_container[0] 62 | timestamp_container = timestamp_container[0] 63 | 64 | cur_tweet = Tweet(id=tweet_id, 65 | content_timestamp=timestamp_container["data-time"], 66 | content=text_container.text.encode('utf-8','ignore')) 67 | 68 | yield cur_tweet 69 | 70 | # # text_containers = soup.findAll("p","js-tweet-text") 71 | # # timestamp_containers = soup.findAll("span","_timestamp") 72 | # for container in zip(timestamp_containers,text_containers): 73 | # cur_tweet = Tweet(id=hash(container[1].text.encode('utf-8','ignore')), 74 | # content_timestamp=container[0]["data-time"], 75 | # content=container[1].text.encode('utf-8','ignore')) 76 | # # tweets.append(cur_tweet) 77 | # yield cur_tweet 78 | 79 | 80 | if not tweet_json["has_more_items"]: 81 | break 82 | 83 | cursor = tweet_json["max_id"] 84 | #return tweets 85 | 86 | def get_feed_by_id(self,id): 87 | """Get a user's twitter feed given their user ID.""" 88 | return self.get_feed_by_screen_name(self.screen_name_from_id(int(id))) 89 | 90 | def get_followers(self,id_or_username,max=-1): 91 | """Get a twitter user's feed given their numeric ID or 92 | username. Type checking is used to determine the category of 93 | the input argument - an ``int`` is interpreted as a numeric ID, 94 | while a ``string`` is interpreted as a username. 95 | If max is a positive number, get_followers will only retrieve 96 | up to that number of followers (note that the actual number 97 | returned may be slightly larger due to the parsing mechanics.) 98 | """ 99 | 100 | # Determine whether the input argument is a numeric ID or a username 101 | user = TwitterUser() 102 | 103 | if (type(id_or_username) == int) or (type(id_or_username) == float): 104 | # ... 
if it's an ID, get the corresponding username 105 | user.id = id_or_username 106 | user.screen_name = self.screen_name_from_id(user.id) 107 | else: 108 | user.screen_name = id_or_username 109 | user.id = self.id_from_screen_name(user.screen_name) 110 | 111 | cursor = None 112 | 113 | while True: 114 | follower_json = self._get_json("followers",user.screen_name,cursor) 115 | 116 | # parse follower json 117 | html = follower_json["items_html"] 118 | soup = bs4.BeautifulSoup(html) 119 | user_containers = soup.findAll("div",["js-actionable-user", 120 | "js-profile-popup-actionable", 121 | "account"]) 122 | for container in user_containers: 123 | cur_user = TwitterUser(container['data-screen-name'] 124 | .encode('utf-8','ignore'), 125 | int(container['data-user-id'] 126 | .encode('utf-8','ignore'))) 127 | yield cur_user 128 | 129 | if not follower_json["has_more_items"]: 130 | break 131 | 132 | cursor = follower_json["cursor"] 133 | 134 | def screen_name_from_id(self,user_id): 135 | """Get a user's screen name from their ID.""" 136 | url = "https://twitter.com/account/redirect_by_id/%i" % user_id 137 | resp = requests.request("GET", url, allow_redirects=False) 138 | screen_name = resp.headers['location'].split('/')[-1] 139 | 140 | return screen_name 141 | 142 | 143 | def id_from_screen_name(self,screen_name): 144 | """Get a user's ID from their screen name.""" 145 | # @TODO: need to investigate if this is scalable 146 | url = "http://mytwitterid.com/api.php?screen_name=%s" % screen_name 147 | resp = requests.get(url) 148 | print resp.text 149 | try: 150 | return json.loads(resp.text)[0]["user"]["id"] 151 | except KeyError: 152 | return hash(screen_name) 153 | 154 | def _get_json(self, type_, screen_name, cursor): 155 | """Internal method to get the JSON response for a particular 156 | twitter request (eg. followers or tweets.) 157 | """ 158 | if type_ == "followers": 159 | base_url = "https://twitter.com/%s/followers/users?" % screen_name 160 | elif type_ == "tweets": 161 | base_url = "https://twitter.com/i/profiles/show/%s/timeline?" % screen_name 162 | else: 163 | raise UsageError() 164 | 165 | if cursor and type_ == "followers": 166 | base_url += "&cursor=" + str(cursor) 167 | elif cursor and type_ == "tweets": 168 | base_url += "&max_id=" + str(cursor) 169 | 170 | # print base_url 171 | resp = self._browser.open(base_url) 172 | if "redirect_after_login" in resp.geturl(): 173 | # login first 174 | self.login() 175 | resp = self._browser.submit() 176 | 177 | return json.loads(resp.read()) 178 | 179 | def login(self): 180 | user_acct = self.pick_random_user() 181 | self._browser.select_form(nr=1) 182 | self._browser.form["session[username_or_email]"] = user_acct.username 183 | self._browser.form["session[password]"] = user_acct.password 184 | -------------------------------------------------------------------------------- /socialscraper/facebook/scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging, requests, pickle, os 4 | from requests.adapters import HTTPAdapter 5 | from facebook import GraphAPI, GraphAPIError 6 | from ..base import BaseScraper, ScrapingError 7 | 8 | from . import auth 9 | from . import public 10 | from . import nograph 11 | from . import graphapi 12 | from . 
import graphsearch 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | FACEBOOK_MOBILE_URL = 'https://m.facebook.com' 17 | FACEBOOK_USER_TOKEN = os.getenv('FACEBOOK_USER_TOKEN') 18 | 19 | class FacebookSession(requests.sessions.Session): 20 | 21 | def get(self, url, **kwargs): 22 | response = super(FacebookSession, self).get(url, **kwargs) 23 | 24 | if not auth.state(response.text, auth.LOCKED) and not auth.state(response.text, auth.SECURITY_CHECK): 25 | return response 26 | else: 27 | raise ScrapingError("Account locked. Stop scraping!") 28 | 29 | class FacebookScraper(BaseScraper): 30 | 31 | def __init__(self,user_agents=None, pickled_session=None, pickled_api=None, scraper_type="graphapi"): 32 | """Initialize the Facebook scraper.""" 33 | 34 | self.scraper_type = scraper_type 35 | 36 | self.cur_user = None 37 | 38 | if pickled_session: self.browser = pickle.loads(pickled_session) 39 | else: self.browser = FacebookSession() 40 | 41 | if pickled_api: self.api = pickle.loads(pickled_api) 42 | else: self.api = None 43 | 44 | # TODO: write method to just pickle the whole FacebookScraper instead 45 | 46 | BaseScraper.__init__(self, user_agents) 47 | self.browser.headers = { 'User-Agent': self.cur_user_agent } 48 | self.browser.mount(FACEBOOK_MOBILE_URL, HTTPAdapter(pool_connections=500, pool_maxsize=500, max_retries=3, pool_block=True)) 49 | 50 | def init_api(self, pickled_api=None): 51 | 52 | if pickled_api: self.api = pickle.loads(pickled_api) 53 | else: self.api = GraphAPI(access_token=FACEBOOK_USER_TOKEN) 54 | 55 | try: 56 | #self.api.get_object('me') 57 | pass 58 | except (GraphAPIError, AttributeError): 59 | raise ScrapingError("Need a valid FACEBOOK_USER_TOKEN or initializing the api.") 60 | self.api = None 61 | 62 | return bool(self.api) 63 | 64 | def login(self): 65 | """Logs user into facebook.""" 66 | self.cur_user = self.pick_random_user() 67 | self.cur_user.username = auth.login(self.browser, self.cur_user.email, self.cur_user.password, username=self.cur_user.username) 68 | self.cur_user.id = self.get_graph_id(self.cur_user.username) 69 | 70 | def logout(self): 71 | auth.logout(self.browser) 72 | self.cur_user = None 73 | 74 | def login_required(func): 75 | def _login_required(*args): 76 | if args[0].cur_user == None: 77 | raise ScrapingError("Cannot use method %s without logging in." % func) 78 | ret = func(*args) 79 | return ret 80 | return _login_required 81 | 82 | def api_required(func): 83 | def _api_requred(*args): 84 | if args[0].api == None: 85 | raise ScrapingError("Cannot use method %s without a valid FACEBOOK_USER_TOKEN or initializing the api." 
% func) 86 | ret = func(*args) 87 | return ret 88 | return _api_requred 89 | 90 | def get_graph_id(self, graph_name): 91 | return public.get_id(graph_name) 92 | 93 | def get_graph_name(self, graph_id): 94 | return public.get_name(graph_id) 95 | 96 | def get_graph_attribute(self, graph_id, attribute): 97 | return public.get_attribute(graph_id,attribute) 98 | 99 | # wrapper methods 100 | 101 | def get_about(self, graph_name, graph_id=None): 102 | if self.scraper_type == "graphapi": return self.get_about_api(graph_name) 103 | elif self.scraper_type == "nograph": return self.get_about_nograph(graph_name, graph_id) 104 | elif self.scraper_type == "graphsearch": raise NotImplementedError("get_about with graphsearch") 105 | 106 | def get_feed(self, graph_name, graph_id=None): 107 | if self.scraper_type == "api": return self.get_feed_api(graph_name) 108 | elif self.scraper_type == "nograph": return self.get_feed_nograph(graph_name, graph_id) 109 | elif self.scraper_type == "graphsearch": raise NotImplementedError("get_feed with graphsearch") 110 | 111 | def get_feed2(self, graph_name, graph_id=None): 112 | if self.scraper_type == "api": raise NotImplementedError("") 113 | elif self.scraper_type == "nograph": return self.get_feed_nograph2(graph_name, graph_id) 114 | elif self.scraper_type == "graphsearch": raise NotImplementedError("") 115 | 116 | def get_likes(self, graph_name, graph_id=None): 117 | if self.scraper_type == "api": return self.get_likes_api(graph_name) 118 | elif self.scraper_type == "nograph": return self.get_likes_nograph(graph_name) 119 | elif self.scraper_type == "graphsearch": return self.graph_search(graph_name, "pages-liked") 120 | elif self.scraper_type == "public": return public.get_pages_liked(graph_name) 121 | 122 | def get_fans(self, graph_name, graph_id=None): 123 | if self.scraper_type == "api": raise NotImplementedError("get_fans with graphapi") 124 | elif self.scraper_type == "nograph": raise NotImplementedError("get_fans with nograph") 125 | elif self.scraper_type == "graphsearch": return self.graph_search(graph_name, "likers") 126 | 127 | def get_friends(self, graph_name, graph_id=None): 128 | if self.scraper_type == "api": raise Exception("can't do this using graph api any more") 129 | elif self.scraper_type == "nograph": return self.get_friends_nograph(graph_name) 130 | elif self.scraper_type == "graphsearch": return self.graph_search(graph_name, "friends") 131 | 132 | # graphapi 133 | 134 | @api_required 135 | def get_username_api(self, graph_name): 136 | return graphapi.get_username(self.api, graph_name) 137 | 138 | @api_required 139 | def get_feed_api(self, graph_name): 140 | return graphapi.get_feed(self.api, graph_name) 141 | 142 | @api_required 143 | def get_about_api(self, graph_name): 144 | return graphapi.get_about(self.api, graph_name) 145 | 146 | @api_required 147 | def get_likes_api(self, graph_name): 148 | return graphapi.get_likes(self.api, graph_name) 149 | 150 | # nograph 151 | 152 | @login_required 153 | def get_feed_nograph(self, graph_name, graph_id=None): 154 | return nograph.get_feed(self.browser, self.cur_user, graph_name, graph_id, self.api) 155 | 156 | @login_required 157 | def get_feed_nograph2(self, graph_name, graph_id=None): 158 | return nograph.get_feed2(self.browser, self.cur_user, graph_name, graph_id, self.api) 159 | 160 | @login_required 161 | def get_about_nograph(self, graph_name, graph_id=None): 162 | return nograph.get_about(self.browser, self.cur_user, graph_name, graph_id, self.api) 163 | 164 | @login_required 165 | def 
get_likes_nograph(self, graph_name, graph_id=None): 166 | return nograph.get_likes(self.browser, self.cur_user, graph_name, graph_id, self.api) 167 | 168 | @login_required 169 | def get_friends_nograph(self, graph_name, graph_id=None): 170 | return nograph.get_friends(self.browser, self.cur_user, graph_name, graph_id, self.api) 171 | 172 | @login_required 173 | def get_members_nograph(self, graph_name, graph_id=None): 174 | return nograph.get_members(self.browser, self.cur_user, graph_name, graph_id, self.api) 175 | 176 | # graphsearch 177 | 178 | @login_required 179 | def graph_search(self, graph_name, method_name, graph_id=None): 180 | for result in graphsearch.search(self.browser, self.cur_user, graph_name, method_name, graph_id, api=self.api): yield result 181 | 182 | # for result in self.graph_search(page_name,"likers"): yield result 183 | # for result in self.graph_search(user_name,"pages-liked"): yield result 184 | # for result in public.get_pages_liked_nograph(user_name): yield result 185 | -------------------------------------------------------------------------------- /socialscraper/facebook/graphsearch/__init__.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | import logging, lxml.html, json, urllib, re 3 | from ...base import ScrapingError 4 | from ..models import FacebookUser, FacebookPage 5 | 6 | from ..import graphapi, public 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | SEARCH_URL = 'https://www.facebook.com/search' 11 | AJAX_URL = 'https://www.facebook.com/ajax/pagelet/generic.php/BrowseScrollingSetPagelet' 12 | 13 | def search(browser, current_user, graph_name, method_name, graph_id = None, api = None): 14 | """ 15 | Facebook Graph Search Generator 16 | 17 | General Usage: 18 | 19 | for result in search(browser, current_user, graph_name, method_name): 20 | print result 21 | 22 | browser: authenticated requests session (see auth.py) 23 | current_user: authenticated user 24 | graph_name: name of Facebook graph object such as a user name or page name 25 | method_name: name of internal Facebook graph search methods; 26 | list: 'pages-liked', 'likers', 'users-named' 27 | 28 | Example: 29 | 30 | for result in search(browser, current_user, "al.johri", "pages-liked"): 31 | print result 32 | 33 | for result in search(browser, current_user, "mightynest", "likers"): 34 | print result 35 | 36 | """ 37 | 38 | def _find_script_tag(raw_html, phrase): 39 | doc = lxml.html.fromstring(raw_html) 40 | script_tag = filter(lambda x: x.text_content().find(phrase) != -1, doc.cssselect('script')) 41 | if not script_tag: return None 42 | return json.loads(script_tag[0].text_content()[24:-1]) 43 | 44 | def _parse_ajax_data(raw_json): 45 | require = raw_json['jsmods']['require'] 46 | tester = lambda x: x[0] == "BrowseScrollingPager" and x[1] == "init" 47 | data_parameter = map(lambda x: x[3][1], filter(tester, require))[0] 48 | return data_parameter 49 | 50 | def _parse_cursor_data(raw_json): 51 | if raw_json.get('error'): raise ScrapingError(raw_json.get('errorDescription')) 52 | require = raw_json['jsmods']['require'] 53 | tester = lambda x: x[0] == "BrowseScrollingPager" and x[1] == "pageletComplete" 54 | cursor_parameter = map(lambda x: x[3][0], filter(tester, require))[0] 55 | return cursor_parameter 56 | 57 | def _parse_result(raw_html): 58 | doc = lxml.html.fromstring(raw_html) 59 | # import pdb; pdb.set_trace() 60 | # items = map(lambda x: (x.get('href'), x.text_content()), doc.cssselect('div[data-bt*=title]')) 61 | # 
items = doc.cssselect('div[data-bt*=title] > a') 62 | # for item in items: 63 | # url = item.get('href') 64 | # number_of_items = item.getparent().getparent().cssselect('div[data-bt*=snippets] > div>div ')[0].text_content() 65 | # print url, number_of_items 66 | 67 | # methods to get id 68 | # x.getparent().getparent().cssselect('.FriendRequestOutgoing')[0].get('data-profileid')) 69 | # 70 | el_id = lambda x: json.loads(x.getparent().getparent().getparent().getparent().getparent().get('data-bt'))['id'] 71 | return map(lambda x: (x.get('href'), x.text_content(), el_id(x)), doc.cssselect('div[data-bt*=title] > a')) 72 | 73 | def _get_payload(ajax_data, uid): 74 | return { 75 | 'data': json.dumps(ajax_data), 76 | '__user': uid, 77 | '__a': 1, 78 | '__req': 'a', 79 | '__dyn': '7n8apij35CCzpQ9UmWOGUGy1m9ACwKyaF3pqzAQ', 80 | '__rev': 1106672 81 | } 82 | 83 | def _result_to_model(result, method_name): 84 | url = result[0] 85 | name = result[1] 86 | uid = result[2] 87 | # num_members = result[2] 88 | 89 | # print(url, name, num_members) 90 | 91 | # import pdb; pdb.set_trace() 92 | 93 | username = public.parse_url(url) 94 | 95 | # if api: 96 | # uid, category = graphapi.get_attributes(api, username, ["id", "category"]) 97 | # else: 98 | # uid, category = public.get_attributes(username, ["id", "category"]) 99 | 100 | 101 | if uid == None: 102 | print "Couldn't find UID of %s" % username 103 | # raise ValueError("Couldn't find uid of %s" % username) 104 | 105 | uid = int(uid) if uid else None 106 | 107 | if method_name == "pages-liked": 108 | return FacebookPage(page_id=uid, username=username, url=url, name=name, type=category) 109 | elif method_name == "likers" or method_name == "friends": 110 | return FacebookUser(uid=uid, username=username, url=url, name=name) 111 | elif method_name == "groups": 112 | return (uid, url, name, category) 113 | else: 114 | raise ScrapingError("Wut kinda model is %. 
Check out da _result_to_model method" % method_name) 115 | 116 | # https://www.facebook.com/search/str/ruchi/users-named 117 | # https://www.facebook.com/search/str/ruchi/users-named/me/friends/intersect?ref=filter 118 | # https://www.facebook.com/search/str/ruchi/users-named/228401243342/students/intersect?ref=filter 119 | # https://www.facebook.com/search/str/ruchi/users-named/males/intersect?ref=filter 120 | # https://www.facebook.com/search/str/ruchi/users-named/females/intersect?ref=filter 121 | # https://www.facebook.com/search/str/ruchi/users-named/108641632493225/residents/present/intersect?ref=filter 122 | # https://www.facebook.com/search/str/ruchi/users-named/108659242498155/residents/present/intersect?ref=filter 123 | # https://www.facebook.com/search/str/ruchi/users-named/106517799384578/residents/present/intersect?ref=filter 124 | # https://www.facebook.com/search/str/ruchi/users-named/108007405887967/visitors/intersect 125 | def _graph_request(graph_id, method_name, post_data = None): 126 | if not post_data: 127 | response = browser.get(SEARCH_URL + "/%s/%s" % (graph_id, method_name)) 128 | cursor_tag = _find_script_tag(response.text, "cursor") 129 | ajax_tag = _find_script_tag(response.text, "encoded_query") 130 | cursor_data = _parse_cursor_data(cursor_tag) if cursor_tag else None 131 | ajax_data = _parse_ajax_data(ajax_tag) if ajax_tag else None 132 | post_data = dict(cursor_data.items() + ajax_data.items()) if ajax_data and cursor_data else None 133 | 134 | current_results = [] 135 | 136 | # Extract current_results from first page 137 | for element in lxml.html.fromstring(response.text).cssselect(".hidden_elem"): 138 | comment = element.xpath("comment()") 139 | if not comment: continue 140 | element_from_comment = lxml.html.tostring(comment[0])[5:-4] 141 | doc = lxml.html.fromstring(element_from_comment) 142 | # import pdb; pdb.set_trace() 143 | # potentially num_members x.getparent().getparent().cssselect('div[class="_52eh"]')[0].text_content() 144 | # potentially data profile id 145 | el_id = lambda x: json.loads(x.getparent().getparent().getparent().getparent().getparent().get('data-bt'))['id'] 146 | current_results += map(lambda x: (x.get('href'), x.text_content(), el_id(x)), doc.cssselect('div[data-bt*=title] > a')) 147 | else: 148 | payload = _get_payload(post_data, current_user.id) 149 | response = browser.get(AJAX_URL + "?%s" % urllib.urlencode(payload)) 150 | raw_json = json.loads(response.content[9:]) 151 | raw_html = raw_json['payload'] 152 | 153 | post_data = _parse_cursor_data(raw_json) 154 | current_results = _parse_result(raw_html) 155 | return post_data, current_results 156 | 157 | # Main Facebook Graph Search 158 | 159 | if not graph_id: graph_id = public.get_id(graph_name) 160 | post_data, current_results = _graph_request(graph_id, method_name) 161 | # import pdb; pdb.set_trace() 162 | for result in current_results: 163 | try: 164 | yield _result_to_model(result, method_name) 165 | except ValueError: 166 | continue 167 | 168 | while post_data: 169 | current_post_data, current_results = _graph_request(graph_id, method_name, post_data) 170 | if current_post_data == None or current_results == None: break 171 | # print current_results 172 | for result in current_results: 173 | try: 174 | yield _result_to_model(result, method_name) 175 | except ValueError: 176 | continue 177 | post_data.update(current_post_data) -------------------------------------------------------------------------------- /socialscraper/facebook/nograph/about.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging, lxml.html 4 | from dateutil import parser 5 | 6 | from .. import public 7 | from ..models import FacebookUser 8 | from ...base import ScrapingError 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | ABOUT_URL = "https://www.facebook.com/%s/info" 13 | 14 | def get_about(browser, current_user, graph_name, graph_id=None, api=None): 15 | 16 | # shit gets weird when graph_name == current_user.username 17 | if current_user.username == graph_name: 18 | raise ScrapingError("don't scrape yourself plz") 19 | 20 | ret = { 21 | "family": {}, 22 | "experiences": {}, 23 | "relationships": {}, 24 | "places": {}, 25 | "contact": {}, 26 | "basic": {}, 27 | "about": [], 28 | "quotes": [], 29 | "event": {} 30 | } 31 | 32 | def get_text(element_list): 33 | if element_list: return element_list[0].text_content() 34 | 35 | def get_previous(element_list): 36 | if element_list: return [element_list[0].getprevious()] 37 | 38 | def get_rows(data): 39 | for table in data.cssselect('tbody'): 40 | for row in table.cssselect('tr'): 41 | yield row 42 | 43 | def get_cells(data): 44 | for table in data.cssselect('tbody'): 45 | for row in table.cssselect('tr'): 46 | for cell in row.cssselect('td'): 47 | yield cell 48 | 49 | def parse_experience(cell): 50 | for experience in cell.cssselect(".experienceContent"): 51 | experience_title = get_text(experience.cssselect(".experienceTitle")) 52 | # experience_title_url = experience.cssselect(".experienceTitle")[0] 53 | experience_body = get_text(experience.cssselect(".experienceBody")) 54 | yield experience_title, experience_body 55 | 56 | def parse_generic_cell(cell): 57 | previous_cell = get_previous(cell.cssselect('.aboutSubtitle')) 58 | if previous_cell and previous_cell[0].cssselect('a'): 59 | name_url = previous_cell[0].cssselect('a')[0].get('href') 60 | else: 61 | name_url = None 62 | 63 | name = get_text(previous_cell) 64 | content = get_text(cell.cssselect('.aboutSubtitle')) 65 | print name_url 66 | return name, content 67 | # return (name, name_url), content 68 | 69 | def parse_generic_row(row): 70 | name = get_text(row.cssselect('th')) 71 | content = get_text(row.cssselect('td')) 72 | return name, content 73 | 74 | response = browser.get(ABOUT_URL % graph_name) 75 | for element in lxml.html.fromstring(response.text).cssselect(".hidden_elem"): 76 | comment = element.xpath("comment()") 77 | if not comment: continue 78 | 79 | element_from_comment = lxml.html.tostring(comment[0])[5:-4] 80 | doc = lxml.html.fromstring(element_from_comment) 81 | fbTimelineSection = doc.cssselect('.fbTimelineSection.fbTimelineCompactSection') 82 | fbTimelineFamilyGrid = doc.cssselect('.fbTimelineFamilyGrid') 83 | fbTimelineAboutMeHeader = doc.cssselect('.fbTimelineAboutMeHeader') 84 | 85 | # this is for scraping a Page 86 | if fbTimelineAboutMeHeader: 87 | title = get_text(fbTimelineAboutMeHeader[0].cssselect('.uiHeaderTitle')) 88 | # print title 89 | 90 | if "About" in title: 91 | pass 92 | # print doc.text_content() 93 | elif "Basic Info" in title: 94 | pass 95 | # print doc.text_content() 96 | 97 | if fbTimelineFamilyGrid: 98 | familyList = fbTimelineFamilyGrid[0].cssselect('.familyList')[0] 99 | for member in familyList: 100 | name, status = parse_generic_cell(member) 101 | ret['family'][status] = name 102 | 103 | # FacebookFamily(profile_id=, relationship=status, uid=, name=name) 104 | 105 | if fbTimelineSection: 106 | title = 
get_text(fbTimelineSection[0].cssselect('.uiHeaderTitle')) 107 | data = fbTimelineSection[0].cssselect('.profileInfoTable') if fbTimelineSection else None 108 | 109 | if not title or not data: continue 110 | 111 | # experiences_keys = ['College', 'Employers', 'High School'] 112 | if "Work and Education" in title: 113 | for row in get_rows(data[0]): 114 | if not row.cssselect('th'): continue 115 | header = row.cssselect('th')[0].text_content() 116 | ret['experiences'][header] = {} 117 | for cell in row.cssselect('td'): 118 | for experienceTitle, experienceBody in parse_experience(cell): 119 | ret['experiences'][header][experienceTitle] = experienceBody 120 | 121 | # relationships_keys = ['In a relationship'] 122 | elif "Relationship" in title: 123 | for cell in get_cells(data[0]): 124 | name, status = parse_generic_cell(cell) 125 | ret['relationships'][status] = name 126 | 127 | # places_keys = ['Current City', 'Hometown'] 128 | elif "Places Lived" in title: 129 | for cell in get_cells(data[0]): 130 | name, status = parse_generic_cell(cell) 131 | ret['places'][status] = name 132 | 133 | # contact_keys = ['Address', 'Email', 'Mobile Phones'] 134 | elif "Contact Information" in title: 135 | for row in get_rows(data[0]): 136 | name, status = parse_generic_row(row) 137 | ret['contact'][name] = status 138 | 139 | # basic_keys = ['Birthday', 'Gender', 'Interested In', 'Languages', 'Political Views', 'Relationship Status'] 140 | elif "Basic Information" in title: 141 | for row in get_rows(data[0]): 142 | name, status = parse_generic_row(row) 143 | ret['basic'][name] = status 144 | 145 | # about_keys = None 146 | elif "About" in title: 147 | data = fbTimelineSection[0].getchildren()[1] 148 | for quote in data.cssselect('.profileText'): 149 | ret['about'].append(quote.text_content()) 150 | 151 | # quotes_keys = None 152 | elif "Favorite Quotations" in title: 153 | data = fbTimelineSection[0].getchildren()[1] 154 | for quote in data.cssselect('.profileText'): 155 | ret['quotes'].append(quote.text_content()) 156 | 157 | # family_keys = ['Brother'] 158 | elif "Family" in title: # empty 159 | pass # this will be empty Family information above in 'fbTimelineFamilyGrid' 160 | 161 | # events_keys = None 162 | elif "Life Events" in title: 163 | # TODO: parse life events 164 | data = fbTimelineSection[0].getchildren()[1].text_content() 165 | pass 166 | elif "Pages" in title: 167 | pass 168 | # TODO: parse pages admined/owned by user 169 | elif 'Favorites' in title: 170 | pass 171 | else: 172 | raise ScrapingError("Unrecognized fbTimelineSection %s" % title) 173 | 174 | if not graph_id: graph_id = public.get_id(graph_name) 175 | 176 | birthday = ret['basic'].get('Birthday', None) 177 | birthday = parser.parse(birthday) if birthday else None 178 | sex = ret['basic'].get('Gender', None) 179 | email = ret['contact'].get('Email', None) 180 | college = ret['experiences'].get('College', None) 181 | employer = ret['experiences'].get('Employers', None) 182 | highschool = ret['experiences'].get('High School', None) 183 | currentcity = ret['places'].get('Current City', None) 184 | hometown = ret['places'].get('Hometown', None) 185 | 186 | sex = sex.lower() if sex else None 187 | 188 | email = email if email and not "Ask for" in email else None 189 | college = unicode(college) if college is not None else None 190 | employer = unicode(employer) if employer is not None else None 191 | highschool = unicode(highschool) if highschool is not None else None 192 | currentcity = unicode(currentcity) if currentcity is not None 
else None 193 | hometown = unicode(hometown) if hometown is not None else None 194 | 195 | user = FacebookUser( 196 | uid=graph_id, 197 | username=graph_name, 198 | email=email, 199 | birthday=birthday, 200 | sex=sex, 201 | college=college, 202 | employer=employer, 203 | highschool=highschool, 204 | currentcity=currentcity, 205 | hometown=hometown 206 | ) 207 | 208 | print ret 209 | 210 | return user -------------------------------------------------------------------------------- /socialscraper/base.py: -------------------------------------------------------------------------------- 1 | from mechanize import Browser 2 | import random 3 | 4 | class ScrapeAccount(object): 5 | 6 | __attrs__ = ['password', 'email', 'username', 'id'] 7 | 8 | def __init__(self, password, id=None, email=None, username=None): 9 | # if not email and not username: raise UsageError("Username or Email not specified.") 10 | self.id = id 11 | self.email = email 12 | self.username = username 13 | self.password = password 14 | 15 | def __str__(self): 16 | return "ScrapeAccount %s, %d, %s, %s" % (self.email, self.id, self.username, "".join(map(lambda x: '*', self.password))) 17 | 18 | def __repr__(self): 19 | return "%s(email=%s, id=%d, username=%s, password=%s)" % (self.__class__.__name__, 20 | self.email, 21 | self.id, 22 | self.username, 23 | "".join(map(lambda x: '*', self.password))) 24 | 25 | def __getstate__(self): 26 | return dict((attr, getattr(self, attr, None)) for attr in self.__attrs__) 27 | 28 | def __setstate__(self, state): 29 | for attr, value in state.items(): 30 | setattr(self, attr, value) 31 | 32 | class BaseScraper(object): 33 | """The base class for all social media scrapers in the package. 34 | 35 | Handles browser emulation (using mechanize) and user agent selection 36 | for the browser. 37 | """ 38 | 39 | __attrs__ = ['browser', 'user_agents', 'cur_user_agent', 'users', 'cur_user', 'scraper_type', 'api'] 40 | 41 | default_user_agents = set([ 42 | # 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36' 43 | # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36' 44 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2' 45 | # 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' 46 | # 'Konqueror/3.0-rc4; (Konqueror/3.0-rc4; i686 Linux;;datecode)' 47 | # 'Opera/9.52 (X11; Linux i686; U; en)' 48 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36' 49 | ]) 50 | 51 | class _Browser(Browser): 52 | """Subclass of mechanize.Browser that allows the browser to 53 | smoothly handle XHTML. 54 | """ 55 | # disable the html check to allow for XHTML 56 | def viewing_html(self): 57 | import mechanize 58 | mechanize.Browser.viewing_html(self) 59 | return True 60 | 61 | def __init__(self,user_agents = None): 62 | """Optionally supply a list of user agents for the browser to 63 | select from. 64 | If no user agents are supplied, one is picked from a set of 65 | sensible defaults (see BaseScraper.default_user_agents). 
66 | """ 67 | self._browser = BaseScraper._Browser() 68 | self._browser.set_handle_robots(False) 69 | if user_agents: 70 | self.user_agents = set(user_agents) 71 | else: 72 | self.user_agents = BaseScraper.default_user_agents 73 | self.set_random_user_agent() 74 | 75 | self.users = [] 76 | return 77 | 78 | def set_user_agent(self,user_agent): 79 | """Set the browser's current user agent. 80 | If user_agent is not in the set of user agents maintained by this 81 | BaseScraper instance, it is added to the set. 82 | """ 83 | if user_agent not in self.user_agents: 84 | self.user_agents.add(user_agent) 85 | self._browser.addheaders = [('User-Agent',user_agent)] 86 | self.cur_user_agent = user_agent 87 | return 88 | 89 | def set_random_user_agent(self): 90 | """Pick a random user agent from the set of possible agents.""" 91 | self.set_user_agent(random.choice(list(self.user_agents))) 92 | 93 | def add_user(self, password, id=None, email=None, username=None): 94 | """Set the account information to use when a login is required.""" 95 | self.users.append(ScrapeAccount(id=int(id) if id else None, email=email, username=username, password=password)) 96 | return 97 | 98 | def pick_random_user(self): 99 | if len(self.users) == 0: 100 | raise UsageError 101 | self.cur_user = random.choice(self.users) 102 | return self.cur_user 103 | 104 | def __getstate__(self): 105 | return dict((attr, getattr(self, attr, None)) for attr in self.__attrs__) 106 | 107 | def __setstate__(self, state): 108 | for attr, value in state.items(): 109 | setattr(self, attr, value) 110 | 111 | class BaseUser(object): 112 | 113 | __attrs__ = ['id', 'username', 'email'] 114 | 115 | def __init__(self, id=None, username=None, email=None): 116 | self.id = id 117 | self.username = username 118 | self.email = email 119 | 120 | def __str__(self): 121 | return "%s (%i)" % (self.username, self.id) 122 | 123 | def __repr__(self): 124 | return "%s(id=%i, username=%s, email=%s)" % (self.__class__.__name__, self.id, self.username, self.email) 125 | 126 | def __getstate__(self): 127 | return dict((attr, getattr(self, attr, None)) for attr in self.__attrs__) 128 | 129 | def __setstate__(self, state): 130 | for attr, value in state.items(): 131 | setattr(self, attr, value) 132 | 133 | 134 | class FeedItem(object): 135 | def __init__(self, id, content=None, timestamp=None, type=None): 136 | self.id = int(id) 137 | self.content = content 138 | self.type = type 139 | self.timestamp = timestamp 140 | 141 | def __str__(self): 142 | return "FeedItem<%s>(%i): %s" % (self.type, self.id, self.content) 143 | 144 | def __repr__(self): 145 | return "%s(id=%i, content=%s, timestamp=%s, type=%s)" % (self.__class__.__name__, self.id, self.content, self.timestamp, self.type) 146 | 147 | 148 | class UsageError(Exception): 149 | 150 | errno = -98 151 | 152 | def __init__(self,message=None): 153 | super(UsageError, self).__init__(message) 154 | self.message = message 155 | self.strerror = message 156 | def __repr__(self): 157 | return str(type(self)) + ((": %s" % self.message) if self.message else "") 158 | 159 | class ScrapingError(Exception): 160 | 161 | errno = -99 162 | 163 | def __init__(self,message=None): 164 | super(ScrapingError, self).__init__(message) 165 | self.message = message 166 | self.strerror = message 167 | 168 | class BaseModel(object): 169 | """ 170 | Usage: 171 | 172 | Base(uid=10, username="test") 173 | """ 174 | 175 | @classmethod 176 | def get_columns(cls): 177 | columns = map(lambda x: getattr(cls, x), cls.__attrs__) 178 | return columns 179 
| 180 | def __init__(self,**kwargs): 181 | for column in self.get_columns(): 182 | setattr(self,column.name,kwargs.get(column.name,None)) 183 | 184 | # the weird if statements are to prevent 185 | # putting quotes around BigInteger/Integer or None 186 | # might want to create a datetime out of column.type == "Date" ? 187 | def __repr__(self): 188 | attributes = u"" 189 | for column in self.get_columns(): 190 | value = getattr(self,column.name) 191 | if value is None: 192 | attributes += u"%s=None, " % (column.name) 193 | elif column.type == "BigInteger" or column.type == "Integer": 194 | attributes += u"%s=%i, " % (column.name, value) 195 | else: 196 | attributes += u"%s=\"%s\", " % (column.name, value) 197 | 198 | attributes = attributes[:-2] # remove comma and space at the end 199 | ret = u"%s(%s)" % (self.__class__.__name__, attributes) 200 | return ret.encode('utf-8', 'ignore') 201 | 202 | class Column(BaseModel): 203 | """ 204 | Usage: 205 | 206 | Column('uid', 'BigInteger', primary_key=True, foreign_key=True, foreign_key_reference="user.uid") 207 | 208 | If no type is specified, it is assumed to be "String". 209 | """ 210 | 211 | def __init__(self, name, column_type=None, **options): 212 | self.name = name 213 | self.type = column_type if column_type else "String" 214 | self.primary_key = options.get('primary_key', False) 215 | self.foreign_key = options.get('foreign_key', False) 216 | self.unique = options.get('unique', False) 217 | 218 | # if self.primary_key: self.unique = True 219 | 220 | if self.foreign_key: 221 | try: 222 | self.foreign_key_reference = options['foreign_key_reference'] 223 | except IndexError: 224 | raise ScrapingError("Foreign Key Reference must be defined if foreign_key=True") 225 | --------------------------------------------------------------------------------
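
A minimal end-to-end usage sketch, assuming the package above is installed and importable as `socialscraper`; the account credentials and the target username are placeholders, not values taken from the repository. It ties together `FacebookScraper` from socialscraper/facebook/scraper.py, `add_user` from socialscraper/base.py, the `auth.login` state machine from socialscraper/facebook/auth.py, and the `nograph` likes generator shown at the top of this listing:

```
from socialscraper.facebook import FacebookScraper

# "nograph" routes get_likes through the authenticated-session scraper
# in socialscraper/facebook/nograph/likes.py
scraper = FacebookScraper(scraper_type="nograph")

# Register a scrape account; login() picks one at random and authenticates
# the underlying requests session via auth.login.
scraper.add_user("example-password", email="someone@example.com")  # placeholder credentials
scraper.login()

# get_likes is a generator that yields FacebookPage models.
for page in scraper.get_likes("some.target.username"):  # placeholder target
    print page
```

One quirk worth noting in the wrapper methods of `FacebookScraper`: `get_about` dispatches on `scraper_type == "graphapi"`, while `get_feed`, `get_likes`, `get_fans` and `get_friends` test for `"api"`, so the constructor default of `"graphapi"` only reaches the Graph API code path for `get_about`; pass the string that matches the branch you actually want.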