├── .gitignore
├── AUTHORS.txt
├── LICENSE.txt
├── README.md
├── examples
│   ├── quickstart.py
│   └── without_date_range.py
├── requirements.txt
├── setup.py
└── twitterwebsearch
    ├── __init__.py
    ├── io.py
    ├── parser.py
    └── searcher.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore Eclipse project files
/.project
/.pydevproject

# Ignore compiled Python files
*.pyc

# Ignore example output files
/examples/example.ljson

--------------------------------------------------------------------------------
/AUTHORS.txt:
--------------------------------------------------------------------------------
Raynor Vliegendhart
Egbert Bouman

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2015 R. Vliegendhart

Licensed under the MIT license:

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# twitterwebsearch
This Python package automates the process of finding tweets older than a few
weeks. Such tweets cannot be retrieved through the Twitter search API, but
they can still be found through the web interface, which this package scrapes.
The package targets Python 2.7.

## Installation:

    pip install https://github.com/ShinNoNoir/twitterwebsearch/archive/master.zip

## Small example:

    import json
    import twitterwebsearch

    QUERY = '@shinnonoir since:2010-01-20 until:2010-02-01'

    def main():
        tweets = twitterwebsearch.search(QUERY)
        tweets = list(tweets)  # convert generator into list
        print json.dumps(tweets, indent=2)

    if __name__ == '__main__':
        main()

--------------------------------------------------------------------------------
/examples/quickstart.py:
--------------------------------------------------------------------------------
'''
Quickstart example.
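
Searches a fixed date range, prints the tweets as JSON, and round-trips
them through save_tweets/load_tweets from the io module.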
'''

import json
import twitterwebsearch
import twitterwebsearch.io

QUERY = '@shinnonoir since:2010-01-20 until:2010-02-01'

def main():
    tweets = twitterwebsearch.search(QUERY)
    tweets = list(tweets)  # convert generator into list

    print json.dumps(tweets, indent=2)

    twitterwebsearch.io.save_tweets(tweets, 'example.ljson')
    loaded_tweets = twitterwebsearch.io.load_tweets('example.ljson')

    # Loading should reproduce exactly what was saved.
    assert list(loaded_tweets) == tweets

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/examples/without_date_range.py:
--------------------------------------------------------------------------------
'''
Example without date range.
'''

import time
import twitterwebsearch

QUERY = 'president'
CUTOFF = 500

def main():
    start_time = time.time()
    for i, tweet in enumerate(twitterwebsearch.search(QUERY)):
        n = i + 1
        # Trailing comma plus '\r' keeps rewriting a single status line.
        print '%s tweets crawled...\r' % n,

        if n >= CUTOFF:
            break

    end_time = time.time()

    print '\nTook %s seconds' % (end_time - start_time)

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
requests

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

# Compare version tuples; comparing the sys.version string lexicographically
# would misorder releases such as 2.10 vs 2.7.
if sys.version_info < (2, 7):
    print 'Python >= 2.7 required'
    sys.exit(1)

from setuptools import setup

long_description = '''
A simple Python package for using Twitter search functionality
that is only available through the Twitter web interface
(such as searching for tweets older than a few weeks).'''.strip()

setup(
    name='twitterwebsearch',
    version='0.2.5',
    author='Raynor Vliegendhart',
    author_email='ShinNoNoir@gmail.com',
    url='https://github.com/ShinNoNoir/twitterwebsearch',

    packages=['twitterwebsearch'],

    description="Package for Twitter's web search",
    long_description=long_description,
    platforms='Any',
    license='MIT (see: LICENSE.txt)',
    keywords='Twitter, search',

    # Strip trailing newlines and skip blank lines in requirements.txt.
    install_requires=[line.strip() for line in open('requirements.txt') if line.strip()],
)

--------------------------------------------------------------------------------
/twitterwebsearch/__init__.py:
--------------------------------------------------------------------------------
from twitterwebsearch.searcher import search

--------------------------------------------------------------------------------
/twitterwebsearch/io.py:
--------------------------------------------------------------------------------
"""
Module for saving and loading tweets.
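
Tweets are stored as line-delimited JSON (.ljson): save_tweets writes one
JSON object per line, and load_tweets lazily yields them back.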
3 | """ 4 | import json 5 | import twitterwebsearch.parser 6 | 7 | def read_search_results(path): 8 | with open(path, 'r') as fh: 9 | for tweet in twitterwebsearch.parser.parse_search_results(fh.read()): 10 | yield tweet 11 | 12 | def save_tweets(tweets, path): 13 | with open(path, 'w') as fh: 14 | for tweet in tweets: 15 | print >>fh, json.dumps(tweet) 16 | 17 | def load_tweets(path): 18 | with open(path, 'r') as fh: 19 | for line in fh: 20 | yield json.loads(line) 21 | 22 | -------------------------------------------------------------------------------- /twitterwebsearch/parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parser module for parsing Twitter search results obtained through the web 3 | interface. 4 | """ 5 | import bs4 6 | 7 | def has_class(class_name): 8 | return lambda class_: class_ and class_name in class_.split() 9 | 10 | only_tweet_tags = bs4.SoupStrainer('div', class_=has_class('tweet'), **{'data-tweet-id': True}) 11 | 12 | def parse_tweet_tag(tag, expand_emojis=True): 13 | tweet_id = tag['data-tweet-id'] 14 | permalink = tag['data-permalink-path'] 15 | screen_name = tag['data-screen-name'] 16 | name = tag['data-name'] 17 | user_id = tag['data-user-id'] 18 | 19 | content_div = tag.find('div', class_=has_class('content')) 20 | tweet_body_tag = content_div.find('p', class_=has_class('tweet-text')) 21 | 22 | if tweet_body_tag is None: 23 | # Might be a censored tweet, skip 24 | return 25 | 26 | if expand_emojis: 27 | for emoji_tag in tweet_body_tag.find_all(class_='Emoji'): 28 | emoji_tag.insert(0, emoji_tag['alt']) 29 | 30 | 31 | lang = tweet_body_tag['lang'] 32 | tweet_text = tweet_body_tag.text 33 | 34 | urls = [ 35 | a['data-expanded-url'] 36 | for a in tweet_body_tag.find_all('a', class_=has_class('twitter-timeline-link')) 37 | if 'data-expanded-url' in a.attrs 38 | ] 39 | 40 | mentions = [ 41 | a.text 42 | for a in tweet_body_tag.find_all('a', class_=has_class('twitter-atreply')) 43 | ] 44 | 45 | timestamp = int(content_div.find(**{'data-time-ms':True})['data-time-ms'])/1000. 46 | 47 | footer_div = content_div.find('div', class_=has_class('stream-item-footer')) 48 | def get_stats(stats_type): 49 | span = footer_div.find('span', class_=has_class("ProfileTweet-action--%s" % stats_type)) 50 | spanspan = span.find('span', class_=has_class("ProfileTweet-actionCount")) 51 | return int(spanspan['data-tweet-stat-count']) 52 | 53 | footer_div.find('span', class_="ProfileTweet-action--retweet") 54 | retweet_count = get_stats('retweet') 55 | favorite_count = get_stats('favorite') 56 | 57 | tweet = dict(tweet_id=tweet_id, 58 | permalink=permalink, 59 | screen_name=screen_name, 60 | name=name, 61 | user_id=user_id, 62 | lang=lang, 63 | tweet_text=tweet_text, 64 | urls=urls, 65 | mentions=mentions, 66 | retweet_count=retweet_count, 67 | favorite_count=favorite_count, 68 | timestamp=timestamp) 69 | 70 | return tweet 71 | 72 | def parse_search_results(html): 73 | soup_tweets = bs4.BeautifulSoup(html, 'html.parser', parse_only=only_tweet_tags, from_encoding='utf-8') 74 | for tag in soup_tweets: 75 | tweet = parse_tweet_tag(tag) 76 | if tweet is not None: 77 | yield tweet 78 | 79 | -------------------------------------------------------------------------------- /twitterwebsearch/searcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for using the web interface of Twitter's search. 
3 | """ 4 | import json 5 | import time 6 | import urllib 7 | import requests 8 | from twitterwebsearch.parser import parse_search_results 9 | 10 | 11 | TWITTER_PROFILE_URL = 'https://twitter.com/{term}' 12 | TWITTER_PROFILE_MORE_URL = 'https://twitter.com/i/profiles/show/{term}/timeline?include_available_features=1&include_entities=1&max_position={max_position}' 13 | TWITTER_SEARCH_URL = 'https://twitter.com/search?q={term}&src=typd&vertical=default&f=tweets' 14 | TWITTER_SEARCH_MORE_URL = 'https://twitter.com/i/search/timeline?q={term}&src=typd&vertical=default&f=tweets&include_available_features=1&include_entities=1&max_position={max_position}' 15 | 16 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 Edge/12.0' 17 | 18 | DEFAULT_SLEEP = 0.5 19 | 20 | def find_value(html, key): 21 | pos_begin = html.find(key) + len(key) + 2 22 | pos_end = html.find('"', pos_begin) 23 | return html[pos_begin: pos_end] 24 | 25 | def download_tweets(search=None, profile=None, sleep=DEFAULT_SLEEP): 26 | assert search or profile 27 | 28 | term = (search or profile) 29 | url = TWITTER_SEARCH_URL if search else TWITTER_PROFILE_URL 30 | url_more = TWITTER_SEARCH_MORE_URL if search else TWITTER_PROFILE_MORE_URL 31 | 32 | response = requests.get(url.format(term=urllib.quote_plus(term)), headers={'User-agent': USER_AGENT}).text 33 | max_position = find_value(response, 'data-max-position') 34 | min_position = find_value(response, 'data-min-position') 35 | 36 | for tweet in parse_search_results(response.encode('utf8')): 37 | yield tweet 38 | 39 | has_more_items = True 40 | last_min_position = None 41 | while has_more_items: 42 | response = requests.get(url_more.format(term=urllib.quote_plus(term), max_position=min_position), headers={'User-agent': USER_AGENT}).text 43 | try: 44 | response_dict = json.loads(response) 45 | except: 46 | import datetime 47 | with open('__debug.response_%s.txt' % datetime.datetime.now().strftime('%Y-%m-%d.%H%M'), 'wb') as fh: 48 | print >>fh, repr(response) 49 | raise 50 | 51 | min_position = response_dict['min_position'] 52 | has_more_items = response_dict['has_more_items'] if profile else last_min_position != min_position 53 | 54 | for tweet in parse_search_results(response_dict['items_html'].encode('utf8')): 55 | yield tweet 56 | 57 | if search: 58 | has_more_items = True 59 | 60 | last_min_position = min_position 61 | time.sleep(sleep) 62 | 63 | 64 | 65 | def search(query): 66 | for tweet in download_tweets(search=query): 67 | yield tweet 68 | 69 | --------------------------------------------------------------------------------