├── .gitignore
├── AUTHORS.txt
├── LICENSE.txt
├── README.md
├── examples
│   ├── quickstart.py
│   └── without_date_range.py
├── requirements.txt
├── setup.py
└── twitterwebsearch
    ├── __init__.py
    ├── io.py
    ├── parser.py
    └── searcher.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore Eclipse project files
/.project
/.pydevproject

# Ignore compiled Python files
*.pyc

# Ignore example output files
/examples/example.ljson

--------------------------------------------------------------------------------
/AUTHORS.txt:
--------------------------------------------------------------------------------
Raynor Vliegendhart
Egbert Bouman

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2015 R. Vliegendhart

Licensed under the MIT license:

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# twitterwebsearch
This Python package automates the process of finding tweets older than a few
weeks. Such tweets cannot be retrieved through the Twitter search API, but
they can still be found through the web interface, which this package scrapes.
The package targets Python 2.7.

## Installation:

    pip install https://github.com/ShinNoNoir/twitterwebsearch/archive/master.zip

## Small example:

    import json
    import twitterwebsearch

    QUERY = '@shinnonoir since:2010-01-20 until:2010-02-01'

    def main():
        tweets = twitterwebsearch.search(QUERY)
        tweets = list(tweets)  # convert generator into list
        print json.dumps(tweets, indent=2)

    if __name__ == '__main__':
        main()

--------------------------------------------------------------------------------
/examples/quickstart.py:
--------------------------------------------------------------------------------
'''
Quickstart example.
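
Searches a fixed date range, prints the tweets as JSON, and round-trips
them through save_tweets/load_tweets from the io module.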
'''

import json
import twitterwebsearch
import twitterwebsearch.io

QUERY = '@shinnonoir since:2010-01-20 until:2010-02-01'

def main():
    tweets = twitterwebsearch.search(QUERY)
    tweets = list(tweets)  # convert generator into list

    print json.dumps(tweets, indent=2)

    twitterwebsearch.io.save_tweets(tweets, 'example.ljson')
    loaded_tweets = twitterwebsearch.io.load_tweets('example.ljson')

    # Loading should reproduce exactly what was saved.
    assert list(loaded_tweets) == tweets

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/examples/without_date_range.py:
--------------------------------------------------------------------------------
'''
Example without date range.
'''

import time
import twitterwebsearch

QUERY = 'president'
CUTOFF = 500

def main():
    start_time = time.time()
    for i, tweet in enumerate(twitterwebsearch.search(QUERY)):
        n = i + 1
        # Trailing comma plus '\r' keeps rewriting a single status line.
        print '%s tweets crawled...\r' % n,

        if n >= CUTOFF:
            break

    end_time = time.time()

    print '\nTook %s seconds' % (end_time - start_time)

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
requests

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

# Compare version tuples; comparing the sys.version string lexicographically
# would misorder releases such as 2.10 vs 2.7.
if sys.version_info < (2, 7):
    print 'Python >= 2.7 required'
    sys.exit(1)

from setuptools import setup

long_description = '''
A simple Python package for using Twitter search functionality
that is only available through the Twitter web interface
(such as searching for tweets older than a few weeks).'''.strip()

setup(
    name='twitterwebsearch',
    version='0.2.5',
    author='Raynor Vliegendhart',
    author_email='ShinNoNoir@gmail.com',
    url='https://github.com/ShinNoNoir/twitterwebsearch',

    packages=['twitterwebsearch'],

    description="Package for Twitter's web search",
    long_description=long_description,
    platforms='Any',
    license='MIT (see: LICENSE.txt)',
    keywords='Twitter, search',

    # Strip trailing newlines and skip blank lines in requirements.txt.
    install_requires=[line.strip() for line in open('requirements.txt') if line.strip()],
)

--------------------------------------------------------------------------------
/twitterwebsearch/__init__.py:
--------------------------------------------------------------------------------
from twitterwebsearch.searcher import search

--------------------------------------------------------------------------------
/twitterwebsearch/io.py:
--------------------------------------------------------------------------------
"""
Module for saving and loading tweets.
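
Tweets are stored as line-delimited JSON (.ljson): save_tweets writes one
JSON object per line, and load_tweets lazily yields them back.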
3 | """ 4 | import json 5 | import twitterwebsearch.parser 6 | 7 | def read_search_results(path): 8 | with open(path, 'r') as fh: 9 | for tweet in twitterwebsearch.parser.parse_search_results(fh.read()): 10 | yield tweet 11 | 12 | def save_tweets(tweets, path): 13 | with open(path, 'w') as fh: 14 | for tweet in tweets: 15 | print >>fh, json.dumps(tweet) 16 | 17 | def load_tweets(path): 18 | with open(path, 'r') as fh: 19 | for line in fh: 20 | yield json.loads(line) 21 | 22 | -------------------------------------------------------------------------------- /twitterwebsearch/parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parser module for parsing Twitter search results obtained through the web 3 | interface. 4 | """ 5 | import bs4 6 | 7 | def has_class(class_name): 8 | return lambda class_: class_ and class_name in class_.split() 9 | 10 | only_tweet_tags = bs4.SoupStrainer('div', class_=has_class('tweet'), **{'data-tweet-id': True}) 11 | 12 | def parse_tweet_tag(tag, expand_emojis=True): 13 | tweet_id = tag['data-tweet-id'] 14 | permalink = tag['data-permalink-path'] 15 | screen_name = tag['data-screen-name'] 16 | name = tag['data-name'] 17 | user_id = tag['data-user-id'] 18 | 19 | content_div = tag.find('div', class_=has_class('content')) 20 | tweet_body_tag = content_div.find('p', class_=has_class('tweet-text')) 21 | 22 | if tweet_body_tag is None: 23 | # Might be a censored tweet, skip 24 | return 25 | 26 | if expand_emojis: 27 | for emoji_tag in tweet_body_tag.find_all(class_='Emoji'): 28 | emoji_tag.insert(0, emoji_tag['alt']) 29 | 30 | 31 | lang = tweet_body_tag['lang'] 32 | tweet_text = tweet_body_tag.text 33 | 34 | urls = [ 35 | a['data-expanded-url'] 36 | for a in tweet_body_tag.find_all('a', class_=has_class('twitter-timeline-link')) 37 | if 'data-expanded-url' in a.attrs 38 | ] 39 | 40 | mentions = [ 41 | a.text 42 | for a in tweet_body_tag.find_all('a', class_=has_class('twitter-atreply')) 43 | ] 44 | 45 | timestamp = int(content_div.find(**{'data-time-ms':True})['data-time-ms'])/1000. 46 | 47 | footer_div = content_div.find('div', class_=has_class('stream-item-footer')) 48 | def get_stats(stats_type): 49 | span = footer_div.find('span', class_=has_class("ProfileTweet-action--%s" % stats_type)) 50 | spanspan = span.find('span', class_=has_class("ProfileTweet-actionCount")) 51 | return int(spanspan['data-tweet-stat-count']) 52 | 53 | footer_div.find('span', class_="ProfileTweet-action--retweet") 54 | retweet_count = get_stats('retweet') 55 | favorite_count = get_stats('favorite') 56 | 57 | tweet = dict(tweet_id=tweet_id, 58 | permalink=permalink, 59 | screen_name=screen_name, 60 | name=name, 61 | user_id=user_id, 62 | lang=lang, 63 | tweet_text=tweet_text, 64 | urls=urls, 65 | mentions=mentions, 66 | retweet_count=retweet_count, 67 | favorite_count=favorite_count, 68 | timestamp=timestamp) 69 | 70 | return tweet 71 | 72 | def parse_search_results(html): 73 | soup_tweets = bs4.BeautifulSoup(html, 'html.parser', parse_only=only_tweet_tags, from_encoding='utf-8') 74 | for tag in soup_tweets: 75 | tweet = parse_tweet_tag(tag) 76 | if tweet is not None: 77 | yield tweet 78 | 79 | -------------------------------------------------------------------------------- /twitterwebsearch/searcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for using the web interface of Twitter's search. 
3 | """ 4 | import json 5 | import time 6 | import urllib 7 | import requests 8 | from twitterwebsearch.parser import parse_search_results 9 | 10 | 11 | TWITTER_PROFILE_URL = 'https://twitter.com/{term}' 12 | TWITTER_PROFILE_MORE_URL = 'https://twitter.com/i/profiles/show/{term}/timeline?include_available_features=1&include_entities=1&max_position={max_position}' 13 | TWITTER_SEARCH_URL = 'https://twitter.com/search?q={term}&src=typd&vertical=default&f=tweets' 14 | TWITTER_SEARCH_MORE_URL = 'https://twitter.com/i/search/timeline?q={term}&src=typd&vertical=default&f=tweets&include_available_features=1&include_entities=1&max_position={max_position}' 15 | 16 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 Edge/12.0' 17 | 18 | DEFAULT_SLEEP = 0.5 19 | 20 | def find_value(html, key): 21 | pos_begin = html.find(key) + len(key) + 2 22 | pos_end = html.find('"', pos_begin) 23 | return html[pos_begin: pos_end] 24 | 25 | def download_tweets(search=None, profile=None, sleep=DEFAULT_SLEEP): 26 | assert search or profile 27 | 28 | term = (search or profile) 29 | url = TWITTER_SEARCH_URL if search else TWITTER_PROFILE_URL 30 | url_more = TWITTER_SEARCH_MORE_URL if search else TWITTER_PROFILE_MORE_URL 31 | 32 | response = requests.get(url.format(term=urllib.quote_plus(term)), headers={'User-agent': USER_AGENT}).text 33 | max_position = find_value(response, 'data-max-position') 34 | min_position = find_value(response, 'data-min-position') 35 | 36 | for tweet in parse_search_results(response.encode('utf8')): 37 | yield tweet 38 | 39 | has_more_items = True 40 | last_min_position = None 41 | while has_more_items: 42 | response = requests.get(url_more.format(term=urllib.quote_plus(term), max_position=min_position), headers={'User-agent': USER_AGENT}).text 43 | try: 44 | response_dict = json.loads(response) 45 | except: 46 | import datetime 47 | with open('__debug.response_%s.txt' % datetime.datetime.now().strftime('%Y-%m-%d.%H%M'), 'wb') as fh: 48 | print >>fh, repr(response) 49 | raise 50 | 51 | min_position = response_dict['min_position'] 52 | has_more_items = response_dict['has_more_items'] if profile else last_min_position != min_position 53 | 54 | for tweet in parse_search_results(response_dict['items_html'].encode('utf8')): 55 | yield tweet 56 | 57 | if search: 58 | has_more_items = True 59 | 60 | last_min_position = min_position 61 | time.sleep(sleep) 62 | 63 | 64 | 65 | def search(query): 66 | for tweet in download_tweets(search=query): 67 | yield tweet 68 | 69 | --------------------------------------------------------------------------------