├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── twanalyze.config.template ├── twanalyze.py └── twanalyze ├── __init__.py ├── oauth.py ├── parse.py ├── report.py └── twitter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # Configuration Files 39 | *.config 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, LCI Technology Group, LLC 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of LCI Technology Group, LLC nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | twanalyze 2 | ========= 3 | 4 | Twanalyze downloads account information and up to 3200 of the most recent tweets for the specified screen_name. The downloaded tweets are analyzed and a report is generated that includes the top 20 hashtags, mentions, links, tweet times, locations, and phrases used in each tweet. 5 | 6 | Prerequisites 7 | ------------- 8 | Twanalyze is dependent on the [requests](http://docs.python-requests.org/en/latest/index.html), [simplekml](http://simplekml.readthedocs.org/en/latest/), and [nltk](http://nltk.org/) libraries. 9 | * `pip install requests` 10 | * `pip install simplekml` 11 | * `pip install nltk` 12 | 13 | Twanalyze also needs a Twitter API key. You can get an API key by signing in to https://dev.twitter.com/apps with your Twitter username and password. Once you are signed in, click Create a new application. 14 | 15 | Configuration 16 | ------------- 17 | Once you have obtained the Twitter API key, you will need to add the consumer key, consumer secret, token, and token_secret to the twanalyze.config.template file and rename the file to twanalyze.config. 18 | 19 | Usage 20 | ----- 21 | To use twanalyze, provide a Twitter screen_name, a report file name, and a report format. 22 | 23 | `python twanalyze.py screen_name filename html|kml|md|raw|all` 24 | 25 | Report Formats 26 | -------------- 27 | Twanalyze supports four report formats, which are described below. A report format must be specified when launching the script. If an invalid report format is given then a Markdown report will be generated. 28 | 29 | * Markdown - Creates a Markdown formatted report with user details and the analysis results. 30 | * HTML - Creates an HTML formatted report with user details and the analysis results. 31 | * KML - Creates a KML file with lat and lon coordinates and timestamp for any tweets that contain location data. 32 | * Raw - Creates a JSON file with all of the downloaded user data and tweets. This could be a very large file. 33 | * All - Creates reports in Markdown, HTML, and KML formats. 34 | 35 | Sample Markdown Report 36 | ---------------------- 37 | Twanalyze Report 38 | ================ 39 | averagesecguy 40 | ------------- 41 | Name: Stephen Haywood 42 | Description: I have worked professionally as a programmer, school teacher, computer teacher, sysadmin and now as an information security auditor. 43 | Location: 44 | Time Zone: Eastern Time (US & Canada) 45 | UTC Offset: -5 46 | Tweets: 4916 47 | Favorites: 612 48 | Listed: 40 49 | Followers: 920 50 | Following: 392 51 | 52 | Hashtags 53 | -------- 54 | #dc423 - 15 55 | ##dc423 - 8 56 | #derbycon - 8 57 | #cispa - 7 58 | #secchat - 7 59 | #ff - 5 60 | #python - 5 61 | #cha - 4 62 | #chabiz - 4 63 | #dc865 - 4 64 | #derbycon. - 4 65 | #infosec - 3 66 | #latenighthacking - 3 67 | #metasploit - 3 68 | #shodan - 3 69 | #cfaa - 2 70 | #cha. - 2 71 | #dc423. - 2 72 | #dc865. - 2 73 | #fivewordtechhorror - 2 74 | 75 | Mentions 76 | -------- 77 | @sawaba - 100 78 | @tothehilt - 100 79 | @jakx_ - 83 80 | @jgamblin - 75 81 | @synackpwn - 69 82 | @adamcaudill - 68 83 | @tatanus - 57 84 | @jimmyvo - 50 85 | @itsecurity - 43 86 | @erickolb - 42 87 | @gepeto42 - 38 88 | @carlos_perez - 36 89 | @hrbrmstr - 35 90 | @jadedsecurity - 35 91 | @jodieswafford - 35 92 | @dave_rel1k - 34 93 | @gattaca - 33 94 | @netpwn - 30 95 | @mubix - 27 96 | @0xabad1dea - 26 97 | 98 | Links 99 | ----- 100 | https://t.co/pYTSa5dkkV - 4 101 | http://t.co/fuClE544f2 - 3 102 | https://t.co/o31LqifFFf - 3 103 | https://t.co/t8v9VZytw7 - 3 104 | http://t.co/0fSolBwg - 2 105 | http://t.co/4sWv7a3J - 2 106 | http://t.co/8XeKy5KGzD - 2 107 | http://t.co/CdLRaji0ZZ - 2 108 | http://t.co/DX41GkciCH - 2 109 | http://t.co/Fb0L6GJh - 2 110 | http://t.co/G3Dasqn3 - 2 111 | http://t.co/Ir549YJc - 2 112 | http://t.co/J15cxqJYX8 - 2 113 | http://t.co/Jqqi0uFFsq - 2 114 | http://t.co/L2a42NuF0K - 2 115 | http://t.co/Q3sfeINBKM - 2 116 | http://t.co/bRK3c8u2 - 2 117 | http://t.co/bmfySo3D - 2 118 | http://t.co/dhkwMb79w5 - 2 119 | http://t.co/fkCqdyPlKn - 2 120 | 121 | 3-word Phrases 122 | -------------- 123 | thanks for the - 34 124 | a lot of - 25 125 | if you are - 21 126 | let me know - 20 127 | i have to - 17 128 | if you have - 15 129 | me know if - 14 130 | be able to - 13 131 | i want to - 13 132 | looking forward to - 13 133 | i need to - 12 134 | know if you - 12 135 | how do you - 11 136 | i have a - 11 137 | let me know. - 11 138 | you have to - 11 139 | a couple of - 9 140 | is there a - 9 141 | what is the - 9 142 | how do i - 8 143 | 144 | 4-word Phrases 145 | -------------- 146 | let me know if - 14 147 | me know if you - 10 148 | may be able to - 7 149 | [at] averagesecurityguy [dot] info - 6 150 | know if you have - 6 151 | stephen [at] averagesecurityguy [dot] - 6 152 | thanks for the help. - 6 153 | if you are a - 5 154 | if you have any - 5 155 | this looks like a - 5 156 | but i don't think - 4 157 | is one of the - 4 158 | not the same as - 4 159 | thanks for the offer. - 4 160 | @jodieswafford @jakx_ @erickolb @sawaba - 3 161 | @tatanus @jodieswafford @jakx_ @erickolb - 3 162 | a good way to - 3 163 | a lot of good - 3 164 | anyone know of a - 3 165 | can anyone recommend a - 3 166 | 167 | 5-word Phrases 168 | -------------- 169 | let me know if you - 10 170 | stephen [at] averagesecurityguy [dot] info - 6 171 | me know if you have - 5 172 | know if you have any - 4 173 | @tatanus @jodieswafford @jakx_ @erickolb @sawaba - 3 174 | if you have any questions. - 3 175 | is it just me or - 3 176 | is not the same as - 3 177 | let me know if there - 3 178 | wish i could have been - 3 179 | 14. if you are a - 2 180 | @csoandy @gisellis so wim, which - 2 181 | @csoandy that is the biggest - 2 182 | @gisellis so wim, which ones - 2 183 | @hrbrmstr @jaredpfost @jayjacobs added a - 2 184 | @isaiahmc yes, i have the - 2 185 | @jadedsecurity @thegrugq @chort0 @amazingant @dakami - 2 186 | @jaredpfost @jayjacobs added a better - 2 187 | @jayjacobs added a better explanation. - 2 188 | @jimmyvo i'll never be bought - 2 189 | 190 | Timestamps 191 | ---------- 192 | 15:00:00 +0000 - 281 193 | 14:00:00 +0000 - 269 194 | 19:00:00 +0000 - 255 195 | 17:00:00 +0000 - 235 196 | 18:00:00 +0000 - 223 197 | 16:00:00 +0000 - 201 198 | 20:00:00 +0000 - 191 199 | 01:00:00 +0000 - 150 200 | 13:00:00 +0000 - 145 201 | 03:00:00 +0000 - 144 202 | 02:00:00 +0000 - 133 203 | 21:00:00 +0000 - 117 204 | 00:00:00 +0000 - 114 205 | 23:00:00 +0000 - 111 206 | 22:00:00 +0000 - 96 207 | 04:00:00 +0000 - 75 208 | 05:00:00 +0000 - 54 209 | 12:00:00 +0000 - 36 210 | 06:00:00 +0000 - 12 211 | 07:00:00 +0000 - 4 212 | 213 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/averagesecurityguy/twanalyze/3d9a73351b83edf529f6fd9c95ebb4d404b90cec/__init__.py -------------------------------------------------------------------------------- /twanalyze.config.template: -------------------------------------------------------------------------------- 1 | { 2 | "consumer_key": "", 3 | "consumer_secret": "", 4 | "token": "", 5 | "token_secret": "" 6 | } -------------------------------------------------------------------------------- /twanalyze.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import json 4 | import logging 5 | 6 | import twanalyze.twitter 7 | import twanalyze.report 8 | import twanalyze.parse 9 | 10 | #----------------------------------------------------------------------------- 11 | # Functions 12 | #----------------------------------------------------------------------------- 13 | def load_configuration(filename): 14 | with open(filename) as config_file: 15 | return json.loads(config_file.read()) 16 | 17 | 18 | #----------------------------------------------------------------------------- 19 | # Main Program 20 | #----------------------------------------------------------------------------- 21 | if len(sys.argv) != 4: 22 | print 'Usage: twanalyze screen_name report_file_name html|kml|md|raw|all' 23 | sys.exit() 24 | 25 | cfg = load_configuration('twanalyze.config') 26 | tw = twanalyze.twitter.Twitter(cfg['consumer_key'], cfg['consumer_secret'], 27 | cfg['token'], cfg['token_secret']) 28 | 29 | # Get data 30 | screen_name = sys.argv[1] 31 | user = tw.user(screen_name) 32 | tweets = tw.tweets(screen_name) 33 | 34 | # Analyze data 35 | analysis = twanalyze.parse.parse_tweets(tweets) 36 | 37 | # Report analysis 38 | report_file = sys.argv[2] 39 | format = sys.argv[3].lower() 40 | 41 | if format == 'all': 42 | twanalyze.report.create_html_report(user, analysis, report_file) 43 | twanalyze.report.create_kml_report(tweets, report_file) 44 | twanalyze.report.create_markdown_report(user, analysis, report_file) 45 | elif format == 'html': 46 | twanalyze.report.create_html_report(user, analysis, report_file) 47 | elif format == 'kml': 48 | twanalyze.report.create_kml_report(tweets, report_file) 49 | elif format == 'raw': 50 | twanalyze.report.create_raw_report(user, tweets, report_file) 51 | else: 52 | if format != 'md': 53 | logging.warning('Invalid report format, defaulting to Markdown.') 54 | twanalyze.report.create_markdown_report(user, analysis, report_file) 55 | -------------------------------------------------------------------------------- /twanalyze/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/averagesecurityguy/twanalyze/3d9a73351b83edf529f6fd9c95ebb4d404b90cec/twanalyze/__init__.py -------------------------------------------------------------------------------- /twanalyze/oauth.py: -------------------------------------------------------------------------------- 1 | import time 2 | import base64 3 | import random 4 | import urllib 5 | import urlparse 6 | import hmac 7 | import hashlib 8 | from requests.auth import AuthBase 9 | 10 | 11 | class TwitterSingleOAuth(AuthBase): 12 | '''Creates an authorization header for a single user Twitter Oauth 13 | request. Three-legged auth is not supported.''' 14 | def __init__(self, ck=None, cs=None, at=None, ats=None): 15 | self.consumer_key = ck 16 | self.consumer_secret = cs 17 | self.access_token = at 18 | self.access_token_secret = ats 19 | self.__nonce = None 20 | self.__time = None 21 | 22 | def __call__(self, r): 23 | '''Return the authorization header needed.''' 24 | self.__base_url = self.__get_base_url(r.url) 25 | self.__body = self.__get_body_params(r.body) 26 | self.__query = self.__get_query_params(r.url) 27 | 28 | r.headers['Authorization'] = self.__generate_auth_string(r) 29 | return r 30 | 31 | def __enc(self, string): 32 | encoded_str = urllib.quote(string, safe='') 33 | return encoded_str.replace('+', '%20').replace('%7E', '~') 34 | 35 | def __get_nonce(self, length=32): 36 | n = '' 37 | for i in range(length): 38 | n += random.choice('0123456789ABCDEF') 39 | 40 | return n 41 | 42 | def __get_base_url(self, url): 43 | url = url.split('?') 44 | 45 | return url[0] 46 | 47 | def __get_query_params(self, url): 48 | q = {} 49 | query = urlparse.urlparse(url).query 50 | 51 | if query != '': 52 | for param in query.split('&'): 53 | key, val = param.split('=') 54 | q[key] = val 55 | 56 | return q 57 | 58 | def __get_body_params(self, body): 59 | b = {} 60 | 61 | if body is not None: 62 | body = body.replace('+', ' ') 63 | body = urllib.unquote(body) 64 | for p in body.split('&'): 65 | key, val = p.split('=') 66 | b[key] = val 67 | 68 | return b 69 | 70 | def __calculate_signature(self, r): 71 | base = self.__generate_base_string(r) 72 | key = self.__generate_signing_key() 73 | signature = hmac.new(key, base, hashlib.sha1) 74 | 75 | return base64.b64encode(signature.digest()) 76 | 77 | def __generate_base_string(self, r): 78 | base = r.method.upper() + '&' 79 | 80 | base += self.__enc(self.__base_url) + '&' 81 | base += self.__enc(self.__generate_parameter_string(r)) 82 | 83 | return base 84 | 85 | def __generate_parameter_string(self, r): 86 | p = {} 87 | p['oauth_consumer_key'] = self.__enc(self.consumer_key) 88 | p['oauth_nonce'] = self.__enc(self.__nonce) 89 | p['oauth_signature_method'] = 'HMAC-SHA1' 90 | p['oauth_timestamp'] = self.__time 91 | p['oauth_token'] = self.__enc(self.access_token) 92 | p['oauth_version'] = '1.0' 93 | 94 | for k, v in self.__query.iteritems(): 95 | p[self.__enc(k)] = self.__enc(v) 96 | 97 | for k, v in self.__body.iteritems(): 98 | p[self.__enc(k)] = self.__enc(v) 99 | 100 | pstr = '&'.join(['{0}={1}'.format(k, p[k]) for k in sorted(p)]) 101 | 102 | return pstr 103 | 104 | def __generate_signing_key(self): 105 | key = self.__enc(self.consumer_secret) 106 | key += '&' 107 | key += self.__enc(self.access_token_secret) 108 | 109 | return key 110 | 111 | def __generate_auth_string(self, r): 112 | self.__nonce = self.__get_nonce() 113 | self.__time = int(time.time()) 114 | a = 'OAuth ' 115 | a += 'oauth_consumer_key="{0}", '.format(self.__enc(self.consumer_key)) 116 | a += 'oauth_nonce="{0}", '.format(self.__enc(self.__nonce)) 117 | a += 'oauth_signature="{0}", '.format(self.__enc(self.__calculate_signature(r))) 118 | a += 'oauth_signature_method="HMAC-SHA1", ' 119 | a += 'oauth_timestamp="{0}", '.format(self.__time) 120 | a += 'oauth_token="{0}", '.format(self.__enc(self.access_token)) 121 | a += 'oauth_version="1.0"' 122 | 123 | return a 124 | -------------------------------------------------------------------------------- /twanalyze/parse.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import re 3 | 4 | def __parse_hashtags(t): 5 | ''' 6 | Get any hashtags from the hashtags entity. 7 | ''' 8 | if t.get('entities') is not None: 9 | return [h['text'].lower() for h in t['entities']['hashtags']] 10 | else: 11 | return [] 12 | 13 | def __parse_links(t): 14 | ''' 15 | Get any links from the urls entity. 16 | ''' 17 | if t.get('entities') is not None: 18 | return [u['expanded_url'] for u in t['entities']['urls']] 19 | else: 20 | return [] 21 | 22 | 23 | def __parse_mentions(t): 24 | ''' 25 | Get any mentions from the user_mentions entity. 26 | ''' 27 | if t.get('entities') is not None: 28 | return [m['screen_name'].lower() for m in t['entities']['user_mentions']] 29 | else: 30 | return [] 31 | 32 | 33 | def __parse_phrases(t, count): 34 | ngrams = nltk.util.ngrams(t['text'].lower().split(), count) 35 | return [' '.join(ngram) for ngram in ngrams] 36 | 37 | 38 | def __parse_time(timestamp): 39 | ''' 40 | Get the timestamp for the tweet, remove the seconds and minutes, and 41 | store the timestamp for analysis. 42 | ''' 43 | m = re.search(r'(\d\d:\d\d:\d\d \+\d\d\d\d)', timestamp) 44 | if m is not None: 45 | return re.sub(r':\d\d:\d\d ', ':00:00 ', m.group(1)) 46 | else: 47 | return None 48 | 49 | 50 | def __parse_place(place): 51 | ''' 52 | Capture the place id, country, and full_name. 53 | ''' 54 | if place is not None: 55 | return (place['id'], place['country'], place['full_name']) 56 | else: 57 | return None 58 | 59 | 60 | def parse_tweets(tweets): 61 | analysis = {'hashtags': [], 'mentions': [], 'links': [], 'phrase3': [], 62 | 'phrase4': [], 'phrase5': [], 'times': [], 'places': []} 63 | 64 | for tweet in tweets: 65 | # ht, mt, li = __parse_words(tweet) 66 | analysis['hashtags'].extend(__parse_hashtags(tweet)) 67 | analysis['mentions'].extend(__parse_mentions(tweet)) 68 | analysis['links'].extend(__parse_links(tweet)) 69 | 70 | analysis['phrase3'].extend(__parse_phrases(tweet, 3)) 71 | analysis['phrase4'].extend(__parse_phrases(tweet, 4)) 72 | analysis['phrase5'].extend(__parse_phrases(tweet, 5)) 73 | 74 | time = __parse_time(tweet['created_at']) 75 | if time is not None: 76 | analysis['times'].append(time) 77 | 78 | place = __parse_place(tweet['place']) 79 | if place is not None: 80 | analysis['places'].append(place) 81 | 82 | return analysis 83 | -------------------------------------------------------------------------------- /twanalyze/report.py: -------------------------------------------------------------------------------- 1 | import simplekml 2 | import nltk 3 | import logging 4 | import json 5 | 6 | #----------------------------------------------------------------------------- 7 | # RAW REPORT 8 | #----------------------------------------------------------------------------- 9 | def create_raw_report(user, tweets, filename): 10 | if not filename.endswith('.json'): 11 | filename = filename + '.json' 12 | 13 | logging.info('Writing RAW report to {0}.'.format(filename)) 14 | 15 | report = {'user': user, 'tweets': tweets} 16 | 17 | raw = open(filename, 'w') 18 | raw.write(json.dumps(report, indent=2)) 19 | raw.close() 20 | 21 | 22 | #----------------------------------------------------------------------------- 23 | # KML REPORT 24 | #----------------------------------------------------------------------------- 25 | def create_kml_report(tweets, filename): 26 | if not filename.endswith('.kml'): 27 | filename = filename + '.kml' 28 | 29 | logging.info('Writing KML report to {0}.'.format(filename)) 30 | 31 | kml = simplekml.Kml() 32 | 33 | for tweet in tweets: 34 | if tweet['coordinates'] is not None: 35 | timestamp = simplekml.TimeStamp(when=tweet['created_at']) 36 | kml.newpoint(name=tweet['id_str'], 37 | description=tweet['text'], 38 | timestamp=timestamp, 39 | coords=[tuple(tweet['coordinates']['coordinates'])]) 40 | 41 | kml.save(filename) 42 | 43 | 44 | #----------------------------------------------------------------------------- 45 | # MARKDOWN REPORT 46 | #----------------------------------------------------------------------------- 47 | def __md_user(user): 48 | ''' 49 | Print various attributes of the user. 50 | ''' 51 | u = u'' 52 | u += u'{0}\n'.format(user['screen_name']) 53 | u += '-' * len(user['screen_name']) + '\n' 54 | u += u'Name: {0}\n'.format(user['name']) 55 | u += u'Description: {0}\n'.format(user['description']) 56 | u += u'Location: {0}\n'.format(user['location']) 57 | u += 'Time Zone: {0}\n'.format(user['time_zone']) 58 | u += 'UTC Offset: {0}\n'.format(user['utc_offset']/3600) 59 | u += 'Tweets: {0}\n'.format(user['statuses_count']) 60 | u += 'Favorites: {0}\n'.format(user['favourites_count']) 61 | u += 'Listed: {0}\n'.format(user['listed_count']) 62 | u += 'Followers: {0}\n'.format(user['followers_count']) 63 | u += 'Following: {0}\n'.format(user['friends_count']) 64 | u += '\n' 65 | 66 | return u.encode('utf-8') 67 | 68 | 69 | def __md_distribution(title, items, top=20): 70 | ''' 71 | Calculate the frequency distribution of the list of items. Convert the 72 | top most frequent items in the list to Markdown 73 | ''' 74 | d = u'' 75 | if len(items) != 0: 76 | d += '{0}\n'.format(title) 77 | d += '-' * len(title) + '\n' 78 | 79 | dist = nltk.FreqDist(items) 80 | 81 | for k in dist.keys()[:top]: 82 | if title == 'Hashtags': 83 | link = 'https://twitter.com/search?q=%23{0}'.format(k) 84 | d += u'* [#{0}]({1}) - {2}\n'.format(k, link, dist[k]) 85 | elif title == 'Mentions': 86 | link = 'https://twitter.com/{0}'.format(k) 87 | d += u'* [@{0}]({1}) - {2}\n'.format(k, link, dist[k]) 88 | elif title == 'Links': 89 | d += u'* [{0}]({0}) - {1}\n'.format(k, dist[k]) 90 | else: 91 | d += u'* {0} - {1}\n'.format(k, dist[k]) 92 | 93 | d += '\n' 94 | 95 | return d.encode('utf-8') 96 | 97 | 98 | def create_markdown_report(user, analysis, filename): 99 | if not filename.endswith('.md'): 100 | filename = filename + '.md' 101 | 102 | logging.info('Writing Markdown report to {0}.'.format(filename)) 103 | 104 | md = open(filename, 'w') 105 | md.write('Twanalyze Report\n') 106 | md.write('================\n') 107 | md.write(__md_user(user)) 108 | md.write(__md_distribution('Hashtags', analysis['hashtags'])) 109 | md.write(__md_distribution('Mentions', analysis['mentions'])) 110 | md.write(__md_distribution('Links', analysis['links'])) 111 | md.write(__md_distribution('3-word Phrases', analysis['phrase3'])) 112 | md.write(__md_distribution('4-word Phrases', analysis['phrase4'])) 113 | md.write(__md_distribution('5-word Phrases', analysis['phrase5'])) 114 | md.write(__md_distribution('Timestamps', analysis['times'], top=24)) 115 | md.write(__md_distribution('Places', analysis['places'])) 116 | md.close() 117 | 118 | #----------------------------------------------------------------------------- 119 | # HTML REPORT 120 | #----------------------------------------------------------------------------- 121 | def __html_user(user): 122 | ''' 123 | Print various attributes of the user in HTML. 124 | ''' 125 | u = u'' 126 | u += u'

{0}

\n'.format(user['screen_name']) 127 | u += '' 139 | 140 | return u.encode('utf-8') 141 | 142 | 143 | def __html_distribution(title, items, top=20): 144 | ''' 145 | Calculate the frequency distribution of the list of items. Print the 20 146 | most frequent items in the list. 147 | ''' 148 | d = u'' 149 | if len(items) != 0: 150 | d += '

{0}

\n'.format(title) 151 | d += '\n' 171 | 172 | return d.encode('utf-8') 173 | 174 | 175 | def create_html_report(user, analysis, filename): 176 | if not filename.endswith('.html'): 177 | filename = filename + '.html' 178 | 179 | logging.info('Writing HTML report to {0}.'.format(filename)) 180 | 181 | html = open(filename, 'w') 182 | html.write('\n\n\n') 183 | html.write('\n\n\n') 188 | html.write('

Twanalyze Report

\n') 189 | html.write(__html_user(user)) 190 | html.write(__html_distribution('Hashtags', analysis['hashtags'])) 191 | html.write(__html_distribution('Mentions', analysis['mentions'])) 192 | html.write(__html_distribution('Links', analysis['links'])) 193 | html.write(__html_distribution('3-word Phrases', analysis['phrase3'])) 194 | html.write(__html_distribution('4-word Phrases', analysis['phrase4'])) 195 | html.write(__html_distribution('5-word Phrases', analysis['phrase5'])) 196 | html.write(__html_distribution('Timestamps', analysis['times'], top=24)) 197 | html.write(__html_distribution('Places', analysis['places'])) 198 | html.write('\n\n') 199 | html.close() 200 | -------------------------------------------------------------------------------- /twanalyze/twitter.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import oauth 3 | 4 | class Twitter(): 5 | def __init__(self, ck, cs, t, ts): 6 | self.consumer_key = ck 7 | self.consumer_secret = cs 8 | self.token = t 9 | self.token_secret = ts 10 | self.__base = 'https://api.twitter.com/1.1' 11 | self.__ssn = requests.Session() 12 | self.__ssn.auth = oauth.TwitterSingleOAuth(self.consumer_key, 13 | self.consumer_secret, 14 | self.token, 15 | self.token_secret) 16 | 17 | def __connect(self, method, resource, params=None, data=None): 18 | '''Make a GET or POST request to the API.''' 19 | url = self.__base + resource 20 | 21 | if method.upper() == 'GET': 22 | r = self.__ssn.get(url, params=params) 23 | if r.status_code == 429: 24 | return {} 25 | 26 | return r.json() 27 | 28 | if method.upper() == 'POST': 29 | r = self.__ssn.post(url, data=data) 30 | if r.status_code == 429: 31 | return {} 32 | 33 | return r.json() 34 | 35 | def __users(self, url, params={}, count=200, total=3000): 36 | ''' 37 | Returns a list of items provided by the URL using the specified 38 | parameters. Should work for the following URLs: 39 | 40 | followers/list 41 | friends/list 42 | ''' 43 | users = [] 44 | cursor = -1 45 | 46 | while cursor != 0: 47 | params['next_cursor'] = cursor 48 | params['count'] = count 49 | 50 | resp = self.__connect('GET', url, params=params) 51 | if resp == {}: 52 | cursor = 0 53 | else: 54 | users.extend(resp['users']) 55 | cursor = resp['next_cursor'] 56 | 57 | return users 58 | 59 | def __statuses(self, url, params={}, count=200, total=800): 60 | ''' 61 | Get the statuses provided by the URL using the specified parameters. 62 | ''' 63 | params['count'] = count 64 | 65 | statuses = [] 66 | requested = 0 67 | 68 | while requested < total: 69 | ids = [] 70 | resp = self.__connect('GET', url, params=params) 71 | 72 | if resp == {}: 73 | requested = total 74 | else: 75 | ids = [r['id'] for r in resp] 76 | statuses.extend(resp) 77 | # for r in resp: 78 | # statuses.append(r) 79 | # ids.append(r['id']) 80 | 81 | params['max_id'] = min(ids) 82 | requested += count 83 | 84 | return statuses 85 | 86 | def followers(self, screen_name): 87 | ''' 88 | Get the followers of the specified user account. 89 | ''' 90 | params = {'screen_name': screen_name, 91 | 'include_user_entities': False} 92 | 93 | return self.__users('/followers/list.json', params) 94 | 95 | def friends(self, screen_name): 96 | ''' 97 | Get the friends (people they follow) of the specified user account. 98 | ''' 99 | params = {'screen_name': screen_name, 100 | 'include_user_entities': False} 101 | 102 | return self.__users('friends/list.json', params) 103 | 104 | def tweets(self, screen_name): 105 | ''' 106 | Get the last 3200 statuses for the specified screen_name. Replies are 107 | included in the results. 108 | ''' 109 | params = {'screen_name': screen_name, 'include_rts': False} 110 | return self.__statuses('/statuses/user_timeline.json', 111 | params=params, 112 | total=3200) 113 | 114 | def user(self, screen_name): 115 | ''' 116 | Get general information about the specified user account. 117 | ''' 118 | params = {'screen_name': screen_name} 119 | return self.__connect('GET', '/users/show.json', params=params) 120 | --------------------------------------------------------------------------------