├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── twanalyze.config.template
├── twanalyze.py
└── twanalyze
    ├── __init__.py
    ├── oauth.py
    ├── parse.py
    ├── report.py
    └── twitter.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 | 
22 | # Installer logs
23 | pip-log.txt
24 | 
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 | 
30 | # Translations
31 | *.mo
32 | 
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 | 
38 | # Configuration Files
39 | *.config
40 | 
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, LCI Technology Group, LLC
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification,
 5 | are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice, this
11 |   list of conditions and the following disclaimer in the documentation and/or
12 |   other materials provided with the distribution.
13 | 
14 | * Neither the name of LCI Technology Group, LLC nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | twanalyze
  2 | =========
  3 | 
  4 | Twanalyze downloads account information and up to 3200 of the most recent tweets for the specified screen_name. The downloaded tweets are analyzed and a report is generated that lists the most frequent hashtags, mentions, links, tweet times, locations, and phrases found across those tweets (the top 20 of each by default).
  5 | 
  6 | Prerequisites
  7 | -------------
  8 | Twanalyze is dependent on the [requests](http://docs.python-requests.org/en/latest/index.html), [simplekml](http://simplekml.readthedocs.org/en/latest/), and [nltk](http://nltk.org/) libraries.
  9 | * `pip install requests`
 10 | * `pip install simplekml`
 11 | * `pip install nltk`
 12 | 
 13 | Twanalyze also needs a Twitter API key. You can get an API key by signing in to https://dev.twitter.com/apps with your Twitter username and password. Once you are signed in, click Create a new application.
 14 | 
 15 | Configuration
 16 | -------------
 17 | Once you have obtained the Twitter API key, you will need to add the consumer key, consumer secret, token, and token_secret to the twanalyze.config.template file and rename the file to twanalyze.config.
 18 | 
 19 | Usage
 20 | -----
 21 | To use twanalyze, provide a Twitter screen_name, a report file name, and a report format.
 22 | 
 23 | `python twanalyze.py screen_name filename html|kml|md|raw|all`
 24 | 
 25 | Report Formats
 26 | --------------
 27 | Twanalyze supports four report formats, plus an `all` option that writes several of them at once; each is described below. A report format must be specified when launching the script. If an invalid report format is given then a Markdown report will be generated.
 28 | 
 29 | * Markdown - Creates a Markdown formatted report with user details and the analysis results.
 30 | * HTML - Creates an HTML formatted report with user details and the analysis results.
 31 | * KML - Creates a KML file with lat and lon coordinates and timestamp for any tweets that contain location data.
 32 | * Raw - Creates a JSON file with all of the downloaded user data and tweets. This could be a very large file.
 33 | * All - Creates reports in Markdown, HTML, and KML formats.
 34 | 
 35 | Sample Markdown Report
 36 | ----------------------
 37 | 	Twanalyze Report
 38 | 	================
 39 | 	averagesecguy
 40 | 	-------------
 41 | 	Name: Stephen Haywood
 42 | 	Description: I have worked professionally as a programmer, school teacher, computer teacher, sysadmin and now as an information security auditor.
 43 | 	Location: 
 44 | 	Time Zone: Eastern Time (US & Canada)
 45 | 	UTC Offset: -5
 46 | 	Tweets: 4916
 47 | 	Favorites: 612
 48 | 	Listed: 40
 49 | 	Followers: 920
 50 | 	Following: 392
 51 | 
 52 | 	Hashtags
 53 | 	--------
 54 | 	#dc423 - 15
 55 | 	##dc423 - 8
 56 | 	#derbycon - 8
 57 | 	#cispa - 7
 58 | 	#secchat - 7
 59 | 	#ff - 5
 60 | 	#python - 5
 61 | 	#cha - 4
 62 | 	#chabiz - 4
 63 | 	#dc865 - 4
 64 | 	#derbycon. - 4
 65 | 	#infosec - 3
 66 | 	#latenighthacking - 3
 67 | 	#metasploit - 3
 68 | 	#shodan - 3
 69 | 	#cfaa - 2
 70 | 	#cha. - 2
 71 | 	#dc423. - 2
 72 | 	#dc865. - 2
 73 | 	#fivewordtechhorror - 2
 74 | 
 75 | 	Mentions
 76 | 	--------
 77 | 	@sawaba - 100
 78 | 	@tothehilt - 100
 79 | 	@jakx_ - 83
 80 | 	@jgamblin - 75
 81 | 	@synackpwn - 69
 82 | 	@adamcaudill - 68
 83 | 	@tatanus - 57
 84 | 	@jimmyvo - 50
 85 | 	@itsecurity - 43
 86 | 	@erickolb - 42
 87 | 	@gepeto42 - 38
 88 | 	@carlos_perez - 36
 89 | 	@hrbrmstr - 35
 90 | 	@jadedsecurity - 35
 91 | 	@jodieswafford - 35
 92 | 	@dave_rel1k - 34
 93 | 	@gattaca - 33
 94 | 	@netpwn - 30
 95 | 	@mubix - 27
 96 | 	@0xabad1dea - 26
 97 | 
 98 | 	Links
 99 | 	-----
100 | 	https://t.co/pYTSa5dkkV - 4
101 | 	http://t.co/fuClE544f2 - 3
102 | 	https://t.co/o31LqifFFf - 3
103 | 	https://t.co/t8v9VZytw7 - 3
104 | 	http://t.co/0fSolBwg - 2
105 | 	http://t.co/4sWv7a3J - 2
106 | 	http://t.co/8XeKy5KGzD - 2
107 | 	http://t.co/CdLRaji0ZZ - 2
108 | 	http://t.co/DX41GkciCH - 2
109 | 	http://t.co/Fb0L6GJh - 2
110 | 	http://t.co/G3Dasqn3 - 2
111 | 	http://t.co/Ir549YJc - 2
112 | 	http://t.co/J15cxqJYX8 - 2
113 | 	http://t.co/Jqqi0uFFsq - 2
114 | 	http://t.co/L2a42NuF0K - 2
115 | 	http://t.co/Q3sfeINBKM - 2
116 | 	http://t.co/bRK3c8u2 - 2
117 | 	http://t.co/bmfySo3D - 2
118 | 	http://t.co/dhkwMb79w5 - 2
119 | 	http://t.co/fkCqdyPlKn - 2
120 | 
121 | 	3-word Phrases
122 | 	--------------
123 | 	thanks for the - 34
124 | 	a lot of - 25
125 | 	if you are - 21
126 | 	let me know - 20
127 | 	i have to - 17
128 | 	if you have - 15
129 | 	me know if - 14
130 | 	be able to - 13
131 | 	i want to - 13
132 | 	looking forward to - 13
133 | 	i need to - 12
134 | 	know if you - 12
135 | 	how do you - 11
136 | 	i have a - 11
137 | 	let me know. - 11
138 | 	you have to - 11
139 | 	a couple of - 9
140 | 	is there a - 9
141 | 	what is the - 9
142 | 	how do i - 8
143 | 
144 | 	4-word Phrases
145 | 	--------------
146 | 	let me know if - 14
147 | 	me know if you - 10
148 | 	may be able to - 7
149 | 	[at] averagesecurityguy [dot] info - 6
150 | 	know if you have - 6
151 | 	stephen [at] averagesecurityguy [dot] - 6
152 | 	thanks for the help. - 6
153 | 	if you are a - 5
154 | 	if you have any - 5
155 | 	this looks like a - 5
156 | 	but i don't think - 4
157 | 	is one of the - 4
158 | 	not the same as - 4
159 | 	thanks for the offer. - 4
160 | 	@jodieswafford @jakx_ @erickolb @sawaba - 3
161 | 	@tatanus @jodieswafford @jakx_ @erickolb - 3
162 | 	a good way to - 3
163 | 	a lot of good - 3
164 | 	anyone know of a - 3
165 | 	can anyone recommend a - 3
166 | 
167 | 	5-word Phrases
168 | 	--------------
169 | 	let me know if you - 10
170 | 	stephen [at] averagesecurityguy [dot] info - 6
171 | 	me know if you have - 5
172 | 	know if you have any - 4
173 | 	@tatanus @jodieswafford @jakx_ @erickolb @sawaba - 3
174 | 	if you have any questions. - 3
175 | 	is it just me or - 3
176 | 	is not the same as - 3
177 | 	let me know if there - 3
178 | 	wish i could have been - 3
179 | 	14. if you are a - 2
180 | 	@csoandy @gisellis so wim, which - 2
181 | 	@csoandy that is the biggest - 2
182 | 	@gisellis so wim, which ones - 2
183 | 	@hrbrmstr @jaredpfost @jayjacobs added a - 2
184 | 	@isaiahmc yes, i have the - 2
185 | 	@jadedsecurity @thegrugq @chort0 @amazingant @dakami - 2
186 | 	@jaredpfost @jayjacobs added a better - 2
187 | 	@jayjacobs added a better explanation. - 2
188 | 	@jimmyvo i'll never be bought - 2
189 | 
190 | 	Timestamps
191 | 	----------
192 | 	15:00:00 +0000 - 281
193 | 	14:00:00 +0000 - 269
194 | 	19:00:00 +0000 - 255
195 | 	17:00:00 +0000 - 235
196 | 	18:00:00 +0000 - 223
197 | 	16:00:00 +0000 - 201
198 | 	20:00:00 +0000 - 191
199 | 	01:00:00 +0000 - 150
200 | 	13:00:00 +0000 - 145
201 | 	03:00:00 +0000 - 144
202 | 	02:00:00 +0000 - 133
203 | 	21:00:00 +0000 - 117
204 | 	00:00:00 +0000 - 114
205 | 	23:00:00 +0000 - 111
206 | 	22:00:00 +0000 - 96
207 | 	04:00:00 +0000 - 75
208 | 	05:00:00 +0000 - 54
209 | 	12:00:00 +0000 - 36
210 | 	06:00:00 +0000 - 12
211 | 	07:00:00 +0000 - 4
212 | 
213 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/averagesecurityguy/twanalyze/3d9a73351b83edf529f6fd9c95ebb4d404b90cec/__init__.py


--------------------------------------------------------------------------------
/twanalyze.config.template:
--------------------------------------------------------------------------------
1 | {
2 |   "consumer_key": "",
3 |   "consumer_secret": "",
4 |   "token": "",
5 |   "token_secret": ""
6 | }


--------------------------------------------------------------------------------
/twanalyze.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | import json
 4 | import logging
 5 | 
 6 | import twanalyze.twitter
 7 | import twanalyze.report
 8 | import twanalyze.parse
 9 | 
10 | #-----------------------------------------------------------------------------
11 | # Functions
12 | #-----------------------------------------------------------------------------
def load_configuration(filename):
    '''
    Read the file at `filename` and return its contents decoded as JSON.
    '''
    with open(filename) as handle:
        raw_text = handle.read()

    return json.loads(raw_text)
16 | 
17 | 
18 | #-----------------------------------------------------------------------------
19 | # Main Program
20 | #-----------------------------------------------------------------------------
if len(sys.argv) != 4:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so a usage error is reported as a failure to the calling shell.
    sys.exit('Usage: twanalyze screen_name report_file_name '
             'html|kml|md|raw|all')

cfg = load_configuration('twanalyze.config')
tw = twanalyze.twitter.Twitter(cfg['consumer_key'], cfg['consumer_secret'],
                               cfg['token'], cfg['token_secret'])

# Get data
screen_name = sys.argv[1]
user = tw.user(screen_name)
tweets = tw.tweets(screen_name)

# Analyze data
analysis = twanalyze.parse.parse_tweets(tweets)

# Report analysis
report_file = sys.argv[2]
# Named report_format so the builtin format() is not shadowed.
report_format = sys.argv[3].lower()

if report_format == 'all':
    twanalyze.report.create_html_report(user, analysis, report_file)
    twanalyze.report.create_kml_report(tweets, report_file)
    twanalyze.report.create_markdown_report(user, analysis, report_file)
elif report_format == 'html':
    twanalyze.report.create_html_report(user, analysis, report_file)
elif report_format == 'kml':
    twanalyze.report.create_kml_report(tweets, report_file)
elif report_format == 'raw':
    twanalyze.report.create_raw_report(user, tweets, report_file)
else:
    # Anything unrecognised falls back to Markdown, with a warning unless
    # Markdown was what the caller asked for.
    if report_format != 'md':
        logging.warning('Invalid report format, defaulting to Markdown.')
    twanalyze.report.create_markdown_report(user, analysis, report_file)
55 | 


--------------------------------------------------------------------------------
/twanalyze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/averagesecurityguy/twanalyze/3d9a73351b83edf529f6fd9c95ebb4d404b90cec/twanalyze/__init__.py


--------------------------------------------------------------------------------
/twanalyze/oauth.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import base64
  3 | import random
  4 | import urllib
  5 | import urlparse
  6 | import hmac
  7 | import hashlib
  8 | from requests.auth import AuthBase
  9 | 
 10 | 
 11 | class TwitterSingleOAuth(AuthBase):
 12 |     '''Creates an authorization header for a single user Twitter Oauth
 13 |     request. Three-legged auth is not supported.'''
 14 |     def __init__(self, ck=None, cs=None, at=None, ats=None):
 15 |         self.consumer_key = ck
 16 |         self.consumer_secret = cs
 17 |         self.access_token = at
 18 |         self.access_token_secret = ats
 19 |         self.__nonce = None
 20 |         self.__time = None
 21 | 
 22 |     def __call__(self, r):
 23 |         '''Return the authorization header needed.'''
 24 |         self.__base_url = self.__get_base_url(r.url)
 25 |         self.__body = self.__get_body_params(r.body)
 26 |         self.__query = self.__get_query_params(r.url)
 27 | 
 28 |         r.headers['Authorization'] = self.__generate_auth_string(r)
 29 |         return r
 30 | 
 31 |     def __enc(self, string):
 32 |         encoded_str = urllib.quote(string, safe='')
 33 |         return encoded_str.replace('+', '%20').replace('%7E', '~')
 34 | 
 35 |     def __get_nonce(self, length=32):
 36 |         n = ''
 37 |         for i in range(length):
 38 |             n += random.choice('0123456789ABCDEF')
 39 | 
 40 |         return n
 41 | 
 42 |     def __get_base_url(self, url):
 43 |         url = url.split('?')
 44 | 
 45 |         return url[0]
 46 | 
 47 |     def __get_query_params(self, url):
 48 |         q = {}
 49 |         query = urlparse.urlparse(url).query
 50 | 
 51 |         if query != '':
 52 |             for param in query.split('&'):
 53 |                 key, val = param.split('=')
 54 |                 q[key] = val
 55 | 
 56 |         return q
 57 | 
 58 |     def __get_body_params(self, body):
 59 |         b = {}
 60 | 
 61 |         if body is not None:
 62 |             body = body.replace('+', ' ')
 63 |             body = urllib.unquote(body)
 64 |             for p in body.split('&'):
 65 |                 key, val = p.split('=')
 66 |                 b[key] = val
 67 | 
 68 |         return b
 69 | 
 70 |     def __calculate_signature(self, r):
 71 |         base = self.__generate_base_string(r)
 72 |         key = self.__generate_signing_key()
 73 |         signature = hmac.new(key, base, hashlib.sha1)
 74 | 
 75 |         return base64.b64encode(signature.digest())
 76 | 
 77 |     def __generate_base_string(self, r):
 78 |         base = r.method.upper() + '&'
 79 | 
 80 |         base += self.__enc(self.__base_url) + '&'
 81 |         base += self.__enc(self.__generate_parameter_string(r))
 82 | 
 83 |         return base
 84 | 
 85 |     def __generate_parameter_string(self, r):
 86 |         p = {}
 87 |         p['oauth_consumer_key'] = self.__enc(self.consumer_key)
 88 |         p['oauth_nonce'] = self.__enc(self.__nonce)
 89 |         p['oauth_signature_method'] = 'HMAC-SHA1'
 90 |         p['oauth_timestamp'] = self.__time
 91 |         p['oauth_token'] = self.__enc(self.access_token)
 92 |         p['oauth_version'] = '1.0'
 93 | 
 94 |         for k, v in self.__query.iteritems():
 95 |             p[self.__enc(k)] = self.__enc(v)
 96 | 
 97 |         for k, v in self.__body.iteritems():
 98 |             p[self.__enc(k)] = self.__enc(v)
 99 | 
100 |         pstr = '&'.join(['{0}={1}'.format(k, p[k]) for k in sorted(p)])
101 | 
102 |         return pstr
103 | 
104 |     def __generate_signing_key(self):
105 |         key = self.__enc(self.consumer_secret)
106 |         key += '&'
107 |         key += self.__enc(self.access_token_secret)
108 | 
109 |         return key
110 | 
111 |     def __generate_auth_string(self, r):
112 |         self.__nonce = self.__get_nonce()
113 |         self.__time = int(time.time())
114 |         a = 'OAuth '
115 |         a += 'oauth_consumer_key="{0}", '.format(self.__enc(self.consumer_key))
116 |         a += 'oauth_nonce="{0}", '.format(self.__enc(self.__nonce))
117 |         a += 'oauth_signature="{0}", '.format(self.__enc(self.__calculate_signature(r)))
118 |         a += 'oauth_signature_method="HMAC-SHA1", '
119 |         a += 'oauth_timestamp="{0}", '.format(self.__time)
120 |         a += 'oauth_token="{0}", '.format(self.__enc(self.access_token))
121 |         a += 'oauth_version="1.0"'
122 | 
123 |         return a
124 | 


--------------------------------------------------------------------------------
/twanalyze/parse.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import re
 3 | 
 4 | def __parse_hashtags(t):
 5 |     '''
 6 |     Get any hashtags from the hashtags entity.
 7 |     '''
 8 |     if t.get('entities') is not None:
 9 |         return [h['text'].lower() for h in t['entities']['hashtags']]
10 |     else:
11 |         return []
12 | 
def __parse_links(t):
    '''
    Return the expanded_url of every link in the tweet's entities, or an
    empty list when the tweet has no entities.
    '''
    if t.get('entities') is None:
        return []

    links = []
    for url in t['entities']['urls']:
        links.append(url['expanded_url'])

    return links
21 | 
22 | 
def __parse_mentions(t):
    '''
    Return the lowercased screen_name of every user mentioned in the
    tweet's entities, or an empty list when the tweet has no entities.
    '''
    if t.get('entities') is None:
        return []

    names = []
    for mention in t['entities']['user_mentions']:
        names.append(mention['screen_name'].lower())

    return names
31 | 
32 | 
def __parse_phrases(t, count):
    '''
    Split the lowercased tweet text on whitespace and return the n-grams
    of `count` words, each joined back into a single space-separated
    string.
    '''
    words = t['text'].lower().split()

    phrases = []
    for gram in nltk.util.ngrams(words, count):
        phrases.append(' '.join(gram))

    return phrases
36 | 
37 | 
def __parse_time(timestamp):
    '''
    Find the "HH:MM:SS +ZZZZ" part of the timestamp and return it with the
    minutes and seconds zeroed, so times can be grouped by hour. Returns
    None when the timestamp contains no such part.
    '''
    found = re.search(r'(\d\d:\d\d:\d\d \+\d\d\d\d)', timestamp)
    if found is None:
        return None

    time_of_day = found.group(1)
    return re.sub(r':\d\d:\d\d ', ':00:00 ', time_of_day)
48 | 
49 | 
def __parse_place(place):
    '''
    Return the place as an (id, country, full_name) tuple, or None when
    there is no place.
    '''
    if place is None:
        return None

    return tuple(place[field] for field in ('id', 'country', 'full_name'))
58 | 
59 | 
def parse_tweets(tweets):
    '''
    Collect the hashtags, mentions, links, 3/4/5-word phrases, hourly
    timestamps and places of every tweet into a dict of lists, one list
    per kind of item.
    '''
    analysis = {
        'hashtags': [],
        'mentions': [],
        'links': [],
        'phrase3': [],
        'phrase4': [],
        'phrase5': [],
        'times': [],
        'places': [],
    }

    for tweet in tweets:
        analysis['hashtags'] += __parse_hashtags(tweet)
        analysis['mentions'] += __parse_mentions(tweet)
        analysis['links'] += __parse_links(tweet)

        for key, size in (('phrase3', 3), ('phrase4', 4), ('phrase5', 5)):
            analysis[key] += __parse_phrases(tweet, size)

        # Tweets without a recognisable time or without a place add nothing.
        hour = __parse_time(tweet['created_at'])
        if hour is not None:
            analysis['times'].append(hour)

        where = __parse_place(tweet['place'])
        if where is not None:
            analysis['places'].append(where)

    return analysis
83 | 


--------------------------------------------------------------------------------
/twanalyze/report.py:
--------------------------------------------------------------------------------
  1 | import simplekml
  2 | import nltk
  3 | import logging
  4 | import json
  5 | 
  6 | #-----------------------------------------------------------------------------
  7 | # RAW REPORT
  8 | #-----------------------------------------------------------------------------
  9 | def create_raw_report(user, tweets, filename):
 10 |     if not filename.endswith('.json'):
 11 |         filename = filename + '.json'
 12 | 
 13 |     logging.info('Writing RAW report to {0}.'.format(filename))
 14 | 
 15 |     report = {'user': user, 'tweets': tweets}
 16 | 
 17 |     raw = open(filename, 'w')
 18 |     raw.write(json.dumps(report, indent=2))
 19 |     raw.close()
 20 | 
 21 | 
 22 | #-----------------------------------------------------------------------------
 23 | # KML REPORT
 24 | #-----------------------------------------------------------------------------
 25 | def create_kml_report(tweets, filename):
 26 |     if not filename.endswith('.kml'):
 27 |         filename = filename + '.kml'
 28 | 
 29 |     logging.info('Writing KML report to {0}.'.format(filename))
 30 | 
 31 |     kml = simplekml.Kml()
 32 | 
 33 |     for tweet in tweets:
 34 |         if tweet['coordinates'] is not None:
 35 |             timestamp = simplekml.TimeStamp(when=tweet['created_at'])
 36 |             kml.newpoint(name=tweet['id_str'],
 37 |                          description=tweet['text'],
 38 |                          timestamp=timestamp,
 39 |                          coords=[tuple(tweet['coordinates']['coordinates'])])
 40 | 
 41 |     kml.save(filename)
 42 | 
 43 | 
 44 | #-----------------------------------------------------------------------------
 45 | # MARKDOWN REPORT
 46 | #-----------------------------------------------------------------------------
 47 | def __md_user(user):
 48 |     '''
 49 |     Print various attributes of the user.
 50 |     '''
 51 |     u = u''
 52 |     u += u'{0}\n'.format(user['screen_name']) 
 53 |     u += '-' * len(user['screen_name']) + '\n'
 54 |     u += u'Name: {0}\n'.format(user['name'])
 55 |     u += u'Description: {0}\n'.format(user['description'])
 56 |     u += u'Location: {0}\n'.format(user['location'])
 57 |     u += 'Time Zone: {0}\n'.format(user['time_zone'])
 58 |     u += 'UTC Offset: {0}\n'.format(user['utc_offset']/3600)
 59 |     u += 'Tweets: {0}\n'.format(user['statuses_count'])
 60 |     u += 'Favorites: {0}\n'.format(user['favourites_count'])
 61 |     u += 'Listed: {0}\n'.format(user['listed_count'])
 62 |     u += 'Followers: {0}\n'.format(user['followers_count'])
 63 |     u += 'Following: {0}\n'.format(user['friends_count'])
 64 |     u += '\n'
 65 | 
 66 |     return u.encode('utf-8')
 67 | 
 68 | 
 69 | def __md_distribution(title, items, top=20):
 70 |     '''
 71 |     Calculate the frequency distribution of the list of items. Convert the
 72 |     top most frequent items in the list to Markdown
 73 |     '''
 74 |     d = u''
 75 |     if len(items) != 0:
 76 |         d += '{0}\n'.format(title)
 77 |         d += '-' * len(title) + '\n'
 78 | 
 79 |         dist = nltk.FreqDist(items)
 80 | 
 81 |         for k in dist.keys()[:top]:
 82 |             if title == 'Hashtags':
 83 |                 link = 'https://twitter.com/search?q=%23{0}'.format(k)
 84 |                 d += u'* [#{0}]({1}) - {2}\n'.format(k, link, dist[k])
 85 |             elif title == 'Mentions':
 86 |                 link = 'https://twitter.com/{0}'.format(k)
 87 |                 d += u'* [@{0}]({1}) - {2}\n'.format(k, link, dist[k])
 88 |             elif title == 'Links':
 89 |                 d += u'* [{0}]({0}) - {1}\n'.format(k, dist[k])
 90 |             else:
 91 |                 d += u'* {0} - {1}\n'.format(k, dist[k])
 92 | 
 93 |         d += '\n'
 94 | 
 95 |     return d.encode('utf-8')
 96 | 
 97 | 
 98 | def create_markdown_report(user, analysis, filename):
 99 |     if not filename.endswith('.md'):
100 |         filename = filename + '.md'
101 | 
102 |     logging.info('Writing Markdown report to {0}.'.format(filename))
103 |     
104 |     md = open(filename, 'w')
105 |     md.write('Twanalyze Report\n')
106 |     md.write('================\n')
107 |     md.write(__md_user(user))
108 |     md.write(__md_distribution('Hashtags', analysis['hashtags']))
109 |     md.write(__md_distribution('Mentions', analysis['mentions']))
110 |     md.write(__md_distribution('Links', analysis['links']))
111 |     md.write(__md_distribution('3-word Phrases', analysis['phrase3']))
112 |     md.write(__md_distribution('4-word Phrases', analysis['phrase4']))
113 |     md.write(__md_distribution('5-word Phrases', analysis['phrase5']))
114 |     md.write(__md_distribution('Timestamps', analysis['times'], top=24))
115 |     md.write(__md_distribution('Places', analysis['places']))
116 |     md.close()
117 | 
118 | #-----------------------------------------------------------------------------
119 | # HTML REPORT
120 | #-----------------------------------------------------------------------------
def __html_user(user):
    '''
    Render various attributes of the user in HTML: a heading with the
    screen name followed by a list of the details.

    user -- user data; must contain the keys read below

    Returns the markup as UTF-8 encoded bytes.
    '''
    # utc_offset is in seconds and may be None; dividing None would raise.
    # Dividing by a float keeps half-hour offsets, and the 'g' format
    # prints whole hours without a trailing ".0".
    offset = user['utc_offset']
    if offset is not None:
        offset = u'{0:g}'.format(offset / 3600.0)

    rows = [
        (u'Name', user['name']),
        (u'Description', user['description']),
        (u'Location', user['location']),
        (u'Time Zone', user['time_zone']),
        (u'UTC Offset', offset),
        (u'Tweets', user['statuses_count']),
        (u'Favorites', user['favourites_count']),
        (u'Listed', user['listed_count']),
        (u'Followers', user['followers_count']),
        (u'Following', user['friends_count']),
    ]

    # NOTE(review): values are inserted without HTML escaping. Confirm
    # whether the API already entity-encodes these fields before adding
    # escaping here, to avoid encoding them twice.
    u = u'<h2>{0}</h2>\n'.format(user['screen_name'])
    u += u'<ul>\n'
    for label, value in rows:
        # Unicode templates keep non-ASCII values from breaking format().
        u += u'<li>{0}: {1}</li>\n'.format(label, value)
    u += u'</ul>'

    return u.encode('utf-8')
141 | 
142 | 
def __html_distribution(title, items, top=20):
    '''
    Calculate the frequency distribution of the list of items. Render the
    most frequent items as an HTML heading and list.

    title -- section heading; 'Hashtags', 'Mentions' and 'Links' also
             select how each item is turned into a link
    items -- list of hashable items to count
    top   -- maximum number of items to list

    Returns the markup as UTF-8 encoded bytes, or an empty byte string
    when there are no items.
    '''
    import collections

    if len(items) == 0:
        return u''.encode('utf-8')

    # Rank explicitly (highest count first, ties in ascending item order)
    # instead of depending on the key order of a particular nltk release.
    counts = collections.Counter(items)
    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))[:top]

    # NOTE(review): items are inserted without HTML escaping. Confirm
    # whether the API already entity-encodes them before adding escaping
    # here, to avoid encoding them twice.
    # All templates are unicode so non-ASCII items do not break formatting.
    d = u'<h2>{0}</h2>\n'.format(title)
    d += u'<ul>\n'

    for k, n in ranked:
        if title == 'Hashtags':
            link = u'https://twitter.com/search?q=%23{0}'.format(k)
            d += u'<li><a href="{0}">'.format(link)
            d += u'#{0}</a> - {1}</li>\n'.format(k, n)
        elif title == 'Mentions':
            link = u'https://twitter.com/{0}'.format(k)
            d += u'<li><a href="{0}">'.format(link)
            d += u'@{0}</a> - {1}</li>\n'.format(k, n)
        elif title == 'Links':
            d += u'<li><a href="{0}">'.format(k)
            d += u'{0}</a> - {1}</li>\n'.format(k, n)
        else:
            d += u'<li>{0} - {1}</li>\n'.format(k, n)

    d += u'</ul>\n'

    return d.encode('utf-8')
173 | 
174 | 
def create_html_report(user, analysis, filename):
    '''
    Write the HTML report: a styled page with a title, the user details
    and one section for each list in the analysis.

    user     -- user data passed to the user section
    analysis -- dict of item lists keyed by 'hashtags', 'mentions',
                'links', 'phrase3', 'phrase4', 'phrase5', 'times' and
                'places'
    filename -- output path; ".html" is appended when it is missing
    '''
    if not filename.endswith('.html'):
        filename = filename + '.html'

    logging.info('Writing HTML report to {0}.'.format(filename))

    # Render every part before opening the file so that a failure while
    # rendering does not leave a partial report behind.
    parts = [
        '<html>\n<head>\n<meta charset="utf-8">\n',
        '<style type="text/css">\n',
        'h1, h2 {font-family: Georgia, "Times New Roman", serif;}\n',
        'body {font-family: Helvetica, Tahoma, Arial, sans-serif;}\n',
        'li {margin: 0; padding: 0; list-style-type: none;}\n',
        '</style>\n</head>\n<body>\n',
        '<h1>Twanalyze Report</h1>\n',
        __html_user(user),
        __html_distribution('Hashtags', analysis['hashtags']),
        __html_distribution('Mentions', analysis['mentions']),
        __html_distribution('Links', analysis['links']),
        __html_distribution('3-word Phrases', analysis['phrase3']),
        __html_distribution('4-word Phrases', analysis['phrase4']),
        __html_distribution('5-word Phrases', analysis['phrase5']),
        __html_distribution('Timestamps', analysis['times'], top=24),
        __html_distribution('Places', analysis['places']),
        '</body>\n</html>\n',
    ]

    # The with-block closes the file even if a write fails.
    with open(filename, 'w') as html:
        for part in parts:
            html.write(part)
200 | 


--------------------------------------------------------------------------------
/twanalyze/twitter.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import oauth
  3 | 
  4 | class Twitter():
  5 |     def __init__(self, ck, cs, t, ts):
  6 |         self.consumer_key = ck
  7 |         self.consumer_secret = cs
  8 |         self.token = t
  9 |         self.token_secret = ts
 10 |         self.__base = 'https://api.twitter.com/1.1'
 11 |         self.__ssn = requests.Session()
 12 |         self.__ssn.auth = oauth.TwitterSingleOAuth(self.consumer_key,
 13 |                                                    self.consumer_secret,
 14 |                                                    self.token,
 15 |                                                    self.token_secret)
 16 | 
 17 |     def __connect(self, method, resource, params=None, data=None):
 18 |         '''Make a GET or POST request to the API.'''
 19 |         url = self.__base + resource
 20 | 
 21 |         if method.upper() == 'GET':
 22 |             r = self.__ssn.get(url, params=params)
 23 |             if r.status_code == 429:
 24 |                 return {}
 25 | 
 26 |             return r.json()
 27 | 
 28 |         if method.upper() == 'POST':
 29 |             r = self.__ssn.post(url, data=data)
 30 |             if r.status_code == 429:
 31 |                 return {}
 32 | 
 33 |             return r.json()
 34 | 
 35 |     def __users(self, url, params={}, count=200, total=3000):
 36 |         '''
 37 |         Returns a list of items provided by the URL using the specified
 38 |         parameters. Should work for the following URLs:
 39 | 
 40 |         followers/list
 41 |         friends/list
 42 |         '''
 43 |         users = []
 44 |         cursor = -1
 45 | 
 46 |         while cursor != 0:
 47 |             params['next_cursor'] = cursor
 48 |             params['count'] = count
 49 | 
 50 |             resp = self.__connect('GET', url, params=params)
 51 |             if resp == {}:
 52 |                 cursor = 0
 53 |             else:
 54 |                 users.extend(resp['users'])
 55 |                 cursor = resp['next_cursor']
 56 | 
 57 |         return users
 58 | 
 59 |     def __statuses(self, url, params={}, count=200, total=800):
 60 |         '''
 61 |         Get the statuses provided by the URL using the specified parameters.
 62 |         '''
 63 |         params['count'] = count
 64 | 
 65 |         statuses = []
 66 |         requested = 0
 67 | 
 68 |         while requested < total:
 69 |             ids = []
 70 |             resp = self.__connect('GET', url, params=params)
 71 | 
 72 |             if resp == {}:
 73 |                 requested = total
 74 |             else:
 75 |                 ids = [r['id'] for r in resp]
 76 |                 statuses.extend(resp)
 77 |                 # for r in resp:
 78 |                 #     statuses.append(r)
 79 |                 #     ids.append(r['id'])
 80 | 
 81 |                 params['max_id'] = min(ids)
 82 |                 requested += count
 83 | 
 84 |         return statuses
 85 | 
 86 |     def followers(self, screen_name):
 87 |         '''
 88 |         Get the followers of the specified user account.
 89 |         ''' 
 90 |         params = {'screen_name': screen_name,
 91 |                   'include_user_entities': False}
 92 |     
 93 |         return self.__users('/followers/list.json', params)
 94 | 
 95 |     def friends(self, screen_name):
 96 |         '''
 97 |         Get the friends (people they follow) of the specified user account.
 98 |         '''
 99 |         params = {'screen_name': screen_name,
100 |                   'include_user_entities': False}
101 | 
102 |         return self.__users('friends/list.json', params)
103 | 
104 |     def tweets(self, screen_name):
105 |         '''
106 |         Get the last 3200 statuses for the specified screen_name. Replies are
107 |         included in the results.
108 |         '''
109 |         params = {'screen_name': screen_name, 'include_rts': False}
110 |         return self.__statuses('/statuses/user_timeline.json',
111 |                                params=params,
112 |                                total=3200)
113 | 
114 |     def user(self, screen_name):
115 |         '''
116 |         Get general information about the specified user account.
117 |         '''
118 |         params = {'screen_name': screen_name}
119 |         return self.__connect('GET', '/users/show.json', params=params)
120 | 


--------------------------------------------------------------------------------