├── requirements.txt
├── .gitignore
├── LICENSE
├── README.md
└── python
    └── streaming_downloader.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | tweepy 
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, Mark Dredze, mdredze@cs.jhu.edu
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification,
 5 | are permitted provided that the following conditions are met:
 6 | 
 7 |   Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 |   Redistributions in binary form must reproduce the above copyright notice, this
11 |   list of conditions and the following disclaimer in the documentation and/or
12 |   other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Twitter Stream Downloader
 2 | =========================
 3 | 
 4 | This tutorial provides an excellent introduction to collecting Twitter data. It is more recent than the latest updates to this library:
 5 | http://socialmedia-class.org/twittertutorial.html
 6 | 
 7 | A simple Python script to download tweets from the Twitter streaming API. Works with API version 1.1.
 8 | 
 9 | The script to run is python/streaming_downloader.py.
10 | 
11 | This requires you to have a consumer_key, consumer_secret, access_token, and
12 | access_token_secret. To obtain these:
13 | - go to dev.twitter.com
14 | - login and create a new application
15 | - Create an access token
16 | - This will give you all four of the above. Remember, do not share these with anyone.
17 | 
18 | If you run the script with the --help flag it will show valid options.
19 | 
20 | The code creates files as year/month/timestamp.gz at least once every 24
21 | hours. Changing this behavior isn't too hard but requires modifying the code.
22 | 
23 | The code requires tweepy. I am using version 3.8.0.
24 | https://github.com/tweepy/tweepy
25 | 
26 | The script also supports the "--pid-file" flag. This will create a file containing the PID of the running job, which is useful if you want to create a cron job that watches the script to make sure it is still running.
27 | 
28 | --stream-type: There are three supported stream types: location, keyword and sample. I didn't put in the username stream type, but it should be easy to add.
29 | 
30 | If you use a keyword file, the format should be:
31 | track=keyword1,keyword2,keyword3 ...
32 | 
33 | Location files are similar:
34 | locations=value1,value2,value3,value4
35 | 
36 | These files are provided using the "--parameters-filename" argument.
37 | 
38 | 


--------------------------------------------------------------------------------
/python/streaming_downloader.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2020 Mark Dredze. All rights reserved.
  2 | # This software is released under the 2-clause BSD license.
  3 | # Mark Dredze <mdredze@cs.jhu.edu>
  4 | 
  5 | import tweepy
  6 | 
  7 | import argparse
  8 | import datetime
  9 | import http
 10 | import logging
 11 | import os
 12 | import time
 13 | import gzip
 14 | 
 15 | 
 16 | class ParameterFileModifiedException(Exception):
 17 | 	pass
 18 | 
 19 | 
 20 | class FileListener(tweepy.streaming.StreamListener):
 21 | 	def __init__(self, path, restart_time, parameters_filename=None):
 22 | 		self.path = path
 23 | 		self.current_file = None
 24 | 		self.restart_time = restart_time
 25 | 		self.file_start_time = time.time()
 26 | 		self.file_start_date = datetime.datetime.now()
 27 | 		self.parameters_filename = parameters_filename
 28 | 		self.parameters_filename_last_modified_time = None
 29 | 
 30 | 		if self.parameters_filename:
 31 | 			self.reset_parameter_file_mtime()
 32 | 
 33 | 	def _check_parameters_file_modification(self):
 34 | 		if self.parameters_filename:
 35 | 			current_mtime = os.path.getmtime(self.parameters_filename)
 36 | 			if current_mtime != self.parameters_filename_last_modified_time:
 37 | 				raise ParameterFileModifiedException()
 38 | 
 39 | 	def _ensure_file(self):
 40 | 		# Should we start a new file?
 41 | 		start_new_file = False
 42 | 		if not self.current_file:
 43 | 			# There is no existing file
 44 | 			start_new_file = True
 45 | 		elif self.current_file.closed:
 46 | 			# The existing file is closed
 47 | 			start_new_file = True
 48 | 		elif time.time() - self.restart_time > self.file_start_time:
 49 | 			# The amount of time that has passed for a restart is due.
 50 | 			start_new_file = True
 51 | 		elif self.file_start_date.day != datetime.datetime.now().day:
 52 | 			# It is a new day
 53 | 			start_new_file = True
 54 | 
 55 | 		if start_new_file:
 56 | 			self._start_new_file()
 57 | 
 58 | 	def on_data(self, data):
 59 | 		self._ensure_file()
 60 | 		self._check_parameters_file_modification()
 61 | 
 62 | 		if data.startswith('{'):
 63 | 			self.current_file.write(data)
 64 | 			if not data.endswith('\n'):
 65 | 				self.current_file.write('\n')
 66 | 
 67 | 	def on_error(self, status):
 68 | 		logging.error(status)
 69 | 
 70 | 	def _start_new_file(self):
 71 | 		if self.current_file and not self.current_file.closed:
 72 | 			self.current_file.close()
 73 | 		
 74 | 		local_time_obj = time.localtime()
 75 | 		current_datetime = time.strftime("%Y_%m_%d_%H_%M_%S", local_time_obj)
 76 | 		year = time.strftime("%Y", local_time_obj)
 77 | 		month = time.strftime("%m", local_time_obj)
 78 | 		
 79 | 		full_path = os.path.join(self.path, year)
 80 | 		full_path = os.path.join(full_path, month)
 81 | 		try:
 82 | 			os.makedirs(full_path)
 83 | 			logging.info('Created %s' % full_path)
 84 | 		except FileExistsError:
 85 | 			pass
 86 | 
 87 | 		filename = os.path.join(full_path, '%s.gz' % current_datetime)
 88 | 		self.current_file = gzip.open(filename, 'wt')
 89 | 		self.file_start_time = time.time()
 90 | 		logging.info('Starting new file: %s' % filename)
 91 | 		self.file_start_date = datetime.datetime.now()
 92 | 
 93 | 	def close_file(self):
 94 | 		if self.current_file and not self.current_file.closed:
 95 | 			logging.info('Closing current file')
 96 | 			self.current_file.close()
 97 | 
 98 | 	def reset_parameter_file_mtime(self):
 99 | 		self.parameters_filename_last_modified_time = os.path.getmtime(self.parameters_filename)
100 | 
101 | 
def load_stream_parameters(parameters_filename, stream_type):
	"""Read the comma-separated stream parameters from a file.

	The file may start with a ``name=`` prefix (for example ``track=`` or
	``locations=``); everything up to and including the first ``=`` is dropped.
	Surrounding whitespace and newlines are stripped from every entry and
	empty entries are discarded, so a trailing newline or comma in the file
	does not end up inside a keyword.

	Args:
		parameters_filename: path of the parameters file.
		stream_type: the stream type; for ``location`` (case-insensitive) the
			entries are converted to floats.

	Returns:
		A list of strings, or a list of floats for location streams.

	Raises:
		ValueError: if a location entry is not a valid float.
	"""
	with open(parameters_filename, 'r') as parameters_file:
		content = parameters_file.read()

	prefix, separator, remainder = content.partition('=')
	values = remainder if separator else prefix

	entries = [entry.strip() for entry in values.split(',')]
	entries = [entry for entry in entries if entry]

	if stream_type.lower() == 'location':
		return [float(entry) for entry in entries]

	return entries
116 | 
117 | 
def main():
	"""Parse the command line and stream tweets to disk until a fatal error.

	Connects to the requested stream type and reconnects after an
	``IncompleteRead`` or after the parameters file changes (when
	``--check-for-new-parameters`` is given). Any other exception is logged and
	ends the program. The current output file is always closed on the way out,
	including on Ctrl-C.
	"""
	# Imported explicitly: a bare ``import http`` does not guarantee that the
	# ``http.client`` submodule has been loaded.
	from http.client import IncompleteRead

	parser = argparse.ArgumentParser(description='Download streaming data from Twitter.')
	parser.add_argument('--consumer-key', required=True, help='the consumer key')
	parser.add_argument('--consumer-secret', required=True, help='the consumer key secret')
	parser.add_argument('--access-token', required=True, help='the access token')
	parser.add_argument('--access-token-secret', required=True, help='the access token secret')
	parser.add_argument('--stream-type', choices=['sample', 'location', 'keyword'], required=True,
						help='the type of stream to run')
	parser.add_argument('--output-directory', required=True, help='where to save the output files')
	parser.add_argument('--parameters-filename', required=False,
						help='file containing parameters for the stream (required for location and keyword.)')
	parser.add_argument('--pid-file', required=False, help='filename to store the process id')
	parser.add_argument('--log', help='log filename (default: write to console)')
	parser.add_argument('--log-level', default='INFO',
						choices=['CRITICAL', 'DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARNING'],
						help='logging level (default: INFO)')
	parser.add_argument('--check-for-new-parameters', action='store_true',
						help='checks the parameters file timestamp every minute for changes')

	args = parser.parse_args()

	# Location and keyword streams cannot run without a parameters file;
	# report this as a usage error rather than a traceback.
	needs_parameters = args.stream_type in ('location', 'keyword')
	if needs_parameters and not args.parameters_filename:
		parser.error('--parameters-filename is required when the stream is of type "location" or "keyword"')

	# Setup the logger (to a file if one is provided, otherwise to the console)
	log_level = getattr(logging, args.log_level.upper())
	log_format = '%(asctime)s %(levelname)s %(message)s'
	log_date_format = '%m/%d/%Y %I:%M:%S %p'
	logging_options = {'level': log_level, 'format': log_format, 'datefmt': log_date_format}
	if args.log:
		logging_options['filename'] = args.log
	logging.basicConfig(**logging_options)

	# Get the stream arguments
	if needs_parameters:
		stream_parameters = load_stream_parameters(args.parameters_filename, args.stream_type)

	if args.pid_file:
		with open(args.pid_file, 'w') as writer:
			writer.write(str(os.getpid()))

	# Output files are rotated at least once every 24 hours.
	restart_time = 86400
	# The listener only watches the parameters file when asked to.
	watched_filename = args.parameters_filename if args.check_for_new_parameters else None
	listener = FileListener(args.output_directory, restart_time, parameters_filename=watched_filename)

	auth = tweepy.OAuthHandler(args.consumer_key, args.consumer_secret)
	auth.set_access_token(args.access_token, args.access_token_secret)

	try:
		while True:
			try:
				logging.info('Connecting to the stream of type {}'.format(args.stream_type))
				stream = tweepy.Stream(auth=auth, listener=listener)

				if args.stream_type == 'location':
					stream.filter(locations=stream_parameters)
				elif args.stream_type == 'keyword':
					stream.filter(track=stream_parameters)
				else:
					stream.sample()
			except IncompleteRead:
				logging.exception('Got IncompleteRead exception')
				listener.close_file()
			except ParameterFileModifiedException:
				# The parameter file changed. Reload the parameters and create a new stream.
				logging.info('Parameters file has changed; reloading')
				listener.close_file()
				# Record the mtime BEFORE reading: an edit that lands between
				# the two steps is then detected again instead of being missed.
				listener.reset_parameter_file_mtime()
				stream_parameters = load_stream_parameters(args.parameters_filename, args.stream_type)
	except Exception:
		logging.exception('Got exception on main handler')
	finally:
		# Also runs on KeyboardInterrupt/SystemExit so the gzip file is
		# finalized properly.
		listener.close_file()
		logging.info('Exiting.')
193 | 	logging.info('Exiting.')
194 | 
195 | 
# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
	main()
198 | 


--------------------------------------------------------------------------------