├── requirements.txt ├── .gitignore ├── LICENSE ├── README.md └── python └── streaming_downloader.py /requirements.txt: -------------------------------------------------------------------------------- 1 | tweepy 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Mark Dredze, mdredze@cs.jhu.edu 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Twitter Stream Downloader 2 | ========================= 3 | 4 | This tutorial provides an excellent introduction to collection Twitter data. It's more recent than updates to this library: 5 | http://socialmedia-class.org/twittertutorial.html 6 | 7 | A simple Python script to download tweets from the Twitter streaming API. Works with API version 1.1. 8 | 9 | The script to run is python/streaming_downloader.py. 10 | 11 | This requires you to have a consumer_key, consumer_secret, access_token, and 12 | access_token_secret. To obtain these: 13 | - go to dev.twitter.com 14 | - login and create a new application 15 | - Create an access token 16 | - This will give you all four of the above. Remember, do not share these with anyone. 17 | 18 | If you run the script with the --help flag it will show valid options. 19 | 20 | The code creates files as year/month/timestamp.gz at least once every 24 21 | hours. Changing this behavior isn't too hard but requires modifying the code. 22 | 23 | The code requires tweepy. I am using version 3.8.0. 24 | https://github.com/tweepy/tweepy 25 | 26 | The script also supports the flag "pid_file". This will create a file with the PID of the running job. This is useful if you want to create a cron job that watches the script to make sure it is still running. 27 | 28 | stream_type: There are three supported stream types. location, keyword and sample. I didn't put in the username stream type, but it should be easy to add. 29 | 30 | If you use a keyword file, the format should be: 31 | track=keyword1,keyword2,keyword3 ... 32 | 33 | Location files are similar: 34 | locations=value1,value2,value3,value4 35 | 36 | These files are provided using the "parameters-file" argument. 37 | 38 | -------------------------------------------------------------------------------- /python/streaming_downloader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Mark Dredze. All rights reserved. 2 | # This software is released under the 2-clause BSD license. 3 | # Mark Dredze 4 | 5 | import tweepy 6 | 7 | import argparse 8 | import datetime 9 | import http 10 | import logging 11 | import os 12 | import time 13 | import gzip 14 | 15 | 16 | class ParameterFileModifiedException(Exception): 17 | pass 18 | 19 | 20 | class FileListener(tweepy.streaming.StreamListener): 21 | def __init__(self, path, restart_time, parameters_filename=None): 22 | self.path = path 23 | self.current_file = None 24 | self.restart_time = restart_time 25 | self.file_start_time = time.time() 26 | self.file_start_date = datetime.datetime.now() 27 | self.parameters_filename = parameters_filename 28 | self.parameters_filename_last_modified_time = None 29 | 30 | if self.parameters_filename: 31 | self.reset_parameter_file_mtime() 32 | 33 | def _check_parameters_file_modification(self): 34 | if self.parameters_filename: 35 | current_mtime = os.path.getmtime(self.parameters_filename) 36 | if current_mtime != self.parameters_filename_last_modified_time: 37 | raise ParameterFileModifiedException() 38 | 39 | def _ensure_file(self): 40 | # Should we start a new file? 41 | start_new_file = False 42 | if not self.current_file: 43 | # There is no existing file 44 | start_new_file = True 45 | elif self.current_file.closed: 46 | # The existing file is closed 47 | start_new_file = True 48 | elif time.time() - self.restart_time > self.file_start_time: 49 | # The amount of time that has passed for a restart is due. 50 | start_new_file = True 51 | elif self.file_start_date.day != datetime.datetime.now().day: 52 | # It is a new day 53 | start_new_file = True 54 | 55 | if start_new_file: 56 | self._start_new_file() 57 | 58 | def on_data(self, data): 59 | self._ensure_file() 60 | self._check_parameters_file_modification() 61 | 62 | if data.startswith('{'): 63 | self.current_file.write(data) 64 | if not data.endswith('\n'): 65 | self.current_file.write('\n') 66 | 67 | def on_error(self, status): 68 | logging.error(status) 69 | 70 | def _start_new_file(self): 71 | if self.current_file and not self.current_file.closed: 72 | self.current_file.close() 73 | 74 | local_time_obj = time.localtime() 75 | current_datetime = time.strftime("%Y_%m_%d_%H_%M_%S", local_time_obj) 76 | year = time.strftime("%Y", local_time_obj) 77 | month = time.strftime("%m", local_time_obj) 78 | 79 | full_path = os.path.join(self.path, year) 80 | full_path = os.path.join(full_path, month) 81 | try: 82 | os.makedirs(full_path) 83 | logging.info('Created %s' % full_path) 84 | except FileExistsError: 85 | pass 86 | 87 | filename = os.path.join(full_path, '%s.gz' % current_datetime) 88 | self.current_file = gzip.open(filename, 'wt') 89 | self.file_start_time = time.time() 90 | logging.info('Starting new file: %s' % filename) 91 | self.file_start_date = datetime.datetime.now() 92 | 93 | def close_file(self): 94 | if self.current_file and not self.current_file.closed: 95 | logging.info('Closing current file') 96 | self.current_file.close() 97 | 98 | def reset_parameter_file_mtime(self): 99 | self.parameters_filename_last_modified_time = os.path.getmtime(self.parameters_filename) 100 | 101 | 102 | def load_stream_parameters(parameters_filename, stream_type): 103 | with open(parameters_filename, 'r') as input: 104 | content = '\n'.join(input.readlines()) 105 | 106 | index = content.find('=') 107 | if index != -1: 108 | content = content[index+1:] 109 | return_value = content.split(',') 110 | 111 | if stream_type.lower() == 'location': 112 | for ii, entry in enumerate(return_value): 113 | return_value[ii] = float(entry) 114 | 115 | return return_value 116 | 117 | 118 | def main(): 119 | parser = argparse.ArgumentParser(description='Download streaming data from Twitter.') 120 | parser.add_argument('--consumer-key', required=True, help='the consumer key') 121 | parser.add_argument('--consumer-secret', required=True, help='the consumer key secret') 122 | parser.add_argument('--access-token', required=True, help='the access token') 123 | parser.add_argument('--access-token-secret', required=True, help='the access token secret') 124 | parser.add_argument('--stream-type', choices=['sample', 'location', 'keyword'], required=True, 125 | help='the type of stream to run') 126 | parser.add_argument('--output-directory', required=True, help='where to save the output files') 127 | parser.add_argument('--parameters-filename', required=False, 128 | help='file containing parameters for the stream (required for location and keyword.)') 129 | parser.add_argument('--pid-file', required=False, help='filename to store the process id') 130 | parser.add_argument('--log', help='log filename (default: write to console)') 131 | parser.add_argument('--log-level', default='INFO', choices=['CRITICAL', 'DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARNING'], help='log filename (default: write to console)') 132 | parser.add_argument('--check-for-new-parameters', action='store_true', 133 | help='checks the parameters file timestamp every minute for changes') 134 | 135 | args = parser.parse_args() 136 | 137 | # Setup the logger 138 | log_level = getattr(logging, args.log_level.upper()) 139 | log_format = '%(asctime)s %(levelname)s %(message)s' 140 | log_date_format = '%m/%d/%Y %I:%M:%S %p' 141 | 142 | if args.log: 143 | # Setup the logger to a file if one is provided 144 | logging.basicConfig(filename=args.log, level=log_level, format=log_format, datefmt=log_date_format) 145 | else: 146 | logging.basicConfig(level=log_level, format=log_format, datefmt=log_date_format) 147 | 148 | # Get the stream arguments 149 | if args.stream_type == 'location' or args.stream_type == 'keyword': 150 | if not args.parameters_filename: 151 | raise ValueError('--parameters-filename is requires when the stream is of type "location" or "keyword"') 152 | stream_parameters = load_stream_parameters(args.parameters_filename, args.stream_type) 153 | 154 | if args.pid_file: 155 | with open(args.pid_file, 'w') as writer: 156 | writer.write(str(os.getpid())) 157 | 158 | restart_time = 86400 159 | if args.check_for_new_parameters: 160 | listener = FileListener(args.output_directory, restart_time, parameters_filename=args.parameters_filename) 161 | else: 162 | listener = FileListener(args.output_directory, restart_time) 163 | 164 | auth = tweepy.OAuthHandler(args.consumer_key, args.consumer_secret) 165 | auth.set_access_token(args.access_token, args.access_token_secret) 166 | 167 | try: 168 | while True: 169 | try: 170 | logging.info('Connecting to the stream of type {}'.format(args.stream_type)) 171 | stream = tweepy.Stream(auth = auth, listener = listener) 172 | 173 | if args.stream_type == 'location': 174 | stream.filter(locations=stream_parameters) 175 | elif args.stream_type == 'keyword': 176 | stream.filter(track=stream_parameters) 177 | else: 178 | stream.sample() 179 | except http.client.IncompleteRead as e: 180 | logging.exception('Got IncompleteRead exception') 181 | listener.close_file() 182 | except ParameterFileModifiedException as e: 183 | # The parameter file changed. Reaload the parameters and create a new stream. 184 | logging.info('Parameters file has changed; reloading') 185 | listener.close_file() 186 | stream_parameters = load_stream_parameters(args.parameters_filename, args.stream_type) 187 | listener.reset_parameter_file_mtime() 188 | 189 | except Exception as e: 190 | logging.exception('Got exception on main handler') 191 | 192 | listener.close_file() 193 | logging.info('Exiting.') 194 | 195 | 196 | if __name__ == '__main__': 197 | main() 198 | --------------------------------------------------------------------------------