└── install.py

/install.py:
--------------------------------------------------------------------------------
#
# Sanders-Twitter Sentiment Corpus Install Script
# Version 0.1
#
# Pulls tweet data from Twitter because ToS prevents distributing it directly.
#
# Right now we use unauthenticated requests, which are rate-limited to 150/hr.
# We use 125/hr to stay safe.
#
# We could more than double the download speed by using authentication with
# OAuth logins. But for now, this is too much of a PITA to implement. Just let
# the script run over a weekend and you'll have all the data.
#
# - Niek Sanders
#   njs@sananalytics.com
#   October 20, 2011
#
#
# Excuse the ugly code. I threw this together as quickly as possible and I
# don't normally code in Python.
#
import csv, getpass, json, os, time, urllib


def get_user_params():

    user_params = {}

    # get user input params
    user_params['inList']  = raw_input( '\nInput file [./corpus.csv]: ' )
    user_params['outList'] = raw_input( 'Results file [./full-corpus.csv]: ' )
    user_params['rawDir']  = raw_input( 'Raw data dir [./rawdata/]: ' )

    # apply defaults
    if user_params['inList'] == '':
        user_params['inList'] = './corpus.csv'
    if user_params['outList'] == '':
        user_params['outList'] = './full-corpus.csv'
    if user_params['rawDir'] == '':
        user_params['rawDir'] = './rawdata/'

    return user_params


def dump_user_params( user_params ):

    # dump user params for confirmation
    print 'Input: ' + user_params['inList']
    print 'Output: ' + user_params['outList']
    print 'Raw data: ' + user_params['rawDir']
    return


def read_total_list( in_filename ):

    # read total fetch list csv
    fp = open( in_filename, 'rb' )
    reader = csv.reader( fp, delimiter=',', quotechar='"' )

    total_list = []
    for row in reader:
        total_list.append( row )

    return total_list


def purge_already_fetched( fetch_list, raw_dir ):

    # list of tweet ids that still need downloading
    rem_list = []

    # check each tweet to see if we have it
    for item in fetch_list:

        # check if json file exists
        tweet_file = raw_dir + item[2] + '.json'
        if os.path.exists( tweet_file ):

            # attempt to parse json file
            try:
                parse_tweet_json( tweet_file )
                print '--> already downloaded #' + item[2]
            except RuntimeError:
                rem_list.append( item )
        else:
            rem_list.append( item )

    return rem_list


def get_time_left_str( cur_idx, fetch_list, download_pause ):

    tweets_left = len(fetch_list) - cur_idx
    total_seconds = tweets_left * download_pause

    str_hr = int( total_seconds / 3600 )
    str_min = int((total_seconds - str_hr*3600) / 60)
    str_sec = total_seconds - str_hr*3600 - str_min*60

    return '%dh %dm %ds' % (str_hr, str_min, str_sec)


def download_tweets( fetch_list, raw_dir ):

    # ensure raw data directory exists
    if not os.path.exists( raw_dir ):
        os.mkdir( raw_dir )

    # stay within rate limits
    max_tweets_per_hr = 125
    download_pause_sec = 3600 / max_tweets_per_hr

    # download tweets
    for idx in range(0,len(fetch_list)):

        # current item
        item = fetch_list[idx]

        # print status
        trem = get_time_left_str( idx, fetch_list, download_pause_sec )
        print '--> downloading tweet #%s (%d of %d) (%s left)' % \
              (item[2], idx+1, len(fetch_list), trem)

        # pull data
        url = 'http://api.twitter.com/1/statuses/show.json?id=' + item[2]
        urllib.urlretrieve( url, raw_dir + item[2] + '.json' )

        # stay in Twitter API rate limits
        print '    pausing %d sec to obey Twitter API rate limits' % \
              (download_pause_sec)
        time.sleep( download_pause_sec )

    return
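

# The header above notes that OAuth authentication would more than double
# the allowed request rate. The helper below is a minimal sketch of what an
# authenticated fetch could look like. It assumes the third-party "tweepy"
# library and placeholder credentials, and is not part of the original
# download flow; wire it into download_tweets only if you set up OAuth keys:
#
#   import tweepy
#   auth = tweepy.OAuthHandler( CONSUMER_KEY, CONSUMER_SECRET )
#   auth.set_access_token( ACCESS_TOKEN, ACCESS_TOKEN_SECRET )
#   api = tweepy.API( auth, parser=tweepy.parsers.JSONParser() )
def download_tweet_oauth( api, tweet_id, raw_dir ):

    # fetch a single status over the authenticated connection; with the
    # JSONParser configured above, get_status returns a plain json dict
    status = api.get_status( id=tweet_id )

    # save it in the same one-file-per-tweet layout used by download_tweets
    fp = open( raw_dir + tweet_id + '.json', 'wb' )
    json.dump( status, fp )
    fp.close()
    return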


def parse_tweet_json( filename ):

    # read tweet
    print 'opening: ' + filename
    fp = open( filename, 'rb' )

    # parse json
    try:
        tweet_json = json.load( fp )
    except ValueError:
        raise RuntimeError('error parsing json')

    # look for twitter api error msgs
    if 'error' in tweet_json:
        raise RuntimeError('error in downloaded tweet')

    # extract creation date and tweet text
    return [ tweet_json['created_at'], tweet_json['text'] ]


def build_output_corpus( out_filename, raw_dir, total_list ):

    # open csv output file
    fp = open( out_filename, 'wb' )
    writer = csv.writer( fp, delimiter=',', quotechar='"', escapechar='\\',
                         quoting=csv.QUOTE_ALL )

    # write header row
    writer.writerow( ['Topic','Sentiment','TweetId','TweetDate','TweetText'] )

    # parse all downloaded tweets
    missing_count = 0
    for item in total_list:

        # ensure tweet exists
        if os.path.exists( raw_dir + item[2] + '.json' ):

            try:
                # parse tweet
                parsed_tweet = parse_tweet_json( raw_dir + item[2] + '.json' )
                full_row = item + parsed_tweet

                # character encoding for output
                for i in range(0,len(full_row)):
                    full_row[i] = full_row[i].encode("utf-8")

                # write csv row
                writer.writerow( full_row )

            except RuntimeError:
                print '--> bad data in tweet #' + item[2]
                missing_count += 1

        else:
            print '--> missing tweet #' + item[2]
            missing_count += 1

    # indicate success
    if missing_count == 0:
        print '\nSuccessfully downloaded corpus!'
        print 'Output in: ' + out_filename + '\n'
    else:
        print '\nMissing %d of %d tweets!' % (missing_count, len(total_list))
        print 'Partial output in: ' + out_filename + '\n'

    return


def main():

    # get user parameters
    user_params = get_user_params()
    dump_user_params( user_params )

    # get fetch list
    total_list = read_total_list( user_params['inList'] )
    fetch_list = purge_already_fetched( total_list, user_params['rawDir'] )

    # start fetching data from twitter
    download_tweets( fetch_list, user_params['rawDir'] )

    # second pass for any failed downloads
    print '\nStarting second pass to retry any failed downloads'
    fetch_list = purge_already_fetched( total_list, user_params['rawDir'] )
    download_tweets( fetch_list, user_params['rawDir'] )

    # build output corpus
    build_output_corpus( user_params['outList'], user_params['rawDir'],
                         total_list )

    return


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
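
Once install.py finishes, the generated full-corpus.csv can be read back with
the same csv conventions build_output_corpus uses to write it. A minimal
sketch, not part of the original script; the path assumes the default output
location:

    import csv

    # columns: Topic, Sentiment, TweetId, TweetDate, TweetText
    fp = open( './full-corpus.csv', 'rb' )
    reader = csv.reader( fp, delimiter=',', quotechar='"', escapechar='\\' )
    header = reader.next()
    for row in reader:
        print '%s [%s]: %s' % (row[0], row[1], row[4])
    fp.close()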