└── install.py

/install.py:
--------------------------------------------------------------------------------
#
# Sanders-Twitter Sentiment Corpus Install Script
# Version 0.1
#
# Pulls tweet data from Twitter because ToS prevents distributing it directly.
#
# Right now we use unauthenticated requests, which are rate-limited to 150/hr.
# We use 125/hr to stay safe.
#
# We could more than double the download speed by using authentication with
# OAuth logins. But for now, this is too much of a PITA to implement. Just let
# the script run over a weekend and you'll have all the data.
#
# - Niek Sanders
#   njs@sananalytics.com
#   October 20, 2011
#
#
# Excuse the ugly code. I threw this together as quickly as possible and I
# don't normally code in Python.
#
import csv, getpass, json, os, time, urllib


def get_user_params():

    user_params = {}

    # get user input params
    user_params['inList']  = raw_input( '\nInput file [./corpus.csv]: ' )
    user_params['outList'] = raw_input( 'Results file [./full-corpus.csv]: ' )
    user_params['rawDir']  = raw_input( 'Raw data dir [./rawdata/]: ' )

    # apply defaults
    if user_params['inList'] == '':
        user_params['inList'] = './corpus.csv'
    if user_params['outList'] == '':
        user_params['outList'] = './full-corpus.csv'
    if user_params['rawDir'] == '':
        user_params['rawDir'] = './rawdata/'

    return user_params


def dump_user_params( user_params ):

    # dump user params for confirmation
    print 'Input: ' + user_params['inList']
    print 'Output: ' + user_params['outList']
    print 'Raw data: ' + user_params['rawDir']
    return


def read_total_list( in_filename ):

    # read total fetch list csv
    fp = open( in_filename, 'rb' )
    reader = csv.reader( fp, delimiter=',', quotechar='"' )

    total_list = []
    for row in reader:
        total_list.append( row )

    return total_list


def purge_already_fetched( fetch_list, raw_dir ):

    # list of tweet ids that still need downloading
    rem_list = []

    # check each tweet to see if we have it
    for item in fetch_list:

        # check if json file exists
        tweet_file = raw_dir + item[2] + '.json'
        if os.path.exists( tweet_file ):

            # attempt to parse json file
            try:
                parse_tweet_json( tweet_file )
                print '--> already downloaded #' + item[2]
            except RuntimeError:
                rem_list.append( item )
        else:
            rem_list.append( item )

    return rem_list


def get_time_left_str( cur_idx, fetch_list, download_pause ):

    tweets_left = len(fetch_list) - cur_idx
    total_seconds = tweets_left * download_pause

    str_hr = int( total_seconds / 3600 )
    str_min = int((total_seconds - str_hr*3600) / 60)
    str_sec = total_seconds - str_hr*3600 - str_min*60

    return '%dh %dm %ds' % (str_hr, str_min, str_sec)


def download_tweets( fetch_list, raw_dir ):

    # ensure raw data directory exists
    if not os.path.exists( raw_dir ):
        os.mkdir( raw_dir )

    # stay within rate limits
    max_tweets_per_hr = 125
    download_pause_sec = 3600 / max_tweets_per_hr

    # download tweets
    for idx in range(0,len(fetch_list)):

        # current item
        item = fetch_list[idx]

        # print status
        trem = get_time_left_str( idx, fetch_list, download_pause_sec )
        print '--> downloading tweet #%s (%d of %d) (%s left)' % \
              (item[2], idx+1, len(fetch_list), trem)

        # pull data
        url = 'http://api.twitter.com/1/statuses/show.json?id=' + item[2]
        urllib.urlretrieve( url, raw_dir + item[2] + '.json' )

        # stay in Twitter API rate limits
        print '    pausing %d sec to obey Twitter API rate limits' % \
              (download_pause_sec)
        time.sleep( download_pause_sec )

    return
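

# The header above notes that OAuth authentication would more than double
# the allowed request rate. The helper below is a minimal sketch of what an
# authenticated fetch could look like. It assumes the third-party "tweepy"
# library and placeholder credentials, and is not part of the original
# download flow; wire it into download_tweets only if you set up OAuth keys:
#
#   import tweepy
#   auth = tweepy.OAuthHandler( CONSUMER_KEY, CONSUMER_SECRET )
#   auth.set_access_token( ACCESS_TOKEN, ACCESS_TOKEN_SECRET )
#   api = tweepy.API( auth, parser=tweepy.parsers.JSONParser() )
def download_tweet_oauth( api, tweet_id, raw_dir ):

    # fetch a single status over the authenticated connection; with the
    # JSONParser configured above, get_status returns a plain json dict
    status = api.get_status( id=tweet_id )

    # save it in the same one-file-per-tweet layout used by download_tweets
    fp = open( raw_dir + tweet_id + '.json', 'wb' )
    json.dump( status, fp )
    fp.close()
    return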


def parse_tweet_json( filename ):

    # read tweet
    print 'opening: ' + filename
    fp = open( filename, 'rb' )

    # parse json
    try:
        tweet_json = json.load( fp )
    except ValueError:
        raise RuntimeError('error parsing json')

    # look for twitter api error msgs
    if 'error' in tweet_json:
        raise RuntimeError('error in downloaded tweet')

    # extract creation date and tweet text
    return [ tweet_json['created_at'], tweet_json['text'] ]


def build_output_corpus( out_filename, raw_dir, total_list ):

    # open csv output file
    fp = open( out_filename, 'wb' )
    writer = csv.writer( fp, delimiter=',', quotechar='"', escapechar='\\',
                         quoting=csv.QUOTE_ALL )

    # write header row
    writer.writerow( ['Topic','Sentiment','TweetId','TweetDate','TweetText'] )

    # parse all downloaded tweets
    missing_count = 0
    for item in total_list:

        # ensure tweet exists
        if os.path.exists( raw_dir + item[2] + '.json' ):

            try:
                # parse tweet
                parsed_tweet = parse_tweet_json( raw_dir + item[2] + '.json' )
                full_row = item + parsed_tweet

                # character encoding for output
                for i in range(0,len(full_row)):
                    full_row[i] = full_row[i].encode("utf-8")

                # write csv row
                writer.writerow( full_row )

            except RuntimeError:
                print '--> bad data in tweet #' + item[2]
                missing_count += 1

        else:
            print '--> missing tweet #' + item[2]
            missing_count += 1

    # indicate success
    if missing_count == 0:
        print '\nSuccessfully downloaded corpus!'
        print 'Output in: ' + out_filename + '\n'
    else:
        print '\nMissing %d of %d tweets!' % (missing_count, len(total_list))
        print 'Partial output in: ' + out_filename + '\n'

    return


def main():

    # get user parameters
    user_params = get_user_params()
    dump_user_params( user_params )

    # get fetch list
    total_list = read_total_list( user_params['inList'] )
    fetch_list = purge_already_fetched( total_list, user_params['rawDir'] )

    # start fetching data from twitter
    download_tweets( fetch_list, user_params['rawDir'] )

    # second pass for any failed downloads
    print '\nStarting second pass to retry any failed downloads'
    fetch_list = purge_already_fetched( total_list, user_params['rawDir'] )
    download_tweets( fetch_list, user_params['rawDir'] )

    # build output corpus
    build_output_corpus( user_params['outList'], user_params['rawDir'],
                         total_list )

    return


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
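
Once install.py finishes, the generated full-corpus.csv can be read back with
the same csv conventions build_output_corpus uses to write it. A minimal
sketch, not part of the original script; the path assumes the default output
location:

    import csv

    # columns: Topic, Sentiment, TweetId, TweetDate, TweetText
    fp = open( './full-corpus.csv', 'rb' )
    reader = csv.reader( fp, delimiter=',', quotechar='"', escapechar='\\' )
    header = reader.next()
    for row in reader:
        print '%s [%s]: %s' % (row[0], row[1], row[4])
    fp.close()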