├── README.md
├── download_tribute_messages.py
└── analyze_tribute_messages.py

/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/download_tribute_messages.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
import urllib2
import simplejson as json
import time
import codecs

# a page on apple's site reports the number of messages available.
# start with 0 and retrieve up to message_range messages
metadata = json.loads(urllib2.urlopen('http://www.apple.com/stevejobs/messages/main.json').read())
message_range = int(metadata['totalMessages'])
print '%s total messages to download:' % message_range

# the url for each message. i learned of this url by inspecting
# the network calls to http://www.apple.com/stevejobs
# using chrome's developer tools
url = "http://www.apple.com/stevejobs/messages/%d.json"

# create our destination file.
# i'm using codecs because it does a better job of handling
# international characters than the built-in open()
output_file = 'stevejobs_tribute.txt'
file_handle = codecs.open(output_file, 'w', 'utf-8')

# helper function to replace tabs and linefeeds with spaces
def clean(txt):
    return txt.replace('\n', ' ').replace('\t', ' ')

# fetch each message and write it out, one message per line
for i in range(0, message_range):
    req = url % i
    data = urllib2.urlopen(req).read()
    data = json.loads(data)
    file_handle.write(clean(data['mainText']) + '\n')

file_handle.close()
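# A hedged sketch, not part of the original script: the loop above issues
# message_range sequential requests with no error handling, so a single
# transient failure aborts the whole download. fetch_with_retry is a
# hypothetical helper that retries a few times with a short sleep between
# attempts (which also throttles the request rate; plausibly why time is
# imported above).
def fetch_with_retry(request_url, attempts=3, delay=1.0):
    for attempt in range(attempts):
        try:
            return urllib2.urlopen(request_url).read()
        except (urllib2.HTTPError, urllib2.URLError):
            if attempt == attempts - 1:
                raise
            time.sleep(delay)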
--------------------------------------------------------------------------------
/analyze_tribute_messages.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# the pos tags are the upenn treebank set;
# nltk.help.upenn_tagset('RB') prints a description of the adverb tags
from collections import defaultdict
from operator import itemgetter
import re
import string

import nltk


OUTPUT_FILE = 'data/stevejobs_tribute.txt'

# per-word counters for adverbs, adjectives, and trigrams
adverbs = defaultdict(int)
adjectives = defaultdict(int)
trigrams = defaultdict(int)

# per-message flags and corpus-wide counters
message_has_adjective = False
message_has_adverb = False
message_contains_product_mention = False
messages_with_adjective = 0
messages_with_adverb = 0
messages = 0
messages_with_product_mention = 0

# regexes for spotting apple product mentions, with a running count for each.
# note that the patterns are unanchored, so e.g. 'mac' also matches
# 'imac' and 'macbook'
products = {'iPhone': {'regex': 'iphones?', 'count': 0},
            'iMac': {'regex': 'imacs?', 'count': 0},
            'iPad': {'regex': 'ipads?', 'count': 0},
            'iTunes': {'regex': 'itunes', 'count': 0},
            'iPod': {'regex': 'ipods?', 'count': 0},
            'cube': {'regex': 'cubes?', 'count': 0},
            'MacBook': {'regex': 'macbooks?', 'count': 0},
            'iBook': {'regex': 'ibooks?', 'count': 0},
            'Apple TV': {'regex': 'apple ?tvs?', 'count': 0},
            'Apple II Family': {'regex': r'(apple )?(2|ii|\]\[|\/\/)([ce\+|]|gs|s)?[^0-9]', 'count': 0},
            'LaserWriter': {'regex': 'laserwriters?', 'count': 0},
            'PowerBook': {'regex': 'powerbooks?', 'count': 0},
            'Newton': {'regex': 'newtons?', 'count': 0},
            'OSX': {'regex': 'osx', 'count': 0},
            'iMovie': {'regex': 'imovie', 'count': 0},
            'Macintosh': {'regex': 'macintosh', 'count': 0},
            'Lisa': {'regex': 'lisa', 'count': 0},
            'Mac': {'regex': 'mac', 'count': 0},
            }

# helper function to replace tabs and linefeeds with spaces
def clean(txt):
    return txt.replace('\n', ' ').replace('\t', ' ')

# print the n highest-count entries of a {key: count} dict
def top_n(dct, n=10):
    srtd = sorted(dct.iteritems(), key=itemgetter(1), reverse=True)
    for x in srtd[0:n]:
        print x

# load the corpus into an nltk.Text for interactive exploration
def nltk_hacking(text_file=OUTPUT_FILE):
    f = open(text_file).read()
    f = clean(f)
    f = unescape(f)
    text = nltk.WordPunctTokenizer().tokenize(f)
    txt = nltk.Text(text)
    return txt

# print a concordance (the term in its surrounding context) for a search term
def nltk_concordance(term, text_file):
    f = open(text_file).read()
    # remove punctuation before tokenizing
    f = f.translate(string.maketrans("", ""), string.punctuation)
    split_text = nltk.Text(f.split())
    split_text.concordance(term, lines=100)

# unescape the handful of html entities that appear in the corpus
def unescape(s):
    """unescapes html entities"""
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&nbsp;", " ")
    # this has to be last, so '&amp;lt;' becomes '&lt;' rather than '<':
    s = s.replace("&amp;", "&")
    return s
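# A minimal sketch of the tagging step used by the main loop below;
# tag_example is a hypothetical helper, not part of the original script.
# nltk.pos_tag returns (token, tag) pairs: tags starting with 'JJ' are
# adjectives and tags starting with 'RB' are adverbs, which is why the loop
# tests pos.startswith('JJ') and pos.startswith('RB'). exact tags depend on
# the installed tagger model, but for a sentence like
# "he was a truly great man" one would typically see 'truly' tagged RB
# and 'great' tagged JJ.
def tag_example(sentence="he was a truly great man"):
    tokens = nltk.WordPunctTokenizer().tokenize(sentence)
    return nltk.pos_tag(tokens)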
# main loop: one tribute message per line
for line in open(OUTPUT_FILE):
    message_has_adjective = False
    message_has_adverb = False
    message_contains_product_mention = False

    # remove the trailing linefeed, convert to lower-case,
    # and unescape html entities
    messages += 1
    data = line.strip()
    data = data.lower()
    data = unescape(data)

    # check for product mentions
    for k, v in products.iteritems():
        if re.search(v['regex'], data):
            products[k]['count'] += 1
            message_contains_product_mention = True

    # if the message contains at least one product mention,
    # increment the product mention counter
    if message_contains_product_mention:
        messages_with_product_mention += 1

    # tokenize the message using nltk's wordpuncttokenizer
    text = nltk.WordPunctTokenizer().tokenize(data)

    # compute trigrams
    nltk_trigrams = nltk.trigrams(text)
    for itm in nltk_trigrams:
        trigrams[itm] += 1

    # pos-tag each token. we're interested in adjectives and adverbs
    parts_of_speech = nltk.pos_tag(text)

    # test for adjectives and adverbs, incrementing the per-word
    # counters when we find one
    for (word, pos) in parts_of_speech:
        if pos.startswith('JJ'):
            message_has_adjective = True
            adjectives[word] += 1

        if pos.startswith('RB'):
            message_has_adverb = True
            adverbs[word] += 1

    # if the message contains an adverb or an adjective, increment a counter
    if message_has_adjective:
        messages_with_adjective += 1
    if message_has_adverb:
        messages_with_adverb += 1

# output the 25 most frequently-used adjectives and adverbs
n = 25
print "top %s adverbs" % n
top_n(adverbs, n)
print
print "top %s adjectives" % n
top_n(adjectives, n)

print "messages with adjectives: %s" % messages_with_adjective
print "messages with adverbs: %s" % messages_with_adverb
print "total messages with product mentions: %s" % messages_with_product_mention
print "total messages: %s" % messages

# output the 50 most-common trigrams
n = 50
print "top %s trigrams" % n
top_n(trigrams, n)

# output per-product mention counts, most-mentioned first
srtd = sorted(products.iteritems(), key=lambda item: item[1]['count'], reverse=True)
for x, y in srtd:
    print "%s\t\t%s" % (x, y['count'])

print
print
# concordance for newton
print "concordance for newton:"
nltk_concordance('newton', OUTPUT_FILE)
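# Hedged usage note, not part of the original scripts: the downloader writes
# stevejobs_tribute.txt into the working directory, while this script reads
# data/stevejobs_tribute.txt, so the file has to be moved (or OUTPUT_FILE
# changed) between runs. assuming that layout, a full run looks like:
#
#   python download_tribute_messages.py
#   mkdir -p data && mv stevejobs_tribute.txt data/
#   python analyze_tribute_messages.py
--------------------------------------------------------------------------------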