├── README.md
├── download_tribute_messages.py
└── analyze_tribute_messages.py

/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/download_tribute_messages.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
import urllib2
import simplejson as json
import time
import codecs

# a page on apple's site reports the number of messages available.
# start with 0 and retrieve up to message_range messages
metadata = json.loads(urllib2.urlopen('http://www.apple.com/stevejobs/messages/main.json').read())
message_range = int(metadata['totalMessages'])
print '%s total messages to download:' % message_range

# the url for each message. i learned of this url by inspecting
# the network calls to http://www.apple.com/stevejobs
# using chrome's developer tools
url = "http://www.apple.com/stevejobs/messages/%d.json"

# create our destination file.
# i'm using codecs because it does a better job of handling
# international characters than the built-in open()
output_file = 'stevejobs_tribute.txt'
file_handle = codecs.open(output_file, 'w', 'utf-8')

# helper function to replace tabs and linefeeds with spaces
def clean(txt):
    return txt.replace('\n', ' ').replace('\t', ' ')

# fetch each message and write it out, one message per line
for i in range(0, message_range):
    req = url % i
    data = urllib2.urlopen(req).read()
    data = json.loads(data)
    file_handle.write(clean(data['mainText']) + '\n')

file_handle.close()
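# A hedged sketch, not part of the original script: the loop above issues
# message_range sequential requests with no error handling, so a single
# transient failure aborts the whole download. fetch_with_retry is a
# hypothetical helper that retries a few times with a short sleep between
# attempts (which also throttles the request rate; plausibly why time is
# imported above).
def fetch_with_retry(request_url, attempts=3, delay=1.0):
    for attempt in range(attempts):
        try:
            return urllib2.urlopen(request_url).read()
        except (urllib2.HTTPError, urllib2.URLError):
            if attempt == attempts - 1:
                raise
            time.sleep(delay)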
--------------------------------------------------------------------------------
/analyze_tribute_messages.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# the pos tags are the upenn treebank set;
# nltk.help.upenn_tagset('RB') prints a description of the adverb tags
from collections import defaultdict
from operator import itemgetter
import re
import string

import nltk


OUTPUT_FILE = 'data/stevejobs_tribute.txt'

# per-word counters for adverbs, adjectives, and trigrams
adverbs = defaultdict(int)
adjectives = defaultdict(int)
trigrams = defaultdict(int)

# per-message flags and corpus-wide counters
message_has_adjective = False
message_has_adverb = False
message_contains_product_mention = False
messages_with_adjective = 0
messages_with_adverb = 0
messages = 0
messages_with_product_mention = 0

# regexes for spotting apple product mentions, with a running count for each.
# note that the patterns are unanchored, so e.g. 'mac' also matches
# 'imac' and 'macbook'
products = {'iPhone': {'regex': 'iphones?', 'count': 0},
            'iMac': {'regex': 'imacs?', 'count': 0},
            'iPad': {'regex': 'ipads?', 'count': 0},
            'iTunes': {'regex': 'itunes', 'count': 0},
            'iPod': {'regex': 'ipods?', 'count': 0},
            'cube': {'regex': 'cubes?', 'count': 0},
            'MacBook': {'regex': 'macbooks?', 'count': 0},
            'iBook': {'regex': 'ibooks?', 'count': 0},
            'Apple TV': {'regex': 'apple ?tvs?', 'count': 0},
            'Apple II Family': {'regex': r'(apple )?(2|ii|\]\[|\/\/)([ce\+|]|gs|s)?[^0-9]', 'count': 0},
            'LaserWriter': {'regex': 'laserwriters?', 'count': 0},
            'PowerBook': {'regex': 'powerbooks?', 'count': 0},
            'Newton': {'regex': 'newtons?', 'count': 0},
            'OSX': {'regex': 'osx', 'count': 0},
            'iMovie': {'regex': 'imovie', 'count': 0},
            'Macintosh': {'regex': 'macintosh', 'count': 0},
            'Lisa': {'regex': 'lisa', 'count': 0},
            'Mac': {'regex': 'mac', 'count': 0},
            }

# helper function to replace tabs and linefeeds with spaces
def clean(txt):
    return txt.replace('\n', ' ').replace('\t', ' ')

# print the n highest-count entries of a {key: count} dict
def top_n(dct, n=10):
    srtd = sorted(dct.iteritems(), key=itemgetter(1), reverse=True)
    for x in srtd[0:n]:
        print x

# load the corpus into an nltk.Text for interactive exploration
def nltk_hacking(text_file=OUTPUT_FILE):
    f = open(text_file).read()
    f = clean(f)
    f = unescape(f)
    text = nltk.WordPunctTokenizer().tokenize(f)
    txt = nltk.Text(text)
    return txt

# print a concordance (the term in its surrounding context) for a search term
def nltk_concordance(term, text_file):
    f = open(text_file).read()
    # remove punctuation before tokenizing
    f = f.translate(string.maketrans("", ""), string.punctuation)
    split_text = nltk.Text(f.split())
    split_text.concordance(term, lines=100)

# unescape the handful of html entities that appear in the corpus
def unescape(s):
    """unescapes html entities"""
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&nbsp;", " ")
    # this has to be last, so '&amp;lt;' becomes '&lt;' rather than '<':
    s = s.replace("&amp;", "&")
    return s
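# A minimal sketch of the tagging step used by the main loop below;
# tag_example is a hypothetical helper, not part of the original script.
# nltk.pos_tag returns (token, tag) pairs: tags starting with 'JJ' are
# adjectives and tags starting with 'RB' are adverbs, which is why the loop
# tests pos.startswith('JJ') and pos.startswith('RB'). exact tags depend on
# the installed tagger model, but for a sentence like
# "he was a truly great man" one would typically see 'truly' tagged RB
# and 'great' tagged JJ.
def tag_example(sentence="he was a truly great man"):
    tokens = nltk.WordPunctTokenizer().tokenize(sentence)
    return nltk.pos_tag(tokens)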
# main loop: one tribute message per line
for line in open(OUTPUT_FILE):
    message_has_adjective = False
    message_has_adverb = False
    message_contains_product_mention = False

    # remove the trailing linefeed, convert to lower-case,
    # and unescape html entities
    messages += 1
    data = line.strip()
    data = data.lower()
    data = unescape(data)

    # check for product mentions
    for k, v in products.iteritems():
        if re.search(v['regex'], data):
            products[k]['count'] += 1
            message_contains_product_mention = True

    # if the message contains at least one product mention,
    # increment the product mention counter
    if message_contains_product_mention:
        messages_with_product_mention += 1

    # tokenize the message using nltk's wordpuncttokenizer
    text = nltk.WordPunctTokenizer().tokenize(data)

    # compute trigrams
    nltk_trigrams = nltk.trigrams(text)
    for itm in nltk_trigrams:
        trigrams[itm] += 1

    # pos-tag each token. we're interested in adjectives and adverbs
    parts_of_speech = nltk.pos_tag(text)

    # test for adjectives and adverbs, incrementing the per-word
    # counters when we find one
    for (word, pos) in parts_of_speech:
        if pos.startswith('JJ'):
            message_has_adjective = True
            adjectives[word] += 1

        if pos.startswith('RB'):
            message_has_adverb = True
            adverbs[word] += 1

    # if the message contains an adverb or an adjective, increment a counter
    if message_has_adjective:
        messages_with_adjective += 1
    if message_has_adverb:
        messages_with_adverb += 1

# output the 25 most frequently-used adjectives and adverbs
n = 25
print "top %s adverbs" % n
top_n(adverbs, n)
print
print "top %s adjectives" % n
top_n(adjectives, n)

print "messages with adjectives: %s" % messages_with_adjective
print "messages with adverbs: %s" % messages_with_adverb
print "total messages with product mentions: %s" % messages_with_product_mention
print "total messages: %s" % messages

# output the 50 most-common trigrams
n = 50
print "top %s trigrams" % n
top_n(trigrams, n)

# output per-product mention counts, most-mentioned first
srtd = sorted(products.iteritems(), key=lambda item: item[1]['count'], reverse=True)
for x, y in srtd:
    print "%s\t\t%s" % (x, y['count'])

print
print
# concordance for newton
print "concordance for newton:"
nltk_concordance('newton', OUTPUT_FILE)
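# Hedged usage note, not part of the original scripts: the downloader writes
# stevejobs_tribute.txt into the working directory, while this script reads
# data/stevejobs_tribute.txt, so the file has to be moved (or OUTPUT_FILE
# changed) between runs. assuming that layout, a full run looks like:
#
#   python download_tribute_messages.py
#   mkdir -p data && mv stevejobs_tribute.txt data/
#   python analyze_tribute_messages.py
--------------------------------------------------------------------------------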