├── .gitignore
├── README
├── chapter6
│   ├── test.db
│   ├── test1.db
│   ├── feedfilter.py
│   └── docclass.py
├── chapter10
│   ├── Thumbs.db
│   ├── nnmf.py
│   ├── stockvolume.py
│   ├── features.txt
│   ├── stockfeatures.txt
│   ├── newsfeatures.py
│   ├── docclass.py
│   ├── clusters.py
│   └── articles.txt
├── chapter3
│   ├── Thumbs.db
│   ├── downloadzebodata.py
│   ├── generatefeedvector.py
│   ├── feedlist.txt
│   └── clusters.py
├── chapter7
│   ├── Thumbs.db
│   ├── addresslist.txt
│   ├── zillow.py
│   ├── hotornot.py
│   └── treepredict.py
├── chapter2
│   ├── deliciousrec.py
│   ├── recommendations.py
│   └── pydelicious.py
├── chapter5
│   ├── dorm.py
│   ├── socialnetwork.py
│   ├── kayak.py
│   ├── schedule.txt
│   └── optimization.py
├── chapter9
│   ├── advancedclassify.py
│   ├── facebook.py
│   ├── agesonly.csv
│   └── svm.py
├── chapter8
│   ├── ebaypredict.py
│   ├── optimization.py
│   └── numpredict.py
├── chapter4
│   ├── nn.py
│   └── searchengine.py
└── chapter11
    └── gp.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cd]
2 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Examples from Programming Collective Intelligence ( http://oreilly.com/catalog/9780596529321/ )
2 |
--------------------------------------------------------------------------------
/chapter6/test.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ferronrsmith/programming-collective-intelligence-code/HEAD/chapter6/test.db
--------------------------------------------------------------------------------
/chapter6/test1.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ferronrsmith/programming-collective-intelligence-code/HEAD/chapter6/test1.db
--------------------------------------------------------------------------------
/chapter10/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ferronrsmith/programming-collective-intelligence-code/HEAD/chapter10/Thumbs.db
--------------------------------------------------------------------------------
/chapter3/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ferronrsmith/programming-collective-intelligence-code/HEAD/chapter3/Thumbs.db
--------------------------------------------------------------------------------
/chapter7/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ferronrsmith/programming-collective-intelligence-code/HEAD/chapter7/Thumbs.db
--------------------------------------------------------------------------------
/chapter7/addresslist.txt:
--------------------------------------------------------------------------------
1 | 6 Washington
2 | 21 Manassas
3 | 280 Pearl
4 | 55 Ellery
5 | 50 Follen
6 | 51 Granite
7 | 992 Memorial
8 | 83 Trowbridge
9 | 1 Dana
10 | 45 Regent
11 | 90 Alpine
12 | 21 Francis
13 | 112 Avon Hill
14 | 9 Bellevue
15 | 4 Blanchard Rd
16 | 34 Shea
17 | 5 Fountain
18 | 14 Marcella
19 | 39 Saint Saveur
20 | 35 Pemberton
21 | 46 Shepard
22 | 31 Market
23 | 99 Howard
24 | 88 Pearl
25 | 208 Western
26 | 285 Windsor
27 | 26 Cambridgepark
28 | 211 Erie
29 | 129 Franklin
30 | 27 Gurney
31 | 149 Prospect
32 | 27 Linnaean
33 | 20 Dudley
34 | 60 Otis St
35 | 130 Mount Auburn St
36 | 2 Michael Way
37 | 263 Columbia St
38 | 6 Hurlbut St
39 | 199
Harvard St 40 | 168 River St 41 | 400 Washington St 42 | 12 Traill St 43 | 74 Field St 44 | 21 Walden Square Rd 45 | 7 Wendell St 46 | 15 Normandy Ave 47 | 6 Gibson Ter 48 | 94 Pine St 49 | 23 Magee St 50 | 175 Richdale Ave 51 | 168 River St 52 | 246 Brattle St -------------------------------------------------------------------------------- /chapter2/deliciousrec.py: -------------------------------------------------------------------------------- 1 | from pydelicious import get_popular,get_userposts,get_urlposts 2 | import time 3 | 4 | def initializeUserDict(tag,count=5): 5 | user_dict={} 6 | # get the top count' popular posts 7 | for p1 in get_popular(tag=tag)[0:count]: 8 | # find all users who posted this 9 | for p2 in get_urlposts(p1['href']): 10 | user=p2['user'] 11 | user_dict[user]={} 12 | return user_dict 13 | 14 | def fillItems(user_dict): 15 | all_items={} 16 | # Find links posted by all users 17 | for user in user_dict: 18 | for i in range(3): 19 | try: 20 | posts=get_userposts(user) 21 | break 22 | except: 23 | print "Failed user "+user+", retrying" 24 | time.sleep(4) 25 | for post in posts: 26 | url=post['href'] 27 | user_dict[user][url]=1.0 28 | all_items[url]=1 29 | 30 | # Fill in missing items with 0 31 | for ratings in user_dict.values(): 32 | for item in all_items: 33 | if item not in ratings: 34 | ratings[item]=0.0 35 | -------------------------------------------------------------------------------- /chapter10/nnmf.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | 3 | def difcost(a,b): 4 | dif=0 5 | for i in range(shape(a)[0]): 6 | for j in range(shape(a)[1]): 7 | # Euclidean Distance 8 | dif+=pow(a[i,j]-b[i,j],2) 9 | return dif 10 | 11 | def factorize(v,pc=10,iter=50): 12 | ic=shape(v)[0] 13 | fc=shape(v)[1] 14 | 15 | # Initialize the weight and feature matrices with random values 16 | w=matrix([[random.random() for j in range(pc)] for i in range(ic)]) 17 | h=matrix([[random.random() for i in range(fc)] for i in range(pc)]) 18 | 19 | # Perform operation a maximum of iter times 20 | for i in range(iter): 21 | wh=w*h 22 | 23 | # Calculate the current difference 24 | cost=difcost(v,wh) 25 | 26 | if i%10==0: print cost 27 | 28 | # Terminate if the matrix has been fully factorized 29 | if cost==0: break 30 | 31 | # Update feature matrix 32 | hn=(transpose(w)*v) 33 | hd=(transpose(w)*w*h) 34 | 35 | h=matrix(array(h)*array(hn)/array(hd)) 36 | 37 | # Update weights matrix 38 | wn=(v*transpose(h)) 39 | wd=(w*h*transpose(h)) 40 | 41 | w=matrix(array(w)*array(wn)/array(wd)) 42 | 43 | return w,h 44 | -------------------------------------------------------------------------------- /chapter7/zillow.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | import urllib2 3 | 4 | zwskey="YOUR API KEY" 5 | 6 | def getaddressdata(address,city): 7 | escad=address.replace(' ','+') 8 | url='http://www.zillow.com/webservice/GetDeepSearchResults.htm?' 
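# --- Editorial note (not part of the original zillow.py) ---
# The next lines append the query parameters (zws-id, address, citystatezip)
# and pull the zipcode, useCode, yearBuilt, finishedSqFt, bathrooms, bedrooms
# and amount fields out of Zillow's XML response. A hypothetical call,
# assuming zwskey holds a valid key, mirrors getpricelist() below:
#   getaddressdata('6 Washington', 'Cambridge,MA')
# which returns (zipcode, use, year, bath, bed, rooms, price), or None when
# the response code is non-zero.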
9 | url+='zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city) 10 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 11 | code=doc.getElementsByTagName('code')[0].firstChild.data 12 | if code!='0': return None 13 | if 1: 14 | zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data 15 | use=doc.getElementsByTagName('useCode')[0].firstChild.data 16 | year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data 17 | sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data 18 | bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data 19 | bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data 20 | rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data 21 | price=doc.getElementsByTagName('amount')[0].firstChild.data 22 | else: 23 | return None 24 | 25 | return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) 26 | 27 | def getpricelist(): 28 | l1=[] 29 | for line in file('addresslist.txt'): 30 | data=getaddressdata(line.strip(),'Cambridge,MA') 31 | l1.append(data) 32 | return l1 33 | -------------------------------------------------------------------------------- /chapter3/downloadzebodata.py: -------------------------------------------------------------------------------- 1 | from BeautifulSoup import BeautifulSoup 2 | import urllib2 3 | import re 4 | chare=re.compile(r'[!-\.&]') 5 | itemowners={} 6 | 7 | # Words to remove 8 | dropwords=['a','new','some','more','my','own','the','many','other','another'] 9 | 10 | currentuser=0 11 | for i in range(1,51): 12 | # URL for the want search page 13 | c=urllib2.urlopen( 14 | 'http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d' 15 | % (i)) 16 | soup=BeautifulSoup(c.read()) 17 | for td in soup('td'): 18 | # Find table cells of bgverdanasmall class 19 | if ('class' in dict(td.attrs) and td['class']=='bgverdanasmall'): 20 | items=[re.sub(chare,'',str(a.contents[0]).lower()).strip() for a in td('a')] 21 | for item in items: 22 | # Remove extra words 23 | txt=' '.join([t for t in item.split(' ') if t not in dropwords]) 24 | if len(txt)<2: continue 25 | itemowners.setdefault(txt,{}) 26 | itemowners[txt][currentuser]=1 27 | currentuser+=1 28 | 29 | out=file('zebo.txt','w') 30 | out.write('Item') 31 | for user in range(0,currentuser): out.write('\tU%d' % user) 32 | out.write('\n') 33 | for item,owners in itemowners.items(): 34 | if len(owners)>10: 35 | out.write(item) 36 | for user in range(0,currentuser): 37 | if user in owners: out.write('\t1') 38 | else: out.write('\t0') 39 | out.write('\n') 40 | -------------------------------------------------------------------------------- /chapter10/stockvolume.py: -------------------------------------------------------------------------------- 1 | import nnmf 2 | import urllib2 3 | from numpy import * 4 | 5 | tickers=['YHOO','AVP','BIIB','BP','CL','CVX', 6 | 'DNA','EXPE','GOOG','PG','XOM','AMGN'] 7 | 8 | shortest=300 9 | prices={} 10 | dates=None 11 | 12 | for t in tickers: 13 | # Open the URL 14 | rows=urllib2.urlopen('http://ichart.finance.yahoo.com/table.csv?'+\ 15 | 's=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996'%t +\ 16 | '&ignore=.csv').readlines() 17 | 18 | 19 | # Extract the volume field from every line 20 | prices[t]=[float(r.split(',')[5]) for r in rows[1:] if r.strip()!=''] 21 | if len(prices[t])]+>').sub('',html) 25 | 26 | # Split words by all non-alpha characters 27 | words=re.compile(r'[^A-Z^a-z]+').split(txt) 28 | 29 | # Convert to lowercase 30 | return [word.lower() for word in words if word!=''] 31 | 32 | 33 | 
apcount={} 34 | wordcounts={} 35 | feedlist=[line for line in file('feedlist.txt')] 36 | for feedurl in feedlist: 37 | try: 38 | title,wc=getwordcounts(feedurl) 39 | wordcounts[title]=wc 40 | for word,count in wc.items(): 41 | apcount.setdefault(word,0) 42 | if count>1: 43 | apcount[word]+=1 44 | except: 45 | print 'Failed to parse feed %s' % feedurl 46 | 47 | wordlist=[] 48 | for w,bc in apcount.items(): 49 | frac=float(bc)/len(feedlist) 50 | if frac>0.1 and frac<0.5: 51 | wordlist.append(w) 52 | 53 | out=file('blogdata1.txt','w') 54 | out.write('Blog') 55 | for word in wordlist: out.write('\t%s' % word) 56 | out.write('\n') 57 | for blog,wc in wordcounts.items(): 58 | print blog 59 | out.write(blog) 60 | for word in wordlist: 61 | if word in wc: out.write('\t%d' % wc[word]) 62 | else: out.write('\t0') 63 | out.write('\n') 64 | -------------------------------------------------------------------------------- /chapter6/feedfilter.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Takes a filename of URL of a blog feed and classifies the entries 5 | def read(feed,classifier): 6 | # Get feed entries and loop over them 7 | f=feedparser.parse(feed) 8 | for entry in f['entries']: 9 | print 10 | print '-----' 11 | # Print the contents of the entry 12 | print 'Title: '+entry['title'].encode('utf-8') 13 | print 'Publisher: '+entry['publisher'].encode('utf-8') 14 | print 15 | print entry['summary'].encode('utf-8') 16 | 17 | 18 | # Combine all the text to create one item for the classifier 19 | fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary']) 20 | 21 | # Print the best guess at the current category 22 | print 'Guess: '+str(classifier.classify(entry)) 23 | 24 | # Ask the user to specify the correct category and train on that 25 | cl=raw_input('Enter category: ') 26 | classifier.train(entry,cl) 27 | 28 | 29 | def entryfeatures(entry): 30 | splitter=re.compile('\\W*') 31 | f={} 32 | 33 | # Extract the title words and annotate 34 | titlewords=[s.lower() for s in splitter.split(entry['title']) 35 | if len(s)>2 and len(s)<20] 36 | for w in titlewords: f['Title:'+w]=1 37 | 38 | # Extract the summary words 39 | summarywords=[s.lower() for s in splitter.split(entry['summary']) 40 | if len(s)>2 and len(s)<20] 41 | 42 | # Count uppercase words 43 | uc=0 44 | for i in range(len(summarywords)): 45 | w=summarywords[i] 46 | f[w]=1 47 | if w.isupper(): uc+=1 48 | 49 | # Get word pairs in summary as features 50 | if i0.3: f['UPPERCASE']=1 59 | 60 | return f 61 | -------------------------------------------------------------------------------- /chapter7/hotornot.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import xml.dom.minidom 3 | 4 | api_key='YOUR KEY HERE' 5 | 6 | def getrandomratings(c): 7 | # Construct URL for getRandomProfile 8 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 9 | url+="&method=Rate.getRandomProfile&retrieve_num=%d" % c 10 | url+="&get_rate_info=true&meet_users_only=true" 11 | 12 | f1=urllib2.urlopen(url).read() 13 | 14 | doc=xml.dom.minidom.parseString(f1) 15 | 16 | emids=doc.getElementsByTagName('emid') 17 | ratings=doc.getElementsByTagName('rating') 18 | 19 | # Combine the emids and ratings together into a list 20 | result=[] 21 | for e,r in zip(emids,ratings): 22 | if r.firstChild!=None: 23 | result.append((e.firstChild.data,r.firstChild.data)) 24 | return result 25 | 26 | stateregions={'New 
England':['ct','mn','ma','nh','ri','vt'], 27 | 'Mid Atlantic':['de','md','nj','ny','pa'], 28 | 'South':['al','ak','fl','ga','ky','la','ms','mo', 29 | 'nc','sc','tn','va','wv'], 30 | 'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'], 31 | 'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']} 32 | 33 | def getpeopledata(ratings): 34 | result=[] 35 | for emid,rating in ratings: 36 | # URL for the MeetMe.getProfile method 37 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 38 | url+="&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid 39 | 40 | # Get all the info about this person 41 | try: 42 | rating=int(float(rating)+0.5) 43 | doc2=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 44 | gender=doc2.getElementsByTagName('gender')[0].firstChild.data 45 | age=doc2.getElementsByTagName('age')[0].firstChild.data 46 | loc=doc2.getElementsByTagName('location')[0].firstChild.data[0:2] 47 | 48 | # Convert state to region 49 | for r,s in stateregions.items(): 50 | if loc in s: region=r 51 | 52 | if region!=None: 53 | result.append((gender,int(age),region,rating)) 54 | except: 55 | pass 56 | return result 57 | 58 | -------------------------------------------------------------------------------- /chapter10/features.txt: -------------------------------------------------------------------------------- 1 | ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 2 | (6.58003120192, u'The Abs Diet by David Zinczenko') 3 | (5.9231935598, u"I did'nt diet to get in shape for Trinidad's Carnival.....") 4 | (5.04673654071, u'Sensible Diet & Exercise') 5 | 6 | ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 7 | (6.52183126318, u'food/exercise Monday 10/1') 8 | (5.94642162786, u'diet/exercise 10/4') 9 | (5.3332773133, u'food/exercise Friday 10/5') 10 | 11 | ['food', 'calories', 'than', 'easy', 'high', 'come'] 12 | (9.98464450123, u'High or low fat food? Easy trick for figuring it out') 13 | (3.41252863148, u'Oatmeal, cereal of choice.') 14 | (3.19119866786, u'Food and Workout Log 10.8.07') 15 | 16 | ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 17 | (7.46811621754, u'saturday') 18 | (5.62839188358, u'diet-exercise thursday') 19 | (5.29370213306, u'sleepy food/fitness thursday') 20 | 21 | ['food', 'home', 'then', 'exercise', 'morning', 'went'] 22 | (5.22083940456, u'Food & Exercise -- 10/5/2007') 23 | (5.16310413391, u'Food & Exercise -- 10/4/2007') 24 | (4.75585045074, u'Food & Exercise -- 9/28/2007 (yesterday)') 25 | 26 | ['fats', 'quot', 'this', 'good', 'about', 'like'] 27 | (14.9233786406, u'Good fats bad fats') 28 | (1.3775418859, u'Should we ban marathons?') 29 | (1.37194239805, u'Food & Exercise -- 10/3/2007') 30 | 31 | ['quot', 'they', 'money', 'want', 'very', 'best'] 32 | (6.1620884463, u'More about the Chicago marathon') 33 | (5.58276496802, u'LOUIE + LINESMAKER = $$$$') 34 | (4.04959173123, u'High or low fat food? 
Easy trick for figuring it out') 35 | 36 | ['that', 'much', 'does', 'exercise', 'this', 'morning'] 37 | (7.73926153154, u'Food & Exercise -- 10/7/2007') 38 | (5.96451663382, u'< 1g, etc.') 39 | (3.81276353396, u"why I'm succeeding, finally, with my fitness") 40 | 41 | ['with', 'your', 'weight', 'have', 'control', 'about'] 42 | (6.78756986407, u'Control ur Weight') 43 | (5.54567450388, u'Flu-Busting Chicken Soup') 44 | (5.21079777525, u'Weight Loss Tips') 45 | 46 | ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 47 | (5.58477112035, u'Food and Workout Log 9.27.08') 48 | (5.48488799917, u'Food and Workout Log 10.3.07') 49 | (5.10395750879, u'Food and Workout Log 10.10.07') 50 | 51 | -------------------------------------------------------------------------------- /chapter5/socialnetwork.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | people=['Charlie','Augustus','Veruca','Violet','Mike','Joe','Willy','Miranda'] 4 | 5 | links=[('Augustus', 'Willy'), 6 | ('Mike', 'Joe'), 7 | ('Miranda', 'Mike'), 8 | ('Violet', 'Augustus'), 9 | ('Miranda', 'Willy'), 10 | ('Charlie', 'Mike'), 11 | ('Veruca', 'Joe'), 12 | ('Miranda', 'Augustus'), 13 | ('Willy', 'Augustus'), 14 | ('Joe', 'Charlie'), 15 | ('Veruca', 'Augustus'), 16 | ('Miranda', 'Joe')] 17 | 18 | 19 | def crosscount(v): 20 | # Convert the number list into a dictionary of person:(x,y) 21 | loc=dict([(people[i],(v[i*2],v[i*2+1])) for i in range(0,len(people))]) 22 | total=0 23 | 24 | # Loop through every pair of links 25 | for i in range(len(links)): 26 | for j in range(i+1,len(links)): 27 | 28 | # Get the locations 29 | (x1,y1),(x2,y2)=loc[links[i][0]],loc[links[i][1]] 30 | (x3,y3),(x4,y4)=loc[links[j][0]],loc[links[j][1]] 31 | 32 | den=(y4-y3)*(x2-x1)-(x4-x3)*(y2-y1) 33 | 34 | # den==0 if the lines are parallel 35 | if den==0: continue 36 | 37 | # Otherwise ua and ub are the fraction of the 38 | # line where they cross 39 | ua=((x4-x3)*(y1-y3)-(y4-y3)*(x1-x3))/den 40 | ub=((x2-x1)*(y1-y3)-(y2-y1)*(x1-x3))/den 41 | 42 | # If the fraction is between 0 and 1 for both lines 43 | # then they cross each other 44 | if ua>0 and ua<1 and ub>0 and ub<1: 45 | total+=1 46 | for i in range(len(people)): 47 | for j in range(i+1,len(people)): 48 | # Get the locations of the two nodes 49 | (x1,y1),(x2,y2)=loc[people[i]],loc[people[j]] 50 | 51 | # Find the distance between them 52 | dist=math.sqrt(math.pow(x1-x2,2)+math.pow(y1-y2,2)) 53 | # Penalize any nodes closer than 50 pixels 54 | if dist<50: 55 | total+=(1.0-(dist/50.0)) 56 | 57 | return total 58 | from PIL import Image,ImageDraw 59 | 60 | def drawnetwork(sol): 61 | # Create the image 62 | img=Image.new('RGB',(400,400),(255,255,255)) 63 | draw=ImageDraw.Draw(img) 64 | 65 | # Create the position dict 66 | pos=dict([(people[i],(sol[i*2],sol[i*2+1])) for i in range(0,len(people))]) 67 | 68 | for (a,b) in links: 69 | draw.line((pos[a],pos[b]),fill=(255,0,0)) 70 | 71 | for n,p in pos.items(): 72 | draw.text(p,n,(0,0,0)) 73 | 74 | img.show() 75 | 76 | 77 | domain=[(10,370)]*(len(people)*2) -------------------------------------------------------------------------------- /chapter5/kayak.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urllib2 3 | import xml.dom.minidom 4 | 5 | kayakkey='YOUR KEY HERE' 6 | 7 | def getkayaksession(): 8 | # Construct the URL to start a session 9 | url='http://www.kayak.com/k/ident/apisession?token=%s&version=1' % kayakkey 10 | 11 | # Parse the resulting XML 
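# --- Editorial note (not part of the original kayak.py) ---
# getkayaksession() parses the apisession response below and returns the sid
# value, which every later request passes back as the _sid_ parameter. A
# hypothetical end-to-end run, assuming kayakkey holds a valid key, follows
# the same pattern as createschedule() at the bottom of this file:
#   sid = getkayaksession()
#   searchid = flightsearch(sid, 'BOS', 'LGA', dep)   # dep: departure date string
#   results = flightsearchresults(sid, searchid)      # [(depart, arrive, price), ...]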
12 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 13 | 14 | # Find xxxxxxxx 15 | sid=doc.getElementsByTagName('sid')[0].firstChild.data 16 | return sid 17 | 18 | def flightsearch(sid,origin,destination,depart_date): 19 | 20 | # Construct search URL 21 | url='http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin 22 | url+='&destination=%s&depart_date=%s' % (destination,depart_date) 23 | url+='&return_date=none&depart_time=a&return_time=a' 24 | url+='&travelers=1&cabin=e&action=doFlights&apimode=1' 25 | url+='&_sid_=%s&version=1' % (sid) 26 | 27 | # Get the XML 28 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 29 | 30 | # Extract the search ID 31 | searchid=doc.getElementsByTagName('searchid')[0].firstChild.data 32 | 33 | return searchid 34 | 35 | def flightsearchresults(sid,searchid): 36 | def parseprice(p): 37 | return float(p[1:].replace(',','')) 38 | 39 | # Polling loop 40 | while 1: 41 | time.sleep(2) 42 | 43 | # Construct URL for polling 44 | url='http://www.kayak.com/s/basic/flight?' 45 | url+='searchid=%s&c=5&apimode=1&_sid_=%s&version=1' % (searchid,sid) 46 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 47 | 48 | # Look for morepending tag, and wait until it is no longer true 49 | morepending=doc.getElementsByTagName('morepending')[0].firstChild 50 | if morepending==None or morepending.data=='false': break 51 | 52 | # Now download the complete list 53 | url='http://www.kayak.com/s/basic/flight?' 54 | url+='searchid=%s&c=999&apimode=1&_sid_=%s&version=1' % (searchid,sid) 55 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 56 | 57 | # Get the various elements as lists 58 | prices=doc.getElementsByTagName('price') 59 | departures=doc.getElementsByTagName('depart') 60 | arrivals=doc.getElementsByTagName('arrive') 61 | 62 | # Zip them together 63 | return zip([p.firstChild.data.split(' ')[1] for p in departures], 64 | [p.firstChild.data.split(' ')[1] for p in arrivals], 65 | [parseprice(p.firstChild.data) for p in prices]) 66 | 67 | 68 | def createschedule(people,dest,dep,ret): 69 | # Get a session id for these searches 70 | sid=getkayaksession() 71 | flights={} 72 | 73 | for p in people: 74 | name,origin=p 75 | # Outbound flight 76 | searchid=flightsearch(sid,origin,dest,dep) 77 | flights[(origin,dest)]=flightsearchresults(sid,searchid) 78 | 79 | # Return flight 80 | searchid=flightsearch(sid,dest,origin,ret) 81 | flights[(dest,origin)]=flightsearchresults(sid,searchid) 82 | 83 | return flights 84 | -------------------------------------------------------------------------------- /chapter5/schedule.txt: -------------------------------------------------------------------------------- 1 | LGA,OMA,6:19,8:13,239 2 | OMA,LGA,6:11,8:31,249 3 | LGA,OMA,8:04,10:59,136 4 | OMA,LGA,7:39,10:24,219 5 | LGA,OMA,9:31,11:43,210 6 | OMA,LGA,9:15,12:03,99 7 | LGA,OMA,11:07,13:24,171 8 | OMA,LGA,11:08,13:07,175 9 | LGA,OMA,12:31,14:02,234 10 | OMA,LGA,12:18,14:56,172 11 | LGA,OMA,14:05,15:47,226 12 | OMA,LGA,13:37,15:08,250 13 | LGA,OMA,15:07,17:21,129 14 | OMA,LGA,15:03,16:42,135 15 | LGA,OMA,16:35,18:56,144 16 | OMA,LGA,16:51,19:09,147 17 | LGA,OMA,18:25,20:34,205 18 | OMA,LGA,18:12,20:17,242 19 | LGA,OMA,20:05,21:44,172 20 | OMA,LGA,20:05,22:06,261 21 | LGA,ORD,6:03,8:43,219 22 | ORD,LGA,6:05,8:32,174 23 | LGA,ORD,7:50,10:08,164 24 | ORD,LGA,8:25,10:34,157 25 | LGA,ORD,9:11,10:42,172 26 | ORD,LGA,9:42,11:32,169 27 | LGA,ORD,10:33,13:11,132 28 | ORD,LGA,11:01,12:39,260 29 | LGA,ORD,12:08,14:47,231 30 | 
ORD,LGA,12:44,14:17,134 31 | LGA,ORD,14:19,17:09,190 32 | ORD,LGA,14:22,16:32,126 33 | LGA,ORD,15:04,17:23,189 34 | ORD,LGA,15:58,18:40,173 35 | LGA,ORD,17:06,20:00,95 36 | ORD,LGA,16:43,19:00,246 37 | LGA,ORD,18:33,20:22,143 38 | ORD,LGA,18:48,21:45,246 39 | LGA,ORD,19:32,21:25,160 40 | ORD,LGA,19:50,22:24,269 41 | LGA,MIA,6:33,9:14,172 42 | MIA,LGA,6:25,9:30,335 43 | LGA,MIA,8:23,11:07,143 44 | MIA,LGA,7:34,9:40,324 45 | LGA,MIA,9:25,12:46,295 46 | MIA,LGA,9:15,12:29,225 47 | LGA,MIA,11:08,14:38,262 48 | MIA,LGA,11:28,14:40,248 49 | LGA,MIA,12:37,15:05,170 50 | MIA,LGA,12:05,15:30,330 51 | LGA,MIA,14:08,16:09,232 52 | MIA,LGA,14:01,17:24,338 53 | LGA,MIA,15:23,18:49,150 54 | MIA,LGA,15:34,18:11,326 55 | LGA,MIA,16:50,19:26,304 56 | MIA,LGA,17:07,20:04,291 57 | LGA,MIA,18:07,21:30,355 58 | MIA,LGA,18:23,21:35,134 59 | LGA,MIA,20:27,23:42,169 60 | MIA,LGA,19:53,22:21,173 61 | LGA,BOS,6:39,8:09,86 62 | BOS,LGA,6:17,8:26,89 63 | LGA,BOS,8:23,10:28,149 64 | BOS,LGA,8:04,10:11,95 65 | LGA,BOS,9:58,11:18,130 66 | BOS,LGA,9:45,11:50,172 67 | LGA,BOS,10:33,12:03,74 68 | BOS,LGA,11:16,13:29,83 69 | LGA,BOS,12:08,14:05,142 70 | BOS,LGA,12:34,15:02,109 71 | LGA,BOS,13:39,15:30,74 72 | BOS,LGA,13:40,15:37,138 73 | LGA,BOS,15:25,16:58,62 74 | BOS,LGA,15:27,17:18,151 75 | LGA,BOS,17:03,18:03,103 76 | BOS,LGA,17:11,18:30,108 77 | LGA,BOS,18:24,20:49,124 78 | BOS,LGA,18:34,19:36,136 79 | LGA,BOS,19:58,21:23,142 80 | BOS,LGA,20:17,22:22,102 81 | LGA,DAL,6:09,9:49,414 82 | DAL,LGA,6:12,10:22,230 83 | LGA,DAL,7:57,11:15,347 84 | DAL,LGA,7:53,11:37,433 85 | LGA,DAL,9:49,13:51,229 86 | DAL,LGA,9:08,12:12,364 87 | LGA,DAL,10:51,14:16,256 88 | DAL,LGA,10:30,14:57,290 89 | LGA,DAL,12:20,16:34,500 90 | DAL,LGA,12:19,15:25,342 91 | LGA,DAL,14:20,17:32,332 92 | DAL,LGA,13:54,18:02,294 93 | LGA,DAL,15:49,20:10,497 94 | DAL,LGA,15:44,18:55,382 95 | LGA,DAL,17:14,20:59,277 96 | DAL,LGA,16:52,20:48,448 97 | LGA,DAL,18:44,22:42,351 98 | DAL,LGA,18:26,21:29,464 99 | LGA,DAL,19:57,23:15,512 100 | DAL,LGA,20:07,23:27,473 101 | LGA,CAK,6:58,9:01,238 102 | CAK,LGA,6:08,8:06,224 103 | LGA,CAK,8:19,11:16,122 104 | CAK,LGA,8:27,10:45,139 105 | LGA,CAK,9:58,12:56,249 106 | CAK,LGA,9:15,12:14,247 107 | LGA,CAK,10:32,13:16,139 108 | CAK,LGA,10:53,13:36,189 109 | LGA,CAK,12:01,13:41,267 110 | CAK,LGA,12:08,14:59,149 111 | LGA,CAK,13:37,15:33,142 112 | CAK,LGA,13:40,15:38,137 113 | LGA,CAK,15:50,18:45,243 114 | CAK,LGA,15:23,17:25,232 115 | LGA,CAK,16:33,18:15,253 116 | CAK,LGA,17:08,19:08,262 117 | LGA,CAK,18:17,21:04,259 118 | CAK,LGA,18:35,20:28,204 119 | LGA,CAK,19:46,21:45,214 120 | CAK,LGA,20:30,23:11,114 121 | -------------------------------------------------------------------------------- /chapter3/feedlist.txt: -------------------------------------------------------------------------------- 1 | http://feeds.feedburner.com/37signals/beMH 2 | http://feeds.feedburner.com/blogspot/bRuz 3 | http://battellemedia.com/index.xml 4 | http://blog.guykawasaki.com/index.rdf 5 | http://blog.outer-court.com/rss.xml 6 | http://feeds.searchenginewatch.com/sewblog 7 | http://blog.topix.net/index.rdf 8 | http://blogs.abcnews.com/theblotter/index.rdf 9 | http://feeds.feedburner.com/ConsumingExperienceFull 10 | http://flagrantdisregard.com/index.php/feed/ 11 | http://featured.gigaom.com/feed/ 12 | http://gizmodo.com/index.xml 13 | http://gofugyourself.typepad.com/go_fug_yourself/index.rdf 14 | http://googleblog.blogspot.com/rss.xml 15 | http://feeds.feedburner.com/GoogleOperatingSystem 16 | 
http://headrush.typepad.com/creating_passionate_users/index.rdf 17 | http://feeds.feedburner.com/instapundit/main 18 | http://jeremy.zawodny.com/blog/rss2.xml 19 | http://joi.ito.com/index.rdf 20 | http://feeds.feedburner.com/Mashable 21 | http://michellemalkin.com/index.rdf 22 | http://moblogsmoproblems.blogspot.com/rss.xml 23 | http://newsbusters.org/node/feed 24 | http://beta.blogger.com/feeds/27154654/posts/full?alt=rss 25 | http://feeds.feedburner.com/paulstamatiou 26 | http://powerlineblog.com/index.rdf 27 | http://feeds.feedburner.com/Publishing20 28 | http://radar.oreilly.com/index.rdf 29 | http://scienceblogs.com/pharyngula/index.xml 30 | http://scobleizer.wordpress.com/feed/ 31 | http://sethgodin.typepad.com/seths_blog/index.rdf 32 | http://rss.slashdot.org/Slashdot/slashdot 33 | http://thinkprogress.org/feed/ 34 | http://feeds.feedburner.com/andrewsullivan/rApM 35 | http://wilwheaton.typepad.com/wwdnbackup/index.rdf 36 | http://www.43folders.com/feed/ 37 | http://www.456bereastreet.com/feed.xml 38 | http://www.autoblog.com/rss.xml 39 | http://www.bloggersblog.com/rss.xml 40 | http://www.bloglines.com/rss/about/news 41 | http://www.blogmaverick.com/rss.xml 42 | http://www.boingboing.net/index.rdf 43 | http://www.buzzmachine.com/index.xml 44 | http://www.captainsquartersblog.com/mt/index.rdf 45 | http://www.coolhunting.com/index.rdf 46 | http://feeds.copyblogger.com/Copyblogger 47 | http://feeds.feedburner.com/crooksandliars/YaCP 48 | http://feeds.dailykos.com/dailykos/index.xml 49 | http://www.deadspin.com/index.xml 50 | http://www.downloadsquad.com/rss.xml 51 | http://www.engadget.com/rss.xml 52 | http://www.gapingvoid.com/index.rdf 53 | http://www.gawker.com/index.xml 54 | http://www.gothamist.com/index.rdf 55 | http://www.huffingtonpost.com/raw_feed_index.rdf 56 | http://www.hyperorg.com/blogger/index.rdf 57 | http://www.joelonsoftware.com/rss.xml 58 | http://www.joystiq.com/rss.xml 59 | http://www.kotaku.com/index.xml 60 | http://feeds.kottke.org/main 61 | http://www.lifehack.org/feed/ 62 | http://www.lifehacker.com/index.xml 63 | http://littlegreenfootballs.com/weblog/lgf-rss.php 64 | http://www.makezine.com/blog/index.xml 65 | http://www.mattcutts.com/blog/feed/ 66 | http://xml.metafilter.com/rss.xml 67 | http://www.mezzoblue.com/rss/index.xml 68 | http://www.micropersuasion.com/index.rdf 69 | http://www.neilgaiman.com/journal/feed/rss.xml 70 | http://www.oilman.ca/feed/ 71 | http://www.perezhilton.com/index.xml 72 | http://www.plasticbag.org/index.rdf 73 | http://www.powazek.com/rss.xml 74 | http://www.problogger.net/feed/ 75 | http://feeds.feedburner.com/QuickOnlineTips 76 | http://www.readwriteweb.com/rss.xml 77 | http://www.schneier.com/blog/index.rdf 78 | http://scienceblogs.com/sample/combined.xml 79 | http://www.seroundtable.com/index.rdf 80 | http://www.shoemoney.com/feed/ 81 | http://www.sifry.com/alerts/index.rdf 82 | http://www.simplebits.com/xml/rss.xml 83 | http://feeds.feedburner.com/Spikedhumor 84 | http://www.stevepavlina.com/blog/feed 85 | http://www.talkingpointsmemo.com/index.xml 86 | http://www.tbray.org/ongoing/ongoing.rss 87 | http://feeds.feedburner.com/TechCrunch 88 | http://www.techdirt.com/techdirt_rss.xml 89 | http://www.techeblog.com/index.php/feed/ 90 | http://www.thesuperficial.com/index.xml 91 | http://www.tmz.com/rss.xml 92 | http://www.treehugger.com/index.rdf 93 | http://www.tuaw.com/rss.xml 94 | http://www.valleywag.com/index.xml 95 | http://www.we-make-money-not-art.com/index.rdf 96 | http://www.wired.com/rss/index.xml 97 | 
http://www.wonkette.com/index.xml 98 | -------------------------------------------------------------------------------- /chapter10/stockfeatures.txt: -------------------------------------------------------------------------------- 1 | 5.26743580154e+017 2 | 3.93402025291e+016 3 | 2.21688612312e+016 4 | 1.71500393528e+016 5 | 1.49411594165e+016 6 | [[ 2.33322541e+06 2.07819608e+06 2.51935438e+06 2.96234043e+06 7 | 1.75536111e+06 7.86146406e+06 2.63057169e+06 2.15047807e+06 8 | 5.08400536e+06 7.00030282e+06 1.85413701e+07 3.38175040e+06] 9 | [ 4.39522609e+06 3.06456173e+05 1.01774069e+06 5.95775828e+05 10 | 4.58278700e+05 2.44897111e+06 6.88990546e+05 9.20287049e+05 11 | 4.92159041e+06 2.73739991e+06 3.16536914e+06 1.59875019e+07] 12 | [ 1.94852289e+07 2.76219783e+05 2.65520981e+03 3.05103534e+05 13 | 1.98473327e+05 3.64804329e+05 1.19037805e+05 1.98460099e+02 14 | 3.76011874e+05 1.43281935e+05 1.39846581e+06 3.84252682e+05] 15 | [ 1.17533915e+07 3.03635741e+05 5.79421694e+05 4.36884572e+05 16 | 3.06811879e+05 9.98011680e+05 5.08825718e+05 2.75383182e+05 17 | 5.01943100e+06 1.15884764e+06 1.40079467e+06 1.47720209e+04] 18 | [ 1.05481574e+07 3.70822814e+05 6.30403606e+05 7.01379744e+05 19 | 1.69117963e+05 1.67921090e+06 6.68489498e+05 3.92653670e+05 20 | 1.29366132e+03 1.25383449e+06 4.57858763e+06 4.65246631e+05]] 21 | [[ 0.43597457 0.05871326 0.06516699 0.03360034 0.52074744] 22 | [ 0.44059965 0.1652264 0.29512033 0.11415813 0.55129002] 23 | [ 0.71651796 0.14618471 0.76636368 0.58318813 0.27252264] 24 | ..., 25 | [ 1.28562362 0.84010606 0.65675734 0.2187646 0.68153007] 26 | [ 0.78639688 0.40560653 1.21738032 1.17089036 2.06706388] 27 | [ 2.45069957 0.00640682 0.86072825 0.10106403 1.12640551]] 28 | Feature 0 29 | (18541370.141110275, 'XOM') 30 | (7861464.0553792343, 'CVX') 31 | (7000302.8181583285, 'PG') 32 | (5084005.3613334689, 'GOOG') 33 | (3381750.4044293971, 'AMGN') 34 | (2962340.4315599473, 'BP') 35 | (2630571.6923459047, 'DNA') 36 | (2519354.3804378472, 'BIIB') 37 | (2333225.4065250917, 'YHOO') 38 | (2150478.0737609738, 'EXPE') 39 | (2078196.0848287165, 'AVP') 40 | (1755361.1131727577, 'CL') 41 | 42 | [(2.4506995728828622, '18-Oct-05'), (1.7327784403764923, '11-Sep-06'), (1.5111300572258395, '8-Jun-06')] 43 | 44 | Feature 1 45 | (15987501.883808712, 'AMGN') 46 | (4921590.4116128432, 'GOOG') 47 | (4395226.0932264365, 'YHOO') 48 | (3165369.1418494503, 'XOM') 49 | (2737399.9096869556, 'PG') 50 | (2448971.1065134653, 'CVX') 51 | (1017740.6942413859, 'BIIB') 52 | (920287.04939950886, 'EXPE') 53 | (688990.54637332377, 'DNA') 54 | (595775.82846660342, 'BP') 55 | (458278.69976566656, 'CL') 56 | (306456.1727793481, 'AVP') 57 | 58 | [(5.5183934865182875, '15-Feb-06'), (2.138473391072961, '1-Feb-06'), (1.9475044925471119, '26-Jan-06')] 59 | 60 | Feature 2 61 | (19485228.873686153, 'YHOO') 62 | (1398465.8074515802, 'XOM') 63 | (384252.68231490435, 'AMGN') 64 | (376011.87440058013, 'GOOG') 65 | (364804.32850560133, 'CVX') 66 | (305103.53400016041, 'BP') 67 | (276219.78349040612, 'AVP') 68 | (198473.32671485722, 'CL') 69 | (143281.93458262246, 'PG') 70 | (119037.80463716132, 'DNA') 71 | (2655.2098122150296, 'BIIB') 72 | (198.46009910268154, 'EXPE') 73 | 74 | [(8.3018051767438337, '19-Jul-06'), (4.5697390847378792, '19-Sep-06'), (2.506039176128628, '19-Apr-06')] 75 | 76 | Feature 3 77 | (11753391.461576829, 'YHOO') 78 | (5019430.9962252304, 'GOOG') 79 | (1400794.6664170395, 'XOM') 80 | (1158847.6445206082, 'PG') 81 | (998011.67965212127, 'CVX') 82 | (579421.69354580715, 'BIIB') 83 | 
(508825.71818347432, 'DNA') 84 | (436884.5719282077, 'BP') 85 | (306811.8787867761, 'CL') 86 | (303635.74069823755, 'AVP') 87 | (275383.18216351332, 'EXPE') 88 | (14772.020946359988, 'AMGN') 89 | 90 | [(6.9635400795449733, '18-Jan-06'), (4.4080426022720891, '18-Oct-06'), (3.6766225277997848, '20-Jan-06')] 91 | 92 | Feature 4 93 | (10548157.403712066, 'YHOO') 94 | (4578587.6349422066, 'XOM') 95 | (1679210.8955857321, 'CVX') 96 | (1253834.4926672454, 'PG') 97 | (701379.74418841151, 'BP') 98 | (668489.49759360566, 'DNA') 99 | (630403.60590710363, 'BIIB') 100 | (465246.63059152756, 'AMGN') 101 | (392653.67018991744, 'EXPE') 102 | (370822.8136748391, 'AVP') 103 | (169117.96293892173, 'CL') 104 | (1293.6613221068894, 'GOOG') 105 | 106 | [(3.2242716186256213, '19-Jul-06'), (2.4565899212822875, '18-Oct-06'), (2.2169891870590743, '17-Oct-06')] 107 | 108 | -------------------------------------------------------------------------------- /chapter10/newsfeatures.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | 5 | feedlist=['http://today.reuters.com/rss/topNews', 6 | 'http://today.reuters.com/rss/domesticNews', 7 | 'http://today.reuters.com/rss/worldNews', 8 | 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml', 9 | 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml', 10 | 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml', 11 | 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml', 12 | 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 13 | 'http://www.nytimes.com/services/xml/rss/nyt/International.xml', 14 | 'http://news.google.com/?output=rss', 15 | 'http://feeds.salon.com/salon/news', 16 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss', 17 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss', 18 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss', 19 | 'http://rss.cnn.com/rss/edition.rss', 20 | 'http://rss.cnn.com/rss/edition_world.rss', 21 | 'http://rss.cnn.com/rss/edition_us.rss'] 22 | 23 | def stripHTML(h): 24 | p='' 25 | s=0 26 | for c in h: 27 | if c=='<': s=1 28 | elif c=='>': 29 | s=0 30 | p+=' ' 31 | elif s==0: p+=c 32 | return p 33 | 34 | 35 | def separatewords(text): 36 | splitter=re.compile('\\W*') 37 | return [s.lower() for s in splitter.split(text) if len(s)>3] 38 | 39 | def getarticlewords(): 40 | allwords={} 41 | articlewords=[] 42 | articletitles=[] 43 | ec=0 44 | # Loop over every feed 45 | for feed in feedlist: 46 | f=feedparser.parse(feed) 47 | 48 | # Loop over every article 49 | for e in f.entries: 50 | # Ignore identical articles 51 | if e.title in articletitles: continue 52 | 53 | # Extract the words 54 | txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8')) 55 | words=separatewords(txt) 56 | articlewords.append({}) 57 | articletitles.append(e.title) 58 | 59 | # Increase the counts for this word in allwords and in articlewords 60 | for word in words: 61 | allwords.setdefault(word,0) 62 | allwords[word]+=1 63 | articlewords[ec].setdefault(word,0) 64 | articlewords[ec][word]+=1 65 | ec+=1 66 | return allwords,articlewords,articletitles 67 | 68 | def makematrix(allw,articlew): 69 | wordvec=[] 70 | 71 | # Only take words that are common but not too common 72 | for w,c in allw.items(): 73 | if c>3 and c0: return 0 62 | else: return 1 63 | 64 | def yesno(v): 65 | if v=='yes': return 1 66 | elif v=='no': return -1 67 | else: return 0 68 | 69 | def matchcount(interest1,interest2): 70 | l1=interest1.split(':') 71 | l2=interest2.split(':') 72 | x=0 73 | for v in l1: 
74 | if v in l2: x+=1 75 | return x 76 | 77 | yahookey="YOUR API KEY" 78 | from xml.dom.minidom import parseString 79 | from urllib import urlopen,quote_plus 80 | 81 | loc_cache={} 82 | def getlocation(address): 83 | if address in loc_cache: return loc_cache[address] 84 | data=urlopen('http://api.local.yahoo.com/MapsService/V1/'+\ 85 | 'geocode?appid=%s&location=%s' % 86 | (yahookey,quote_plus(address))).read() 87 | doc=parseString(data) 88 | lat=doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue 89 | long=doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue 90 | loc_cache[address]=(float(lat),float(long)) 91 | return loc_cache[address] 92 | 93 | def milesdistance(a1,a2): 94 | lat1,long1=getlocation(a1) 95 | lat2,long2=getlocation(a2) 96 | latdif=69.1*(lat2-lat1) 97 | longdif=53.0*(long2-long1) 98 | return (latdif**2+longdif**2)**.5 99 | 100 | def loadnumerical(): 101 | oldrows=loadmatch('matchmaker.csv') 102 | newrows=[] 103 | for row in oldrows: 104 | d=row.data 105 | data=[float(d[0]),yesno(d[1]),yesno(d[2]), 106 | float(d[5]),yesno(d[6]),yesno(d[7]), 107 | matchcount(d[3],d[8]), 108 | milesdistance(d[4],d[9]), 109 | row.match] 110 | newrows.append(matchrow(data)) 111 | return newrows 112 | 113 | def scaledata(rows): 114 | low=[999999999.0]*len(rows[0].data) 115 | high=[-999999999.0]*len(rows[0].data) 116 | # Find the lowest and highest values 117 | for row in rows: 118 | d=row.data 119 | for i in range(len(d)): 120 | if d[i]high[i]: high[i]=d[i] 122 | 123 | # Create a function that scales data 124 | def scaleinput(d): 125 | return [(d[i]-low[i])/(high[i]-low[i]) 126 | for i in range(len(low))] 127 | 128 | # Scale all the data 129 | newrows=[matchrow(scaleinput(row.data)+[row.match]) 130 | for row in rows] 131 | 132 | # Return the new data and the function 133 | return newrows,scaleinput 134 | 135 | 136 | def rbf(v1,v2,gamma=10): 137 | dv=[v1[i]-v2[i] for i in range(len(v1))] 138 | l=veclength(dv) 139 | return math.e**(-gamma*l) 140 | 141 | def nlclassify(point,rows,offset,gamma=10): 142 | sum0=0.0 143 | sum1=0.0 144 | count0=0 145 | count1=0 146 | 147 | for row in rows: 148 | if row.match==0: 149 | sum0+=rbf(point,row.data,gamma) 150 | count0+=1 151 | else: 152 | sum1+=rbf(point,row.data,gamma) 153 | count1+=1 154 | y=(1.0/count0)*sum0-(1.0/count1)*sum1+offset 155 | 156 | if y>0: return 0 157 | else: return 1 158 | 159 | def getoffset(rows,gamma=10): 160 | l0=[] 161 | l1=[] 162 | for row in rows: 163 | if row.match==0: l0.append(row.data) 164 | else: l1.append(row.data) 165 | sum0=sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0) 166 | sum1=sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1) 167 | 168 | return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0 169 | -------------------------------------------------------------------------------- /chapter9/facebook.py: -------------------------------------------------------------------------------- 1 | import urllib,md5,webbrowser,time 2 | from xml.dom.minidom import parseString 3 | 4 | apikey="47e953c8ea9ed30db904af453125c759" 5 | secret="ea703e4721e8c7bf88b92110a46a9b06" 6 | FacebookURL = "https://api.facebook.com/restserver.php" 7 | 8 | def getsinglevalue(node,tag): 9 | nl=node.getElementsByTagName(tag) 10 | if len(nl)>0: 11 | tagNode=nl[0] 12 | if tagNode.hasChildNodes(): 13 | return tagNode.firstChild.nodeValue 14 | return '' 15 | 16 | def callid(): 17 | return str(int(time.time()*10)) 18 | 19 | class fbsession: 20 | def __init__(self): 21 | self.session_secret=None 22 | self.session_key=None 23 | 
self.createtoken() 24 | webbrowser.open(self.getlogin()) 25 | print "Press enter after logging in:", 26 | raw_input() 27 | self.getsession() 28 | def sendrequest(self, args): 29 | args['api_key'] = apikey 30 | args['sig'] = self.makehash(args) 31 | post_data = urllib.urlencode(args) 32 | url = FacebookURL + "?" + post_data 33 | data=urllib.urlopen(url).read() 34 | print data 35 | return parseString(data) 36 | def makehash(self,args): 37 | hasher = md5.new(''.join([x + '=' + args[x] for x in sorted(args.keys())])) 38 | if self.session_secret: hasher.update(self.session_secret) 39 | else: hasher.update(secret) 40 | return hasher.hexdigest() 41 | def createtoken(self): 42 | res = self.sendrequest({'method':"facebook.auth.createToken"}) 43 | self.token = getsinglevalue(res,'token') 44 | def getlogin(self): 45 | return "http://api.facebook.com/login.php?api_key="+apikey+\ 46 | "&auth_token=" + self.token 47 | def getsession(self): 48 | doc=self.sendrequest({'method':'facebook.auth.getSession', 49 | 'auth_token':self.token}) 50 | self.session_key=getsinglevalue(doc,'session_key') 51 | self.session_secret=getsinglevalue(doc,'secret') 52 | def getfriends(self): 53 | doc=self.sendrequest({'method':'facebook.friends.get', 54 | 'session_key':self.session_key,'call_id':callid()}) 55 | results=[] 56 | for n in doc.getElementsByTagName('result_elt'): 57 | results.append(n.firstChild.nodeValue) 58 | return results 59 | 60 | def getinfo(self,users): 61 | ulist=','.join(users) 62 | 63 | fields='gender,current_location,relationship_status,'+\ 64 | 'affiliations,hometown_location' 65 | 66 | doc=self.sendrequest({'method':'facebook.users.getInfo', 67 | 'session_key':self.session_key,'call_id':callid(), 68 | 'users':ulist,'fields':fields}) 69 | 70 | results={} 71 | for n,id in zip(doc.getElementsByTagName('result_elt'),users): 72 | # Get the location 73 | locnode=n.getElementsByTagName('hometown_location')[0] 74 | loc=getsinglevalue(locnode,'city')+', '+getsinglevalue(locnode,'state') 75 | 76 | # Get school 77 | college='' 78 | gradyear='0' 79 | affiliations=n.getElementsByTagName('affiliations_elt') 80 | for aff in affiliations: 81 | # Type 1 is college 82 | if getsinglevalue(aff,'type')=='1': 83 | college=getsinglevalue(aff,'name') 84 | gradyear=getsinglevalue(aff,'year') 85 | 86 | results[id]={'gender':getsinglevalue(n,'gender'), 87 | 'status':getsinglevalue(n,'relationship_status'), 88 | 'location':loc,'college':college,'year':gradyear} 89 | return results 90 | 91 | def arefriends(self,idlist1,idlist2): 92 | id1=','.join(idlist1) 93 | id2=','.join(idlist2) 94 | doc=self.sendrequest({'method':'facebook.friends.areFriends', 95 | 'session_key':self.session_key,'call_id':callid(), 96 | 'id1':id1,'id2':id2}) 97 | results=[] 98 | for n in doc.getElementsByTagName('result_elt'): 99 | results.append(int(n.firstChild.nodeValue)) 100 | return results 101 | 102 | 103 | 104 | def makedataset(self): 105 | from advancedclassify import milesdistance 106 | # Get all the info for all my friends 107 | friends=self.getfriends() 108 | info=self.getinfo(friends) 109 | ids1,ids2=[],[] 110 | rows=[] 111 | 112 | # Nested loop to look at every pair of friends 113 | for i in range(len(friends)): 114 | f1=friends[i] 115 | data1=info[f1] 116 | 117 | # Start at i+1 so we don't double up 118 | for j in range(i+1,len(friends)): 119 | f2=friends[j] 120 | data2=info[f2] 121 | ids1.append(f1) 122 | ids2.append(f2) 123 | 124 | # Generate some numbers from the data 125 | if data1['college']==data2['college']: sameschool=1 126 | else: 
sameschool=0 127 | male1=(data1['gender']=='Male') and 1 or 0 128 | male2=(data2['gender']=='Male') and 1 or 0 129 | 130 | row=[male1,int(data1['year']),male2,int(data2['year']),sameschool] 131 | rows.append(row) 132 | # Call arefriends in blocks for every pair of people 133 | arefriends=[] 134 | for i in range(0,len(ids1),30): 135 | j=min(i+30,len(ids1)) 136 | pa=self.arefriends(ids1[i:j],ids2[i:j]) 137 | arefriends+=pa 138 | return arefriends,rows 139 | 140 | -------------------------------------------------------------------------------- /chapter8/ebaypredict.py: -------------------------------------------------------------------------------- 1 | import httplib 2 | from xml.dom.minidom import parse, parseString, Node 3 | 4 | devKey = 'YOUR DEV KEY' 5 | appKey = 'YOUR APP KEY' 6 | certKey = 'YOUR CERT KEY' 7 | serverUrl = 'api.ebay.com' 8 | userToken = 'YOUR TOKEN' 9 | 10 | def getHeaders(apicall,siteID="0",compatabilityLevel = "433"): 11 | headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel, 12 | "X-EBAY-API-DEV-NAME": devKey, 13 | "X-EBAY-API-APP-NAME": appKey, 14 | "X-EBAY-API-CERT-NAME": certKey, 15 | "X-EBAY-API-CALL-NAME": apicall, 16 | "X-EBAY-API-SITEID": siteID, 17 | "Content-Type": "text/xml"} 18 | return headers 19 | 20 | def sendRequest(apicall,xmlparameters): 21 | connection = httplib.HTTPSConnection(serverUrl) 22 | connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall)) 23 | response = connection.getresponse() 24 | if response.status != 200: 25 | print "Error sending request:" + response.reason 26 | else: 27 | data = response.read() 28 | connection.close() 29 | return data 30 | 31 | def getSingleValue(node,tag): 32 | nl=node.getElementsByTagName(tag) 33 | if len(nl)>0: 34 | tagNode=nl[0] 35 | if tagNode.hasChildNodes(): 36 | return tagNode.firstChild.nodeValue 37 | return '-1' 38 | 39 | 40 | def doSearch(query,categoryID=None,page=1): 41 | xml = ""+\ 42 | ""+\ 43 | "" +\ 44 | userToken +\ 45 | "" + \ 46 | ""+\ 47 | "200"+\ 48 | ""+str(page)+""+\ 49 | ""+\ 50 | "" + query + "" 51 | if categoryID!=None: 52 | xml+=""+str(categoryID)+"" 53 | xml+="" 54 | 55 | data=sendRequest('GetSearchResults',xml) 56 | response = parseString(data) 57 | itemNodes = response.getElementsByTagName('Item'); 58 | results = [] 59 | for item in itemNodes: 60 | itemId=getSingleValue(item,'ItemID') 61 | itemTitle=getSingleValue(item,'Title') 62 | itemPrice=getSingleValue(item,'CurrentPrice') 63 | itemEnds=getSingleValue(item,'EndTime') 64 | results.append((itemId,itemTitle,itemPrice,itemEnds)) 65 | return results 66 | 67 | 68 | def getCategory(query='',parentID=None,siteID='0'): 69 | lquery=query.lower() 70 | xml = ""+\ 71 | ""+\ 72 | "" +\ 73 | userToken +\ 74 | ""+\ 75 | "ReturnAll"+\ 76 | "true"+\ 77 | ""+siteID+"" 78 | if parentID==None: 79 | xml+="1" 80 | else: 81 | xml+=""+str(parentID)+"" 82 | xml += "" 83 | data=sendRequest('GetCategories',xml) 84 | categoryList=parseString(data) 85 | catNodes=categoryList.getElementsByTagName('Category') 86 | for node in catNodes: 87 | catid=getSingleValue(node,'CategoryID') 88 | name=getSingleValue(node,'CategoryName') 89 | if name.lower().find(lquery)!=-1: 90 | print catid,name 91 | 92 | def getItem(itemID): 93 | xml = ""+\ 94 | ""+\ 95 | "" +\ 96 | userToken +\ 97 | "" + \ 98 | "" + str(itemID) + ""+\ 99 | "ItemReturnAttributes"+\ 100 | "" 101 | data=sendRequest('GetItem',xml) 102 | result={} 103 | response=parseString(data) 104 | result['title']=getSingleValue(response,'Title') 105 | sellingStatusNode = 
response.getElementsByTagName('SellingStatus')[0]; 106 | result['price']=getSingleValue(sellingStatusNode,'CurrentPrice') 107 | result['bids']=getSingleValue(sellingStatusNode,'BidCount') 108 | seller = response.getElementsByTagName('Seller') 109 | result['feedback'] = getSingleValue(seller[0],'FeedbackScore') 110 | 111 | attributeSet=response.getElementsByTagName('Attribute'); 112 | attributes={} 113 | for att in attributeSet: 114 | attID=att.attributes.getNamedItem('attributeID').nodeValue 115 | attValue=getSingleValue(att,'ValueLiteral') 116 | attributes[attID]=attValue 117 | result['attributes']=attributes 118 | return result 119 | 120 | 121 | def makeLaptopDataset(): 122 | searchResults=doSearch('laptop',categoryID=51148) 123 | result=[] 124 | for r in searchResults: 125 | item=getItem(r[0]) 126 | att=item['attributes'] 127 | try: 128 | data=(float(att['12']),float(att['26444']), 129 | float(att['26446']),float(att['25710']), 130 | float(item['feedback']) 131 | ) 132 | entry={'input':data,'result':float(item['price'])} 133 | result.append(entry) 134 | except: 135 | print item['title']+' failed' 136 | return result 137 | -------------------------------------------------------------------------------- /chapter8/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | """ 17 | for line in file('schedule.txt'): 18 | origin,dest,depart,arrive,price=line.strip().split(',') 19 | flights.setdefault((origin,dest),[]) 20 | 21 | # Add details to the list of possible flights 22 | flights[(origin,dest)].append((depart,arrive,int(price))) 23 | """ 24 | def getminutes(t): 25 | x=time.strptime(t,'%H:%M') 26 | return x[3]*60+x[4] 27 | 28 | def printschedule(r): 29 | for d in range(len(r)/2): 30 | name=people[d][0] 31 | origin=people[d][1] 32 | out=flights[(origin,destination)][int(r[d])] 33 | ret=flights[(destination,origin)][int(r[d+1])] 34 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 35 | out[0],out[1],out[2], 36 | ret[0],ret[1],ret[2]) 37 | 38 | def schedulecost(sol): 39 | totalprice=0 40 | latestarrival=0 41 | earliestdep=24*60 42 | 43 | for d in range(len(sol)/2): 44 | # Get the inbound and outbound flights 45 | origin=people[d][1] 46 | outbound=flights[(origin,destination)][int(sol[d])] 47 | returnf=flights[(destination,origin)][int(sol[d+1])] 48 | 49 | # Total price is the price of all outbound and return flights 50 | totalprice+=outbound[2] 51 | totalprice+=returnf[2] 52 | 53 | # Track the latest arrival and earliest departure 54 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 56 | 57 | # Every person must wait at the airport until the latest person arrives. 58 | # They also must arrive at the same time and wait for their flights. 59 | totalwait=0 60 | for d in range(len(sol)/2): 61 | origin=people[d][1] 62 | outbound=flights[(origin,destination)][int(sol[d])] 63 | returnf=flights[(destination,origin)][int(sol[d+1])] 64 | totalwait+=latestarrival-getminutes(outbound[1]) 65 | totalwait+=getminutes(returnf[0])-earliestdep 66 | 67 | # Does this solution require an extra day of car rental? That'll be $50! 
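# --- Editorial note (not part of the original optimization.py) ---
# schedulecost() returns totalprice+totalwait: the fares of all outbound and
# return flights, the minutes spent waiting at the airports, and the $50
# car-rental surcharge added on the next line when the latest arrival falls
# after the earliest departure. The optimizers defined further down minimize
# this cost over a vector of flight indices; hypothetically, once the flights
# dict has been filled from schedule.txt (the loader loop above is commented
# out in this copy):
#   domain = [(0,9)] * (len(people)*2)   # schedule.txt lists 10 flights per leg
#   s = randomoptimize(domain, schedulecost)
#   printschedule(s)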
68 | if latestarrival>earliestdep: totalprice+=50 69 | 70 | return totalprice+totalwait 71 | 72 | def randomoptimize(domain,costf): 73 | best=999999999 74 | bestr=None 75 | for i in range(0,1000): 76 | # Create a random solution 77 | r=[float(random.randint(domain[i][0],domain[i][1])) 78 | for i in range(len(domain))] 79 | 80 | # Get the cost 81 | cost=costf(r) 82 | 83 | # Compare it to the best one so far 84 | if cost0.1: 96 | # Choose one of the indices 97 | i=random.randint(0,len(domain)-1) 98 | 99 | # Choose a direction to change it 100 | dir=random.randint(-step,step) 101 | 102 | # Create a new list with one of the values changed 103 | vecb=vec[:] 104 | vecb[i]+=dir 105 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 107 | 108 | # Calculate the current cost and the new cost 109 | ea=costf(vec) 110 | eb=costf(vecb) 111 | p=pow(math.e,(-eb-ea)/T) 112 | 113 | print vec,ea 114 | 115 | 116 | # Is it better, or does it make the probability 117 | # cutoff? 118 | if (ebmaxv: v[i][d]=maxv 160 | elif v[i][d]<-maxv: v[i][d]=-maxv 161 | 162 | # constrain bounds of solutions 163 | x[i][d]+=v[i][d] 164 | if x[i][d]domain[d][1]: x[i][d]=domain[d][1] 166 | 167 | print p[g],costf(p[g]) 168 | return p[g] 169 | -------------------------------------------------------------------------------- /chapter8/numpredict.py: -------------------------------------------------------------------------------- 1 | from random import random,randint 2 | import math 3 | 4 | def wineprice(rating,age): 5 | peak_age=rating-50 6 | 7 | # Calculate price based on rating 8 | price=rating/2 9 | if age>peak_age: 10 | # Past its peak, goes bad in 10 years 11 | price=price*(5-(age-peak_age)/2) 12 | else: 13 | # Increases to 5x original value as it 14 | # approaches its peak 15 | price=price*(5*((age+1)/peak_age)) 16 | if price<0: price=0 17 | return price 18 | 19 | 20 | def wineset1(): 21 | rows=[] 22 | for i in range(300): 23 | # Create a random age and rating 24 | rating=random()*50+50 25 | age=random()*50 26 | 27 | # Get reference price 28 | price=wineprice(rating,age) 29 | 30 | # Add some noise 31 | price*=(random()*0.2+0.9) 32 | 33 | # Add to the dataset 34 | rows.append({'input':(rating,age), 35 | 'result':price}) 36 | return rows 37 | 38 | def euclidean(v1,v2): 39 | d=0.0 40 | for i in range(len(v1)): 41 | d+=(v1[i]-v2[i])**2 42 | return math.sqrt(d) 43 | 44 | 45 | def getdistances(data,vec1): 46 | distancelist=[] 47 | 48 | # Loop over every item in the dataset 49 | for i in range(len(data)): 50 | vec2=data[i]['input'] 51 | 52 | # Add the distance and the index 53 | distancelist.append((euclidean(vec1,vec2),i)) 54 | 55 | # Sort by distance 56 | distancelist.sort() 57 | return distancelist 58 | 59 | def knnestimate(data,vec1,k=5): 60 | # Get sorted distances 61 | dlist=getdistances(data,vec1) 62 | avg=0.0 63 | 64 | # Take the average of the top k results 65 | for i in range(k): 66 | idx=dlist[i][1] 67 | avg+=data[idx]['result'] 68 | avg=avg/k 69 | return avg 70 | 71 | def inverseweight(dist,num=1.0,const=0.1): 72 | return num/(dist+const) 73 | 74 | def subtractweight(dist,const=1.0): 75 | if dist>const: 76 | return 0 77 | else: 78 | return const-dist 79 | 80 | def gaussian(dist,sigma=5.0): 81 | return math.e**(-dist**2/(2*sigma**2)) 82 | 83 | def weightedknn(data,vec1,k=5,weightf=gaussian): 84 | # Get distances 85 | dlist=getdistances(data,vec1) 86 | avg=0.0 87 | totalweight=0.0 88 | 89 | # Get weighted average 90 | for i in range(k): 91 | dist=dlist[i][0] 92 | idx=dlist[i][1] 93 | weight=weightf(dist) 94 | 
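# --- Editorial note (not part of the original numpredict.py) ---
# Each of the k nearest neighbours contributes weightf(dist) times its price,
# so with the default gaussian weight nearby wines dominate the estimate while
# distant ones fade smoothly toward zero; inverseweight and subtractweight
# above are the alternative weighting functions. A hypothetical use:
#   data = wineset1()
#   weightedknn(data, (95.0, 3.0))                          # gaussian-weighted estimate
#   weightedknn(data, (95.0, 3.0), weightf=inverseweight)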
avg+=weight*data[idx]['result'] 95 | totalweight+=weight 96 | if totalweight==0: return 0 97 | avg=avg/totalweight 98 | return avg 99 | 100 | def dividedata(data,test=0.05): 101 | trainset=[] 102 | testset=[] 103 | for row in data: 104 | if random()=low and v<=high: 176 | nweight+=weight 177 | tweight+=weight 178 | if tweight==0: return 0 179 | 180 | # The probability is the weights in the range 181 | # divided by all the weights 182 | return nweight/tweight 183 | 184 | from pylab import * 185 | 186 | def cumulativegraph(data,vec1,high,k=5,weightf=gaussian): 187 | t1=arange(0.0,high,0.1) 188 | cprob=array([probguess(data,vec1,0,v,k,weightf) for v in t1]) 189 | plot(t1,cprob) 190 | show() 191 | 192 | 193 | def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0): 194 | # Make a range for the prices 195 | t1=arange(0.0,high,0.1) 196 | 197 | # Get the probabilities for the entire range 198 | probs=[probguess(data,vec1,v,v+0.1,k,weightf) for v in t1] 199 | 200 | # Smooth them by adding the gaussian of the nearby probabilites 201 | smoothed=[] 202 | for i in range(len(probs)): 203 | sv=0.0 204 | for j in range(0,len(probs)): 205 | dist=abs(i-j)*0.1 206 | weight=gaussian(dist,sigma=ss) 207 | sv+=weight*probs[j] 208 | smoothed.append(sv) 209 | smoothed=array(smoothed) 210 | 211 | plot(t1,smoothed) 212 | show() 213 | -------------------------------------------------------------------------------- /chapter2/recommendations.py: -------------------------------------------------------------------------------- 1 | # A dictionary of movie critics and their ratings of a small 2 | # set of movies 3 | critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5, 4 | 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 5 | 'The Night Listener': 3.0}, 6 | 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 7 | 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 8 | 'You, Me and Dupree': 3.5}, 9 | 'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0, 10 | 'Superman Returns': 3.5, 'The Night Listener': 4.0}, 11 | 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 12 | 'The Night Listener': 4.5, 'Superman Returns': 4.0, 13 | 'You, Me and Dupree': 2.5}, 14 | 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 15 | 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0, 16 | 'You, Me and Dupree': 2.0}, 17 | 'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 18 | 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5}, 19 | 'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}} 20 | 21 | 22 | from math import sqrt 23 | 24 | # Returns a distance-based similarity score for person1 and person2 25 | def sim_distance(prefs,person1,person2): 26 | # Get the list of shared_items 27 | si={} 28 | for item in prefs[person1]: 29 | if item in prefs[person2]: si[item]=1 30 | 31 | # if they have no ratings in common, return 0 32 | if len(si)==0: return 0 33 | 34 | # Add up the squares of all the differences 35 | sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 36 | for item in prefs[person1] if item in prefs[person2]]) 37 | 38 | return 1/(1+sum_of_squares) 39 | 40 | # Returns the Pearson correlation coefficient for p1 and p2 41 | def sim_pearson(prefs,p1,p2): 42 | # Get the list of mutually rated items 43 | si={} 44 | for item in prefs[p1]: 45 | if item in prefs[p2]: si[item]=1 46 | 47 | # if they 
are no ratings in common, return 0 48 | if len(si)==0: return 0 49 | 50 | # Sum calculations 51 | n=len(si) 52 | 53 | # Sums of all the preferences 54 | sum1=sum([prefs[p1][it] for it in si]) 55 | sum2=sum([prefs[p2][it] for it in si]) 56 | 57 | # Sums of the squares 58 | sum1Sq=sum([pow(prefs[p1][it],2) for it in si]) 59 | sum2Sq=sum([pow(prefs[p2][it],2) for it in si]) 60 | 61 | # Sum of the products 62 | pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si]) 63 | 64 | # Calculate r (Pearson score) 65 | num=pSum-(sum1*sum2/n) 66 | den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n)) 67 | if den==0: return 0 68 | 69 | r=num/den 70 | 71 | return r 72 | 73 | # Returns the best matches for person from the prefs dictionary. 74 | # Number of results and similarity function are optional params. 75 | def topMatches(prefs,person,n=5,similarity=sim_pearson): 76 | scores=[(similarity(prefs,person,other),other) 77 | for other in prefs if other!=person] 78 | scores.sort() 79 | scores.reverse() 80 | return scores[0:n] 81 | 82 | # Gets recommendations for a person by using a weighted average 83 | # of every other user's rankings 84 | def getRecommendations(prefs,person,similarity=sim_pearson): 85 | totals={} 86 | simSums={} 87 | for other in prefs: 88 | # don't compare me to myself 89 | if other==person: continue 90 | sim=similarity(prefs,person,other) 91 | 92 | # ignore scores of zero or lower 93 | if sim<=0: continue 94 | for item in prefs[other]: 95 | 96 | # only score movies I haven't seen yet 97 | if item not in prefs[person] or prefs[person][item]==0: 98 | # Similarity * Score 99 | totals.setdefault(item,0) 100 | totals[item]+=prefs[other][item]*sim 101 | # Sum of similarities 102 | simSums.setdefault(item,0) 103 | simSums[item]+=sim 104 | 105 | # Create the normalized list 106 | rankings=[(total/simSums[item],item) for item,total in totals.items()] 107 | 108 | # Return the sorted list 109 | rankings.sort() 110 | rankings.reverse() 111 | return rankings 112 | 113 | def transformPrefs(prefs): 114 | result={} 115 | for person in prefs: 116 | for item in prefs[person]: 117 | result.setdefault(item,{}) 118 | 119 | # Flip item and person 120 | result[item][person]=prefs[person][item] 121 | return result 122 | 123 | 124 | def calculateSimilarItems(prefs,n=10): 125 | # Create a dictionary of items showing which other items they 126 | # are most similar to. 
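  # This is the precomputation step for item-based filtering: build the
  # item-to-item similarity table once (it changes slowly as new ratings
  # arrive) and reuse it in getRecommendedItems. A minimal usage sketch,
  # assuming the module is imported as recommendations:
  #   itemsim = recommendations.calculateSimilarItems(recommendations.critics, n=10)
  #   recommendations.getRecommendedItems(recommendations.critics, itemsim, 'Toby')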
127 | result={} 128 | # Invert the preference matrix to be item-centric 129 | itemPrefs=transformPrefs(prefs) 130 | c=0 131 | for item in itemPrefs: 132 | # Status updates for large datasets 133 | c+=1 134 | if c%100==0: print "%d / %d" % (c,len(itemPrefs)) 135 | # Find the most similar items to this one 136 | scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance) 137 | result[item]=scores 138 | return result 139 | 140 | def getRecommendedItems(prefs,itemMatch,user): 141 | userRatings=prefs[user] 142 | scores={} 143 | totalSim={} 144 | # Loop over items rated by this user 145 | for (item,rating) in userRatings.items( ): 146 | 147 | # Loop over items similar to this one 148 | for (similarity,item2) in itemMatch[item]: 149 | 150 | # Ignore if this user has already rated this item 151 | if item2 in userRatings: continue 152 | # Weighted sum of rating times similarity 153 | scores.setdefault(item2,0) 154 | scores[item2]+=similarity*rating 155 | # Sum of all the similarities 156 | totalSim.setdefault(item2,0) 157 | totalSim[item2]+=similarity 158 | 159 | # Divide each total score by total weighting to get an average 160 | rankings=[(score/totalSim[item],item) for item,score in scores.items( )] 161 | 162 | # Return the rankings from highest to lowest 163 | rankings.sort( ) 164 | rankings.reverse( ) 165 | return rankings 166 | 167 | def loadMovieLens(path='/data/movielens'): 168 | # Get movie titles 169 | movies={} 170 | for line in open(path+'/u.item'): 171 | (id,title)=line.split('|')[0:2] 172 | movies[id]=title 173 | 174 | # Load data 175 | prefs={} 176 | for line in open(path+'/u.data'): 177 | (user,movieid,rating,ts)=line.split('\t') 178 | prefs.setdefault(user,{}) 179 | prefs[user][movies[movieid]]=float(rating) 180 | return prefs 181 | -------------------------------------------------------------------------------- /chapter4/nn.py: -------------------------------------------------------------------------------- 1 | from math import tanh 2 | from pysqlite2 import dbapi2 as sqlite 3 | 4 | def dtanh(y): 5 | return 1.0-y*y 6 | 7 | class searchnet: 8 | def __init__(self,dbname): 9 | self.con=sqlite.connect(dbname) 10 | 11 | def __del__(self): 12 | self.con.close() 13 | 14 | def maketables(self): 15 | self.con.execute('create table hiddennode(create_key)') 16 | self.con.execute('create table wordhidden(fromid,toid,strength)') 17 | self.con.execute('create table hiddenurl(fromid,toid,strength)') 18 | self.con.commit() 19 | 20 | def getstrength(self,fromid,toid,layer): 21 | if layer==0: table='wordhidden' 22 | else: table='hiddenurl' 23 | res=self.con.execute('select strength from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 24 | if res==None: 25 | if layer==0: return -0.2 26 | if layer==1: return 0 27 | return res[0] 28 | 29 | def setstrength(self,fromid,toid,layer,strength): 30 | if layer==0: table='wordhidden' 31 | else: table='hiddenurl' 32 | res=self.con.execute('select rowid from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 33 | if res==None: 34 | self.con.execute('insert into %s (fromid,toid,strength) values (%d,%d,%f)' % (table,fromid,toid,strength)) 35 | else: 36 | rowid=res[0] 37 | self.con.execute('update %s set strength=%f where rowid=%d' % (table,strength,rowid)) 38 | 39 | def generatehiddennode(self,wordids,urls): 40 | if len(wordids)>3: return None 41 | # Check if we already created a node for this set of words 42 | sorted_words=[str(id) for id in wordids] 43 | sorted_words.sort() 44 | createkey='_'.join(sorted_words) 45 | 
res=self.con.execute( 46 | "select rowid from hiddennode where create_key='%s'" % createkey).fetchone() 47 | 48 | # If not, create it 49 | if res==None: 50 | cur=self.con.execute( 51 | "insert into hiddennode (create_key) values ('%s')" % createkey) 52 | hiddenid=cur.lastrowid 53 | # Put in some default weights 54 | for wordid in wordids: 55 | self.setstrength(wordid,hiddenid,0,1.0/len(wordids)) 56 | for urlid in urls: 57 | self.setstrength(hiddenid,urlid,1,0.1) 58 | self.con.commit() 59 | 60 | def getallhiddenids(self,wordids,urlids): 61 | l1={} 62 | for wordid in wordids: 63 | cur=self.con.execute( 64 | 'select toid from wordhidden where fromid=%d' % wordid) 65 | for row in cur: l1[row[0]]=1 66 | for urlid in urlids: 67 | cur=self.con.execute( 68 | 'select fromid from hiddenurl where toid=%d' % urlid) 69 | for row in cur: l1[row[0]]=1 70 | return l1.keys() 71 | 72 | def setupnetwork(self,wordids,urlids): 73 | # value lists 74 | self.wordids=wordids 75 | self.hiddenids=self.getallhiddenids(wordids,urlids) 76 | self.urlids=urlids 77 | 78 | # node outputs 79 | self.ai = [1.0]*len(self.wordids) 80 | self.ah = [1.0]*len(self.hiddenids) 81 | self.ao = [1.0]*len(self.urlids) 82 | 83 | # create weights matrix 84 | self.wi = [[self.getstrength(wordid,hiddenid,0) 85 | for hiddenid in self.hiddenids] 86 | for wordid in self.wordids] 87 | self.wo = [[self.getstrength(hiddenid,urlid,1) 88 | for urlid in self.urlids] 89 | for hiddenid in self.hiddenids] 90 | 91 | def feedforward(self): 92 | # the only inputs are the query words 93 | for i in range(len(self.wordids)): 94 | self.ai[i] = 1.0 95 | 96 | # hidden activations 97 | for j in range(len(self.hiddenids)): 98 | sum = 0.0 99 | for i in range(len(self.wordids)): 100 | sum = sum + self.ai[i] * self.wi[i][j] 101 | self.ah[j] = tanh(sum) 102 | 103 | # output activations 104 | for k in range(len(self.urlids)): 105 | sum = 0.0 106 | for j in range(len(self.hiddenids)): 107 | sum = sum + self.ah[j] * self.wo[j][k] 108 | self.ao[k] = tanh(sum) 109 | 110 | return self.ao[:] 111 | 112 | def getresult(self,wordids,urlids): 113 | self.setupnetwork(wordids,urlids) 114 | return self.feedforward() 115 | 116 | def backPropagate(self, targets, N=0.5): 117 | # calculate errors for output 118 | output_deltas = [0.0] * len(self.urlids) 119 | for k in range(len(self.urlids)): 120 | error = targets[k]-self.ao[k] 121 | output_deltas[k] = dtanh(self.ao[k]) * error 122 | 123 | # calculate errors for hidden layer 124 | hidden_deltas = [0.0] * len(self.hiddenids) 125 | for j in range(len(self.hiddenids)): 126 | error = 0.0 127 | for k in range(len(self.urlids)): 128 | error = error + output_deltas[k]*self.wo[j][k] 129 | hidden_deltas[j] = dtanh(self.ah[j]) * error 130 | 131 | # update output weights 132 | for j in range(len(self.hiddenids)): 133 | for k in range(len(self.urlids)): 134 | change = output_deltas[k]*self.ah[j] 135 | self.wo[j][k] = self.wo[j][k] + N*change 136 | 137 | # update input weights 138 | for i in range(len(self.wordids)): 139 | for j in range(len(self.hiddenids)): 140 | change = hidden_deltas[j]*self.ai[i] 141 | self.wi[i][j] = self.wi[i][j] + N*change 142 | 143 | def trainquery(self,wordids,urlids,selectedurl): 144 | # generate a hidden node if necessary 145 | self.generatehiddennode(wordids,urlids) 146 | 147 | self.setupnetwork(wordids,urlids) 148 | self.feedforward() 149 | targets=[0.0]*len(urlids) 150 | targets[urlids.index(selectedurl)]=1.0 151 | error = self.backPropagate(targets) 152 | self.updatedatabase() 153 | 154 | def updatedatabase(self): 
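    # trainquery backpropagates toward a target of 1.0 for the clicked URL and
    # then calls this method, which writes the in-memory wi/wo weights back to
    # the wordhidden and hiddenurl tables so that learning persists between
    # sessions. A minimal usage sketch, with made-up word and URL row ids:
    #   net = searchnet('nn.db'); net.maketables()
    #   net.trainquery([101, 103], [201, 202, 203], 201)
    #   net.getresult([101, 103], [201, 202, 203])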
155 | # set them to database values 156 | for i in range(len(self.wordids)): 157 | for j in range(len(self.hiddenids)): 158 | self.setstrength(self.wordids[i],self. hiddenids[j],0,self.wi[i][j]) 159 | for j in range(len(self.hiddenids)): 160 | for k in range(len(self.urlids)): 161 | self.setstrength(self.hiddenids[j],self.urlids[k],1,self.wo[j][k]) 162 | self.con.commit() 163 | -------------------------------------------------------------------------------- /chapter5/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | for line in file('schedule.txt'): 17 | origin,dest,depart,arrive,price=line.strip().split(',') 18 | flights.setdefault((origin,dest),[]) 19 | 20 | # Add details to the list of possible flights 21 | flights[(origin,dest)].append((depart,arrive,int(price))) 22 | 23 | def getminutes(t): 24 | x=time.strptime(t,'%H:%M') 25 | return x[3]*60+x[4] 26 | 27 | def printschedule(r): 28 | for d in range(len(r)/2): 29 | name=people[d][0] 30 | origin=people[d][1] 31 | out=flights[(origin,destination)][int(r[d])] 32 | ret=flights[(destination,origin)][int(r[d+1])] 33 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 34 | out[0],out[1],out[2], 35 | ret[0],ret[1],ret[2]) 36 | 37 | def schedulecost(sol): 38 | totalprice=0 39 | latestarrival=0 40 | earliestdep=24*60 41 | 42 | for d in range(len(sol)/2): 43 | # Get the inbound and outbound flights 44 | origin=people[d][1] 45 | outbound=flights[(origin,destination)][int(sol[d])] 46 | returnf=flights[(destination,origin)][int(sol[d+1])] 47 | 48 | # Total price is the price of all outbound and return flights 49 | totalprice+=outbound[2] 50 | totalprice+=returnf[2] 51 | 52 | # Track the latest arrival and earliest departure 53 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 55 | 56 | # Every person must wait at the airport until the latest person arrives. 57 | # They also must arrive at the same time and wait for their flights. 58 | totalwait=0 59 | for d in range(len(sol)/2): 60 | origin=people[d][1] 61 | outbound=flights[(origin,destination)][int(sol[d])] 62 | returnf=flights[(destination,origin)][int(sol[d+1])] 63 | totalwait+=latestarrival-getminutes(outbound[1]) 64 | totalwait+=getminutes(returnf[0])-earliestdep 65 | 66 | # Does this solution require an extra day of car rental? That'll be $50! 
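  # The value returned below is ticket prices plus total minutes spent waiting
  # at the airport, plus the possible $50 rental penalty applied just below,
  # so cheaper flights trade off directly against waiting time. A minimal
  # optimization sketch, assuming schedule.txt lists ten flights per
  # (origin, destination) pair:
  #   domain = [(0, 9)] * (len(people) * 2)
  #   s = randomoptimize(domain, schedulecost)
  #   s = annealingoptimize(domain, schedulecost)
  #   printschedule(s)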
67 | if latestarrival>earliestdep: totalprice+=50 68 | 69 | return totalprice+totalwait 70 | 71 | def randomoptimize(domain,costf): 72 | best=999999999 73 | bestr=None 74 | for i in range(0,1000): 75 | # Create a random solution 76 | r=[float(random.randint(domain[i][0],domain[i][1])) 77 | for i in range(len(domain))] 78 | 79 | # Get the cost 80 | cost=costf(r) 81 | 82 | # Compare it to the best one so far 83 | if costdomain[j][0]: 100 | neighbors.append(sol[0:j]+[sol[j]+1]+sol[j+1:]) 101 | if sol[j]0.1: 124 | # Choose one of the indices 125 | i=random.randint(0,len(domain)-1) 126 | 127 | # Choose a direction to change it 128 | dir=random.randint(-step,step) 129 | 130 | # Create a new list with one of the values changed 131 | vecb=vec[:] 132 | vecb[i]+=dir 133 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 135 | 136 | # Calculate the current cost and the new cost 137 | ea=costf(vec) 138 | eb=costf(vecb) 139 | p=pow(math.e,(-eb-ea)/T) 140 | 141 | # Is it better, or does it make the probability 142 | # cutoff? 143 | if (ebdomain[i][0]: 156 | return vec[0:i]+[vec[i]-step]+vec[i+1:] 157 | elif vec[i]2 and len(s)<20] 11 | 12 | # Return the unique set of words only 13 | return dict([(w,1) for w in words]) 14 | 15 | class classifier: 16 | def __init__(self,getfeatures,filename=None): 17 | # Counts of feature/category combinations 18 | self.fc={} 19 | # Counts of documents in each category 20 | self.cc={} 21 | self.getfeatures=getfeatures 22 | 23 | def setdb(self,dbfile): 24 | self.con=sqlite.connect(dbfile) 25 | self.con.execute('create table if not exists fc(feature,category,count)') 26 | self.con.execute('create table if not exists cc(category,count)') 27 | 28 | 29 | def incf(self,f,cat): 30 | count=self.fcount(f,cat) 31 | if count==0: 32 | self.con.execute("insert into fc values ('%s','%s',1)" 33 | % (f,cat)) 34 | else: 35 | self.con.execute( 36 | "update fc set count=%d where feature='%s' and category='%s'" 37 | % (count+1,f,cat)) 38 | 39 | def fcount(self,f,cat): 40 | res=self.con.execute( 41 | 'select count from fc where feature="%s" and category="%s"' 42 | %(f,cat)).fetchone() 43 | if res==None: return 0 44 | else: return float(res[0]) 45 | 46 | def incc(self,cat): 47 | count=self.catcount(cat) 48 | if count==0: 49 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 50 | else: 51 | self.con.execute("update cc set count=%d where category='%s'" 52 | % (count+1,cat)) 53 | 54 | def catcount(self,cat): 55 | res=self.con.execute('select count from cc where category="%s"' 56 | %(cat)).fetchone() 57 | if res==None: return 0 58 | else: return float(res[0]) 59 | 60 | def categories(self): 61 | cur=self.con.execute('select category from cc'); 62 | return [d[0] for d in cur] 63 | 64 | def totalcount(self): 65 | res=self.con.execute('select sum(count) from cc').fetchone(); 66 | if res==None: return 0 67 | return res[0] 68 | 69 | 70 | def train(self,item,cat): 71 | features=self.getfeatures(item) 72 | # Increment the count for every feature with this category 73 | for f in features: 74 | self.incf(f,cat) 75 | 76 | # Increment the count for this category 77 | self.incc(cat) 78 | self.con.commit() 79 | 80 | def fprob(self,f,cat): 81 | if self.catcount(cat)==0: return 0 82 | 83 | # The total number of times this feature appeared in this 84 | # category divided by the total number of items in this category 85 | return self.fcount(f,cat)/self.catcount(cat) 86 | 87 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 88 | # Calculate current probability 89 | basicprob=prf(f,cat) 90 | 91 | # Count 
the number of times this feature has appeared in 92 | # all categories 93 | totals=sum([self.fcount(f,c) for c in self.categories()]) 94 | 95 | # Calculate the weighted average 96 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 97 | return bp 98 | 99 | 100 | 101 | 102 | class naivebayes(classifier): 103 | 104 | def __init__(self,getfeatures): 105 | classifier.__init__(self,getfeatures) 106 | self.thresholds={} 107 | 108 | def docprob(self,item,cat): 109 | features=self.getfeatures(item) 110 | 111 | # Multiply the probabilities of all the features together 112 | p=1 113 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 114 | return p 115 | 116 | def prob(self,item,cat): 117 | catprob=self.catcount(cat)/self.totalcount() 118 | docprob=self.docprob(item,cat) 119 | return docprob*catprob 120 | 121 | def setthreshold(self,cat,t): 122 | self.thresholds[cat]=t 123 | 124 | def getthreshold(self,cat): 125 | if cat not in self.thresholds: return 1.0 126 | return self.thresholds[cat] 127 | 128 | def classify(self,item,default=None): 129 | probs={} 130 | # Find the category with the highest probability 131 | max=0.0 132 | for cat in self.categories(): 133 | probs[cat]=self.prob(item,cat) 134 | if probs[cat]>max: 135 | max=probs[cat] 136 | best=cat 137 | 138 | # Make sure the probability exceeds threshold*next best 139 | for cat in probs: 140 | if cat==best: continue 141 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 142 | return best 143 | 144 | class fisherclassifier(classifier): 145 | def cprob(self,f,cat): 146 | # The frequency of this feature in this category 147 | clf=self.fprob(f,cat) 148 | if clf==0: return 0 149 | 150 | # The frequency of this feature in all the categories 151 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 152 | 153 | # The probability is the frequency in this category divided by 154 | # the overall frequency 155 | p=clf/(freqsum) 156 | 157 | return p 158 | def fisherprob(self,item,cat): 159 | # Multiply all the probabilities together 160 | p=1 161 | features=self.getfeatures(item) 162 | for f in features: 163 | p*=(self.weightedprob(f,cat,self.cprob)) 164 | 165 | # Take the natural log and multiply by -2 166 | fscore=-2*math.log(p) 167 | 168 | # Use the inverse chi2 function to get a probability 169 | return self.invchi2(fscore,len(features)*2) 170 | def invchi2(self,chi, df): 171 | m = chi / 2.0 172 | sum = term = math.exp(-m) 173 | for i in range(1, df//2): 174 | term *= m / i 175 | sum += term 176 | return min(sum, 1.0) 177 | def __init__(self,getfeatures): 178 | classifier.__init__(self,getfeatures) 179 | self.minimums={} 180 | 181 | def setminimum(self,cat,min): 182 | self.minimums[cat]=min 183 | 184 | def getminimum(self,cat): 185 | if cat not in self.minimums: return 0 186 | return self.minimums[cat] 187 | def classify(self,item,default=None): 188 | # Loop through looking for the best result 189 | best=default 190 | max=0.0 191 | for c in self.categories(): 192 | p=self.fisherprob(item,c) 193 | # Make sure it exceeds its minimum 194 | if p>self.getminimum(c) and p>max: 195 | best=c 196 | max=p 197 | return best 198 | 199 | 200 | def sampletrain(cl): 201 | cl.train('Nobody owns the water.','good') 202 | cl.train('the quick rabbit jumps fences','good') 203 | cl.train('buy pharmaceuticals now','bad') 204 | cl.train('make quick money at the online casino','bad') 205 | cl.train('the quick brown fox jumps','good') 206 | -------------------------------------------------------------------------------- /chapter10/docclass.py: 
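The chapter 10 copy that follows is essentially the chapter 6 classifier again: the SQLite-backed counters are the live code, and the earlier in-memory dictionary versions of incf/incc/fcount/catcount are kept only inside a commented-out block. Both files expose the same interface; a minimal usage sketch, assuming a writable SQLite file in the working directory:

    import docclass
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    cl.classify('quick rabbit')   # expected to come out 'good'
    cl.classify('quick money')    # expected to come out 'bad'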
-------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import cPickle 4 | from pysqlite2 import dbapi2 as sqlite 5 | 6 | def getwords(doc): 7 | splitter=re.compile('\\W*') 8 | words=[s.lower() for s in splitter.split(doc) 9 | if len(s)>2 and len(s)<20] 10 | 11 | # Return the unique set of words only 12 | return dict([(w,1) for w in words]) 13 | 14 | #def entryfeatures(entry): 15 | 16 | def sampletrain(cl): 17 | cl.train('Nobody owns the water.','good') 18 | cl.train('the quick rabbit jumps fences','good') 19 | cl.train('buy pharmaceuticals now','bad') 20 | cl.train('make quick money at the online casino','bad') 21 | cl.train('the quick brown fox jumps','good') 22 | 23 | class classifier: 24 | def __init__(self,getfeatures): 25 | self.fc={} 26 | self.cc={} 27 | self.getfeatures=getfeatures 28 | 29 | def setdb(self,dbfile): 30 | self.con=sqlite.connect(dbfile) 31 | self.con.execute('create table if not exists fc(feature,category,count)') 32 | self.con.execute('create table if not exists cc(category,count)') 33 | 34 | def incf(self,f,cat): 35 | count=self.fcount(f,cat) 36 | if count==0: 37 | self.con.execute("insert into fc values ('%s','%s',1)" 38 | % (f,cat)) 39 | else: 40 | self.con.execute( 41 | "update fc set count=%d where feature='%s' and category='%s'" 42 | % (count+1,f,cat)) 43 | 44 | def fcount(self,f,cat): 45 | res=self.con.execute( 46 | 'select count from fc where feature="%s" and category="%s"' 47 | %(f,cat)).fetchone() 48 | if res==None: return 0 49 | else: return float(res[0]) 50 | 51 | def incc(self,cat): 52 | count=self.catcount(cat) 53 | if count==0: 54 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 55 | else: 56 | self.con.execute("update cc set count=%d where category='%s'" 57 | % (count+1,cat)) 58 | 59 | def catcount(self,cat): 60 | res=self.con.execute('select count from cc where category="%s"' 61 | %(cat)).fetchone() 62 | if res==None: return 0.0 63 | else: return float(res[0]) 64 | 65 | def categories(self): 66 | cur=self.con.execute('select category from cc'); 67 | return [d[0] for d in cur] 68 | 69 | def totalcount(self): 70 | res=self.con.execute('select sum(count) from cc').fetchone(); 71 | if res==None: return 0 72 | return res[0] 73 | 74 | 75 | """ 76 | def incf(self,f,cat): 77 | self.fc.setdefault(f,{}) 78 | self.fc[f].setdefault(cat,0) 79 | self.fc[f][cat]+=1 80 | 81 | def incc(self,cat): 82 | self.cc.setdefault(cat,0) 83 | self.cc[cat]+=1 84 | 85 | def fcount(self,f,cat): 86 | if f in self.fc and cat in self.fc[f]: 87 | return float(self.fc[f][cat]) 88 | return 0.0 89 | 90 | def catcount(self,cat): 91 | if cat in self.cc: 92 | return float(self.cc[cat]) 93 | return 0 94 | 95 | def totalcount(self): 96 | return sum(self.cc.values()) 97 | 98 | def categories(self): 99 | return self.cc.keys() 100 | """ 101 | 102 | 103 | def train(self,item,cat): 104 | features=self.getfeatures(item) 105 | for f in features: 106 | self.incf(f,cat) 107 | self.incc(cat) 108 | self.con.commit() 109 | 110 | def fprob(self,f,cat): 111 | if self.catcount(cat)==0: return 0 112 | return self.fcount(f,cat)/self.catcount(cat) 113 | 114 | def setfilename(self,filename): 115 | self.filename=filename 116 | self.restoredata() 117 | 118 | def restoredata(self): 119 | try: f=file(self.filename,'rb') 120 | except: return 121 | self.fc=cPickle.load(f) 122 | self.cc=cPickle.load(f) 123 | f.close() 124 | 125 | def savedata(self): 126 | f=file(self.filename,'wb') 127 | cPickle.dump(self.fc,f,True) 128 | 
cPickle.dump(self.cc,f,True) 129 | f.close() 130 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 131 | basicprob=prf(f,cat) 132 | totals=sum([self.fcount(f,c) for c in self.categories()]) 133 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 134 | return bp 135 | 136 | 137 | 138 | class naivebayes(classifier): 139 | def __init__(self,getfeatures): 140 | classifier.__init__(self,getfeatures) 141 | self.thresholds={} 142 | 143 | def setthreshold(self,cat,t): 144 | self.thresholds[cat]=t 145 | 146 | def getthreshold(self,cat): 147 | if cat not in self.thresholds: return 1.0 148 | return self.thresholds[cat] 149 | 150 | def classify(self,item,default=None): 151 | probs={} 152 | max=0.0 153 | for cat in self.categories(): 154 | probs[cat]=self.prob(item,cat) 155 | if probs[cat]>max: 156 | max=probs[cat] 157 | best=cat 158 | for cat in probs: 159 | if cat==best: continue 160 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 161 | return best 162 | 163 | def docprob(self,item,cat): 164 | features=self.getfeatures(item) 165 | p=1 166 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 167 | return p 168 | 169 | 170 | def prob(self,item,cat): 171 | catprob=self.catcount(cat)/self.totalcount() 172 | docprob=self.docprob(item,cat) 173 | return docprob*catprob 174 | 175 | class fisherclassifier(classifier): 176 | def __init__(self,getfeatures): 177 | classifier.__init__(self,getfeatures) 178 | self.minimums={} 179 | 180 | def setminimum(self,cat,min): 181 | self.minimums[cat]=min 182 | 183 | def getminimum(self,cat): 184 | if cat not in self.minimums: return 0 185 | return self.minimums[cat] 186 | 187 | def classify(self,item,default=None): 188 | best=default 189 | max=0.0 190 | for c in self.categories(): 191 | p=self.fisherprob(item,c) 192 | if p>self.getminimum(c) and p>max: 193 | best=c 194 | max=p 195 | return best 196 | 197 | 198 | def cprob(self,f,cat): 199 | # The frequency of this feature in this category 200 | clf=self.fprob(f,cat) 201 | 202 | if clf==0: return 0.0 203 | 204 | # The frequency of this feature in all the categories 205 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 206 | 207 | # The probability is the frequency in this category divided by 208 | # the overall frequency 209 | p=clf/(freqsum) 210 | 211 | return p 212 | 213 | 214 | def fisherprob(self,item,cat): 215 | p=1 216 | features=self.getfeatures(item) 217 | for f in features: 218 | p*=(self.weightedprob(f,cat,self.cprob)) 219 | fscore=-2*math.log(p) 220 | return self.chi2P(fscore,len(features)*2) 221 | 222 | def chi2P(self,chi,df): 223 | m = chi / 2.0 224 | sum = term = math.exp(-m) 225 | for i in range(1, df//2): 226 | term *= m / i 227 | sum += term 228 | return min(sum, 1.0) 229 | 230 | -------------------------------------------------------------------------------- /chapter9/agesonly.csv: -------------------------------------------------------------------------------- 1 | 24,30,1 2 | 30,40,1 3 | 22,49,0 4 | 43,39,1 5 | 23,30,1 6 | 23,49,0 7 | 48,46,1 8 | 23,23,1 9 | 29,49,0 10 | 38,38,1 11 | 30,34,1 12 | 40,50,1 13 | 35,32,1 14 | 49,44,1 15 | 38,22,1 16 | 30,27,1 17 | 26,24,1 18 | 39,23,1 19 | 36,43,1 20 | 25,31,1 21 | 27,27,1 22 | 32,22,1 23 | 40,30,1 24 | 26,28,1 25 | 46,32,1 26 | 41,37,1 27 | 39,41,1 28 | 18,28,0 29 | 18,47,0 30 | 39,44,1 31 | 38,21,1 32 | 24,36,0 33 | 32,22,1 34 | 21,20,1 35 | 42,36,1 36 | 46,41,1 37 | 39,38,1 38 | 18,31,0 39 | 31,45,1 40 | 44,24,0 41 | 49,22,0 42 | 26,27,1 43 | 25,34,1 44 | 47,23,0 45 | 27,48,0 46 | 32,49,1 47 | 46,41,1 48 | 24,32,1 
49 | 29,26,1 50 | 25,36,1 51 | 27,35,1 52 | 38,19,1 53 | 18,40,0 54 | 34,49,1 55 | 32,35,1 56 | 47,49,1 57 | 47,18,0 58 | 33,24,1 59 | 35,28,1 60 | 35,41,1 61 | 39,43,1 62 | 29,18,1 63 | 18,44,0 64 | 26,26,1 65 | 31,43,1 66 | 20,29,0 67 | 28,18,1 68 | 31,38,1 69 | 34,34,1 70 | 32,33,1 71 | 34,27,1 72 | 19,38,0 73 | 32,21,1 74 | 33,37,1 75 | 33,18,1 76 | 18,46,0 77 | 31,37,1 78 | 36,30,1 79 | 40,40,1 80 | 38,30,1 81 | 49,28,1 82 | 31,47,1 83 | 28,50,0 84 | 49,43,1 85 | 24,31,1 86 | 33,43,1 87 | 28,24,1 88 | 45,29,1 89 | 49,35,1 90 | 36,29,1 91 | 42,32,1 92 | 29,18,1 93 | 49,20,0 94 | 22,27,1 95 | 41,38,1 96 | 47,21,0 97 | 40,32,1 98 | 35,18,1 99 | 35,33,1 100 | 34,28,1 101 | 22,31,0 102 | 46,20,0 103 | 18,49,0 104 | 48,23,0 105 | 39,21,1 106 | 20,34,0 107 | 24,20,1 108 | 38,18,1 109 | 37,47,1 110 | 39,37,1 111 | 38,39,1 112 | 27,42,1 113 | 47,49,1 114 | 27,42,1 115 | 40,28,1 116 | 41,46,1 117 | 39,25,1 118 | 43,36,1 119 | 49,30,1 120 | 24,38,0 121 | 49,42,1 122 | 19,22,0 123 | 43,27,1 124 | 30,37,1 125 | 24,31,1 126 | 24,48,0 127 | 24,29,1 128 | 18,19,1 129 | 29,25,1 130 | 38,33,1 131 | 39,20,1 132 | 24,30,1 133 | 22,39,0 134 | 47,21,0 135 | 30,44,1 136 | 41,38,1 137 | 29,33,1 138 | 42,42,1 139 | 47,27,1 140 | 23,20,1 141 | 39,18,1 142 | 30,26,1 143 | 36,27,1 144 | 40,18,1 145 | 31,18,1 146 | 46,27,1 147 | 41,44,1 148 | 26,34,1 149 | 33,18,1 150 | 48,19,0 151 | 46,27,1 152 | 25,40,0 153 | 50,36,1 154 | 20,21,1 155 | 33,47,1 156 | 40,35,1 157 | 24,27,1 158 | 34,19,1 159 | 26,45,0 160 | 34,36,1 161 | 21,27,0 162 | 48,28,1 163 | 23,25,1 164 | 48,46,1 165 | 30,20,1 166 | 23,40,0 167 | 36,40,1 168 | 21,45,0 169 | 30,40,1 170 | 39,24,1 171 | 42,47,1 172 | 28,37,1 173 | 24,30,1 174 | 37,25,1 175 | 44,34,1 176 | 43,32,1 177 | 46,29,1 178 | 49,22,0 179 | 41,28,1 180 | 23,50,0 181 | 30,43,1 182 | 25,32,1 183 | 27,46,0 184 | 23,21,1 185 | 39,41,1 186 | 33,27,1 187 | 49,21,0 188 | 33,33,1 189 | 18,25,0 190 | 42,35,1 191 | 36,25,1 192 | 26,50,0 193 | 18,37,0 194 | 35,37,1 195 | 39,38,1 196 | 22,30,0 197 | 18,44,0 198 | 46,44,1 199 | 24,27,1 200 | 41,34,1 201 | 40,39,1 202 | 34,49,1 203 | 35,41,1 204 | 46,48,1 205 | 50,23,0 206 | 49,20,0 207 | 22,47,0 208 | 27,26,1 209 | 30,30,1 210 | 37,39,1 211 | 42,44,1 212 | 41,27,1 213 | 24,21,1 214 | 34,28,1 215 | 23,43,0 216 | 43,35,1 217 | 42,40,1 218 | 25,24,1 219 | 36,24,1 220 | 25,23,1 221 | 44,30,1 222 | 39,33,1 223 | 38,33,1 224 | 49,30,1 225 | 40,19,1 226 | 19,46,0 227 | 31,21,1 228 | 48,33,1 229 | 26,24,1 230 | 20,37,0 231 | 29,31,1 232 | 35,28,1 233 | 37,25,1 234 | 42,42,1 235 | 42,48,1 236 | 41,47,1 237 | 44,45,1 238 | 45,46,1 239 | 25,38,1 240 | 19,45,0 241 | 36,26,1 242 | 33,36,1 243 | 27,19,1 244 | 48,24,0 245 | 37,48,1 246 | 23,31,0 247 | 20,29,0 248 | 27,44,0 249 | 47,24,0 250 | 36,18,1 251 | 37,48,1 252 | 32,29,1 253 | 46,48,1 254 | 31,47,1 255 | 23,45,0 256 | 28,30,1 257 | 36,32,1 258 | 25,43,0 259 | 24,44,0 260 | 34,47,1 261 | 46,42,1 262 | 18,31,0 263 | 23,25,1 264 | 44,39,1 265 | 18,29,0 266 | 49,40,1 267 | 24,33,0 268 | 21,44,0 269 | 40,24,1 270 | 46,41,1 271 | 42,33,1 272 | 25,41,0 273 | 29,42,1 274 | 40,18,1 275 | 37,40,1 276 | 46,28,1 277 | 33,20,1 278 | 18,42,0 279 | 22,36,0 280 | 27,46,0 281 | 33,48,1 282 | 21,37,0 283 | 26,50,0 284 | 29,23,1 285 | 23,33,0 286 | 21,38,0 287 | 18,30,0 288 | 29,28,1 289 | 31,22,1 290 | 30,48,1 291 | 41,37,1 292 | 35,31,1 293 | 48,32,1 294 | 29,37,1 295 | 32,33,1 296 | 43,26,1 297 | 21,33,0 298 | 44,28,1 299 | 35,18,1 300 | 35,35,1 301 | 25,20,1 302 | 39,46,1 303 | 26,39,1 304 | 36,29,1 305 | 29,44,1 306 | 
28,42,1 307 | 38,21,1 308 | 28,49,0 309 | 33,26,1 310 | 31,28,1 311 | 25,47,0 312 | 23,25,1 313 | 45,49,1 314 | 28,26,1 315 | 36,48,1 316 | 42,48,1 317 | 42,21,1 318 | 29,32,1 319 | 26,28,1 320 | 24,46,0 321 | 39,30,1 322 | 29,46,1 323 | 43,43,1 324 | 20,42,0 325 | 35,41,1 326 | 45,19,0 327 | 38,45,1 328 | 25,38,1 329 | 31,20,1 330 | 38,43,1 331 | 37,30,1 332 | 43,27,1 333 | 43,44,1 334 | 21,30,0 335 | 22,45,0 336 | 44,26,1 337 | 43,42,1 338 | 26,41,0 339 | 47,35,1 340 | 48,30,1 341 | 41,24,1 342 | 19,48,0 343 | 45,24,0 344 | 38,41,1 345 | 42,46,1 346 | 49,45,1 347 | 28,44,1 348 | 22,44,0 349 | 31,48,1 350 | 48,21,0 351 | 31,20,1 352 | 30,39,1 353 | 23,23,1 354 | 21,32,0 355 | 19,19,1 356 | 21,27,0 357 | 24,46,0 358 | 25,28,1 359 | 48,50,1 360 | 25,32,1 361 | 26,29,1 362 | 33,48,1 363 | 35,32,1 364 | 48,25,1 365 | 30,27,1 366 | 34,49,1 367 | 40,45,1 368 | 28,32,1 369 | 47,33,1 370 | 29,33,1 371 | 21,22,1 372 | 21,39,0 373 | 41,45,1 374 | 46,39,1 375 | 22,24,1 376 | 32,22,1 377 | 27,46,0 378 | 26,35,1 379 | 27,29,1 380 | 48,19,0 381 | 35,26,1 382 | 42,29,1 383 | 30,22,1 384 | 20,26,0 385 | 33,25,1 386 | 37,30,1 387 | 37,32,1 388 | 20,22,1 389 | 42,48,1 390 | 29,20,1 391 | 32,46,1 392 | 37,34,1 393 | 29,45,1 394 | 19,44,0 395 | 49,18,0 396 | 28,25,1 397 | 48,31,1 398 | 35,46,1 399 | 34,26,1 400 | 38,26,1 401 | 36,31,1 402 | 31,30,1 403 | 27,19,1 404 | 44,38,1 405 | 19,37,0 406 | 43,49,1 407 | 19,42,0 408 | 32,24,1 409 | 46,43,1 410 | 43,46,1 411 | 33,32,1 412 | 23,35,0 413 | 26,34,1 414 | 48,20,0 415 | 45,38,1 416 | 30,30,1 417 | 28,23,1 418 | 43,36,1 419 | 19,37,0 420 | 39,45,1 421 | 20,30,0 422 | 28,30,1 423 | 19,42,0 424 | 41,21,1 425 | 42,31,1 426 | 47,45,1 427 | 42,48,1 428 | 40,22,1 429 | 28,20,1 430 | 22,31,0 431 | 28,24,1 432 | 18,33,0 433 | 42,47,1 434 | 35,18,1 435 | 32,28,1 436 | 45,39,1 437 | 46,45,1 438 | 41,43,1 439 | 24,37,0 440 | 34,30,1 441 | 40,22,1 442 | 38,20,1 443 | 43,28,1 444 | 21,26,0 445 | 35,27,1 446 | 33,37,1 447 | 48,39,1 448 | 47,40,1 449 | 31,32,1 450 | 18,32,0 451 | 31,20,1 452 | 30,49,1 453 | 22,46,0 454 | 36,39,1 455 | 30,35,1 456 | 49,50,1 457 | 46,39,1 458 | 45,44,1 459 | 34,40,1 460 | 27,28,1 461 | 27,35,1 462 | 46,46,1 463 | 26,42,0 464 | 27,18,1 465 | 23,38,0 466 | 30,30,1 467 | 34,32,1 468 | 48,27,1 469 | 31,23,1 470 | 29,47,0 471 | 47,31,1 472 | 35,19,1 473 | 30,28,1 474 | 33,44,1 475 | 36,37,1 476 | 34,44,1 477 | 42,43,1 478 | 36,29,1 479 | 35,46,1 480 | 22,36,0 481 | 39,47,1 482 | 23,23,1 483 | 47,20,0 484 | 38,22,1 485 | 21,33,0 486 | 37,41,1 487 | 18,18,1 488 | 35,34,1 489 | 49,49,1 490 | 33,32,1 491 | 31,19,1 492 | 31,26,1 493 | 45,31,1 494 | 41,44,1 495 | 27,47,0 496 | 28,26,1 497 | 18,47,0 498 | 37,18,1 499 | 20,42,0 500 | 36,45,1 501 | -------------------------------------------------------------------------------- /chapter10/clusters.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | from math import sqrt 4 | from PIL import Image,ImageDraw,ImageFont 5 | 6 | # Returns the Pearson correlation coefficient for p1 and p2 7 | def pearson(v1,v2): 8 | # Simple sums 9 | sum1=sum(v1) 10 | sum2=sum(v2) 11 | 12 | # Sums of the squares 13 | sum1Sq=sum([pow(v,2) for v in v1]) 14 | sum2Sq=sum([pow(v,2) for v in v2]) 15 | 16 | # Sum of the products 17 | pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) 18 | 19 | # Calculate r (Pearson score) 20 | num=pSum-(sum1*sum2/len(v1)) 21 | den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) 22 | if den==0: return 0 23 | 24 | return 
1.0-(num/den) 25 | 26 | 27 | class bicluster: 28 | def __init__(self,vec,left=None,right=None,distance=0.0,id=None): 29 | self.left=left 30 | self.right=right 31 | self.vec=vec 32 | self.id=id 33 | self.distance=distance 34 | 35 | def euclidean(v1,v2): 36 | sqsum=sum([math.pow(v1[i]-v2[i],2) for i in range(len(v1))]) 37 | return math.sqrt(sqsum) 38 | 39 | def printclust(clust,labels=None,n=0): 40 | for i in range(n): print ' ', 41 | if clust.id<0: 42 | print '-' 43 | else: 44 | if labels==None: print clust.id 45 | else: print labels[clust.id] 46 | if clust.left!=None: printclust(clust.left,labels=labels,n=n+1) 47 | if clust.right!=None: printclust(clust.right,labels=labels,n=n+1) 48 | 49 | def hcluster(vecs,distance=pearson): 50 | distances={} 51 | currentclustid=-1 52 | clust=[bicluster(vecs[i],id=i) for i in range(len(vecs))] 53 | 54 | while len(clust)>1: 55 | lowestpair=(0,1) 56 | closest=distance(clust[0].vec,clust[1].vec) 57 | for i in range(len(clust)): 58 | for j in range(i+1,len(clust)): 59 | if (clust[i].id,clust[j].id) not in distances: 60 | distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) 61 | d=distances[(clust[i].id,clust[j].id)] 62 | 63 | if d0: 102 | for vecid in bestmatches[i]: 103 | for m in range(len(vecs[vecid])): 104 | avgs[m]+=vecs[vecid][m] 105 | for j in range(len(avgs)): 106 | avgs[j]/=len(bestmatches[i]) 107 | clusters[i]=avgs 108 | 109 | return bestmatches 110 | 111 | def readfile(filename): 112 | lines=[line for line in file(filename)] 113 | colnames=lines[0].strip().split('\t')[1:] 114 | rownames=[] 115 | data=[] 116 | for line in lines[1:]: 117 | p=line.strip().split('\t') 118 | rownames.append(p[0]) 119 | data.append([float(x) for x in p[1:]]) 120 | return rownames,colnames,data 121 | 122 | def test2(): 123 | rownames,colnames,data=readfile('datafile.txt') 124 | return hcluster(data) 125 | #for i in range(len(rownames)): 126 | # print i,rownames[i] 127 | 128 | def distance(v1,v2): 129 | c1,c2,shr=0,0,0 130 | 131 | for i in range(len(v1)): 132 | if v1[i]!=0: c1+=1 133 | if v2[i]!=0: c2+=1 134 | if v1[i]!=0 and v2[i]!=0: shr+=1 135 | 136 | return float(shr)/(c1+c2-shr) 137 | 138 | 139 | #test2() 140 | 141 | def getheight(clust): 142 | if clust.left==None and clust.right==None: return 1 143 | return getheight(clust.left)+getheight(clust.right) 144 | 145 | def getdepth(clust): 146 | if clust.left==None and clust.right==None: return 0 147 | return max(getdepth(clust.left),getdepth(clust.right))+clust.distance 148 | 149 | def drawdendrogram(clust,labels,jpeg='clusters.jpg'): 150 | h=getheight(clust)*20 151 | depth=getdepth(clust) 152 | w=1200 153 | scaling=float(w-150)/depth 154 | img=Image.new('RGB',(w,h),(255,255,255)) 155 | draw=ImageDraw.Draw(img) 156 | 157 | draw.line((0,h/2,10,h/2),fill=(255,0,0)) 158 | 159 | drawnode(draw,clust,10,(h/2),scaling,labels) 160 | img.save(jpeg,'JPEG') 161 | 162 | def drawnode(draw,clust,x,y,scaling,labels): 163 | if clust.id<0: 164 | h1=getheight(clust.left)*20 165 | h2=getheight(clust.right)*20 166 | top=y-(h1+h2)/2 167 | bottom=y+(h1+h2)/2 168 | 169 | ll=clust.distance*scaling 170 | 171 | draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0)) 172 | 173 | draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0)) 174 | draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0)) 175 | 176 | drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels) 177 | drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels) 178 | else: 179 | draw.text((x+5,y-7),labels[clust.id].encode('utf8'),(0,0,0)) 180 | 181 | def 
rotatematrix(data): 182 | newdata=[] 183 | for i in range(len(data[0])): 184 | newrow=[data[j][i] for j in range(len(data))] 185 | newdata.append(newrow) 186 | return newdata 187 | 188 | def scaledown(data,distance=pearson,rate=0.01): 189 | n=len(data) 190 | realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)] 191 | 192 | outersum=0.0 193 | 194 | loc=[[random.random(),random.random()] for i in range(n)] 195 | fakedist=[[0.0 for j in range(n)] for i in range(n)] 196 | 197 | lasterror=None 198 | for m in range(0,1000): 199 | # Find projected distances 200 | for i in range(n): 201 | for j in range(n): 202 | fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 203 | for x in range(len(loc[i]))])) 204 | 205 | # Move points 206 | grad=[[0.0,0.0] for i in range(n)] 207 | 208 | totalerror=0 209 | for k in range(n): 210 | for j in range(n): 211 | if j==k: continue 212 | errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] 213 | grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm 214 | grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm 215 | totalerror+=abs(errorterm) 216 | print totalerror 217 | if lasterror and lasterror0: return l[1] 51 | else: return l[2] 52 | ifw=fwrapper(iffunc,3,'if') 53 | 54 | def isgreater(l): 55 | if l[0]>l[1]: return 1 56 | else: return 0 57 | gtw=fwrapper(isgreater,2,'isgreater') 58 | 59 | flist=[addw,mulw,ifw,gtw,subw] 60 | 61 | def exampletree(): 62 | return node(ifw,[ 63 | node(gtw,[paramnode(0),constnode(3)]), 64 | node(addw,[paramnode(1),constnode(5)]), 65 | node(subw,[paramnode(1),constnode(2)]), 66 | ] 67 | ) 68 | 69 | def makerandomtree(pc,maxdepth=4,fpr=0.5,ppr=0.6): 70 | if random()0: 71 | f=choice(flist) 72 | children=[makerandomtree(pc,maxdepth-1,fpr,ppr) 73 | for i in range(f.childcount)] 74 | return node(f,children) 75 | elif random()pnew: 148 | newpop.append(mutate( 149 | crossover(scores[selectindex()][1], 150 | scores[selectindex()][1], 151 | probswap=breedingrate), 152 | pc,probchange=mutationrate)) 153 | else: 154 | # Add a random node to mix things up 155 | newpop.append(makerandomtree(pc)) 156 | 157 | population=newpop 158 | scores[0][1].display() 159 | return scores[0][1] 160 | 161 | 162 | def gridgame(p): 163 | # Board size 164 | max=(3,3) 165 | 166 | # Remember the last move for each player 167 | lastmove=[-1,-1] 168 | 169 | # Remember the player's locations 170 | location=[[randint(0,max[0]),randint(0,max[1])]] 171 | 172 | # Put the second player a sufficient distance from the first 173 | location.append([(location[0][0]+2)%4,(location[0][1]+2)%4]) 174 | # Maximum of 50 moves before a tie 175 | for o in range(50): 176 | 177 | # For each player 178 | for i in range(2): 179 | locs=location[i][:]+location[1-i][:] 180 | locs.append(lastmove[i]) 181 | move=p[i].evaluate(locs)%4 182 | 183 | # You lose if you move the same direction twice in a row 184 | if lastmove[i]==move: return 1-i 185 | lastmove[i]=move 186 | if move==0: 187 | location[i][0]-=1 188 | # Board wraps 189 | if location[i][0]<0: location[i][0]=0 190 | if move==1: 191 | location[i][0]+=1 192 | if location[i][0]>max[0]: location[i][0]=max[0] 193 | if move==2: 194 | location[i][1]-=1 195 | if location[i][1]<0: location[i][1]=0 196 | if move==3: 197 | location[i][1]+=1 198 | if location[i][1]>max[1]: location[i][1]=max[1] 199 | 200 | # If you have captured the other player, you win 201 | if location[i]==location[1-i]: return i 202 | return -1 203 | 204 | 205 | def tournament(pl): 206 | # Count losses 207 | losses=[0 for p in pl] 208 | 209 | # 
Every player plays every other player 210 | for i in range(len(pl)): 211 | for j in range(len(pl)): 212 | if i==j: continue 213 | 214 | # Who is the winner? 215 | winner=gridgame([pl[i],pl[j]]) 216 | 217 | # Two points for a loss, one point for a tie 218 | if winner==0: 219 | losses[j]+=2 220 | elif winner==1: 221 | losses[i]+=2 222 | elif winner==-1: 223 | losses[i]+=1 224 | losses[i]+=1 225 | pass 226 | 227 | # Sort and return the results 228 | z=zip(losses,pl) 229 | z.sort() 230 | return z 231 | 232 | class humanplayer: 233 | def evaluate(self,board): 234 | 235 | # Get my location and the location of other players 236 | me=tuple(board[0:2]) 237 | others=[tuple(board[x:x+2]) for x in range(2,len(board)-1,2)] 238 | 239 | # Display the board 240 | for i in range(4): 241 | for j in range(4): 242 | if (i,j)==me: 243 | print 'O', 244 | elif (i,j) in others: 245 | print 'X', 246 | else: 247 | print '.', 248 | print 249 | 250 | # Show moves, for reference 251 | print 'Your last move was %d' % board[len(board)-1] 252 | print ' 0' 253 | print '2 3' 254 | print ' 1' 255 | print 'Enter move: ', 256 | 257 | # Return whatever the user enters 258 | move=int(raw_input()) 259 | return move 260 | 261 | 262 | class fwrapper: 263 | def __init__(self,function,params,name): 264 | self.function=function 265 | self.childcount=param 266 | self.name=name 267 | 268 | #flist={'str':[substringw,concatw],'int':[indexw]} 269 | flist=[addw,mulw,ifw,gtw,subw] 270 | -------------------------------------------------------------------------------- /chapter7/treepredict.py: -------------------------------------------------------------------------------- 1 | my_data=[['slashdot','USA','yes',18,'None'], 2 | ['google','France','yes',23,'Premium'], 3 | ['digg','USA','yes',24,'Basic'], 4 | ['kiwitobes','France','yes',23,'Basic'], 5 | ['google','UK','no',21,'Premium'], 6 | ['(direct)','New Zealand','no',12,'None'], 7 | ['(direct)','UK','no',21,'Basic'], 8 | ['google','USA','no',24,'Premium'], 9 | ['slashdot','France','yes',19,'None'], 10 | ['digg','USA','no',18,'None'], 11 | ['google','UK','no',18,'None'], 12 | ['kiwitobes','UK','no',19,'None'], 13 | ['digg','New Zealand','yes',12,'Basic'], 14 | ['slashdot','UK','no',21,'None'], 15 | ['google','UK','yes',18,'Basic'], 16 | ['kiwitobes','France','yes',19,'Basic']] 17 | 18 | class decisionnode: 19 | def __init__(self,col=-1,value=None,results=None,tb=None,fb=None): 20 | self.col=col 21 | self.value=value 22 | self.results=results 23 | self.tb=tb 24 | self.fb=fb 25 | 26 | # Divides a set on a specific column. 
Can handle numeric 27 | # or nominal values 28 | def divideset(rows,column,value): 29 | # Make a function that tells us if a row is in 30 | # the first group (true) or the second group (false) 31 | split_function=None 32 | if isinstance(value,int) or isinstance(value,float): 33 | split_function=lambda row:row[column]>=value 34 | else: 35 | split_function=lambda row:row[column]==value 36 | 37 | # Divide the rows into two sets and return them 38 | set1=[row for row in rows if split_function(row)] 39 | set2=[row for row in rows if not split_function(row)] 40 | return (set1,set2) 41 | 42 | 43 | # Create counts of possible results (the last column of 44 | # each row is the result) 45 | def uniquecounts(rows): 46 | results={} 47 | for row in rows: 48 | # The result is the last column 49 | r=row[len(row)-1] 50 | if r not in results: results[r]=0 51 | results[r]+=1 52 | return results 53 | 54 | # Probability that a randomly placed item will 55 | # be in the wrong category 56 | def giniimpurity(rows): 57 | total=len(rows) 58 | counts=uniquecounts(rows) 59 | imp=0 60 | for k1 in counts: 61 | p1=float(counts[k1])/total 62 | for k2 in counts: 63 | if k1==k2: continue 64 | p2=float(counts[k2])/total 65 | imp+=p1*p2 66 | return imp 67 | 68 | # Entropy is the sum of p(x)log(p(x)) across all 69 | # the different possible results 70 | def entropy(rows): 71 | from math import log 72 | log2=lambda x:log(x)/log(2) 73 | results=uniquecounts(rows) 74 | # Now calculate the entropy 75 | ent=0.0 76 | for r in results.keys(): 77 | p=float(results[r])/len(rows) 78 | ent=ent-p*log2(p) 79 | return ent 80 | 81 | 82 | 83 | 84 | def printtree(tree,indent=''): 85 | # Is this a leaf node? 86 | if tree.results!=None: 87 | print str(tree.results) 88 | else: 89 | # Print the criteria 90 | print str(tree.col)+':'+str(tree.value)+'? 
' 91 | 92 | # Print the branches 93 | print indent+'T->', 94 | printtree(tree.tb,indent+' ') 95 | print indent+'F->', 96 | printtree(tree.fb,indent+' ') 97 | 98 | 99 | def getwidth(tree): 100 | if tree.tb==None and tree.fb==None: return 1 101 | return getwidth(tree.tb)+getwidth(tree.fb) 102 | 103 | def getdepth(tree): 104 | if tree.tb==None and tree.fb==None: return 0 105 | return max(getdepth(tree.tb),getdepth(tree.fb))+1 106 | 107 | 108 | from PIL import Image,ImageDraw 109 | 110 | def drawtree(tree,jpeg='tree.jpg'): 111 | w=getwidth(tree)*100 112 | h=getdepth(tree)*100+120 113 | 114 | img=Image.new('RGB',(w,h),(255,255,255)) 115 | draw=ImageDraw.Draw(img) 116 | 117 | drawnode(draw,tree,w/2,20) 118 | img.save(jpeg,'JPEG') 119 | 120 | def drawnode(draw,tree,x,y): 121 | if tree.results==None: 122 | # Get the width of each branch 123 | w1=getwidth(tree.fb)*100 124 | w2=getwidth(tree.tb)*100 125 | 126 | # Determine the total space required by this node 127 | left=x-(w1+w2)/2 128 | right=x+(w1+w2)/2 129 | 130 | # Draw the condition string 131 | draw.text((x-20,y-10),str(tree.col)+':'+str(tree.value),(0,0,0)) 132 | 133 | # Draw links to the branches 134 | draw.line((x,y,left+w1/2,y+100),fill=(255,0,0)) 135 | draw.line((x,y,right-w2/2,y+100),fill=(255,0,0)) 136 | 137 | # Draw the branch nodes 138 | drawnode(draw,tree.fb,left+w1/2,y+100) 139 | drawnode(draw,tree.tb,right-w2/2,y+100) 140 | else: 141 | txt=' \n'.join(['%s:%d'%v for v in tree.results.items()]) 142 | draw.text((x-20,y),txt,(0,0,0)) 143 | 144 | 145 | def classify(observation,tree): 146 | if tree.results!=None: 147 | return tree.results 148 | else: 149 | v=observation[tree.col] 150 | branch=None 151 | if isinstance(v,int) or isinstance(v,float): 152 | if v>=tree.value: branch=tree.tb 153 | else: branch=tree.fb 154 | else: 155 | if v==tree.value: branch=tree.tb 156 | else: branch=tree.fb 157 | return classify(observation,branch) 158 | 159 | def prune(tree,mingain): 160 | # If the branches aren't leaves, then prune them 161 | if tree.tb.results==None: 162 | prune(tree.tb,mingain) 163 | if tree.fb.results==None: 164 | prune(tree.fb,mingain) 165 | 166 | # If both the subbranches are now leaves, see if they 167 | # should merged 168 | if tree.tb.results!=None and tree.fb.results!=None: 169 | # Build a combined dataset 170 | tb,fb=[],[] 171 | for v,c in tree.tb.results.items(): 172 | tb+=[[v]]*c 173 | for v,c in tree.fb.results.items(): 174 | fb+=[[v]]*c 175 | 176 | # Test the reduction in entropy 177 | delta=entropy(tb+fb)-(entropy(tb)+entropy(fb)/2) 178 | 179 | if delta=tree.value: branch=tree.tb 202 | else: branch=tree.fb 203 | else: 204 | if v==tree.value: branch=tree.tb 205 | else: branch=tree.fb 206 | return mdclassify(observation,branch) 207 | 208 | def variance(rows): 209 | if len(rows)==0: return 0 210 | data=[float(row[len(row)-1]) for row in rows] 211 | mean=sum(data)/len(data) 212 | variance=sum([(d-mean)**2 for d in data])/len(data) 213 | return variance 214 | 215 | def buildtree(rows,scoref=entropy): 216 | if len(rows)==0: return decisionnode() 217 | current_score=scoref(rows) 218 | 219 | # Set up some variables to track the best criteria 220 | best_gain=0.0 221 | best_criteria=None 222 | best_sets=None 223 | 224 | column_count=len(rows[0])-1 225 | for col in range(0,column_count): 226 | # Generate the list of different values in 227 | # this column 228 | column_values={} 229 | for row in rows: 230 | column_values[row[col]]=1 231 | # Now try dividing the rows up for each value 232 | # in this column 233 | for value in 
column_values.keys(): 234 | (set1,set2)=divideset(rows,col,value) 235 | 236 | # Information gain 237 | p=float(len(set1))/len(rows) 238 | gain=current_score-p*scoref(set1)-(1-p)*scoref(set2) 239 | if gain>best_gain and len(set1)>0 and len(set2)>0: 240 | best_gain=gain 241 | best_criteria=(col,value) 242 | best_sets=(set1,set2) 243 | # Create the sub branches 244 | if best_gain>0: 245 | trueBranch=buildtree(best_sets[0]) 246 | falseBranch=buildtree(best_sets[1]) 247 | return decisionnode(col=best_criteria[0],value=best_criteria[1], 248 | tb=trueBranch,fb=falseBranch) 249 | else: 250 | return decisionnode(results=uniquecounts(rows)) 251 | -------------------------------------------------------------------------------- /chapter9/svm.py: -------------------------------------------------------------------------------- 1 | import svmc 2 | from svmc import C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR 3 | from svmc import LINEAR, POLY, RBF, SIGMOID 4 | from math import exp, fabs 5 | 6 | def _int_array(seq): 7 | size = len(seq) 8 | array = svmc.new_int(size) 9 | i = 0 10 | for item in seq: 11 | svmc.int_setitem(array,i,item) 12 | i = i + 1 13 | return array 14 | 15 | def _double_array(seq): 16 | size = len(seq) 17 | array = svmc.new_double(size) 18 | i = 0 19 | for item in seq: 20 | svmc.double_setitem(array,i,item) 21 | i = i + 1 22 | return array 23 | 24 | def _free_int_array(x): 25 | if x != 'NULL' and x != None: 26 | svmc.delete_int(x) 27 | 28 | def _free_double_array(x): 29 | if x != 'NULL' and x != None: 30 | svmc.delete_double(x) 31 | 32 | def _int_array_to_list(x,n): 33 | return map(svmc.int_getitem,[x]*n,range(n)) 34 | 35 | def _double_array_to_list(x,n): 36 | return map(svmc.double_getitem,[x]*n,range(n)) 37 | 38 | class svm_parameter: 39 | 40 | # default values 41 | default_parameters = { 42 | 'svm_type' : C_SVC, 43 | 'kernel_type' : RBF, 44 | 'degree' : 3, 45 | 'gamma' : 0, # 1/k 46 | 'coef0' : 0, 47 | 'nu' : 0.5, 48 | 'cache_size' : 40, 49 | 'C' : 1, 50 | 'eps' : 1e-3, 51 | 'p' : 0.1, 52 | 'shrinking' : 1, 53 | 'nr_weight' : 0, 54 | 'weight_label' : [], 55 | 'weight' : [], 56 | 'probability' : 0 57 | } 58 | 59 | def __init__(self,**kw): 60 | self.__dict__['param'] = svmc.new_svm_parameter() 61 | for attr,val in self.default_parameters.items(): 62 | setattr(self,attr,val) 63 | for attr,val in kw.items(): 64 | setattr(self,attr,val) 65 | 66 | def __getattr__(self,attr): 67 | get_func = getattr(svmc,'svm_parameter_%s_get' % (attr)) 68 | return get_func(self.param) 69 | 70 | def __setattr__(self,attr,val): 71 | 72 | if attr == 'weight_label': 73 | self.__dict__['weight_label_len'] = len(val) 74 | val = _int_array(val) 75 | _free_int_array(self.weight_label) 76 | elif attr == 'weight': 77 | self.__dict__['weight_len'] = len(val) 78 | val = _double_array(val) 79 | _free_double_array(self.weight) 80 | 81 | set_func = getattr(svmc,'svm_parameter_%s_set' % (attr)) 82 | set_func(self.param,val) 83 | 84 | def __repr__(self): 85 | ret = '' 96 | 97 | def __del__(self): 98 | _free_int_array(self.weight_label) 99 | _free_double_array(self.weight) 100 | svmc.delete_svm_parameter(self.param) 101 | 102 | def _convert_to_svm_node_array(x): 103 | """ convert a sequence or mapping to an svm_node array """ 104 | import operator 105 | 106 | # Find non zero elements 107 | iter_range = [] 108 | if type(x) == dict: 109 | for k, v in x.iteritems(): 110 | # all zeros kept due to the precomputed kernel; no good solution yet 111 | # if v != 0: 112 | iter_range.append( k ) 113 | elif operator.isSequenceType(x): 114 
| for j in range(len(x)): 115 | # if x[j] != 0: 116 | iter_range.append( j ) 117 | else: 118 | raise TypeError,"data must be a mapping or a sequence" 119 | 120 | iter_range.sort() 121 | data = svmc.svm_node_array(len(iter_range)+1) 122 | svmc.svm_node_array_set(data,len(iter_range),-1,0) 123 | 124 | j = 0 125 | for k in iter_range: 126 | svmc.svm_node_array_set(data,j,k,x[k]) 127 | j = j + 1 128 | return data 129 | 130 | class svm_problem: 131 | def __init__(self,y,x): 132 | assert len(y) == len(x) 133 | self.prob = prob = svmc.new_svm_problem() 134 | self.size = size = len(y) 135 | 136 | self.y_array = y_array = svmc.new_double(size) 137 | for i in range(size): 138 | svmc.double_setitem(y_array,i,y[i]) 139 | 140 | self.x_matrix = x_matrix = svmc.svm_node_matrix(size) 141 | self.data = [] 142 | self.maxlen = 0; 143 | for i in range(size): 144 | data = _convert_to_svm_node_array(x[i]) 145 | self.data.append(data); 146 | svmc.svm_node_matrix_set(x_matrix,i,data) 147 | if type(x[i]) == dict: 148 | if (len(x[i]) > 0): 149 | self.maxlen = max(self.maxlen,max(x[i].keys())) 150 | else: 151 | self.maxlen = max(self.maxlen,len(x[i])) 152 | 153 | svmc.svm_problem_l_set(prob,size) 154 | svmc.svm_problem_y_set(prob,y_array) 155 | svmc.svm_problem_x_set(prob,x_matrix) 156 | 157 | def __repr__(self): 158 | return "" % (self.size) 159 | 160 | def __del__(self): 161 | svmc.delete_svm_problem(self.prob) 162 | svmc.delete_double(self.y_array) 163 | for i in range(self.size): 164 | svmc.svm_node_array_destroy(self.data[i]) 165 | svmc.svm_node_matrix_destroy(self.x_matrix) 166 | 167 | class svm_model: 168 | def __init__(self,arg1,arg2=None): 169 | if arg2 == None: 170 | # create model from file 171 | filename = arg1 172 | self.model = svmc.svm_load_model(filename) 173 | else: 174 | # create model from problem and parameter 175 | prob,param = arg1,arg2 176 | self.prob = prob 177 | if param.gamma == 0: 178 | param.gamma = 1.0/prob.maxlen 179 | msg = svmc.svm_check_parameter(prob.prob,param.param) 180 | if msg: raise ValueError, msg 181 | self.model = svmc.svm_train(prob.prob,param.param) 182 | 183 | #setup some classwide variables 184 | self.nr_class = svmc.svm_get_nr_class(self.model) 185 | self.svm_type = svmc.svm_get_svm_type(self.model) 186 | #create labels(classes) 187 | intarr = svmc.new_int(self.nr_class) 188 | svmc.svm_get_labels(self.model,intarr) 189 | self.labels = _int_array_to_list(intarr, self.nr_class) 190 | svmc.delete_int(intarr) 191 | #check if valid probability model 192 | self.probability = svmc.svm_check_probability_model(self.model) 193 | 194 | def predict(self,x): 195 | data = _convert_to_svm_node_array(x) 196 | ret = svmc.svm_predict(self.model,data) 197 | svmc.svm_node_array_destroy(data) 198 | return ret 199 | 200 | 201 | def get_nr_class(self): 202 | return self.nr_class 203 | 204 | def get_labels(self): 205 | if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR or self.svm_type == ONE_CLASS: 206 | raise TypeError, "Unable to get label from a SVR/ONE_CLASS model" 207 | return self.labels 208 | 209 | def predict_values_raw(self,x): 210 | #convert x into svm_node, allocate a double array for return 211 | n = self.nr_class*(self.nr_class-1)//2 212 | data = _convert_to_svm_node_array(x) 213 | dblarr = svmc.new_double(n) 214 | svmc.svm_predict_values(self.model, data, dblarr) 215 | ret = _double_array_to_list(dblarr, n) 216 | svmc.delete_double(dblarr) 217 | svmc.svm_node_array_destroy(data) 218 | return ret 219 | 220 | def predict_values(self,x): 221 | v=self.predict_values_raw(x) 
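        # predict_values_raw returns the nr_class*(nr_class-1)/2 pairwise
        # decision values; for regression/one-class models the single value is
        # passed through, while for C_SVC/NU_SVC they are repackaged below into
        # a dict keyed by (label_i, label_j), with the sign flipped for the
        # reversed pair. A minimal end-to-end sketch for this wrapper, assuming
        # the compiled svmc extension from LIBSVM's Python bindings is importable:
        #   prob = svm_problem([1, -1], [[1, 0, 1], [-1, 0, -1]])
        #   param = svm_parameter(kernel_type=LINEAR, C=10)
        #   m = svm_model(prob, param)
        #   m.predict([1, 1, 1])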
222 | if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR or self.svm_type == ONE_CLASS: 223 | return v[0] 224 | else: #self.svm_type == C_SVC or self.svm_type == NU_SVC 225 | count = 0 226 | d = {} 227 | for i in range(len(self.labels)): 228 | for j in range(i+1, len(self.labels)): 229 | d[self.labels[i],self.labels[j]] = v[count] 230 | d[self.labels[j],self.labels[i]] = -v[count] 231 | count += 1 232 | return d 233 | 234 | def predict_probability(self,x): 235 | #c code will do nothing on wrong type, so we have to check ourself 236 | if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR: 237 | raise TypeError, "call get_svr_probability or get_svr_pdf for probability output of regression" 238 | elif self.svm_type == ONE_CLASS: 239 | raise TypeError, "probability not supported yet for one-class problem" 240 | #only C_SVC,NU_SVC goes in 241 | if not self.probability: 242 | raise TypeError, "model does not support probabiliy estimates" 243 | 244 | #convert x into svm_node, alloc a double array to receive probabilities 245 | data = _convert_to_svm_node_array(x) 246 | dblarr = svmc.new_double(self.nr_class) 247 | pred = svmc.svm_predict_probability(self.model, data, dblarr) 248 | pv = _double_array_to_list(dblarr, self.nr_class) 249 | svmc.delete_double(dblarr) 250 | svmc.svm_node_array_destroy(data) 251 | p = {} 252 | for i in range(len(self.labels)): 253 | p[self.labels[i]] = pv[i] 254 | return pred, p 255 | 256 | def get_svr_probability(self): 257 | #leave the Error checking to svm.cpp code 258 | ret = svmc.svm_get_svr_probability(self.model) 259 | if ret == 0: 260 | raise TypeError, "not a regression model or probability information not available" 261 | return ret 262 | 263 | def get_svr_pdf(self): 264 | #get_svr_probability will handle error checking 265 | sigma = self.get_svr_probability() 266 | return lambda z: exp(-fabs(z)/sigma)/(2*sigma) 267 | 268 | 269 | def save(self,filename): 270 | svmc.svm_save_model(filename,self.model) 271 | 272 | def __del__(self): 273 | svmc.svm_destroy_model(self.model) 274 | 275 | 276 | def cross_validation(prob, param, fold): 277 | if param.gamma == 0: 278 | param.gamma = 1.0/prob.maxlen 279 | dblarr = svmc.new_double(prob.size) 280 | svmc.svm_cross_validation(prob.prob, param.param, fold, dblarr) 281 | ret = _double_array_to_list(dblarr, prob.size) 282 | svmc.delete_double(dblarr) 283 | return ret 284 | -------------------------------------------------------------------------------- /chapter3/clusters.py: -------------------------------------------------------------------------------- 1 | from PIL import Image,ImageDraw 2 | 3 | def readfile(filename): 4 | lines=[line for line in file(filename)] 5 | 6 | # First line is the column titles 7 | colnames=lines[0].strip().split('\t')[1:] 8 | rownames=[] 9 | data=[] 10 | for line in lines[1:]: 11 | p=line.strip().split('\t') 12 | # First column in each row is the rowname 13 | rownames.append(p[0]) 14 | # The data for this row is the remainder of the row 15 | data.append([float(x) for x in p[1:]]) 16 | return rownames,colnames,data 17 | 18 | 19 | from math import sqrt 20 | 21 | def pearson(v1,v2): 22 | # Simple sums 23 | sum1=sum(v1) 24 | sum2=sum(v2) 25 | 26 | # Sums of the squares 27 | sum1Sq=sum([pow(v,2) for v in v1]) 28 | sum2Sq=sum([pow(v,2) for v in v2]) 29 | 30 | # Sum of the products 31 | pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) 32 | 33 | # Calculate r (Pearson score) 34 | num=pSum-(sum1*sum2/len(v1)) 35 | den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) 36 | if 
den==0: return 0 37 | 38 | return 1.0-num/den 39 | 40 | class bicluster: 41 | def __init__(self,vec,left=None,right=None,distance=0.0,id=None): 42 | self.left=left 43 | self.right=right 44 | self.vec=vec 45 | self.id=id 46 | self.distance=distance 47 | 48 | def hcluster(rows,distance=pearson): 49 | distances={} 50 | currentclustid=-1 51 | 52 | # Clusters are initially just the rows 53 | clust=[bicluster(rows[i],id=i) for i in range(len(rows))] 54 | 55 | while len(clust)>1: 56 | lowestpair=(0,1) 57 | closest=distance(clust[0].vec,clust[1].vec) 58 | 59 | # loop through every pair looking for the smallest distance 60 | for i in range(len(clust)): 61 | for j in range(i+1,len(clust)): 62 | # distances is the cache of distance calculations 63 | if (clust[i].id,clust[j].id) not in distances: 64 | distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) 65 | 66 | d=distances[(clust[i].id,clust[j].id)] 67 | 68 | if d0: 205 | for rowid in bestmatches[i]: 206 | for m in range(len(rows[rowid])): 207 | avgs[m]+=rows[rowid][m] 208 | for j in range(len(avgs)): 209 | avgs[j]/=len(bestmatches[i]) 210 | clusters[i]=avgs 211 | 212 | return bestmatches 213 | 214 | def tanamoto(v1,v2): 215 | c1,c2,shr=0,0,0 216 | 217 | for i in range(len(v1)): 218 | if v1[i]!=0: c1+=1 # in v1 219 | if v2[i]!=0: c2+=1 # in v2 220 | if v1[i]!=0 and v2[i]!=0: shr+=1 # in both 221 | 222 | return 1.0-(float(shr)/(c1+c2-shr)) 223 | 224 | def scaledown(data,distance=pearson,rate=0.01): 225 | n=len(data) 226 | 227 | # The real distances between every pair of items 228 | realdist=[[distance(data[i],data[j]) for j in range(n)] 229 | for i in range(0,n)] 230 | 231 | # Randomly initialize the starting points of the locations in 2D 232 | loc=[[random.random(),random.random()] for i in range(n)] 233 | fakedist=[[0.0 for j in range(n)] for i in range(n)] 234 | 235 | lasterror=None 236 | for m in range(0,1000): 237 | # Find projected distances 238 | for i in range(n): 239 | for j in range(n): 240 | fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 241 | for x in range(len(loc[i]))])) 242 | 243 | # Move points 244 | grad=[[0.0,0.0] for i in range(n)] 245 | 246 | totalerror=0 247 | for k in range(n): 248 | for j in range(n): 249 | if j==k: continue 250 | # The error is percent difference between the distances 251 | errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] 252 | 253 | # Each point needs to be moved away from or towards the other 254 | # point in proportion to how much error it has 255 | grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm 256 | grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm 257 | 258 | # Keep track of the total error 259 | totalerror+=abs(errorterm) 260 | print totalerror 261 | 262 | # If the answer got worse by moving the points, we are done 263 | if lasterror and lasterror0: 198 | tablelist+=',' 199 | clauselist+=' and ' 200 | clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber) 201 | fieldlist+=',w%d.location' % tablenumber 202 | tablelist+='wordlocation w%d' % tablenumber 203 | clauselist+='w%d.wordid=%d' % (tablenumber,wordid) 204 | tablenumber+=1 205 | 206 | # Create the query from the separate parts 207 | fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist) 208 | print fullquery 209 | cur=self.con.execute(fullquery) 210 | rows=[row for row in cur] 211 | 212 | return rows,wordids 213 | 214 | def getscoredlist(self,rows,wordids): 215 | totalscores=dict([(row[0],0) for row in rows]) 216 | 217 | # This is where we'll put our scoring 
functions 218 | weights=[(1.0,self.locationscore(rows)), 219 | (1.0,self.frequencyscore(rows)), 220 | (1.0,self.pagerankscore(rows)), 221 | (1.0,self.linktextscore(rows,wordids)), 222 | (5.0,self.nnscore(rows,wordids))] 223 | for (weight,scores) in weights: 224 | for url in totalscores: 225 | totalscores[url]+=weight*scores[url] 226 | 227 | return totalscores 228 | 229 | def geturlname(self,id): 230 | return self.con.execute( 231 | "select url from urllist where rowid=%d" % id).fetchone()[0] 232 | 233 | def query(self,q): 234 | rows,wordids=self.getmatchrows(q) 235 | scores=self.getscoredlist(rows,wordids) 236 | rankedscores=[(score,url) for (url,score) in scores.items()] 237 | rankedscores.sort() 238 | rankedscores.reverse() 239 | for (score,urlid) in rankedscores[0:10]: 240 | print '%f\t%s' % (score,self.geturlname(urlid)) 241 | return wordids,[r[1] for r in rankedscores[0:10]] 242 | 243 | def normalizescores(self,scores,smallIsBetter=0): 244 | vsmall=0.00001 # Avoid division by zero errors 245 | if smallIsBetter: 246 | minscore=min(scores.values()) 247 | return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()]) 248 | else: 249 | maxscore=max(scores.values()) 250 | if maxscore==0: maxscore=vsmall 251 | return dict([(u,float(c)/maxscore) for (u,c) in scores.items()]) 252 | 253 | def frequencyscore(self,rows): 254 | counts=dict([(row[0],0) for row in rows]) 255 | for row in rows: counts[row[0]]+=1 256 | return self.normalizescores(counts) 257 | 258 | def locationscore(self,rows): 259 | locations=dict([(row[0],1000000) for row in rows]) 260 | for row in rows: 261 | loc=sum(row[1:]) 262 | if loc>> a = pydelicious.apiNew('user', 'passwd') 8 | >>> # or: 9 | >>> a = DeliciousAPI('user', 'passwd') 10 | >>> a.tags_get() # Same as: 11 | >>> a.request('tags/get', ) 12 | 13 | Or by calling the 'convenience' methods on the module. 14 | 15 | - def add(user, passwd, url, description, tags = "", extended = "", dt = "", replace="no"): 16 | - def get(user, passwd, tag="", dt="", count = 0): 17 | - def get_all(user, passwd, tag = ""): 18 | - def delete(user, passwd, url): 19 | - def rename_tag(user, passwd, oldtag, newtag): 20 | - def get_tags(user, passwd): 21 | 22 | >>> a = apiNew(user, passwd) 23 | >>> a.posts_add(url="http://my.com/", desciption="my.com", extended="the url is my.moc", tags="my com") 24 | True 25 | >>> len(a.posts_all()) 26 | 1 27 | >>> get_all(user, passwd) 28 | 1 29 | 30 | This are short functions for getrss calls. 31 | 32 | >>> rss_ 33 | 34 | def get_userposts(user): 35 | def get_tagposts(tag): 36 | def get_urlposts(url): 37 | def get_popular(tag = ""): 38 | 39 | >>> json_posts() 40 | >>> json_tags() 41 | >>> json_network() 42 | >>> json_fans() 43 | 44 | :License: pydelicious is released under the BSD license. See 'license.txt' 45 | for more informations. 46 | 47 | :berend: 48 | - Rewriting comments to english. More documentation, examples. 49 | - Added JSON-like return values for XML data (del.icio.us also serves some JSON...) 50 | - better error/exception classes and handling, work in progress. 51 | - Encoding seems to be working (using UTF-8 here). 52 | 53 | :@todo: 54 | - Source code SHOULD BE ASCII! 55 | - More tests. 56 | - Parse datetimes in XML. 57 | - Salvage and test RSS functionality? 58 | - Setup not used, Still works? Should setup.py be tested? 59 | - API functions need required argument checks. 
60 |
61 | * include the license and also distribute it via setup.py
62 | * also write a readme and distribute it via setup.py
63 | * also test on other systems (linux -> university)
64 | * have releases built automatically, name them correctly and move them into
65 | the right directory.
66 | * what else can the other libraries do? (ruby, java, perl, etc)
67 | * what do the people who use it actually want?
68 | * what could I use it for myself?
69 | * slim it down?
70 |
71 | :done:
72 | * Refactored the API class, much cleaner now and functions dlcs_api_request, dlcs_parse_xml are available for whoever wants them.
73 | * is this correct? the t[a]g should rather still be converted with str2utf8
74 | >>> pydelicious.getrss(tag="t[a]g")
75 | url: http://del.icio.us/rss/tag/t[a]g
76 | * the requester has to wait one second between calls
77 | * __init__.py passes the functions on
78 | * the html parser does not work yet, not at all
79 | * old functions are missing, get_posts_by_url, etc.
80 | * create a post function that also adds the missing attribs.
81 | * the api still needs more work
82 | * the requester has to catch the 503 errors
83 | * the rss parser has to be adapted to the many possible feed variations
84 | """
85 | import sys
86 | import os
87 | import time
88 | import datetime
89 | import md5, httplib
90 | import urllib, urllib2, time
91 | from StringIO import StringIO
92 |
93 | try:
94 |     from elementtree.ElementTree import parse as parse_xml
95 | except ImportError:
96 |     from xml.etree.ElementTree import parse as parse_xml
97 |
98 | import feedparser
99 |
100 |
101 | ### Static config
102 |
103 | __version__ = '0.5.0'
104 | __author__ = 'Frank Timmermann ' # GP: does not respond to emails
105 | __contributors__ = [
106 |     'Greg Pinero',
107 |     'Berend van Berkum ']
108 | __url__ = 'http://code.google.com/p/pydelicious/'
109 | __author_email__ = ""
110 | # Old URL: 'http://deliciouspython.python-hosting.com/'
111 |
112 | __description__ = '''pydelicious.py allows you to access the web service of del.icio.us via its API through python.'''
113 | __long_description__ = '''the goal is to design an easy to use and fully functional python interface to del.icio.us. '''
114 |
115 | DLCS_OK_MESSAGES = ('done', 'ok') # Known text values of positive del.icio.us answers
116 | DLCS_WAIT_TIME = 4
117 | DLCS_REQUEST_TIMEOUT = 444 # Seconds before socket triggers timeout
118 | #DLCS_API_REALM = 'del.icio.us API'
119 | DLCS_API_HOST = 'https://api.del.icio.us'
120 | DLCS_API_PATH = 'v1'
121 | DLCS_API = "%s/%s" % (DLCS_API_HOST, DLCS_API_PATH)
122 | DLCS_RSS = 'http://del.icio.us/rss/'
123 |
124 | ISO_8601_DATETIME = '%Y-%m-%dT%H:%M:%SZ'
125 |
126 | USER_AGENT = 'pydelicious.py/%s %s' % (__version__, __url__)
127 |
128 | DEBUG = 0
129 | if 'DLCS_DEBUG' in os.environ:
130 |     DEBUG = int(os.environ['DLCS_DEBUG'])
131 |
132 |
133 | # Taken from FeedParser.py
134 | # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
135 | # Python 2.3 now has this functionality available in the standard socket library, so under
136 | # 2.3 you don't need to install anything. But you probably should anyway, because the socket
137 | # module is buggy and timeoutsocket is better.
138 | try: 139 | import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py 140 | timeoutsocket.setDefaultSocketTimeout(DLCS_REQUEST_TIMEOUT) 141 | except ImportError: 142 | import socket 143 | if hasattr(socket, 'setdefaulttimeout'): socket.setdefaulttimeout(DLCS_REQUEST_TIMEOUT) 144 | if DEBUG: print >>sys.stderr, "Set socket timeout to %s seconds" % DLCS_REQUEST_TIMEOUT 145 | 146 | 147 | ### Utility classes 148 | 149 | class _Waiter: 150 | """Waiter makes sure a certain amount of time passes between 151 | successive calls of `Waiter()`. 152 | 153 | Some attributes: 154 | :last: time of last call 155 | :wait: the minimum time needed between calls 156 | :waited: the number of calls throttled 157 | 158 | pydelicious.Waiter is an instance created when the module is loaded. 159 | """ 160 | def __init__(self, wait): 161 | self.wait = wait 162 | self.waited = 0 163 | self.lastcall = 0; 164 | 165 | def __call__(self): 166 | tt = time.time() 167 | 168 | timeago = tt - self.lastcall 169 | 170 | if self.lastcall and DEBUG>2: 171 | print >>sys.stderr, "Lastcall: %s seconds ago." % lastcall 172 | 173 | if timeago <= self.wait: 174 | if DEBUG>0: print >>sys.stderr, "Waiting %s seconds." % self.wait 175 | time.sleep(self.wait) 176 | self.waited += 1 177 | self.lastcall = tt + self.wait 178 | else: 179 | self.lastcall = tt 180 | 181 | Waiter = _Waiter(DLCS_WAIT_TIME) 182 | 183 | class PyDeliciousException(Exception): 184 | '''Std. pydelicious error''' 185 | pass 186 | 187 | class DeliciousError(Exception): 188 | """Raised when the server responds with a negative answer""" 189 | 190 | 191 | class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 192 | '''@xxx:bvb: Where is this used? should it be registered somewhere with urllib2? 193 | 194 | Handles HTTP Error, currently only 503. 195 | ''' 196 | def http_error_503(self, req, fp, code, msg, headers): 197 | raise urllib2.HTTPError(req, code, throttled_message, headers, fp) 198 | 199 | 200 | class post(dict): 201 | """Post object, contains href, description, hash, dt, tags, 202 | extended, user, count(, shared). 203 | 204 | @xxx:bvb: Is this needed? Right now this is superfluous, 205 | """ 206 | def __init__(self, href = "", description = "", hash = "", time = "", tag = "", extended = "", user = "", count = "", 207 | tags = "", url = "", dt = ""): # tags or tag? 208 | self["href"] = href 209 | if url != "": self["href"] = url 210 | self["description"] = description 211 | self["hash"] = hash 212 | self["dt"] = dt 213 | if time != "": self["dt"] = time 214 | self["tags"] = tags 215 | if tag != "": self["tags"] = tag # tag or tags? # !! tags 216 | self["extended"] = extended 217 | self["user"] = user 218 | self["count"] = count 219 | 220 | def __getattr__(self, name): 221 | try: return self[name] 222 | except: object.__getattribute__(self, name) 223 | 224 | 225 | class posts(list): 226 | """@xxx:bvb: idem as class post, python structures (dict/list) might 227 | suffice or a more generic solution is needed. 
228 | """ 229 | def __init__(self, *args): 230 | for i in args: self.append(i) 231 | 232 | def __getattr__(self, attr): 233 | try: return [p[attr] for p in self] 234 | except: object.__getattribute__(self, attr) 235 | 236 | ### Utility functions 237 | 238 | def str2uni(s): 239 | # type(in) str or unicode 240 | # type(out) unicode 241 | return ("".join([unichr(ord(i)) for i in s])) 242 | 243 | def str2utf8(s): 244 | # type(in) str or unicode 245 | # type(out) str 246 | return ("".join([unichr(ord(i)).encode("utf-8") for i in s])) 247 | 248 | def str2quote(s): 249 | return urllib.quote_plus("".join([unichr(ord(i)).encode("utf-8") for i in s])) 250 | 251 | def dict0(d): 252 | # Trims empty dict entries 253 | # {'a':'a', 'b':'', 'c': 'c'} => {'a': 'a', 'c': 'c'} 254 | dd = dict() 255 | for i in d: 256 | if d[i] != "": dd[i] = d[i] 257 | return dd 258 | 259 | def delicious_datetime(str): 260 | """Parse a ISO 8601 formatted string to a Python datetime ... 261 | """ 262 | return datetime.datetime(*time.strptime(str, ISO_8601_DATETIME)[0:6]) 263 | 264 | def http_request(url, user_agent=USER_AGENT, retry=4): 265 | """Retrieve the contents referenced by the URL using urllib2. 266 | 267 | Retries up to four times (default) on exceptions. 268 | """ 269 | request = urllib2.Request(url, headers={'User-Agent':user_agent}) 270 | 271 | # Remember last error 272 | e = None 273 | 274 | # Repeat request on time-out errors 275 | tries = retry; 276 | while tries: 277 | try: 278 | return urllib2.urlopen(request) 279 | 280 | except urllib2.HTTPError, e: # protocol errors, 281 | raise PyDeliciousException, "%s" % e 282 | 283 | except urllib2.URLError, e: 284 | # @xxx: Ugly check for time-out errors 285 | #if len(e)>0 and 'timed out' in arg[0]: 286 | print >> sys.stderr, "%s, %s tries left." % (e, tries) 287 | Waiter() 288 | tries = tries - 1 289 | #else: 290 | # tries = None 291 | 292 | # Give up 293 | raise PyDeliciousException, \ 294 | "Unable to retrieve data at '%s', %s" % (url, e) 295 | 296 | def http_auth_request(url, host, user, passwd, user_agent=USER_AGENT): 297 | """Call an HTTP server with authorization credentials using urllib2. 298 | """ 299 | if DEBUG: httplib.HTTPConnection.debuglevel = 1 300 | 301 | # Hook up handler/opener to urllib2 302 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 303 | password_manager.add_password(None, host, user, passwd) 304 | auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) 305 | opener = urllib2.build_opener(auth_handler) 306 | urllib2.install_opener(opener) 307 | 308 | return http_request(url, user_agent) 309 | 310 | def dlcs_api_request(path, params='', user='', passwd='', throttle=True): 311 | """Retrieve/query a path within the del.icio.us API. 312 | 313 | This implements a minimum interval between calls to avoid 314 | throttling. [#]_ Use param 'throttle' to turn this behaviour off. 315 | 316 | @todo: back off on 503's (HTTPError, URLError? @todo: testing). 317 | 318 | Returned XML does not always correspond with given del.icio.us examples 319 | @todo: (cf. help/api/... and post's attributes) 320 | 321 | .. 
[#] http://del.icio.us/help/api/ 322 | """ 323 | if throttle: 324 | Waiter() 325 | 326 | if params: 327 | # params come as a dict, strip empty entries and urlencode 328 | url = "%s/%s?%s" % (DLCS_API, path, urllib.urlencode(dict0(params))) 329 | else: 330 | url = "%s/%s" % (DLCS_API, path) 331 | 332 | if DEBUG: print >>sys.stderr, "dlcs_api_request: %s" % url 333 | 334 | try: 335 | return http_auth_request(url, DLCS_API_HOST, user, passwd, USER_AGENT) 336 | 337 | # @bvb: Is this ever raised? When? 338 | except DefaultErrorHandler, e: 339 | print >>sys.stderr, "%s" % e 340 | 341 | def dlcs_parse_xml(data, split_tags=False): 342 | """Parse any del.icio.us XML document and return Python data structure. 343 | 344 | Recognizes all XML document formats as returned by the version 1 API and 345 | translates to a JSON-like data structure (dicts 'n lists). 346 | 347 | Returned instance is always a dictionary. Examples:: 348 | 349 | {'posts': [{'url':'...','hash':'...',},],} 350 | {'tags':['tag1', 'tag2',]} 351 | {'dates': [{'count':'...','date':'...'},], 'tag':'', 'user':'...'} 352 | {'result':(True, "done")} 353 | # etcetera. 354 | """ 355 | 356 | if DEBUG>3: print >>sys.stderr, "dlcs_parse_xml: parsing from ", data 357 | 358 | if not hasattr(data, 'read'): 359 | data = StringIO(data) 360 | 361 | doc = parse_xml(data) 362 | root = doc.getroot() 363 | fmt = root.tag 364 | 365 | # Split up into three cases: Data, Result or Update 366 | if fmt in ('tags', 'posts', 'dates', 'bundles'): 367 | 368 | # Data: expect a list of data elements, 'resources'. 369 | # Use `fmt` (without last 's') to find data elements, elements 370 | # don't have contents, attributes contain all the data we need: 371 | # append to list 372 | elist = [el.attrib for el in doc.findall(fmt[:-1])] 373 | 374 | # Return list in dict, use tagname of rootnode as keyname. 375 | data = {fmt: elist} 376 | 377 | # Root element might have attributes too, append dict. 378 | data.update(root.attrib) 379 | 380 | return data 381 | 382 | elif fmt == 'result': 383 | 384 | # Result: answer to operations 385 | if root.attrib.has_key('code'): 386 | msg = root.attrib['code'] 387 | else: 388 | msg = root.text 389 | 390 | # Return {'result':(True, msg)} for /known/ O.K. messages, 391 | # use (False, msg) otherwise 392 | v = msg in DLCS_OK_MESSAGES 393 | return {fmt: (v, msg)} 394 | 395 | elif fmt == 'update': 396 | 397 | # Update: "time" 398 | #return {fmt: root.attrib} 399 | return {fmt: {'time':time.strptime(root.attrib['time'], ISO_8601_DATETIME)}} 400 | 401 | else: 402 | raise PyDeliciousException, "Unknown XML document format '%s'" % fmt 403 | 404 | def dlcs_rss_request(tag = "", popular = 0, user = "", url = ''): 405 | """Handle a request for RSS 406 | 407 | @todo: translate from German 408 | 409 | rss sollte nun wieder funktionieren, aber diese try, except scheisse ist so nicht schoen 410 | 411 | rss wird unterschiedlich zusammengesetzt. ich kann noch keinen einheitlichen zusammenhang 412 | zwischen daten (url, desc, ext, usw) und dem feed erkennen. warum k[o]nnen die das nicht einheitlich machen? 
413 | """ 414 | tag = str2quote(tag) 415 | user = str2quote(user) 416 | if url != '': 417 | # http://del.icio.us/rss/url/efbfb246d886393d48065551434dab54 418 | url = DLCS_RSS + '''url/%s'''%md5.new(url).hexdigest() 419 | elif user != '' and tag != '': 420 | url = DLCS_RSS + '''%(user)s/%(tag)s'''%dict(user=user, tag=tag) 421 | elif user != '' and tag == '': 422 | # http://del.icio.us/rss/delpy 423 | url = DLCS_RSS + '''%s'''%user 424 | elif popular == 0 and tag == '': 425 | url = DLCS_RSS 426 | elif popular == 0 and tag != '': 427 | # http://del.icio.us/rss/tag/apple 428 | # http://del.icio.us/rss/tag/web2.0 429 | url = DLCS_RSS + "tag/%s"%tag 430 | elif popular == 1 and tag == '': 431 | url = DLCS_RSS + '''popular/''' 432 | elif popular == 1 and tag != '': 433 | url = DLCS_RSS + '''popular/%s'''%tag 434 | rss = http_request(url).read() 435 | rss = feedparser.parse(rss) 436 | # print rss 437 | # for e in rss.entries: print e;print 438 | l = posts() 439 | for e in rss.entries: 440 | if e.has_key("links") and e["links"]!=[] and e["links"][0].has_key("href"): 441 | url = e["links"][0]["href"] 442 | elif e.has_key("link"): 443 | url = e["link"] 444 | elif e.has_key("id"): 445 | url = e["id"] 446 | else: 447 | url = "" 448 | if e.has_key("title"): 449 | description = e['title'] 450 | elif e.has_key("title_detail") and e["title_detail"].has_key("title"): 451 | description = e["title_detail"]['value'] 452 | else: 453 | description = '' 454 | try: tags = e['categories'][0][1] 455 | except: 456 | try: tags = e["category"] 457 | except: tags = "" 458 | if e.has_key("modified"): 459 | dt = e['modified'] 460 | else: 461 | dt = "" 462 | if e.has_key("summary"): 463 | extended = e['summary'] 464 | elif e.has_key("summary_detail"): 465 | e['summary_detail']["value"] 466 | else: 467 | extended = "" 468 | if e.has_key("author"): 469 | user = e['author'] 470 | else: 471 | user = "" 472 | # time = dt ist weist auf ein problem hin 473 | # die benennung der variablen ist nicht einheitlich 474 | # api senden und 475 | # xml bekommen sind zwei verschiedene schuhe :( 476 | l.append(post(url = url, description = description, tags = tags, dt = dt, extended = extended, user = user)) 477 | return l 478 | 479 | 480 | ### Main module class 481 | 482 | class DeliciousAPI: 483 | """Class providing main interace to del.icio.us API. 484 | 485 | Methods ``request`` and ``request_raw`` represent the core. For all API 486 | paths there are furthermore methods (e.g. posts_add for 'posts/all') with 487 | an explicit declaration of the parameters and documentation. These all call 488 | ``request`` and pass on extra keywords like ``_raw``. 489 | """ 490 | 491 | def __init__(self, user, passwd, codec='iso-8859-1', api_request=dlcs_api_request, xml_parser=dlcs_parse_xml): 492 | """Initialize access to the API with ``user`` and ``passwd``. 493 | 494 | ``codec`` sets the encoding of the arguments. 495 | 496 | The ``api_request`` and ``xml_parser`` parameters by default point to 497 | functions within this package with standard implementations to 498 | request and parse a resource. See ``dlcs_api_request()`` and 499 | ``dlcs_parse_xml()``. Note that ``api_request`` should return a 500 | file-like instance with an HTTPMessage instance under ``info()``, 501 | see ``urllib2.openurl`` for more info. 
502 | """ 503 | assert user != "" 504 | self.user = user 505 | self.passwd = passwd 506 | self.codec = codec 507 | 508 | # Implement communication to server and parsing of respons messages: 509 | assert callable(api_request) 510 | self._api_request = api_request 511 | assert callable(xml_parser) 512 | self._parse_response = xml_parser 513 | 514 | def _call_server(self, path, **params): 515 | params = dict0(params) 516 | for key in params: 517 | params[key] = params[key].encode(self.codec) 518 | 519 | # see __init__ for _api_request() 520 | return self._api_request(path, params, self.user, self.passwd) 521 | 522 | 523 | ### Core functionality 524 | 525 | def request(self, path, _raw=False, **params): 526 | """Calls a path in the API, parses the answer to a JSON-like structure by 527 | default. Use with ``_raw=True`` or ``call request_raw()`` directly to 528 | get the filehandler and process the response message manually. 529 | 530 | Calls to some paths will return a `result` message, i.e.:: 531 | 532 | 533 | 534 | or:: 535 | 536 | ... 537 | 538 | These are all parsed to ``{'result':(Boolean, MessageString)}`` and this 539 | method will raise ``DeliciousError`` on negative `result` answers. Using 540 | ``_raw=True`` bypasses all parsing and will never raise ``DeliciousError``. 541 | 542 | See ``dlcs_parse_xml()`` and ``self.request_raw()``.""" 543 | 544 | # method _parse_response is bound in `__init__()`, `_call_server` 545 | # uses `_api_request` also set in `__init__()` 546 | if _raw: 547 | # return answer 548 | return self.request_raw(path, **params) 549 | 550 | else: 551 | # get answer and parse 552 | fl = self._call_server(path, **params) 553 | rs = self._parse_response(fl) 554 | 555 | # Raise an error for negative 'result' answers 556 | if type(rs) == dict and rs == 'result' and not rs['result'][0]: 557 | errmsg = "" 558 | if len(rs['result'])>0: 559 | errmsg = rs['result'][1:] 560 | raise DeliciousError, errmsg 561 | 562 | return rs 563 | 564 | def request_raw(self, path, **params): 565 | """Calls the path in the API, returns the filehandle. Returned 566 | file-like instances have an ``HTTPMessage`` instance with HTTP header 567 | information available. Use ``filehandle.info()`` or refer to the 568 | ``urllib2.openurl`` documentation. 569 | """ 570 | # see `request()` on how the response can be handled 571 | return self._call_server(path, **params) 572 | 573 | ### Explicit declarations of API paths, their parameters and docs 574 | 575 | # Tags 576 | def tags_get(self, **kwds): 577 | """Returns a list of tags and the number of times it is used by the user. 578 | :: 579 | 580 | 581 | 582 | """ 583 | return self.request("tags/get", **kwds) 584 | 585 | def tags_rename(self, old, new, **kwds): 586 | """Rename an existing tag with a new tag name. Returns a `result` 587 | message or raises an ``DeliciousError``. See ``self.request()``. 588 | 589 | &old (required) 590 | Tag to rename. 591 | &new (required) 592 | New name. 593 | """ 594 | return self.request("tags/rename", old=old, new=new, **kwds) 595 | 596 | # Posts 597 | def posts_update(self, **kwds): 598 | """Returns the last update time for the user. Use this before calling 599 | `posts_all` to see if the data has changed since the last fetch. 600 | :: 601 | 602 | 603 | """ 604 | return self.request("posts/update", **kwds) 605 | 606 | def posts_dates(self, tag="", **kwds): 607 | """Returns a list of dates with the number of posts at each date. 608 | :: 609 | 610 | 611 | 612 | 613 | &tag (optional). 614 | Filter by this tag. 
615 | """ 616 | return self.request("posts/dates", tag=tag, **kwds) 617 | 618 | def posts_get(self, tag="", dt="", url="", **kwds): 619 | """Returns posts matching the arguments. If no date or url is given, 620 | most recent date will be used. 621 | :: 622 | 623 | 624 | 625 | 626 | &tag (optional). 627 | Filter by this tag. 628 | &dt (optional). 629 | Filter by this date (CCYY-MM-DDThh:mm:ssZ). 630 | &url (optional). 631 | Filter by this url. 632 | """ 633 | return self.request("posts/get", tag=tag, dt=dt, url=url, **kwds) 634 | 635 | def posts_recent(self, tag="", count="", **kwds): 636 | """Returns a list of the most recent posts, filtered by argument. 637 | :: 638 | 639 | 640 | 641 | 642 | &tag (optional). 643 | Filter by this tag. 644 | &count (optional). 645 | Number of items to retrieve (Default:15, Maximum:100). 646 | """ 647 | return self.request("posts/recent", tag=tag, count=count, **kwds) 648 | 649 | def posts_all(self, tag="", **kwds): 650 | """Returns all posts. Please use sparingly. Call the `posts_update` 651 | method to see if you need to fetch this at all. 652 | :: 653 | 654 | 655 | 656 | 657 | &tag (optional). 658 | Filter by this tag. 659 | """ 660 | return self.request("posts/all", tag=tag, **kwds) 661 | 662 | def posts_add(self, url, description, extended="", tags="", dt="", 663 | replace="no", shared="yes", **kwds): 664 | """Add a post to del.icio.us. Returns a `result` message or raises an 665 | ``DeliciousError``. See ``self.request()``. 666 | 667 | &url (required) 668 | the url of the item. 669 | &description (required) 670 | the description of the item. 671 | &extended (optional) 672 | notes for the item. 673 | &tags (optional) 674 | tags for the item (space delimited). 675 | &dt (optional) 676 | datestamp of the item (format "CCYY-MM-DDThh:mm:ssZ"). 677 | 678 | Requires a LITERAL "T" and "Z" like in ISO8601 at http://www.cl.cam.ac.uk/~mgk25/iso-time.html for example: "1984-09-01T14:21:31Z" 679 | &replace=no (optional) - don't replace post if given url has already been posted. 680 | &shared=no (optional) - make the item private 681 | """ 682 | return self.request("posts/add", url=url, description=description, 683 | extended=extended, tags=tags, dt=dt, 684 | replace=replace, shared=shared, **kwds) 685 | 686 | def posts_delete(self, url, **kwds): 687 | """Delete a post from del.icio.us. Returns a `result` message or 688 | raises an ``DeliciousError``. See ``self.request()``. 689 | 690 | &url (required) 691 | the url of the item. 692 | """ 693 | return self.request("posts/delete", url=url, **kwds) 694 | 695 | # Bundles 696 | def bundles_all(self, **kwds): 697 | """Retrieve user bundles from del.icio.us. 698 | :: 699 | 700 | 701 | 702 | """ 703 | return self.request("tags/bundles/all", **kwds) 704 | 705 | def bundles_set(self, bundle, tags, **kwds): 706 | """Assign a set of tags to a single bundle, wipes away previous 707 | settings for bundle. Returns a `result` messages or raises an 708 | ``DeliciousError``. See ``self.request()``. 709 | 710 | &bundle (required) 711 | the bundle name. 712 | &tags (required) 713 | list of tags (space seperated). 714 | """ 715 | if type(tags)==list: 716 | tags = " ".join(tags) 717 | return self.request("tags/bundles/set", bundle=bundle, tags=tags, 718 | **kwds) 719 | 720 | def bundles_delete(self, bundle, **kwds): 721 | """Delete a bundle from del.icio.us. Returns a `result` message or 722 | raises an ``DeliciousError``. See ``self.request()``. 723 | 724 | &bundle (required) 725 | the bundle name. 
726 | """ 727 | return self.request("tags/bundles/delete", bundle=bundle, **kwds) 728 | 729 | ### Utils 730 | 731 | # Lookup table for del.icio.us url-path to DeliciousAPI method. 732 | paths = { 733 | 'tags/get': tags_get, 734 | 'tags/rename': tags_rename, 735 | 'posts/update': posts_update, 736 | 'posts/dates': posts_dates, 737 | 'posts/get': posts_get, 738 | 'posts/recent': posts_recent, 739 | 'posts/all': posts_all, 740 | 'posts/add': posts_add, 741 | 'posts/delete': posts_delete, 742 | 'tags/bundles/all': bundles_all, 743 | 'tags/bundles/set': bundles_set, 744 | 'tags/bundles/delete': bundles_delete, 745 | } 746 | 747 | def get_url(self, url): 748 | """Return the del.icio.us url at which the HTML page with posts for 749 | ``url`` can be found. 750 | """ 751 | return "http://del.icio.us/url/?url=%s" % (url,) 752 | 753 | 754 | ### Convenience functions on this package 755 | 756 | def apiNew(user, passwd): 757 | """creates a new DeliciousAPI object. 758 | requires user(name) and passwd 759 | """ 760 | return DeliciousAPI(user=user, passwd=passwd) 761 | 762 | def add(user, passwd, url, description, tags="", extended="", dt="", replace="no"): 763 | return apiNew(user, passwd).posts_add(url=url, description=description, extended=extended, tags=tags, dt=dt, replace=replace) 764 | 765 | def get(user, passwd, tag="", dt="", count = 0): 766 | posts = apiNew(user, passwd).posts_get(tag=tag,dt=dt) 767 | if count != 0: posts = posts[0:count] 768 | return posts 769 | 770 | def get_all(user, passwd, tag=""): 771 | return apiNew(user, passwd).posts_all(tag=tag) 772 | 773 | def delete(user, passwd, url): 774 | return apiNew(user, passwd).posts_delete(url=url) 775 | 776 | def rename_tag(user, passwd, oldtag, newtag): 777 | return apiNew(user=user, passwd=passwd).tags_rename(old=oldtag, new=newtag) 778 | 779 | def get_tags(user, passwd): 780 | return apiNew(user=user, passwd=passwd).tags_get() 781 | 782 | 783 | ### RSS functions @bvb: still working...? 
784 | def getrss(tag="", popular=0, url='', user=""):
785 |     """get posts from del.icio.us via parsing RSS @bvb[or HTML]
786 |
787 |     @bvb[not tested]
788 |
789 |     tag (opt) filter by tag
790 |     popular (opt) look for the popular stuff
791 |     user (opt) get the posts by a user, this overrides popular
792 |     url (opt) get the posts by url
793 |     """
794 |     return dlcs_rss_request(tag=tag, popular=popular, user=user, url=url)
795 |
796 | def get_userposts(user):
797 |     return getrss(user = user)
798 |
799 | def get_tagposts(tag):
800 |     return getrss(tag = tag)
801 |
802 | def get_urlposts(url):
803 |     return getrss(url = url)
804 |
805 | def get_popular(tag = ""):
806 |     return getrss(tag = tag, popular = 1)
807 |
808 |
809 | ### @TODO: implement JSON fetching
810 | def json_posts(user, count=15):
811 |     """http://del.icio.us/feeds/json/mpe
812 |     http://del.icio.us/feeds/json/mpe/art+history
813 |     count=### the number of posts you want to get (default is 15, maximum is 100)
814 |     raw a raw JSON object is returned, instead of an object named Delicious.posts
815 |     """
816 |
817 | def json_tags(user, atleast, count, sort='alpha'):
818 |     """http://del.icio.us/feeds/json/tags/mpe
819 |     atleast=### include only tags for which there are at least ### number of posts
820 |     count=### include ### tags, counting down from the top
821 |     sort={alpha|count} construct the object with tags in alphabetic order (alpha), or by count of posts (count)
822 |     callback=NAME wrap the object definition in a function call NAME(...), thus invoking that function when the feed is executed
823 |     raw a pure JSON object is returned, instead of code that will construct an object named Delicious.tags
824 |     """
825 |
826 | def json_network(user):
827 |     """http://del.icio.us/feeds/json/network/mpe
828 |     callback=NAME wrap the object definition in a function call NAME(...)
829 |     ?raw a raw JSON object is returned, instead of an object named Delicious.posts
830 |     """
831 |
832 | def json_fans(user):
833 |     """http://del.icio.us/feeds/json/fans/mpe
834 |     callback=NAME wrap the object definition in a function call NAME(...)
835 |     ?raw a pure JSON object is returned, instead of an object named Delicious.
836 |     """
837 |
838 |
--------------------------------------------------------------------------------
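A minimal end-to-end sketch of the convenience layer documented above, written in the same Python 2 style as the module. The credentials, the example URL and the tags are placeholders, and it assumes the del.icio.us v1 endpoints configured in DLCS_API/DLCS_RSS still answer:

    import pydelicious

    api = pydelicious.apiNew('user', 'passwd')
    # posts/add answers with a result message, parsed to {'result': (True, 'done')};
    # a negative answer should raise DeliciousError (see DeliciousAPI.request())
    api.posts_add(url='http://example.org/', description='an example bookmark',
                  tags='example test')
    recent = api.posts_recent(tag='example', count='10')
    for p in recent['posts']:
        print p['href'], p['description']

    # the unauthenticated helpers read the public RSS feeds instead
    for p in pydelicious.get_popular(tag='python')[0:5]:
        print p['href'], p['tags']

Note that query parameters are passed as strings (count='10'), since DeliciousAPI._call_server() encodes each value with the configured codec before urlencoding.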