├── Bayes ├── docclass.py ├── feedfilter.py ├── python_feed.db ├── python_search.xml ├── rss.json ├── rss.py ├── rss1.json ├── test.db └── 帮助.txt ├── KNN ├── ebaypredict.py ├── numpredict.py ├── optimization.py └── 帮助.txt ├── LR ├── data.txt ├── softmax回归.py ├── 帮助.txt ├── 线性回归.py ├── 逻辑回归.py └── 逻辑回归——没有onehot.py ├── README.md ├── Recommend ├── recommendations.py └── 帮助.txt ├── SVM ├── agesonly.csv ├── facebook.py ├── linearclassify.py ├── matchmaker.csv ├── svm-digits.py ├── svm-simple.py ├── svm-smo.py ├── svm-svc.py ├── svm.py ├── svmMLiA.py ├── testSet.txt ├── testSetRBF.txt ├── testSetRBF2.txt ├── 帮助.txt └── 核方法.py ├── Search-Engines ├── nn.py ├── searchengine.py ├── spyder.py └── 帮助.txt ├── adaboost └── 帮助 ├── association ├── Apriori.py ├── fpGrowth.py └── 帮助.txt ├── cluster ├── DBSCAN.py ├── DBSCAN_data.txt ├── Thumbs.db ├── blogclust.jpg ├── blogdata.txt ├── blogs2d.jpg ├── clusters.py ├── downloadrss.py ├── feedlist.txt ├── generatefeedvector.py ├── wordclust.jpg ├── zebo.txt └── 帮助.txt ├── decision-tree ├── Thumbs.db ├── addresslist.txt ├── hotornot.py ├── treepredict.py ├── treeview.jpg ├── zillow.py └── 帮助.txt ├── ensemble-learning ├── ROC.py ├── adaboost.py ├── adaboost │ ├── ROC.py │ ├── adaboost.py │ ├── horseColicTest2.txt │ ├── horseColicTraining2.txt │ ├── horse_adaboost.py │ ├── sklearn_adaboost.py │ └── 帮助.txt ├── horseColicTest2.txt ├── horseColicTraining2.txt ├── horse_adaboost.py ├── sklearn_adaboost.py └── 帮助.txt ├── feature-extraction ├── Thumbs.db ├── articles.txt ├── clusters.py ├── docclass.py ├── features.txt ├── newsfeatures.py ├── nmf.py ├── nnmf.py ├── stockfeatures.txt ├── stockvolume.py └── 帮助.txt └── optimization ├── dorm.py ├── optimization.py ├── schedule.txt ├── socialnetwork.py └── 帮助.txt /Bayes/feedfilter.py: -------------------------------------------------------------------------------- 1 | # 利用分类器,应用到过滤博客订阅源 2 | import feedparser 3 | import re 4 | import docclass 5 | 6 | # 接受一个博客订阅源的url文件名并对内容项进行分类 7 | def read(feedfile,classifier): 8 | # 得到订阅源的内容项并遍历循环 9 | f=feedparser.parse(feedfile) 10 | for entry in f['entries']: 11 | print 12 | print('-----') 13 | # 将内容项打印输出 14 | print('Title: '+entry['title']) 15 | print('Publisher: '+entry['publisher']) 16 | print 17 | print(entry['summary']) 18 | 19 | 20 | # 将所有文本组合在一起,为分类器构建一个内容项 21 | fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary']) 22 | 23 | # 将当前分类的最佳推测结果打印输出 24 | print('Guess: '+str(classifier.classify(fulltext))) 25 | 26 | # 请求用户给出正确分类,并据此进行训练 27 | if(entry.has_key('cat') and entry['cat']!=None): 28 | classifier.train(fulltext, entry['cat']) 29 | else: 30 | cl=input('Enter category: ') 31 | classifier.train(fulltext,cl) 32 | 33 | 34 | # 对特征检测的改进。新的特征提取函数。更关注文章的名称、摘要、作者介绍 35 | def entryfeatures(entry): 36 | splitter=re.compile('\\W*') 37 | features={} 38 | 39 | # 提取标题中的单词并进行标识 40 | titlewords=[s.lower() for s in splitter.split(entry['title']) if len(s)>2 and len(s)<20] 41 | for w in titlewords: features['Title:'+w]=1 42 | 43 | # 提取摘要中单词 44 | summarywords=[s.lower() for s in splitter.split(entry['summary']) if len(s)>2 and len(s)<20] 45 | 46 | # 统计大写单词 47 | uc=0 48 | for i in range(len(summarywords)): 49 | w=summarywords[i] 50 | features[w]=1 51 | if w.isupper(): uc+=1 52 | 53 | # 将从摘要中获得词组作为特征 54 | if i0.3: features['UPPERCASE']=1 63 | 64 | return features 65 | 66 | 67 | if __name__=="__main__": #只有在执行当前模块时才会运行此函数 68 | # 对博客文章进行分类和训练 69 | cl=docclass.fisherclassifier(docclass.getwords) 70 | cl.setdb('python_feed.db') 71 | read('python_search.xml',cl) 72 | 73 | # 
使用改进的特征提取函数对文章分类进行处理 74 | cl = docclass.fisherclassifier(entryfeatures) 75 | cl.setdb('python_feed.db') 76 | read('python_search.xml', cl) 77 | 78 | -------------------------------------------------------------------------------- /Bayes/python_feed.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/Bayes/python_feed.db -------------------------------------------------------------------------------- /Bayes/rss.py: -------------------------------------------------------------------------------- 1 | # 获取百度新闻rss数据集,用于文本分类 2 | import urllib 3 | import re 4 | from bs4 import BeautifulSoup 5 | import json 6 | import io 7 | 8 | feedlist=[ 9 | 'http://news.baidu.com/n?cmd=1&class=civilnews&tn=rss&sub=0', #国内焦点 10 | 'http://news.baidu.com/n?cmd=1&class=shizheng&tn=rss&sub=0', #时政焦点 11 | 'http://news.baidu.com/n?cmd=1&class=gangaotai&tn=rss&sub=0', #港澳台焦点 12 | 'http://news.baidu.com/n?cmd=1&class=internews&tn=rss&sub=0', #国际焦点 13 | 'http://news.baidu.com/n?cmd=1&class=mil&tn=rss&sub=0', #军事焦点 14 | 'http://news.baidu.com/n?cmd=1&class=hqsy&tn=rss&sub=0', #环球视野焦点 15 | 'http://news.baidu.com/n?cmd=1&class=finannews&tn=rss&sub=0', #财经焦点 16 | 'http://news.baidu.com/n?cmd=1&class=stock&tn=rss&sub=0', #股票焦点 17 | 'http://news.baidu.com/n?cmd=1&class=money&tn=rss&sub=0', #理财焦点 18 | 'http://news.baidu.com/n?cmd=1&class=financialnews&tn=rss&sub=0', #金融观察焦点 19 | 'http://news.baidu.com/n?cmd=1&class=internet&tn=rss&sub=0', #互联网焦点 20 | 'http://news.baidu.com/n?cmd=1&class=rwdt&tn=rss&sub=0', #人物动态焦点 21 | 'http://news.baidu.com/n?cmd=1&class=gsdt&tn=rss&sub=0', #公司动态焦点 22 | 'http://news.baidu.com/n?cmd=1&class=housenews&tn=rss&sub=0', #房产焦点 23 | 'http://news.baidu.com/n?cmd=1&class=gddt&tn=rss&sub=0', #各地动态焦点 24 | 'http://news.baidu.com/n?cmd=1&class=zcfx&tn=rss&sub=0', #政策风向焦点 25 | 'http://news.baidu.com/n?cmd=1&class=fitment&tn=rss&sub=0', #家居焦点 26 | 'http://news.baidu.com/n?cmd=1&class=autonews&tn=rss&sub=0', #汽车焦点 27 | 'http://news.baidu.com/n?cmd=1&class=autobuy&tn=rss&sub=0', #新车导购焦点 28 | 'http://news.baidu.com/n?cmd=1&class=autoreview&tn=rss&sub=0', #试驾焦点 29 | 'http://news.baidu.com/n?cmd=1&class=sportnews&tn=rss&sub=0', #体育焦点 30 | 'http://news.baidu.com/n?cmd=1&class=nba&tn=rss&sub=0', #NBA焦点 31 | 'http://news.baidu.com/n?cmd=1&class=worldsoccer&tn=rss&sub=0', #国际足球焦点 32 | 'http://news.baidu.com/n?cmd=1&class=chinasoccer&tn=rss&sub=0', #国内足球焦点 33 | 'http://news.baidu.com/n?cmd=1&class=cba&tn=rss&sub=0', #国内篮球焦点 34 | 'http://news.baidu.com/n?cmd=1&class=othersports&tn=rss&sub=0', #综合体育焦点 35 | 'http://news.baidu.com/n?cmd=1&class=olympic&tn=rss&sub=0', #奥运焦点 36 | 'http://news.baidu.com/n?cmd=1&class=enternews&tn=rss&sub=0', #娱乐焦点 37 | 'http://news.baidu.com/n?cmd=1&class=star&tn=rss&sub=0', #明星焦点 38 | 'http://news.baidu.com/n?cmd=1&class=film&tn=rss&sub=0', #电影焦点 39 | 'http://news.baidu.com/n?cmd=1&class=tv&tn=rss&sub=0', #电视焦点 40 | 'http://news.baidu.com/n?cmd=1&class=music&tn=rss&sub=0', #音乐焦点 41 | 'http://news.baidu.com/n?cmd=1&class=gamenews&tn=rss&sub=0', #游戏焦点 42 | 'http://news.baidu.com/n?cmd=1&class=netgames&tn=rss&sub=0', #网络游戏焦点 43 | 'http://news.baidu.com/n?cmd=1&class=tvgames&tn=rss&sub=0', #电视游戏焦点 44 | 'http://news.baidu.com/n?cmd=1&class=edunews&tn=rss&sub=0', #教育焦点 45 | 'http://news.baidu.com/n?cmd=1&class=exams&tn=rss&sub=0', #考试焦点 46 | 'http://news.baidu.com/n?cmd=1&class=abroad&tn=rss&sub=0', #留学焦点 47 | 
'http://news.baidu.com/n?cmd=1&class=healthnews&tn=rss&sub=0', #健康焦点 48 | 'http://news.baidu.com/n?cmd=1&class=baojian&tn=rss&sub=0', #保健养生焦点 49 | 'http://news.baidu.com/n?cmd=1&class=yiyao&tn=rss&sub=0', #寻医问药焦点 50 | 'http://news.baidu.com/n?cmd=1&class=technnews&tn=rss&sub=0', #科技焦点 51 | 'http://news.baidu.com/n?cmd=1&class=mobile&tn=rss&sub=0', #手机焦点 52 | 'http://news.baidu.com/n?cmd=1&class=digi&tn=rss&sub=0', #数码焦点 53 | 'http://news.baidu.com/n?cmd=1&class=computer&tn=rss&sub=0', #电脑焦点 54 | 'http://news.baidu.com/n?cmd=1&class=discovery&tn=rss&sub=0', #科普焦点 55 | 'http://news.baidu.com/n?cmd=1&class=socianews&tn=rss&sub=0', #社会焦点 56 | 'http://news.baidu.com/n?cmd=1&class=shyf&tn=rss&sub=0', #社会与法焦点 57 | 'http://news.baidu.com/n?cmd=1&class=shwx&tn=rss&sub=0', #社会万象焦点 58 | 'http://news.baidu.com/n?cmd=1&class=zqsk&tn=rss&sub=0', #真情时刻焦点 59 | ] 60 | 61 | def getrss1(feedlist): 62 | for url in feedlist: 63 | info={} 64 | info[url]={ 65 | 'title':'', 66 | 'allitem':[] 67 | } 68 | try: 69 | response=urllib.request.urlopen(url) 70 | text = str(response.read(), encoding='utf-8') 71 | soup = BeautifulSoup(text, 'lxml') 72 | title = soup.title 73 | info[url]['title']=title 74 | for item in soup('item'): 75 | try: 76 | print(item) 77 | suburl={ 78 | 'title':item('title').replace(']]>','').replace('', '').replace('', '').replace('',''), 82 | 'type':title 83 | } 84 | print(suburl) 85 | info[url]['allitem'].append(suburl) 86 | except: 87 | print('无法匹配'+item) 88 | except: 89 | print("error: %s" % url) 90 | 91 | 92 | def getrss(feedlist): 93 | rss = {} 94 | 95 | for url in feedlist: 96 | rss[url] = { 97 | 'title': '', 98 | 'allitem': [] 99 | } 100 | try: 101 | response = urllib.request.urlopen(url) 102 | text = str(response.read(), encoding='utf-8') 103 | soup = BeautifulSoup(text, 'lxml') 104 | title = soup.title.get_text() 105 | rss[url]['title'] = title 106 | patterstr = r'.*?' \ 107 | r'(.*?).*?' \ 108 | r'(.*?).*?' \ 109 | r'(.*?).*?' \ 110 | r'.*?
(.*?)' 112 | pattern = re.compile(patterstr,re.S) #使用多行模式 113 | results = re.findall(pattern, text) #如何查询多次 114 | 115 | if results!=None or len(results)==0: 116 | for result in results: 117 | suburl = { 118 | 'title': result[0].replace(']]>', '').replace('', '').replace('', '').replace('', ''), 122 | 'type': title 123 | } 124 | print(suburl) 125 | rss[url]['allitem'].append(suburl) 126 | except: 127 | print("error: %s" % url) 128 | 129 | return rss 130 | 131 | 132 | # 形成一个文本描述和分类的数据集。 133 | if __name__ == '__main__': 134 | rss = getrss(feedlist) 135 | jsonstr = json.dumps(rss,ensure_ascii=False) 136 | f = io.open('rss.json', 'w', encoding='utf-8') 137 | f.writelines(jsonstr) 138 | f.close() 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /Bayes/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/Bayes/test.db -------------------------------------------------------------------------------- /Bayes/帮助.txt: -------------------------------------------------------------------------------- 1 | 贝叶斯算法的demo 2 | -------------------------------------------------------------------------------- /KNN/ebaypredict.py: -------------------------------------------------------------------------------- 1 | #爬取ebay网站真实数据 2 | 3 | import httplib 4 | from xml.dom.minidom import parse, parseString, Node 5 | 6 | devKey = 'YOUR DEV KEY' 7 | appKey = 'YOUR APP KEY' 8 | certKey = 'YOUR CERT KEY' 9 | serverUrl = 'api.ebay.com' 10 | userToken = 'YOUR TOKEN' 11 | 12 | def getHeaders(apicall,siteID="0",compatabilityLevel = "433"): 13 | headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel, 14 | "X-EBAY-API-DEV-NAME": devKey, 15 | "X-EBAY-API-APP-NAME": appKey, 16 | "X-EBAY-API-CERT-NAME": certKey, 17 | "X-EBAY-API-CALL-NAME": apicall, 18 | "X-EBAY-API-SITEID": siteID, 19 | "Content-Type": "text/xml"} 20 | return headers 21 | 22 | def sendRequest(apicall,xmlparameters): 23 | connection = httplib.HTTPSConnection(serverUrl) 24 | connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall)) 25 | response = connection.getresponse() 26 | if response.status != 200: 27 | print "Error sending request:" + response.reason 28 | else: 29 | data = response.read() 30 | connection.close() 31 | return data 32 | 33 | def getSingleValue(node,tag): 34 | nl=node.getElementsByTagName(tag) 35 | if len(nl)>0: 36 | tagNode=nl[0] 37 | if tagNode.hasChildNodes(): 38 | return tagNode.firstChild.nodeValue 39 | return '-1' 40 | 41 | 42 | def doSearch(query,categoryID=None,page=1): 43 | xml = ""+\ 44 | ""+\ 45 | "" +\ 46 | userToken +\ 47 | "" + \ 48 | ""+\ 49 | "200"+\ 50 | ""+str(page)+""+\ 51 | ""+\ 52 | "" + query + "" 53 | if categoryID!=None: 54 | xml+=""+str(categoryID)+"" 55 | xml+="" 56 | 57 | data=sendRequest('GetSearchResults',xml) 58 | response = parseString(data) 59 | itemNodes = response.getElementsByTagName('Item'); 60 | results = [] 61 | for item in itemNodes: 62 | itemId=getSingleValue(item,'ItemID') 63 | itemTitle=getSingleValue(item,'Title') 64 | itemPrice=getSingleValue(item,'CurrentPrice') 65 | itemEnds=getSingleValue(item,'EndTime') 66 | results.append((itemId,itemTitle,itemPrice,itemEnds)) 67 | return results 68 | 69 | 70 | def getCategory(query='',parentID=None,siteID='0'): 71 | lquery=query.lower() 72 | xml = ""+\ 73 | ""+\ 74 | "" +\ 75 | userToken +\ 76 | ""+\ 77 | "ReturnAll"+\ 78 | 
"true"+\ 79 | ""+siteID+"" 80 | if parentID==None: 81 | xml+="1" 82 | else: 83 | xml+=""+str(parentID)+"" 84 | xml += "" 85 | data=sendRequest('GetCategories',xml) 86 | categoryList=parseString(data) 87 | catNodes=categoryList.getElementsByTagName('Category') 88 | for node in catNodes: 89 | catid=getSingleValue(node,'CategoryID') 90 | name=getSingleValue(node,'CategoryName') 91 | if name.lower().find(lquery)!=-1: 92 | print catid,name 93 | 94 | def getItem(itemID): 95 | xml = ""+\ 96 | ""+\ 97 | "" +\ 98 | userToken +\ 99 | "" + \ 100 | "" + str(itemID) + ""+\ 101 | "ItemReturnAttributes"+\ 102 | "" 103 | data=sendRequest('GetItem',xml) 104 | result={} 105 | response=parseString(data) 106 | result['title']=getSingleValue(response,'Title') 107 | sellingStatusNode = response.getElementsByTagName('SellingStatus')[0]; 108 | result['price']=getSingleValue(sellingStatusNode,'CurrentPrice') 109 | result['bids']=getSingleValue(sellingStatusNode,'BidCount') 110 | seller = response.getElementsByTagName('Seller') 111 | result['feedback'] = getSingleValue(seller[0],'FeedbackScore') 112 | 113 | attributeSet=response.getElementsByTagName('Attribute'); 114 | attributes={} 115 | for att in attributeSet: 116 | attID=att.attributes.getNamedItem('attributeID').nodeValue 117 | attValue=getSingleValue(att,'ValueLiteral') 118 | attributes[attID]=attValue 119 | result['attributes']=attributes 120 | return result 121 | 122 | 123 | def makeLaptopDataset(): 124 | searchResults=doSearch('laptop',categoryID=51148) 125 | result=[] 126 | for r in searchResults: 127 | item=getItem(r[0]) 128 | att=item['attributes'] 129 | try: 130 | data=(float(att['12']),float(att['26444']), 131 | float(att['26446']),float(att['25710']), 132 | float(item['feedback']) 133 | ) 134 | entry={'input':data,'result':float(item['price'])} 135 | result.append(entry) 136 | except: 137 | print item['title']+' failed' 138 | return result 139 | -------------------------------------------------------------------------------- /KNN/optimization.py: -------------------------------------------------------------------------------- 1 | # 优化算法。寻找使成本函数最小的题解。精髓:1、将题解转化为数字序列化,可以写出题解范围。2、成本函数能返回值 2 | import time 3 | import random 4 | import math 5 | 6 | 7 | # 随机搜索算法:随机选择题解,计算成本值,成本值最小的题解为确定题解。domain为题解范围(可选航班范围),costf为成本函数。 8 | def randomoptimize(domain,costf): 9 | best=999999999 10 | bestr=None 11 | for i in range(0,1000): 12 | # 创建随机解 13 | sol=[random.randint(domain[i][0],domain[i][1]) for i in range(len(domain))] 14 | #计算成本值 15 | cost=costf(sol) 16 | 17 | # 与目前得到的最优解进行比较 18 | if costdomain[j][0]: 37 | neighbors.append(sol[0:j]+[sol[j]-1]+sol[j+1:]) #向近0偏移 38 | if sol[j]0.1: 61 | # 选择一个索引值 62 | i=random.randint(0,len(domain)-1) 63 | 64 | # 选择一个改变索引值的方向 65 | dir=random.randint(-step,step) 66 | 67 | #创建一个代表题解的新列表,改变其中一个值 68 | vecb=vec[:] 69 | vecb[i]+=dir 70 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] #如果渐变不超出了题解的范围 72 | 73 | # 计算当前成本与新的成本 74 | ea=costf(vec) 75 | eb=costf(vecb) 76 | p=pow(math.e,(-eb-ea)/T) 77 | 78 | # 它是更好的解么?或者是趋向最优解的可能的临界解么 79 | if (eb=domain[i][0]+step: 92 | return vec[0:i]+[vec[i]-step]+vec[i+1:] 93 | elif vec[i]<=domain[i][1]-step: 94 | return vec[0:i]+[vec[i]+step]+vec[i+1:] 95 | 96 | # 杂交操作(交叉) 97 | def crossover(r1,r2): 98 | i=random.randint(1,len(domain)-2) 99 | return r1[0:i]+r2[i:] 100 | 101 | # 构建初始种群 102 | pop=[] 103 | for i in range(popsize): #随机产生50个动物的种群 104 | vec=[random.randint(domain[i][0],domain[i][1]) for i in range(len(domain))] 105 | pop.append(vec) 106 | # 每一代有多少胜出者? 
107 | topelite=int(elite*popsize) 108 | 109 | # 主循环 110 | for i in range(maxiter): 111 | scores=[(costf(v),v) for v in pop] 112 | scores.sort() 113 | ranked=[v for (s,v) in scores] 114 | 115 | # 在种群中选出优胜者 116 | pop=ranked[0:topelite] 117 | 118 | # 为优秀基因者,添加变异和配对后的胜出者 119 | while len(pop)0.5: 79 | # if z>0: #h>0.5的判断等价于 z>0 80 | return 1,y 81 | else: 82 | return 0,y 83 | 84 | 85 | 86 | # 绘制分界线。 87 | def plotBestFit(dataMat,labelMat,weights): 88 | plt.scatter(dataMat[:, 1].flatten().A[0], dataMat[:, 2].flatten().A[0], 89 | c=labelMat.flatten().A[0],alpha=.5) # 第一个偏量为b,第2个偏量x1,第3个偏量x2 90 | 91 | x1 = np.arange(-4.0, 4.0, 0.1) 92 | x2 = (-weights[0] - weights[1] * x1) / weights[2] # 逻辑回归获取的回归系数,满足w0+w1*x1+w2*x2=0,即x2 =(-w0-w1*x1)/w2 93 | plt.plot(x1, x2) 94 | plt.xlabel('X1'); plt.ylabel('X2') #绘制label 95 | plt.show() 96 | 97 | 98 | 99 | if __name__ == '__main__': 100 | # 查看数据集的分布 101 | # plotDataSet() 102 | 103 | dataMat, labelMat = loadDataSet(data) # 加载数据集 104 | weights = gradAscent(dataMat, labelMat) # 梯度下降法求回归系数 105 | # weights = stocGradAscent(dataMat, labelMat) # 局部梯度下降法求回归系数 106 | print(weights) 107 | type,y = predict1(weights,[0.317029,14.739025]) 108 | print(type,y) 109 | plotBestFit(dataMat,labelMat,weights) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-mining 2 | 数据挖掘的相关案例和demo 3 | -------------------------------------------------------------------------------- /Recommend/recommendations.py: -------------------------------------------------------------------------------- 1 | #根据偏好数据集,匹配行和列,进而实现推荐行和推荐列 2 | 3 | # 偏好数据集(人-电影-评分) 4 | prefs={ 5 | 'name1': {'movie1': 2.5, 'movie2': 3.5,'movie3': 3.0, 'movie4': 3.5, 'movie5': 2.5, 'movie6': 3.0}, 6 | 'name2': {'movie1': 3.0, 'movie2': 3.5,'movie3': 1.5, 'movie4': 5.0, 'movie6': 3.0,'movie5': 3.5}, 7 | 'name3': {'movie1': 2.5, 'movie2': 3.0,'movie4': 3.5, 'movie6': 4.0}, 8 | 'name4': {'movie2': 3.5, 'movie3': 3.0, 'movie6': 4.5, 'movie4': 4.0, 'movie5': 2.5}, 9 | 'name5': {'movie1': 3.0, 'movie2': 4.0, 'movie3': 2.0, 'movie4': 3.0, 'movie6': 3.0,'movie5': 2.0}, 10 | 'name6': {'movie1': 3.0, 'movie2': 4.0, 'movie6': 3.0, 'movie4': 5.0, 'movie5': 3.5}, 11 | 'name7': {'movie2':4.5,'movie5':1.0,'movie4':4.0} 12 | } 13 | 14 | 15 | from math import sqrt 16 | # 计算两行之间的欧几里得距离,以此来代表相似度。prefs表示偏好数据集 17 | def sim_distance(prefs,row1_name,row2_name): 18 | # 首先计算是否有共同列(都看过的电影) 19 | si={} 20 | for item in prefs[row1_name]: 21 | if item in prefs[row2_name]: si[item]=1 22 | 23 | # 如果没有共同列,则两行之间相似度为0 24 | if len(si)==0: return 0 25 | 26 | # 根据共同列计算两行的欧几里得距离,并将距离映射到0-1上。0表示完全不相似,1表示完全相似 27 | sum_of_squares=sum([pow(prefs[row1_name][item]-prefs[row2_name][item],2) for item in prefs[row1_name] if item in prefs[row2_name]]) 28 | return 1/(1+sum_of_squares) 29 | 30 | # 计算两行的皮尔逊相似度,以此来代表相似度。prefs表示数据集 31 | def sim_pearson(prefs,row1_name,row2_name): 32 | # 首先计算是否有共同列(都看过的电影) 33 | si={} 34 | for item in prefs[row1_name]: 35 | if item in prefs[row2_name]: si[item]=1 36 | 37 | # 如果没有共同列,两行之间相似度为0 38 | if len(si)==0: return 0 39 | 40 | # 得到列表元素个数 41 | n=len(si) 42 | 43 | # 对两行的共同列求和 44 | sum1=sum([prefs[row1_name][it] for it in si]) 45 | sum2=sum([prefs[row2_name][it] for it in si]) 46 | 47 | # 对两行的共同列求平方和 48 | sum1Sq=sum([pow(prefs[row1_name][it],2) for it in si]) 49 | sum2Sq=sum([pow(prefs[row2_name][it],2) for it in si]) 50 | 51 | # 对两行的共同列求乘积之和 52 | pSum=sum([prefs[row1_name][it]*prefs[row2_name][it] for it in si]) 53 | 54 | # 
计算皮尔逊评价值 55 | num=pSum-(sum1*sum2/n) 56 | den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n)) 57 | if den==0: return 0 58 | 59 | r=num/den 60 | 61 | return r 62 | 63 | # 匹配相似行 64 | # 根据偏好数据集,返回与某个行最匹配的n行。person表示要匹配的行(人),similarity表示相似度计算函数 65 | def topMatches(prefs,row_name,n=5,similarity=sim_pearson): 66 | scores=[(similarity(prefs,row_name,other),other) for other in prefs if other!=row_name] 67 | scores.sort() 68 | scores.reverse() 69 | num = n 70 | if n>len(scores):num= len(scores) 71 | return scores[0:num] 72 | 73 | # 利用相似行,估计某行所有列存在的空白值,并排名(估计影片评分,并排名推荐) 74 | # 利用所有其他行的各列取值的加权平均(相似度为权值),为某行各列提供估值 75 | def getRecommendations(prefs,row_name,similarity=sim_pearson): 76 | totals={} 77 | simSums={} 78 | for other in prefs: 79 | # 不和自己做比较 80 | if other==row_name: continue 81 | sim=similarity(prefs,row_name,other) 82 | 83 | # 忽略评价值为0或为负的情况 84 | if sim<=0: continue 85 | for item in prefs[other]: 86 | # 只对自己还未有的列进行临时估值 87 | if item not in prefs[row_name] or prefs[row_name][item]==0: 88 | # 相似度*临时估值 89 | totals.setdefault(item,0) 90 | totals[item]+=prefs[other][item]*sim 91 | # 相似度之和 92 | simSums.setdefault(item,0) 93 | simSums[item]+=sim 94 | 95 | # 建立归一化列表 96 | rankings=[(total/simSums[item],item) for item,total in totals.items()] 97 | 98 | # 返回最终估值经过排序的列表 99 | rankings.sort() 100 | rankings.reverse() 101 | return rankings 102 | 103 | # 转置偏好数据集,以便实现匹配列 104 | def transformPrefs(prefs): 105 | result={} 106 | for row_name in prefs: 107 | for item in prefs[row_name]: 108 | result.setdefault(item,{}) 109 | 110 | # 将行与列对调 111 | result[item][row_name]=prefs[row_name][item] 112 | return result 113 | 114 | # 匹配相似列,返回各列的匹配集合(因为各列的匹配可提前在用户登陆前完成), 115 | # 根据转置后的偏好数据集,获取每列相似的n个其他列 116 | def calculateSimilarItems(prefs,n=10): 117 | # 建立字典,以给出与这些列最为相近的所有其他列 118 | itemMatch={} 119 | # 以列为中心对偏好矩阵实施转置处理 120 | itemPrefs=transformPrefs(prefs) 121 | c=0 122 | for item in itemPrefs: 123 | # 针对大数据集更新状态变量 124 | c+=1 125 | if c%100==0: print("%d / %d" % (c,len(itemPrefs))) 126 | # 寻找最为相近的列 127 | scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance) 128 | itemMatch[item]=scores 129 | 130 | return itemMatch #返回每列匹配的其他列 131 | 132 | # 利用相似列,对某一行的各列进行估值,(估计影片评分,并排名推荐):根据偏好数据集和提前构造好的物品匹配库,向用户推荐物品 133 | def getRecommendedItems(prefs,itemMatch,row_name): 134 | onerow=prefs[row_name] #获取当前行所拥有的列 135 | scores={} 136 | totalSim={} 137 | # 循环遍历由当前行所拥有的列 138 | for (item,rating) in onerow.items( ): 139 | 140 | # 循环遍历与当前列相似的列 141 | for (similarity,item2) in itemMatch[item]: 142 | 143 | # 忽略行已经拥有的列 144 | if item2 in onerow: continue 145 | # 估值与相似度的加权之和 146 | scores.setdefault(item2,0) 147 | scores[item2]+=similarity*rating 148 | # 全部相似度之和 149 | totalSim.setdefault(item2,0) 150 | totalSim[item2]+=similarity 151 | 152 | # 将每个合计值除以加权和,求出平均值 153 | rankings=[(score/totalSim[item],item) for item,score in scores.items( )] 154 | 155 | # 按最高值到最低值的顺序,返回估值排行 156 | rankings.sort( ) 157 | rankings.reverse( ) 158 | return rankings 159 | 160 | #下载大量数据集 161 | def loadMovieLens(path='/data/movielens'): 162 | # 获取影片标题 163 | movies={} 164 | for line in open(path+'/u.item'): 165 | (id,title)=line.split('|')[0:2] 166 | movies[id]=title 167 | 168 | # 加载数据 169 | prefs={} 170 | for line in open(path+'/u.data'): 171 | (user,movieid,rating,ts)=line.split('\t') 172 | prefs.setdefault(user,{}) 173 | prefs[user][movies[movieid]]=float(rating) 174 | return prefs 175 | 176 | 177 | if __name__=="__main__": #只有在执行当前模块时才会运行此函数 178 | #利用相似人推荐相似物品 179 | rankings = getRecommendations(prefs,'name7') 180 | print(rankings) #打印推荐排名 181 | #利用相似物品推荐相似物品 182 | 
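    # Note: as the comment on calculateSimilarItems points out, the item-to-item similarity
    # table depends only on the preference data, so it can be built ahead of time (before any
    # user logs in) and reused for every recommendation request until the data changes.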
itemMatch = calculateSimilarItems(prefs) # 提前计算所有物品的相似物品 183 | rankings = getRecommendedItems(prefs,itemMatch,'name7') 184 | print(rankings) #打印推荐排名 -------------------------------------------------------------------------------- /Recommend/帮助.txt: -------------------------------------------------------------------------------- 1 | 匹配和推荐系统的demo 2 | -------------------------------------------------------------------------------- /SVM/agesonly.csv: -------------------------------------------------------------------------------- 1 | 24,30,1 2 | 30,40,1 3 | 22,49,0 4 | 43,39,1 5 | 23,30,1 6 | 23,49,0 7 | 48,46,1 8 | 23,23,1 9 | 29,49,0 10 | 38,38,1 11 | 30,34,1 12 | 40,50,1 13 | 35,32,1 14 | 49,44,1 15 | 38,22,1 16 | 30,27,1 17 | 26,24,1 18 | 39,23,1 19 | 36,43,1 20 | 25,31,1 21 | 27,27,1 22 | 32,22,1 23 | 40,30,1 24 | 26,28,1 25 | 46,32,1 26 | 41,37,1 27 | 39,41,1 28 | 18,28,0 29 | 18,47,0 30 | 39,44,1 31 | 38,21,1 32 | 24,36,0 33 | 32,22,1 34 | 21,20,1 35 | 42,36,1 36 | 46,41,1 37 | 39,38,1 38 | 18,31,0 39 | 31,45,1 40 | 44,24,0 41 | 49,22,0 42 | 26,27,1 43 | 25,34,1 44 | 47,23,0 45 | 27,48,0 46 | 32,49,1 47 | 46,41,1 48 | 24,32,1 49 | 29,26,1 50 | 25,36,1 51 | 27,35,1 52 | 38,19,1 53 | 18,40,0 54 | 34,49,1 55 | 32,35,1 56 | 47,49,1 57 | 47,18,0 58 | 33,24,1 59 | 35,28,1 60 | 35,41,1 61 | 39,43,1 62 | 29,18,1 63 | 18,44,0 64 | 26,26,1 65 | 31,43,1 66 | 20,29,0 67 | 28,18,1 68 | 31,38,1 69 | 34,34,1 70 | 32,33,1 71 | 34,27,1 72 | 19,38,0 73 | 32,21,1 74 | 33,37,1 75 | 33,18,1 76 | 18,46,0 77 | 31,37,1 78 | 36,30,1 79 | 40,40,1 80 | 38,30,1 81 | 49,28,1 82 | 31,47,1 83 | 28,50,0 84 | 49,43,1 85 | 24,31,1 86 | 33,43,1 87 | 28,24,1 88 | 45,29,1 89 | 49,35,1 90 | 36,29,1 91 | 42,32,1 92 | 29,18,1 93 | 49,20,0 94 | 22,27,1 95 | 41,38,1 96 | 47,21,0 97 | 40,32,1 98 | 35,18,1 99 | 35,33,1 100 | 34,28,1 101 | 22,31,0 102 | 46,20,0 103 | 18,49,0 104 | 48,23,0 105 | 39,21,1 106 | 20,34,0 107 | 24,20,1 108 | 38,18,1 109 | 37,47,1 110 | 39,37,1 111 | 38,39,1 112 | 27,42,1 113 | 47,49,1 114 | 27,42,1 115 | 40,28,1 116 | 41,46,1 117 | 39,25,1 118 | 43,36,1 119 | 49,30,1 120 | 24,38,0 121 | 49,42,1 122 | 19,22,0 123 | 43,27,1 124 | 30,37,1 125 | 24,31,1 126 | 24,48,0 127 | 24,29,1 128 | 18,19,1 129 | 29,25,1 130 | 38,33,1 131 | 39,20,1 132 | 24,30,1 133 | 22,39,0 134 | 47,21,0 135 | 30,44,1 136 | 41,38,1 137 | 29,33,1 138 | 42,42,1 139 | 47,27,1 140 | 23,20,1 141 | 39,18,1 142 | 30,26,1 143 | 36,27,1 144 | 40,18,1 145 | 31,18,1 146 | 46,27,1 147 | 41,44,1 148 | 26,34,1 149 | 33,18,1 150 | 48,19,0 151 | 46,27,1 152 | 25,40,0 153 | 50,36,1 154 | 20,21,1 155 | 33,47,1 156 | 40,35,1 157 | 24,27,1 158 | 34,19,1 159 | 26,45,0 160 | 34,36,1 161 | 21,27,0 162 | 48,28,1 163 | 23,25,1 164 | 48,46,1 165 | 30,20,1 166 | 23,40,0 167 | 36,40,1 168 | 21,45,0 169 | 30,40,1 170 | 39,24,1 171 | 42,47,1 172 | 28,37,1 173 | 24,30,1 174 | 37,25,1 175 | 44,34,1 176 | 43,32,1 177 | 46,29,1 178 | 49,22,0 179 | 41,28,1 180 | 23,50,0 181 | 30,43,1 182 | 25,32,1 183 | 27,46,0 184 | 23,21,1 185 | 39,41,1 186 | 33,27,1 187 | 49,21,0 188 | 33,33,1 189 | 18,25,0 190 | 42,35,1 191 | 36,25,1 192 | 26,50,0 193 | 18,37,0 194 | 35,37,1 195 | 39,38,1 196 | 22,30,0 197 | 18,44,0 198 | 46,44,1 199 | 24,27,1 200 | 41,34,1 201 | 40,39,1 202 | 34,49,1 203 | 35,41,1 204 | 46,48,1 205 | 50,23,0 206 | 49,20,0 207 | 22,47,0 208 | 27,26,1 209 | 30,30,1 210 | 37,39,1 211 | 42,44,1 212 | 41,27,1 213 | 24,21,1 214 | 34,28,1 215 | 23,43,0 216 | 43,35,1 217 | 42,40,1 218 | 25,24,1 219 | 36,24,1 220 | 25,23,1 221 | 44,30,1 222 | 39,33,1 223 | 38,33,1 224 | 
49,30,1 225 | 40,19,1 226 | 19,46,0 227 | 31,21,1 228 | 48,33,1 229 | 26,24,1 230 | 20,37,0 231 | 29,31,1 232 | 35,28,1 233 | 37,25,1 234 | 42,42,1 235 | 42,48,1 236 | 41,47,1 237 | 44,45,1 238 | 45,46,1 239 | 25,38,1 240 | 19,45,0 241 | 36,26,1 242 | 33,36,1 243 | 27,19,1 244 | 48,24,0 245 | 37,48,1 246 | 23,31,0 247 | 20,29,0 248 | 27,44,0 249 | 47,24,0 250 | 36,18,1 251 | 37,48,1 252 | 32,29,1 253 | 46,48,1 254 | 31,47,1 255 | 23,45,0 256 | 28,30,1 257 | 36,32,1 258 | 25,43,0 259 | 24,44,0 260 | 34,47,1 261 | 46,42,1 262 | 18,31,0 263 | 23,25,1 264 | 44,39,1 265 | 18,29,0 266 | 49,40,1 267 | 24,33,0 268 | 21,44,0 269 | 40,24,1 270 | 46,41,1 271 | 42,33,1 272 | 25,41,0 273 | 29,42,1 274 | 40,18,1 275 | 37,40,1 276 | 46,28,1 277 | 33,20,1 278 | 18,42,0 279 | 22,36,0 280 | 27,46,0 281 | 33,48,1 282 | 21,37,0 283 | 26,50,0 284 | 29,23,1 285 | 23,33,0 286 | 21,38,0 287 | 18,30,0 288 | 29,28,1 289 | 31,22,1 290 | 30,48,1 291 | 41,37,1 292 | 35,31,1 293 | 48,32,1 294 | 29,37,1 295 | 32,33,1 296 | 43,26,1 297 | 21,33,0 298 | 44,28,1 299 | 35,18,1 300 | 35,35,1 301 | 25,20,1 302 | 39,46,1 303 | 26,39,1 304 | 36,29,1 305 | 29,44,1 306 | 28,42,1 307 | 38,21,1 308 | 28,49,0 309 | 33,26,1 310 | 31,28,1 311 | 25,47,0 312 | 23,25,1 313 | 45,49,1 314 | 28,26,1 315 | 36,48,1 316 | 42,48,1 317 | 42,21,1 318 | 29,32,1 319 | 26,28,1 320 | 24,46,0 321 | 39,30,1 322 | 29,46,1 323 | 43,43,1 324 | 20,42,0 325 | 35,41,1 326 | 45,19,0 327 | 38,45,1 328 | 25,38,1 329 | 31,20,1 330 | 38,43,1 331 | 37,30,1 332 | 43,27,1 333 | 43,44,1 334 | 21,30,0 335 | 22,45,0 336 | 44,26,1 337 | 43,42,1 338 | 26,41,0 339 | 47,35,1 340 | 48,30,1 341 | 41,24,1 342 | 19,48,0 343 | 45,24,0 344 | 38,41,1 345 | 42,46,1 346 | 49,45,1 347 | 28,44,1 348 | 22,44,0 349 | 31,48,1 350 | 48,21,0 351 | 31,20,1 352 | 30,39,1 353 | 23,23,1 354 | 21,32,0 355 | 19,19,1 356 | 21,27,0 357 | 24,46,0 358 | 25,28,1 359 | 48,50,1 360 | 25,32,1 361 | 26,29,1 362 | 33,48,1 363 | 35,32,1 364 | 48,25,1 365 | 30,27,1 366 | 34,49,1 367 | 40,45,1 368 | 28,32,1 369 | 47,33,1 370 | 29,33,1 371 | 21,22,1 372 | 21,39,0 373 | 41,45,1 374 | 46,39,1 375 | 22,24,1 376 | 32,22,1 377 | 27,46,0 378 | 26,35,1 379 | 27,29,1 380 | 48,19,0 381 | 35,26,1 382 | 42,29,1 383 | 30,22,1 384 | 20,26,0 385 | 33,25,1 386 | 37,30,1 387 | 37,32,1 388 | 20,22,1 389 | 42,48,1 390 | 29,20,1 391 | 32,46,1 392 | 37,34,1 393 | 29,45,1 394 | 19,44,0 395 | 49,18,0 396 | 28,25,1 397 | 48,31,1 398 | 35,46,1 399 | 34,26,1 400 | 38,26,1 401 | 36,31,1 402 | 31,30,1 403 | 27,19,1 404 | 44,38,1 405 | 19,37,0 406 | 43,49,1 407 | 19,42,0 408 | 32,24,1 409 | 46,43,1 410 | 43,46,1 411 | 33,32,1 412 | 23,35,0 413 | 26,34,1 414 | 48,20,0 415 | 45,38,1 416 | 30,30,1 417 | 28,23,1 418 | 43,36,1 419 | 19,37,0 420 | 39,45,1 421 | 20,30,0 422 | 28,30,1 423 | 19,42,0 424 | 41,21,1 425 | 42,31,1 426 | 47,45,1 427 | 42,48,1 428 | 40,22,1 429 | 28,20,1 430 | 22,31,0 431 | 28,24,1 432 | 18,33,0 433 | 42,47,1 434 | 35,18,1 435 | 32,28,1 436 | 45,39,1 437 | 46,45,1 438 | 41,43,1 439 | 24,37,0 440 | 34,30,1 441 | 40,22,1 442 | 38,20,1 443 | 43,28,1 444 | 21,26,0 445 | 35,27,1 446 | 33,37,1 447 | 48,39,1 448 | 47,40,1 449 | 31,32,1 450 | 18,32,0 451 | 31,20,1 452 | 30,49,1 453 | 22,46,0 454 | 36,39,1 455 | 30,35,1 456 | 49,50,1 457 | 46,39,1 458 | 45,44,1 459 | 34,40,1 460 | 27,28,1 461 | 27,35,1 462 | 46,46,1 463 | 26,42,0 464 | 27,18,1 465 | 23,38,0 466 | 30,30,1 467 | 34,32,1 468 | 48,27,1 469 | 31,23,1 470 | 29,47,0 471 | 47,31,1 472 | 35,19,1 473 | 30,28,1 474 | 33,44,1 475 | 36,37,1 476 | 34,44,1 477 | 42,43,1 478 
| 36,29,1 479 | 35,46,1 480 | 22,36,0 481 | 39,47,1 482 | 23,23,1 483 | 47,20,0 484 | 38,22,1 485 | 21,33,0 486 | 37,41,1 487 | 18,18,1 488 | 35,34,1 489 | 49,49,1 490 | 33,32,1 491 | 31,19,1 492 | 31,26,1 493 | 45,31,1 494 | 41,44,1 495 | 27,47,0 496 | 28,26,1 497 | 18,47,0 498 | 37,18,1 499 | 20,42,0 500 | 36,45,1 501 | -------------------------------------------------------------------------------- /SVM/facebook.py: -------------------------------------------------------------------------------- 1 | import urllib,md5,webbrowser,time 2 | from xml.dom.minidom import parseString 3 | 4 | apikey="47e953c8ea9ed30db904af453125c759" 5 | secret="ea703e4721e8c7bf88b92110a46a9b06" 6 | FacebookURL = "https://api.facebook.com/restserver.php" 7 | 8 | def getsinglevalue(node,tag): 9 | nl=node.getElementsByTagName(tag) 10 | if len(nl)>0: 11 | tagNode=nl[0] 12 | if tagNode.hasChildNodes(): 13 | return tagNode.firstChild.nodeValue 14 | return '' 15 | 16 | def callid(): 17 | return str(int(time.time()*10)) 18 | 19 | class fbsession: 20 | def __init__(self): 21 | self.session_secret=None 22 | self.session_key=None 23 | self.createtoken() 24 | webbrowser.open(self.getlogin()) 25 | print "Press enter after logging in:", 26 | raw_input() 27 | self.getsession() 28 | def sendrequest(self, args): 29 | args['api_key'] = apikey 30 | args['sig'] = self.makehash(args) 31 | post_data = urllib.urlencode(args) 32 | url = FacebookURL + "?" + post_data 33 | data=urllib.urlopen(url).read() 34 | print data 35 | return parseString(data) 36 | def makehash(self,args): 37 | hasher = md5.new(''.join([x + '=' + args[x] for x in sorted(args.keys())])) 38 | if self.session_secret: hasher.update(self.session_secret) 39 | else: hasher.update(secret) 40 | return hasher.hexdigest() 41 | def createtoken(self): 42 | res = self.sendrequest({'method':"facebook.auth.createToken"}) 43 | self.token = getsinglevalue(res,'token') 44 | def getlogin(self): 45 | return "http://api.facebook.com/login.php?api_key="+apikey+\ 46 | "&auth_token=" + self.token 47 | def getsession(self): 48 | doc=self.sendrequest({'method':'facebook.auth.getSession', 49 | 'auth_token':self.token}) 50 | self.session_key=getsinglevalue(doc,'session_key') 51 | self.session_secret=getsinglevalue(doc,'secret') 52 | def getfriends(self): 53 | doc=self.sendrequest({'method':'facebook.friends.get', 54 | 'session_key':self.session_key,'call_id':callid()}) 55 | results=[] 56 | for n in doc.getElementsByTagName('result_elt'): 57 | results.append(n.firstChild.nodeValue) 58 | return results 59 | 60 | def getinfo(self,users): 61 | ulist=','.join(users) 62 | 63 | fields='gender,current_location,relationship_status,'+\ 64 | 'affiliations,hometown_location' 65 | 66 | doc=self.sendrequest({'method':'facebook.users.getInfo', 67 | 'session_key':self.session_key,'call_id':callid(), 68 | 'users':ulist,'fields':fields}) 69 | 70 | results={} 71 | for n,id in zip(doc.getElementsByTagName('result_elt'),users): 72 | # Get the location 73 | locnode=n.getElementsByTagName('hometown_location')[0] 74 | loc=getsinglevalue(locnode,'city')+', '+getsinglevalue(locnode,'state') 75 | 76 | # Get school 77 | college='' 78 | gradyear='0' 79 | affiliations=n.getElementsByTagName('affiliations_elt') 80 | for aff in affiliations: 81 | # Type 1 is college 82 | if getsinglevalue(aff,'type')=='1': 83 | college=getsinglevalue(aff,'name') 84 | gradyear=getsinglevalue(aff,'year') 85 | 86 | results[id]={'gender':getsinglevalue(n,'gender'), 87 | 'status':getsinglevalue(n,'relationship_status'), 88 | 
'location':loc,'college':college,'year':gradyear} 89 | return results 90 | 91 | def arefriends(self,idlist1,idlist2): 92 | id1=','.join(idlist1) 93 | id2=','.join(idlist2) 94 | doc=self.sendrequest({'method':'facebook.friends.areFriends', 95 | 'session_key':self.session_key,'call_id':callid(), 96 | 'id1':id1,'id2':id2}) 97 | results=[] 98 | for n in doc.getElementsByTagName('result_elt'): 99 | results.append(int(n.firstChild.nodeValue)) 100 | return results 101 | 102 | 103 | 104 | def makedataset(self): 105 | from advancedclassify import milesdistance 106 | # Get all the info for all my friends 107 | friends=self.getfriends() 108 | info=self.getinfo(friends) 109 | ids1,ids2=[],[] 110 | rows=[] 111 | 112 | # Nested loop to look at every pair of friends 113 | for i in range(len(friends)): 114 | f1=friends[i] 115 | data1=info[f1] 116 | 117 | # Start at i+1 so we don't double up 118 | for j in range(i+1,len(friends)): 119 | f2=friends[j] 120 | data2=info[f2] 121 | ids1.append(f1) 122 | ids2.append(f2) 123 | 124 | # Generate some numbers from the data 125 | if data1['college']==data2['college']: sameschool=1 126 | else: sameschool=0 127 | male1=(data1['gender']=='Male') and 1 or 0 128 | male2=(data2['gender']=='Male') and 1 or 0 129 | 130 | row=[male1,int(data1['year']),male2,int(data2['year']),sameschool] 131 | rows.append(row) 132 | # Call arefriends in blocks for every pair of people 133 | arefriends=[] 134 | for i in range(0,len(ids1),30): 135 | j=min(i+30,len(ids1)) 136 | pa=self.arefriends(ids1[i:j],ids2[i:j]) 137 | arefriends+=pa 138 | return arefriends,rows 139 | 140 | -------------------------------------------------------------------------------- /SVM/linearclassify.py: -------------------------------------------------------------------------------- 1 | # 线性分类器:计算样本数据每个分类中所有节点的平均值。对新输入对象计算到哪个中心点最近就属于哪个分类 2 | # 使用基本线性分类进行婚姻数据匹配 3 | 4 | # 场景:男女不同的属性信息,例如年龄、是否吸烟、是否要孩子、兴趣列表、家庭住址。产生的输出结果,配对成功还是不成功 5 | 6 | # 定义数据类 7 | class matchrow: 8 | def __init__(self,row,allnum=False): 9 | if allnum: 10 | self.data=[float(row[i]) for i in range(len(row)-1)] #如果每个属性都是数字就转化为浮点型 11 | else: 12 | self.data=row[0:len(row)-1] #如果并不是数字,就保留源数据类型 13 | self.matchresult=int(row[len(row)-1]) #最后一位表示分类(匹配结果),0表示匹配失败,1表示匹配成功 14 | 15 | # 从文件中加载数据.allnum表示是否所有属性都是数字 16 | def loadmatch(filename,allnum=False): 17 | rows=[] 18 | for line in open(filename): 19 | rows.append(matchrow(line.split(','),allnum)) 20 | return rows 21 | 22 | 23 | 24 | import matplotlib.pyplot as plt 25 | 26 | # 绘制只根据年龄进行配对的结果分布散点图 27 | def plotagematches(rows): 28 | xdm,ydm=[r.data[0] for r in rows if r.matchresult==1],[r.data[1] for r in rows if r.matchresult==1] 29 | xdn,ydn=[r.data[0] for r in rows if r.matchresult==0],[r.data[1] for r in rows if r.matchresult==0] 30 | 31 | plt.plot(xdm,ydm,'bo') 32 | plt.plot(xdn,ydn,'b+') 33 | 34 | plt.show() 35 | 36 | 37 | # 使用基本的线性分类。rows为样本数据集。(计算样本数据每个分类中所有节点的平均值。对新输入对象计算到哪个中心点最近就属于哪个分类) 38 | def lineartrain(rows): 39 | averages={} 40 | counts={} 41 | 42 | for row in rows: 43 | # 得到该坐标点所属的分类 44 | cat=row.matchresult 45 | 46 | averages.setdefault(cat,[0.0]*(len(row.data))) 47 | counts.setdefault(cat,0) 48 | 49 | # 将该坐标点加入averages中。每个维度都要求均值 50 | for i in range(len(row.data)): 51 | averages[cat][i]+=float(row.data[i]) 52 | 53 | # 记录每个分类中有多少个坐标点 54 | counts[cat]+=1 55 | 56 | # 将总和除以计数值以求得平均值 57 | for cat,avg in averages.items(): 58 | for i in range(len(avg)): 59 | avg[i]/=counts[cat] 60 | 61 | return averages 62 | 63 | # 绘制线性分类器均值点和分割线 64 | def plotlinear(rows): 65 | xdm,ydm=[r.data[0] for r in rows if 
r.matchresult==1],[r.data[1] for r in rows if r.matchresult==1] 66 | xdn,ydn=[r.data[0] for r in rows if r.matchresult==0],[r.data[1] for r in rows if r.matchresult==0] 67 | 68 | plt.plot(xdm,ydm,'bo') 69 | plt.plot(xdn,ydn,'b+') 70 | # 获取均值点 71 | averages = lineartrain(rows) 72 | #绘制均值点 73 | averx=[] 74 | avery=[] 75 | for value in averages.values(): 76 | averx.append(value[0]) 77 | avery.append(value[1]) 78 | 79 | plt.plot(averx,avery,'r*') 80 | #绘制垂直平分线作为分割线 81 | # y=-(x1-x0)/(y1-y0)* (x-(x0+x1)/2)+(y0+y1)/2 82 | xnew = range(15,60,1) 83 | print(xnew) 84 | print(averx,avery) 85 | ynew = [-(averx[1]-averx[0])/(avery[1]-avery[0])*(x-(averx[0]+averx[1])/2)+(avery[0]+avery[1])/2 for x in xnew] 86 | plt.plot(xnew, ynew, 'r--') 87 | plt.axis([15, 52, 15, 50]) #设置显示范围 88 | plt.show() 89 | 90 | 91 | # ================使用点积函数来代替欧几里德距离================= 92 | 93 | # 向量点积函数,代替欧几里得距离 94 | def dotproduct(v1,v2): 95 | return sum([v1[i]*v2[i] for i in range(len(v1))]) 96 | 97 | # 向量线段长度。 98 | def veclength(v): 99 | return sum([p**2 for p in v]) 100 | 101 | # 使用点积结果为正还是负来判断属于哪个分类 102 | def dpclassify(point,avgs): 103 | b=(dotproduct(avgs[1],avgs[1])-dotproduct(avgs[0],avgs[0]))/2 104 | y=dotproduct(point,avgs[0])-dotproduct(point,avgs[1])+b 105 | if y>0: return 0 106 | else: return 1 107 | 108 | 109 | 110 | 111 | # ======================复杂数据集的线性分类器========================== 112 | 113 | # 将是否问题转化为数值。yes转化为1,no转化为-1,缺失或模棱两可转化为0 114 | def yesno(v): 115 | if v=='yes': return 1 116 | elif v=='no': return -1 117 | else: return 0 118 | 119 | # 将列表转化为数值。获取公共项的数目。获取两个人相同的兴趣数量 120 | def matchcount(interest1,interest2): 121 | l1=interest1.split(':') 122 | l2=interest2.split(':') 123 | x=0 124 | for v in l1: 125 | if v in l2: x+=1 126 | return x 127 | 128 | 129 | # 利用百度地图来计算两个人的位置距离 130 | baidukey="tc42noD8p3SO1hZhFTryMeRv" 131 | import urllib 132 | import json 133 | # 使用geocoding api发起指定格式的请求,解析指定格式的返回数据,获取地址的经纬度 134 | # http://api.map.baidu.com/geocoder/v2/?address=北京市海淀区上地十街10号&output=json&ak=您的ak&callback=showLocation 135 | ak ='HIa8GVmtk9WSjhuevGfqMCGu' 136 | loc_cache={} 137 | def getlocation(address): #这个结果每次获取最好存储在数据库中,不然每次运行都要花费大量的时间获取地址 138 | if address in loc_cache: return loc_cache[address] 139 | urlpath = 'http://api.map.baidu.com/geocoder/v2/?address=%s&output=json&ak=%s' % (urllib.parse.quote_plus(address),ak) 140 | data=urllib.request.urlopen(urlpath).read() 141 | response = json.loads(data,encoding='UTF-8') # dict 142 | if not response['result']: 143 | print('没有找到地址:'+address) 144 | return None 145 | 146 | long = response['result']['location']['lng'] 147 | lat = response['result']['location']['lat'] 148 | loc_cache[address]=(float(lat),float(long)) 149 | print('地址:' + address+"===经纬度:"+str(loc_cache[address])) 150 | return loc_cache[address] 151 | 152 | # 计算两个地点之间的实际距离 153 | def milesdistance(a1,a2): 154 | try: 155 | lat1,long1=getlocation(a1) 156 | lat2,long2=getlocation(a2) 157 | latdif=69.1*(lat2-lat1) 158 | longdif=53.0*(long2-long1) 159 | return (latdif**2+longdif**2)**.5 160 | except: 161 | return None 162 | 163 | 164 | 165 | # 构造新的数据集。包含各个复杂属性转化为数值数据 166 | def loadnumerical(): 167 | oldrows=loadmatch('matchmaker.csv') 168 | newrows=[] 169 | for row in oldrows: 170 | d=row.data 171 | distance = milesdistance(d[4],d[9]) # 以为有可能无法获取地址的经纬度,进而无法获取两地之间的距离,这里就成了缺失值。我们暂且直接抛弃缺失值 172 | if distance: 173 | data=[float(d[0]),yesno(d[1]),yesno(d[2]), 174 | float(d[5]),yesno(d[6]),yesno(d[7]), 175 | matchcount(d[3],d[8]),distance,row.matchresult] 176 | newrows.append(matchrow(data)) 177 | return newrows 178 
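# Note on scale: the numerical rows built above mix very different ranges (ages of roughly
# 18-50, yes/no answers encoded as -1/0/1, a small shared-interest count, and a distance in
# miles), so without the min-max scaling performed below the large-valued columns would
# dominate the class averages and dot products used by the linear classifier.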
| 179 | 180 | # 对数据进行缩放处理,全部归一化到0-1上,因为不同参考变量之间的数值尺度不同 181 | def scaledata(rows): 182 | low=[999999999.0]*len(rows[0].data) 183 | high=[-999999999.0]*len(rows[0].data) 184 | # 寻找最大值和最小值 185 | for row in rows: 186 | d=row.data 187 | for i in range(len(d)): 188 | if d[i]high[i]: high[i]=d[i] 190 | 191 | # 对数据进行缩放处理的函数 192 | def scaleinput(d): 193 | return [(d[i]-low[i])/(high[i]-low[i]) 194 | for i in range(len(low))] 195 | 196 | # 对所有数据进行缩放处理 197 | newrows=[matchrow(scaleinput(row.data)+[row.matchresult]) for row in rows] 198 | 199 | # 返回新的数据和缩放处理函数 200 | return newrows,scaleinput 201 | 202 | 203 | 204 | 205 | 206 | 207 | if __name__=='__main__': #只有在执行当前模块时才会运行此函数 208 | 209 | agesonly = loadmatch('agesonly.csv') #读入只关注年龄的配对情况 210 | # plotagematches(agesonly) #绘制年龄配对散点图 211 | 212 | #======使用基本线性分类器分类=========== 213 | # plotlinear(agesonly) #绘制线性分类器均值点和分割线 214 | 215 | #==========复杂数据的线性分类器========== 216 | numercalset=loadnumerical() #获取转化为数值型的复杂数据集 217 | scaledset,scalef=scaledata(numercalset) #对复杂数据集进行比例缩放 218 | catavgs = lineartrain(scaledset) #计算分类均值点 219 | print(catavgs) 220 | onedata = scalef(numercalset[0].data) #取一个数据作为新数据先比例缩放 221 | dpclassify(onedata,catavgs) #使用点积结果来判断属于哪个分类 222 | -------------------------------------------------------------------------------- /SVM/svm-simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import random 5 | 6 | 7 | # 简化版smo 8 | 9 | 10 | # 函数说明:读取数据 11 | def loadDataSet(fileName): 12 | alldata = np.loadtxt(fileName) 13 | dataMat = alldata[:,0:2] #添加数据 14 | labelMat = alldata[:,2] #.astype(int).reshape(-1,1) #添加标签 15 | return dataMat,labelMat 16 | 17 | 18 | """ 19 | 函数说明:随机选择alpha 20 | 21 | Parameters: 22 | i - alpha_i的索引值 23 | m - alpha参数个数 24 | Returns: 25 | j - alpha_j的索引值 26 | 27 | """ 28 | def selectJrand(i, m): 29 | j = i #选择一个不等于i的j 30 | while (j == i): 31 | j = int(random.uniform(0, m)) 32 | return j 33 | 34 | """ 35 | 函数说明:修剪alpha 36 | 37 | Parameters: 38 | aj - alpha_j值 39 | H - alpha上限 40 | L - alpha下限 41 | Returns: 42 | aj - alpah值 43 | 44 | """ 45 | def clipAlpha(aj,H,L): 46 | if aj > H: 47 | aj = H 48 | if L > aj: 49 | aj = L 50 | return aj 51 | 52 | 53 | # 函数说明:数据可视化 54 | def showDataSet(dataMat, labelMat): 55 | 56 | place_plus = np.where(labelMat==1)[0] # 正样本的位置 57 | place_minus = np.where(labelMat==-1)[0] # 负样本的位置 58 | data_plus = dataMat[place_plus] #正样本 59 | data_minus = dataMat[place_minus] #负样本 60 | 61 | plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1]) #正样本散点图 62 | plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1]) #负样本散点图 63 | plt.show() 64 | 65 | 66 | """ 67 | 函数说明:简化版SMO算法 68 | 69 | Parameters: 70 | dataMatIn - 数据矩阵 71 | classLabels - 数据标签 72 | C - 松弛变量 73 | toler - 容错率 74 | maxIter - 最大迭代次数 75 | """ 76 | def smoSimple(dataMatIn, classLabels, C, toler, maxIter): 77 | #转换为numpy的mat存储 78 | dataMatrix = np.mat(dataMatIn) 79 | labelMat = np.mat(classLabels).transpose() 80 | #初始化b参数,统计dataMatrix的维度 81 | b = 0; m,n = np.shape(dataMatrix) 82 | #初始化alpha参数,设为0 83 | alphas = np.mat(np.zeros((m,1))) 84 | #初始化迭代次数 85 | iter_num = 0 86 | #最多迭代matIter次 87 | while (iter_num < maxIter): 88 | alphaPairsChanged = 0 89 | for i in range(m): 90 | #步骤1:计算误差Ei 91 | fXi = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b 92 | Ei = fXi - float(labelMat[i]) 93 | #优化alpha,设定一定的容错率。 94 | if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) 
and (alphas[i] > 0)): 95 | #随机选择另一个与alpha_i成对优化的alpha_j 96 | j = selectJrand(i,m) 97 | #步骤1:计算误差Ej 98 | fXj = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b 99 | Ej = fXj - float(labelMat[j]) 100 | #保存更新前的aplpha值,使用深拷贝 101 | alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy(); 102 | #步骤2:计算上下界L和H 103 | if (labelMat[i] != labelMat[j]): 104 | L = max(0, alphas[j] - alphas[i]) 105 | H = min(C, C + alphas[j] - alphas[i]) 106 | else: 107 | L = max(0, alphas[j] + alphas[i] - C) 108 | H = min(C, alphas[j] + alphas[i]) 109 | if L==H: print("L==H"); continue 110 | #步骤3:计算eta 111 | eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T 112 | if eta >= 0: print("eta>=0"); continue 113 | #步骤4:更新alpha_j 114 | alphas[j] -= labelMat[j]*(Ei - Ej)/eta 115 | #步骤5:修剪alpha_j 116 | alphas[j] = clipAlpha(alphas[j],H,L) 117 | if (abs(alphas[j] - alphaJold) < 0.00001): print("alpha_j变化太小"); continue 118 | #步骤6:更新alpha_i 119 | alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j]) 120 | #步骤7:更新b_1和b_2 121 | b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T 122 | b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T 123 | #步骤8:根据b_1和b_2更新b 124 | if (0 < alphas[i]) and (C > alphas[i]): b = b1 125 | elif (0 < alphas[j]) and (C > alphas[j]): b = b2 126 | else: b = (b1 + b2)/2.0 127 | #统计优化次数 128 | alphaPairsChanged += 1 129 | #打印统计信息 130 | print("第%d次迭代 样本:%d, alpha优化次数:%d" % (iter_num,i,alphaPairsChanged)) 131 | #更新迭代次数 132 | if (alphaPairsChanged == 0): iter_num += 1 133 | else: iter_num = 0 134 | print("迭代次数: %d" % iter_num) 135 | return b,alphas 136 | 137 | """ 138 | 函数说明:分类结果可视化 139 | 140 | Parameters: 141 | dataMat - 数据矩阵 142 | w - 直线法向量 143 | b - 直线解决 144 | """ 145 | def showClassifer(dataMat, w, b): 146 | # 绘制样本点 147 | place_plus = np.where(labelMat==1)[0] # 正样本的位置 148 | place_minus = np.where(labelMat==-1)[0] # 负样本的位置 149 | 150 | data_plus = dataMat[place_plus] #正样本 151 | data_minus = dataMat[place_minus] #负样本 152 | 153 | plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1],s=30, alpha=0.7) #正样本散点图 154 | plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1], s=30, alpha=0.7) #负样本散点图 155 | 156 | 157 | #绘制直线 158 | x1 = max(dataMat[:,0]) # 第一个属性的最大值 159 | x2 = min(dataMat[:,0]) # 第一个属性的最小值 160 | a1, a2 = w 161 | b = float(b) 162 | a1 = float(a1[0]) 163 | a2 = float(a2[0]) 164 | y1, y2 = (-b- a1*x1)/a2, (-b - a1*x2)/a2 165 | plt.plot([x1, x2], [y1, y2]) 166 | #找出支持向量点 167 | for i, alpha in enumerate(alphas): 168 | if abs(alpha) > 0: 169 | x, y = dataMat[i] 170 | plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='red') 171 | plt.show() 172 | 173 | 174 | """ 175 | 函数说明:计算w 176 | 177 | Parameters: 178 | dataMat - 数据矩阵 179 | labelMat - 数据标签 180 | alphas - alphas值 181 | """ 182 | def get_w(dataMat, labelMat, alphas): 183 | alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat) 184 | w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas) 185 | return w.tolist() 186 | 187 | 188 | if __name__ == '__main__': 189 | dataMat, labelMat = loadDataSet('testSet.txt') 190 | showDataSet(dataMat,labelMat) 191 | b,alphas = smoSimple(dataMat, labelMat, 0.6, 0.001, 40) 192 | w = get_w(dataMat, labelMat, alphas) 193 | 
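    # get_w recovers the separating hyperplane from the SMO result as w = sum(alpha_i * y_i * x_i);
    # showClassifer then draws the boundary w·x + b = 0 and circles the support vectors
    # (the samples whose alpha is non-zero) in red.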
showClassifer(dataMat, w, b) -------------------------------------------------------------------------------- /SVM/svm-svc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import operator 4 | from os import listdir 5 | from sklearn.svm import SVC 6 | 7 | 8 | 9 | def img2vector(filename): 10 | """ 11 | 将32x32的二进制图像转换为1x1024向量。 12 | Parameters: 13 | filename - 文件名 14 | Returns: 15 | returnVect - 返回的二进制图像的1x1024向量 16 | """ 17 | #创建1x1024零向量 18 | returnVect = np.zeros((1, 1024)) 19 | #打开文件 20 | fr = open(filename) 21 | #按行读取 22 | for i in range(32): 23 | #读一行数据 24 | lineStr = fr.readline() 25 | #每一行的前32个元素依次添加到returnVect中 26 | for j in range(32): 27 | returnVect[0, 32*i+j] = int(lineStr[j]) 28 | #返回转换后的1x1024向量 29 | return returnVect 30 | 31 | def handwritingClassTest(): 32 | """ 33 | 手写数字分类测试 34 | Parameters: 35 | 无 36 | Returns: 37 | 无 38 | """ 39 | #测试集的Labels 40 | hwLabels = [] 41 | #返回trainingDigits目录下的文件名 42 | trainingFileList = listdir('trainingDigits') 43 | #返回文件夹下文件的个数 44 | m = len(trainingFileList) 45 | #初始化训练的Mat矩阵,测试集 46 | trainingMat = np.zeros((m, 1024)) 47 | #从文件名中解析出训练集的类别 48 | for i in range(m): 49 | #获得文件的名字 50 | fileNameStr = trainingFileList[i] 51 | #获得分类的数字 52 | classNumber = int(fileNameStr.split('_')[0]) 53 | #将获得的类别添加到hwLabels中 54 | hwLabels.append(classNumber) 55 | #将每一个文件的1x1024数据存储到trainingMat矩阵中 56 | trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr)) 57 | clf = SVC(C=200,kernel='rbf') 58 | clf.fit(trainingMat,hwLabels) 59 | #返回testDigits目录下的文件列表 60 | testFileList = listdir('testDigits') 61 | #错误检测计数 62 | errorCount = 0.0 63 | #测试数据的数量 64 | mTest = len(testFileList) 65 | #从文件中解析出测试集的类别并进行分类测试 66 | for i in range(mTest): 67 | #获得文件的名字 68 | fileNameStr = testFileList[i] 69 | #获得分类的数字 70 | classNumber = int(fileNameStr.split('_')[0]) 71 | #获得测试集的1x1024向量,用于训练 72 | vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr)) 73 | #获得预测结果 74 | # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 75 | classifierResult = clf.predict(vectorUnderTest) 76 | print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber)) 77 | if(classifierResult != classNumber): 78 | errorCount += 1.0 79 | print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100)) 80 | 81 | if __name__ == '__main__': 82 | handwritingClassTest() -------------------------------------------------------------------------------- /SVM/testSet.txt: -------------------------------------------------------------------------------- 1 | 3.542485 1.977398 -1 2 | 3.018896 2.556416 -1 3 | 7.551510 -1.580030 1 4 | 2.114999 -0.004466 -1 5 | 8.127113 1.274372 1 6 | 7.108772 -0.986906 1 7 | 8.610639 2.046708 1 8 | 2.326297 0.265213 -1 9 | 3.634009 1.730537 -1 10 | 0.341367 -0.894998 -1 11 | 3.125951 0.293251 -1 12 | 2.123252 -0.783563 -1 13 | 0.887835 -2.797792 -1 14 | 7.139979 -2.329896 1 15 | 1.696414 -1.212496 -1 16 | 8.117032 0.623493 1 17 | 8.497162 -0.266649 1 18 | 4.658191 3.507396 -1 19 | 8.197181 1.545132 1 20 | 1.208047 0.213100 -1 21 | 1.928486 -0.321870 -1 22 | 2.175808 -0.014527 -1 23 | 7.886608 0.461755 1 24 | 3.223038 -0.552392 -1 25 | 3.628502 2.190585 -1 26 | 7.407860 -0.121961 1 27 | 7.286357 0.251077 1 28 | 2.301095 -0.533988 -1 29 | -0.232542 -0.547690 -1 30 | 3.457096 -0.082216 -1 31 | 3.023938 -0.057392 -1 32 | 8.015003 0.885325 1 33 | 8.991748 0.923154 1 34 | 7.916831 -1.781735 1 35 | 7.616862 -0.217958 1 36 | 2.450939 0.744967 -1 37 | 7.270337 -2.507834 1 38 | 1.749721 
-0.961902 -1 39 | 1.803111 -0.176349 -1 40 | 8.804461 3.044301 1 41 | 1.231257 -0.568573 -1 42 | 2.074915 1.410550 -1 43 | -0.743036 -1.736103 -1 44 | 3.536555 3.964960 -1 45 | 8.410143 0.025606 1 46 | 7.382988 -0.478764 1 47 | 6.960661 -0.245353 1 48 | 8.234460 0.701868 1 49 | 8.168618 -0.903835 1 50 | 1.534187 -0.622492 -1 51 | 9.229518 2.066088 1 52 | 7.886242 0.191813 1 53 | 2.893743 -1.643468 -1 54 | 1.870457 -1.040420 -1 55 | 5.286862 -2.358286 1 56 | 6.080573 0.418886 1 57 | 2.544314 1.714165 -1 58 | 6.016004 -3.753712 1 59 | 0.926310 -0.564359 -1 60 | 0.870296 -0.109952 -1 61 | 2.369345 1.375695 -1 62 | 1.363782 -0.254082 -1 63 | 7.279460 -0.189572 1 64 | 1.896005 0.515080 -1 65 | 8.102154 -0.603875 1 66 | 2.529893 0.662657 -1 67 | 1.963874 -0.365233 -1 68 | 8.132048 0.785914 1 69 | 8.245938 0.372366 1 70 | 6.543888 0.433164 1 71 | -0.236713 -5.766721 -1 72 | 8.112593 0.295839 1 73 | 9.803425 1.495167 1 74 | 1.497407 -0.552916 -1 75 | 1.336267 -1.632889 -1 76 | 9.205805 -0.586480 1 77 | 1.966279 -1.840439 -1 78 | 8.398012 1.584918 1 79 | 7.239953 -1.764292 1 80 | 7.556201 0.241185 1 81 | 9.015509 0.345019 1 82 | 8.266085 -0.230977 1 83 | 8.545620 2.788799 1 84 | 9.295969 1.346332 1 85 | 2.404234 0.570278 -1 86 | 2.037772 0.021919 -1 87 | 1.727631 -0.453143 -1 88 | 1.979395 -0.050773 -1 89 | 8.092288 -1.372433 1 90 | 1.667645 0.239204 -1 91 | 9.854303 1.365116 1 92 | 7.921057 -1.327587 1 93 | 8.500757 1.492372 1 94 | 1.339746 -0.291183 -1 95 | 3.107511 0.758367 -1 96 | 2.609525 0.902979 -1 97 | 3.263585 1.367898 -1 98 | 2.912122 -0.202359 -1 99 | 1.731786 0.589096 -1 100 | 2.387003 1.573131 -1 101 | -------------------------------------------------------------------------------- /SVM/testSetRBF.txt: -------------------------------------------------------------------------------- 1 | -0.214824 0.662756 -1.000000 2 | -0.061569 -0.091875 1.000000 3 | 0.406933 0.648055 -1.000000 4 | 0.223650 0.130142 1.000000 5 | 0.231317 0.766906 -1.000000 6 | -0.748800 -0.531637 -1.000000 7 | -0.557789 0.375797 -1.000000 8 | 0.207123 -0.019463 1.000000 9 | 0.286462 0.719470 -1.000000 10 | 0.195300 -0.179039 1.000000 11 | -0.152696 -0.153030 1.000000 12 | 0.384471 0.653336 -1.000000 13 | -0.117280 -0.153217 1.000000 14 | -0.238076 0.000583 1.000000 15 | -0.413576 0.145681 1.000000 16 | 0.490767 -0.680029 -1.000000 17 | 0.199894 -0.199381 1.000000 18 | -0.356048 0.537960 -1.000000 19 | -0.392868 -0.125261 1.000000 20 | 0.353588 -0.070617 1.000000 21 | 0.020984 0.925720 -1.000000 22 | -0.475167 -0.346247 -1.000000 23 | 0.074952 0.042783 1.000000 24 | 0.394164 -0.058217 1.000000 25 | 0.663418 0.436525 -1.000000 26 | 0.402158 0.577744 -1.000000 27 | -0.449349 -0.038074 1.000000 28 | 0.619080 -0.088188 -1.000000 29 | 0.268066 -0.071621 1.000000 30 | -0.015165 0.359326 1.000000 31 | 0.539368 -0.374972 -1.000000 32 | -0.319153 0.629673 -1.000000 33 | 0.694424 0.641180 -1.000000 34 | 0.079522 0.193198 1.000000 35 | 0.253289 -0.285861 1.000000 36 | -0.035558 -0.010086 1.000000 37 | -0.403483 0.474466 -1.000000 38 | -0.034312 0.995685 -1.000000 39 | -0.590657 0.438051 -1.000000 40 | -0.098871 -0.023953 1.000000 41 | -0.250001 0.141621 1.000000 42 | -0.012998 0.525985 -1.000000 43 | 0.153738 0.491531 -1.000000 44 | 0.388215 -0.656567 -1.000000 45 | 0.049008 0.013499 1.000000 46 | 0.068286 0.392741 1.000000 47 | 0.747800 -0.066630 -1.000000 48 | 0.004621 -0.042932 1.000000 49 | -0.701600 0.190983 -1.000000 50 | 0.055413 -0.024380 1.000000 51 | 0.035398 -0.333682 1.000000 52 | 0.211795 0.024689 1.000000 53 | 
-0.045677 0.172907 1.000000 54 | 0.595222 0.209570 -1.000000 55 | 0.229465 0.250409 1.000000 56 | -0.089293 0.068198 1.000000 57 | 0.384300 -0.176570 1.000000 58 | 0.834912 -0.110321 -1.000000 59 | -0.307768 0.503038 -1.000000 60 | -0.777063 -0.348066 -1.000000 61 | 0.017390 0.152441 1.000000 62 | -0.293382 -0.139778 1.000000 63 | -0.203272 0.286855 1.000000 64 | 0.957812 -0.152444 -1.000000 65 | 0.004609 -0.070617 1.000000 66 | -0.755431 0.096711 -1.000000 67 | -0.526487 0.547282 -1.000000 68 | -0.246873 0.833713 -1.000000 69 | 0.185639 -0.066162 1.000000 70 | 0.851934 0.456603 -1.000000 71 | -0.827912 0.117122 -1.000000 72 | 0.233512 -0.106274 1.000000 73 | 0.583671 -0.709033 -1.000000 74 | -0.487023 0.625140 -1.000000 75 | -0.448939 0.176725 1.000000 76 | 0.155907 -0.166371 1.000000 77 | 0.334204 0.381237 -1.000000 78 | 0.081536 -0.106212 1.000000 79 | 0.227222 0.527437 -1.000000 80 | 0.759290 0.330720 -1.000000 81 | 0.204177 -0.023516 1.000000 82 | 0.577939 0.403784 -1.000000 83 | -0.568534 0.442948 -1.000000 84 | -0.011520 0.021165 1.000000 85 | 0.875720 0.422476 -1.000000 86 | 0.297885 -0.632874 -1.000000 87 | -0.015821 0.031226 1.000000 88 | 0.541359 -0.205969 -1.000000 89 | -0.689946 -0.508674 -1.000000 90 | -0.343049 0.841653 -1.000000 91 | 0.523902 -0.436156 -1.000000 92 | 0.249281 -0.711840 -1.000000 93 | 0.193449 0.574598 -1.000000 94 | -0.257542 -0.753885 -1.000000 95 | -0.021605 0.158080 1.000000 96 | 0.601559 -0.727041 -1.000000 97 | -0.791603 0.095651 -1.000000 98 | -0.908298 -0.053376 -1.000000 99 | 0.122020 0.850966 -1.000000 100 | -0.725568 -0.292022 -1.000000 101 | -------------------------------------------------------------------------------- /SVM/testSetRBF2.txt: -------------------------------------------------------------------------------- 1 | 0.676771 -0.486687 -1.000000 2 | 0.008473 0.186070 1.000000 3 | -0.727789 0.594062 -1.000000 4 | 0.112367 0.287852 1.000000 5 | 0.383633 -0.038068 1.000000 6 | -0.927138 -0.032633 -1.000000 7 | -0.842803 -0.423115 -1.000000 8 | -0.003677 -0.367338 1.000000 9 | 0.443211 -0.698469 -1.000000 10 | -0.473835 0.005233 1.000000 11 | 0.616741 0.590841 -1.000000 12 | 0.557463 -0.373461 -1.000000 13 | -0.498535 -0.223231 -1.000000 14 | -0.246744 0.276413 1.000000 15 | -0.761980 -0.244188 -1.000000 16 | 0.641594 -0.479861 -1.000000 17 | -0.659140 0.529830 -1.000000 18 | -0.054873 -0.238900 1.000000 19 | -0.089644 -0.244683 1.000000 20 | -0.431576 -0.481538 -1.000000 21 | -0.099535 0.728679 -1.000000 22 | -0.188428 0.156443 1.000000 23 | 0.267051 0.318101 1.000000 24 | 0.222114 -0.528887 -1.000000 25 | 0.030369 0.113317 1.000000 26 | 0.392321 0.026089 1.000000 27 | 0.298871 -0.915427 -1.000000 28 | -0.034581 -0.133887 1.000000 29 | 0.405956 0.206980 1.000000 30 | 0.144902 -0.605762 -1.000000 31 | 0.274362 -0.401338 1.000000 32 | 0.397998 -0.780144 -1.000000 33 | 0.037863 0.155137 1.000000 34 | -0.010363 -0.004170 1.000000 35 | 0.506519 0.486619 -1.000000 36 | 0.000082 -0.020625 1.000000 37 | 0.057761 -0.155140 1.000000 38 | 0.027748 -0.553763 -1.000000 39 | -0.413363 -0.746830 -1.000000 40 | 0.081500 -0.014264 1.000000 41 | 0.047137 -0.491271 1.000000 42 | -0.267459 0.024770 1.000000 43 | -0.148288 -0.532471 -1.000000 44 | -0.225559 -0.201622 1.000000 45 | 0.772360 -0.518986 -1.000000 46 | -0.440670 0.688739 -1.000000 47 | 0.329064 -0.095349 1.000000 48 | 0.970170 -0.010671 -1.000000 49 | -0.689447 -0.318722 -1.000000 50 | -0.465493 -0.227468 -1.000000 51 | -0.049370 0.405711 1.000000 52 | -0.166117 0.274807 1.000000 53 | 0.054483 
0.012643 1.000000 54 | 0.021389 0.076125 1.000000 55 | -0.104404 -0.914042 -1.000000 56 | 0.294487 0.440886 -1.000000 57 | 0.107915 -0.493703 -1.000000 58 | 0.076311 0.438860 1.000000 59 | 0.370593 -0.728737 -1.000000 60 | 0.409890 0.306851 -1.000000 61 | 0.285445 0.474399 -1.000000 62 | -0.870134 -0.161685 -1.000000 63 | -0.654144 -0.675129 -1.000000 64 | 0.285278 -0.767310 -1.000000 65 | 0.049548 -0.000907 1.000000 66 | 0.030014 -0.093265 1.000000 67 | -0.128859 0.278865 1.000000 68 | 0.307463 0.085667 1.000000 69 | 0.023440 0.298638 1.000000 70 | 0.053920 0.235344 1.000000 71 | 0.059675 0.533339 -1.000000 72 | 0.817125 0.016536 -1.000000 73 | -0.108771 0.477254 1.000000 74 | -0.118106 0.017284 1.000000 75 | 0.288339 0.195457 1.000000 76 | 0.567309 -0.200203 -1.000000 77 | -0.202446 0.409387 1.000000 78 | -0.330769 -0.240797 1.000000 79 | -0.422377 0.480683 -1.000000 80 | -0.295269 0.326017 1.000000 81 | 0.261132 0.046478 1.000000 82 | -0.492244 -0.319998 -1.000000 83 | -0.384419 0.099170 1.000000 84 | 0.101882 -0.781145 -1.000000 85 | 0.234592 -0.383446 1.000000 86 | -0.020478 -0.901833 -1.000000 87 | 0.328449 0.186633 1.000000 88 | -0.150059 -0.409158 1.000000 89 | -0.155876 -0.843413 -1.000000 90 | -0.098134 -0.136786 1.000000 91 | 0.110575 -0.197205 1.000000 92 | 0.219021 0.054347 1.000000 93 | 0.030152 0.251682 1.000000 94 | 0.033447 -0.122824 1.000000 95 | -0.686225 -0.020779 -1.000000 96 | -0.911211 -0.262011 -1.000000 97 | 0.572557 0.377526 -1.000000 98 | -0.073647 -0.519163 -1.000000 99 | -0.281830 -0.797236 -1.000000 100 | -0.555263 0.126232 -1.000000 101 | -------------------------------------------------------------------------------- /SVM/帮助.txt: -------------------------------------------------------------------------------- 1 | 支持向量机SVM以及核方法的相关demo 2 | -------------------------------------------------------------------------------- /SVM/核方法.py: -------------------------------------------------------------------------------- 1 | # 输入对象的各个属性间存在非线性作用 2 | # 使用核方法,进行婚姻数据匹配。婚姻数据同线性分类器中数据 3 | 4 | # 场景:男女不同的属性信息,例如年龄、是否吸烟、是否要孩子、兴趣列表、家庭住址。产生的输出结果,配对成功还是不成功 5 | 6 | import linearclassify 7 | 8 | #=============核方法============ 9 | import math 10 | 11 | # 向量线段长度。 12 | def veclength(v): 13 | return sum([p**2 for p in v]) 14 | 15 | # 使用径向基函数代替向量点积函数。将数据映射到更高维的空间(以为更高纬度空间可以通过线性分离)。可以调整gamma参数,达到最佳分离 16 | def rbf(v1,v2,gamma=10): 17 | dv=[v1[i]-v2[i] for i in range(len(v1))] 18 | l=veclength(dv) 19 | return math.e**(-gamma*l) 20 | 21 | # 使用核方法进行线性分类。计算每个坐标点与分类中其余每个坐标点之间的点积或径向基函数的结果,然后对他们求均值 22 | def nlclassify(point,rows,offset,gamma=10): 23 | sum0=0.0 24 | sum1=0.0 25 | count0=0 26 | count1=0 27 | 28 | for row in rows: 29 | if row.matchresult==0: 30 | sum0+=rbf(point,row.data,gamma) #求径向基函数 31 | count0+=1 32 | else: 33 | sum1+=rbf(point,row.data,gamma) 34 | count1+=1 35 | y=(1.0/count0)*sum0-(1.0/count1)*sum1+offset 36 | 37 | if y>0: return 0 38 | else: return 1 39 | 40 | def getoffset(rows,gamma=10): 41 | t0=[] 42 | t1=[] 43 | for row in rows: 44 | if row.matchresult==0: t0.append(row.data) 45 | else: t1.append(row.data) 46 | sum0=sum(sum([rbf(v1,v2,gamma) for v1 in t0]) for v2 in t0) 47 | sum1=sum(sum([rbf(v1,v2,gamma) for v1 in t1]) for v2 in t1) 48 | 49 | return (1.0/(len(t1)**2))*sum1-(1.0/(len(t0)**2))*sum0 50 | 51 | 52 | 53 | if __name__=='__main__': #只有在执行当前模块时才会运行此函数 54 | 55 | # 年龄匹配数据集的核方法 56 | agesonly = linearclassify.loadmatch('agesonly.csv',allnum=True) #读入只关注年龄的配对情况 57 | print(agesonly) 58 | offset = getoffset(agesonly) #获取高维度下的数据偏移 59 | print(offset) 60 | result = 
nlclassify([30,30],agesonly,offset) #使用核方法来判断属于哪个分类 61 | print(result) 62 | 63 | # 复杂数据集的核方法 64 | numercalset = linearclassify.loadnumerical() # 获取转化为数值型的复杂数据集 65 | scaledset, scalef = linearclassify.scaledata(numercalset) # 对复杂数据集进行比例缩放 66 | ssoffset = getoffset(scaledset) #获取高维度下的数据偏移 67 | onedata = scalef([28.0,-1,-1,26.0,-1,1,2,0.8]) # 取一个数据作为新数据先比例缩放 68 | result = nlclassify(onedata, scaledset, ssoffset) # 使用核方法来判断属于哪个分类 69 | print(result) -------------------------------------------------------------------------------- /Search-Engines/searchengine.py: -------------------------------------------------------------------------------- 1 | # 搜索和排名 2 | import urllib 3 | from bs4 import BeautifulSoup 4 | import re 5 | import sqlite3 6 | import nn 7 | import os 8 | import spyder #获取爬虫数据集 9 | 10 | # 分词时忽略下列词 11 | biaodian = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+ ,。?‘’“”!;:\r\n、()…' #所有的标点符号 12 | # ignorewords=[',','。','?','“','”','!',';',':','\n','、','-',',','.','?','\r\n','_',' '] 13 | ignorewords = list(set(biaodian)) #去重 14 | ignorewords.append('\r\n') #添加一个不忽略的项 15 | 16 | 17 | 18 | # 定义搜索引擎类 19 | class searcher: 20 | def __init__(self,dbname): 21 | self.con=sqlite3.connect(dbname) #链接数据库 22 | self.curs = self.con.cursor() 23 | 24 | def __del__(self): 25 | self.curs.close() 26 | self.con.close() 27 | 28 | 29 | # 根据搜索字符串分词后获取查询到的链接 30 | def getmatchrows(self,querystr): 31 | # 构造数据库的查询字符串(搜索字符串根据空格分割成查询字符串列表) 32 | fieldlist='w0.urlid' 33 | tablelist='' 34 | clauselist='' 35 | wordids=[] 36 | 37 | # 根据空格分割单词 38 | words=querystr.strip().split(' ') 39 | tablenumber=0 40 | for word in words: 41 | # 获取单词的id 42 | wordrow=self.curs.execute("select rowid from wordlist where word='%s'" % word).fetchall() 43 | if wordrow!=None and len(wordrow)> 0: 44 | wordid=wordrow[0][0] #获取单词id 45 | wordids.append(wordid) 46 | if tablenumber>0: 47 | tablelist+=',' 48 | clauselist+=' and ' 49 | clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber) 50 | fieldlist+=',w%d.location' % tablenumber 51 | tablelist+='wordlocation w%d' % tablenumber 52 | clauselist+='w%d.wordid=%d' % (tablenumber,wordid) 53 | tablenumber+=1 54 | 55 | # 根据各个组分,建立查询。为列表中的每个单词,建立指向wordlocation表的引用,并根据对应的urlid将它们连接起来进行联合查询 56 | fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist) 57 | # print(fullquery) 58 | cur=self.curs.execute(fullquery) 59 | rows=[row for row in cur.fetchall()] 60 | 61 | return rows,wordids 62 | 63 | # 对查询到的链接进行排名。参数:rows,wordids查询字符串id列表 64 | def getscoredlist(self,rows,wordids): 65 | totalscores=dict([(row[0],0) for row in rows]) 66 | # 对链接进行评价的函数。(权重和评价值),使用了多种评价函数 67 | weights=[(1.0,self.locationscore(rows)), #根据关键词出现的位置获取权重 68 | (1.0,self.frequencyscore(rows)), #根据关键词出现的频率获取权重 69 | (1.0,self.pagerankscore(rows)), #根据pagerank获取权重 70 | (1.0,self.linktextscore(rows,wordids)), #根据链接描述获取权重 71 | (5.0,self.nnscore(rows,wordids))] #根据神经网络获取权重 72 | for (weight,scores) in weights: 73 | for urlid in totalscores: 74 | totalscores[urlid]+=weight*scores[urlid] 75 | 76 | return totalscores #返回每个链接的评价值 77 | 78 | #根据urlid查询url 79 | def geturlname(self,id): 80 | return self.curs.execute("select url from urllist where rowid=%d" % id).fetchall()[0][0] 81 | 82 | #搜索函数:将上面的搜索、评价、排名合并在一起 83 | def query(self,querystr): 84 | rows,wordids=self.getmatchrows(querystr) #rows是[urlid,wordlocation1,wordlocation2,wordlocation3...] 
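        # With a two-word query, getmatchrows() above returns rows of the form
        #   rows    = [(urlid, location_of_word1, location_of_word2), ...]
        #   wordids = [wordlist_rowid_of_word1, wordlist_rowid_of_word2]
        # getscoredlist() below folds these rows into a {urlid: score} dict: each scoring
        # function is normalized to the 0..1 range and the results are summed using the
        # per-function weights defined in its weights[] list.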
85 | if rows==None or len(rows)==0: 86 | print('无法查询到,请使用空格分隔查询关键词') 87 | return 88 | 89 | scores=self.getscoredlist(rows,wordids) 90 | rankedscores=[(score,url) for (url,score) in scores.items()] 91 | rankedscores.sort() 92 | rankedscores.reverse() 93 | for (score,urlid) in rankedscores[0:10]: 94 | print('%f\t%d\t%s' % (score,urlid,self.geturlname(urlid))) 95 | return wordids,[r[1] for r in rankedscores[0:10]] 96 | 97 | 98 | # 评价值归一化:因为不同的评价方法的返回值和含义不同。这里所有的评价值归一化到0-1,默认越大越好 99 | def normalizescores(self,scores,smallIsBetter=0): 100 | vsmall=0.00001 #避免被0整除 101 | if smallIsBetter: 102 | minscore=min(scores.values()) 103 | return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()]) 104 | else: 105 | maxscore=max(scores.values()) 106 | if maxscore==0: maxscore=vsmall 107 | return dict([(u,float(c)/maxscore) for (u,c) in scores.items()]) 108 | 109 | # 根据单词频度进行评价的函数.#rows是[urlid,wordlocation1,wordlocation2,wordlocation3...] 110 | def frequencyscore(self,rows): 111 | counts=dict([(row[0],0) for row in rows]) 112 | for row in rows: counts[row[0]]+=1 113 | return self.normalizescores(counts) 114 | 115 | # 根据单词位置进行评价的函数.#rows是[urlid,wordlocation1,wordlocation2,wordlocation3...] 116 | def locationscore(self,rows): 117 | locations=dict([(row[0],1000000) for row in rows]) 118 | for row in rows: 119 | loc=sum(row[1:]) 120 | if loc") 178 | wordids,urlids=mysearcher.query(searchkey) 179 | # print(wordids,urlids) 180 | selurlid= input("选中链接id>") 181 | selurlid = int(selurlid) 182 | mynet.trainquery(wordids, urlids,selurlid) #根据用户选择的链接进行训练 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /Search-Engines/帮助.txt: -------------------------------------------------------------------------------- 1 | 搜索引擎和排名的demo 2 | -------------------------------------------------------------------------------- /adaboost/帮助: -------------------------------------------------------------------------------- 1 | adaboost算法demo 2 | -------------------------------------------------------------------------------- /association/Apriori.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | 3 | 4 | def loadDataSet(): 5 | return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] 6 | 7 | 8 | # =======================================================频繁项集=================================================== 9 | 10 | # 获取候选1项集,dataSet为事务集。返回一个list,每个元素都是set集合 11 | def createC1(dataSet): 12 | C1 = [] # 元素个数为1的项集(非频繁项集,因为还没有同最小支持度比较) 13 | for transaction in dataSet: 14 | for item in transaction: 15 | if not [item] in C1: 16 | C1.append([item]) 17 | C1.sort() # 这里排序是为了,生成新的候选集时可以直接认为两个n项候选集前面的部分相同 18 | # 因为除了候选1项集外其他的候选n项集都是以二维列表的形式存在,所以要将候选1项集的每一个元素都转化为一个单独的集合。 19 | return list(map(frozenset, C1)) #map(frozenset, C1)的语义是将C1由Python列表转换为不变集合(frozenset,Python中的数据结构) 20 | 21 | 22 | 23 | 24 | # 找出候选集中的频繁项集 25 | # dataSet为全部数据集,Ck为大小为k(包含k个元素)的候选项集,minSupport为设定的最小支持度 26 | def scanD(dataSet, Ck, minSupport): 27 | ssCnt = {} # 记录每个候选项的个数 28 | for tid in dataSet: 29 | for can in Ck: 30 | if can.issubset(tid): 31 | ssCnt[can] = ssCnt.get(can, 0) + 1 # 计算每一个项集出现的频率 32 | numItems = float(len(dataSet)) 33 | retList = [] 34 | supportData = {} 35 | for key in ssCnt: 36 | support = ssCnt[key] / numItems 37 | if support >= minSupport: 38 | retList.insert(0, key) #将频繁项集插入返回列表的首部 39 | supportData[key] = support 40 | return retList, supportData #retList为在Ck中找出的频繁项集(支持度大于minSupport的),supportData记录各频繁项集的支持度 41 | 
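A minimal usage sketch for createC1() and scanD() above, using the toy transactions from loadDataSet(); the ordering of the itemsets inside L1 may vary:

dataSet = loadDataSet()                 # [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
C1 = createC1(dataSet)                  # candidate 1-itemsets, each a frozenset
D = list(map(set, dataSet))             # transactions converted to sets
L1, supportData = scanD(D, C1, 0.5)     # keep itemsets with support >= 0.5
# L1 now holds the frequent 1-itemsets, e.g. frozenset({2}), frozenset({3}), frozenset({5}), frozenset({1});
# supportData[frozenset({3})] == 0.75 because item 3 appears in 3 of the 4 transactions,
# while frozenset({4}) (support 0.25) is filtered out.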
42 | 43 | # 通过频繁项集列表Lk和项集个数k生成候选项集C(k+1)。 44 | def aprioriGen(Lk, k): 45 | retList = [] 46 | lenLk = len(Lk) 47 | for i in range(lenLk): 48 | for j in range(i + 1, lenLk): 49 | # 前k-1项相同时,才将两个集合合并,合并后才能生成k+1项 50 | L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2] # 取出两个集合的前k-1个元素 51 | L1.sort(); L2.sort() 52 | if L1 == L2: 53 | retList.append(Lk[i] | Lk[j]) 54 | return retList 55 | 56 | # 获取事务集中的所有的频繁项集 57 | # Ck表示项数为k的候选项集,最初的C1通过createC1()函数生成。Lk表示项数为k的频繁项集,supK为其支持度,Lk和supK由scanD()函数通过Ck计算而来。 58 | def apriori(dataSet, minSupport=0.5): 59 | C1 = createC1(dataSet) # 从事务集中获取候选1项集 60 | D = list(map(set, dataSet)) # 将事务集的每个元素转化为集合 61 | L1, supportData = scanD(D, C1, minSupport) # 获取频繁1项集和对应的支持度 62 | L = [L1] # L用来存储所有的频繁项集 63 | k = 2 64 | while (len(L[k-2]) > 0): # 一直迭代到项集数目过大而在事务集中不存在这种n项集 65 | Ck = aprioriGen(L[k-2], k) # 根据频繁项集生成新的候选项集。Ck表示项数为k的候选项集 66 | Lk, supK = scanD(D, Ck, minSupport) # Lk表示项数为k的频繁项集,supK为其支持度 67 | L.append(Lk);supportData.update(supK) # 添加新频繁项集和他们的支持度 68 | k += 1 69 | return L, supportData 70 | 71 | 72 | 73 | # =====================================================关联规则================================================== 74 | # 对候选规则集进行评估 75 | # 计算规则的可信度,并过滤出满足最小可信度要求的规则。freqSet为频繁2项集, H为频繁项集每个元素都是集合的形式 76 | def calcConf(freqSet, H, supportData, brl, minConf=0.7): 77 | prunedH = [] 78 | for conseq in H: 79 | conf = supportData[freqSet] / supportData[freqSet - conseq] # 支持度相减就等级差集的置信度 80 | if conf >= minConf: 81 | print(freqSet - conseq, '-->', conseq, 'conf:', conf) 82 | brl.append((freqSet - conseq, conseq, conf)) # 记录关联规则和置信度 83 | prunedH.append(conseq) #记录关联规则的源项 84 | return prunedH 85 | 86 | # 生成候选规则集。freqSet为频繁n项集,H为频繁项集每个元素多是集合的形式,brl为关联规则,minConf为最低置信区间 87 | def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7): 88 | m = len(H[0]) #先计算要计算规则的源项的大小 89 | if (len(freqSet) > (m + 1)): #查看该频繁项集是否大到可以移除大小为m的子集,也就是说是否还有目的项 90 | Hmpl = aprioriGen(H, m + 1) # 生成候选集 91 | Hmpl = calcConf(freqSet, Hmpl, supportData, brl, minConf) # 计算关联规则 92 | if (len(Hmpl) > 1): 93 | rulesFromConseq(freqSet, Hmpl, supportData, brl, minConf) # 下一次迭代的列表 94 | 95 | 96 | # 获取关联规则。L为频繁项集(不包含频繁1项集),supportData为频繁项集对象的支持度,minConf最小可信度阈值 97 | def generateRules(L, supportData, minConf=0.7): 98 | bigRuleList = [] 99 | for i in range(1, len(L)): # 遍历每一个频繁项 100 | for freqSet in L[i]: # 遍历每一个项 101 | H1 = [frozenset([item]) for item in freqSet] #将每一个项转化为集合 102 | if (i > 1): 103 | # 三个及以上元素的集合 104 | rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) 105 | else: 106 | # 两个元素的集合 107 | calcConf(freqSet, H1, supportData, bigRuleList, minConf) 108 | return bigRuleList 109 | 110 | 111 | if __name__=='__main__': 112 | dataSet = loadDataSet() # 获取事务集。每个元素都是列表 113 | # C1 = createC1(dataSet) # 获取候选1项集。每个元素都是集合 114 | # D = list(map(set, dataSet)) # 转化事务集的形式,每个元素都转化为集合。 115 | # L1, suppDat = scanD(D, C1, 0.5) 116 | # print(L1,suppDat) 117 | 118 | 119 | # L, suppData = apriori(dataSet,minSupport=0.7) 120 | # print(L,suppData) 121 | 122 | 123 | L, suppData = apriori(dataSet, minSupport=0.5) 124 | print(L,suppData) 125 | rules = generateRules(L, suppData, minConf=0.7) 126 | print(rules) -------------------------------------------------------------------------------- /association/fpGrowth.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # FP树类 4 | class treeNode: 5 | def __init__(self, nameValue, numOccur, parentNode): 6 | self.name = nameValue #节点元素名称,在构造时初始化为给定值 7 | self.count = numOccur # 出现次数,在构造时初始化为给定值 8 | self.nodeLink = None # 指向下一个相似节点的指针,默认为None 9 
| self.parent = parentNode # 指向父节点的指针,在构造时初始化为给定值 10 | self.children = {} # 指向子节点的字典,以子节点的元素名称为键,指向子节点的指针为值,初始化为空字典 11 | 12 | # 增加节点的出现次数值 13 | def inc(self, numOccur): 14 | self.count += numOccur 15 | 16 | # 输出节点和子节点的FP树结构 17 | def disp(self, ind=1): 18 | print(' ' * ind, self.name, ' ', self.count) 19 | for child in self.children.values(): 20 | child.disp(ind + 1) 21 | 22 | 23 | # =======================================================构建FP树================================================== 24 | 25 | 26 | # 对不是第一个出现的节点,更新头指针块。就是添加到相似元素链表的尾部 27 | def updateHeader(nodeToTest, targetNode): 28 | while (nodeToTest.nodeLink != None): 29 | nodeToTest = nodeToTest.nodeLink 30 | nodeToTest.nodeLink = targetNode 31 | 32 | # 根据一个排序过滤后的频繁项更新FP树 33 | def updateTree(items, inTree, headerTable, count): 34 | if items[0] in inTree.children: 35 | # 有该元素项时计数值+1 36 | inTree.children[items[0]].inc(count) 37 | else: 38 | # 没有这个元素项时创建一个新节点 39 | inTree.children[items[0]] = treeNode(items[0], count, inTree) 40 | # 更新头指针表或前一个相似元素项节点的指针指向新节点 41 | if headerTable[items[0]][1] == None: # 如果是第一次出现,则在头指针表中增加对该节点的指向 42 | headerTable[items[0]][1] = inTree.children[items[0]] 43 | else: 44 | updateHeader(headerTable[items[0]][1], inTree.children[items[0]]) 45 | 46 | if len(items) > 1: 47 | # 对剩下的元素项迭代调用updateTree函数 48 | updateTree(items[1::], inTree.children[items[0]], headerTable, count) 49 | 50 | 51 | 52 | # 主程序。创建FP树。dataSet为事务集,为一个字典,键为每个事物,值为该事物出现的次数。minSup为最低支持度 53 | def createTree(dataSet, minSup=1): 54 | # 第一次遍历数据集,创建头指针表 55 | headerTable = {} 56 | for trans in dataSet: 57 | for item in trans: 58 | headerTable[item] = headerTable.get(item, 0) + dataSet[trans] 59 | # 移除不满足最小支持度的元素项 60 | keys = list(headerTable.keys()) # 因为字典要求在迭代中不能修改,所以转化为列表 61 | for k in keys: 62 | if headerTable[k] < minSup: 63 | del(headerTable[k]) 64 | # 空元素集,返回空 65 | freqItemSet = set(headerTable.keys()) 66 | if len(freqItemSet) == 0: 67 | return None, None 68 | # 增加一个数据项,用于存放指向相似元素项指针 69 | for k in headerTable: 70 | headerTable[k] = [headerTable[k], None] # 每个键的值,第一个为个数,第二个为下一个节点的位置 71 | retTree = treeNode('Null Set', 1, None) # 根节点 72 | # 第二次遍历数据集,创建FP树 73 | for tranSet, count in dataSet.items(): 74 | localD = {} # 记录频繁1项集的全局频率,用于排序 75 | for item in tranSet: 76 | if item in freqItemSet: # 只考虑频繁项 77 | localD[item] = headerTable[item][0] # 注意这个[0],因为之前加过一个数据项 78 | if len(localD) > 0: 79 | orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)] # 排序 80 | updateTree(orderedItems, retTree, headerTable, count) # 更新FP树 81 | return retTree, headerTable 82 | 83 | 84 | # =================================================查找元素条件模式基=============================================== 85 | 86 | # 直接修改prefixPath的值,将当前节点leafNode添加到prefixPath的末尾,然后递归添加其父节点。 87 | # prefixPath就是一条从treeNode(包括treeNode)到根节点(不包括根节点)的路径 88 | def ascendTree(leafNode, prefixPath): 89 | if leafNode.parent != None: 90 | prefixPath.append(leafNode.name) 91 | ascendTree(leafNode.parent, prefixPath) 92 | 93 | # 为给定元素项生成一个条件模式基(前缀路径)。basePet表示输入的频繁项,treeNode为当前FP树中对应的第一个节点 94 | # 函数返回值即为条件模式基condPats,用一个字典表示,键为前缀路径,值为计数值。 95 | def findPrefixPath(basePat, treeNode): 96 | condPats = {} # 存储条件模式基 97 | while treeNode != None: 98 | prefixPath = [] # 用于存储前缀路径 99 | ascendTree(treeNode, prefixPath) # 生成前缀路径 100 | if len(prefixPath) > 1: 101 | condPats[frozenset(prefixPath[1:])] = treeNode.count # 出现的数量就是当前叶子节点的数量 102 | treeNode = treeNode.nodeLink # 遍历下一个相同元素 103 | return condPats 104 | 105 | 106 | 107 | # 
=================================================递归查找频繁项集=============================================== 108 | # 根据事务集获取FP数和频繁项。 109 | # 遍历频繁项,生成每个频繁项的条件FP树和条件FP树的频繁项 110 | # 这样每个频繁项与他条件FP数的频繁项都构成了频繁项集 111 | 112 | # inTree和headerTable是由createTree()函数生成的事务集的FP树。 113 | # minSup表示最小支持度。 114 | # preFix请传入一个空集合(set([])),将在函数中用于保存当前前缀。 115 | # freqItemList请传入一个空列表([]),将用来储存生成的频繁项集。 116 | def mineTree(inTree, headerTable, minSup, preFix, freqItemList): 117 | # 对频繁项按出现的数量进行排序进行排序 118 | sorted_headerTable = sorted(headerTable.items(), key=lambda p: p[1][0]) #返回重新排序的列表。每个元素是一个元组,[(key,[num,treeNode],()) 119 | bigL = [v[0] for v in sorted_headerTable] # 获取频繁项 120 | for basePat in bigL: 121 | newFreqSet = preFix.copy() # 新的频繁项集 122 | newFreqSet.add(basePat) # 当前前缀添加一个新元素 123 | freqItemList.append(newFreqSet) # 所有的频繁项集列表 124 | condPattBases = findPrefixPath(basePat, headerTable[basePat][1]) # 获取条件模式基。就是basePat元素的所有前缀路径。它像一个新的事务集 125 | myCondTree, myHead = createTree(condPattBases, minSup) # 创建条件FP数 126 | 127 | if myHead != None: 128 | # 用于测试 129 | print('conditional tree for:', newFreqSet) 130 | myCondTree.disp() 131 | mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList) # 递归直到不再有元素 132 | 133 | 134 | 135 | 136 | # 生成数据集 137 | def loadSimpDat(): 138 | simpDat = [['r', 'z', 'h', 'j', 'p'], 139 | ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], 140 | ['z'], 141 | ['r', 'x', 'n', 'o', 's'], 142 | ['y', 'r', 'x', 'z', 'q', 't', 'p'], 143 | ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']] 144 | return simpDat 145 | 146 | # 将数据集转化为目标格式 147 | def createInitSet(dataSet): 148 | retDict = {} 149 | for trans in dataSet: 150 | retDict[frozenset(trans)] = 1 151 | return retDict 152 | 153 | if __name__=='__main__': 154 | minSup =3 155 | simpDat = loadSimpDat() # 加载数据集 156 | initSet = createInitSet(simpDat) # 转化为符合格式的事务集 157 | myFPtree, myHeaderTab = createTree(initSet, minSup) # 形成FP树 158 | # myFPtree.disp() # 打印树 159 | 160 | freqItems = [] # 用于存储频繁项集 161 | mineTree(myFPtree, myHeaderTab, minSup, set([]), freqItems) # 获取频繁项集 162 | print(freqItems) # 打印频繁项集 -------------------------------------------------------------------------------- /association/帮助.txt: -------------------------------------------------------------------------------- 1 | 关联分析的相关demo 2 | -------------------------------------------------------------------------------- /cluster/DBSCAN.py: -------------------------------------------------------------------------------- 1 | import pylab as pl 2 | from collections import defaultdict,Counter 3 | 4 | # 加载数据集 5 | def loaddata(filepath): 6 | points = [[float(eachpoint.split(",")[0]), float(eachpoint.split(",")[1])] for eachpoint in open(filepath,"r")] 7 | return points 8 | 9 | # 以距离最大的维度上的距离为两个对象之间的距离 10 | def distance(point1,point2): 11 | return max(abs(point1[0] - point2[0]),abs(point1[1] - point2[1])) 12 | 13 | 14 | # 计算每个数据点相邻的数据点,邻域定义为以该点为中心以边长为2*EPs的网格 15 | def getSurroundPoint(points,Eps=1): 16 | surroundPoints = {} # 每个元素默认是一个空列表 17 | for idx1,point1 in enumerate(points): 18 | for idx2,point2 in enumerate(points): 19 | if (idx1 < idx2): 20 | if(distance(point1,point2)<=Eps): 21 | surroundPoints.setdefault(idx1,[]) # 设置每个点的默认值邻节点为空列表 22 | surroundPoints.setdefault(idx2, []) # 设置每个点的默认值邻节点为空列表 23 | surroundPoints[idx1].append(idx2) 24 | surroundPoints[idx2].append(idx1) 25 | return surroundPoints 26 | 27 | 28 | 29 | # 定义邻域内相邻的数据点的个数大于4的为核心点,获取核心点以及核心点的周边点 30 | def findallCore(points,surroundPoints,Eps=10,MinPts=5): 31 | # 获取所有核心点 32 | corePointIdx = [pointIdx for pointIdx,surPointIdxs in 
surroundPoints.items() if len(surPointIdxs)>=MinPts] 33 | # 邻域内包含某个核心点的非核心点,定义为边界点 34 | borderPointIdx = [] 35 | for pointIdx,surPointIdxs in surroundPoints.items(): 36 | if (pointIdx not in corePointIdx): # 边界点本身不是核心点 37 | for onesurPointIdx in surPointIdxs: 38 | if onesurPointIdx in corePointIdx: # 边界点周边至少包含一个核心点 39 | borderPointIdx.append(pointIdx) 40 | break 41 | 42 | corePoint = [points[pointIdx] for pointIdx in corePointIdx] # 核心点 43 | borderPoint = [points[pointIdx] for pointIdx in borderPointIdx] #边界点 44 | return corePointIdx,borderPointIdx 45 | 46 | # 获取所有噪声点。噪音点既不是边界点也不是核心点 47 | def findallnoise(points,corePointIdx,borderPointIdx): 48 | noisePointIdx = [pointIdx for pointIdx in range(len(points)) if pointIdx not in corePointIdx and pointIdx not in borderPointIdx] 49 | noisePoint = [points[pointIdx] for pointIdx in noisePointIdx] 50 | return noisePoint 51 | 52 | 53 | 54 | 55 | # 根据邻域关系,核心点,边界点进行分簇 56 | def divideGroups(points,surroundPoints,corePointIdx,borderPointIdx): 57 | groups = [idx for idx in range(len(points))] # groups记录每个节点所属的簇编号 58 | # 各个核心点与其邻域内的所有核心点放在同一个簇中 59 | for pointidx,surroundIdxs in surroundPoints.items(): 60 | for oneSurroundIdx in surroundIdxs: 61 | if (pointidx in corePointIdx and oneSurroundIdx in corePointIdx and pointidx < oneSurroundIdx): 62 | for idx in range(len(groups)): 63 | if groups[idx] == groups[oneSurroundIdx]: 64 | groups[idx] = groups[pointidx] 65 | 66 | # 边界点跟其邻域内的某个核心点放在同一个簇中 67 | for pointidx,surroundIdxs in surroundPoints.items(): 68 | for oneSurroundIdx in surroundIdxs: 69 | if (pointidx in borderPointIdx and oneSurroundIdx in corePointIdx): 70 | groups[pointidx] = groups[oneSurroundIdx] 71 | break 72 | return groups 73 | 74 | # 绘制分簇图 75 | def plotgroup(points,groups,noisePoint): 76 | # 取簇规模最大的3个簇 77 | finalGroup = Counter(groups).most_common(3) 78 | finalGroup = [onecount[0] for onecount in finalGroup] 79 | group1 = [points[idx] for idx in range(len(points)) if groups[idx]==finalGroup[0]] 80 | group2 = [points[idx] for idx in range(len(points)) if groups[idx]==finalGroup[1]] 81 | group3 = [points[idx] for idx in range(len(points)) if groups[idx]==finalGroup[2]] 82 | pl.plot([eachpoint[0] for eachpoint in group1], [eachpoint[1] for eachpoint in group1], 'or') 83 | pl.plot([eachpoint[0] for eachpoint in group2], [eachpoint[1] for eachpoint in group2], 'oy') 84 | pl.plot([eachpoint[0] for eachpoint in group3], [eachpoint[1] for eachpoint in group3], 'og') 85 | # 打印噪音点,黑色 86 | pl.plot([eachpoint[0] for eachpoint in noisePoint], [eachpoint[1] for eachpoint in noisePoint], 'ok') 87 | pl.show() 88 | 89 | 90 | if __name__=='__main__': 91 | points = loaddata('DBSCAN_data.txt') # 加载数据 92 | surroundPoints=getSurroundPoint(points,Eps=2) # 获取邻域关系 93 | corePointIdx, borderPointIdx = findallCore(points,surroundPoints,Eps=2,MinPts=3) # 获取核心节点和边界节点 94 | noisePoint = findallnoise(points,corePointIdx,borderPointIdx) # 获取噪音节点 95 | groups = divideGroups(points,surroundPoints,corePointIdx,borderPointIdx) # 节点分簇 96 | plotgroup(points, groups, noisePoint) # 可视化绘图 -------------------------------------------------------------------------------- /cluster/DBSCAN_data.txt: -------------------------------------------------------------------------------- 1 | -2.68420713,1.469732895 2 | -2.71539062,-0.763005825 3 | -2.88981954,-0.618055245 4 | -2.7464372,-1.40005944 5 | -2.72859298,1.50266052 6 | -2.27989736,3.365022195 7 | -2.82089068,-0.369470295 8 | -2.62648199,0.766824075 9 | -2.88795857,-2.568591135 10 | -2.67384469,-0.48011265 11 | -2.50652679,2.933707545 12 | 
-2.61314272,0.096842835 13 | -2.78743398,-1.024830855 14 | -3.22520045,-2.264759595 15 | -2.64354322,5.33787705 16 | -2.38386932,6.05139453 17 | -2.6225262,3.681403515 18 | -2.64832273,1.436115015 19 | -2.19907796,3.956598405 20 | -2.58734619,2.34213138 21 | 1.28479459,3.084476355 22 | 0.93241075,1.436391405 23 | 1.46406132,2.268854235 24 | 0.18096721,-3.71521773 25 | 1.08713449,0.339256755 26 | 0.64043675,-1.87795566 27 | 1.09522371,1.277510445 28 | -0.75146714,-4.504983795 29 | 1.04329778,1.030306095 30 | -0.01019007,-3.242586915 31 | -0.5110862,-5.681213775 32 | 0.51109806,-0.460278495 33 | 0.26233576,-2.46551985 34 | 0.98404455,-0.55962189 35 | -0.174864,-1.133170065 36 | 0.92757294,2.107062945 37 | 0.65959279,-1.583893305 38 | 0.23454059,-1.493648235 39 | 0.94236171,-2.43820017 40 | 0.0432464,-2.616702525 41 | 4.53172698,-0.05329008 42 | 3.41407223,-2.58716277 43 | 4.61648461,1.538708805 44 | 3.97081495,-0.815065605 45 | 4.34975798,-0.188471475 46 | 5.39687992,2.462256225 47 | 2.51938325,-5.361082605 48 | 4.9320051,1.585696545 49 | 4.31967279,-1.104966765 50 | 4.91813423,3.511712835 51 | 3.66193495,1.0891728 52 | 3.80234045,-0.972695745 53 | 4.16537886,0.96876126 54 | 3.34459422,-3.493869435 55 | 3.5852673,-2.426881725 56 | 3.90474358,0.534685455 57 | 3.94924878,0.18328617 58 | 5.48876538,5.27195043 59 | 5.79468686,1.139695065 60 | 3.29832982,-3.42456273 -------------------------------------------------------------------------------- /cluster/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/cluster/Thumbs.db -------------------------------------------------------------------------------- /cluster/blogclust.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/cluster/blogclust.jpg -------------------------------------------------------------------------------- /cluster/blogs2d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/cluster/blogs2d.jpg -------------------------------------------------------------------------------- /cluster/downloadrss.py: -------------------------------------------------------------------------------- 1 | # 收集来自zebo的数据集(用户搜索过的链接),并存入文件 2 | 3 | from bs4 import BeautifulSoup 4 | import urllib 5 | import re 6 | chare=re.compile(r'[!-\.&]') 7 | itemowners={} 8 | 9 | # 要去除的单词 10 | dropwords=['a','new','some','more','my','own','the','many','other','another'] 11 | 12 | currentuser=0 13 | for i in range(1,51): 14 | # 搜索“用户希望拥有的物品”所对应的url 15 | c=urllib.request.urlopen('http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d' % (i)) 16 | soup=BeautifulSoup(c.read()) 17 | for td in soup('td'): 18 | # 寻找带有bgverdanasmall类的表格单元 19 | if ('class' in dict(td.attrs) and td['class']=='bgverdanasmall'): 20 | items=[re.sub(chare,'',str(a.contents[0]).lower()).strip() for a in td('a')] 21 | for item in items: 22 | # 去除多余的单词 23 | txt=' '.join([t for t in item.split(' ') if t not in dropwords]) 24 | if len(txt)<2: continue 25 | itemowners.setdefault(txt,{}) 26 | itemowners[txt][currentuser]=1 27 | currentuser+=1 28 | 29 | out=open('zebo.txt','w') 30 | out.write('Item') 31 | for user in range(0,currentuser): out.write('\tU%d' % user) 32 | out.write('\n') 33 | 
for item,owners in itemowners.items(): 34 | if len(owners)>10: 35 | out.write(item) 36 | for user in range(0,currentuser): 37 | if user in owners: out.write('\t1') 38 | else: out.write('\t0') 39 | out.write('\n') 40 | -------------------------------------------------------------------------------- /cluster/feedlist.txt: -------------------------------------------------------------------------------- 1 | http://feeds.feedburner.com/37signals/beMH 2 | http://feeds.feedburner.com/blogspot/bRuz 3 | http://battellemedia.com/index.xml 4 | http://blog.guykawasaki.com/index.rdf 5 | http://blog.outer-court.com/rss.xml 6 | http://feeds.searchenginewatch.com/sewblog 7 | http://blog.topix.net/index.rdf 8 | http://blogs.abcnews.com/theblotter/index.rdf 9 | http://feeds.feedburner.com/ConsumingExperienceFull 10 | http://flagrantdisregard.com/index.php/feed/ 11 | http://featured.gigaom.com/feed/ 12 | http://gizmodo.com/index.xml 13 | http://gofugyourself.typepad.com/go_fug_yourself/index.rdf 14 | http://googleblog.blogspot.com/rss.xml 15 | http://feeds.feedburner.com/GoogleOperatingSystem 16 | http://headrush.typepad.com/creating_passionate_users/index.rdf 17 | http://feeds.feedburner.com/instapundit/main 18 | http://jeremy.zawodny.com/blog/rss2.xml 19 | http://joi.ito.com/index.rdf 20 | http://feeds.feedburner.com/Mashable 21 | http://michellemalkin.com/index.rdf 22 | http://moblogsmoproblems.blogspot.com/rss.xml 23 | http://newsbusters.org/node/feed 24 | http://beta.blogger.com/feeds/27154654/posts/full?alt=rss 25 | http://feeds.feedburner.com/paulstamatiou 26 | http://powerlineblog.com/index.rdf 27 | http://feeds.feedburner.com/Publishing20 28 | http://radar.oreilly.com/index.rdf 29 | http://scienceblogs.com/pharyngula/index.xml 30 | http://scobleizer.wordpress.com/feed/ 31 | http://sethgodin.typepad.com/seths_blog/index.rdf 32 | http://rss.slashdot.org/Slashdot/slashdot 33 | http://thinkprogress.org/feed/ 34 | http://feeds.feedburner.com/andrewsullivan/rApM 35 | http://wilwheaton.typepad.com/wwdnbackup/index.rdf 36 | http://www.43folders.com/feed/ 37 | http://www.456bereastreet.com/feed.xml 38 | http://www.autoblog.com/rss.xml 39 | http://www.bloggersblog.com/rss.xml 40 | http://www.bloglines.com/rss/about/news 41 | http://www.blogmaverick.com/rss.xml 42 | http://www.boingboing.net/index.rdf 43 | http://www.buzzmachine.com/index.xml 44 | http://www.captainsquartersblog.com/mt/index.rdf 45 | http://www.coolhunting.com/index.rdf 46 | http://feeds.copyblogger.com/Copyblogger 47 | http://feeds.feedburner.com/crooksandliars/YaCP 48 | http://feeds.dailykos.com/dailykos/index.xml 49 | http://www.deadspin.com/index.xml 50 | http://www.downloadsquad.com/rss.xml 51 | http://www.engadget.com/rss.xml 52 | http://www.gapingvoid.com/index.rdf 53 | http://www.gawker.com/index.xml 54 | http://www.gothamist.com/index.rdf 55 | http://www.huffingtonpost.com/raw_feed_index.rdf 56 | http://www.hyperorg.com/blogger/index.rdf 57 | http://www.joelonsoftware.com/rss.xml 58 | http://www.joystiq.com/rss.xml 59 | http://www.kotaku.com/index.xml 60 | http://feeds.kottke.org/main 61 | http://www.lifehack.org/feed/ 62 | http://www.lifehacker.com/index.xml 63 | http://littlegreenfootballs.com/weblog/lgf-rss.php 64 | http://www.makezine.com/blog/index.xml 65 | http://www.mattcutts.com/blog/feed/ 66 | http://xml.metafilter.com/rss.xml 67 | http://www.mezzoblue.com/rss/index.xml 68 | http://www.micropersuasion.com/index.rdf 69 | http://www.neilgaiman.com/journal/feed/rss.xml 70 | http://www.oilman.ca/feed/ 71 | 
http://www.perezhilton.com/index.xml 72 | http://www.plasticbag.org/index.rdf 73 | http://www.powazek.com/rss.xml 74 | http://www.problogger.net/feed/ 75 | http://feeds.feedburner.com/QuickOnlineTips 76 | http://www.readwriteweb.com/rss.xml 77 | http://www.schneier.com/blog/index.rdf 78 | http://scienceblogs.com/sample/combined.xml 79 | http://www.seroundtable.com/index.rdf 80 | http://www.shoemoney.com/feed/ 81 | http://www.sifry.com/alerts/index.rdf 82 | http://www.simplebits.com/xml/rss.xml 83 | http://feeds.feedburner.com/Spikedhumor 84 | http://www.stevepavlina.com/blog/feed 85 | http://www.talkingpointsmemo.com/index.xml 86 | http://www.tbray.org/ongoing/ongoing.rss 87 | http://feeds.feedburner.com/TechCrunch 88 | http://www.techdirt.com/techdirt_rss.xml 89 | http://www.techeblog.com/index.php/feed/ 90 | http://www.thesuperficial.com/index.xml 91 | http://www.tmz.com/rss.xml 92 | http://www.treehugger.com/index.rdf 93 | http://www.tuaw.com/rss.xml 94 | http://www.valleywag.com/index.xml 95 | http://www.we-make-money-not-art.com/index.rdf 96 | http://www.wired.com/rss/index.xml 97 | http://www.wonkette.com/index.xml 98 | -------------------------------------------------------------------------------- /cluster/generatefeedvector.py: -------------------------------------------------------------------------------- 1 | # -- coding: utf-8 -- 2 | #解析数据源,选择性提取特征数据集并存储。(特征数据集是用来聚类的) 3 | import feedparser #pip install feedparser 4 | import re 5 | import urllib 6 | # 提取特征数据集(链接-单词-次数) 7 | # 返回一个RSS订阅源的标题和包含单词计数情况的字典 8 | def getwordcounts(url): 9 | return 10 | # 解析订阅源 11 | d=feedparser.parse(url) 12 | wc={} 13 | 14 | # 循环遍历所有文章条目 15 | for e in d.entries: 16 | if 'summary' in e: summary=e.summary 17 | else: summary=e.description 18 | 19 | # 提取一个单词列表 20 | words=getwords(e.title+' '+summary) 21 | for word in words: 22 | wc.setdefault(word,0) 23 | wc[word]+=1 24 | return d.feed.title,wc #返回输入变量和特征数据集 25 | 26 | #根据源代码提取单词列表 27 | def getwords(html): 28 | # 去除所有的html标记 29 | txt=re.compile(r'<[^>]+>').sub('',html) 30 | 31 | # 利用所有非字母字符拆分出单词 32 | words=re.compile(r'[^A-Z^a-z]+').split(txt) 33 | 34 | # 转化成小写形式 35 | return [word.lower() for word in words if word!=''] 36 | 37 | 38 | apcount={} #第一个特征数据集:每个特征出现在的输入数目(每个单词出现在多少文章中) 39 | wordcounts={} #第二个特征数据集:每个输入出现的特征数目(每篇文章包含的单词的数目) 40 | feedlist=[line for line in open('feedlist.txt')] 41 | for feedurl in feedlist: 42 | try: 43 | title,wc=getwordcounts(feedurl) 44 | print(title) 45 | print(wc) 46 | wordcounts[title]=wc 47 | for word,count in wc.items(): 48 | apcount.setdefault(word,0) 49 | if count>1: 50 | apcount[word]+=1 51 | except: 52 | print('Failed to parse feed %s' % feedurl) 53 | 54 | # 选取部分特征进行分析。因为特征出现次数太少具有偶然性,太多了具有普遍性,没法用于区分 55 | wordlist=[] 56 | for w,bc in apcount.items(): 57 | frac=float(bc)/len(feedlist) 58 | if frac>0.1 and frac<0.5: 59 | wordlist.append(w) 60 | 61 | # 将要分析的特征写入文件。最终形式为每行代表一个输入(文章),每列代表一个特征(单词),取值为出现的数量 62 | out=open('blogdata1.txt','w') 63 | out.write('Blog') 64 | for word in wordlist: out.write('\t%s' % word) 65 | out.write('\n') 66 | # 将要分析的特征数据集写入文件 67 | for blog,wc in wordcounts.items(): 68 | print(blog) 69 | out.write(blog) 70 | for word in wordlist: 71 | if word in wc: out.write('\t%d' % wc[word]) 72 | else: out.write('\t0') 73 | out.write('\n') 74 | 75 | -------------------------------------------------------------------------------- /cluster/wordclust.jpg: -------------------------------------------------------------------------------- 
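For reference, a sketch of how the tab-separated matrix written by generatefeedvector.py (first row: "Blog" plus one column per selected word; each following row: blog title plus per-word counts) can be read back for the clustering code. The helper name readblogdata below is only an illustration and assumes the blogdata1.txt file produced above:

def readblogdata(filename):
    lines = [line for line in open(filename)]
    colnames = lines[0].strip().split('\t')[1:]          # the selected feature words
    rownames, data = [], []
    for line in lines[1:]:
        parts = line.strip().split('\t')
        rownames.append(parts[0])                        # blog title
        data.append([float(x) for x in parts[1:]])       # word counts for this blog
    return rownames, colnames, data

# rownames, words, counts = readblogdata('blogdata1.txt')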
https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/cluster/wordclust.jpg -------------------------------------------------------------------------------- /cluster/帮助.txt: -------------------------------------------------------------------------------- 1 | 包含层次聚类 2 | k均值聚类 3 | 密度聚类 4 | -------------------------------------------------------------------------------- /decision-tree/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/decision-tree/Thumbs.db -------------------------------------------------------------------------------- /decision-tree/addresslist.txt: -------------------------------------------------------------------------------- 1 | 6 Washington 2 | 21 Manassas 3 | 280 Pearl 4 | 55 Ellery 5 | 50 Follen 6 | 51 Granite 7 | 992 Memorial 8 | 83 Trowbridge 9 | 1 Dana 10 | 45 Regent 11 | 90 Alpine 12 | 21 Francis 13 | 112 Avon Hill 14 | 9 Bellevue 15 | 4 Blanchard Rd 16 | 34 Shea 17 | 5 Fountain 18 | 14 Marcella 19 | 39 Saint Saveur 20 | 35 Pemberton 21 | 46 Shepard 22 | 31 Market 23 | 99 Howard 24 | 88 Pearl 25 | 208 Western 26 | 285 Windsor 27 | 26 Cambridgepark 28 | 211 Erie 29 | 129 Franklin 30 | 27 Gurney 31 | 149 Prospect 32 | 27 Linnaean 33 | 20 Dudley 34 | 60 Otis St 35 | 130 Mount Auburn St 36 | 2 Michael Way 37 | 263 Columbia St 38 | 6 Hurlbut St 39 | 199 Harvard St 40 | 168 River St 41 | 400 Washington St 42 | 12 Traill St 43 | 74 Field St 44 | 21 Walden Square Rd 45 | 7 Wendell St 46 | 15 Normandy Ave 47 | 6 Gibson Ter 48 | 94 Pine St 49 | 23 Magee St 50 | 175 Richdale Ave 51 | 168 River St 52 | 246 Brattle St -------------------------------------------------------------------------------- /decision-tree/hotornot.py: -------------------------------------------------------------------------------- 1 | # 对“热度”评价进行建模。api服务网站不能用了 2 | import urllib.request 3 | import treepredict 4 | import xml.dom.minidom 5 | 6 | api_key='479NUNJHETN' 7 | 8 | def getrandomratings(c): 9 | # 为getRandomProfile构造url 10 | 11 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 12 | url+="&method=Rate.getRandomProfile&retrieve_num=%d" % c 13 | url+="&get_rate_info=true&meet_users_only=true" 14 | 15 | print(url) 16 | f1=urllib.request.urlopen(url).read() 17 | 18 | 19 | doc=xml.dom.minidom.parseString(f1) 20 | 21 | emids=doc.getElementsByTagName('emid') 22 | ratings=doc.getElementsByTagName('rating') #获取评价 23 | 24 | # 将emids和ratings组合在一个列表中 25 | result=[] 26 | for e,r in zip(emids,ratings): 27 | if r.firstChild!=None: 28 | result.append((e.firstChild.data,r.firstChild.data)) 29 | print(result) 30 | return result 31 | 32 | stateregions={'New England':['ct','mn','ma','nh','ri','vt'], 33 | 'Mid Atlantic':['de','md','nj','ny','pa'], 34 | 'South':['al','ak','fl','ga','ky','la','ms','mo', 35 | 'nc','sc','tn','va','wv'], 36 | 'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'], 37 | 'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']} 38 | 39 | def getpeopledata(ratings): 40 | result=[] 41 | for emid,rating in ratings: 42 | # 对应于MeetMe.getProfile方法调用的url 43 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 44 | url+="&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid 45 | 46 | # 得到所有关于此人的详细信息 47 | try: 48 | rating=int(float(rating)+0.5) 49 | doc2=xml.dom.minidom.parseString(urllib.request.urlopen(url).read()) 50 | 
gender=doc2.getElementsByTagName('gender')[0].firstChild.data 51 | age=doc2.getElementsByTagName('age')[0].firstChild.data 52 | loc=doc2.getElementsByTagName('location')[0].firstChild.data[0:2] 53 | 54 | # 将州转换为地区 55 | for r,s in stateregions.items(): 56 | if loc in s: region=r 57 | 58 | if region!=None: 59 | result.append((gender,int(age),region,rating)) 60 | except: 61 | pass 62 | return result 63 | 64 | if __name__=='__main__': #只有在执行当前模块时才会运行此函数 65 | l1=getrandomratings(500) 66 | print(len(l1)) 67 | pdata = getpeopledata(l1) 68 | print(pdata) 69 | tree = treepredict.buildtree(pdata,scoref=treepredict.variance) #创建决策树 70 | treepredict.prune(tree,0.5) #剪支 71 | treepredict.drawtree(tree,'hot.jpg') -------------------------------------------------------------------------------- /decision-tree/treeview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/decision-tree/treeview.jpg -------------------------------------------------------------------------------- /decision-tree/zillow.py: -------------------------------------------------------------------------------- 1 | # 对住房价格进行估计。api网站不能用了 2 | import xml.dom.minidom 3 | import urllib.request 4 | import treepredict 5 | 6 | zwskey="X1-ZWzlchwxis15aj_9skq6" 7 | 8 | def getaddressdata(address,city): 9 | escad=address.replace(' ','+') 10 | # 构造url 11 | url='http://www.zillow.com/webservice/GetDeepSearchResults.htm?' 12 | url+='zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city) 13 | # 解析xml形式的返回结果 14 | request = urllib.request.Request(url) 15 | doc=xml.dom.minidom.parseString(urllib.request.urlopen(request).read()) 16 | print(url) 17 | code=doc.getElementsByTagName('code')[0].firstChild.data 18 | # 状态码为0代表操作成功,否则代表错误发生 19 | if code!='0': return None 20 | # 提取有关房产的信息 21 | try: 22 | zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data 23 | use=doc.getElementsByTagName('useCode')[0].firstChild.data 24 | year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data 25 | sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data 26 | bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data 27 | bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data 28 | rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data 29 | price=doc.getElementsByTagName('amount')[0].firstChild.data 30 | except: 31 | return None 32 | 33 | return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) 34 | 35 | # 读取文件构造数据集 36 | def getpricelist(): 37 | l1=[] 38 | for line in open('addresslist.txt'): 39 | data=getaddressdata(line.strip(),'Cambridge,MA') 40 | print(data) 41 | l1.append(data) 42 | return l1 43 | 44 | 45 | 46 | if __name__=='__main__': #只有在执行当前模块时才会运行此函数 47 | housedata = getpricelist() 48 | print(housedata) 49 | tree = treepredict.buildtree(housedata,scoref=treepredict.variance) #创建决策树 50 | treepredict.drawtree(tree,'house.jpg') -------------------------------------------------------------------------------- /decision-tree/帮助.txt: -------------------------------------------------------------------------------- 1 | 决策树相关的demo 2 | -------------------------------------------------------------------------------- /ensemble-learning/ROC.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib.font_manager import FontProperties 5 | 6 | def loadDataSet(fileName): 
7 | numFeat = len((open(fileName).readline().split('\t'))) 8 | dataMat = []; labelMat = [] 9 | fr = open(fileName) 10 | for line in fr.readlines(): 11 | lineArr = [] 12 | curLine = line.strip().split('\t') 13 | for i in range(numFeat - 1): 14 | lineArr.append(float(curLine[i])) 15 | dataMat.append(lineArr) 16 | labelMat.append(float(curLine[-1])) 17 | 18 | return dataMat, labelMat 19 | 20 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 21 | """ 22 | 单层决策树分类函数 23 | Parameters: 24 | dataMatrix - 数据矩阵 25 | dimen - 第dimen列,也就是第几个特征 26 | threshVal - 阈值 27 | threshIneq - 标志 28 | Returns: 29 | retArray - 分类结果 30 | """ 31 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 32 | if threshIneq == 'lt': 33 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 34 | else: 35 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 36 | return retArray 37 | 38 | 39 | def buildStump(dataArr,classLabels,D): 40 | """ 41 | 找到数据集上最佳的单层决策树 42 | Parameters: 43 | dataArr - 数据矩阵 44 | classLabels - 数据标签 45 | D - 样本权重 46 | Returns: 47 | bestStump - 最佳单层决策树信息 48 | minError - 最小误差 49 | bestClasEst - 最佳的分类结果 50 | """ 51 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 52 | m,n = np.shape(dataMatrix) 53 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 54 | minError = float('inf') #最小误差初始化为正无穷大 55 | for i in range(n): #遍历所有特征 56 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 57 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 58 | for j in range(-1, int(numSteps) + 1): 59 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 60 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 61 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 62 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 63 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 64 | weightedError = D.T * errArr #计算误差 65 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 66 | if weightedError < minError: #找到误差最小的分类方式 67 | minError = weightedError 68 | bestClasEst = predictedVals.copy() 69 | bestStump['dim'] = i 70 | bestStump['thresh'] = threshVal 71 | bestStump['ineq'] = inequal 72 | return bestStump, minError, bestClasEst 73 | 74 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 75 | """ 76 | 使用AdaBoost算法训练分类器 77 | Parameters: 78 | dataArr - 数据矩阵 79 | classLabels - 数据标签 80 | numIt - 最大迭代次数 81 | Returns: 82 | weakClassArr - 训练好的分类器 83 | aggClassEst - 类别估计累计值 84 | """ 85 | weakClassArr = [] 86 | m = np.shape(dataArr)[0] 87 | D = np.mat(np.ones((m, 1)) / m) #初始化权重 88 | aggClassEst = np.mat(np.zeros((m,1))) 89 | for i in range(numIt): 90 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 91 | # print("D:",D.T) 92 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 93 | bestStump['alpha'] = alpha #存储弱学习算法权重 94 | weakClassArr.append(bestStump) #存储单层决策树 95 | # print("classEst: ", classEst.T) 96 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 97 | D = np.multiply(D, np.exp(expon)) 98 | D = D / D.sum() #根据样本权重公式,更新样本权重 99 | #计算AdaBoost误差,当误差为0的时候,退出循环 100 | aggClassEst += alpha * classEst #计算类别估计累计值 101 | # print("aggClassEst: ", aggClassEst.T) 102 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 103 | errorRate = aggErrors.sum() / m 104 | # print("total error: ", errorRate) 
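        # The updates above implement the standard AdaBoost rules:
        #   alpha_t     = 0.5 * ln((1 - err_t) / err_t)                     (weight of the t-th stump)
        #   D_{t+1}(i)  = D_t(i) * exp(-alpha_t * y_i * h_t(x_i)) / Z_t     (re-weight the samples; Z_t = D.sum())
        # aggClassEst accumulates sum_t alpha_t * h_t(x), and sign(aggClassEst) is the ensemble
        # prediction, so once errorRate (computed just above) reaches zero the loop can stop.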
105 | if errorRate == 0.0: break #误差为0,退出循环 106 | return weakClassArr, aggClassEst 107 | 108 | 109 | def plotROC(predStrengths, classLabels): 110 | """ 111 | 绘制ROC 112 | Parameters: 113 | predStrengths - 分类器的预测强度 114 | classLabels - 类别 115 | Returns: 116 | 无 117 | """ 118 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 119 | cur = (1.0, 1.0) #绘制光标的位置 120 | ySum = 0.0 #用于计算AUC 121 | numPosClas = np.sum(np.array(classLabels) == 1.0) #统计正类的数量 122 | yStep = 1 / float(numPosClas) #y轴步长 123 | xStep = 1 / float(len(classLabels) - numPosClas) #x轴步长 124 | 125 | sortedIndicies = predStrengths.argsort() #预测强度排序,从低到高 126 | fig = plt.figure() 127 | fig.clf() 128 | ax = plt.subplot(111) 129 | for index in sortedIndicies.tolist()[0]: 130 | if classLabels[index] == 1.0: 131 | delX = 0; delY = yStep 132 | else: 133 | delX = xStep; delY = 0 134 | ySum += cur[1] #高度累加 135 | ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b') #绘制ROC 136 | cur = (cur[0] - delX, cur[1] - delY) #更新绘制光标的位置 137 | ax.plot([0,1], [0,1], 'b--') 138 | plt.title('AdaBoost马疝病检测系统的ROC曲线', FontProperties = font) 139 | plt.xlabel('假阳率', FontProperties = font) 140 | plt.ylabel('真阳率', FontProperties = font) 141 | ax.axis([0, 1, 0, 1]) 142 | print('AUC面积为:', ySum * xStep) #计算AUC 143 | plt.show() 144 | 145 | 146 | if __name__ == '__main__': 147 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') 148 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr, 50) 149 | plotROC(aggClassEst.T, LabelArr) -------------------------------------------------------------------------------- /ensemble-learning/adaboost.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 创建单层决策树的数据集 6 | def loadSimpData(): 7 | datMat = np.array([ 8 | [ 1. , 2.1], 9 | [ 1.5, 1.6], 10 | [ 1.3, 1. ], 11 | [ 1. , 1. ], 12 | [ 2. , 1. 
] 13 | ]) 14 | classLabels =np.array([1.0, 1.0, -1.0, -1.0, 1.0]) 15 | return datMat,classLabels 16 | 17 | # 加载数据集 18 | def loadDataSet(fileName): 19 | alldata = np.loadtxt(fileName) 20 | n = alldata.shape[1] 21 | dataMat = alldata[:, 0:n-1] # 添加数据 22 | labelMat = alldata[:, n-1] # .astype(int).reshape(-1,1) #添加标签 23 | return dataMat, labelMat 24 | 25 | # 数据数据可视化 26 | def showDataSet(dataMat, labelMat): 27 | # 绘制样本点 28 | place_plus = np.where(labelMat==1)[0] # 正样本的位置 29 | place_minus = np.where(labelMat==-1)[0] # 负样本的位置 30 | 31 | data_plus = dataMat[place_plus] #正样本 32 | data_minus = dataMat[place_minus] #负样本 33 | 34 | plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1],s=30, alpha=0.7) #正样本散点图 35 | plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1], s=30, alpha=0.7) #负样本散点图 36 | plt.show() 37 | 38 | 39 | # 就是根据数据集,要区分的特征,用来分类的特征的阈值进行计算分类结果。分类结果为1或-1分别表示两种类型 40 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 41 | """ 42 | 单层决策树分类函数 43 | Parameters: 44 | dataMatrix - 数据矩阵 45 | dimen - 第dimen列,也就是第几个特征 46 | threshVal - 阈值 47 | threshIneq - 标志 48 | Returns: 49 | retArray - 分类结果 50 | """ 51 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 52 | if threshIneq == 'lt': 53 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 54 | else: 55 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 56 | return retArray 57 | 58 | # 获取决策树第一层分类信息。即获取最佳分类特征,以及该特征的分类阈值,和采用该阈值该特征进行分类的结果和误差 59 | def buildStump(dataArr,classLabels,D): 60 | """ 61 | 找到数据集上最佳的单层决策树 62 | Parameters: 63 | dataArr - 数据矩阵 64 | classLabels - 数据标签 65 | D - 样本权重 66 | Returns: 67 | bestStump - 最佳单层决策树信息 68 | minError - 最小误差 69 | bestClasEst - 最佳的分类结果 70 | """ 71 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 72 | m,n = np.shape(dataMatrix) 73 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 74 | minError = float('inf') #最小误差初始化为正无穷大 75 | for i in range(n): #遍历所有特征 76 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 77 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 78 | for j in range(-1, int(numSteps) + 1): 79 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 80 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 81 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 82 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 83 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 84 | weightedError = D.T * errArr #计算误差 85 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 86 | if weightedError < minError: #找到误差最小的分类方式 87 | minError = weightedError 88 | bestClasEst = predictedVals.copy() 89 | bestStump['dim'] = i 90 | bestStump['thresh'] = threshVal 91 | bestStump['ineq'] = inequal 92 | return bestStump, minError, bestClasEst 93 | 94 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 95 | """ 96 | 使用AdaBoost算法提升弱分类器性能 97 | Parameters: 98 | dataArr - 数据矩阵 99 | classLabels - 数据标签 100 | numIt - 最大迭代次数 101 | Returns: 102 | weakClassArr - 训练好的分类器 103 | aggClassEst - 类别估计累计值 104 | """ 105 | weakClassArr = [] 106 | m = np.shape(dataArr)[0] 107 | D = np.mat(np.ones((m, 1)) / m) #初始化样本权重 108 | aggClassEst = np.mat(np.zeros((m,1))) # 弱分类器的权重 109 | for i in range(numIt): 110 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 111 | # print("D:",D.T) 112 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) 
#计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 113 | bestStump['alpha'] = alpha #存储弱学习算法权重 114 | weakClassArr.append(bestStump) #存储单层决策树 115 | # print("classEst: ", classEst.T) 116 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 117 | D = np.multiply(D, np.exp(expon)) 118 | D = D / D.sum() #根据样本权重公式,更新样本权重 119 | #计算AdaBoost误差,当误差为0的时候,退出循环 120 | aggClassEst += alpha * classEst #计算类别估计累计值 121 | # print("aggClassEst: ", aggClassEst.T) 122 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 123 | errorRate = aggErrors.sum() / m 124 | # print("total error: ", errorRate) 125 | if errorRate == 0.0: break #误差为0,退出循环 126 | return weakClassArr, aggClassEst 127 | 128 | 129 | def adaClassify(datToClass,classifierArr): 130 | """ 131 | AdaBoost分类函数 132 | Parameters: 133 | datToClass - 待分类样例 134 | classifierArr - 训练好的分类器 135 | Returns: 136 | 分类结果 137 | """ 138 | dataMatrix = np.mat(datToClass) 139 | m = np.shape(dataMatrix)[0] 140 | aggClassEst = np.mat(np.zeros((m,1))) 141 | for i in range(len(classifierArr)): #遍历所有分类器,进行分类 142 | classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq']) 143 | aggClassEst += classifierArr[i]['alpha'] * classEst 144 | # print(aggClassEst) 145 | return np.sign(aggClassEst) 146 | 147 | if __name__ == '__main__': 148 | # dataArr,classLabels = loadSimpData() 149 | # # showDataSet(dataArr,classLabels) 150 | # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, classLabels) 151 | # print(adaClassify([[0,0],[5,5]], weakClassArr)) 152 | 153 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') # 加载训练集 154 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr) #AdaBoost算法形成 155 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') # 加载测集 156 | print(weakClassArr) 157 | predictions = adaClassify(dataArr, weakClassArr) 158 | errArr = np.mat(np.ones((len(dataArr), 1))) 159 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != np.mat(LabelArr).T].sum() / len(dataArr) * 100)) 160 | predictions = adaClassify(testArr, weakClassArr) 161 | errArr = np.mat(np.ones((len(testArr), 1))) 162 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != np.mat(testLabelArr).T].sum() / len(testArr) * 100)) 163 | -------------------------------------------------------------------------------- /ensemble-learning/adaboost/ROC.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib.font_manager import FontProperties 5 | 6 | def loadDataSet(fileName): 7 | numFeat = len((open(fileName).readline().split('\t'))) 8 | dataMat = []; labelMat = [] 9 | fr = open(fileName) 10 | for line in fr.readlines(): 11 | lineArr = [] 12 | curLine = line.strip().split('\t') 13 | for i in range(numFeat - 1): 14 | lineArr.append(float(curLine[i])) 15 | dataMat.append(lineArr) 16 | labelMat.append(float(curLine[-1])) 17 | 18 | return dataMat, labelMat 19 | 20 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 21 | """ 22 | 单层决策树分类函数 23 | Parameters: 24 | dataMatrix - 数据矩阵 25 | dimen - 第dimen列,也就是第几个特征 26 | threshVal - 阈值 27 | threshIneq - 标志 28 | Returns: 29 | retArray - 分类结果 30 | """ 31 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 32 | if threshIneq == 'lt': 33 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 34 | else: 35 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 36 | return retArray 
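A quick, illustrative sanity check of stumpClassify() with toy values chosen only for demonstration (np is already imported at the top of this file): thresholding feature 0 at 1.5 with the 'lt' rule marks every sample whose first feature is <= 1.5 as class -1 and the rest as +1.

demo_X = np.mat([[1.0, 2.1],
                 [1.3, 1.0],
                 [2.0, 1.0]])
demo_pred = stumpClassify(demo_X, 0, 1.5, 'lt')
# demo_pred is [[-1.], [-1.], [ 1.]]: the first two rows have feature 0 <= 1.5 and get class -1.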
37 | 38 | 39 | def buildStump(dataArr,classLabels,D): 40 | """ 41 | 找到数据集上最佳的单层决策树 42 | Parameters: 43 | dataArr - 数据矩阵 44 | classLabels - 数据标签 45 | D - 样本权重 46 | Returns: 47 | bestStump - 最佳单层决策树信息 48 | minError - 最小误差 49 | bestClasEst - 最佳的分类结果 50 | """ 51 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 52 | m,n = np.shape(dataMatrix) 53 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 54 | minError = float('inf') #最小误差初始化为正无穷大 55 | for i in range(n): #遍历所有特征 56 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 57 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 58 | for j in range(-1, int(numSteps) + 1): 59 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 60 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 61 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 62 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 63 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 64 | weightedError = D.T * errArr #计算误差 65 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 66 | if weightedError < minError: #找到误差最小的分类方式 67 | minError = weightedError 68 | bestClasEst = predictedVals.copy() 69 | bestStump['dim'] = i 70 | bestStump['thresh'] = threshVal 71 | bestStump['ineq'] = inequal 72 | return bestStump, minError, bestClasEst 73 | 74 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 75 | """ 76 | 使用AdaBoost算法训练分类器 77 | Parameters: 78 | dataArr - 数据矩阵 79 | classLabels - 数据标签 80 | numIt - 最大迭代次数 81 | Returns: 82 | weakClassArr - 训练好的分类器 83 | aggClassEst - 类别估计累计值 84 | """ 85 | weakClassArr = [] 86 | m = np.shape(dataArr)[0] 87 | D = np.mat(np.ones((m, 1)) / m) #初始化权重 88 | aggClassEst = np.mat(np.zeros((m,1))) 89 | for i in range(numIt): 90 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 91 | # print("D:",D.T) 92 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 93 | bestStump['alpha'] = alpha #存储弱学习算法权重 94 | weakClassArr.append(bestStump) #存储单层决策树 95 | # print("classEst: ", classEst.T) 96 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 97 | D = np.multiply(D, np.exp(expon)) 98 | D = D / D.sum() #根据样本权重公式,更新样本权重 99 | #计算AdaBoost误差,当误差为0的时候,退出循环 100 | aggClassEst += alpha * classEst #计算类别估计累计值 101 | # print("aggClassEst: ", aggClassEst.T) 102 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 103 | errorRate = aggErrors.sum() / m 104 | # print("total error: ", errorRate) 105 | if errorRate == 0.0: break #误差为0,退出循环 106 | return weakClassArr, aggClassEst 107 | 108 | 109 | def plotROC(predStrengths, classLabels): 110 | """ 111 | 绘制ROC 112 | Parameters: 113 | predStrengths - 分类器的预测强度 114 | classLabels - 类别 115 | Returns: 116 | 无 117 | """ 118 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 119 | cur = (1.0, 1.0) #绘制光标的位置 120 | ySum = 0.0 #用于计算AUC 121 | numPosClas = np.sum(np.array(classLabels) == 1.0) #统计正类的数量 122 | yStep = 1 / float(numPosClas) #y轴步长 123 | xStep = 1 / float(len(classLabels) - numPosClas) #x轴步长 124 | 125 | sortedIndicies = predStrengths.argsort() #预测强度排序,从低到高 126 | fig = plt.figure() 127 | fig.clf() 128 | ax = plt.subplot(111) 129 | for index in sortedIndicies.tolist()[0]: 130 | if classLabels[index] == 1.0: 131 | delX = 0; delY = yStep 132 | else: 133 | delX = xStep; delY = 0 134 | ySum += cur[1] #高度累加 135 | 
ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b') #绘制ROC 136 | cur = (cur[0] - delX, cur[1] - delY) #更新绘制光标的位置 137 | ax.plot([0,1], [0,1], 'b--') 138 | plt.title('AdaBoost马疝病检测系统的ROC曲线', FontProperties = font) 139 | plt.xlabel('假阳率', FontProperties = font) 140 | plt.ylabel('真阳率', FontProperties = font) 141 | ax.axis([0, 1, 0, 1]) 142 | print('AUC面积为:', ySum * xStep) #计算AUC 143 | plt.show() 144 | 145 | 146 | if __name__ == '__main__': 147 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') 148 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr, 50) 149 | plotROC(aggClassEst.T, LabelArr) -------------------------------------------------------------------------------- /ensemble-learning/adaboost/adaboost.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 创建单层决策树的数据集 6 | def loadSimpData(): 7 | datMat = np.array([ 8 | [ 1. , 2.1], 9 | [ 1.5, 1.6], 10 | [ 1.3, 1. ], 11 | [ 1. , 1. ], 12 | [ 2. , 1. ] 13 | ]) 14 | classLabels =np.array([1.0, 1.0, -1.0, -1.0, 1.0]) 15 | return datMat,classLabels 16 | 17 | # 加载数据集 18 | def loadDataSet(fileName): 19 | alldata = np.loadtxt(fileName) 20 | n = alldata.shape[1] 21 | dataMat = alldata[:, 0:n-1] # 添加数据 22 | labelMat = alldata[:, n-1] # .astype(int).reshape(-1,1) #添加标签 23 | return dataMat, labelMat 24 | 25 | # 数据数据可视化 26 | def showDataSet(dataMat, labelMat): 27 | # 绘制样本点 28 | place_plus = np.where(labelMat==1)[0] # 正样本的位置 29 | place_minus = np.where(labelMat==-1)[0] # 负样本的位置 30 | 31 | data_plus = dataMat[place_plus] #正样本 32 | data_minus = dataMat[place_minus] #负样本 33 | 34 | plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1],s=30, alpha=0.7) #正样本散点图 35 | plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1], s=30, alpha=0.7) #负样本散点图 36 | plt.show() 37 | 38 | 39 | # 就是根据数据集,要区分的特征,用来分类的特征的阈值进行计算分类结果。分类结果为1或-1分别表示两种类型 40 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 41 | """ 42 | 单层决策树分类函数 43 | Parameters: 44 | dataMatrix - 数据矩阵 45 | dimen - 第dimen列,也就是第几个特征 46 | threshVal - 阈值 47 | threshIneq - 标志 48 | Returns: 49 | retArray - 分类结果 50 | """ 51 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 52 | if threshIneq == 'lt': 53 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 54 | else: 55 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 56 | return retArray 57 | 58 | # 获取决策树第一层分类信息。即获取最佳分类特征,以及该特征的分类阈值,和采用该阈值该特征进行分类的结果和误差 59 | def buildStump(dataArr,classLabels,D): 60 | """ 61 | 找到数据集上最佳的单层决策树 62 | Parameters: 63 | dataArr - 数据矩阵 64 | classLabels - 数据标签 65 | D - 样本权重 66 | Returns: 67 | bestStump - 最佳单层决策树信息 68 | minError - 最小误差 69 | bestClasEst - 最佳的分类结果 70 | """ 71 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 72 | m,n = np.shape(dataMatrix) 73 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 74 | minError = float('inf') #最小误差初始化为正无穷大 75 | for i in range(n): #遍历所有特征 76 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 77 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 78 | for j in range(-1, int(numSteps) + 1): 79 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 80 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 81 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 82 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 83 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 84 | 
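                # Note: the next line computes the weighted error rate of this stump,
                #   epsilon = sum_i D_i * 1[h(x_i) != y_i]  (matrix form: D^T * errArr).
                # Because the sample weights D sum to 1, epsilon lies in [0, 1]; a stump
                # with epsilon near 0.5 is barely better than random guessing.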
weightedError = D.T * errArr #计算误差 85 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 86 | if weightedError < minError: #找到误差最小的分类方式 87 | minError = weightedError 88 | bestClasEst = predictedVals.copy() 89 | bestStump['dim'] = i 90 | bestStump['thresh'] = threshVal 91 | bestStump['ineq'] = inequal 92 | return bestStump, minError, bestClasEst 93 | 94 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 95 | """ 96 | 使用AdaBoost算法提升弱分类器性能 97 | Parameters: 98 | dataArr - 数据矩阵 99 | classLabels - 数据标签 100 | numIt - 最大迭代次数 101 | Returns: 102 | weakClassArr - 训练好的分类器 103 | aggClassEst - 类别估计累计值 104 | """ 105 | weakClassArr = [] 106 | m = np.shape(dataArr)[0] 107 | D = np.mat(np.ones((m, 1)) / m) #初始化样本权重 108 | aggClassEst = np.mat(np.zeros((m,1))) # 弱分类器的权重 109 | for i in range(numIt): 110 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 111 | # print("D:",D.T) 112 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 113 | bestStump['alpha'] = alpha #存储弱学习算法权重 114 | weakClassArr.append(bestStump) #存储单层决策树 115 | # print("classEst: ", classEst.T) 116 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 117 | D = np.multiply(D, np.exp(expon)) 118 | D = D / D.sum() #根据样本权重公式,更新样本权重 119 | #计算AdaBoost误差,当误差为0的时候,退出循环 120 | aggClassEst += alpha * classEst #计算类别估计累计值 121 | # print("aggClassEst: ", aggClassEst.T) 122 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 123 | errorRate = aggErrors.sum() / m 124 | # print("total error: ", errorRate) 125 | if errorRate == 0.0: break #误差为0,退出循环 126 | return weakClassArr, aggClassEst 127 | 128 | 129 | def adaClassify(datToClass,classifierArr): 130 | """ 131 | AdaBoost分类函数 132 | Parameters: 133 | datToClass - 待分类样例 134 | classifierArr - 训练好的分类器 135 | Returns: 136 | 分类结果 137 | """ 138 | dataMatrix = np.mat(datToClass) 139 | m = np.shape(dataMatrix)[0] 140 | aggClassEst = np.mat(np.zeros((m,1))) 141 | for i in range(len(classifierArr)): #遍历所有分类器,进行分类 142 | classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq']) 143 | aggClassEst += classifierArr[i]['alpha'] * classEst 144 | # print(aggClassEst) 145 | return np.sign(aggClassEst) 146 | 147 | if __name__ == '__main__': 148 | # dataArr,classLabels = loadSimpData() 149 | # # showDataSet(dataArr,classLabels) 150 | # weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, classLabels) 151 | # print(adaClassify([[0,0],[5,5]], weakClassArr)) 152 | 153 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') # 加载训练集 154 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr) #AdaBoost算法形成 155 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') # 加载测集 156 | print(weakClassArr) 157 | predictions = adaClassify(dataArr, weakClassArr) 158 | errArr = np.mat(np.ones((len(dataArr), 1))) 159 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != np.mat(LabelArr).T].sum() / len(dataArr) * 100)) 160 | predictions = adaClassify(testArr, weakClassArr) 161 | errArr = np.mat(np.ones((len(testArr), 1))) 162 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != np.mat(testLabelArr).T].sum() / len(testArr) * 100)) 163 | -------------------------------------------------------------------------------- /ensemble-learning/adaboost/horse_adaboost.py: -------------------------------------------------------------------------------- 1 | # 
-*-coding:utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | """ 6 | Author: 7 | Jack Cui 8 | Blog: 9 | http://blog.csdn.net/c406495762 10 | Zhihu: 11 | https://www.zhihu.com/people/Jack--Cui/ 12 | Modify: 13 | 2017-10-10 14 | """ 15 | 16 | def loadDataSet(fileName): 17 | numFeat = len((open(fileName).readline().split('\t'))) 18 | dataMat = []; labelMat = [] 19 | fr = open(fileName) 20 | for line in fr.readlines(): 21 | lineArr = [] 22 | curLine = line.strip().split('\t') 23 | for i in range(numFeat - 1): 24 | lineArr.append(float(curLine[i])) 25 | dataMat.append(lineArr) 26 | labelMat.append(float(curLine[-1])) 27 | 28 | return dataMat, labelMat 29 | 30 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 31 | """ 32 | 单层决策树分类函数 33 | Parameters: 34 | dataMatrix - 数据矩阵 35 | dimen - 第dimen列,也就是第几个特征 36 | threshVal - 阈值 37 | threshIneq - 标志 38 | Returns: 39 | retArray - 分类结果 40 | """ 41 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 42 | if threshIneq == 'lt': 43 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 44 | else: 45 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 46 | return retArray 47 | 48 | 49 | def buildStump(dataArr,classLabels,D): 50 | """ 51 | 找到数据集上最佳的单层决策树 52 | Parameters: 53 | dataArr - 数据矩阵 54 | classLabels - 数据标签 55 | D - 样本权重 56 | Returns: 57 | bestStump - 最佳单层决策树信息 58 | minError - 最小误差 59 | bestClasEst - 最佳的分类结果 60 | """ 61 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 62 | m,n = np.shape(dataMatrix) 63 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 64 | minError = float('inf') #最小误差初始化为正无穷大 65 | for i in range(n): #遍历所有特征 66 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 67 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 68 | for j in range(-1, int(numSteps) + 1): 69 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 70 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 71 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 72 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 73 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 74 | weightedError = D.T * errArr #计算误差 75 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 76 | if weightedError < minError: #找到误差最小的分类方式 77 | minError = weightedError 78 | bestClasEst = predictedVals.copy() 79 | bestStump['dim'] = i 80 | bestStump['thresh'] = threshVal 81 | bestStump['ineq'] = inequal 82 | return bestStump, minError, bestClasEst 83 | 84 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 85 | """ 86 | 使用AdaBoost算法提升弱分类器性能 87 | Parameters: 88 | dataArr - 数据矩阵 89 | classLabels - 数据标签 90 | numIt - 最大迭代次数 91 | Returns: 92 | weakClassArr - 训练好的分类器 93 | aggClassEst - 类别估计累计值 94 | """ 95 | weakClassArr = [] 96 | m = np.shape(dataArr)[0] 97 | D = np.mat(np.ones((m, 1)) / m) #初始化权重 98 | aggClassEst = np.mat(np.zeros((m,1))) 99 | for i in range(numIt): 100 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 101 | # print("D:",D.T) 102 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 103 | bestStump['alpha'] = alpha #存储弱学习算法权重 104 | weakClassArr.append(bestStump) #存储单层决策树 105 | # print("classEst: ", classEst.T) 106 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 107 | D = np.multiply(D, np.exp(expon)) 108 | D = D / D.sum() #根据样本权重公式,更新样本权重 
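        # Note: the three lines above implement the AdaBoost re-weighting rule
        #   D_i <- D_i * exp(-alpha * y_i * h(x_i)) / Z,  with Z = sum_j D_j * exp(-alpha * y_j * h(x_j)).
        # Correctly classified samples (y_i * h(x_i) = +1) are scaled by exp(-alpha),
        # misclassified ones (y_i * h(x_i) = -1) by exp(+alpha).
        # Worked example: for error ~= 0.2, alpha = 0.5 * ln(0.8 / 0.2) ~= 0.69, so a
        # misclassified sample's weight roughly doubles (exp(0.69) ~= 2) before
        # normalisation, while a correct one is roughly halved.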
109 | #计算AdaBoost误差,当误差为0的时候,退出循环 110 | aggClassEst += alpha * classEst #计算类别估计累计值 111 | # print("aggClassEst: ", aggClassEst.T) 112 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 113 | errorRate = aggErrors.sum() / m 114 | # print("total error: ", errorRate) 115 | if errorRate == 0.0: break #误差为0,退出循环 116 | return weakClassArr, aggClassEst 117 | 118 | def adaClassify(datToClass,classifierArr): 119 | """ 120 | AdaBoost分类函数 121 | Parameters: 122 | datToClass - 待分类样例 123 | classifierArr - 训练好的分类器 124 | Returns: 125 | 分类结果 126 | """ 127 | dataMatrix = np.mat(datToClass) 128 | m = np.shape(dataMatrix)[0] 129 | aggClassEst = np.mat(np.zeros((m,1))) 130 | for i in range(len(classifierArr)): #遍历所有分类器,进行分类 131 | classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq']) 132 | aggClassEst += classifierArr[i]['alpha'] * classEst 133 | # print(aggClassEst) 134 | return np.sign(aggClassEst) 135 | 136 | if __name__ == '__main__': 137 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') 138 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr) 139 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') 140 | print(weakClassArr) 141 | predictions = adaClassify(dataArr, weakClassArr) 142 | errArr = np.mat(np.ones((len(dataArr), 1))) 143 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != np.mat(LabelArr).T].sum() / len(dataArr) * 100)) 144 | predictions = adaClassify(testArr, weakClassArr) 145 | errArr = np.mat(np.ones((len(testArr), 1))) 146 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != np.mat(testLabelArr).T].sum() / len(testArr) * 100)) 147 | -------------------------------------------------------------------------------- /ensemble-learning/adaboost/sklearn_adaboost.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | from sklearn.ensemble import AdaBoostClassifier 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | """ 7 | Author: 8 | Jack Cui 9 | Blog: 10 | http://blog.csdn.net/c406495762 11 | Zhihu: 12 | https://www.zhihu.com/people/Jack--Cui/ 13 | Modify: 14 | 2017-10-11 15 | """ 16 | 17 | def loadDataSet(fileName): 18 | numFeat = len((open(fileName).readline().split('\t'))) 19 | dataMat = []; labelMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | lineArr = [] 23 | curLine = line.strip().split('\t') 24 | for i in range(numFeat - 1): 25 | lineArr.append(float(curLine[i])) 26 | dataMat.append(lineArr) 27 | labelMat.append(float(curLine[-1])) 28 | 29 | return dataMat, labelMat 30 | 31 | if __name__ == '__main__': 32 | dataArr, classLabels = loadDataSet('horseColicTraining2.txt') 33 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') 34 | bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), algorithm = "SAMME", n_estimators = 10) 35 | bdt.fit(dataArr, classLabels) 36 | predictions = bdt.predict(dataArr) 37 | errArr = np.mat(np.ones((len(dataArr), 1))) 38 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != classLabels].sum() / len(dataArr) * 100)) 39 | predictions = bdt.predict(testArr) 40 | errArr = np.mat(np.ones((len(testArr), 1))) 41 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != testLabelArr].sum() / len(testArr) * 100)) -------------------------------------------------------------------------------- /ensemble-learning/adaboost/帮助.txt: -------------------------------------------------------------------------------- 1 
| adaboost算法相关文档 2 | -------------------------------------------------------------------------------- /ensemble-learning/horse_adaboost.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | """ 6 | Author: 7 | Jack Cui 8 | Blog: 9 | http://blog.csdn.net/c406495762 10 | Zhihu: 11 | https://www.zhihu.com/people/Jack--Cui/ 12 | Modify: 13 | 2017-10-10 14 | """ 15 | 16 | def loadDataSet(fileName): 17 | numFeat = len((open(fileName).readline().split('\t'))) 18 | dataMat = []; labelMat = [] 19 | fr = open(fileName) 20 | for line in fr.readlines(): 21 | lineArr = [] 22 | curLine = line.strip().split('\t') 23 | for i in range(numFeat - 1): 24 | lineArr.append(float(curLine[i])) 25 | dataMat.append(lineArr) 26 | labelMat.append(float(curLine[-1])) 27 | 28 | return dataMat, labelMat 29 | 30 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 31 | """ 32 | 单层决策树分类函数 33 | Parameters: 34 | dataMatrix - 数据矩阵 35 | dimen - 第dimen列,也就是第几个特征 36 | threshVal - 阈值 37 | threshIneq - 标志 38 | Returns: 39 | retArray - 分类结果 40 | """ 41 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 42 | if threshIneq == 'lt': 43 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 44 | else: 45 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 46 | return retArray 47 | 48 | 49 | def buildStump(dataArr,classLabels,D): 50 | """ 51 | 找到数据集上最佳的单层决策树 52 | Parameters: 53 | dataArr - 数据矩阵 54 | classLabels - 数据标签 55 | D - 样本权重 56 | Returns: 57 | bestStump - 最佳单层决策树信息 58 | minError - 最小误差 59 | bestClasEst - 最佳的分类结果 60 | """ 61 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 62 | m,n = np.shape(dataMatrix) 63 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 64 | minError = float('inf') #最小误差初始化为正无穷大 65 | for i in range(n): #遍历所有特征 66 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 67 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 68 | for j in range(-1, int(numSteps) + 1): 69 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 70 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 71 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 72 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 73 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 74 | weightedError = D.T * errArr #计算误差 75 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 76 | if weightedError < minError: #找到误差最小的分类方式 77 | minError = weightedError 78 | bestClasEst = predictedVals.copy() 79 | bestStump['dim'] = i 80 | bestStump['thresh'] = threshVal 81 | bestStump['ineq'] = inequal 82 | return bestStump, minError, bestClasEst 83 | 84 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 85 | """ 86 | 使用AdaBoost算法提升弱分类器性能 87 | Parameters: 88 | dataArr - 数据矩阵 89 | classLabels - 数据标签 90 | numIt - 最大迭代次数 91 | Returns: 92 | weakClassArr - 训练好的分类器 93 | aggClassEst - 类别估计累计值 94 | """ 95 | weakClassArr = [] 96 | m = np.shape(dataArr)[0] 97 | D = np.mat(np.ones((m, 1)) / m) #初始化权重 98 | aggClassEst = np.mat(np.zeros((m,1))) 99 | for i in range(numIt): 100 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 101 | # print("D:",D.T) 102 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 103 | bestStump['alpha'] = alpha #存储弱学习算法权重 104 | 
weakClassArr.append(bestStump) #存储单层决策树 105 | # print("classEst: ", classEst.T) 106 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 107 | D = np.multiply(D, np.exp(expon)) 108 | D = D / D.sum() #根据样本权重公式,更新样本权重 109 | #计算AdaBoost误差,当误差为0的时候,退出循环 110 | aggClassEst += alpha * classEst #计算类别估计累计值 111 | # print("aggClassEst: ", aggClassEst.T) 112 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差 113 | errorRate = aggErrors.sum() / m 114 | # print("total error: ", errorRate) 115 | if errorRate == 0.0: break #误差为0,退出循环 116 | return weakClassArr, aggClassEst 117 | 118 | def adaClassify(datToClass,classifierArr): 119 | """ 120 | AdaBoost分类函数 121 | Parameters: 122 | datToClass - 待分类样例 123 | classifierArr - 训练好的分类器 124 | Returns: 125 | 分类结果 126 | """ 127 | dataMatrix = np.mat(datToClass) 128 | m = np.shape(dataMatrix)[0] 129 | aggClassEst = np.mat(np.zeros((m,1))) 130 | for i in range(len(classifierArr)): #遍历所有分类器,进行分类 131 | classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq']) 132 | aggClassEst += classifierArr[i]['alpha'] * classEst 133 | # print(aggClassEst) 134 | return np.sign(aggClassEst) 135 | 136 | if __name__ == '__main__': 137 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') 138 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr) 139 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') 140 | print(weakClassArr) 141 | predictions = adaClassify(dataArr, weakClassArr) 142 | errArr = np.mat(np.ones((len(dataArr), 1))) 143 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != np.mat(LabelArr).T].sum() / len(dataArr) * 100)) 144 | predictions = adaClassify(testArr, weakClassArr) 145 | errArr = np.mat(np.ones((len(testArr), 1))) 146 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != np.mat(testLabelArr).T].sum() / len(testArr) * 100)) 147 | -------------------------------------------------------------------------------- /ensemble-learning/sklearn_adaboost.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import numpy as np 3 | from sklearn.ensemble import AdaBoostClassifier 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | """ 7 | Author: 8 | Jack Cui 9 | Blog: 10 | http://blog.csdn.net/c406495762 11 | Zhihu: 12 | https://www.zhihu.com/people/Jack--Cui/ 13 | Modify: 14 | 2017-10-11 15 | """ 16 | 17 | def loadDataSet(fileName): 18 | numFeat = len((open(fileName).readline().split('\t'))) 19 | dataMat = []; labelMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | lineArr = [] 23 | curLine = line.strip().split('\t') 24 | for i in range(numFeat - 1): 25 | lineArr.append(float(curLine[i])) 26 | dataMat.append(lineArr) 27 | labelMat.append(float(curLine[-1])) 28 | 29 | return dataMat, labelMat 30 | 31 | if __name__ == '__main__': 32 | dataArr, classLabels = loadDataSet('horseColicTraining2.txt') 33 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') 34 | bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), algorithm = "SAMME", n_estimators = 10) 35 | bdt.fit(dataArr, classLabels) 36 | predictions = bdt.predict(dataArr) 37 | errArr = np.mat(np.ones((len(dataArr), 1))) 38 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != classLabels].sum() / len(dataArr) * 100)) 39 | predictions = bdt.predict(testArr) 40 | errArr = np.mat(np.ones((len(testArr), 1))) 41 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != 
testLabelArr].sum() / len(testArr) * 100)) -------------------------------------------------------------------------------- /ensemble-learning/帮助.txt: -------------------------------------------------------------------------------- 1 | 本目录为集成学习相关的内容 2 | -------------------------------------------------------------------------------- /feature-extraction/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-infra/data-mining/3f9a5ab6764a950108e95f295417ae83a99287b7/feature-extraction/Thumbs.db -------------------------------------------------------------------------------- /feature-extraction/clusters.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | from math import sqrt 4 | from PIL import Image,ImageDraw,ImageFont 5 | 6 | # Returns the Pearson correlation coefficient for p1 and p2 7 | def pearson(v1,v2): 8 | # Simple sums 9 | sum1=sum(v1) 10 | sum2=sum(v2) 11 | 12 | # Sums of the squares 13 | sum1Sq=sum([pow(v,2) for v in v1]) 14 | sum2Sq=sum([pow(v,2) for v in v2]) 15 | 16 | # Sum of the products 17 | pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) 18 | 19 | # Calculate r (Pearson score) 20 | num=pSum-(sum1*sum2/len(v1)) 21 | den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) 22 | if den==0: return 0 23 | 24 | return 1.0-(num/den) 25 | 26 | 27 | class bicluster: 28 | def __init__(self,vec,left=None,right=None,distance=0.0,id=None): 29 | self.left=left 30 | self.right=right 31 | self.vec=vec 32 | self.id=id 33 | self.distance=distance 34 | 35 | def euclidean(v1,v2): 36 | sqsum=sum([math.pow(v1[i]-v2[i],2) for i in range(len(v1))]) 37 | return math.sqrt(sqsum) 38 | 39 | def printclust(clust,labels=None,n=0): 40 | for i in range(n): print(' ',end='') 41 | if clust.id<0: 42 | print('-') 43 | else: 44 | if labels==None: print(clust.id) 45 | else: print(labels[clust.id]) 46 | if clust.left!=None: printclust(clust.left,labels=labels,n=n+1) 47 | if clust.right!=None: printclust(clust.right,labels=labels,n=n+1) 48 | 49 | def hcluster(vecs,distance=pearson): 50 | distances={} 51 | currentclustid=-1 52 | clust=[bicluster(vecs[i],id=i) for i in range(len(vecs))] 53 | 54 | while len(clust)>1: 55 | lowestpair=(0,1) 56 | closest=distance(clust[0].vec,clust[1].vec) 57 | for i in range(len(clust)): 58 | for j in range(i+1,len(clust)): 59 | if (clust[i].id,clust[j].id) not in distances: 60 | distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) 61 | d=distances[(clust[i].id,clust[j].id)] 62 | 63 | if d0: 102 | for vecid in bestmatches[i]: 103 | for m in range(len(vecs[vecid])): 104 | avgs[m]+=vecs[vecid][m] 105 | for j in range(len(avgs)): 106 | avgs[j]/=len(bestmatches[i]) 107 | clusters[i]=avgs 108 | 109 | return bestmatches 110 | 111 | def readfile(filename): 112 | lines=[line for line in open(filename)] 113 | colnames=lines[0].strip().split('\t')[1:] 114 | rownames=[] 115 | data=[] 116 | for line in lines[1:]: 117 | p=line.strip().split('\t') 118 | rownames.append(p[0]) 119 | data.append([float(x) for x in p[1:]]) 120 | return rownames,colnames,data 121 | 122 | def test2(): 123 | rownames,colnames,data=readfile('datafile.txt') 124 | return hcluster(data) 125 | #for i in range(len(rownames)): 126 | # print i,rownames[i] 127 | 128 | def distance(v1,v2): 129 | c1,c2,shr=0,0,0 130 | 131 | for i in range(len(v1)): 132 | if v1[i]!=0: c1+=1 133 | if v2[i]!=0: c2+=1 134 | if v1[i]!=0 and v2[i]!=0: shr+=1 135 | 136 | return 
float(shr)/(c1+c2-shr) 137 | 138 | 139 | #test2() 140 | 141 | def getheight(clust): 142 | if clust.left==None and clust.right==None: return 1 143 | return getheight(clust.left)+getheight(clust.right) 144 | 145 | def getdepth(clust): 146 | if clust.left==None and clust.right==None: return 0 147 | return max(getdepth(clust.left),getdepth(clust.right))+clust.distance 148 | 149 | def drawdendrogram(clust,labels,jpeg='clusters.jpg'): 150 | h=getheight(clust)*20 151 | depth=getdepth(clust) 152 | w=1200 153 | scaling=float(w-150)/depth 154 | img=Image.new('RGB',(w,h),(255,255,255)) 155 | draw=ImageDraw.Draw(img) 156 | 157 | draw.line((0,h/2,10,h/2),fill=(255,0,0)) 158 | 159 | drawnode(draw,clust,10,(h/2),scaling,labels) 160 | img.save(jpeg,'JPEG') 161 | 162 | def drawnode(draw,clust,x,y,scaling,labels): 163 | if clust.id<0: 164 | h1=getheight(clust.left)*20 165 | h2=getheight(clust.right)*20 166 | top=y-(h1+h2)/2 167 | bottom=y+(h1+h2)/2 168 | 169 | ll=clust.distance*scaling 170 | 171 | draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0)) 172 | 173 | draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0)) 174 | draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0)) 175 | 176 | drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels) 177 | drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels) 178 | else: 179 | draw.text((x+5,y-7),labels[clust.id].encode('utf8'),(0,0,0)) 180 | 181 | def rotatematrix(data): 182 | newdata=[] 183 | for i in range(len(data[0])): 184 | newrow=[data[j][i] for j in range(len(data))] 185 | newdata.append(newrow) 186 | return newdata 187 | 188 | def scaledown(data,distance=pearson,rate=0.01): 189 | n=len(data) 190 | realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)] 191 | 192 | outersum=0.0 193 | 194 | loc=[[random.random(),random.random()] for i in range(n)] 195 | fakedist=[[0.0 for j in range(n)] for i in range(n)] 196 | 197 | lasterror=None 198 | for m in range(0,1000): 199 | # Find projected distances 200 | for i in range(n): 201 | for j in range(n): 202 | fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 203 | for x in range(len(loc[i]))])) 204 | 205 | # Move points 206 | grad=[[0.0,0.0] for i in range(n)] 207 | 208 | totalerror=0 209 | for k in range(n): 210 | for j in range(n): 211 | if j==k: continue 212 | errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] 213 | grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm 214 | grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm 215 | totalerror+=abs(errorterm) 216 | print(totalerror) 217 | if lasterror and lasterror2 and len(s)<20] 10 | 11 | # Return the unique set of words only 12 | return dict([(w,1) for w in words]) 13 | 14 | #def entryfeatures(entry): 15 | 16 | def sampletrain(cl): 17 | cl.train('Nobody owns the water.','good') 18 | cl.train('the quick rabbit jumps fences','good') 19 | cl.train('buy pharmaceuticals now','bad') 20 | cl.train('make quick money at the online casino','bad') 21 | cl.train('the quick brown fox jumps','good') 22 | 23 | class classifier: 24 | def __init__(self,getfeatures): 25 | self.fc={} 26 | self.cc={} 27 | self.getfeatures=getfeatures 28 | 29 | def setdb(self,dbfile): 30 | self.con=sqlite.connect(dbfile) 31 | self.con.execute('create table if not exists fc(feature,category,count)') 32 | self.con.execute('create table if not exists cc(category,count)') 33 | 34 | def incf(self,f,cat): 35 | count=self.fcount(f,cat) 36 | if count==0: 37 | self.con.execute("insert into fc values ('%s','%s',1)" 38 | % (f,cat)) 39 | else: 40 | 
self.con.execute( 41 | "update fc set count=%d where feature='%s' and category='%s'" 42 | % (count+1,f,cat)) 43 | 44 | def fcount(self,f,cat): 45 | res=self.con.execute( 46 | 'select count from fc where feature="%s" and category="%s"' 47 | %(f,cat)).fetchone() 48 | if res==None: return 0 49 | else: return float(res[0]) 50 | 51 | def incc(self,cat): 52 | count=self.catcount(cat) 53 | if count==0: 54 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 55 | else: 56 | self.con.execute("update cc set count=%d where category='%s'" 57 | % (count+1,cat)) 58 | 59 | def catcount(self,cat): 60 | res=self.con.execute('select count from cc where category="%s"' 61 | %(cat)).fetchone() 62 | if res==None: return 0.0 63 | else: return float(res[0]) 64 | 65 | def categories(self): 66 | cur=self.con.execute('select category from cc'); 67 | return [d[0] for d in cur] 68 | 69 | def totalcount(self): 70 | res=self.con.execute('select sum(count) from cc').fetchone(); 71 | if res==None: return 0 72 | return res[0] 73 | 74 | 75 | """ 76 | def incf(self,f,cat): 77 | self.fc.setdefault(f,{}) 78 | self.fc[f].setdefault(cat,0) 79 | self.fc[f][cat]+=1 80 | 81 | def incc(self,cat): 82 | self.cc.setdefault(cat,0) 83 | self.cc[cat]+=1 84 | 85 | def fcount(self,f,cat): 86 | if f in self.fc and cat in self.fc[f]: 87 | return float(self.fc[f][cat]) 88 | return 0.0 89 | 90 | def catcount(self,cat): 91 | if cat in self.cc: 92 | return float(self.cc[cat]) 93 | return 0 94 | 95 | def totalcount(self): 96 | return sum(self.cc.values()) 97 | 98 | def categories(self): 99 | return self.cc.keys() 100 | """ 101 | 102 | 103 | def train(self,item,cat): 104 | features=self.getfeatures(item) 105 | for f in features: 106 | self.incf(f,cat) 107 | self.incc(cat) 108 | self.con.commit() 109 | 110 | def fprob(self,f,cat): 111 | if self.catcount(cat)==0: return 0 112 | return self.fcount(f,cat)/self.catcount(cat) 113 | 114 | def setfilename(self,filename): 115 | self.filename=filename 116 | self.restoredata() 117 | 118 | def restoredata(self): 119 | try: f=open(self.filename,'rb') 120 | except: return 121 | self.fc=cPickle.load(f) 122 | self.cc=cPickle.load(f) 123 | f.close() 124 | 125 | def savedata(self): 126 | f=open(self.filename,'wb') 127 | cPickle.dump(self.fc,f,True) 128 | cPickle.dump(self.cc,f,True) 129 | f.close() 130 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 131 | basicprob=prf(f,cat) 132 | totals=sum([self.fcount(f,c) for c in self.categories()]) 133 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 134 | return bp 135 | 136 | 137 | 138 | class naivebayes(classifier): 139 | def __init__(self,getfeatures): 140 | classifier.__init__(self,getfeatures) 141 | self.thresholds={} 142 | 143 | def setthreshold(self,cat,t): 144 | self.thresholds[cat]=t 145 | 146 | def getthreshold(self,cat): 147 | if cat not in self.thresholds: return 1.0 148 | return self.thresholds[cat] 149 | 150 | def classify(self,item,default=None): 151 | probs={} 152 | max=0.0 153 | for cat in self.categories(): 154 | probs[cat]=self.prob(item,cat) 155 | if probs[cat]>max: 156 | max=probs[cat] 157 | best=cat 158 | for cat in probs: 159 | if cat==best: continue 160 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 161 | return best 162 | 163 | def docprob(self,item,cat): 164 | features=self.getfeatures(item) 165 | p=1 166 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 167 | return p 168 | 169 | 170 | def prob(self,item,cat): 171 | catprob=self.catcount(cat)/self.totalcount() 172 | docprob=self.docprob(item,cat) 
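        # Note: this is Bayes' rule with the evidence term dropped,
        #   P(cat | item) is proportional to P(item | cat) * P(cat).
        # catprob is the prior P(cat) estimated from training counts, and docprob is the
        # product of the weighted per-feature probabilities, under the naive assumption
        # that features are conditionally independent given the category. The constant
        # denominator P(item) can be omitted because classify() only compares categories.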
173 | return docprob*catprob 174 | 175 | class fisherclassifier(classifier): 176 | def __init__(self,getfeatures): 177 | classifier.__init__(self,getfeatures) 178 | self.minimums={} 179 | 180 | def setminimum(self,cat,min): 181 | self.minimums[cat]=min 182 | 183 | def getminimum(self,cat): 184 | if cat not in self.minimums: return 0 185 | return self.minimums[cat] 186 | 187 | def classify(self,item,default=None): 188 | best=default 189 | max=0.0 190 | for c in self.categories(): 191 | p=self.fisherprob(item,c) 192 | if p>self.getminimum(c) and p>max: 193 | best=c 194 | max=p 195 | return best 196 | 197 | 198 | def cprob(self,f,cat): 199 | # The frequency of this feature in this category 200 | clf=self.fprob(f,cat) 201 | 202 | if clf==0: return 0.0 203 | 204 | # The frequency of this feature in all the categories 205 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 206 | 207 | # The probability is the frequency in this category divided by 208 | # the overall frequency 209 | p=clf/(freqsum) 210 | 211 | return p 212 | 213 | 214 | def fisherprob(self,item,cat): 215 | p=1 216 | features=self.getfeatures(item) 217 | for f in features: 218 | p*=(self.weightedprob(f,cat,self.cprob)) 219 | fscore=-2*math.log(p) 220 | return self.chi2P(fscore,len(features)*2) 221 | 222 | def chi2P(self,chi,df): 223 | m = chi / 2.0 224 | sum = term = math.exp(-m) 225 | for i in range(1, df//2): 226 | term *= m / i 227 | sum += term 228 | return min(sum, 1.0) 229 | 230 | -------------------------------------------------------------------------------- /feature-extraction/features.txt: -------------------------------------------------------------------------------- 1 | ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 2 | (6.58003120192, u'The Abs Diet by David Zinczenko') 3 | (5.9231935598, u"I did'nt diet to get in shape for Trinidad's Carnival.....") 4 | (5.04673654071, u'Sensible Diet & Exercise') 5 | 6 | ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 7 | (6.52183126318, u'food/exercise Monday 10/1') 8 | (5.94642162786, u'diet/exercise 10/4') 9 | (5.3332773133, u'food/exercise Friday 10/5') 10 | 11 | ['food', 'calories', 'than', 'easy', 'high', 'come'] 12 | (9.98464450123, u'High or low fat food? Easy trick for figuring it out') 13 | (3.41252863148, u'Oatmeal, cereal of choice.') 14 | (3.19119866786, u'Food and Workout Log 10.8.07') 15 | 16 | ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 17 | (7.46811621754, u'saturday') 18 | (5.62839188358, u'diet-exercise thursday') 19 | (5.29370213306, u'sleepy food/fitness thursday') 20 | 21 | ['food', 'home', 'then', 'exercise', 'morning', 'went'] 22 | (5.22083940456, u'Food & Exercise -- 10/5/2007') 23 | (5.16310413391, u'Food & Exercise -- 10/4/2007') 24 | (4.75585045074, u'Food & Exercise -- 9/28/2007 (yesterday)') 25 | 26 | ['fats', 'quot', 'this', 'good', 'about', 'like'] 27 | (14.9233786406, u'Good fats bad fats') 28 | (1.3775418859, u'Should we ban marathons?') 29 | (1.37194239805, u'Food & Exercise -- 10/3/2007') 30 | 31 | ['quot', 'they', 'money', 'want', 'very', 'best'] 32 | (6.1620884463, u'More about the Chicago marathon') 33 | (5.58276496802, u'LOUIE + LINESMAKER = $$$$') 34 | (4.04959173123, u'High or low fat food? 
Easy trick for figuring it out') 35 | 36 | ['that', 'much', 'does', 'exercise', 'this', 'morning'] 37 | (7.73926153154, u'Food & Exercise -- 10/7/2007') 38 | (5.96451663382, u'< 1g, etc.') 39 | (3.81276353396, u"why I'm succeeding, finally, with my fitness") 40 | 41 | ['with', 'your', 'weight', 'have', 'control', 'about'] 42 | (6.78756986407, u'Control ur Weight') 43 | (5.54567450388, u'Flu-Busting Chicken Soup') 44 | (5.21079777525, u'Weight Loss Tips') 45 | 46 | ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 47 | (5.58477112035, u'Food and Workout Log 9.27.08') 48 | (5.48488799917, u'Food and Workout Log 10.3.07') 49 | (5.10395750879, u'Food and Workout Log 10.10.07') 50 | 51 | -------------------------------------------------------------------------------- /feature-extraction/newsfeatures.py: -------------------------------------------------------------------------------- 1 | # 非监督式的特征提取。寻找独立特征(主题)。这些特征(主题)可能不能简单的描述 2 | # 如果用于分类,不用提前知道需要什么分类,也不需要数据训练 3 | # 对于文章分类,特征(主题)可能不能简单的描述,但是可以计算每个属性(单词)对特征(主题)的重要程度 4 | 5 | # 注意和分类器中区分,在分类器中我们把单词作为文章的特征,更缺的说的属性特征,这里我们要研究的特征是输入对象表现出来的主题特征,单词作为研究主题特征的参考属性。 6 | 7 | import feedparser 8 | import re 9 | 10 | 11 | feedlist=['http://today.reuters.com/rss/topNews', 12 | 'http://today.reuters.com/rss/domesticNews', 13 | 'http://today.reuters.com/rss/worldNews', 14 | 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml', 15 | 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml', 16 | 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml', 17 | 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml', 18 | 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 19 | 'http://www.nytimes.com/services/xml/rss/nyt/International.xml', 20 | 'http://news.google.com/?output=rss', 21 | 'http://feeds.salon.com/salon/news', 22 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss', 23 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss', 24 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss', 25 | 'http://rss.cnn.com/rss/edition.rss', 26 | 'http://rss.cnn.com/rss/edition_world.rss', 27 | 'http://rss.cnn.com/rss/edition_us.rss'] 28 | 29 | # 删除文章中所有的图片和HTML标记 30 | def stripHTML(h): 31 | p='' 32 | s=0 33 | for c in h: 34 | if c=='<': s=1 35 | elif c=='>': 36 | s=0 37 | p+=' ' 38 | elif s==0: p+=c 39 | return p 40 | 41 | # 拆分文章中的单词 42 | def separatewords(text): 43 | splitter=re.compile('\\W*') 44 | return [s.lower() for s in splitter.split(text) if len(s)>3] 45 | 46 | #获取文章的单词。单词出现一次就记录一次。 47 | def getarticlewords(): 48 | allwords={} 49 | articlewords=[] 50 | articletitles=[] 51 | ec=0 52 | # 遍历每个订阅源 53 | for feed in feedlist: 54 | f=feedparser.parse(feed) 55 | 56 | # 遍历每篇文章 57 | for e in f.entries: 58 | # 跳过标题相同的文章 59 | if e.title in articletitles: continue 60 | 61 | # 提取单词 62 | txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8')) 63 | words=separatewords(txt) 64 | articlewords.append({}) 65 | articletitles.append(e.title) 66 | 67 | # 在allwords和articlewords中增加针对当前单词的计数 68 | for word in words: 69 | allwords.setdefault(word,0) 70 | allwords[word]+=1 71 | articlewords[ec].setdefault(word,0) 72 | articlewords[ec][word]+=1 #单词出现一次就记录一次。 73 | ec+=1 74 | # allwords记录单词在所有文章中被使用的次数,我们将利用该变量来判断,哪些单词应该被视作特征的一部分 75 | # articlewords是单词在每篇文章中出现的次数 76 | # articletitles是一个文章标题的列表 77 | return allwords,articlewords,articletitles 78 | 79 | 80 | # 将数据集转化为矩阵。allw为单词在所有文章中出现的次数,articlew为单词在每篇文章中出现的次数 81 | def makematrix(allw,articlew): 82 | wordvec=[] 83 | 84 | # 只考虑那些普通的但又不至于非常普通的单词 85 | for w,c in allw.items(): 86 | if c>3 and cgetminutes(returnf[0]): 
earliestdep=getminutes(returnf[0]) 73 | 74 | # 接机服务:每个人必须在机场等待直到最后一个人到达位置 75 | # 送机服务:他们必须同时达到机场,并等待他们的返程航班 76 | totalwait=0 77 | for d in range(int(len(sol)/2)): 78 | origin=peoplelist[d][1] 79 | outbound=flights[(origin,destination)][int(sol[2*d])] 80 | returnf=flights[(destination,origin)][int(sol[2*d+1])] 81 | totalwait+=latestarrival-getminutes(outbound[1]) 82 | totalwait+=getminutes(returnf[0])-earliestdep 83 | 84 | # 这个题解要求多付一天的汽车出租费用么?如果是,则费用为50美元 85 | if latestarrival>earliestdep: totalprice+=50 86 | 87 | return totalprice+totalwait 88 | 89 | # 随机搜索算法:随机选择题解,计算成本值,成本值最小的题解为确定题解。domain为题解范围(可选航班范围),costf为成本函数。 90 | def randomoptimize(domain,costf): 91 | best=999999999 92 | bestr=None 93 | for i in range(0,1000): 94 | # 创建随机解 95 | sol=[random.randint(domain[i][0],domain[i][1]) for i in range(len(domain))] 96 | #计算成本值 97 | cost=costf(sol) 98 | 99 | # 与目前得到的最优解进行比较 100 | if costdomain[j][0]: 119 | neighbors.append(sol[0:j]+[sol[j]-1]+sol[j+1:]) #向近0偏移 120 | if sol[j]0.1: 143 | # 选择一个索引值 144 | i=random.randint(0,len(domain)-1) 145 | 146 | # 选择一个改变索引值的方向 147 | dir=random.randint(-step,step) 148 | 149 | #创建一个代表题解的新列表,改变其中一个值 150 | vecb=vec[:] 151 | vecb[i]+=dir 152 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] #如果渐变不超出了题解的范围 154 | 155 | # 计算当前成本与新的成本 156 | ea=costf(vec) 157 | eb=costf(vecb) 158 | p=pow(math.e,(-eb-ea)/T) 159 | 160 | # 它是更好的解么?或者是趋向最优解的可能的临界解么 161 | if (eb=domain[i][0]+step: 174 | return vec[0:i]+[vec[i]-step]+vec[i+1:] 175 | elif vec[i]<=domain[i][1]-step: 176 | return vec[0:i]+[vec[i]+step]+vec[i+1:] 177 | 178 | # 杂交操作(交叉) 179 | def crossover(r1,r2): 180 | i=random.randint(1,len(domain)-2) 181 | return r1[0:i]+r2[i:] 182 | 183 | # 构建初始种群 184 | pop=[] 185 | for i in range(popsize): #随机产生50个动物的种群 186 | vec=[random.randint(domain[i][0],domain[i][1]) for i in range(len(domain))] 187 | pop.append(vec) 188 | # 每一代有多少胜出者? 
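    # Note on elitism: the top `elite` fraction of the population (ranked by cost)
    # survives unchanged into the next generation; e.g. with popsize=50 and elite=0.2,
    # topelite is 10, and the remaining 40 slots are refilled by mutating or crossing
    # over randomly chosen elite members until the population is back to full size.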
189 | topelite=int(elite*popsize) 190 | 191 | # 主循环 192 | for i in range(maxiter): 193 | scores=[(costf(v),v) for v in pop] 194 | scores.sort() 195 | ranked=[v for (s,v) in scores] 196 | 197 | # 在种群中选出优胜者 198 | pop=ranked[0:topelite] 199 | 200 | # 为优秀基因者,添加变异和配对后的胜出者 201 | while len(pop)0 and ua<1 and ub>0 and ub<1: 56 | total+=1 57 | for i in range(len(people)): 58 | for j in range(i+1,len(people)): 59 | # 获取两个节点的位置 60 | (x1,y1),(x2,y2)=loc[people[i]],loc[people[j]] 61 | 62 | # 获取两节点之间的距离 63 | dist=math.sqrt(math.pow(x1-x2,2)+math.pow(y1-y2,2)) 64 | # 对间距小于50个像素的节点进行判罚 65 | if dist<50: 66 | total+=(1.0-(dist/50.0)) 67 | 68 | return total 69 | 70 | 71 | #画图,绘制网络 72 | from PIL import Image,ImageDraw 73 | 74 | def drawnetwork(sol): 75 | # 建立image对象 76 | img=Image.new('RGB',(400,400),(255,255,255)) 77 | draw=ImageDraw.Draw(img) 78 | 79 | # 建立标识位置信息的字典 80 | pos=dict([(people[i],(sol[i*2],sol[i*2+1])) for i in range(0,len(people))]) 81 | 82 | for (a,b) in links: 83 | draw.line((pos[a],pos[b]),fill=(255,0,0)) 84 | 85 | for n,p in pos.items(): 86 | draw.text(p,n,(0,0,0)) 87 | 88 | img.show() 89 | 90 | 91 | domain=[(10,370)]*(len(people)*2) #设定题解范围 92 | 93 | if __name__=="__main__": #只有在执行当前模块时才会运行此函数 94 | print(domain) 95 | s = optimization.randomoptimize(domain, crosscount) # 随机搜索法,寻找最优题解 96 | print(s) 97 | drawnetwork(s) # 绘制关系网 98 | s = optimization.hillclimb(domain, crosscount) # 爬山法,寻找最优题解 99 | print(s) 100 | drawnetwork(s) # 绘制关系网 101 | s = optimization.annealingoptimize(domain, crosscount) # 模拟退火算法,寻找最优题解 102 | print(s) 103 | drawnetwork(s) # 绘制关系网 104 | s = optimization.geneticoptimize(domain, crosscount) # 遗传算法,寻找最优题解 105 | print(s) 106 | drawnetwork(s) # 绘制关系网 -------------------------------------------------------------------------------- /optimization/帮助.txt: -------------------------------------------------------------------------------- 1 | 优化算法的相关demo 2 | --------------------------------------------------------------------------------
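A minimal usage sketch of the optimization routines above (assumptions: it is run from the optimization/ directory so that optimization.py and schedule.txt are found; the toy cost function and search domain below are invented for illustration and are not part of the repository):

import optimization

# Toy problem: push every component of the solution vector towards 3 within the range 0..9.
domain = [(0, 9)] * 6

def toycost(sol):
    # squared distance to the target vector [3, 3, 3, 3, 3, 3]; smaller is better
    return sum((v - 3) ** 2 for v in sol)

print(optimization.randomoptimize(domain, toycost))      # random search
print(optimization.hillclimb(domain, toycost))           # hill climbing
print(optimization.annealingoptimize(domain, toycost))   # simulated annealing
print(optimization.geneticoptimize(domain, toycost))     # genetic algorithm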