#!/usr/bin/env python
# mapper.py -- Hadoop-streaming mapper for round 1 (a word-count over author ids).
import sys
import codecs
from itertools import groupby
from operator import itemgetter


def read_input(file):
    # Split each comma-separated line of encoded author ids into tokens.
    for line in file:
        yield line.split(',')


def main():
    """Emit one "<id>\t1" record per numeric author-id token read from stdin."""
    data = read_input(sys.stdin)
    for names in data:
        for name in names:
            # isdigit() drops the trailing '\r\n' marker token left by encode.py.
            if name.isdigit():
                print('%s\t%d' % (name, 1))


if __name__ == "__main__":
    main()


# reducer.py -- Hadoop-streaming reducer for round 1: sum the counts per author id.
def readMapOutput(file):
    # Yield [key, count] pairs from the tab-separated mapper output.
    for line in file:
        yield line.strip().split('\t')


def main():  # reducer entry point (a separate file in the original layout)
    data = readMapOutput(sys.stdin)
    # groupby relies on Hadoop having sorted the map output by key.
    for currentName, group in groupby(data, itemgetter(0)):
        try:
            total_count = sum(int(count) for _, count in group)
            print("%s\t%d" % (currentName, total_count))
        except ValueError:
            # Skip malformed counts rather than aborting the whole reduce task.
            pass


if __name__ == "__main__":
    main()


# encode.py -- replace each author name with a stable integer id.
def encode_authors(source, result, index):
    """Encode comma-separated author lines into integer-id lines.

    source: iterable of text lines, one paper per line ("nameA,nameB,\r\n").
    result: writable text stream; receives the id-encoded lines.
    index:  writable text stream; receives one name per line, so a name's id
            is its line number minus 1 in this file (ids start at 0).
    Returns the {name: [id, frequency]} dictionary that was built.
    """
    index_dic = {}
    name_id = 0
    for line in source:
        name_list = line.split(',')
        for name in name_list:
            if not (name == '\r\n'):  # skip the end-of-line marker token
                if name in index_dic:
                    index_dic[name][1] += 1
                else:
                    index_dic[name] = [name_id, 1]
                    index.write(name + u'\r\n')
                    name_id += 1
                result.write(str(index_dic[name][0]) + u',')
        result.write(u'\r\n')
    return index_dic


if __name__ == '__main__':
    source = codecs.open('authors.txt', 'r', 'utf-8')
    result = codecs.open('authors_encoded.txt', 'w', 'utf-8')
    index = codecs.open('authors_index.txt', 'w', 'utf-8')
    try:
        encode_authors(source, result, index)
    finally:
        # Always release the handles, even if encoding fails midway.
        source.close()
        result.close()
        index.close()


# README.md
# DBLP-Coauthor-Mining
(从DBLP数据集中挖掘合作者) 2 | ============= 3 | 详细说明 4 | ------------- 5 | 详细说明请查看 [数据挖掘实战之DBLP中合作者挖掘(Python+Hadoop)](http://www.tianjun.ml/essays/20) 6 | 7 | 文件说明 8 | ------------- 9 | ### getAuthors.py 10 | 下载DBLP数据集``dblp.xml``到该目录下,[http://dblp.uni-trier.de/xml/](http://dblp.uni-trier.de/xml/)
运行``getAuthors.py`` 得到``authors.txt``文件 11 | ### encode.py 12 | 运行该文件后将对上一步得到的``authors.txt``文件编码(按照作者姓名出现的顺序依次以正整数编码)得到编码后的文件``authors_encoded.txt``,以及作者姓名与编码对应的文件``authors_index.txt``,其对应关系为姓名所在的行号减1即为其编码ID(ID从0开始) 13 | ### view_data.py 14 | 读取``authors.txt``,统计不同支持度下有多少作者,同时绘制曲线,确定支持度阈值大概范围 15 | ### final.py 16 | 主要借鉴了《机器学习实战》中的例子,将结果写入了 ``result*.txt``文件,注意最后的结果增加了置信度过滤。 17 | ### mapper.py & reducer.py 18 | 第一轮MapReduce的Map和Reduce所用到的文件,其实质就是一个wordCount的过程 19 | ### mapper2.py & reducer2.py 20 | 第二轮MapReduce的Map和Reduce所用到的文件,注意在这里的输出并没有给出完整的挖掘结果,而是输出的条件模式集,有空的话再转化一下。(本实验目的只是验证FP-growth在分布式下实现的可能性,所以没有给出完整的结果) 21 | ### viewRelation.py 22 | 添加了作者与其合作者之间的可视化功能,使用了networkx包。 23 | 24 | -------------------------------------------------------------------------------- /view_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from matplotlib.font_manager import FontProperties 3 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 4 | import codecs 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | data = codecs.open('authors_encoded.txt','r','utf-8') 8 | word_counts = {} 9 | maxCounts = 0 10 | for line in data: 11 | line = line.split(',') 12 | for word in line[0:-1]: 13 | word_counts[word] = word_counts.get(word,0) + 1 14 | if word_counts[word] > maxCounts: 15 | maxCounts = word_counts[word] 16 | maxKey = word 17 | 18 | xMax = maxCounts 19 | data.close() 20 | bins = {} 21 | for k,v in word_counts.iteritems(): 22 | bins[v] = bins.get(v,0) + 1 23 | 24 | y = [] 25 | for i in range(40, 200): 26 | y.append(bins.get(i,0)) 27 | plt.plot(y,'-'); 28 | plt.grid() 29 | plt.yticks(range(0,1000,100)) 30 | plt.xticks(range(0,160,20),range(40,200,20)) 31 | plt.xlabel(u'支持度',fontproperties=font) 32 | plt.ylabel(u'对应支持度下的作者个数',fontproperties=font) 33 | plt.title(u'作者数量与支持度之间的对应关系',fontproperties=font) 34 | plt.show() 35 | 
#!/usr/bin/env python
# mapper2.py -- round-2 mapper: emit a conditional pattern base per transaction.
import sys
import codecs
from itertools import groupby
from operator import itemgetter
from xml.sax import handler, make_parser


def creatDic(path='sortedList'):
    """Load the "<id>\t<count>" frequency list produced by round 1.

    `path` defaults to the original hard-coded file name, so existing
    streaming jobs keep working unchanged.
    """
    freqDic = {}
    with open(path, 'r') as sortedList:
        for line in sortedList:
            line = line.strip().split('\t')
            freqDic[int(line[0])] = int(line[1])
    return freqDic


def read_input(inFile):
    # Split each comma-separated line of encoded author ids into tokens.
    for line in inFile:
        yield line.split(',')


def main(freqDic, minSup):
    """For each transaction print "<item>\t<suffix item list>" records.

    Items are kept only when globally frequent (count >= minSup) and are
    ordered by ascending support, so each printed suffix is a conditional
    pattern base for the leading item.
    """
    data = read_input(sys.stdin)
    for names in data:
        names = {name: freqDic[int(name)] for name in names
                 if name.isdigit() and freqDic.get(int(name), 0) >= minSup}
        lenth = len(names)
        if lenth >= 2:
            conPatItems = [name for name, value in
                           sorted(names.items(), key=lambda p: p[1])]
            for i in range(lenth - 1):
                print("%s\t%s" % (conPatItems[i], conPatItems[i + 1::]))
        else:
            continue


if __name__ == '__main__':
    support = 100
    dic = creatDic()
    main(dic, support)


# reducer2.py -- round-2 reducer: count the items inside each leading item's
# conditional pattern bases.
def readMapOutput(file):
    for line in file:
        yield line.strip().split('\t')


def main(minSup):  # reducer entry point (a separate file in the original layout)
    """Print "<item>\t[(co-item, count), ...]" keeping co-items with count >= minSup."""
    data = readMapOutput(sys.stdin)
    for currentName, group in groupby(data, itemgetter(0)):
        localDic = {}
        try:
            for currentName, conPatItems in group:
                # The mapper printed a Python list repr; strip "[", "]" and quotes.
                conPatItems = conPatItems.strip().strip('[').strip(']')
                itemList = conPatItems.split(',')
                for item in itemList:
                    item = item.strip().strip("'")
                    item = int(item)
                    localDic[item] = localDic.get(item, 0) + 1
            resultDic = {k: v for k, v in localDic.items() if v >= minSup}
            # Here we just print out 2-coauthors; list(...) keeps the Python 2
            # output format ("[(k, v), ...]") under Python 3 as well.
            if len(resultDic) >= 1:
                print("%s\t%s" % (currentName, list(resultDic.items())))
        except Exception:
            # Keep the reduce task alive on malformed records (was a bare except).
            print("%s\t%s" % ("inner err", "sorry!"))


if __name__ == "__main__":
    support = 100
    main(support)


# getAuthors.py -- SAX-parse dblp.xml into "a,b,c,\r\n" author lines, one per paper.
paper_tag = ('article', 'inproceedings', 'proceedings', 'book',
             'incollection', 'phdthesis', 'mastersthesis', 'www')


class mHandler(handler.ContentHandler):
    """SAX handler that writes the authors of every paper element to `result`."""

    def __init__(self, result):
        self.result = result
        self.flag = 0  # 1 while inside an <author> element

    def startDocument(self):
        print('Document Start')

    def endDocument(self):
        print('Document End')

    def startElement(self, name, attrs):
        if name == 'author':
            self.flag = 1

    def endElement(self, name):
        if name == 'author':
            self.result.write(',')
            self.flag = 0
        if (name in paper_tag):
            # One output line per paper record.
            self.result.write('\r\n')

    def characters(self, chrs):
        # SAX may deliver an element's text in several chunks; append them all.
        if self.flag:
            self.result.write(chrs)


def parserDblpXml(source, result):
    """Parse the DBLP XML stream `source`, writing author lines to `result`."""
    # Renamed local (was `handler`) so it no longer shadows the xml.sax.handler module.
    content_handler = mHandler(result)
    parser = make_parser()
    parser.setContentHandler(content_handler)
    parser.parse(source)


if __name__ == '__main__':
    source = codecs.open('dblp.xml', 'r', 'utf-8')
    result = codecs.open('authors.txt', 'w', 'utf-8')
    parserDblpXml(source, result)
    result.close()
    source.close()


# /viewRelation.py -- co-author network visualization (continues on the
# following lines; its heavy third-party imports sit with the plotting code).
# -*- coding: utf-8 -*-
import itertools
import codecs


def createEdge(nodeX, path='authors.txt'):
    """Yield one (coauthor, nodeX) edge per co-occurrence of nodeX in `path`.

    `path` defaults to the original hard-coded file name.
    """
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            if nodeX in line and len(line) > 1:
                line.remove(nodeX)
                for author in line:
                    yield (author, nodeX)


def makeFreqDic(path='authors.txt'):
    """Count how many papers each author in `path` appears on."""
    print("Creating FreqDic...")
    with codecs.open(path, 'r', 'utf-8') as f:
        freqDic = {}
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            for author in line:
                freqDic[author] = freqDic.get(author, 0) + 1
        return freqDic


def main(freqDic, nodeX):
    """Draw nodeX's co-author network and save it as "<nodeX>.png".

    The third-party imports live here so the pure-computation helpers above
    stay usable without networkx/matplotlib installed.
    """
    import networkx as nx
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.font_manager import FontProperties
    # NOTE(review): Windows-only font path; adjust on other platforms.
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc")
    G = nx.Graph()
    print("Adding edge...")
    for A, B in createEdge(nodeX):
        # Accumulate an edge weight = number of co-authored papers.
        edgeDic = G.get_edge_data(A, B, default={'weight': 0})
        G.add_edge(A, B, weight=edgeDic['weight'] + 1)
    nodes = G.nodes()
    nodes.remove(nodeX)
    shells = [[nodeX], nodes]  # nodeX at the center, coauthors around it
    pos = nx.shell_layout(G, shells)
    print("Drawing nodes...")
    # NOTE(review): nodes_iter/draw_networkx_label are networkx 1.x APIs.
    nodeSize = [10 * freqDic[n] for n, dic in G.nodes_iter(data=True)]
    nodeColors = np.random.rand(len(nodeSize))
    nx.draw_networkx_nodes(G, pos, node_size=nodeSize, node_color=nodeColors, alpha=0.7)
    print("Drawing edges...")
    edgeWidth = [edata['weight'] for u, v, edata in G.edges(data=True)]
    edgeColor = np.random.rand(G.number_of_edges())
    nx.draw_networkx_edges(G, pos, width=edgeWidth, edge_color=edgeColor, alpha=0.35)
    print("Adding label...")
    # Only label prolific authors (>= 80 papers) plus the center node itself.
    select_labels = {n: n for n, d in G.nodes_iter(data=True) if freqDic[n] >= 80}
    select_labels[nodeX] = nodeX
    nx.draw_networkx_label(G, pos, labels=select_labels, font_size=8, alpha=0.3)
    title = str(nodeX) + u"与其合作者之间的关系网络"
    plt.title(title, size=15, fontproperties=font)
    plt.text(0.5, 0.94, u"# 节点大小对应该作者发表文章总次数",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.text(0.5, 0.97, u"# 节点之间连线粗细对应该两个作者一起发表文章总次数",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.axis('off')
    fileName = str(nodeX) + ".png"
    plt.savefig(fileName, transparent=True, dpi=500)
    plt.show()


if __name__ == '__main__':
    freqDic = makeFreqDic()
    nodeX = u'Irith Pomeranz'
    main(freqDic, nodeX)


# final.py -- single-machine FP-growth over the encoded author file, after the
# FP-growth chapter of "Machine Learning in Action".
class treeNode:
    """One node of the FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None  # chains nodes of the same item for the header table
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        self.count += numOccur

    def disp(self, ind=1):
        # Debug helper: print the subtree with indentation.
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)


def createTree(dataSet, minSup=1):
    """Build an FP-tree from {frozenset(transaction): count}; no mining here."""
    freqDic = {}
    for trans in dataSet:  # first pass: global item frequencies
        for item in trans:
            freqDic[item] = freqDic.get(item, 0) + dataSet[trans]
    headerTable = {k: v for (k, v) in freqDic.items() if v >= minSup}
    if len(headerTable) == 0:
        return None, None  # no item meets min support
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # reformat to [count, node-link head]
    retTree = treeNode('Null Set', 1, None)
    for tranSet, count in dataSet.items():  # second pass: insert transactions
        localD = {}
        for item in tranSet:
            if headerTable.get(item, 0):
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            # BUGFIX: break count ties by item name so ordering is identical
            # across transactions; set-iteration tie order is otherwise
            # arbitrary, which corrupts FP-growth support counts.
            orderedItems = [v[0] for v in sorted(localD.items(),
                                                 key=lambda p: (p[1], p[0]),
                                                 reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable


def updateTree(items, inTree, headerTable, count):
    """Insert an ordered transaction prefix into the tree, updating node links."""
    if items[0] in inTree.children:
        inTree.children[items[0]].inc(count)
    else:
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None:  # first node of this item
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # recurse with the remaining ordered items
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)


def updateHeader(nodeToTest, targetNode):
    # Walk to the end of the node-link list iteratively (no recursion).
    while (nodeToTest.nodeLink != None):
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode


def ascendTree(leafNode, prefixPath):
    """Collect names from a leaf node up to (excluding) the root."""
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)


def findPrefixPath(basePat, treeNode):
    """Return {frozenset(prefix path): count} for every node of basePat."""
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats


def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively mine frequent itemsets into the freqItemList dict."""
    # BUGFIX: sort by the support count p[1][0] only; p[1] is [count, nodeLink]
    # and comparing whole pairs raises TypeError on count ties in Python 3.
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:  # start from the bottom of the header table
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        if len(newFreqSet) > 1:
            freqItemList[frozenset(newFreqSet)] = headerTable[basePat][0]
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        # Build and mine the conditional FP-tree for this pattern base.
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead != None:
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


def loadSimpDat(inFile):
    """Read comma-separated id lines into {frozenset(ids): occurrence count}."""
    dataSet = {}
    for line in inFile:
        line = line.strip().split(',')
        dataLine = [word for word in line if word.isdigit()]
        key = frozenset(dataLine)
        dataSet[key] = dataSet.get(key, 0) + 1
    return dataSet


if __name__ == "__main__":
    minSup = 100
    print("Reading Source File ... Wait...")
    with open('authors_encoded.txt', 'r') as f:
        dataSet = loadSimpDat(f)

    print("Constructing FP-tree ... Wait...")
    myFPtree, myHeaderTab = createTree(dataSet, minSup)

    print("Mining frequent items ... Wait...")
    myFreqList = {}
    mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    print("Totally %d frequent itemsets found ! " % len(myFreqList))
    print("Constructing authors_index... Wait...")

    maxCoauthors = 0
    for freqAuthors in myFreqList.keys():
        if len(freqAuthors) > maxCoauthors:
            maxCoauthors = len(freqAuthors)
    print("the max num of coauthors is %d " % (maxCoauthors))

    # Line i of authors_index.txt holds the name of author id i.
    with open('authors_index.txt', 'r') as authorsIndex:
        i = 0
        authorsDic = {}
        for name in authorsIndex:
            name = name.strip()
            authorsDic[i] = name
            i = i + 1

    print("Writing result into result.txt... Wait...")
    header = ("%25s\t%25s\t%15s\t%10s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\n"
              % ('authorA', 'authorB', 'authorC', 'Sup(A,B,C)', 'Sup(A)', 'Sup(B)',
                 'Sup(C)', 'Con(A)', 'Con(B)', 'Con(C)', 'MinCon', 'MaxCon'))
    with open('result4.txt', 'w') as result2:
        with open('result3.txt', 'w') as result:
            result.write(header)
            result2.write(header)
            resultList = sorted(myFreqList.items(), key=lambda p: p[1], reverse=True)
            for itemSet, support in resultList:
                itemList = list(itemSet)
                A = itemList[0]
                authorA = authorsDic.get(int(A), '0')
                B = itemList[1]
                authorB = authorsDic.get(int(B), '0')
                SupAB_C = int(support)
                SupA = int(myHeaderTab.get(A, [0])[0])
                SupB = int(myHeaderTab.get(B, [0])[0])
                ConA = float(SupAB_C) / float(SupA)
                ConB = float(SupAB_C) / float(SupB)
                (C, authorC, SupC, ConC) = ('', '', 0.0, 0.0)
                # BUGFIX: zero-init so itemsets with more than 3 members are
                # filtered out instead of reusing the previous iteration's values.
                MinCon = MaxCon = 0.0
                if len(itemList) == 3:
                    C = itemList[2]
                    authorC = authorsDic.get(int(C), '0')
                    SupC = int(myHeaderTab.get(C, [0])[0])
                    ConC = float(SupAB_C) / float(SupC)
                    MinCon = min([ConA, ConB, ConC])
                    MaxCon = max([ConA, ConB, ConC])
                elif len(itemList) == 2:
                    MinCon = min([ConA, ConB])
                    MaxCon = max([ConA, ConB])
                # Confidence filter.
                if MinCon < 0.4 or MaxCon < 0.5 or (MinCon + MaxCon) / 2 < 0.5:
                    continue
                result.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n"
                             % (authorA, authorB, authorC, SupAB_C,
                                SupA, SupB, SupC, ConA, ConB, ConC, MinCon, MaxCon))
                # BUGFIX: the original format string had a stray "\%6.4f".
                result2.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n"
                              % (A, B, C, SupAB_C, SupA, SupB, SupC,
                                 ConA, ConB, ConC, MinCon, MaxCon))
    print("Finished !")