#!/usr/bin/env python
# mapper.py -- Hadoop-streaming mapper for round 1 (a word-count over author ids).
import sys
import codecs
from itertools import groupby
from operator import itemgetter


def read_input(file):
    # Split each comma-separated line of encoded author ids into tokens.
    for line in file:
        yield line.split(',')


def main():
    """Emit one "<id>\t1" record per numeric author-id token read from stdin."""
    data = read_input(sys.stdin)
    for names in data:
        for name in names:
            # isdigit() drops the trailing '\r\n' marker token left by encode.py.
            if name.isdigit():
                print('%s\t%d' % (name, 1))


if __name__ == "__main__":
    main()


# reducer.py -- Hadoop-streaming reducer for round 1: sum the counts per author id.
def readMapOutput(file):
    # Yield [key, count] pairs from the tab-separated mapper output.
    for line in file:
        yield line.strip().split('\t')


def main():  # reducer entry point (a separate file in the original layout)
    data = readMapOutput(sys.stdin)
    # groupby relies on Hadoop having sorted the map output by key.
    for currentName, group in groupby(data, itemgetter(0)):
        try:
            total_count = sum(int(count) for _, count in group)
            print("%s\t%d" % (currentName, total_count))
        except ValueError:
            # Skip malformed counts rather than aborting the whole reduce task.
            pass


if __name__ == "__main__":
    main()


# encode.py -- replace each author name with a stable integer id.
def encode_authors(source, result, index):
    """Encode comma-separated author lines into integer-id lines.

    source: iterable of text lines, one paper per line ("nameA,nameB,\r\n").
    result: writable text stream; receives the id-encoded lines.
    index:  writable text stream; receives one name per line, so a name's id
            is its line number minus 1 in this file (ids start at 0).
    Returns the {name: [id, frequency]} dictionary that was built.
    """
    index_dic = {}
    name_id = 0
    for line in source:
        name_list = line.split(',')
        for name in name_list:
            if not (name == '\r\n'):  # skip the end-of-line marker token
                if name in index_dic:
                    index_dic[name][1] += 1
                else:
                    index_dic[name] = [name_id, 1]
                    index.write(name + u'\r\n')
                    name_id += 1
                result.write(str(index_dic[name][0]) + u',')
        result.write(u'\r\n')
    return index_dic


if __name__ == '__main__':
    source = codecs.open('authors.txt', 'r', 'utf-8')
    result = codecs.open('authors_encoded.txt', 'w', 'utf-8')
    index = codecs.open('authors_index.txt', 'w', 'utf-8')
    try:
        encode_authors(source, result, index)
    finally:
        # Always release the handles, even if encoding fails midway.
        source.close()
        result.close()
        index.close()


# README.md
# DBLP-Coauthor-Mining
(从DBLP数据集中挖掘合作者) 2 | ============= 3 | 详细说明 4 | ------------- 5 | 详细说明请查看 [数据挖掘实战之DBLP中合作者挖掘(Python+Hadoop)](http://www.tianjun.ml/essays/20) 6 | 7 | 文件说明 8 | ------------- 9 | ### getAuthors.py 10 | 下载DBLP数据集``dblp.xml``到该目录下,[http://dblp.uni-trier.de/xml/](http://dblp.uni-trier.de/xml/)
运行``getAuthors.py`` 得到``authors.txt``文件 11 | ### encode.py 12 | 运行该文件后将对上一步得到的``authors.txt``文件编码(按照作者姓名出现的顺序依次以正整数编码)得到编码后的文件``authors_encoded.txt``,以及作者姓名与编码对应的文件``authors_index.txt``,其对应关系为姓名所在的行号减1即为其编码ID(ID从0开始) 13 | ### view_data.py 14 | 读取``authors.txt``,统计不同支持度下有多少作者,同时绘制曲线,确定支持度阈值大概范围 15 | ### final.py 16 | 主要借鉴了《机器学习实战》中的例子,将结果写入了 ``result*.txt``文件,注意最后的结果增加了置信度过滤。 17 | ### mapper.py & reducer.py 18 | 第一轮MapReduce的Map和Reduce所用到的文件,其实质就是一个wordCount的过程 19 | ### mapper2.py & reducer2.py 20 | 第二轮MapReduce的Map和Reduce所用到的文件,注意在这里的输出并没有给出完整的挖掘结果,而是输出的条件模式集,有空的话再转化一下。(本实验目的只是验证FP-growth在分布式下实现的可能性,所以没有给出完整的结果) 21 | ### viewRelation.py 22 | 添加了作者与其合作者之间的可视化功能,使用了networkx包。 23 | 24 | -------------------------------------------------------------------------------- /view_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from matplotlib.font_manager import FontProperties 3 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 4 | import codecs 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | data = codecs.open('authors_encoded.txt','r','utf-8') 8 | word_counts = {} 9 | maxCounts = 0 10 | for line in data: 11 | line = line.split(',') 12 | for word in line[0:-1]: 13 | word_counts[word] = word_counts.get(word,0) + 1 14 | if word_counts[word] > maxCounts: 15 | maxCounts = word_counts[word] 16 | maxKey = word 17 | 18 | xMax = maxCounts 19 | data.close() 20 | bins = {} 21 | for k,v in word_counts.iteritems(): 22 | bins[v] = bins.get(v,0) + 1 23 | 24 | y = [] 25 | for i in range(40, 200): 26 | y.append(bins.get(i,0)) 27 | plt.plot(y,'-'); 28 | plt.grid() 29 | plt.yticks(range(0,1000,100)) 30 | plt.xticks(range(0,160,20),range(40,200,20)) 31 | plt.xlabel(u'支持度',fontproperties=font) 32 | plt.ylabel(u'对应支持度下的作者个数',fontproperties=font) 33 | plt.title(u'作者数量与支持度之间的对应关系',fontproperties=font) 34 | plt.show() 35 | 
#!/usr/bin/env python
# mapper2.py -- round-2 mapper: emit a conditional pattern base per transaction.
import sys
import codecs
from itertools import groupby
from operator import itemgetter
from xml.sax import handler, make_parser


def creatDic(path='sortedList'):
    """Load the "<id>\t<count>" frequency list produced by round 1.

    `path` defaults to the original hard-coded file name, so existing
    streaming jobs keep working unchanged.
    """
    freqDic = {}
    with open(path, 'r') as sortedList:
        for line in sortedList:
            line = line.strip().split('\t')
            freqDic[int(line[0])] = int(line[1])
    return freqDic


def read_input(inFile):
    # Split each comma-separated line of encoded author ids into tokens.
    for line in inFile:
        yield line.split(',')


def main(freqDic, minSup):
    """For each transaction print "<item>\t<suffix item list>" records.

    Items are kept only when globally frequent (count >= minSup) and are
    ordered by ascending support, so each printed suffix is a conditional
    pattern base for the leading item.
    """
    data = read_input(sys.stdin)
    for names in data:
        names = {name: freqDic[int(name)] for name in names
                 if name.isdigit() and freqDic.get(int(name), 0) >= minSup}
        lenth = len(names)
        if lenth >= 2:
            conPatItems = [name for name, value in
                           sorted(names.items(), key=lambda p: p[1])]
            for i in range(lenth - 1):
                print("%s\t%s" % (conPatItems[i], conPatItems[i + 1::]))
        else:
            continue


if __name__ == '__main__':
    support = 100
    dic = creatDic()
    main(dic, support)


# reducer2.py -- round-2 reducer: count the items inside each leading item's
# conditional pattern bases.
def readMapOutput(file):
    for line in file:
        yield line.strip().split('\t')


def main(minSup):  # reducer entry point (a separate file in the original layout)
    """Print "<item>\t[(co-item, count), ...]" keeping co-items with count >= minSup."""
    data = readMapOutput(sys.stdin)
    for currentName, group in groupby(data, itemgetter(0)):
        localDic = {}
        try:
            for currentName, conPatItems in group:
                # The mapper printed a Python list repr; strip "[", "]" and quotes.
                conPatItems = conPatItems.strip().strip('[').strip(']')
                itemList = conPatItems.split(',')
                for item in itemList:
                    item = item.strip().strip("'")
                    item = int(item)
                    localDic[item] = localDic.get(item, 0) + 1
            resultDic = {k: v for k, v in localDic.items() if v >= minSup}
            # Here we just print out 2-coauthors; list(...) keeps the Python 2
            # output format ("[(k, v), ...]") under Python 3 as well.
            if len(resultDic) >= 1:
                print("%s\t%s" % (currentName, list(resultDic.items())))
        except Exception:
            # Keep the reduce task alive on malformed records (was a bare except).
            print("%s\t%s" % ("inner err", "sorry!"))


if __name__ == "__main__":
    support = 100
    main(support)


# getAuthors.py -- SAX-parse dblp.xml into "a,b,c,\r\n" author lines, one per paper.
paper_tag = ('article', 'inproceedings', 'proceedings', 'book',
             'incollection', 'phdthesis', 'mastersthesis', 'www')


class mHandler(handler.ContentHandler):
    """SAX handler that writes the authors of every paper element to `result`."""

    def __init__(self, result):
        self.result = result
        self.flag = 0  # 1 while inside an <author> element

    def startDocument(self):
        print('Document Start')

    def endDocument(self):
        print('Document End')

    def startElement(self, name, attrs):
        if name == 'author':
            self.flag = 1

    def endElement(self, name):
        if name == 'author':
            self.result.write(',')
            self.flag = 0
        if (name in paper_tag):
            # One output line per paper record.
            self.result.write('\r\n')

    def characters(self, chrs):
        # SAX may deliver an element's text in several chunks; append them all.
        if self.flag:
            self.result.write(chrs)


def parserDblpXml(source, result):
    """Parse the DBLP XML stream `source`, writing author lines to `result`."""
    # Renamed local (was `handler`) so it no longer shadows the xml.sax.handler module.
    content_handler = mHandler(result)
    parser = make_parser()
    parser.setContentHandler(content_handler)
    parser.parse(source)


if __name__ == '__main__':
    source = codecs.open('dblp.xml', 'r', 'utf-8')
    result = codecs.open('authors.txt', 'w', 'utf-8')
    parserDblpXml(source, result)
    result.close()
    source.close()


# /viewRelation.py -- co-author network visualization (continues on the
# following lines; its heavy third-party imports sit with the plotting code).
# -*- coding: utf-8 -*-
import itertools
import codecs


def createEdge(nodeX, path='authors.txt'):
    """Yield one (coauthor, nodeX) edge per co-occurrence of nodeX in `path`.

    `path` defaults to the original hard-coded file name.
    """
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            if nodeX in line and len(line) > 1:
                line.remove(nodeX)
                for author in line:
                    yield (author, nodeX)


def makeFreqDic(path='authors.txt'):
    """Count how many papers each author in `path` appears on."""
    print("Creating FreqDic...")
    with codecs.open(path, 'r', 'utf-8') as f:
        freqDic = {}
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            for author in line:
                freqDic[author] = freqDic.get(author, 0) + 1
        return freqDic


def main(freqDic, nodeX):
    """Draw nodeX's co-author network and save it as "<nodeX>.png".

    The third-party imports live here so the pure-computation helpers above
    stay usable without networkx/matplotlib installed.
    """
    import networkx as nx
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.font_manager import FontProperties
    # NOTE(review): Windows-only font path; adjust on other platforms.
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc")
    G = nx.Graph()
    print("Adding edge...")
    for A, B in createEdge(nodeX):
        # Accumulate an edge weight = number of co-authored papers.
        edgeDic = G.get_edge_data(A, B, default={'weight': 0})
        G.add_edge(A, B, weight=edgeDic['weight'] + 1)
    nodes = G.nodes()
    nodes.remove(nodeX)
    shells = [[nodeX], nodes]  # nodeX at the center, coauthors around it
    pos = nx.shell_layout(G, shells)
    print("Drawing nodes...")
    # NOTE(review): nodes_iter/draw_networkx_label are networkx 1.x APIs.
    nodeSize = [10 * freqDic[n] for n, dic in G.nodes_iter(data=True)]
    nodeColors = np.random.rand(len(nodeSize))
    nx.draw_networkx_nodes(G, pos, node_size=nodeSize, node_color=nodeColors, alpha=0.7)
    print("Drawing edges...")
    edgeWidth = [edata['weight'] for u, v, edata in G.edges(data=True)]
    edgeColor = np.random.rand(G.number_of_edges())
    nx.draw_networkx_edges(G, pos, width=edgeWidth, edge_color=edgeColor, alpha=0.35)
    print("Adding label...")
    # Only label prolific authors (>= 80 papers) plus the center node itself.
    select_labels = {n: n for n, d in G.nodes_iter(data=True) if freqDic[n] >= 80}
    select_labels[nodeX] = nodeX
    nx.draw_networkx_label(G, pos, labels=select_labels, font_size=8, alpha=0.3)
    title = str(nodeX) + u"与其合作者之间的关系网络"
    plt.title(title, size=15, fontproperties=font)
    plt.text(0.5, 0.94, u"# 节点大小对应该作者发表文章总次数",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.text(0.5, 0.97, u"# 节点之间连线粗细对应该两个作者一起发表文章总次数",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.axis('off')
    fileName = str(nodeX) + ".png"
    plt.savefig(fileName, transparent=True, dpi=500)
    plt.show()


if __name__ == '__main__':
    freqDic = makeFreqDic()
    nodeX = u'Irith Pomeranz'
    main(freqDic, nodeX)


# final.py -- single-machine FP-growth over the encoded author file, after the
# FP-growth chapter of "Machine Learning in Action".
class treeNode:
    """One node of the FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None  # chains nodes of the same item for the header table
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        self.count += numOccur

    def disp(self, ind=1):
        # Debug helper: print the subtree with indentation.
        print(' ' * ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind + 1)


def createTree(dataSet, minSup=1):
    """Build an FP-tree from {frozenset(transaction): count}; no mining here."""
    freqDic = {}
    for trans in dataSet:  # first pass: global item frequencies
        for item in trans:
            freqDic[item] = freqDic.get(item, 0) + dataSet[trans]
    headerTable = {k: v for (k, v) in freqDic.items() if v >= minSup}
    if len(headerTable) == 0:
        return None, None  # no item meets min support
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # reformat to [count, node-link head]
    retTree = treeNode('Null Set', 1, None)
    for tranSet, count in dataSet.items():  # second pass: insert transactions
        localD = {}
        for item in tranSet:
            if headerTable.get(item, 0):
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            # BUGFIX: break count ties by item name so ordering is identical
            # across transactions; set-iteration tie order is otherwise
            # arbitrary, which corrupts FP-growth support counts.
            orderedItems = [v[0] for v in sorted(localD.items(),
                                                 key=lambda p: (p[1], p[0]),
                                                 reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable


def updateTree(items, inTree, headerTable, count):
    """Insert an ordered transaction prefix into the tree, updating node links."""
    if items[0] in inTree.children:
        inTree.children[items[0]].inc(count)
    else:
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None:  # first node of this item
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # recurse with the remaining ordered items
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)


def updateHeader(nodeToTest, targetNode):
    # Walk to the end of the node-link list iteratively (no recursion).
    while (nodeToTest.nodeLink != None):
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode


def ascendTree(leafNode, prefixPath):
    """Collect names from a leaf node up to (excluding) the root."""
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)


def findPrefixPath(basePat, treeNode):
    """Return {frozenset(prefix path): count} for every node of basePat."""
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats


def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively mine frequent itemsets into the freqItemList dict."""
    # BUGFIX: sort by the support count p[1][0] only; p[1] is [count, nodeLink]
    # and comparing whole pairs raises TypeError on count ties in Python 3.
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:  # start from the bottom of the header table
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        if len(newFreqSet) > 1:
            freqItemList[frozenset(newFreqSet)] = headerTable[basePat][0]
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        # Build and mine the conditional FP-tree for this pattern base.
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead != None:
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


def loadSimpDat(inFile):
    """Read comma-separated id lines into {frozenset(ids): occurrence count}."""
    dataSet = {}
    for line in inFile:
        line = line.strip().split(',')
        dataLine = [word for word in line if word.isdigit()]
        key = frozenset(dataLine)
        dataSet[key] = dataSet.get(key, 0) + 1
    return dataSet


if __name__ == "__main__":
    minSup = 100
    print("Reading Source File ... Wait...")
    with open('authors_encoded.txt', 'r') as f:
        dataSet = loadSimpDat(f)

    print("Constructing FP-tree ... Wait...")
    myFPtree, myHeaderTab = createTree(dataSet, minSup)

    print("Mining frequent items ... Wait...")
    myFreqList = {}
    mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    print("Totally %d frequent itemsets found ! " % len(myFreqList))
    print("Constructing authors_index... Wait...")

    maxCoauthors = 0
    for freqAuthors in myFreqList.keys():
        if len(freqAuthors) > maxCoauthors:
            maxCoauthors = len(freqAuthors)
    print("the max num of coauthors is %d " % (maxCoauthors))

    # Line i of authors_index.txt holds the name of author id i.
    with open('authors_index.txt', 'r') as authorsIndex:
        i = 0
        authorsDic = {}
        for name in authorsIndex:
            name = name.strip()
            authorsDic[i] = name
            i = i + 1

    print("Writing result into result.txt... Wait...")
    header = ("%25s\t%25s\t%15s\t%10s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\n"
              % ('authorA', 'authorB', 'authorC', 'Sup(A,B,C)', 'Sup(A)', 'Sup(B)',
                 'Sup(C)', 'Con(A)', 'Con(B)', 'Con(C)', 'MinCon', 'MaxCon'))
    with open('result4.txt', 'w') as result2:
        with open('result3.txt', 'w') as result:
            result.write(header)
            result2.write(header)
            resultList = sorted(myFreqList.items(), key=lambda p: p[1], reverse=True)
            for itemSet, support in resultList:
                itemList = list(itemSet)
                A = itemList[0]
                authorA = authorsDic.get(int(A), '0')
                B = itemList[1]
                authorB = authorsDic.get(int(B), '0')
                SupAB_C = int(support)
                SupA = int(myHeaderTab.get(A, [0])[0])
                SupB = int(myHeaderTab.get(B, [0])[0])
                ConA = float(SupAB_C) / float(SupA)
                ConB = float(SupAB_C) / float(SupB)
                (C, authorC, SupC, ConC) = ('', '', 0.0, 0.0)
                # BUGFIX: zero-init so itemsets with more than 3 members are
                # filtered out instead of reusing the previous iteration's values.
                MinCon = MaxCon = 0.0
                if len(itemList) == 3:
                    C = itemList[2]
                    authorC = authorsDic.get(int(C), '0')
                    SupC = int(myHeaderTab.get(C, [0])[0])
                    ConC = float(SupAB_C) / float(SupC)
                    MinCon = min([ConA, ConB, ConC])
                    MaxCon = max([ConA, ConB, ConC])
                elif len(itemList) == 2:
                    MinCon = min([ConA, ConB])
                    MaxCon = max([ConA, ConB])
                # Confidence filter.
                if MinCon < 0.4 or MaxCon < 0.5 or (MinCon + MaxCon) / 2 < 0.5:
                    continue
                result.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n"
                             % (authorA, authorB, authorC, SupAB_C,
                                SupA, SupB, SupC, ConA, ConB, ConC, MinCon, MaxCon))
                # BUGFIX: the original format string had a stray "\%6.4f".
                result2.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n"
                              % (A, B, C, SupAB_C, SupA, SupB, SupC,
                                 ConA, ConB, ConC, MinCon, MaxCon))
    print("Finished !")