├── .idea
├── Top10_Algorithms_in_DataMining.iml
├── encodings.xml
├── misc.xml
├── modules.xml
└── workspace.xml
├── Apriori
└── Apriori.py
├── C4.5
└── C4.5.py
├── CART
├── Cart.py
└── testSet
├── EM
└── em.py
├── K-means
├── Kmeans.py
└── testSet
├── KNN
└── KNN.py
├── PageRank
└── pagerank.py
└── README.md
/.idea/Top10_Algorithms_in_DataMining.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 | 1458570207194
32 |
33 | 1458570207194
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/Apriori/Apriori.py:
--------------------------------------------------------------------------------
1 | '''
2 | @version 0.1
3 | @date 2016-03-21
4 | @reference
5 | '''
6 |
7 |
8 | from numpy import *
9 |
def loadDataSet():
    """Return a tiny hard-coded transaction database for demo runs."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
12 |
def createC1(dataSet):
    """Build the sorted list of candidate 1-item sets from the transactions.

    Each candidate is a frozenset so it can be used as a dict key later.
    Wrapped in list() because on Python 3 ``map`` returns a one-shot
    iterator, which breaks the repeated scans performed by scanD/apriori.
    """
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))
def scanD(D, Ck, minSupport):
    """Count candidate itemsets Ck over transactions D and filter by support.

    D: iterable of transaction sets; Ck: iterable of frozenset candidates.
    Returns (retList, supportData): candidates whose support >= minSupport,
    and a dict of the support of EVERY counted candidate (including the
    ones below threshold — apriori relies on that for rule mining).
    ``dict.has_key`` was removed on Python 3; ``get`` covers both versions.
    """
    D = list(D)    # materialize so len() works and we can iterate repeatedly
    Ck = list(Ck)  # Ck may be a one-shot iterator on Python 3
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
39 |
def aprioriGen(Lk, k):
    """Generate candidate k-item sets by joining frequent (k-1)-item sets.

    Two sets are joined only when their first k-2 sorted elements agree,
    which avoids producing the same candidate twice.
    """
    candidates = []
    total = len(Lk)
    for i in range(total):
        for j in range(i + 1, total):
            prefix_a = sorted(list(Lk[i])[:k - 2])
            prefix_b = sorted(list(Lk[j])[:k - 2])
            if prefix_a == prefix_b:
                candidates.append(Lk[i] | Lk[j])
    return candidates
50 |
def apriori(dataSet, minSupport=0.5):
    """Run the Apriori algorithm over a list of transactions.

    Returns (L, supportData): L is a list of frequent-itemset lists by size
    (L[0] holds 1-item sets, ...; the final entry is empty), supportData maps
    every counted itemset to its support. ``list(map(set, ...))`` is required
    on Python 3, where a bare map iterator would be exhausted after the
    first scan and make every later scan see an empty database.
    """
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)  # scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
64 |
65 |
if __name__ == '__main__':
    # Demo run on the built-in toy transactions.
    dataSet = loadDataSet()
    L, suppData = apriori(dataSet)
    # print() with a single argument behaves the same on Python 2 and 3;
    # the original print statement is a SyntaxError on Python 3.
    print(L)
--------------------------------------------------------------------------------
/C4.5/C4.5.py:
--------------------------------------------------------------------------------
1 | '''
2 | @version 0.1
3 | @date 2016-04-06
4 | @author yangmu
5 | '''
6 |
7 | '''
8 | The implementation of decision tree which use the information gain ratio.
9 | '''
10 |
11 | from math import log
12 | import operator
13 |
def calcShannonEnt(dataSet):
    """Shannon entropy of the class labels (last column) of dataSet."""
    total = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        label = featVec[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    entropy = 0.0
    for count in labelCounts.values():
        prob = float(count) / total
        entropy -= prob * log(prob, 2)
    return entropy
27 |
def splitDataSet(dataSet, axis, value):
    """Rows whose feature at `axis` equals `value`, with that column removed."""
    result = []
    for row in dataSet:
        if row[axis] != value:
            continue
        result.append(row[:axis] + row[axis + 1:])
    return result
36 |
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index with the highest information gain ratio (C4.5).

    Returns -1 when no feature yields a positive gain ratio.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestFeature, bestRatio = -1, 0.0
    for featIdx in range(numFeatures):
        values = {row[featIdx] for row in dataSet}
        subsetEntropy, splitInfo = 0.0, 0.0
        for value in values:
            subset = splitDataSet(dataSet, featIdx, value)
            weight = len(subset) / float(len(dataSet))
            subsetEntropy += weight * calcShannonEnt(subset)
            splitInfo -= weight * log(weight, 2)
        if splitInfo == 0:
            # single-valued feature: gain ratio is undefined (division by zero)
            continue
        ratio = (baseEntropy - subsetEntropy) / splitInfo
        if ratio > bestRatio:
            bestRatio, bestFeature = ratio, featIdx
    return bestFeature
60 |
def majorityCnt(classList):
    """Return the most common class label in classList.

    Fixes the original ``sorted(classCount.iteritems(), ..., reversed=True)``:
    ``dict.iteritems`` does not exist on Python 3, and ``sorted`` takes the
    keyword ``reverse`` (not ``reversed``), so the original raised TypeError
    whenever it was reached.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
69 |
def createTree(dataSet, labels):
    """Recursively build a C4.5 decision tree as nested dicts.

    dataSet: rows of feature values with the class label in the last column.
    labels:  feature names aligned with the feature columns. NOTE: this list
             is mutated (the chosen feature's name is deleted), so callers
             should pass a copy.
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # All samples share one class: stop splitting and return that class.
        return classList[0]
    if len(dataSet[0]) == 1:
        # All features consumed: fall back to the majority class.
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    # All values taken by the chosen feature within this node's data.
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # Copy so each recursive branch sees its own label list.
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
89 |
def classify(inputTree, featLabels, testVec):
    """Classify one sample with a decision tree built by createTree.

    Returns the predicted label, or None when the sample's feature value has
    no branch in the tree (the original left ``classLabel`` unassigned in
    that case and raised UnboundLocalError).
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
            break  # at most one branch can match
    return classLabel
101 |
def classifyAll(inputTree, featLabels, testDataSet):
    """Classify every sample in testDataSet; returns the list of labels."""
    return [classify(inputTree, featLabels, vec) for vec in testDataSet]
107 |
def storeTree(inputTree, filename):
    """Serialize a decision tree to `filename` with pickle.

    Uses a context manager so the file handle is closed even when dump()
    raises (the original closed it only on the happy path).
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
113 |
def grabTree(filename):
    """Load a pickled decision tree from `filename`.

    The original never closed the file handle; `with` guarantees closure.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
118 |
def createDataSet():
    """Return the toy weather dataset and its feature names.

    outlook-> 0: sunny | 1: overcast | 2: rain
    temperature-> 0: hot | 1: mild | 2: cool
    humidity-> 0: high | 1: normal
    windy-> 0: false | 1: true
    """
    samples = [
        [0, 0, 0, 0, 'N'],
        [0, 0, 0, 1, 'N'],
        [1, 0, 0, 0, 'Y'],
        [2, 1, 0, 0, 'Y'],
        [2, 2, 1, 0, 'Y'],
        [2, 2, 1, 1, 'N'],
        [1, 2, 1, 1, 'Y'],
    ]
    featureNames = ['outlook', 'temperature', 'humidity', 'windy']
    return samples, featureNames
135 |
if __name__ == '__main__':
    # Build the demo tree on a copy of labels, since createTree mutates
    # the list it is given.
    dataSet, labels = createDataSet()
    decisionTree = createTree(dataSet, labels[:])
    print(decisionTree)
--------------------------------------------------------------------------------
/CART/Cart.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Feb 4, 2011
3 | Tree-Based Regression Methods
4 | @author: Peter Harrington
5 | @reference: Machine Learning in Action
6 | '''
7 | from numpy import *
8 |
def loadDataSet(fileName):
    """Parse a tab-delimited file of floats into a list of rows.

    The last column is assumed to be the target value. Fixes two issues:
    the original never closed the file handle, and on Python 3 the bare
    ``map(float, ...)`` appended a lazy iterator instead of a row of floats.
    """
    dataMat = []
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            dataMat.append(list(map(float, curLine)))
    return dataMat
17 |
def binSplitDataSet(dataSet, feature, value):
    """Split matrix rows on one feature: (rows with feature > value,
    rows with feature <= value).

    The original applied a trailing ``[0]`` to each result, collapsing every
    split to its first row — the well-known erratum in this Machine Learning
    in Action listing.
    """
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1
22 |
def regLeaf(dataSet):
    """Leaf model for regression trees: the mean of the target (last) column."""
    return mean(dataSet[:, -1])
25 |
def regErr(dataSet):
    """Total squared error of the target column: variance times sample count."""
    return shape(dataSet)[0] * var(dataSet[:, -1])
28 |
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best (feature, value) binary split of a NumPy matrix.

    ops = (tolS, tolN): minimum error reduction and minimum rows per child.
    Returns (None, leaf value) when no worthwhile split exists.

    Fix: iterating ``set(dataSet[:, featIndex])`` fails because slicing a
    matrix column yields 1x1 sub-matrices, which are unhashable;
    ``.T.tolist()[0]`` flattens the column to plain floats first.
    """
    tolS = ops[0]; tolN = ops[1]
    # exit cond 1: all target values identical — nothing to split
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    # best split is the one with the largest reduction in RSS from the mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n - 1):
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # exit cond 2: the improvement is below the tolerance — don't split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # exit cond 3: the winning split leaves a child that is too small
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
55 |
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a CART regression tree from a NumPy matrix.

    Returns either a leaf value or a dict with keys 'spInd', 'spVal',
    'left' (feature > spVal branch) and 'right' (feature <= spVal branch).
    ``is None`` replaces the fragile ``== None`` comparison, which invokes
    __eq__ and is not an identity test.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        return val  # a stop condition was hit: this node is a leaf
    retTree = {'spInd': feat, 'spVal': val}
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
66 |
def isTree(obj):
    """True when obj is an internal tree node (a dict) rather than a leaf value."""
    # isinstance is the idiomatic type test; the original compared
    # type(obj).__name__ against the string 'dict'.
    return isinstance(obj, dict)
69 |
def getMean(tree):
    """Collapse a subtree to one value: the mean of its (collapsed) children."""
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
74 |
def prune(tree, testData):
    """Post-prune `tree` bottom-up against held-out testData.

    Returns the pruned subtree, or a single mean value when merging the two
    children lowers the test error. Fix: the Python 2 ``print "merging"``
    statement is a SyntaxError on Python 3; ``print("merging")`` behaves
    identically on both versions.
    """
    if shape(testData)[0] == 0:
        return getMean(tree)  # no test data reaches this node: collapse it
    if (isTree(tree['right']) or isTree(tree['left'])):
        # route the test data down the same split used during training
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # if both children are now leaves, check whether merging lowers test error
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
                       sum(power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree
93 |
--------------------------------------------------------------------------------
/CART/testSet:
--------------------------------------------------------------------------------
1 | 0.036098 0.155096
2 | 0.993349 1.077553
3 | 0.530897 0.893462
4 | 0.712386 0.564858
5 | 0.343554 -0.371700
6 | 0.098016 -0.332760
7 | 0.691115 0.834391
8 | 0.091358 0.099935
9 | 0.727098 1.000567
10 | 0.951949 0.945255
11 | 0.768596 0.760219
12 | 0.541314 0.893748
13 | 0.146366 0.034283
14 | 0.673195 0.915077
15 | 0.183510 0.184843
16 | 0.339563 0.206783
17 | 0.517921 1.493586
18 | 0.703755 1.101678
19 | 0.008307 0.069976
20 | 0.243909 -0.029467
21 | 0.306964 -0.177321
22 | 0.036492 0.408155
23 | 0.295511 0.002882
24 | 0.837522 1.229373
25 | 0.202054 -0.087744
26 | 0.919384 1.029889
27 | 0.377201 -0.243550
28 | 0.814825 1.095206
29 | 0.611270 0.982036
30 | 0.072243 -0.420983
31 | 0.410230 0.331722
32 | 0.869077 1.114825
33 | 0.620599 1.334421
34 | 0.101149 0.068834
35 | 0.820802 1.325907
36 | 0.520044 0.961983
37 | 0.488130 -0.097791
38 | 0.819823 0.835264
39 | 0.975022 0.673579
40 | 0.953112 1.064690
41 | 0.475976 -0.163707
42 | 0.273147 -0.455219
43 | 0.804586 0.924033
44 | 0.074795 -0.349692
45 | 0.625336 0.623696
46 | 0.656218 0.958506
47 | 0.834078 1.010580
48 | 0.781930 1.074488
49 | 0.009849 0.056594
50 | 0.302217 -0.148650
51 | 0.678287 0.907727
52 | 0.180506 0.103676
53 | 0.193641 -0.327589
54 | 0.343479 0.175264
55 | 0.145809 0.136979
56 | 0.996757 1.035533
57 | 0.590210 1.336661
58 | 0.238070 -0.358459
59 | 0.561362 1.070529
60 | 0.377597 0.088505
61 | 0.099142 0.025280
62 | 0.539558 1.053846
63 | 0.790240 0.533214
64 | 0.242204 0.209359
65 | 0.152324 0.132858
66 | 0.252649 -0.055613
67 | 0.895930 1.077275
68 | 0.133300 -0.223143
69 | 0.559763 1.253151
70 | 0.643665 1.024241
71 | 0.877241 0.797005
72 | 0.613765 1.621091
73 | 0.645762 1.026886
74 | 0.651376 1.315384
75 | 0.697718 1.212434
76 | 0.742527 1.087056
77 | 0.901056 1.055900
78 | 0.362314 -0.556464
79 | 0.948268 0.631862
80 | 0.000234 0.060903
81 | 0.750078 0.906291
82 | 0.325412 -0.219245
83 | 0.726828 1.017112
84 | 0.348013 0.048939
85 | 0.458121 -0.061456
86 | 0.280738 -0.228880
87 | 0.567704 0.969058
88 | 0.750918 0.748104
89 | 0.575805 0.899090
90 | 0.507940 1.107265
91 | 0.071769 -0.110946
92 | 0.553520 1.391273
93 | 0.401152 -0.121640
94 | 0.406649 -0.366317
95 | 0.652121 1.004346
96 | 0.347837 -0.153405
97 | 0.081931 -0.269756
98 | 0.821648 1.280895
99 | 0.048014 0.064496
100 | 0.130962 0.184241
101 | 0.773422 1.125943
102 | 0.789625 0.552614
103 | 0.096994 0.227167
104 | 0.625791 1.244731
105 | 0.589575 1.185812
106 | 0.323181 0.180811
107 | 0.822443 1.086648
108 | 0.360323 -0.204830
109 | 0.950153 1.022906
110 | 0.527505 0.879560
111 | 0.860049 0.717490
112 | 0.007044 0.094150
113 | 0.438367 0.034014
114 | 0.574573 1.066130
115 | 0.536689 0.867284
116 | 0.782167 0.886049
117 | 0.989888 0.744207
118 | 0.761474 1.058262
119 | 0.985425 1.227946
120 | 0.132543 -0.329372
121 | 0.346986 -0.150389
122 | 0.768784 0.899705
123 | 0.848921 1.170959
124 | 0.449280 0.069098
125 | 0.066172 0.052439
126 | 0.813719 0.706601
127 | 0.661923 0.767040
128 | 0.529491 1.022206
129 | 0.846455 0.720030
130 | 0.448656 0.026974
131 | 0.795072 0.965721
132 | 0.118156 -0.077409
133 | 0.084248 -0.019547
134 | 0.845815 0.952617
135 | 0.576946 1.234129
136 | 0.772083 1.299018
137 | 0.696648 0.845423
138 | 0.595012 1.213435
139 | 0.648675 1.287407
140 | 0.897094 1.240209
141 | 0.552990 1.036158
142 | 0.332982 0.210084
143 | 0.065615 -0.306970
144 | 0.278661 0.253628
145 | 0.773168 1.140917
146 | 0.203693 -0.064036
147 | 0.355688 -0.119399
148 | 0.988852 1.069062
149 | 0.518735 1.037179
150 | 0.514563 1.156648
151 | 0.976414 0.862911
152 | 0.919074 1.123413
153 | 0.697777 0.827805
154 | 0.928097 0.883225
155 | 0.900272 0.996871
156 | 0.344102 -0.061539
157 | 0.148049 0.204298
158 | 0.130052 -0.026167
159 | 0.302001 0.317135
160 | 0.337100 0.026332
161 | 0.314924 -0.001952
162 | 0.269681 -0.165971
163 | 0.196005 -0.048847
164 | 0.129061 0.305107
165 | 0.936783 1.026258
166 | 0.305540 -0.115991
167 | 0.683921 1.414382
168 | 0.622398 0.766330
169 | 0.902532 0.861601
170 | 0.712503 0.933490
171 | 0.590062 0.705531
172 | 0.723120 1.307248
173 | 0.188218 0.113685
174 | 0.643601 0.782552
175 | 0.520207 1.209557
176 | 0.233115 -0.348147
177 | 0.465625 -0.152940
178 | 0.884512 1.117833
179 | 0.663200 0.701634
180 | 0.268857 0.073447
181 | 0.729234 0.931956
182 | 0.429664 -0.188659
183 | 0.737189 1.200781
184 | 0.378595 -0.296094
185 | 0.930173 1.035645
186 | 0.774301 0.836763
187 | 0.273940 -0.085713
188 | 0.824442 1.082153
189 | 0.626011 0.840544
190 | 0.679390 1.307217
191 | 0.578252 0.921885
192 | 0.785541 1.165296
193 | 0.597409 0.974770
194 | 0.014083 -0.132525
195 | 0.663870 1.187129
196 | 0.552381 1.369630
197 | 0.683886 0.999985
198 | 0.210334 -0.006899
199 | 0.604529 1.212685
200 | 0.250744 0.046297
201 |
--------------------------------------------------------------------------------
/EM/em.py:
--------------------------------------------------------------------------------
1 | '''
2 | @version 0.1
3 | @date 2016-05-01
4 | @author yangmu
5 | '''
6 |
7 | import math
8 |
# Initial parameters for the three-coin EM demo: P(coin A = heads) drives
# which of coins B (bias p) and C (bias q) is tossed.
pi, p, q = 0.4, 0.6, 0.7

# Observed coin-toss outcomes.
x = [1, 1, 0, 1, 0, 0, 1, 0, 1, 1]
12 |
def cal_u(pi1, p1, q1, xi):
    """E-step responsibility: probability that observation xi came from coin B."""
    likelihood_b = pi1 * math.pow(p1, xi) * math.pow(1 - p1, 1 - xi)
    likelihood_c = (1 - pi1) * math.pow(q1, xi) * math.pow(1 - q1, 1 - xi)
    return likelihood_b / float(likelihood_b + likelihood_c)
15 |
def e_step(pi1, p1, q1, x):
    """Compute the responsibility of coin B for every observation in x."""
    responsibilities = []
    for xi in x:
        responsibilities.append(cal_u(pi1, p1, q1, xi))
    return responsibilities
18 |
def m_step(u, x):
    """M-step: re-estimate [pi, p, q] from responsibilities u and data x."""
    count = len(u)
    total_u = sum(u)
    pi1 = total_u / count
    p1 = sum([u[i] * x[i] for i in range(count)]) / total_u
    q1 = sum([(1 - u[i]) * x[i] for i in range(count)]) / sum([1 - u[i] for i in range(count)])
    return [pi1, p1, q1]
24 |
def run(start_x, start_pi, start_p, start_q, iter_num):
    """Iterate EM from the given start parameters on the data start_x.

    Fixes in this version: the original ignored `start_x` and always read the
    module-level global `x`; it also evaluated m_step twice per iteration and
    returned nothing. The final [pi, p, q] is now returned (callers that
    discarded the previous None return are unaffected).
    """
    params = [start_pi, start_p, start_q]
    for i in range(iter_num):
        u = e_step(params[0], params[1], params[2], start_x)
        print(i, params)
        new_params = m_step(u, start_x)
        if params == new_params:
            break  # converged: parameters reached a fixed point
        params = new_params
    return params
33 |
if __name__=='__main__':
    # Run EM on the observed tosses for at most 100 iterations.
    run(x,pi,p,q,100)
--------------------------------------------------------------------------------
/K-means/Kmeans.py:
--------------------------------------------------------------------------------
1 | from numpy import *
2 | import matplotlib.pyplot as plt
3 |
4 | '''
5 | @author yangmu
6 | @version 0.1
7 | @date 2016-03-20
8 | '''
9 |
def loadDataSet(fileName):
    """Parse a tab-delimited file of floats into a list of point rows.

    Fixes two issues: the original never closed the file handle, and on
    Python 3 the bare ``map(float, ...)`` appended a lazy iterator instead
    of a list of floats.
    """
    dataMat = []
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            dataMat.append(list(map(float, curLine)))
    return dataMat
18 |
def distEclud(vecA, vecB):
    """Euclidean distance between two row vectors."""
    diff = vecA - vecB
    return sqrt(sum(power(diff, 2)))
21 |
def randCent(dataSet, k):
    """Draw k random centroids, each feature uniform within the data's range."""
    numFeats = shape(dataSet)[1]
    centroids = mat(zeros((k, numFeats)))
    for featIdx in range(numFeats):
        lowest = min(dataSet[:, featIdx])
        span = float(max(dataSet[:, featIdx]) - lowest)
        centroids[:, featIdx] = mat(lowest + span * random.rand(k, 1))
    return centroids
30 |
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Lloyd's k-means over the rows of matrix dataSet.

    Returns (centroids, clusterAssment); clusterAssment[i] holds the
    assigned cluster index and the squared distance to that centroid.
    Iterates until no point changes cluster. Fix: the Python 2
    ``print centroids`` statement is a SyntaxError on Python 3;
    ``print(centroids)`` behaves identically on both versions.
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # assignment step: attach each point to its nearest centroid
        for i in range(m):
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # update step: move each centroid to the mean of its points
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
52 |
53 | def showCluster(dataSet, k, centroids, clusterAssment):
54 | numSamples, dim = dataSet.shape
55 | if dim != 2:
56 | return 1
57 |
58 | mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', ' maxerr:
25 | ro = r.copy()
26 | # calculate each pagerank at a time
27 | for i in xrange(0,n):
28 | # inlinks of state i
29 | Ii = np.array(M[:,i].todense())[:,0]
30 | # account for sink states
31 | Si = sink / float(n)
32 | # account for teleportation to state i
33 | Ti = np.ones(n) / float(n)
34 |
35 | r[i] = ro.dot( Ii*s + Si*s + Ti*(1-s) )
36 |
37 | return r/sum(r)
38 |
39 |
40 |
41 |
if __name__ == '__main__':
    # Adjacency matrix: websites[i, j] == 1 when page i links to page j.
    websites = np.array([[0, 0, 1, 0, 0, 0, 0],
                         [0, 1, 1, 0, 0, 0, 0],
                         [1, 0, 1, 1, 0, 0, 0],
                         [0, 0, 0, 1, 1, 0, 0],
                         [0, 0, 0, 0, 0, 0, 1],
                         [0, 0, 0, 0, 0, 1, 1],
                         [0, 0, 0, 1, 1, 0, 1]])

    page_value = pageRank(websites, s=.86)
    # print() with a single argument behaves the same on Python 2 and 3;
    # the original print statement is a SyntaxError on Python 3.
    print(page_value)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Top10_Algorithms_in_DataMining
2 | ## Hi, there! This repository mainly contains the implementation source code of top 10 algorithms in datamining.
* Most demos use Python as the programming language.
4 | * If you are interested in this work, join me :).
5 | * Any questions or suggestions, please mail yangmuted@163.com
--------------------------------------------------------------------------------