├── .idea
│   ├── Top10_Algorithms_in_DataMining.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── workspace.xml
├── Apriori
│   └── Apriori.py
├── C4.5
│   └── C4.5.py
├── CART
│   ├── Cart.py
│   └── testSet
├── EM
│   └── em.py
├── K-means
│   ├── Kmeans.py
│   └── testSet
├── KNN
│   └── KNN.py
├── PageRank
│   └── pagerank.py
└── README.md

/Apriori/Apriori.py:
--------------------------------------------------------------------------------
'''
@version 0.1
@date 2016-03-21
@reference
'''

def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    # use frozenset so an itemset can be used as a key in a dict
    return map(frozenset, C1)

def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt: ssCnt[can] = 1
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData

def aprioriGen(Lk, k):  # creates Ck
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk):
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1 == L2:  # if the first k-2 elements are equal
                retList.append(Lk[i] | Lk[j])  # set union
    return retList

def apriori(dataSet, minSupport=0.5):
    C1 = createC1(dataSet)
    D = map(set, dataSet)
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)  # scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


if __name__ == '__main__':
    dataSet = loadDataSet()
    L, suppData = apriori(dataSet)
    print L
--------------------------------------------------------------------------------
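Usage note (not part of Apriori.py): the demo in __main__ is deterministic, so the frequent itemsets can be checked by hand against the four transactions.

# Expected contents of L for minSupport = 0.5 (order within a level may vary):
#   L[0]: {1}, {2}, {3}, {5}          -- {4} occurs in only 1 of 4 transactions
#   L[1]: {1,3}, {2,3}, {2,5}, {3,5}  -- {2,5} has support 0.75, the others 0.5
#   L[2]: {2,3,5}                     -- support 0.5
#   L[3]: []                          -- the empty level that terminates the loop
# suppData maps every counted candidate itemset to its support fraction.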
/C4.5/C4.5.py:
--------------------------------------------------------------------------------
'''
@version 0.1
@date 2016-04-06
@author yangmu
'''

'''
A decision-tree implementation that splits on the information gain ratio (C4.5).
'''

from math import log
import operator

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]
            reduceFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGainRatio = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        splitInfo = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
            splitInfo += -prob * log(prob, 2)
        infoGain = baseEntropy - newEntropy
        if (splitInfo == 0):
            # splitInfo is 0 when the feature takes a single value;
            # skip it to avoid division by zero
            continue
        infoGainRatio = infoGain / splitInfo
        if (infoGainRatio > bestInfoGainRatio):
            bestInfoGainRatio = infoGainRatio
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # all labels are identical: stop splitting
        return classList[0]
    if len(dataSet[0]) == 1:
        # all features used up: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    # collect every value the chosen attribute takes at this node
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None  # stays None for attribute values never seen in training
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

def classifyAll(inputTree, featLabels, testDataSet):
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))
    return classLabelAll

def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)

def createDataSet():
    """
    outlook->     0: sunny | 1: overcast | 2: rain
    temperature-> 0: hot   | 1: mild     | 2: cool
    humidity->    0: high  | 1: normal
    windy->       0: false | 1: true
    """
    dataSet = [[0, 0, 0, 0, 'N'],
               [0, 0, 0, 1, 'N'],
               [1, 0, 0, 0, 'Y'],
               [2, 1, 0, 0, 'Y'],
               [2, 2, 1, 0, 'Y'],
               [2, 2, 1, 1, 'N'],
               [1, 2, 1, 1, 'Y']]
    labels = ['outlook', 'temperature', 'humidity', 'windy']
    return dataSet, labels

if __name__ == '__main__':
    dataSet, labels = createDataSet()
    labels_tmp = labels[:]
    decisionTree = createTree(dataSet, labels_tmp)
    print(decisionTree)
--------------------------------------------------------------------------------
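The file defines classify/classifyAll and pickle helpers, but the demo only prints the tree. A short exercise of those functions (illustrative, not part of the original __main__; the .pkl file name is made up):

# Illustrative usage (not in the repository):
dataSet, labels = createDataSet()
tree = createTree(dataSet, labels[:])   # pass a copy: createTree consumes its labels list
# classify two vectors taken from the training data; prints their predicted labels
print(classifyAll(tree, labels, [[2, 1, 0, 0], [1, 2, 1, 1]]))
storeTree(tree, 'c45_tree.pkl')         # 'c45_tree.pkl' is a hypothetical file name
print(grabTree('c45_tree.pkl'))         # round-trips the tree through pickle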
/CART/Cart.py:
--------------------------------------------------------------------------------
'''
Created on Feb 4, 2011
Tree-Based Regression Methods
@author: Peter Harrington
@reference: Machine Learning in Action
'''
from numpy import *

def loadDataSet(fileName):      # general function to parse tab-delimited floats
    dataMat = []                # assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float, curLine)  # map all elements to float()
        dataMat.append(fltLine)
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    # rows where feature > value go left, the rest go right
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

def regLeaf(dataSet):  # returns the value used for each leaf
    return mean(dataSet[:, -1])

def regErr(dataSet):
    return var(dataSet[:, -1]) * shape(dataSet)[0]

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    tolS = ops[0]; tolN = ops[1]
    # if all the target variables are the same value: quit and return value
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:  # exit cond 1
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    # the choice of the best feature is driven by reduction in RSS error from the mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        # iterate over the distinct values in this column (dataSet is a matrix)
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # if the decrease (S - bestS) is less than a threshold, don't do the split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)  # exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  # exit cond 3
        return None, leafType(dataSet)
    # returns the best feature to split on and the value used for that split
    return bestIndex, bestValue

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    # assumes dataSet is a NumPy matrix so we can use array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)  # choose the best split
    if feat is None: return val  # if the splitting hit a stop condition, return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

def isTree(obj):
    return (type(obj).__name__ == 'dict')

def getMean(tree):
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right'])/2.0

def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree)  # if we have no test data, collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])):  # if either branch is a subtree, try to prune it
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # if they are now both leafs, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) +\
            sum(power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right'])/2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print "merging"
            return treeMean
        else: return tree
    else: return tree
--------------------------------------------------------------------------------
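Driver sketch (not in Cart.py): it grows a tree on the CART/testSet file listed next; 'testSetHoldout' is a hypothetical second file in the same tab-delimited format, needed because prune() requires held-out rows.

# Illustrative usage (not in the repository):
myMat = mat(loadDataSet('testSet'))   # the data file below: 200 (x, y) pairs
print(createTree(myMat))              # regression tree with the default ops=(1,4)
# to exercise prune(): overgrow a tree, then collapse it against held-out data
bigTree = createTree(myMat, ops=(0, 1))
print(prune(bigTree, mat(loadDataSet('testSetHoldout'))))  # hypothetical file name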
/CART/testSet:
--------------------------------------------------------------------------------
0.036098 0.155096
0.993349 1.077553
0.530897 0.893462
0.712386 0.564858
0.343554 -0.371700
0.098016 -0.332760
0.691115 0.834391
0.091358 0.099935
0.727098 1.000567
0.951949 0.945255
0.768596 0.760219
0.541314 0.893748
0.146366 0.034283
0.673195 0.915077
0.183510 0.184843
0.339563 0.206783
0.517921 1.493586
0.703755 1.101678
0.008307 0.069976
0.243909 -0.029467
0.306964 -0.177321
0.036492 0.408155
0.295511 0.002882
0.837522 1.229373
0.202054 -0.087744
0.919384 1.029889
0.377201 -0.243550
0.814825 1.095206
0.611270 0.982036
0.072243 -0.420983
0.410230 0.331722
0.869077 1.114825
0.620599 1.334421
0.101149 0.068834
0.820802 1.325907
0.520044 0.961983
0.488130 -0.097791
0.819823 0.835264
0.975022 0.673579
0.953112 1.064690
0.475976 -0.163707
0.273147 -0.455219
0.804586 0.924033
0.074795 -0.349692
0.625336 0.623696
0.656218 0.958506
0.834078 1.010580
0.781930 1.074488
0.009849 0.056594
0.302217 -0.148650
0.678287 0.907727
0.180506 0.103676
0.193641 -0.327589
0.343479 0.175264
0.145809 0.136979
0.996757 1.035533
0.590210 1.336661
0.238070 -0.358459
0.561362 1.070529
0.377597 0.088505
0.099142 0.025280
0.539558 1.053846
0.790240 0.533214
0.242204 0.209359
0.152324 0.132858
0.252649 -0.055613
0.895930 1.077275
0.133300 -0.223143
0.559763 1.253151
0.643665 1.024241
0.877241 0.797005
0.613765 1.621091
0.645762 1.026886
0.651376 1.315384
0.697718 1.212434
0.742527 1.087056
0.901056 1.055900
0.362314 -0.556464
0.948268 0.631862
0.000234 0.060903
0.750078 0.906291
0.325412 -0.219245
0.726828 1.017112
0.348013 0.048939
0.458121 -0.061456
0.280738 -0.228880
0.567704 0.969058
0.750918 0.748104
0.575805 0.899090
0.507940 1.107265
0.071769 -0.110946
0.553520 1.391273
0.401152 -0.121640
0.406649 -0.366317
0.652121 1.004346
0.347837 -0.153405
0.081931 -0.269756
0.821648 1.280895
0.048014 0.064496
0.130962 0.184241
0.773422 1.125943
0.789625 0.552614
0.096994 0.227167
0.625791 1.244731
0.589575 1.185812
0.323181 0.180811
0.822443 1.086648
0.360323 -0.204830
0.950153 1.022906
0.527505 0.879560
0.860049 0.717490
0.007044 0.094150
0.438367 0.034014
0.574573 1.066130
0.536689 0.867284
0.782167 0.886049
0.989888 0.744207
0.761474 1.058262
0.985425 1.227946
0.132543 -0.329372
0.346986 -0.150389
0.768784 0.899705
0.848921 1.170959
0.449280 0.069098
0.066172 0.052439
0.813719 0.706601
0.661923 0.767040
0.529491 1.022206
0.846455 0.720030
0.448656 0.026974
0.795072 0.965721
0.118156 -0.077409
0.084248 -0.019547
0.845815 0.952617
0.576946 1.234129
0.772083 1.299018
0.696648 0.845423
0.595012 1.213435
0.648675 1.287407
0.897094 1.240209
0.552990 1.036158
0.332982 0.210084
0.065615 -0.306970
0.278661 0.253628
0.773168 1.140917
0.203693 -0.064036
0.355688 -0.119399
0.988852 1.069062
0.518735 1.037179
0.514563 1.156648
0.976414 0.862911
0.919074 1.123413
0.697777 0.827805
0.928097 0.883225
0.900272 0.996871
0.344102 -0.061539
0.148049 0.204298
0.130052 -0.026167
0.302001 0.317135
0.337100 0.026332
0.314924 -0.001952
0.269681 -0.165971
0.196005 -0.048847
0.129061 0.305107
0.936783 1.026258
0.305540 -0.115991
0.683921 1.414382
0.622398 0.766330
0.902532 0.861601
0.712503 0.933490
0.590062 0.705531
0.723120 1.307248
0.188218 0.113685
0.643601 0.782552
0.520207 1.209557
0.233115 -0.348147
0.465625 -0.152940
0.884512 1.117833
0.663200 0.701634
0.268857 0.073447
0.729234 0.931956
0.429664 -0.188659
0.737189 1.200781
0.378595 -0.296094
0.930173 1.035645
0.774301 0.836763
0.273940 -0.085713
0.824442 1.082153
0.626011 0.840544
0.679390 1.307217
0.578252 0.921885
0.785541 1.165296
0.597409 0.974770
0.014083 -0.132525
0.663870 1.187129
0.552381 1.369630
0.683886 0.999985
0.210334 -0.006899
0.604529 1.212685
0.250744 0.046297
--------------------------------------------------------------------------------
/EM/em.py:
--------------------------------------------------------------------------------
'''
@version 0.1
@date 2016-05-01
@author yangmu
'''

import math

# initial guesses for the three-coin model: pi, p, q
[pi, p, q] = [0.4, 0.6, 0.7]

# observed flips
x = [1, 1, 0, 1, 0, 0, 1, 0, 1, 1]

def cal_u(pi1, p1, q1, xi):
    # E step for one observation: the responsibility that xi
    # was generated by the first coin, under the current estimates
    num = pi1 * math.pow(p1, xi) * math.pow(1 - p1, 1 - xi)
    den = num + (1 - pi1) * math.pow(q1, xi) * math.pow(1 - q1, 1 - xi)
    return num / float(den)

def e_step(pi1, p1, q1, x):
    return [cal_u(pi1, p1, q1, xi) for xi in x]

def m_step(u, x):
    pi1 = sum(u) / len(u)
    p1 = sum([u[i] * x[i] for i in range(len(u))]) / sum(u)
    q1 = sum([(1 - u[i]) * x[i] for i in range(len(u))]) / sum([1 - u[i] for i in range(len(u))])
    return [pi1, p1, q1]

def run(start_x, start_pi, start_p, start_q, iter_num):
    for i in range(iter_num):
        u = e_step(start_pi, start_p, start_q, start_x)
        print i, [start_pi, start_p, start_q]
        if [start_pi, start_p, start_q] == m_step(u, start_x):
            break
        else:
            [start_pi, start_p, start_q] = m_step(u, start_x)

if __name__ == '__main__':
    run(x, pi, p, q, 100)
--------------------------------------------------------------------------------
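em.py implements EM for the classic three-coin experiment (the same data and starting point as Example 9.1 of Li Hang's Statistical Learning Methods): a hidden coin with bias pi decides whether each flip comes from a coin with bias p or one with bias q. The first sweep can be checked by hand:

# E step with (pi, p, q) = (0.4, 0.6, 0.7):
#   u_i = 0.24/0.66 ~= 0.3636 when x_i = 1, u_i = 0.16/0.34 ~= 0.4706 when x_i = 0
# M step: pi ~= 0.4064, p ~= 0.5368, q ~= 0.6432
# A second E step reproduces the same u_i, so from the second sweep on the
# parameters no longer change (up to float rounding) and the loop exits early.
# Starting from (0.5, 0.5, 0.5) instead settles at (0.5, 0.6, 0.6): EM's answer
# here depends on the initialization.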
/K-means/Kmeans.py:
--------------------------------------------------------------------------------
from numpy import *
import matplotlib.pyplot as plt

'''
@author yangmu
@version 0.1
@date 2016-03-20
'''

def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float, curLine)
        dataMat.append(fltLine)
    return dataMat

def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))

def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    return centroids

def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))

    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        print centroids
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]  # get all the points in this cluster
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment

def showCluster(dataSet, k, centroids, clusterAssment):
    numSamples, dim = dataSet.shape
    if dim != 2:
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        return 1

    # draw each sample with the marker of its assigned cluster
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    # draw the centroids
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
    plt.show()
--------------------------------------------------------------------------------
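The dump of Kmeans.py ends without a driver block; a minimal sketch (illustrative, and it assumes the K-means/testSet file holds tab-delimited 2-D points, like CART's testSet):

# Illustrative driver (not part of the file as dumped):
dataMat = mat(loadDataSet('testSet'))   # K-means/testSet, assumed tab-delimited 2-D points
k = 4                                   # arbitrary choice for the demo
centroids, clusterAssment = kMeans(dataMat, k)
showCluster(dataMat, k, centroids, clusterAssment)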
/PageRank/pagerank.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy.sparse import csc_matrix

def pageRank(G, s=.85, maxerr=.0001):
    '''
    Computes the pagerank of each of the n states of the link matrix G,
    where a nonzero G[i, j] means state i links to state j; s is the damping
    factor, and iteration stops once the rank vector moves less than maxerr.
    '''
    n = G.shape[0]

    # transform G into a Markov matrix M (each row of outlink weights sums to 1)
    M = csc_matrix(G, dtype=np.float)
    rsums = np.array(M.sum(1))[:, 0]
    ri, ci = M.nonzero()
    M.data /= rsums[ri]

    # bool array of sink states (pages with no outlinks)
    sink = rsums == 0

    # compute pagerank r until we converge
    ro, r = np.zeros(n), np.ones(n)
    while np.sum(np.abs(r - ro)) > maxerr:
        ro = r.copy()
        # calculate each pagerank one state at a time
        for i in xrange(0, n):
            # inlinks of state i
            Ii = np.array(M[:, i].todense())[:, 0]
            # account for sink states
            Si = sink / float(n)
            # account for teleportation to state i
            Ti = np.ones(n) / float(n)

            r[i] = ro.dot(Ii*s + Si*s + Ti*(1-s))

    return r/sum(r)


if __name__ == '__main__':
    websites = np.array([[0, 0, 1, 0, 0, 0, 0],
                         [0, 1, 1, 0, 0, 0, 0],
                         [1, 0, 1, 1, 0, 0, 0],
                         [0, 0, 0, 1, 1, 0, 0],
                         [0, 0, 0, 0, 0, 0, 1],
                         [0, 0, 0, 0, 0, 1, 1],
                         [0, 0, 0, 1, 1, 0, 1]])

    page_value = pageRank(websites, s=.86)
    print page_value
--------------------------------------------------------------------------------
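Each sweep of the while loop is one step of power iteration: with probability s the surfer follows a link (sink pages redistribute their mass uniformly), and with probability 1-s it teleports to a random page. For the 7-page demo graph, which has no sink rows, a dense cross-check (illustrative; pagerank_dense is not part of the repository):

# Dense power-iteration sketch for graphs without sink rows (illustrative):
import numpy as np

def pagerank_dense(G, s=.86, tol=1e-8):
    n = G.shape[0]
    # row-normalize the adjacency matrix (safe here: the demo graph has no zero rows)
    A = G / G.sum(axis=1, keepdims=True).astype(float)
    r = np.ones(n) / n
    while True:
        r_next = s * r.dot(A) + (1 - s) / n   # one power-iteration sweep
        if np.abs(r_next - r).sum() < tol:
            return r_next / r_next.sum()
        r = r_next

# pagerank_dense(websites) should agree with pageRank(websites, s=.86)
# to within the two convergence tolerances.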
/README.md:
--------------------------------------------------------------------------------
# Top10_Algorithms_in_DataMining
## Hi, there! This repository contains implementations of the top 10 algorithms in data mining.
* Most demos are written in Python.
* If you are interested in this work, join me :).
* Questions or suggestions? Please mail yangmuted@163.com
--------------------------------------------------------------------------------