├── .gitignore
├── README.md
├── api.md
├── bak
│   ├── LinerModel.py
│   ├── extract.sh
│   ├── install.py
│   └── install.sh
├── examples
│   ├── testBandit.py
│   ├── testCanopy.py
│   ├── testDbScan.py
│   ├── testHCluster.py
│   ├── testKnn.py
│   └── testPageRank.py
├── moodstyle
│   ├── __init__.py
│   ├── alg
│   │   ├── Bandit.py
│   │   ├── PageRank.py
│   │   └── __init__.py
│   ├── classifier
│   │   ├── AdBoost.py
│   │   ├── AdTree.py
│   │   ├── Bayes.py
│   │   ├── Bp.py
│   │   ├── Bp1.py
│   │   ├── Cart.py
│   │   ├── DecisionTree.py
│   │   ├── DecisionTree1.py
│   │   ├── GRTree.py
│   │   ├── Hmm.py
│   │   ├── Hmm1.py
│   │   ├── Interface.py
│   │   ├── Knn.py
│   │   ├── LinerModel.py
│   │   ├── Logistic.py
│   │   ├── RandomForst.py
│   │   ├── RegressionTree.py
│   │   └── __init__.py
│   ├── cluster
│   │   ├── Canopy.py
│   │   ├── DDistance.py
│   │   ├── DbScan.py
│   │   ├── HCluster.py
│   │   ├── Kmeans.py
│   │   ├── KmeansPlusPlus.py
│   │   ├── MiniBatchKMeans.py
│   │   └── __init__.py
│   ├── common
│   │   ├── Array.py
│   │   ├── BaseStrut.py
│   │   ├── DataSet.py
│   │   ├── Dict.py
│   │   ├── __init__.py
│   │   └── util.py
│   ├── config.py
│   ├── feature
│   │   ├── DefaultValue.py
│   │   ├── Feature.py
│   │   ├── OneHotCode.py
│   │   └── __init__.py
│   ├── libsvm
│   │   ├── __init__.py
│   │   ├── libsvm.so.2
│   │   ├── svm.py
│   │   └── svmutil.py
│   └── text
│       ├── FeatureExtract.py
│       ├── Ngram.py
│       └── __init__.py
├── setup.py
└── test
    ├── testAdBoost.py
    ├── testAdTree.py
    ├── testAnn.py
    ├── testArray.py
    ├── testBandit.py
    ├── testBaseStrut.py
    ├── testBayes.py
    ├── testBp.py
    ├── testBp1.py
    ├── testCanopy.py
    ├── testCart.py
    ├── testDDistance.py
    ├── testDataSet.py
    ├── testDbScan.py
    ├── testDecisionTree.py
    ├── testDecisionTree1.py
    ├── testDefaultValue.py
    ├── testDict.py
    ├── testEmm.py
    ├── testFeature.py
    ├── testFeatureExtract.py
    ├── testGRTree.py
    ├── testHCluster.py
    ├── testHmm.py
    ├── testInterface.py
    ├── testKdTree.py
    ├── testKmeans.py
    ├── testKmeansPlusPlus.py
    ├── testKnn.py
    ├── testLinerModel.py
    ├── testLogistic.py
    ├── testMiniBatchKMeans.py
    ├── testNgram.py
    ├── testOneHotCode.py
    ├── testPageRank.py
    ├── testRandomForst.py
    └── testRegressionTree.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*pyc
*swp
*out
*err
moodstyle/cool
.settings
.pydevproject
.project


# folders created by the install
build/
dist/
moodstyle.egg-info/
test/.ropeproject/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

moodstyle机器学习脚本库
=======================

Machine learning algorithms in pure Python, written so that programmers can learn them easily.

Please push your code to this project, and let's learn together.


Install
-----------
+ python setup.py install
+ required package
    + [b2](https://github.com/intohole/b2)

API
-----------
+ [API文档/API Document](api.md)
--------------------------------------------------------------------------------
/api.md:
--------------------------------------------------------------------------------
API
=========

+ moodstyle.classifier.LinerModel.LinerModel
    - 参数(Parameters):
        - w 参数个数,即每条训练/预测数据的维度 (the feature length of each data item)
        - learn_rate 学习率,默认值0.1 (learning rate, default 0.1)
        - labels label,默认[1,-1] (default [1, -1])
    - train(datas,labels)
        - datas 训练数据(training data),[[0,1],[1,2]]
        - labels 打标数据(labels),[1,-1]
    - predict
        - data 预测数据,数据形式为[0,1] (data to predict, e.g. [0,1])


+ moodstyle.alg.Bandit.Greedy

    - 参数(Parameters):
        - e 探测率(exploration rate); e 在(in) (0,1)
        - N 需要探测桶数(the number of buckets to explore)

    - 函数(Functions):
        - getIndex()
            - 获得运行的桶号(get the index of the bucket to pull)
        - process(label)
            - 回传标注,更新桶的收益估计(feed the observed label back to update the bucket estimates)

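一个最小使用示例(a minimal usage sketch, adapted from `examples/testBandit.py`; the bucket count `10` is just an illustrative choice):

```python
from moodstyle.alg import Bandit

greedy = Bandit.Greedy(0.05, 10)  # explore with probability 0.05 over 10 buckets
index = greedy.getIndex()         # bucket chosen for this round
label = 1                         # 1 = reward observed, 0 = no reward
greedy.process(label)             # feed the label back; estimates live in greedy.p
```
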
+ moodstyle.common.DDistance
    - 计算两个数据之间距离(calculate the distance between two data points)
    - 函数(Functions)
        - distance(data1,data2)
            - data1 输入数据(input data); 例如(example): [1,2]
            - data2 输入数据(input data); 例如(example): [2,3]
            - note: data1 and data2 must have the same length
    - moodstyle.common.DDistance.Manhattan
        - 曼哈顿距离(Manhattan distance)
    - moodstyle.common.DDistance.Chebyshev
        - 切比雪夫距离(Chebyshev distance)
    - moodstyle.common.DDistance.Cosine
        - 余弦距离(cosine distance)
    - moodstyle.common.DDistance.Hamming
        - 海明距离(Hamming distance)
    - moodstyle.common.DDistance.Eucliden
        - 欧式距离(Euclidean distance)

+ moodstyle.alg.PageRank
    - moodstyle.alg.PageRank.GraphV2
        - 建立图关系(builds the data graph)
        - 构造函数(constructor):
            - GraphV2(N)
                - N the number of nodes
        - add_edge(n1 , n2)
            - n1 the node that points to another; integer
            - n2 the node that n1 points to; integer
    - moodstyle.alg.PageRank.PageRank
        - 计算图的page rank(calculates PageRank over the graph)
        - 函数(Functions)
            - rank
                - param: graph a GraphV2 model holding the node relations
                - return: weight array, the weight of each node
--------------------------------------------------------------------------------
/bak/LinerModel.py:
--------------------------------------------------------------------------------
#coding=utf-8


class LinerModel(object):

    def train(self , datas , labels , item_len , learn_rate ):
        self.weights = [1.] * item_len
        self.offset = 1.
        for i in range(len(labels)):
            l = self.predict(datas[i])
            self.update_weight( l , labels[i] , datas[i] , learn_rate)

    def update_weight(self , l , target , data , learn_rate):
        for i in range(len(self.weights)):
            self.weights[i] = self.weights[i] - learn_rate * (l - target) * data[i]
        self.offset = self.offset - learn_rate * (l - target)

    def predict(self , data):
        if data and len(data) == len(self.weights):
            return sum([ data[i] * self.weights[i] for i in range(len(self.weights))]) + self.offset


if __name__ == '__main__':
    l = LinerModel()
    from random import random
    datas = [ [random() * 10 ] for i in range(10000)]
    labels = [ 1 if data[0] >= 5 else 0 for data in datas]
    l.train(datas , labels , 1 , 0.01)
    print l.weights
    print l.offset
    print l.predict([6.])
--------------------------------------------------------------------------------
/bak/extract.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | 
 4 | FUNCTION_FILE="README.md"
 5 | PROJECT_NAME="moodstyle"
 6 | 
 7 | 
 8 | 
 9 | echo > ${FUNCTION_FILE}
10 | echo "moodstyle机器学习脚本库" >> ${FUNCTION_FILE}
11 | echo "=======================" >> ${FUNCTION_FILE}
12 | echo "" >> ${FUNCTION_FILE}
13 | echo "" >> ${FUNCTION_FILE}
14 | echo "" >> ${FUNCTION_FILE}
15 | echo "" >> ${FUNCTION_FILE}
16 | cd `dirname $0`
17 | CWD=`pwd`
18 | for function_file in `ls ${CWD}/${PROJECT_NAME}/*.py`;do
19 |     echo "+ `basename ${function_file}`" >> ${FUNCTION_FILE}
20 |     cat ${function_file} |
21 |         grep -En "(^class|^\s*def )" |
22 |         # grep -v "__" |
23 |         # grep -v "def _" |
24 |         sed 's/class / + /' |
25 |         sed 's/def / + /' |
26 |         sed 's/:$//' |
27 |         sed 's/\]/\\]/g' |
28 |         sed 's/\[/\\[/g' |
29 |         awk -F ":" '{
30 |             line_number = $1;
31 |             function_name = ""
32 |             for(i = 2 ;i <=NF ;i++){
33 |                 if(i != 2){
34 |                     function_name=function_name":"$i
35 |                 }else{
36 |                     function_name =$i
37 |                 }
38 |             }
39 |             print function_name"]('${PROJECT_NAME}'/'`basename 
${function_file}`'#L"line_number")" 40 | }' | sed 's/+ /+ [/'>> ${FUNCTION_FILE} 41 | done 42 | -------------------------------------------------------------------------------- /bak/install.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | 6 | import re 7 | from b2 import file2 8 | from b2 import str2 9 | import os 10 | 11 | 12 | import_pattern = re.compile("^\s*(from|import){1}\s{1,}(test[\d\w_]{1,})\s*") 13 | for file in file2.walk_folder("./test" , file_filter = lambda x: True if x.endswith("py") else False): 14 | filename = os.path.basename(file).replace("test" ,"") 15 | d = dict() 16 | with open(file) as f: 17 | for line in f: 18 | line = line.rstrip() 19 | matcher = import_pattern.match(line) 20 | if matcher: 21 | d[matcher.group(2)] = matcher.group(2).replace("test","") 22 | 23 | with open(file) as f,open("./moodstyle/%s" % filename , "w") as w: 24 | for line in f: 25 | line = str2.replace_all( line , d) 26 | w.write(line) 27 | -------------------------------------------------------------------------------- /bak/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | 4 | CWD=$(cd `dirname $0` ; pwd) 5 | INSTALL_PATH=${CWD}/moodstyle 6 | for i in `ls test/*`;do 7 | new_name=`basename $i | sed 's/^test//g'` 8 | cp ${i} ${INSTALL_PATH}/${new_name} 9 | done 10 | for i in `ls ${INSTALL_PATH}/*.py`;do 11 | sed -i 's/^import test/import /g' ${i} 12 | sed -i 's/^from test/from /g' ${i} 13 | done 14 | 15 | -------------------------------------------------------------------------------- /examples/testBandit.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | 6 | 7 | from moodstyle.alg import Bandit 8 | import random 9 | 10 | 11 | 12 | N = 100 13 | # 随机生成N个概率 14 | p = [random.random() for i in range(N)] 15 | 16 | # 0.1 进行explor 17 | greedy = Bandit.Greedy(0.05,N) 18 | # 重复实验次数 19 | TIMES = 100000 20 | COUNT = 0 21 | for _ in range(TIMES): 22 | index = greedy.getIndex() 23 | prop = random.random() 24 | if prop <= p[index]: 25 | label = 1 26 | COUNT += 1 27 | else: 28 | label = 0 29 | greedy.process(label) 30 | print greedy.p 31 | print p 32 | print COUNT / float(TIMES) 33 | 34 | -------------------------------------------------------------------------------- /examples/testCanopy.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | 6 | 7 | from random import randint 8 | from moodstyle.cluster import Canopy 9 | 10 | 11 | s = Canopy.CanopyCluster(120, 100) 12 | datas = [[randint(0, 1000)] for i in range(100)] 13 | for i in s.cluster(datas): 14 | print i 15 | -------------------------------------------------------------------------------- /examples/testDbScan.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | from moodstyle.cluster.DbScan import DbScan 6 | from random import randint 7 | 8 | 9 | t = DbScan(0.5,10) 10 | 11 | 12 | datas = [[ _ , randint(0, 20) * 1.0, randint(0, 20) * 1.0] for _ in range(100)] 13 | t.cluster(datas) 14 | print datas 15 | 16 | 17 | -------------------------------------------------------------------------------- /examples/testHCluster.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | from moodstyle import HCluster 5 | from random import randint 6 | 7 | 8 | 9 | 10 | hc = 
HCluster.ALHierarchicalClustering() 11 | datas = [[i, randint(1, 20), randint(1, 20)] for i in range(10)] 12 | clusters = hc.cluster(datas, 4, 100) 13 | for cluster in clusters: 14 | print cluster 15 | -------------------------------------------------------------------------------- /examples/testKnn.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from moodstyle.classifier.Knn import KdTree 4 | 5 | kd = KdTree() 6 | datas = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)] 7 | print kd.get_split_index(datas, 3, 2, 1) 8 | -------------------------------------------------------------------------------- /examples/testPageRank.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | from moodstyle.alg.PageRank import PageRank 5 | from moodstyle.alg.PageRank import GraphV2 6 | 7 | graph = GraphV2(10) 8 | graph.add_edge(1 , 9) 9 | graph.add_edge(3 , 4) 10 | graph.add_edge(6 , 8) 11 | graph.add_edge(7 , 8) 12 | graph.add_edge(0,1) 13 | pagerank = PageRank() 14 | print pagerank.rank(graph ) 15 | 16 | -------------------------------------------------------------------------------- /moodstyle/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/__init__.py -------------------------------------------------------------------------------- /moodstyle/alg/Bandit.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from collections import defaultdict 4 | import random 5 | 6 | 7 | 8 | class Greedy(object): 9 | """ 10 | """ 11 | EXPLORER = 1 12 | WORK = 0 13 | 14 | def __init__(self,e,N): 15 | self.e = e 16 | self.p = [0.5] * N 17 | self._front = [0.] * N 18 | self._explor = [0.] * N 19 | self.N = N 20 | self._max_index = 0 21 | # 1 explorer ; 0 work 22 | self._status = None 23 | self._last = None 24 | 25 | def _prop(self): 26 | for i in range(self.N): 27 | if self._explor[i] == 0.: 28 | continue 29 | self.p[i] = self._front[i] / self._explor[i] 30 | if self.p[self._max_index] < self.p[i]: 31 | self._max_index = i 32 | 33 | 34 | def getIndex(self): 35 | r = random.random() 36 | index = None 37 | if r < self.e: 38 | index = random.randint(0,self.N - 1) 39 | self._status = self.EXPLORER 40 | else: 41 | self._status = self.WORK 42 | index = self._max_index 43 | self._last = index 44 | return index 45 | 46 | 47 | def process(self,label): 48 | if self._status == self.EXPLORER: 49 | if label == 1: 50 | self._front[self._last] += 1. 51 | self._explor[self._last] += 1. 52 | self._prop() 53 | 54 | 55 | #class UCB(object): 56 | # 57 | # 58 | # def __init__(self,N,max_value): 59 | # self.N = N 60 | # self.max_value = max_value 61 | # self._count = 0 62 | # self._sub_count = [0.] * N 63 | # self._sub_sum = [0.] * N 64 | # self.p = [0.] 
* N 65 | # self._last = None 66 | # 67 | # 68 | # def _prop(self): 69 | # for i in range(self.N): 70 | # if self._sub_count == 0: 71 | # continue 72 | # self.p[i] = self._sub_sum[i] / self._sub_count + math.sqrt( 2 * math.log(self._count) / self._sub_count[i]) 73 | # 74 | # def getIndex(self): 75 | # for i in range(self.N): 76 | # if self._sub_count[i] == 0: 77 | # self._last = i 78 | # return self._last 79 | # self._last = max(enumerate(self.p),key = lambda x:x[1])[0] 80 | # return self._last 81 | # 82 | # def process(self,label): 83 | # self._count += 1 84 | # self._sub_count[self._last] += 1 85 | # self._sub_sum[self._last] += label 86 | # 87 | 88 | -------------------------------------------------------------------------------- /moodstyle/alg/PageRank.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | from ..common.DataSet import DataSet 5 | from collections import Counter 6 | from collections import defaultdict 7 | import copy 8 | import sys 9 | 10 | 11 | class Graph(object): 12 | 13 | 14 | def __init__(self , point_len , dense = True ): 15 | self.weights = [1.] * point_len 16 | self.data = DataSet(point_len , dense) 17 | for i in range(point_len): 18 | self.data.append() 19 | self._keys = xrange(point_len) 20 | self._len = point_len 21 | self.outs_counter = Counter() 22 | self.point_ins = defaultdict(set) 23 | 24 | def __len__(self): 25 | return self._len 26 | 27 | def add_edge(self , point_a , point_b): 28 | """add point a to b edge 29 | param: point_a:point 图点a -> b 30 | param: point_b 图被指向点 31 | return:None 32 | """ 33 | self.data[point_a][point_b] = 1 34 | self.outs_counter[point_a] += 1 35 | self.point_ins[point_b].add(point_a) 36 | 37 | def keys(self): 38 | return self._keys 39 | 40 | def ins(self , point): 41 | return self.point_ins[point] 42 | 43 | def outs(self , point): 44 | if point and isinstance(point , (int , long)): 45 | if point >= 0 and point < self._len: 46 | for index in self.data[point].keys(): 47 | if self.data[point][index] > 0: 48 | yield index 49 | def outs_count(self , point): 50 | return self.outs_counter[point] 51 | 52 | def update(self,weights): 53 | if weights: 54 | error = 0. 55 | for i in self.keys(): 56 | error = max( error , abs(weights[i] - self.weights[i])) 57 | self.weights[i] = weights[i] 58 | return error 59 | 60 | 61 | class GraphV2(object): 62 | 63 | 64 | 65 | def __init__(self,point_len): 66 | self.weights = [1.] * point_len 67 | self._keys = xrange(point_len) 68 | self._len = point_len 69 | self.outs_counter = Counter() 70 | self.point_ins = defaultdict(set) 71 | 72 | def __len__(self): 73 | return self._len 74 | 75 | def add_edge(self ,point_a , point_b): 76 | self.outs_counter[point_a] +=1 77 | self.point_ins[point_b].add(point_a) 78 | 79 | def keys(self): 80 | return self._keys 81 | 82 | def outs_count(self,point): 83 | return self.outs_counter[point] 84 | 85 | def update(self,weights): 86 | if weights: 87 | error = 0. 
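            # maximum absolute weight change across all nodes; PageRank.rank
            # uses the value returned here as the convergence error and stops
            # iterating once it falls below min_error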
88 | for i in self.keys(): 89 | error = max( error , abs(weights[i] - self.weights[i])) 90 | self.weights[i] = weights[i] 91 | return error 92 | 93 | def ins(self,point): 94 | return self.point_ins[point] 95 | 96 | class PageRank(object): 97 | 98 | 99 | def rank(self , graph ,iter_count = 1000, d = 0.85 , min_error = 0.01): 100 | for _iter in xrange(iter_count): 101 | weights = copy.copy(graph.weights) 102 | for i in graph.keys(): 103 | weights[i] =(1-d) + d * sum([ weights[point_in]/graph.outs_count(point_in) for point_in in graph.ins(i)]) 104 | error = graph.update(weights) 105 | if error < min_error: 106 | break 107 | return copy.copy(graph.weights) 108 | 109 | def calc_error(self , weights , graph): 110 | return max(abs(weights[i] - graph.weights[i]) for i in graph.keys()) 111 | -------------------------------------------------------------------------------- /moodstyle/alg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/alg/__init__.py -------------------------------------------------------------------------------- /moodstyle/classifier/AdBoost.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import math 4 | 5 | 6 | class BoostClassifier(object): 7 | 8 | def __init__(self, classifier, weight=None): 9 | if classifier and callable(classifier): 10 | self.classifier = classifier 11 | self.weight = weight 12 | else: 13 | raise TypeError, 'classifier has classify and is callable!' 14 | 15 | def __call__(self, data): 16 | return self.classifier(data) 17 | 18 | def __str__(self): 19 | return 'weight : %s' % self.weight 20 | 21 | 22 | class BoostData(object): 23 | 24 | def __init__(self, data, weight=None): 25 | if data and isinstance(data, (list, tuple)) and len(data) >= 2: 26 | self.data = data 27 | self.weight = weight 28 | else: 29 | raise ValueError 30 | 31 | 32 | class AdBoost(object): 33 | 34 | def __init__(self, classifiers=[]): 35 | 36 | self.__boost_classifier = [BoostClassifier(cl) for cl in classifiers] 37 | self.__labels = set() 38 | 39 | def train(self, datas): 40 | ''' 41 | [[feature1 , feature2 ,....,featuren , label]] 42 | ''' 43 | # 初始化权重, 每个数据初始化权重为 weight = ( 1.0 / 数据长度 ) 44 | if len(datas) == 0 or len(self.__boost_classifier) == 0: 45 | raise ValueError 46 | 47 | for data in datas: 48 | self.__labels.add(data[-1]) 49 | # 将数据权重初始化为 1.0/ 数据总长度 50 | trains = [BoostData(data, 1.0 / len(datas)) for data in datas] 51 | # 开始计算每个分类器的权重 52 | for _ in range(len(self.__boost_classifier)): 53 | best_classifier = self.__get_trainer(trains)[0] 54 | 55 | best_classifier[1].weight = math.log((1 - best_classifier[0]) / 56 | best_classifier[0], math.e) / 2 57 | self.__update_data_weight(trains, best_classifier[1]) 58 | 59 | def classify(self, data): 60 | weight = sum([classifier.weight * classifier(data) 61 | for classifier in self.__boost_classifier if classifier.weight != None]) 62 | return sorted([(abs(label - weight), label) for label in self.__labels], key=lambda x: x[0])[0][1] 63 | 64 | def __update_data_weight(self, trains, classifier): 65 | ''' 66 | 功能: 更新数据的权重 67 | 公式: 68 | 69 | ''' 70 | zm = sum([data.weight * math.exp(-classifier.weight * 71 | data.data[-1] * classifier(data.data[:-1])) for data in trains]) 72 | for data in trains: 73 | data.weight = data.weight * \ 74 | math.exp(-classifier.weight * 75 | data.data[-1] * classifier(data.data[:-1])) / zm 76 | 77 | def 
__str__(self): 78 | return '\t'.join( 79 | [ 80 | '%s : %s' % (i, self.__boost_classifier[i]) 81 | for i in range(len(self.__boost_classifier)) 82 | ] 83 | ) 84 | 85 | def __get_trainer(self, trains): 86 | ''' 87 | trains , 训练的数据 88 | ''' 89 | # 循环每个分类器(除了已经添加为分类器的分类器), 计算 (数据的权重 * 分类器错分) 90 | # , 找到上述值最小的一个 , 作为下个分类器 91 | return sorted([( 92 | sum( 93 | [ 94 | bd.weight 95 | for bd in trains 96 | if cl(bd.data[:-1]) != bd.data[-1] 97 | ] 98 | ), cl) 99 | for cl in self.__boost_classifier 100 | if cl.weight == None 101 | ], 102 | key=lambda x: x[0], reverse=False) 103 | 104 | 105 | if __name__ == '__main__': 106 | classifiers = [lambda x: -1 if x[0] > 2.5 else 1, lambda x: 107 | 1 if x[0] > 5.5 else -1, lambda x: 1 if x[0] < 8.5 else -1] 108 | a = AdBoost(classifiers) 109 | datas = [[0, 1], [1, 1], [2, 1], [3, -1], [4, -1], 110 | [5, -1], [6, 1], [7, 1], [8, 1], [9, -1]] 111 | a.train(datas) 112 | print a.classify([9]) 113 | print a 114 | -------------------------------------------------------------------------------- /moodstyle/classifier/AdTree.py: -------------------------------------------------------------------------------- 1 | #!/coding=utf-8 2 | 3 | from collections import defaultdict 4 | from copy import deepcopy 5 | 6 | 7 | class Classifier(object): 8 | 9 | def __init__(self, classifier): 10 | if classifier and (callable(classifier)): 11 | self.classifier = classifier 12 | else: 13 | raise TypeError, 'classifier must be callable and valuable !' 14 | # label , ( fit value sum , count ) 15 | self.__weights = defaultdict(float) 16 | self.__count = defaultdict(int) 17 | 18 | def clear(self): 19 | self.__weights.clear() 20 | self.__count.clear() 21 | 22 | def update_fit_value(self, data, value): 23 | label = self.classifier(data) 24 | self.__weights[label] += value 25 | self.__count[label] += 1. 
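    # Each weak classifier maps a data point to a label (its "leaf"); the sums
    # accumulated in update_fit_value are divided by the counts in sync(), so
    # every leaf ends up predicting the mean fit value of the points that
    # fall into it.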
26 | 27 | def updates_fit_values(self, datas, values): 28 | for i in range(len(datas)): 29 | self.update_fit_value(datas[i], values[i]) 30 | 31 | def sync(self): 32 | ''' 33 | 计算每个分类器需要拟合误差值 34 | ''' 35 | for label in self.__weights.keys(): 36 | self.__weights[label] /= self.__count[label] 37 | 38 | def classify(self, data): 39 | label = self.classifier(data) 40 | return self.__weights[label] if self.__weights.has_key(label) else None 41 | 42 | 43 | def __str__(self): 44 | return str(self.__weights) 45 | 46 | 47 | 48 | 49 | 50 | 51 | class AdTree(object): 52 | 53 | def __init__(self): 54 | #定义分类器集合 55 | self.classifiers = [] 56 | 57 | def train(self, datas, weights, classifiers, diff=0.2): 58 | ''' 59 | datas 60 | weights , 每个数据需要拟合的数值 61 | F0(x) = 0 62 | F1(x) = F0(x) + 树的分值 63 | FN(x) = FN(x) + 树(N-1) 64 | r(x) = sum ( yi - F 65 | ''' 66 | r = deepcopy(weights) 67 | 68 | for _ in range(len(classifiers)): 69 | _classifiers = [Classifier(classifier) for classifier in classifiers] 70 | # 更新每个分类器 , 与上轮的 71 | # 残差 , 计算需要拟合的weight 72 | for _classifier in _classifiers: 73 | _classifier.updates_fit_values(datas, r) 74 | _classifier.sync() 75 | #计算损失函数值 , 分类器的标记 76 | loss, ci = self.find_min_loss(datas, r, _classifiers) 77 | self.classifiers.append(deepcopy(_classifiers[ci])) 78 | #更新下一轮残差是当前一轮分类器拟合上一轮残差剩余的残差 79 | #所以更新残差的时候是分类器,而不是所有分类器都参加更正 80 | r = self.update_residual(datas, r , _classifiers[ci]) 81 | #损失数值小于要求值之后 , 会跳出方程 82 | if loss < diff: 83 | break 84 | 85 | def find_min_loss(self, datas, residuals, classifiers): 86 | ''' 87 | 每一轮迭代迭代 , 只需要拟合上一轮的残差值 88 | datas : 数据 89 | residuals : 上一轮的残差表 90 | return : 91 | (最小损失函数值 , 分类器序号) 92 | 93 | ''' 94 | 95 | return min([ 96 | ( 97 | sum( 98 | [ 99 | (classifiers[j].classify(datas[i]) - residuals[i]) ** 2 100 | for i in range(len(datas)) 101 | ] 102 | ), j) 103 | for j in range(len(classifiers)) 104 | ]) 105 | 106 | def update_residual(self, datas, residuals , classifier): 107 | ''' 108 | 返回一个参差表 , 通过生成的分类器 , 计算下一轮需要拟合的残差表 109 | Rn-1,i = yi - fn-1(xi) 110 | ''' 111 | return [ 112 | residuals[i] - classifier.classify(datas[i]) 113 | for i in range(len(datas)) 114 | ] 115 | 116 | def classify(self, data): 117 | return sum([classifier.classify(data) for classifier in self.classifiers]) if len(self.classifiers) > 0 else 0 118 | 119 | 120 | if __name__ == '__main__': 121 | datas = [i for i in range(1, 11)] 122 | weights = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05] 123 | 124 | at = AdTree() 125 | classifiers = [lambda x: 1 if x >= 1.5 else 0, lambda x: 1 if x >= 2.5 else 0, lambda x: 1 if x >= 3.5 else 0, lambda x: 1 if x >= 4.5 else 0, lambda x: 126 | 1 if x >= 5.5 else 0, lambda x: 1 if x >= 6.5 else 0, lambda x: 1 if x >= 7.5 else 0, lambda x: 1 if x >= 8.5 else 0, lambda x: 1 if x >= 9.5 else 0] 127 | 128 | at.train(datas, weights, classifiers) 129 | print at.classify(8) 130 | -------------------------------------------------------------------------------- /moodstyle/classifier/Bayes.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from collections import Counter 4 | from collections import defaultdict 5 | import sys 6 | 7 | class Bayes(object): 8 | 9 | 10 | def train(self, datas, attr_len, labels, dense=True): 11 | """贝叶斯训练函数 12 | params: 13 | datas 训练数据 , [[]] 14 | attr_len 属性长度 15 | labels 分类数组 , 与datas对应 16 | dense 是否为稀疏矩阵,现在只支持dense=True 17 | return 18 | None 19 | raise 20 | None 21 | """ 22 | self.label_status = Counter(labels) 23 | self.default_prob = 
defaultdict(float) 24 | self.attr_status = { 25 | i: defaultdict(lambda: defaultdict(float)) for i in range(attr_len)} 26 | self.base_count = len(datas) 27 | self.attr_range = range(attr_len) 28 | for i in range(len(datas)): 29 | for j in range(attr_len): 30 | attr_val = datas[i][j] 31 | # 统计每个属性对应 p(I | C) , I < (v1 , v2 ,v3....,vn) 32 | self.attr_status[j][attr_val][labels[i]] += 1. 33 | # 计算每个属性出现val 时 , P(v1|I,C) 34 | for feature, attr_label in self.attr_status.items(): 35 | for attr_val, label in self.attr_status[feature].items(): 36 | for cl in label.keys(): 37 | self.attr_status[feature][attr_val][ 38 | cl] /= ( self.label_status[cl] + self.base_count) 39 | for label in self.label_status.keys(): 40 | self.default_prob[label] = 1. / ( self.label_status[label] + self.base_count) 41 | # 计算所有类别出现的概率 P(C) = sum(Cn) / sum(C) , n < (1,2,3,4,5....n) 42 | labels_count = float(sum(self.label_status.values())) 43 | for label, count in self.label_status.items(): 44 | self.label_status[label] /= labels_count 45 | def _predict(self , data , label): 46 | prob = 1. 47 | for i in self.attr_range: 48 | if data[i] == 0: 49 | continue 50 | prob *= self.get_prob(i , data[i], label) 51 | return prob * self.label_status[label] 52 | 53 | def get_prob(self , attr_index , value ,label ): 54 | """得到在指定序号下value在特定类别下发生概率 55 | params 56 | attr_index 暂定为属性序号 57 | value 属性值 58 | label 类别 59 | return 60 | prob 发生概率 61 | raise 62 | None 63 | """ 64 | if value in self.attr_status[attr_index]: 65 | if label in self.attr_status[attr_index][value]: 66 | return self.attr_status[attr_index][value][label] 67 | return self.default_prob[label] 68 | 69 | 70 | 71 | def predict(self, data): 72 | """对输入数据进行预测 73 | params: 74 | data 75 | return 76 | label 数据标记 77 | raise 78 | None 79 | """ 80 | probs = [( self._predict(data , label),label ) for label in self.label_status.keys() ] 81 | return sorted(probs, key = lambda x:x[0] , reverse = True)[0] 82 | 83 | def predict_old(self, data): 84 | 85 | """对输入数据进行预测 86 | params: 87 | data 88 | return 89 | label 数据标记 90 | raise 91 | None 92 | """ 93 | return sorted([( 94 | reduce(lambda x, y:x * y, 95 | [ 96 | self.attr_status[i][data[i]][label] 97 | for i in range(len(data)) 98 | if self.attr_status[i][data[i]].has_key(label) 99 | ] 100 | ) 101 | * 102 | self.label_status[label], label 103 | ) 104 | for label in self.label_status.keys() 105 | ], reverse=True) 106 | 107 | if __name__ == "__main__": 108 | b = Bayes() 109 | datas = [ [ 0 , 0 ] , [0 , 1] , [1 , 1] ,[1 , 0]] 110 | labels = [ 0 , 1 , 0 , 1] 111 | b.train(datas , 2 ,labels = labels) 112 | print b.predict([ 2 , 1]) 113 | -------------------------------------------------------------------------------- /moodstyle/classifier/Bp.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | import math 6 | import random 7 | import string 8 | from math import exp 9 | 10 | random.seed(0) 11 | 12 | class Layer(object): 13 | 14 | 15 | def __init__(self , layer_count , default = 1.): 16 | self.layer = [ default ] * layer_count 17 | self.count = layer_count 18 | 19 | def __len__(self): 20 | return self.count 21 | 22 | class RatioArray(object): 23 | 24 | 25 | def __init__(self, line , row , min_value , max_value ): 26 | self.ratios = [ 27 | [ 28 | random.random(min_value , max_value) 29 | for _ in range(line) 30 | ] 31 | for i in range(row)] 32 | 33 | 34 | 35 | 36 | class Bp(object): 37 | ''' 38 | 算法实现逻辑: 39 | 输入层 隐藏层 输出层 40 | 算法流程: 41 | input -> hidden -> output 42 | calc error every 
layer 43 | output -> hidden -> input 44 | 主要是逻辑回灌 神经元 45 | 46 | ''' 47 | pass 48 | 49 | 50 | 51 | 52 | 53 | # 生成区间[a, b)内的随机数 54 | def rand(a, b): 55 | return (b-a)*random.random() + a 56 | 57 | # 生成大小 I*J 的矩阵,默认零矩阵 (当然,亦可用 NumPy 提速) 58 | def makeMatrix(I, J, fill=0.0): 59 | m = [] 60 | for i in range(I): 61 | m.append([fill]*J) 62 | return m 63 | 64 | #定义sigmoid函数 , 一个简单神经元输出 65 | def sigmoid(x): 66 | return 1.0 / (1 + exp(-x)) 67 | 68 | 69 | # 函数 sigmoid 的派生函数, 为了得到输出 (即:y) 70 | def dsigmoid(y): 71 | return 1.0 - y**2 72 | 73 | class NN: 74 | ''' 三层反向传播神经网络 ''' 75 | def __init__(self, ni, nh, no): 76 | # 输入层、隐藏层、输出层的节点(数) 77 | self.ni = ni + 1 # 增加一个偏差节点 78 | self.nh = nh 79 | self.no = no 80 | 81 | self.ai = [1.0]*self.ni #当前输入层节点信息 82 | self.ah = [1.0]*self.nh #当前隐藏层节点信息 83 | self.ao = [1.0]*self.no #当前输出层节点信息 84 | 85 | # 建立权重(矩阵) 86 | self.wi = makeMatrix(self.ni, self.nh) #输入层到隐藏层的参数信息 87 | self.wo = makeMatrix(self.nh, self.no) #隐藏层到输出层的参数信息 88 | 89 | # 设为随机值 90 | for i in range(self.ni): 91 | for j in range(self.nh): 92 | self.wi[i][j] = rand(-0.2, 0.2) 93 | for j in range(self.nh): 94 | for k in range(self.no): 95 | self.wo[j][k] = rand(-2.0, 2.0) 96 | 97 | # 最后建立动量因子(矩阵) 98 | self.ci = makeMatrix(self.ni, self.nh) 99 | self.co = makeMatrix(self.nh, self.no) 100 | 101 | def update(self, inputs): 102 | if len(inputs) != self.ni-1: 103 | raise ValueError('与输入层节点数不符!') 104 | 105 | # 激活输入层 106 | for i in range(self.ni-1): 107 | #self.ai[i] = sigmoid(inputs[i]) 108 | self.ai[i] = inputs[i] 109 | 110 | # 激活隐藏层 111 | for j in range(self.nh): 112 | sum = 0.0 113 | for i in range(self.ni): 114 | sum = sum + self.ai[i] * self.wi[i][j] 115 | self.ah[j] = sigmoid(sum) 116 | 117 | # 激活输出层 118 | for k in range(self.no): 119 | sum = 0.0 120 | for j in range(self.nh): 121 | sum = sum + self.ah[j] * self.wo[j][k] 122 | self.ao[k] = sigmoid(sum) 123 | 124 | return self.ao[:] 125 | 126 | def backPropagate(self, targets, N, M): 127 | ''' 反向传播 ''' 128 | if len(targets) != self.no: 129 | raise ValueError('与输出层节点数不符!') 130 | 131 | # 计算输出层的误差 132 | output_deltas = [0.0] * self.no 133 | for k in range(self.no): 134 | error = targets[k]-self.ao[k] 135 | output_deltas[k] = dsigmoid(self.ao[k]) * error 136 | 137 | # 计算隐藏层的误差 138 | hidden_deltas = [0.0] * self.nh 139 | for j in range(self.nh): 140 | error = 0.0 141 | for k in range(self.no): 142 | error = error + output_deltas[k]*self.wo[j][k] 143 | hidden_deltas[j] = dsigmoid(self.ah[j]) * error 144 | 145 | # 更新输出层权重 146 | for j in range(self.nh): 147 | for k in range(self.no): 148 | change = output_deltas[k]*self.ah[j] 149 | self.wo[j][k] = self.wo[j][k] + N*change + M*self.co[j][k] 150 | self.co[j][k] = change 151 | #print(N*change, M*self.co[j][k]) 152 | 153 | # 更新输入层权重 154 | for i in range(self.ni): 155 | for j in range(self.nh): 156 | change = hidden_deltas[j]*self.ai[i] 157 | self.wi[i][j] = self.wi[i][j] + N*change + M*self.ci[i][j] 158 | self.ci[i][j] = change 159 | 160 | # 计算误差 161 | error = 0.0 162 | for k in range(len(targets)): 163 | error = error + 0.5*(targets[k]-self.ao[k])**2 164 | return error 165 | 166 | def test(self, patterns): 167 | for p in patterns: 168 | print(p[0], '->', self.update(p[0])) 169 | 170 | def weights(self): 171 | print('输入层权重:') 172 | for i in range(self.ni): 173 | print(self.wi[i]) 174 | print() 175 | print('输出层权重:') 176 | for j in range(self.nh): 177 | print(self.wo[j]) 178 | 179 | def train(self, patterns, iterations=1000, N=0.5, M=0.1): 180 | # N: 学习速率(learning rate) 181 | # M: 动量因子(momentum factor) 182 | for i in 
range(iterations): 183 | error = 0.0 184 | for p in patterns: 185 | inputs = p[0] 186 | targets = p[1] 187 | self.update(inputs) 188 | error = error + self.backPropagate(targets, N, M) 189 | if i % 100 == 0: 190 | print('误差 %-.5f' % error) 191 | 192 | 193 | def demo(): 194 | # 一个演示:教神经网络学习逻辑异或(XOR)------------可以换成你自己的数据试试 195 | pat = [ 196 | [[0,0], [0]], 197 | [[0,1], [1]], 198 | [[1,0], [1]], 199 | [[1,1], [0]] 200 | ] 201 | 202 | # 创建一个神经网络:输入层有两个节点、隐藏层有两个节点、输出层有一个节点 203 | n = NN(2, 2, 1) 204 | # 用一些模式训练它 205 | n.train(pat) 206 | # 测试训练的成果(不要吃惊哦) 207 | n.test(pat) 208 | # 看看训练好的权重(当然可以考虑把训练好的权重持久化) 209 | #n.weights() 210 | 211 | 212 | if __name__ == '__main__': 213 | demo() 214 | -------------------------------------------------------------------------------- /moodstyle/classifier/Bp1.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import math 4 | import random 5 | 6 | 7 | class Neroun(object): 8 | 9 | 10 | def __init__(self , weight_len , learn_rate = 0.1 , delta = random.uniform(1 , -1)): 11 | self.weights = self.init_weights(weight_len) 12 | self.delta = delta 13 | self.weight_len = weight_len 14 | self.weight_range = xrange(weight_len) 15 | self.learn_rate = learn_rate 16 | 17 | def init_weights(self , weight_len , weight_max = 0.5 , weight_min = -0.5): 18 | return [ random.uniform(weight_max , weight_min) for i in range(weight_len)] 19 | 20 | def predict(self , inputs): 21 | return self.simgod( sum( value * weight for value , weight in zip(inputs ,self.weights)) + self.delta) 22 | 23 | def simgod(self , value): 24 | return 1. / ( 1 + math.exp(-value)) 25 | 26 | def disgod(self , target): 27 | return target * (1 - target) 28 | 29 | def __len__(self): 30 | return self.weight_len 31 | 32 | def __getitem__(self , index): 33 | return self.weights[index] 34 | 35 | def __setitem__(self , index , value): 36 | self.weights[index] = self.weights[index] + self.learn_rate * value 37 | 38 | def update(self , target, predict): 39 | error = target - predict 40 | for i in self.weight_range: 41 | self.weights[i] += self.rate * error 42 | return error 43 | 44 | class Layer(object): 45 | 46 | 47 | 48 | def __init__(self , inputs_len ,neroun_len , learn_rate = 0.1): 49 | """神经元层初始化 50 | params: 51 | inputs_len 神经元输入数目 52 | neroun_len 神经元个数 53 | learn_rate 学习率 54 | return 55 | None 56 | raise 57 | None 58 | """ 59 | self.nerouns = [ Neroun(inputs_len , learn_rate = learn_rate) for i in range(neroun_len)] 60 | self.nerouns_len = neroun_len 61 | self.nerouns_range = xrange(self.nerouns_len) 62 | self.output = [ 0.] 
* self.nerouns_len 63 | 64 | def predict(self , inputs ): 65 | return [self.nerouns[i].predict(inputs) for i in self.nerouns_range ] 66 | 67 | def train_predict(self , inputs): 68 | for i in self.nerouns_range: 69 | self.output[i] = self.nerouns[i].predict(inputs) 70 | return self.output[:] 71 | 72 | def update(self , deltas): 73 | for i in self.nerouns_range: 74 | for j in xrange(len(self.nerouns[i])): 75 | self.nerouns[i][j] = deltas[i] 76 | 77 | def get_delta(self ,errors): 78 | raise NotImplemetion 79 | 80 | class OutPutLayer(Layer): 81 | 82 | def get_delta(self , errors ): 83 | return [self.output[i] * (1 - self.output[i]) * errors[i] for i in self.nerouns_range ] 84 | 85 | class HiddenLayer(Layer): 86 | 87 | def __init__(self , inputs_len ,neroun_len , next_layer, learn_rate = 0.1): 88 | super(HiddenLayer , self).__init__(inputs_len ,neroun_len , learn_rate) 89 | self.next_layer = next_layer 90 | 91 | def get_delta(self , errors ): 92 | delta = [0.] * self.nerouns_len 93 | for i in self.nerouns_range: 94 | error = sum( errors[j] * self.next_layer.nerouns[j][i] for j in self.next_layer.nerouns_range) 95 | delta[i] = self.output[i] * (1 - self.output[i])*error 96 | return delta 97 | 98 | class Bp(object): 99 | 100 | 101 | 102 | def __init__(self , inputs_len , hidden_len , outputs_len): 103 | self.input_layer_len = inputs_len 104 | self.hidden_layer_len = hidden_len 105 | self.output_layer_len = outputs_len 106 | self.output_layer = OutPutLayer(hidden_len , outputs_len) 107 | self.hidden_layer = HiddenLayer(inputs_len , hidden_len , self.output_layer) 108 | 109 | 110 | def predict(self , inputs): 111 | if len(inputs) != self.input_layer_len: 112 | raise Exception 113 | hidden_outputs = self.hidden_layer.predict(inputs) 114 | outputs = self.output_layer.predict(hidden_outputs) 115 | return outputs 116 | 117 | def _train_predict(self , inputs): 118 | if len(inputs) != self.input_layer_len: 119 | raise Exception 120 | hidden_outputs = self.hidden_layer.train_predict(inputs) 121 | outputs = self.output_layer.train_predict(hidden_outputs) 122 | return outputs 123 | 124 | 125 | def train(self , inputs ,targets ): 126 | 127 | #calc output errors 128 | predicts = self._train_predict(inputs) 129 | errors = [ pre - tar for pre , tar in zip(predicts , targets)] 130 | output_deltas = self.output_layer.get_delta(errors) 131 | hidden_deltas = self.hidden_layer.get_delta(output_deltas) 132 | self.output_layer.update(output_deltas) 133 | self.hidden_layer.update(hidden_deltas) 134 | return sum((pre - tar) ** 2 for pre , tar in zip(predicts , targets)) 135 | 136 | 137 | if __name__ == "__main__": 138 | 139 | bp = Bp(2 , 2 , 1) 140 | print bp.train([ 1 , 0 ] , [1. , 1. 
,1.,1.]) 141 | -------------------------------------------------------------------------------- /moodstyle/classifier/Cart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | from collections import Counter 5 | from collections import defaultdict 6 | import cPickle 7 | import copy 8 | 9 | 10 | class Node(object): 11 | 12 | def __init__(self, split_attr, split_value): 13 | self.split_attr = split_attr 14 | self.split_value = split_value 15 | self.left_tree = None 16 | self.right_tree = None 17 | 18 | def __str__(self): 19 | return '[split_attr : %s split_value : %s ] [left tree %s] [right tree %s ] ' % (self.split_attr, self.split_value , self.left_tree , self.right_tree) 20 | 21 | 22 | class CartTree(object): 23 | 24 | def __init__(self): 25 | self.tree = None 26 | self.attrs = None 27 | self.attrs_dict = {} 28 | def load_model(self, file_path): 29 | ''' 30 | 加载模型 31 | file_path : 模型加载地址 32 | 功能 : 不管是否成功都会覆盖model 33 | ''' 34 | with open(file_path) as f: 35 | self.tree = cPickle.loads(f.readline().strip()) 36 | 37 | def save(self, model_path): 38 | if not self.tree: 39 | raise ValueError, 'no model can save!' 40 | with open(model_path, 'w') as f: 41 | f.write(cPickle.dumps(self.tree)) 42 | 43 | def __train(self, datas, labels, attrs, threshold=0.01): 44 | """train函数调用的真正训练函数 45 | """ 46 | label_dict = Counter(labels) 47 | if len(label_dict.keys()) == 1: 48 | return float(label_dict.keys()[0]) 49 | if len(attrs) == 0: 50 | return sum(label for label in labels) / float(len(labels)) 51 | attr_gain, attr, split_value, left_data, left_label, right_data, right_label = self.get_best_feature( 52 | datas, labels, attrs) # 得到最好信息增益的属性 53 | if attr_gain < threshold: 54 | return sum(label for label in labels) / float(len(labels)) 55 | node = Node(attr, split_value) 56 | child_attr = self.get_split_attr(# 为下轮切割属性 57 | attrs, attr 58 | ) 59 | #创建左子树 60 | node.left_tree = self.__train( 61 | self.split_data_by_attr(left_data , attrs , attr) , left_label, child_attr, threshold) 62 | #创建 63 | node.right_tree = self.__train( 64 | self.split_data_by_attr(right_data , attrs , attr), right_label, child_attr, threshold) 65 | return node 66 | 67 | 68 | 69 | 70 | def split_data_by_attr(self, datas, attrs, attr_name): 71 | ''' 72 | 切割训练集为了下一步 73 | datas :训练的数据 [[data]] 74 | attrs 属性名称列表 75 | attr_val 属性值 76 | dense_data 是否是密集型数据 , 暂时废弃 77 | ''' 78 | dump_datas = [] 79 | index = attrs.index(attr_name) 80 | for data in datas: 81 | dump = [] 82 | dump = data[:index] 83 | dump.extend(data[index + 1:]) 84 | dump_datas.append(dump) 85 | return dump_datas 86 | 87 | def train(self, datas, attrs, labels, threshold=0.01): 88 | self.attrs = attrs 89 | self.attrs_dict = {attr : index for index , attr in enumerate(attrs)} 90 | self.tree = self.__train(datas, labels, attrs, threshold) 91 | 92 | def get_split_attr(self, attrs, attr): 93 | split_attrs = [] 94 | index = attrs.index(attr) 95 | split_attrs.extend(attrs[:index]) 96 | split_attrs.extend(attrs[index + 1:]) 97 | return split_attrs 98 | 99 | def get_split_value(self, datas, split_index): 100 | ''' 101 | 得到cart树,分割数据点index,平均数 102 | ''' 103 | if len(datas): 104 | return sum(data[split_index] for data in datas) / float(len(datas)) 105 | raise ValueError 106 | 107 | def calc_gini(self, datas, labels, split_index, split_value): 108 | """计算属性值gini值 109 | 参数: 110 | datas 数据集合 111 | labels label集合,与datas集合对应 112 | 113 | """ 114 | labels_dict = Counter(labels) 115 | label_dist_dict = { 116 | label: defaultdict(int) 
for label in labels_dict.keys()} 117 | left_data = [] 118 | left_label = [] 119 | right_data = [] 120 | right_label = [] 121 | for i in range(len(labels)): 122 | if datas[i][split_index] > split_value: 123 | label_dist_dict[labels[i]][1] += 1. 124 | right_data.append(datas[i]) 125 | right_label.append(labels[i]) 126 | else: 127 | label_dist_dict[labels[i]][0] += 1. 128 | left_data.append(datas[i]) 129 | left_label.append(labels[i]) 130 | gini = 0. 131 | for label in labels_dict.keys(): 132 | prob = labels_dict[label] / float(len(labels)) 133 | prob_label = label_dist_dict[label][1] / float(len(labels)) 134 | gini += (prob * 2 * prob_label * (1 - prob_label)) 135 | return gini, left_data, left_label, right_data, right_label 136 | 137 | def get_best_feature(self, datas, labels, attrs): 138 | """得到datas数据中最好的分割属性 139 | params: 140 | datas 训练数据 eg,[[1 , 3, ,4]] 141 | labels 根据训练数据对应的label 142 | attrs 训练属性列表 143 | return 144 | 145 | raise: 146 | None 147 | """ 148 | gini_min = float('inf') 149 | left_data = None 150 | left_label = None 151 | right_data = None 152 | right_label = None 153 | split_attr = None 154 | split_value = None 155 | for split_index in range(len(attrs)): 156 | _split_value = self.get_split_value(datas, split_index) 157 | gini, _left_data, _left_label, _right_data, _right_label = self.calc_gini( 158 | datas, labels, split_index, _split_value) 159 | if gini < gini_min: 160 | gini_min = gini 161 | split_attr = attrs[split_index] 162 | left_data = _left_data 163 | left_label = _left_label 164 | right_data = _right_data 165 | right_label = _right_label 166 | split_value = _split_value 167 | return gini_min, split_attr, split_value, left_data, left_label, right_data, right_label 168 | 169 | def _classify(self, data, attrs, node): 170 | if isinstance(node, Node) is False: 171 | return node 172 | value = data[self.attrs_dict[node.split_attr]] 173 | if node.left_tree is None and node.right_tree is None: 174 | return value 175 | if value > node.split_value: 176 | return self._classify(data, attrs, node.right_tree) 177 | else: 178 | return self._classify(data, attrs, node.left_tree) 179 | 180 | def classify(self, data): 181 | """对输入的数据进行打分 182 | params 183 | data 输入数据,输入类型为list 184 | return 185 | value cart树返回数值 186 | raise 187 | Exception 188 | """ 189 | if data and isinstance(data , (list,tuple)) and len(data) and len(self.attrs): 190 | return self._classify(copy.copy(data), self.attrs , self.tree) 191 | else: 192 | raise ValueError,"data is list , eg [0 , 1 ,3 ...] 
; data length is equal train attrs" 193 | 194 | if __name__ == '__main__': 195 | datas = [[1, 0, 0, 0], 196 | [1, 0, 0, 1], 197 | [1, 1, 0, 1], 198 | [1, 1, 1, 0], 199 | [1, 0, 0, 0], 200 | [2, 0, 0, 0], 201 | [2, 0, 0, 1], 202 | [2, 1, 1, 1], 203 | [2, 0, 1, 2], 204 | [2, 0, 1, 2], 205 | [3, 0, 1, 2], 206 | [3, 0, 1, 1], 207 | [3, 1, 0, 1], 208 | [3, 1, 0, 2], 209 | [3, 0, 0, 0]] 210 | labels = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0] 211 | d = CartTree() 212 | d.train(datas, [1, 2, 3, 4], labels) 213 | print d.tree 214 | print d.classify([3, 1 , 0 , 0]) 215 | print d.attrs 216 | -------------------------------------------------------------------------------- /moodstyle/classifier/DecisionTree.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from collections import defaultdict 4 | from collections import Counter 5 | from math import log 6 | 7 | 8 | class Node(object): 9 | def __init__(self, attr_name=None, label=None): 10 | self.attr_name = attr_name 11 | self.label = label 12 | self.child = {} 13 | 14 | def __str__(self): 15 | child = '\t'.join( 16 | ['%s %s' % (val, str(node)) for val, node in self.child.items()]) 17 | return 'attr: %s \t label : %s \t childs : [ %s ] ' % (self.attr_name, 18 | self.label, 19 | self.child) 20 | 21 | 22 | class DecisionTree(object): 23 | def __init__(self): 24 | self.tree = None 25 | self.attrs = None 26 | 27 | def train(self, datas, attrs, threshold=0.01, denseData=True, tree=None): 28 | if self.attrs == None: 29 | self.attrs = attrs 30 | node = Node() 31 | if self.tree == None: 32 | self.tree = node 33 | label_dict = Counter([data[-1] for data in datas]) 34 | if len(label_dict.keys()) == 1: 35 | node.label = datas[0][-1] 36 | return node # 如果都输于同一类 , 则返回树 37 | if len(attrs) == 0: 38 | node.label = label_dict.most_common()[0][0] 39 | return node # 如果属性为空 , 则返回绝大数的类标记 40 | attr, attr_gain, attr_val = self.getBestFeature( 41 | datas, attrs, denseData)[0] # 得到最好信息增益的属性 42 | if attr_gain < threshold: 43 | node.label = label_dict.most_common()[0][0] 44 | return node 45 | node.attr_name = attr 46 | for val in attr_val: 47 | #按照属性不同value 区分这个 48 | #取得最好分类属性 , 按照不同该属性不同val 区分数据 ; 49 | node.child[val] = self.train( 50 | self.splitDataByAttr(datas, attrs, attr, val), 51 | self.getSplitAttrs(attrs, attr), threshold, denseData, node) 52 | return node 53 | 54 | @staticmethod 55 | def entropy(probs): 56 | if probs: 57 | if isinstance(probs, (list, tuple)): 58 | return sum([-prob * log(prob, 2) for prob in probs]) 59 | elif isinstance(probs, (int, float)): 60 | return -probs * log(probs, 2) 61 | 62 | def getSplitAttrs(self, attrs, attr): 63 | split_attrs = [] 64 | index = attrs.index(attr) 65 | split_attrs.extend(attrs[:index]) 66 | split_attrs.extend(attrs[index + 1:]) 67 | return split_attrs 68 | 69 | def getBestFeature(self, datas, attrs, denseData): 70 | ''' 71 | 通过算法获得最好分类的属性 ; 72 | 思想: 73 | 1. 信息增益 74 | 2. 信息增益率 75 | 参数: 76 | datas 训练的数据 77 | attrs 属性列表 78 | deseData 是否为密集型数据 , == True [[v1 , v2 ,v3 ,v4 .... 
vn , label]] 79 | == False [({f1 : v1 , f2:v2...fn:vn} , label1)] 80 | ''' 81 | label_dict = defaultdict(float) 82 | for data in datas: 83 | label_dict[data[-1]] += 1 84 | data_num = len(datas) # 计算此次计算信息增益的数据长度 , 样本大小 85 | label_entropy = DecisionTree.entropy([ 86 | label_count / data_num for label_count in label_dict.values() 87 | ]) # 计算整个系统的熵 88 | # 计算每个属性的熵 89 | # 声明一个属性列表 , {属性 : {属性值 : 出现的次数}} 90 | attr_value_count = {attr: defaultdict(float) for attr in attrs} 91 | # 声明属性->属性值->类别->数量 92 | attr_value_class_count = {attr: defaultdict(dict) for attr in attrs} 93 | iter_index = range(len(attrs)) 94 | for data in datas: 95 | if denseData: 96 | for i in iter_index: 97 | # 计算每个属性下不同值数量 , 此处必要转换为离散变量 98 | attr_value_count[attrs[i]][data[i]] += 1 99 | if not attr_value_class_count[attrs[i]][data[i]].has_key( 100 | data[-1]): 101 | attr_value_class_count[attrs[i]][data[i]][data[ 102 | -1]] = 0. 103 | attr_value_class_count[attrs[i]][data[i]][data[-1]] += 1.0 104 | # 信息增益计算公式分析 105 | # H(D) - H(D|A) 106 | # 系统熵 - 每个属性下 , 存在这个类别的信息熵 107 | # 108 | # gains = [(属性名称 , 信息增益 , (属性值))......(属性名称n , 信息增益n , (f1 ...fn))] 109 | gains = [ 110 | ( 111 | attr, 112 | label_entropy - sum([ 113 | attr_value_count[attr][value] / data_num * 114 | DecisionTree.entropy([ 115 | # 计算每个属性在特定属性值时 , 发生的概率 116 | # p(DA1)/A 117 | attr_value_class_count[attr][value][label] / 118 | attr_value_count[attr][value] 119 | # 循环每个属性值在特定label产生 120 | for label in attr_value_class_count[attr][value] 121 | .keys() 122 | ]) for value in attr_value_count[attr].values() 123 | if attr_value_class_count[attr].has_key(value) 124 | ]), 125 | attr_value_count[attr].keys()) 126 | for attr in attr_value_count.keys() 127 | ] 128 | return sorted(gains, key=lambda x: x[1], reverse=True) 129 | 130 | def splitDataByAttr(self, 131 | datas, 132 | attrs, 133 | attr_name, 134 | attr_value, 135 | denseData=True): 136 | ''' 137 | 切割训练集为了下一步 138 | datas :训练的数据 [[data]] 139 | attrs 属性名称列表 140 | attr_val 属性值 141 | denseData 是否是密集型数据 , 暂时废弃 142 | ''' 143 | dump_datas = [] 144 | index = attrs.index(attr_name) 145 | for data in datas: 146 | dump = [] 147 | if data[index] == attr_value: 148 | dump = data[:index] 149 | dump.extend(data[index + 1:]) 150 | dump_datas.append(dump) 151 | return dump_datas 152 | 153 | def classify(self, data): 154 | ''' 155 | 功能: 用于分类模型 156 | 参数 : 157 | data 待分析的数据 , list 158 | 返回: 159 | 返回决策树的label 160 | ''' 161 | if self.tree == None: 162 | raise Exception, 'no model !' 
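        # walk the tree from the root: look up the current node's split
        # attribute in `data` and descend into the child keyed by that
        # attribute value, until a node carrying a label is reached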
163 |         node = self.tree
164 |         if node.label != None:
165 |             return node.label
166 |         for _ in range(len(data)):
167 |             index = self.attrs.index(node.attr_name)
168 |             node = node.child[data[index]]
169 |             if node.label != None:
170 |                 return node.label
171 |         return None
172 | 
173 | 
174 | if __name__ == '__main__':
175 |     # test data
176 |     # lives in water?  has flippers?  is a fish (label)
177 |     data = [[1, 0, 1], [0, 1, 0], [1, 1, 1]]
178 |     d = DecisionTree()
179 |     d.train(data, [1, 2])
180 |     print d.classify([1, 0])
181 | 
--------------------------------------------------------------------------------
/moodstyle/classifier/GRTree.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/classifier/GRTree.py
--------------------------------------------------------------------------------
/moodstyle/classifier/Hmm1.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | from collections import defaultdict
 6 | import sys
 7 | reload(sys)
 8 | sys.setdefaultencoding('utf-8')
 9 | from cPickle import load
10 | from cPickle import dump
11 | 
12 | class HmmItem(object):
13 | 
14 |     __slots__ = ('obs', 'hide')
15 | 
16 |     def __init__(self, obs, hide):
17 |         self.obs = obs
18 |         self.hide = hide
19 | 
20 |     def __str__(self):
21 |         return 'obs_state: %s \t hide_state: %s' % (self.obs, self.hide)
22 | 
23 | 
24 | class HmmItems(list):
25 | 
26 |     '''
27 |     Stores a sequence of observed states paired with their hidden states;
28 |     main usage:
29 |         t[1] = (1, 2)
30 |         t.append(HmmItem) or t.append((obs , hide))
31 |     '''
32 | 
33 |     def __check(self, value):
34 |         if not value:
35 |             raise ValueError, 'value must not be empty!'
36 | 
37 |     def __setitem__(self, key, value):
38 |         self.__check(value)
39 |         if isinstance(value, HmmItem):
40 |             super(HmmItems, self).__setitem__(key, value)
41 |         elif isinstance(value, (tuple, list)) and len(value) == 2:
42 |             super(HmmItems, self).__setitem__(key, HmmItem(value[0], value[1]))
43 |         else:
44 |             raise TypeError, 'HmmItems accepts only an HmmItem , or a tuple/list whose first item is the obs state and second is the hide state!'
45 | 
46 |     def append(self, value):
47 |         self.__check(value)
48 |         if isinstance(value, HmmItem):
49 |             super(HmmItems, self).append(value)
50 |         elif isinstance(value, (tuple, list)) and len(value) == 2:
51 |             super(HmmItems, self).append(HmmItem(value[0], value[1]))
52 |         else:
53 |             raise TypeError, 'HmmItems accepts only an HmmItem , or a tuple/list whose first item is the obs state and second is the hide state!
' 54 | 55 | def __str__(self): 56 | return ' '.join(['%s' % str(i) for i in self]) 57 | 58 | class HmmModel(object): 59 | 60 | def __init__(self, states): 61 | if not (states and isinstance(states, (list, tuple))): 62 | raise ValueError 63 | self.states_count = defaultdict(float) 64 | self.obs_state = defaultdict(float) # obs state {obs_state : count } 65 | self.states = states # obs state list 66 | self.start_states = self.create_start_states(states) # start probability 67 | self.transition_probability = self.create_transition_probability(states) 68 | self.emission_probability = self.create_emission_probability(states) 69 | 70 | def create_start_states(self, states, init_value=0.): 71 | start_states = defaultdict(float) 72 | for state in states: 73 | start_states[state] += init_value 74 | return start_states 75 | 76 | def create_transition_probability(self, states, init_value=0.): 77 | transition_probability = {} 78 | for state in states: 79 | transition_probability[state] = defaultdict(float) 80 | for after_state in states: 81 | transition_probability[state][after_state] += init_value 82 | return transition_probability 83 | 84 | def create_emission_probability(self, states): 85 | emission_probability = {} 86 | for state in states: 87 | emission_probability[state] = defaultdict(float) 88 | return emission_probability 89 | 90 | 91 | 92 | 93 | class Hmm(object): 94 | 95 | def __init__(self, model_path): 96 | model = self.load(model_path) 97 | if model and isinstance(model, HmmModel): 98 | self.model = model 99 | else: 100 | raise TypeError, 'model file not have right hmm model! : %s' % model_path 101 | 102 | def load(self, model_path): 103 | with open(model_path, 'rb') as f: 104 | return load(f) 105 | 106 | def viterbi(self, obs): 107 | ''' 108 | 特比算法 摘自wiki 维特比算法 109 | ''' 110 | V = [{}] 111 | path = {} 112 | for y in self.model.states: 113 | V[0][y] = self.model.start_states[y] * \ 114 | self.model.emission_probability[y][obs[0]] 115 | path[y] = [y] 116 | for t in range(1, len(obs)): 117 | V.append({}) 118 | newpath = {} 119 | for y in self.model.states: 120 | (prob, state) = max( 121 | [(V[t - 1][y0] * self.model.transition_probability[y0][y] * self.model.emission_probability[y][obs[t]], y0) for y0 in self.model.states]) 122 | V[t][y] = prob 123 | newpath[y] = path[state] + [y] 124 | path = newpath 125 | (prob, state) = max([(V[len(obs) - 1][y], y) 126 | for y in self.model.states]) 127 | return (prob, path[state]) 128 | 129 | 130 | class TrainHmm(object): 131 | 132 | def __init__(self, states): 133 | self.hmm = HmmModel(states) 134 | 135 | def save(self, model_path): 136 | with open(model_path, 'wb') as f: 137 | dump(self.hmm, f) 138 | 139 | def add_items(self, hmmitems): 140 | """将序列转换为hmmitems ,添加hmm训练器 141 | params 142 | hmmitems HmmItems 143 | return 144 | None 145 | raise 146 | None 147 | """ 148 | for i in range(len(hmmitems) - 1): 149 | self.hmm.transition_probability[hmmitems[i].hide][ 150 | hmmitems[i + 1].hide] += 1 151 | self.hmm.start_states[hmmitems[0].hide] += 1 152 | for item in hmmitems: 153 | self.hmm.obs_state[item.obs] += 1 154 | self.hmm.states_count[item.hide] += 1 155 | self.hmm.emission_probability[item.hide][item.obs] += 1 156 | 157 | def translate(self): 158 | 159 | startsCount = sum(self.hmm.start_states.values()) 160 | # 计算开始状态概率 161 | for state in self.hmm.start_states.keys(): 162 | self.hmm.start_states[state] = self.hmm.start_states[ 163 | state] / startsCount 164 | # 转移矩阵 165 | hide_state_keys = self.hmm.transition_probability.keys() 166 | hide_stats_count 
= sum(self.hmm.states_count.values()) 167 | for hide_state in hide_state_keys: 168 | for after_hide_state in hide_state_keys: 169 | self.hmm.transition_probability[hide_state][after_hide_state] = ( self.hmm.transition_probability[hide_state][ 170 | after_hide_state] + 1.0) / ( self.hmm.states_count[hide_state] + hide_stats_count) 171 | # 可观察状态下的隐藏状态发生概率 172 | for hide_state in self.hmm.emission_probability.keys(): 173 | for obs_state in self.hmm.obs_state.keys(): 174 | # 注释下 : 在这个观察状态下 , 隐藏状态发生的概率 , 如果是 ( 可观察状态 in 此隐藏状态 ) / 可观察状态 175 | # in this obs state , hide state will 176 | # p(hide_state | obs_state) 177 | # p(A|B) = P(AB) / P(B) = Count(AB) / count(Br) 178 | self.hmm.emission_probability[hide_state][obs_state] = ( 179 | self.hmm.emission_probability[hide_state][obs_state] + 1.) / ( self.hmm.states_count[hide_state] + hide_stats_count ) 180 | 181 | 182 | class TrainSeg(object): 183 | 184 | def __init__(self , states = ['s' , 'e' , 'm' ,'b']): 185 | self.model = TrainHmm(states) 186 | 187 | 188 | def add_line(self , line): 189 | if len(line) != 0: 190 | words = line.decode("utf-8").split() 191 | hmmitems = [] 192 | for word in words: 193 | for item in self.word_state(word): 194 | hmmitems.append(item) 195 | self.model.add_items(hmmitems) 196 | return True 197 | 198 | def word_state(self , word): 199 | if len(word) == 0: 200 | yield 201 | elif len(word) == 1: 202 | yield HmmItem(word , 's') 203 | elif len(word) == 2: 204 | yield HmmItem(word, 'b') 205 | yield HmmItem(word , 'e') 206 | elif len(word) >=3: 207 | yield HmmItem(word[0] , 'b') 208 | for _word in word[1:-1]: 209 | yield HmmItem(_word , 'm') 210 | yield HmmItem(word[-1] , 'e') 211 | 212 | 213 | if __name__ == '__main__': 214 | 215 | t = TrainSeg() 216 | t.add_line('我 爱 中国 !') 217 | t.model.translate() 218 | 219 | -------------------------------------------------------------------------------- /moodstyle/classifier/Interface.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | 6 | 7 | class Classify(object): 8 | 9 | 10 | 11 | 12 | def predict(self , data , *argv , **kw): 13 | raise NotImplmetion 14 | 15 | 16 | class Regression(object): 17 | 18 | def predict(self , data , *argv , **kw): 19 | raise NotImplmetion 20 | -------------------------------------------------------------------------------- /moodstyle/classifier/Knn.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from collections import Counter 4 | 5 | 6 | class ClassItem(object): 7 | 8 | pass 9 | 10 | 11 | class KdNode(object): 12 | def __init__(self, split, left_child, right_child, data, parrent_node): 13 | self.split = split # 切分点 14 | self.left = left # 左子树 15 | self.right = right # 右子树 16 | self.data = data # 数据点 17 | self.parrent = pattern_node 18 | 19 | 20 | class KdTree(object): 21 | def create_kd_tree(self, datas, k, feature_len, depth): 22 | if datas == None or len(datas) == 0: 23 | return KdNode(None, None, None, None, None) 24 | 25 | split_index = self.get_split_index(datas, k, feature_len, depth) 26 | datas = sorted(datas, key=lambda x: x[split_index], reverse=True) 27 | split_data_index = len(datas) / 2 28 | data = datas[split_data_index] 29 | 30 | def get_split_index(self, datas, k, feature_len, depth): 31 | data_sum = [0] * feature_len 32 | # 计算方差,找到方差最大的列,方差越大,证明点越分散,越具有可区分度 33 | 34 | for data in datas: 35 | for i in range(feature_len): 36 | data_sum[i] += data[i] 37 | data_avg = [data_sum[i] / len(datas) for i in 
/moodstyle/classifier/Interface.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 |
4 | class Classify(object):
5 |
6 |     def predict(self , data , *argv , **kw):
7 |         raise NotImplementedError
8 |
9 |
10 | class Regression(object):
11 |
12 |     def predict(self , data , *argv , **kw):
13 |         raise NotImplementedError
--------------------------------------------------------------------------------
/moodstyle/classifier/Knn.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from collections import Counter
4 |
5 |
6 | class ClassItem(object):
7 |
8 |     pass
9 |
10 |
11 | class KdNode(object):
12 |     def __init__(self, split, left_child, right_child, data, parrent_node):
13 |         self.split = split          # split dimension
14 |         self.left = left_child      # left subtree
15 |         self.right = right_child    # right subtree
16 |         self.data = data            # data point stored at this node
17 |         self.parrent = parrent_node # parent node
18 |
19 |
20 | class KdTree(object):
21 |     def create_kd_tree(self, datas, k, feature_len, depth):
22 |         if datas is None or len(datas) == 0:
23 |             return None
24 |         split_index = self.get_split_index(datas, k, feature_len, depth)
25 |         datas = sorted(datas, key=lambda x: x[split_index], reverse=True)
26 |         split_data_index = len(datas) / 2
27 |         data = datas[split_data_index]
28 |         # recurse on the two halves around the median point
29 |         node = KdNode(split_index, None, None, data, None)
30 |         node.left = self.create_kd_tree(datas[:split_data_index], k, feature_len, depth + 1)
31 |         node.right = self.create_kd_tree(datas[split_data_index + 1:], k, feature_len, depth + 1)
32 |         return node
33 |
34 |     def get_split_index(self, datas, k, feature_len, depth):
35 |         data_sum = [0] * feature_len
36 |         # pick the column with the largest variance: the more spread out a
37 |         # dimension is, the better it separates the points
38 |         for data in datas:
39 |             for i in range(feature_len):
40 |                 data_sum[i] += data[i]
41 |         data_avg = [data_sum[i] / len(datas) for i in range(feature_len)]
42 |         data_chi = [0] * feature_len
43 |         for data in datas:
44 |             for i in range(len(data)):
45 |                 data_chi[i] += (data[i] - data_avg[i]) ** 2
46 |         return sorted(
47 |             [(data_chi[i], i) for i in range(feature_len)],
48 |             key=lambda x: x[0],
49 |             reverse=True)[0][1]
50 |
51 |
52 | class Knn(object):
53 |     def __init__(self, train_data, labels, top_n):
54 |         self.train_data = train_data
55 |         self.labels = labels
56 |         self.top_n = top_n
57 |
58 |     def classify(self, data):
59 |         # sort neighbours by distance, then vote among the top_n labels;
60 |         # self.distance is expected from a distance mixin (see DDistance)
61 |         label_orders = sorted(
62 |             [(self.distance(data, self.train_data[i]), self.labels[i])
63 |              for i in range(len(self.train_data))],
64 |             key=lambda x: x[0])
65 |         return Counter(
66 |             label for _, label in label_orders[:self.top_n]).most_common(1)[0][0]
--------------------------------------------------------------------------------
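Knn.classify relies on a self.distance method that the class itself does not define; as with the clustering code, it is natural to mix one in from DDistance. A small usage sketch (DKnn is an illustrative pairing, not shipped by the library):

# coding=utf-8
from moodstyle.classifier.Knn import Knn
from moodstyle.cluster.DDistance import DefaultDistance

class DKnn(Knn, DefaultDistance):  # hypothetical mixin, mirrors DKmeans-style usage
    pass

train = [[1, 1], [1, 2], [8, 8], [9, 8]]
labels = ['a', 'a', 'b', 'b']
knn = DKnn(train, labels, 3)
print knn.classify([8, 9])  # 'b' -- two of the three nearest points are 'b'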
/moodstyle/classifier/LinerModel.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | # Purpose:
3 | #   perceptron implementation
4 | # References:
5 | #   http://www.hankcs.com/ml/the-perceptron.html
6 | #   http://zh.wikipedia.org/wiki/%E6%84%9F%E7%9F%A5%E5%99%A8
7 | #   http://blog.csdn.net/cscmaker/article/details/8296171
8 | # Idea:
9 | #   Min(loss(yi - yt)**2)
10 | # Algorithm:
11 | #   gradient descent
12 |
13 | import Interface
14 | from ..common import DataSet
15 | from b2 import exceptions2
16 |
17 |
18 | class LinerModel(Interface.Classify):
19 |
20 |     def __init__(self , w , learn_rate = 0.1 , labels = [1 , -1]):
21 |         """liner model
22 |         param:w:number of features per data point:int
23 |         param:learn_rate:learning rate , default 0.1:float
24 |         param:labels:class labels , default [1 , -1]:list
25 |         """
26 |         self.ratios = [0.] * w
27 |         self.weight_len = xrange(w)
28 |         self.b = 0
29 |         self.data_range = xrange(w)
30 |         self.r = learn_rate
31 |         self.labels = labels
32 |
33 |     def __train(self , data , label):
34 |         yt = self.predict(data)
35 |         if yt == label:
36 |             return
37 |         # perceptron update: w += r * y * x ; b += r * y
38 |         for i in self.weight_len:
39 |             self.ratios[i] += self.r * label * data[i]
40 |         self.b += self.r * label
41 |
42 |     def predict(self , data , *argv , **kw):
43 |         """predict with the linear model
44 |         param:data:data to predict , e.g. [0 , 1]:list/tuple/dict
45 |         return:predict value:int:model predict label
46 |         raise:None
47 |         test:
48 |         >>> model = LinerModel(2 , 0.1)
49 |         >>> samples = [[[3, 3], 1], [[4, 3], 1], [[1, 1], -1], [[2, 2], -1] , [[7, 3] , 1] , [[-1 , -1] , -1]]
50 |         >>> datas = [sample[0] for sample in samples]
51 |         >>> labels = [sample[1] for sample in samples]
52 |         >>> model.train(datas , labels)
53 |         >>> model.predict(datas[-1]) == -1
54 |         True
55 |         """
56 |         yt = sum( self.ratios[i] * data[i] for i in self.data_range ) + self.b
57 |         return min([ ((yt - label) ** 2 , label) for label in self.labels])[1]
58 |
59 |     def train(self , datas , labels , *argv , **kw):
60 |         exceptions2.judge_null(datas)
61 |         exceptions2.judge_null(labels)
62 |         exceptions2.judge_type(datas , (list , tuple , DataSet.DataSet))
63 |         for data , label in zip(datas , labels):
64 |             self.__train(data , label)
--------------------------------------------------------------------------------
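To make the update rule in __train concrete, here is one hand-computed step (a sketch; the numbers are only illustrative):

# one perceptron step on a misclassified sample, mirroring __train above
# start: ratios = [0., 0.], b = 0, learn rate r = 0.1
# sample x = [3, 3] with label y = 1 scores yt = 0, which the label rule
# resolves to -1 (tie broken toward -1), so the weights move toward the sample:
#     ratios[i] += r * y * x[i]  ->  ratios = [0.3, 0.3]
#     b         += r * y         ->  b = 0.1
# afterwards yt = 0.3 * 3 + 0.3 * 3 + 0.1 = 1.9, much closer to +1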
/moodstyle/classifier/Logistic.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from math import exp
4 |
5 |
6 | class Logistic(object):
7 |
8 |     def train(self, datas, labels, alpha=0.001):
9 |         self.params = [1. for _ in range(len(datas[0]))]
10 |         self.labels = set(labels)
11 |         for i in range(len(datas)):
12 |             h = self.sigmod(self.predict_value(datas[i]))
13 |             # stochastic gradient ascent on the log-likelihood:
14 |             # params[j] += alpha * x[j] * (y - h)
15 |             error = labels[i] - h
16 |             for j in range(len(self.params)):
17 |                 self.params[j] += (alpha * datas[i][j] * error)
18 |
19 |     def predict_value(self, data):
20 |         return sum([data[i] * self.params[i] for i in range(len(self.params))])
21 |
22 |     def classify(self, data):
23 |         # map the raw score through the sigmoid, then pick the nearest label
24 |         _val = self.sigmod(self.predict_value(data))
25 |         return min([(abs(label - _val), label) for label in self.labels])[1]
26 |
27 |     def sigmod(self, x):
28 |         return 1. / (1 + exp(-x))
29 |
30 |
31 | if __name__ == '__main__':
32 |     data = []
33 |     labels = []
34 |     from random import randint
35 |     for _ in range(10000):
36 |         x = randint(1 , 10)
37 |         y = randint(1 , 10)
38 |         data.append((x , y))
39 |         if x <= y:
40 |             labels.append(1)
41 |         else:
42 |             labels.append(0)
43 |     b = Logistic()
44 |     b.train(data, labels)
45 |     print b.classify([2, 5])
--------------------------------------------------------------------------------
/moodstyle/classifier/RandomForst.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 |
4 | class RF(object):
5 |
6 |     pass
--------------------------------------------------------------------------------
/moodstyle/classifier/RegressionTree.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 |
4 | class TreeNode(object):
5 |
6 |     def __init__(self , value = None):
7 |         self.value = value
8 |         self.left_leaf = None
9 |         self.right_leaf = None
10 |         self.split_value = None
11 |
12 |
13 | class RegressionTree(object):
14 |
15 |     def __init__(self):
16 |         self.tree = TreeNode()
17 |
18 |     def train(self , datasets , targets):
19 |         # NOTE: only the split search below is implemented; building the
20 |         # tree from the candidate split values is still a stub
21 |         feature_split_values = self.init_split_value(datasets , targets)
22 |
23 |     def loss(self , datasets , labels , attr , split_value):
24 |         """loss function of the regression tree
25 |         param:datasets:class training data set
26 |         param:labels:list target values to fit
27 |         param:attr:int index of the attribute being split
28 |         param:split_value:float split value for the attribute
29 |         return:error:the summed squared loss , or None if the split is invalid
30 |         raise:None
31 |         """
32 |         c1 , c2 = self.get_target_avg(datasets , labels , attr , split_value)
33 |         if c1 is None or c2 is None:
34 |             return None
35 |         error = None
36 |         for i in xrange(len(datasets)):
37 |             if datasets[i][attr] is None:
38 |                 continue
39 |             if error is None:
40 |                 error = 0.
41 |             if datasets[i][attr] > split_value:
42 |                 error += (labels[i] - c1) ** 2
43 |             else:
44 |                 error += (labels[i] - c2) ** 2
45 |         return error
46 |
47 |     def get_target_avg(self , datasets , targets , attr , split_value):
48 |         """compute the two target means induced by a split point
49 |         param:datasets:class training data set
50 |         param:targets:list target values of the training data
51 |         param:attr:int index of the attribute being split
52 |         param:split_value:float split value for the attribute
53 |         return:(c1 , c2) where c1 is the mean target of rows with attr >
54 |             split_value and c2 the mean of the rest , or (None , None) if
55 |             either side is empty
56 |         raise:None
57 |         """
58 |         c1 , c2 = 0. , 0.
59 |         c1_count , c2_count = 0 , 0
60 |         for i in xrange(len(datasets)):
61 |             if datasets[i][attr] is None:
62 |                 continue
63 |             if datasets[i][attr] > split_value:
64 |                 c1 += targets[i]
65 |                 c1_count += 1
66 |             else:
67 |                 c2 += targets[i]
68 |                 c2_count += 1
69 |         if c1_count == 0 or c2_count == 0:
70 |             return None , None
71 |         return c1 / c1_count , c2 / c2_count
--------------------------------------------------------------------------------
/moodstyle/classifier/__init__.py:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/classifier/__init__.py
--------------------------------------------------------------------------------
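A hand-computed check of the split criterion above (the values are illustrative):

# loss()/get_target_avg() on a single attribute:
# datasets = [[1], [2], [10], [11]], targets = [1, 2, 10, 11], attr = 0
# split_value = 5: c1 = (10 + 11) / 2 = 10.5, c2 = (1 + 2) / 2 = 1.5
# loss = (10 - 10.5)**2 + (11 - 10.5)**2 + (1 - 1.5)**2 + (2 - 1.5)**2 = 1.0
# split_value = 10.5: c1 = 11, c2 = (1 + 2 + 10) / 3 ~= 4.33, loss ~= 48.7
# so the search would keep the split at 5, which separates the targets well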
/moodstyle/cluster/Canopy.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | #!/usr/bin/env python
3 |
4 | from random import randint
5 | from random import sample
6 | from b2 import exceptions2
7 | import DDistance
8 |
9 |
10 | class Canopy(object):
11 |
12 |     def __init__(self, centre):
13 |         self.centre = centre  # centre point
14 |         self.datas = []       # covered data; created per instance so canopies do not share one list
15 |
16 |     def __str__(self):
17 |         return '%s : [ %s ]' % (
18 |             str(self.centre), ','.join([str(data) for data in self.datas]))
19 |
20 |
21 | class CanopyCluster(object):
22 |     """canopy is a coarse clustering algorithm driven by two thresholds:
23 |         t1 outer radius
24 |         t2 inner radius (t2 < t1)
25 |     procedure:
26 |         while the data list is not empty:
27 |             pick a random element as a centre and build a canopy
28 |             remove that element
29 |             for every remaining element, compute its distance to the centre:
30 |                 if distance < t1: add it to the canopy
31 |                 if distance < t2: also remove it from the data list
32 |             append the canopy to the results
33 |     idea:
34 |         the two radii prune elements cheaply, cutting distance
35 |         computations, and give a useful k estimate for kmeans
36 |     """
37 |     def __init__(self, t1, t2, calc_distance=DDistance.DefaultDistance()):
38 |         exceptions2.judge_null(calc_distance)
39 |         exceptions2.judge_type(calc_distance, DDistance.DDdistance)
40 |         exceptions2.judge_type(t1, (int, float))
41 |         exceptions2.judge_type(t2, (int, float))
42 |         exceptions2.judge_smaller(t2, t1)
43 |         self.t1 = t1
44 |         self.t2 = t2
45 |         self._calc_distance = calc_distance
46 |
47 |     def cluster(self, datas):
48 |         canopys = []
49 |         while len(datas) > 0:
50 |             rand_center = randint(0, len(datas) - 1)
51 |             canopy = Canopy(datas[rand_center])
52 |             del datas[rand_center]
53 |             index = 0
54 |             # iterate by hand: deleting inside ``for i in range(...)`` would skip elements
55 |             while index < len(datas):
56 |                 distance = self._calc_distance.distance(canopy.centre, datas[index])
57 |                 if distance < self.t1:
58 |                     canopy.datas.append(datas[index])
59 |                     if distance < self.t2:
60 |                         del datas[index]
61 |                         index = index - 1
62 |                 index = index + 1
63 |             canopys.append(canopy)
64 |         return canopys
--------------------------------------------------------------------------------
/moodstyle/cluster/DDistance.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import math
4 |
5 |
6 | class DDdistance(object):
7 |
8 |     def distance(self, data1, data2):
9 |         raise NotImplementedError
10 |
11 |
12 | class Manhattan(DDdistance):
13 |
14 |     """
15 |     Manhattan distance: sum of absolute coordinate differences
16 |     """
17 |
18 |     def distance(self, data1, data2):
19 |         if len(data1) != len(data2):
20 |             raise ValueError
21 |         return sum([abs(data1[i] - data2[i]) for i in range(len(data1))])
22 |
23 |
24 | class DefaultDistance(DDdistance):
25 |
26 |     """
27 |     Euclidean distance, used as the default metric
28 |     """
29 |
30 |     def distance(self, data1, data2):
31 |         return math.sqrt(
32 |             sum([
33 |                 (data1[i] - data2[i]) ** 2
34 |                 for i in range(len(data1))
35 |             ])
36 |         )
37 |
38 |
39 | class Chebyshev(DDdistance):
40 |
41 |     """
42 |     Chebyshev distance: largest coordinate difference
43 |     """
44 |
45 |     def distance(self, data1, data2):
46 |         if len(data1) != len(data2):
47 |             raise ValueError
48 |         return max([abs(data1[i] - data2[i]) for i in range(len(data1))])
49 |
50 |
51 | class Cosine(DDdistance):
52 |
53 |     """
54 |     cosine distance: 1 - cosine similarity, so smaller means closer
55 |     """
56 |
57 |     def distance(self, data1, data2):
58 |         if len(data1) != len(data2):
59 |             raise ValueError
60 |         return 1 - sum([data1[i] * data2[i] for i in range(len(data1))]) / (
61 |             math.sqrt(sum([data ** 2 for data in data1])) *
62 |             math.sqrt(sum([data ** 2 for data in data2]))
63 |         )
64 |
65 |
66 | class Hamming(DDdistance):
67 |
68 |     """
69 |     Hamming distance: fraction of positions that differ
70 |     """
71 |
72 |     def distance(self, data1, data2):
73 |         return sum([1 if data1[i] != data2[i] else 0 for i in range(len(data1))]) / float(len(data1))
74 |
75 |
76 | class Euclidean(DDdistance):
77 |
78 |     """
79 |     Euclidean distance (same metric as DefaultDistance)
80 |     """
81 |
82 |     def distance(self, data1, data2):
83 |         return math.sqrt(sum([(data1[i] - data2[i]) ** 2 for i in range(len(data1))]))
--------------------------------------------------------------------------------
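A quick sanity check of the metrics above (a sketch; values rounded):

# coding=utf-8
from moodstyle.cluster.DDistance import Manhattan, DefaultDistance, Chebyshev, Hamming

a, b = [1, 2, 3], [4, 2, 1]
print Manhattan().distance(a, b)        # |1-4| + |2-2| + |3-1| = 5
print DefaultDistance().distance(a, b)  # sqrt(9 + 0 + 4) ~= 3.606
print Chebyshev().distance(a, b)        # max(3, 0, 2) = 3
print Hamming().distance(a, b)          # 2 of 3 positions differ ~= 0.667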
/moodstyle/cluster/DbScan.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | from random import randint
4 | from ..common.BaseStrut import WeightArray
5 | from b2 import exceptions2
6 | import DDistance
7 |
8 |
9 | class ClusterItem(object):
10 |
11 |     def __init__(self , data):
12 |         self.data = data
13 |         self.neighbours = []
14 |         self.visited = False
15 |         self.cluster = 0
16 |
17 |
18 | class DbScan(object):
19 |
20 |     def __init__(self, radius, minPoint, distance=DDistance.DefaultDistance()):
21 |         exceptions2.judge_type(distance, DDistance.DDdistance)
22 |         exceptions2.judge_null(radius)
23 |         exceptions2.judge_null(minPoint)
24 |         self.distance = distance
25 |         self.radius = radius
26 |         self.minPoint = minPoint
27 |
28 |     def cluster(self, datas):
29 |         '''
30 |         algorithm: DBSCAN
31 |         params:
32 |             radius neighbourhood radius
33 |             minPoint minimum number of neighbours within radius for a
34 |                 point to count as a core object
35 |         output: list of ClusterItem tagged with cluster ids (-1 marks noise)
36 |         method:
37 |             repeat
38 |                 1) decide whether the current point is a core object
39 |                 2) collect all points directly density-reachable from it
40 |             until every input point has been judged
41 |             repeat
42 |                 merge the density-reachable sets of core objects into
43 |                 maximal density-connected clusters
44 |             until every core object's neighbourhood has been visited
45 |         '''
46 |         # WeightArray keys on datas[i][0], so the first column must be a unique label
47 |         weight_map = WeightArray(datas , self.distance.distance)
48 |         items = [ClusterItem(data) for data in datas]
49 |         k = 1
50 |         for i in range(len(items)):
51 |             if items[i].visited == False:
52 |                 neighbours = [items[j] for j in range(len(items)) if i != j and weight_map[(i , j)] < self.radius]
53 |                 if len(neighbours) >= self.minPoint:
54 |                     items[i].visited = True
55 |                     items[i].cluster = k
56 |                     for neighbour in neighbours:
57 |                         if neighbour.visited == False or neighbour.cluster == -1:
58 |                             neighbour.cluster = k
59 |                             neighbour.visited = True
60 |                             items[i].neighbours.append(neighbour)
61 |                         elif neighbour.visited == True and neighbour.cluster != -1:
62 |                             # merge: pull the neighbour's cluster into this one
63 |                             neighbour.cluster = k
64 |                             for item in neighbour.neighbours:
65 |                                 item.cluster = k
66 |                             items[i].neighbours.extend(neighbour.neighbours)
67 |                             del neighbour.neighbours[:]
68 |                     k += 1
69 |                 else:
70 |                     items[i].visited = True
71 |                     items[i].cluster = -1  # provisional noise; may be adopted later
72 |         return items
--------------------------------------------------------------------------------
/moodstyle/cluster/HCluster.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from ..common.BaseStrut import WeightArray
4 | from b2 import exceptions2
5 | import DDistance
6 |
7 |
8 | class HierarchicalClustering(object):
9 |
10 |     def __init__(self , distance = DDistance.DefaultDistance()):
11 |         exceptions2.judge_type(distance , DDistance.DDdistance)
12 |         self._distance_handler = distance
13 |
14 |     def cluster(self, datas, cluster_num, threshold=0.03):
15 |         # distance lookup table over all pairs
16 |         distance_map = WeightArray(datas, self._distance_handler.distance)
17 |         # start with every data point in its own cluster
18 |         clusters = [[datas[i]] for i in range(len(datas))]
19 |         # keep merging while there are more clusters than requested
20 |         while len(clusters) > cluster_num:
21 |             min_distance = None      # smallest distance seen this round
22 |             min_cluster_pair = None  # the pair of clusters it belongs to
23 |             for i in range(len(clusters)):
24 |                 for j in range(i + 1, len(clusters)):
25 |                     d = self.get_cluster_distance(
26 |                         clusters[i], clusters[j], distance_map)
27 |                     if d < threshold and (min_distance is None or d < min_distance):
28 |                         min_distance = d
29 |                         min_cluster_pair = (i, j)
30 |             if min_cluster_pair:
31 |                 clusters[min_cluster_pair[0]].extend(
32 |                     clusters[min_cluster_pair[1]])
33 |                 del clusters[min_cluster_pair[1]]
34 |             else:
35 |                 break
36 |         return clusters
37 |
38 |     def get_cluster_distance(self, cluster1, cluster2, distance_map):
39 |         """
40 |         function
41 |             distance between two clusters (linkage criterion)
42 |         params:
43 |             cluster1 first cluster
44 |             cluster2 second cluster
45 |             distance_map WeightArray instance
46 |         return
47 |             the distance between the two clusters
48 |         """
49 |         raise NotImplementedError
50 |
51 |
52 | class ALHierarchicalClustering(HierarchicalClustering):
53 |     """
54 |     average linkage: mean pairwise distance between the two clusters
55 |     """
56 |     def get_cluster_distance(self, cluster1, cluster2, distance_map):
57 |         return sum([sum(distance_map[(data1[0], data2[0])] for data2 in cluster2) for data1 in cluster1]) / float(len(cluster1) * len(cluster2))
58 |
59 |
60 | class SLHierarchicalClustering(HierarchicalClustering):
61 |     """
62 |     single linkage: distance between the two closest members
63 |     """
64 |     def get_cluster_distance(self, cluster1, cluster2, distance_map):
65 |         return min([min(distance_map[(data1[0], data2[0])] for data2 in cluster2) for data1 in cluster1])
66 |
67 |
68 | class CLHierarchicalClustering(HierarchicalClustering):
69 |     """
70 |     complete linkage: distance between the two farthest members
71 |     """
72 |     def get_cluster_distance(self, cluster1, cluster2, distance_map):
73 |         return max([max(distance_map[(data1[0], data2[0])] for data2 in cluster2) for data1 in cluster1])
--------------------------------------------------------------------------------
/moodstyle/cluster/Kmeans.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | #!/usr/bin/env python
3 |
4 | from random import randint
5 | from random import sample
6 | from collections import Counter
7 | from collections import defaultdict
8 | from math import sqrt
9 | import re
10 | import DDistance
11 | from copy import copy
12 | import sys
13 | from b2 import exceptions2
14 | from b2 import sort2
15 | """
16 | expected data format: [data1, data2, ...]; a def distance(data1, data2)
17 | method must be provided (mixed in) to compare data points
18 | internal format: {label: {data index: distance}}
19 | """
20 |
21 |
22 | class Center(object):
23 |     """a cluster centre: a label plus a centre vector
24 |     """
25 |
26 |     def __init__(self,
27 |                  label,
28 |                  center_vector,
29 |                  distance=DDistance.DefaultDistance()):
30 |         exceptions2.judge_type(label, (int, long, basestring))
31 |         exceptions2.judge_type(center_vector, (list, tuple))
32 |         exceptions2.judge_type(distance, DDistance.DDdistance)
33 |         self.label = label
34 |         self.vector = center_vector
35 |         self._distance = distance
36 |
37 |     def __sub__(self, value):
38 |         exceptions2.judge_null(value)
39 |         if isinstance(value, (list, tuple)):
40 |             return self._distance(self.vector, value)
41 |         elif isinstance(value, Center):
42 |             return self._distance(self.vector,
value.vector) 43 | elif hasattr(value, "vector") and isinstance( 44 | getattr(value, "vector"), (list, tuple)): 45 | return self._distance(self.vector, value.vector) 46 | else: 47 | raise TypeError 48 | 49 | 50 | class Kmeans(object): 51 | def cluster(self, datas, k, iter_count=10000, diff=0.00001): 52 | """ 53 | 函数功能: 54 | 对数据进行聚类 , 通过kmeans 算法 55 | 过程: 56 | 随机从数据选出centers (一个随机过程) 57 | 开始迭代 58 | 循环每个数据: 59 | 计算数据与每个中心距离 , 找到一个最小值 60 | 如果 数据原有label 和现有label 不同: 61 | diff_labels += 1 62 | 计算数据label变化比率 , 如果超出diff设置值 , 继续下轮迭代 63 | 否则 , 跳出循环 64 | 返回数据labels 65 | """ 66 | centers = self.rand_seed(datas, k) 67 | center_range = range(len(centers)) 68 | data_range = range(len(datas)) 69 | labels = [-1 for i in data_range] 70 | for _ in range(iter_count): 71 | diff_labels = 0 72 | for i in data_range: 73 | bestlabel = min([(self.distance(datas[i], centers[j][0]), 74 | centers[j][1]) for j in center_range]) 75 | if labels[i] != bestlabel[1]: 76 | diff_labels += 1 77 | labels[i] = bestlabel[1] 78 | if float(diff_labels) / len(datas) < diff: 79 | break 80 | centers = self.update_centers(datas, labels, centers) 81 | return labels, centers 82 | 83 | def rand_seed(self, datas, k): 84 | rand_seeds = sample(datas, k) 85 | rand_seeds = [(copy(rand_seeds[i]), i) for i in range(len(rand_seeds))] 86 | return rand_seeds 87 | 88 | def update_centers(self, datas, labels, centers): 89 | centers_dict = { 90 | center[1]: [0 for i in range(len(center[0]))] 91 | for center in centers 92 | } 93 | label_dict = Counter(labels) 94 | for i in range(len(datas)): 95 | for j in range(len(datas[i])): 96 | centers_dict[labels[i]][j] += datas[i][j] 97 | for label in label_dict.keys(): 98 | for i in range(len(centers_dict[label])): 99 | centers_dict[label][i] /= label_dict[label] 100 | return sorted( 101 | [(center, label) for label, center in centers_dict.items()], 102 | key=lambda x: x[1], 103 | reverse=False) 104 | -------------------------------------------------------------------------------- /moodstyle/cluster/KmeansPlusPlus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from Kmeans import Kmeans 4 | from Kmeans import DKmeans 5 | from random import randint 6 | from random import random 7 | from copy import copy 8 | 9 | 10 | class KmeansPlusPlus(Kmeans): 11 | 12 | def rand_seed(self, datas, k): 13 | ''' 14 | function: 15 | kmeans++与kmeans最大不同点就是种子生成算法不同 16 | params: 17 | datas 聚类数据 18 | k 聚类数目 19 | ''' 20 | seeds = [( 21 | copy(datas[randint(0, len(datas) - 1)]), 0 22 | )] #初始化种子库 ,随机一个种子 23 | #获取剩余种子 24 | for k_iter in range(k - 1): 25 | ds = [] #种子距离 26 | for data in datas: 27 | ds.append( 28 | min(self.distance(seed[0], data) 29 | for seed in seeds)) 30 | sum_distance = sum(ds) 31 | rand_distance = random() * sum_distance 32 | for i in range(len(ds)): 33 | rand_distance -= ds[i] 34 | if rand_distance <= 0: 35 | seeds.append((copy(datas[i]), k_iter + 1)) 36 | break 37 | return seeds 38 | 39 | 40 | class DKmeansPlusPlus(KmeansPlusPlus , DKmeans): 41 | 42 | pass 43 | 44 | 45 | 46 | if __name__ == '__main__': 47 | k = DKmeansPlusPlus() 48 | datas = [[randint(0, 20) * 1.0, randint(0, 20) * 1.0] for _ in range(200)] 49 | labels = k.cluster(datas, 5, 200 , diff = 0.00001) 50 | print labels 51 | 52 | -------------------------------------------------------------------------------- /moodstyle/cluster/MiniBatchKMeans.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | from Kmeans import Kmeans 5 | from 
Kmeans import Center 6 | from DDistance import DefaultDistance 7 | from random import randint 8 | import random 9 | from collections import Counter 10 | from collections import defaultdict 11 | from math import sqrt 12 | from Kmeans import DKmeans 13 | from copy import copy 14 | import sys 15 | 16 | 17 | class MiniBatchKmeans(Kmeans): 18 | 19 | def cluster(self, datas, k, iter_count=10000, diff=0.00001): 20 | """ 21 | k center count 22 | """ 23 | if k > len(datas): 24 | return datas 25 | centers = self.rand_seed(datas, k) 26 | center_range = range(k) 27 | data_range = range(len(datas)) 28 | sample_rate = 0.3 29 | sample_data_count = int(len(datas) * sample_rate) 30 | sample_data_range = range(sample_data_count) 31 | for _ in range(iter_count): 32 | sample_data = random.sample(datas, sample_data_count) 33 | distance_vector = [-1] * sample_data_count 34 | center_counts = [0] * k 35 | for i in sample_data_range: 36 | mini_distance, bestlabel = min( 37 | [ 38 | ( 39 | self.distance( 40 | datas[i], 41 | centers[j] 42 | ), j 43 | ) 44 | for j in center_range 45 | ] 46 | ) 47 | distance_vector[i] = bestlabel 48 | for i in sample_data_range: 49 | data_label = distance_vector[i] 50 | center_counts[data_label] += 1 51 | eta = 1.0 / center_counts[data_label] 52 | centers[data_label] = self.add( 53 | centers[data_label], 54 | sample_data[i], 55 | eta, 56 | len(sample_data[i]) 57 | ) 58 | return centers 59 | 60 | def rand_seed(self, datas, k): 61 | return [copy(data) for data in random.sample(datas, k)] 62 | 63 | def add(self, center, data, eta, data_len): 64 | _center = [i * (1.0 - eta) for i in center] 65 | _data = [eta * i for i in data] 66 | return [_center[i] + _data[i] for i in range(data_len)] 67 | 68 | 69 | class DMiniBatchKmeans(MiniBatchKmeans, DefaultDistance): 70 | pass 71 | 72 | 73 | if __name__ == '__main__': 74 | k = DMiniBatchKmeans() 75 | datas = [[randint(1, 20), randint(1, 20), randint( 76 | 1, 20), randint(1, 20), randint(1, 20)] for _ in range(100)] 77 | labels = k.cluster(datas, 5, 200, diff=0.00001) 78 | print labels 79 | -------------------------------------------------------------------------------- /moodstyle/cluster/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/cluster/__init__.py -------------------------------------------------------------------------------- /moodstyle/common/Array.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | 因为对数据进行训练的时候,需要采取抽样方式,让数据更加随机,比如gbdt/随机森林训练的时候; 4 | 会采用随机抽样方式进行训练 5 | PoolArray :蓄水池抽样方法 6 | WaterSample :流式抽样方法 7 | SampleArray : 抽样方法的工厂类 8 | ''' 9 | from random import randint 10 | from random import random 11 | 12 | class PoolArray(objects): 13 | 14 | 15 | def __init__(self , objects , sample_rate ): 16 | self.sample_rate = sample_rate 17 | self.objects = objects 18 | self.data_len = len(objects) 19 | self.sample_num = int( self.data_len * sample_rate ) 20 | self.rand_index = self.create_rand_list() 21 | self.index = -1 22 | 23 | 24 | def create_rand_list(self): 25 | index = 0 26 | sample_list = [] 27 | while index < self.data_len: 28 | if index < self.sample_num: 29 | sample_list.append(index) 30 | else: 31 | rand_index = randint(0 , index) 32 | if rand_index < self.sample_num: 33 | sample_list[rand_index] = index 34 | index += 1 35 | return sample_list 36 | 37 | def next(self): 38 | self.index += 1 39 | if self.index < 
self.data_len: 40 | return self.objects[self.index] 41 | else: 42 | raise StopIteration 43 | 44 | 45 | class WaterSample(object): 46 | 47 | 48 | def __init__(self , objects , sample_rate): 49 | self.sample_rate = sample_rate 50 | self.objects = objects 51 | self.data_len = len(objects) 52 | self.index = 0 53 | 54 | 55 | def get_rand_score(self ): 56 | return random(0 , 1) 57 | 58 | def next(self): 59 | while self.get_rand_score() > sample_rate: 60 | self.index += 1 61 | if self.index < self.data_len: 62 | return self.objects[self.index] 63 | raise StopIteration 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | class SampleArray(object): 74 | 75 | 76 | def __init__(self , objects , sample_class = WaterSample): 77 | if objects is not None and hasattr(objects , '__iter__') and not isinstance(objects ,basestring): 78 | self.objects = objects 79 | self.data_len = len(self.objects) 80 | else: 81 | raise ValueError 82 | 83 | 84 | def __iter__(self): 85 | return sample_class(self.objects , self.sample_rate) 86 | 87 | 88 | 89 | 90 | def __getitem__(self , index ): 91 | if isinstance(index , (int , long)) and index >= 0 and index < self.data_len: 92 | return self.objects[index] 93 | else: 94 | raise ValueError 95 | 96 | 97 | -------------------------------------------------------------------------------- /moodstyle/common/BaseStrut.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | class WeightArray(object): 6 | 7 | 8 | def __init__(self, datas, distance_fun): 9 | ''' 10 | function: 11 | init 12 | params: 13 | datas 数据 14 | distance_fun 计算两个数据之间的距离 15 | ''' 16 | self.lable_dict = {datas[index][0]:index for index in range(len(datas))} 17 | self.distance_map = self.create_distance_map(datas, distance_fun) 18 | self.data_len = len(datas) 19 | 20 | 21 | 22 | def __getitem__(self, label_tuple): 23 | label1, label2 = label_tuple 24 | if self.lable_dict.has_key(label1) and self.lable_dict.has_key(label2): 25 | index1 = self.lable_dict[label1] 26 | index2 = self.lable_dict[label2] 27 | return self.get_distance_by_index(index1 , index2) 28 | raise IndexError, 'index : %s , index2 : %s not in this distance_map' 29 | 30 | 31 | 32 | def get_distance_by_index(self , row , line ): 33 | ''' 34 | function: 35 | 下半角矩阵 , 转换坐标 36 | 37 | ''' 38 | if line > row : 39 | tmp = row 40 | row = line 41 | line = tmp 42 | return self.distance_map[row][line] 43 | 44 | 45 | 46 | def create_distance_map(self, datas, distance_fun): 47 | ''' 48 | function: 49 | 创建数据距离map 50 | params: 51 | datas 数据,格式 [[label1 , x1 ,x2...,xN ] , [lable2 , x1 , x2 , ..., xN]....[labelN , x1, x2 , ...xN] ] 52 | distance_fun 距离公式 , 参数是data1 , data2 53 | return distance 54 | return 55 | datas_map 56 | ''' 57 | if distance_fun is None or not callable(distance_fun): 58 | raise ValueError , 'distance_fun is calc data distance function !' 
59 | distance_map = [] 60 | for i in range(len(datas)): 61 | tmp_distance = [] 62 | for j in range(i + 1): 63 | if i == j: 64 | tmp_distance.append(0) 65 | else: 66 | tmp_distance.append(distance_fun(datas[i], datas[j])) 67 | distance_map.append(tmp_distance) 68 | return distance_map 69 | 70 | 71 | 72 | class Normalization(object): 73 | 74 | 75 | def __init__(self, *argv , **kw): 76 | pass 77 | 78 | 79 | def update(self , value): 80 | raise NotImplementedError 81 | 82 | 83 | def get_normalization(self ,value ): 84 | raise NotImplementedError 85 | 86 | 87 | 88 | class MinMax(Normalization): 89 | ''' 90 | 类功能: 91 | 通过最普通的方式,将数据归一化 92 | ''' 93 | 94 | 95 | def __init__(self , max_value = None , min_value = None ) : 96 | ''' 97 | function 98 | init 99 | params 100 | max_value 最大值 , 默认值None 101 | min_value 最小值 , 默认值None 102 | ''' 103 | self.max = max_value 104 | self.min = min_value 105 | 106 | 107 | def update(self , value): 108 | ''' 109 | function 110 | 将属性对应更新,获得数据最大值和最小值 111 | params 112 | value 属性值 , value 类型(int , long , float) 113 | return 114 | None 115 | raise 116 | 当value无法转换为float值,抛出ValueError 117 | ''' 118 | if value is not None: 119 | try: 120 | value = float(value) 121 | except Exception, e: 122 | raise ValueError , e 123 | if self.max is None or self.max < value : 124 | self.max = value 125 | if self.min is None or self.min > value : 126 | self.min = value 127 | 128 | 129 | def get_normalization(self , value ): 130 | ''' 131 | function 132 | 将value 转换为归一化后的值 133 | params 134 | value 属性值 135 | return 136 | [0,1] 137 | raise 138 | value == None ValueError 139 | ''' 140 | if value is None : 141 | raise ValueError 142 | if self.max == self.min: 143 | return 1 144 | return (float(value) - self.min) / (self.max - self.min ) 145 | 146 | 147 | import math 148 | class ZScore(Normalization): 149 | 150 | 151 | 152 | 153 | def __init__(self): 154 | self.avg_value = 0 #均值 155 | self.variance = 0 #方差均值 156 | 157 | 158 | 159 | def update(self , values): 160 | ''' 161 | function 162 | 计算数据的平均值和平均方差 163 | ''' 164 | self.avg_value = sum(values) / float(len(values)) 165 | self.variance = math.sqrt(sum( (value - self.avg_value) ** 2 for value in values ) ) 166 | 167 | 168 | def get_normalization(self , value): 169 | if value is None : 170 | raise ValueError 171 | if self.avg_value == 0: 172 | return 0 173 | return ( float(value) - self.avg_value ) / self.variance 174 | 175 | 176 | class LogNormalization(Normalization): 177 | 178 | 179 | def __init__(self , base = 10 ): 180 | ''' 181 | function 182 | init 183 | params 184 | base log基数,最好为最大值 185 | return 186 | None 187 | raise 188 | None 189 | ''' 190 | self.base = base 191 | 192 | 193 | def get_normalization(self , value ): 194 | if value is None : 195 | raise ValueError 196 | return math.log(value , self.base ) 197 | 198 | class Arccotx(Normalization): 199 | 200 | 201 | def get_normalization(self , value ): 202 | if value is None: 203 | raise ValueError 204 | return math.atan(float(value)) * 2 / math.pi 205 | 206 | if __name__ == '__main__': 207 | l = Arccotx() 208 | print l.get_normalization(10) 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /moodstyle/common/DataSet.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | __ALL__ = ["DList" , "DenseData" , "SparseData" , "DataSet"] 5 | 6 | class DList(list): 7 | 8 | def items(self): 9 | return enumerate(self) 10 | 11 | def has_key(self , value): 12 | if value and isinstance(value , 
(int , long)): 13 | if value > 0 and value < len(self): 14 | return True 15 | return False 16 | raise TypeError 17 | 18 | 19 | def update(self , data): 20 | if data and hasattr(data , "items"): 21 | for index , value in data.items(): 22 | if index > len(self): 23 | self.append(value) 24 | else: 25 | self[index] = value 26 | elif data and isinstance(data , (list , tuple)): 27 | for index , value in enumerate(data): 28 | if index > len(self): 29 | self[index] = value 30 | 31 | def keys(self): 32 | return xrange(self._data_len) 33 | 34 | def order_key(self): 35 | return xrange(self.data_len) 36 | 37 | def values(self): 38 | return self 39 | 40 | 41 | 42 | class DeseData(DList): 43 | """多维数组实现 44 | """ 45 | 46 | def __init__(self , data , default_value , data_len , *argv , **kw): 47 | if data is None: 48 | for i in xrange(data_len): 49 | self.append(default_value) 50 | elif isinstance(data , (list , tuple)) and len(data) == data_len: 51 | self.extend(data) 52 | elif isinstance(data , dict): 53 | for i in xrange(data_len): 54 | self.append(data.get( i , default_value)) 55 | else: 56 | raise TypeError("data type must be in [list , tuple , dict]") 57 | self._data_len = data_len 58 | 59 | def __len__(self): 60 | return self._data_len 61 | 62 | def __setitem__(self , index , value ): 63 | if index and index < self._data_len: 64 | super(DeseData , self).__setitem__(index , value) 65 | else: 66 | raise IndexError 67 | 68 | class SparseData(dict): 69 | """稀疏矩阵实现,利用dict实现;在存储空间上不占优势;;一般用于文本向量 70 | 计算中使用 71 | """ 72 | def __init__(self ,data , default_value , data_len , *argv , **kw): 73 | super(SparseData , self).__init__(*argv , **kw) 74 | self._default = default_value 75 | self.data_len = data_len 76 | if data is not None: 77 | self.update(data) 78 | 79 | def __getitem__(self , index ): 80 | return super(SparseData , self).__getitem__(index) if index in self else self._default 81 | 82 | def __setitem__(self , index , value): 83 | if index is not None: 84 | if index < self.data_len: 85 | super(SparseData , self).__setitem__(index , value ) 86 | else: 87 | raise IndexError 88 | else: 89 | raise ValueError 90 | 91 | def __len__(self): 92 | return self.data_len 93 | 94 | def order_key(self): 95 | return sorted(self.keys() , reverse = False) 96 | 97 | class DataSet(DList): 98 | """实现稀疏/非稀疏矩阵保存结构 99 | """ 100 | 101 | def __init__(self ,data_len, dense_data , *argv , **kw): 102 | """初始化矩阵 103 | params: 104 | data_len 矩阵维数 105 | dense_data 是否为稀疏矩阵 106 | return 107 | None 108 | raise 109 | None 110 | """ 111 | super(DataSet , self).__init__(*argv , **kw) 112 | self._type = dense_data 113 | self._data_len = data_len 114 | self._data_class = DeseData if dense_data is False else SparseData 115 | self._range = xrange(self._data_len) 116 | 117 | def append(self , data = None): 118 | """增加数据 119 | params: 120 | data 需要增加的数据;类型:tuple,list,dict 121 | return 122 | True 123 | raise: 124 | data 类型不符合需求,抛出TypeError 125 | """ 126 | if data is None or isinstance(data , (list , tuple , dict)) : 127 | super(DataSet , self).append(self._data_class(data ,-1 , self._data_len )) 128 | return True 129 | else: 130 | raise TypeError 131 | 132 | def extend(self , data): 133 | """增加同类型数据 134 | """ 135 | if isinstance(data , DataSet): 136 | if data._data_len == self._data_len and data._type == self._type: 137 | super(DataSet , self).extend(data) 138 | else: 139 | raise ValueError 140 | return 141 | raise TypeError 142 | 143 | 144 | def shape(self): 145 | return len(self) , self._data_len 146 | 147 | def data_range(self): 148 | return 
self._range 149 | -------------------------------------------------------------------------------- /moodstyle/common/Dict.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | 6 | import collections 7 | import DataSet 8 | 9 | class Dictionary(dict): 10 | 11 | def __init__(self , **kw): 12 | if "dict_path" in kw: 13 | self.open_dict(kw["dict_path"]) 14 | elif "words" in kw: 15 | self.update({word:word_seq for word_seq , word in enumerate(kw["words"])}) 16 | 17 | def open_dict(self , dict_path): 18 | with open(dict_path) as f: 19 | for seq , word in enumerate(f.readlines()): 20 | self[word] = seq 21 | 22 | def __setitem__(self ,key, value): 23 | if key and key not in self: 24 | value = len(self) 25 | super(Dictionary , self).__setitem__(key , value) 26 | 27 | 28 | 29 | def to_vector(self , words): 30 | word_counter = collections.Counter(words.split()) 31 | vector = [0] * len(self) 32 | for word,count in word_counter.items(): 33 | if word in self: 34 | vector[self[word]] = count 35 | return vector 36 | 37 | def to_one_hot(self , words): 38 | words = set(words.split()) 39 | vector = [0] * len(self) 40 | for word in words: 41 | if word in self: 42 | vector[self[word]] = count 43 | return vector 44 | 45 | 46 | 47 | if __name__ == "__main__": 48 | d = Dictionary(words = ["a" , "b" ,"c"]) 49 | d['c'] = 5 50 | d['d'] = 6 51 | print d.to_vector("b b a") 52 | 53 | -------------------------------------------------------------------------------- /moodstyle/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/common/__init__.py -------------------------------------------------------------------------------- /moodstyle/common/util.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | 3 | from b2 import exceptions2 4 | 5 | 6 | def entropy(probs): 7 | """calc entropy 8 | :param:probs:probality array:float array 9 | :return:entropy:float 10 | """ 11 | exceptions2.judge_null(probs) 12 | if isinstance(probs, (list, tuple)): 13 | return sum([-prob * log(prob, 2) for prob in probs]) 14 | elif isinstance(probs, (int, float)): 15 | return -probs * log(probs, 2) 16 | -------------------------------------------------------------------------------- /moodstyle/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | 3 | 4 | def set_prefs(prefs): 5 | """This function is called before opening the project""" 6 | 7 | # Specify which files and folders to ignore in the project. 8 | # Changes to ignored resources are not added to the history and 9 | # VCSs. Also they are not returned in `Project.get_files()`. 10 | # Note that ``?`` and ``*`` match all characters but slashes. 11 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 12 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 13 | # '.svn': matches 'pkg/.svn' and all of its children 14 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 15 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 16 | prefs['ignored_resources'] = [ 17 | '*.pyc', '*~', '.ropeproject', '.hg', '.svn', '_svn', '.git', 18 | '.tox', '.env', 'node_modules', 'bower_components'] 19 | 20 | # Specifies which files should be considered python files. It is 21 | # useful when you have scripts inside your project. 
Only files 22 | # ending with ``.py`` are considered to be python files by 23 | # default. 24 | #prefs['python_files'] = ['*.py'] 25 | 26 | # Custom source folders: By default rope searches the project 27 | # for finding source folders (folders that should be searched 28 | # for finding modules). You can add paths to that list. Note 29 | # that rope guesses project source folders correctly most of the 30 | # time; use this if you have any problems. 31 | # The folders should be relative to project root and use '/' for 32 | # separating folders regardless of the platform rope is running on. 33 | # 'src/my_source_folder' for instance. 34 | #prefs.add('source_folders', 'src') 35 | 36 | # You can extend python path for looking up modules 37 | #prefs.add('python_path', '~/python/') 38 | 39 | # Should rope save object information or not. 40 | prefs['save_objectdb'] = True 41 | prefs['compress_objectdb'] = False 42 | 43 | # If `True`, rope analyzes each module when it is being saved. 44 | prefs['automatic_soa'] = True 45 | # The depth of calls to follow in static object analysis 46 | prefs['soa_followed_calls'] = 0 47 | 48 | # If `False` when running modules or unit tests "dynamic object 49 | # analysis" is turned off. This makes them much faster. 50 | prefs['perform_doa'] = True 51 | 52 | # Rope can check the validity of its object DB when running. 53 | prefs['validate_objectdb'] = True 54 | 55 | # How many undos to hold? 56 | prefs['max_history_items'] = 32 57 | 58 | # Shows whether to save history across sessions. 59 | prefs['save_history'] = True 60 | prefs['compress_history'] = False 61 | 62 | # Set the number spaces used for indenting. According to 63 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 64 | # unit-tests use 4 spaces it is more reliable, too. 65 | prefs['indent_size'] = 4 66 | 67 | # Builtin and c-extension modules that are allowed to be imported 68 | # and inspected by rope. 69 | prefs['extension_modules'] = [] 70 | 71 | # Add all standard c-extensions to extension_modules list. 72 | prefs['import_dynload_stdmods'] = True 73 | 74 | # If `True` modules with syntax errors are considered to be empty. 75 | # The default value is `False`; When `False` syntax errors raise 76 | # `rope.base.exceptions.ModuleSyntaxError` exception. 77 | prefs['ignore_syntax_errors'] = False 78 | 79 | # If `True`, rope ignores unresolvable imports. Otherwise, they 80 | # appear in the importing namespace. 81 | prefs['ignore_bad_imports'] = False 82 | 83 | # If `True`, rope will transform a comma list of imports into 84 | # multiple separate import statements when organizing 85 | # imports. 86 | prefs['split_imports'] = False 87 | 88 | # If `True`, rope will sort imports alphabetically by module name 89 | # instead of alphabetically by import statement, with from imports 90 | # after normal imports. 91 | prefs['sort_imports_alphabetically'] = False 92 | 93 | 94 | def project_opened(project): 95 | """This function is called after opening the project""" 96 | # Do whatever you like here! 
97 | -------------------------------------------------------------------------------- /moodstyle/feature/DefaultValue.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from collections import defaultdict 3 | 4 | class MissingValue(object): 5 | 6 | def get_value(self, feature): 7 | raise NotImplementedError 8 | 9 | 10 | class ArvgMissingValue(object): 11 | 12 | def __init__(self, feature_len): 13 | self.default_values = [None for i in range(feature_len)] 14 | 15 | def add(self, feature, value): 16 | self.default_values[feature] = value 17 | #if self.default_values[feature] is None 18 | #else (self.default_values[feature] + value) 19 | -------------------------------------------------------------------------------- /moodstyle/feature/Feature.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | import bisect 6 | 7 | 8 | class Binning(object): 9 | 10 | def __init__(self,k,box = None): 11 | self.k = k 12 | if box is None: 13 | self._box = [0.] * (k - 1) 14 | elif isinstance(box,(list),tuple): 15 | if len(box) != self.k - 1: 16 | raise ValueError("input box number not equal k, please recheck") 17 | self._box = box 18 | 19 | 20 | def train(self,features): 21 | pass 22 | 23 | 24 | def predict(self,feature): 25 | if feature is None: 26 | raise TypeError 27 | return bisect.bisect(self._box,feature) 28 | 29 | 30 | def _sort(self,array): 31 | return sorted(array,reversed = False) 32 | 33 | 34 | 35 | class EqualRate(Binning): 36 | 37 | 38 | def train(self,features): 39 | features = self._sort(features) 40 | for i in range(0,len(features),len(features) / self.k): 41 | pass 42 | 43 | class EqualLength(Binning): 44 | """equal length excute feature 45 | 46 | >>> f = EqualLength(5) 47 | >>> from random import randint 48 | >>> array = [randint(0,100000) for i in range(10000)] 49 | >>> f.train(array) 50 | >>> f.predict(899) 51 | 0 52 | """ 53 | 54 | def train(self,features): 55 | min_value = min(features) 56 | max_value = max(features) 57 | length = (max_value - min_value) / self.k 58 | for i in range(self.k - 1): 59 | self._box[i] = min_value + length * (i + 1) 60 | -------------------------------------------------------------------------------- /moodstyle/feature/OneHotCode.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from collections import defaultdict 4 | from b2 import math2 5 | import math 6 | 7 | 8 | 9 | class OneHotCode(object): 10 | 11 | """feature one hot 12 | Test: 13 | >>> ohc = OneHotCode() 14 | >>> ohc.train(0) 15 | >>> ohc.train(1) 16 | >>> ohc.train(2) 17 | >>> ohc.train(3) 18 | >>> ohc.train(4) 19 | >>> ohc.train(7) 20 | >>> ohc.predict(0) 21 | >>> ohc.predict(3) 22 | >>> ohc.predict(7) 23 | """ 24 | 25 | def __init__(self): 26 | self._feature_map = {} 27 | 28 | def train(self,data): 29 | if data in self._feature_map: 30 | return 31 | self._feature_map[data] = len(self._feature_map) 32 | 33 | def predict(self,data): 34 | array_len = int(math.ceil(math.sqrt(len(self._feature_map)))) 35 | index = self._feature_map[data] 36 | features = math2.bitfield(index) 37 | features[:0] = [0 for _ in range(array_len - len(features))] 38 | return features 39 | -------------------------------------------------------------------------------- /moodstyle/feature/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/feature/__init__.py -------------------------------------------------------------------------------- /moodstyle/libsvm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/libsvm/__init__.py -------------------------------------------------------------------------------- /moodstyle/libsvm/libsvm.so.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/libsvm/libsvm.so.2 -------------------------------------------------------------------------------- /moodstyle/text/Ngram.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | __ALL__ = ["ngram2List","ngram"] 5 | def ngram2List(content, splitor = lambda x : x.split(" ") , n = 2): 6 | """ngram用于文本中的处理 7 | test: 8 | >>> ngram2List("a b c d") 9 | [['a', 'b'], ['b', 'c'], ['c', 'd']] 10 | >>> ngram2List("a b c d",n=1) 11 | [['a'], ['b'], ['c'], ['d']] 12 | >>> ngram2List("a b c d",splitor = lambda x: x.split("\|")) 13 | ['a b c d'] 14 | """ 15 | if content is None: 16 | return [] 17 | words = splitor(content) 18 | if len(words) <= 1: 19 | return [content] 20 | return [[ words[i+j] for j in range(n)] for i in range(0,len(words) - n + 1)] 21 | 22 | def ngram(content, splitor = " " , n = 2 ): 23 | """ngram用于文本中的处理 24 | test: 25 | >>> ngram("a b c d") 26 | ['a b', 'b c', 'c d'] 27 | >>> ngram("a b c d",n=1) 28 | ['a', 'b', 'c', 'd'] 29 | >>> ngram("a b c d",splitor = " ") 30 | ['a b c d'] 31 | """ 32 | if content is None: 33 | return [] 34 | words = content.split(splitor) 35 | if len(words) <= 1: 36 | return [content] 37 | result = [] 38 | for i in range(0,len(words)-n + 1): 39 | result.append( splitor.join([words[i+j] for j in range(n)])) 40 | return result 41 | -------------------------------------------------------------------------------- /moodstyle/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/moodstyle/text/__init__.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | 5 | kw = dict( 6 | name='moodstyle', 7 | version='0.0.5', 8 | description='data mining python code', 9 | author='intoblack', 10 | author_email='intoblack86@gmail.com', 11 | url='https://github.com/intoblack/moodstyle', 12 | download_url='https://github.com/intoblack/moodstyle', 13 | platforms='all platform', 14 | packages=find_packages(), 15 | include_package_data=True 16 | ) 17 | 18 | setup(**kw) 19 | 20 | -------------------------------------------------------------------------------- /test/testAdBoost.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import math 4 | 5 | 6 | class BoostClassifier(object): 7 | 8 | def __init__(self, classifier, weight=None): 9 | if classifier and callable(classifier): 10 | self.classifier = classifier 11 | self.weight = weight 12 | else: 13 | raise TypeError, 'classifier has classify and is callable!' 
14 | 15 | def __call__(self, data): 16 | return self.classifier(data) 17 | 18 | def __str__(self): 19 | return 'weight : %s' % self.weight 20 | 21 | 22 | class BoostData(object): 23 | 24 | def __init__(self, data, weight=None): 25 | if data and isinstance(data, (list, tuple)) and len(data) >= 2: 26 | self.data = data 27 | self.weight = weight 28 | else: 29 | raise ValueError 30 | 31 | 32 | class AdBoost(object): 33 | 34 | def __init__(self, classifiers=[]): 35 | 36 | self.__boost_classifier = [BoostClassifier(cl) for cl in classifiers] 37 | self.__labels = set() 38 | 39 | def train(self, datas): 40 | ''' 41 | [[feature1 , feature2 ,....,featuren , label]] 42 | ''' 43 | # 初始化权重, 每个数据初始化权重为 weight = ( 1.0 / 数据长度 ) 44 | if len(datas) == 0 or len(self.__boost_classifier) == 0: 45 | raise ValueError 46 | 47 | for data in datas: 48 | self.__labels.add(data[-1]) 49 | # 将数据权重初始化为 1.0/ 数据总长度 50 | trains = [BoostData(data, 1.0 / len(datas)) for data in datas] 51 | # 开始计算每个分类器的权重 52 | for _ in range(len(self.__boost_classifier)): 53 | best_classifier = self.__get_trainer(trains)[0] 54 | 55 | best_classifier[1].weight = math.log((1 - best_classifier[0]) / 56 | best_classifier[0], math.e) / 2 57 | self.__update_data_weight(trains, best_classifier[1]) 58 | 59 | def classify(self, data): 60 | weight = sum([classifier.weight * classifier(data) 61 | for classifier in self.__boost_classifier if classifier.weight != None]) 62 | return sorted([(abs(label - weight), label) for label in self.__labels], key=lambda x: x[0])[0][1] 63 | 64 | def __update_data_weight(self, trains, classifier): 65 | ''' 66 | 功能: 更新数据的权重 67 | 公式: 68 | 69 | ''' 70 | zm = sum([data.weight * math.exp(-classifier.weight * 71 | data.data[-1] * classifier(data.data[:-1])) for data in trains]) 72 | for data in trains: 73 | data.weight = data.weight * \ 74 | math.exp(-classifier.weight * 75 | data.data[-1] * classifier(data.data[:-1])) / zm 76 | 77 | def __str__(self): 78 | return '\t'.join( 79 | [ 80 | '%s : %s' % (i, self.__boost_classifier[i]) 81 | for i in range(len(self.__boost_classifier)) 82 | ] 83 | ) 84 | 85 | def __get_trainer(self, trains): 86 | ''' 87 | trains , 训练的数据 88 | ''' 89 | # 循环每个分类器(除了已经添加为分类器的分类器), 计算 (数据的权重 * 分类器错分) 90 | # , 找到上述值最小的一个 , 作为下个分类器 91 | return sorted([( 92 | sum( 93 | [ 94 | bd.weight 95 | for bd in trains 96 | if cl(bd.data[:-1]) != bd.data[-1] 97 | ] 98 | ), cl) 99 | for cl in self.__boost_classifier 100 | if cl.weight == None 101 | ], 102 | key=lambda x: x[0], reverse=False) 103 | 104 | 105 | if __name__ == '__main__': 106 | classifiers = [lambda x: -1 if x[0] > 2.5 else 1, lambda x: 107 | 1 if x[0] > 5.5 else -1, lambda x: 1 if x[0] < 8.5 else -1] 108 | a = AdBoost(classifiers) 109 | datas = [[0, 1], [1, 1], [2, 1], [3, -1], [4, -1], 110 | [5, -1], [6, 1], [7, 1], [8, 1], [9, -1]] 111 | a.train(datas) 112 | print a.classify([9]) 113 | print a 114 | -------------------------------------------------------------------------------- /test/testAdTree.py: -------------------------------------------------------------------------------- 1 | #!/coding=utf-8 2 | 3 | from collections import defaultdict 4 | from copy import deepcopy 5 | 6 | 7 | class Classifier(object): 8 | 9 | def __init__(self, classifier): 10 | if classifier and (callable(classifier)): 11 | self.classifier = classifier 12 | else: 13 | raise TypeError, 'classifier must be callable and valuable !' 
14 | # label , ( fit value sum , count ) 15 | self.__weights = defaultdict(float) 16 | self.__count = defaultdict(int) 17 | 18 | def clear(self): 19 | self.__weights.clear() 20 | self.__count.clear() 21 | 22 | def update_fit_value(self, data, value): 23 | label = self.classifier(data) 24 | self.__weights[label] += value 25 | self.__count[label] += 1. 26 | 27 | def updates_fit_values(self, datas, values): 28 | for i in range(len(datas)): 29 | self.update_fit_value(datas[i], values[i]) 30 | 31 | def sync(self): 32 | ''' 33 | 计算每个分类器需要拟合误差值 34 | ''' 35 | for label in self.__weights.keys(): 36 | self.__weights[label] /= self.__count[label] 37 | 38 | def classify(self, data): 39 | label = self.classifier(data) 40 | return self.__weights[label] if self.__weights.has_key(label) else None 41 | 42 | 43 | def __str__(self): 44 | return str(self.__weights) 45 | 46 | 47 | 48 | 49 | 50 | 51 | class AdTree(object): 52 | 53 | def __init__(self): 54 | #定义分类器集合 55 | self.classifiers = [] 56 | 57 | def train(self, datas, weights, classifiers, diff=0.2): 58 | ''' 59 | datas 60 | weights , 每个数据需要拟合的数值 61 | F0(x) = 0 62 | F1(x) = F0(x) + 树的分值 63 | FN(x) = FN(x) + 树(N-1) 64 | r(x) = sum ( yi - F 65 | ''' 66 | r = deepcopy(weights) 67 | 68 | for _ in range(len(classifiers)): 69 | _classifiers = [Classifier(classifier) for classifier in classifiers] 70 | # 更新每个分类器 , 与上轮的 71 | # 残差 , 计算需要拟合的weight 72 | for _classifier in _classifiers: 73 | _classifier.updates_fit_values(datas, r) 74 | _classifier.sync() 75 | #计算损失函数值 , 分类器的标记 76 | loss, ci = self.find_min_loss(datas, r, _classifiers) 77 | self.classifiers.append(deepcopy(_classifiers[ci])) 78 | #更新下一轮残差是当前一轮分类器拟合上一轮残差剩余的残差 79 | #所以更新残差的时候是分类器,而不是所有分类器都参加更正 80 | r = self.update_residual(datas, r , _classifiers[ci]) 81 | #损失数值小于要求值之后 , 会跳出方程 82 | if loss < diff: 83 | break 84 | 85 | def find_min_loss(self, datas, residuals, classifiers): 86 | ''' 87 | 每一轮迭代迭代 , 只需要拟合上一轮的残差值 88 | datas : 数据 89 | residuals : 上一轮的残差表 90 | return : 91 | (最小损失函数值 , 分类器序号) 92 | 93 | ''' 94 | 95 | return min([ 96 | ( 97 | sum( 98 | [ 99 | (classifiers[j].classify(datas[i]) - residuals[i]) ** 2 100 | for i in range(len(datas)) 101 | ] 102 | ), j) 103 | for j in range(len(classifiers)) 104 | ]) 105 | 106 | def update_residual(self, datas, residuals , classifier): 107 | ''' 108 | 返回一个参差表 , 通过生成的分类器 , 计算下一轮需要拟合的残差表 109 | Rn-1,i = yi - fn-1(xi) 110 | ''' 111 | return [ 112 | residuals[i] - classifier.classify(datas[i]) 113 | for i in range(len(datas)) 114 | ] 115 | 116 | def classify(self, data): 117 | return sum([classifier.classify(data) for classifier in self.classifiers]) if len(self.classifiers) > 0 else 0 118 | 119 | 120 | if __name__ == '__main__': 121 | datas = [i for i in range(1, 11)] 122 | weights = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05] 123 | 124 | at = AdTree() 125 | classifiers = [lambda x: 1 if x >= 1.5 else 0, lambda x: 1 if x >= 2.5 else 0, lambda x: 1 if x >= 3.5 else 0, lambda x: 1 if x >= 4.5 else 0, lambda x: 126 | 1 if x >= 5.5 else 0, lambda x: 1 if x >= 6.5 else 0, lambda x: 1 if x >= 7.5 else 0, lambda x: 1 if x >= 8.5 else 0, lambda x: 1 if x >= 9.5 else 0] 127 | 128 | at.train(datas, weights, classifiers) 129 | print at.classify(8) 130 | -------------------------------------------------------------------------------- /test/testAnn.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #文件功能: 3 | # 感知器实现 4 | #参考网页地址: 5 | # http://www.hankcs.com/ml/the-perceptron.html 6 | # 
http://zh.wikipedia.org/wiki/%E6%84%9F%E7%9F%A5%E5%99%A8
7 | #   http://blog.csdn.net/cscmaker/article/details/8296171
8 | #实现原理:
9 | #   Min(loss(yi - yt)**2 )
10 | #算法:
11 | #   梯度下降
12 | #
13 |
14 | import testInterface
15 | import testDataSet
16 |
17 |
18 | class Ann(testInterface.Classify):
19 |
20 |     def __init__(self , w , learn_rate = 0.1 , labels = [1 , -1]):
21 |         self.ratios = [0.] * w
22 |         self.weight_len = xrange(w)
23 |         self.b = 0
24 |         self.data_range = xrange(w)
25 |         self.r = learn_rate
26 |         self.labels = labels
27 |
28 |     def __train(self , data , label):
29 |         yt = self.classify(data)
30 |         if yt == label:
31 |             return
32 |         # perceptron update: w += r * y * x ; b += r * y
33 |         for i in self.weight_len:
34 |             self.ratios[i] += self.r * label * data[i]
35 |         self.b += self.r * label
36 |
37 |     def classify(self , data , *argv , **kw):
38 |         """classify with the perceptron
39 |         params:data:input data:list/tuple/dict
40 |         return:predicted label , 1 or -1 by default:int
41 |         raise:None
42 |         test:
43 |         >>> classify = Ann(2 , 0.1)
44 |         >>> samples = [[[3, 3], 1], [[4, 3], 1], [[1, 1], -1], [[2, 2], -1] , [[7, 3] , 1] , [[-1 , -1] , -1]]
45 |         >>> classify.train([s[0] for s in samples] , [s[1] for s in samples])
46 |         >>> classify.classify(samples[-1][0]) == -1
47 |         True
48 |         """
49 |         yt = sum( self.ratios[i] * data[i] for i in self.data_range ) + self.b
50 |         return min([ ((yt - label) ** 2 , label) for label in self.labels])[1]
51 |
52 |     def train(self , datas , labels , *argv , **kw):
53 |         if datas is None:
54 |             raise TypeError("datas must not be None")
55 |         if isinstance(datas , (list , tuple , testDataSet.DataSet)) is False:
56 |             raise TypeError("datas type must be in [list , tuple , testDataSet.DataSet]")
57 |         if len(datas) != len(labels):
58 |             raise ValueError("datas and labels must have the same length")
59 |         for i in xrange(len(datas)):
60 |             self.__train(datas[i] , labels[i])
--------------------------------------------------------------------------------
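These test modules carry their checks as doctests; a minimal runner sketch (assuming the test directory is on the Python path):

import doctest
import testAnn

# exercises the example embedded in Ann.classify's docstring
print doctest.testmod(testAnn)  # e.g. TestResults(failed=0, attempted=4)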
70 | 
71 | 
72 | 
73 | class SampleArray(object):
74 | 
75 | 
76 |     def __init__(self , objects , sample_rate , sample_class = WaterSample):
77 |         if objects is not None and hasattr(objects , '__iter__') and not isinstance(objects ,basestring):
78 |             self.objects = objects
79 |             self.data_len = len(self.objects)
80 |             # 原实现没有保存 sample_rate / sample_class , __iter__ 会抛 NameError
81 |             self.sample_rate = sample_rate
82 |             self.sample_class = sample_class
83 |         else:
84 |             raise ValueError
85 | 
86 |     def __iter__(self):
87 |         return self.sample_class(self.objects , self.sample_rate)
88 | 
89 | 
90 |     def __getitem__(self , index ):
91 |         if isinstance(index , (int , long)) and 0 <= index < self.data_len:
92 |             return self.objects[index]
93 |         else:
94 |             raise ValueError
95 | 
96 | 
97 | 
--------------------------------------------------------------------------------
/test/testBandit.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | from collections import defaultdict
 4 | import random
 5 | import math
 6 | 
 7 | 
 8 | class Greedy(object):
 9 |     EXPLORER = 1
10 |     WORK = 0
11 | 
12 |     def __init__(self,e,N):
13 |         self.e = e
14 |         self.p = [0.5] * N
15 |         self._front = [0.] * N
16 |         self._explor = [0.] * N
17 |         self.N = N
18 |         self._max_index = 0
19 |         # 1 explorer ; 0 work
20 |         self._status = None
21 |         self._last = None
22 | 
23 |     def _prop(self):
24 |         for i in range(self.N):
25 |             if self._explor[i] == 0.:
26 |                 continue
27 |             self.p[i] = self._front[i] / self._explor[i]
28 |             if self.p[self._max_index] < self.p[i]:
29 |                 self._max_index = i
30 | 
31 | 
32 |     def getIndex(self):
33 |         r = random.random()
34 |         index = None
35 |         if r < self.e:
36 |             index = random.randint(0,self.N - 1)
37 |             self._status = self.EXPLORER
38 |         else:
39 |             self._status = self.WORK
40 |             index = self._max_index
41 |         self._last = index
42 |         return index
43 | 
44 | 
45 |     def process(self,label):
46 |         if self._status == self.EXPLORER:
47 |             if label == 1:
48 |                 self._front[self._last] += 1.
49 |             self._explor[self._last] += 1.
50 |             self._prop()
51 | 
52 | 
53 | class UCB(object):
54 | 
55 | 
56 |     def __init__(self,N,max_value):
57 |         self.N = N
58 |         self.max_value = max_value
59 |         self._count = 0
60 |         self._sub_count = [0.] * N
61 |         self._sub_sum = [0.] * N
62 |         self.p = [0.]
 * N
63 |         self._last = None
64 | 
65 | 
66 |     def _prop(self):
67 |         for i in range(self.N):
68 |             if self._sub_count[i] == 0:
69 |                 continue
70 |             self.p[i] = self._sub_sum[i] / self._sub_count[i] + math.sqrt( 2 * math.log(self._count) / self._sub_count[i])
71 | 
72 |     def getIndex(self):
73 |         for i in range(self.N):
74 |             if self._sub_count[i] == 0:
75 |                 self._last = i
76 |                 return self._last
77 |         self._last = max(enumerate(self.p),key = lambda x:x[1])[0]
78 |         return self._last
79 | 
80 |     def process(self,label):
81 |         self._count += 1
82 |         self._sub_count[self._last] += 1
83 |         self._sub_sum[self._last] += label
84 |         self._prop()
85 | 
86 | if __name__ == "__main__":
87 | 
88 |     N = 100
89 |     p = [random.random() for i in range(N)]
90 |     greedy = UCB(N,1)
91 |     TIMES = 100000
92 |     COUNT = 0
93 |     for _ in range(TIMES):
94 |         index = greedy.getIndex()
95 |         prop = random.random()
96 |         if prop <= p[index]:
97 |             label = 1
98 |             COUNT += 1
99 |         else:
100 |             label = 0
101 |         greedy.process(label)
102 |     print COUNT / float(TIMES)
103 | 
--------------------------------------------------------------------------------
/test/testBaseStrut.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | class WeightArray(object):
 6 | 
 7 | 
 8 |     def __init__(self, datas, distance_fun):
 9 |         '''
10 |         function:
11 |             init
12 |         params:
13 |             datas 数据
14 |             distance_fun 计算两个数据之间的距离
15 |         '''
16 |         self.label_dict = {datas[index][0]:index for index in range(len(datas))}
17 |         self.distance_map = self.create_distance_map(datas, distance_fun)
18 |         self.data_len = len(datas)
19 | 
20 | 
21 | 
22 |     def __getitem__(self, label_tuple):
23 |         label1, label2 = label_tuple
24 |         if self.label_dict.has_key(label1) and self.label_dict.has_key(label2):
25 |             index1 = self.label_dict[label1]
26 |             index2 = self.label_dict[label2]
27 |             return self.get_distance_by_index(index1 , index2)
28 |         raise IndexError, 'label1 : %s , label2 : %s not in this distance_map' % (label1 , label2)
29 | 
30 | 
31 | 
32 |     def get_distance_by_index(self , row , line ):
33 |         '''
34 |         function:
35 |             下半角矩阵 , 转换坐标
36 | 
37 |         '''
38 |         if line > row :
39 |             row , line = line , row
40 |         return self.distance_map[row][line]
41 | 
42 | 
43 | 
44 |     def create_distance_map(self, datas, distance_fun):
45 |         '''
46 |         function:
47 |             创建数据距离map
48 |         params:
49 |             datas 数据,格式 [[label1 , x1 ,x2...,xN ] , [lable2 , x1 , x2 , ..., xN]....[labelN , x1, x2 , ...xN] ]
50 |             distance_fun 距离公式 , 参数是data1 , data2
51 |                          return distance
52 |         return
53 |             datas_map
54 |         '''
55 |         if distance_fun is None or not callable(distance_fun):
56 |             raise ValueError , 'distance_fun must be a callable that computes the distance between two data points!'
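        # 编辑者注: 下面构建的是下三角距离矩阵 , 只保存 i >= j 的
        # distance(datas[i], datas[j]) , 对角线为 0 , 存储量约为全矩阵的一半;
        # 读取时由 get_distance_by_index 负责交换行列坐标。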
59 | distance_map = [] 60 | for i in range(len(datas)): 61 | tmp_distance = [] 62 | for j in range(i + 1): 63 | if i == j: 64 | tmp_distance.append(0) 65 | else: 66 | tmp_distance.append(distance_fun(datas[i], datas[j])) 67 | distance_map.append(tmp_distance) 68 | return distance_map 69 | 70 | 71 | 72 | class Normalization(object): 73 | 74 | 75 | def __init__(self, *argv , **kw): 76 | pass 77 | 78 | 79 | def update(self , value): 80 | raise NotImplementedError 81 | 82 | 83 | def get_normalization(self ,value ): 84 | raise NotImplementedError 85 | 86 | 87 | 88 | class MinMax(Normalization): 89 | ''' 90 | 类功能: 91 | 通过最普通的方式,将数据归一化 92 | ''' 93 | 94 | 95 | def __init__(self , max_value = None , min_value = None ) : 96 | ''' 97 | function 98 | init 99 | params 100 | max_value 最大值 , 默认值None 101 | min_value 最小值 , 默认值None 102 | ''' 103 | self.max = max_value 104 | self.min = min_value 105 | 106 | 107 | def update(self , value): 108 | ''' 109 | function 110 | 将属性对应更新,获得数据最大值和最小值 111 | params 112 | value 属性值 , value 类型(int , long , float) 113 | return 114 | None 115 | raise 116 | 当value无法转换为float值,抛出ValueError 117 | ''' 118 | if value is not None: 119 | try: 120 | value = float(value) 121 | except Exception, e: 122 | raise ValueError , e 123 | if self.max is None or self.max < value : 124 | self.max = value 125 | if self.min is None or self.min > value : 126 | self.min = value 127 | 128 | 129 | def get_normalization(self , value ): 130 | ''' 131 | function 132 | 将value 转换为归一化后的值 133 | params 134 | value 属性值 135 | return 136 | [0,1] 137 | raise 138 | value == None ValueError 139 | ''' 140 | if value is None : 141 | raise ValueError 142 | if self.max == self.min: 143 | return 1 144 | return (float(value) - self.min) / (self.max - self.min ) 145 | 146 | 147 | import math 148 | class ZScore(Normalization): 149 | 150 | 151 | 152 | 153 | def __init__(self): 154 | self.avg_value = 0 #均值 155 | self.variance = 0 #方差均值 156 | 157 | 158 | 159 | def update(self , values): 160 | ''' 161 | function 162 | 计算数据的平均值和平均方差 163 | ''' 164 | self.avg_value = sum(values) / float(len(values)) 165 | self.variance = math.sqrt(sum( (value - self.avg_value) ** 2 for value in values ) ) 166 | 167 | 168 | def get_normalization(self , value): 169 | if value is None : 170 | raise ValueError 171 | if self.avg_value == 0: 172 | return 0 173 | return ( float(value) - self.avg_value ) / self.variance 174 | 175 | 176 | class LogNormalization(Normalization): 177 | 178 | 179 | def __init__(self , base = 10 ): 180 | ''' 181 | function 182 | init 183 | params 184 | base log基数,最好为最大值 185 | return 186 | None 187 | raise 188 | None 189 | ''' 190 | self.base = base 191 | 192 | 193 | def get_normalization(self , value ): 194 | if value is None : 195 | raise ValueError 196 | return math.log(value , self.base ) 197 | 198 | class Arccotx(Normalization): 199 | 200 | 201 | def get_normalization(self , value ): 202 | if value is None: 203 | raise ValueError 204 | return math.atan(float(value)) * 2 / math.pi 205 | 206 | if __name__ == '__main__': 207 | l = Arccotx() 208 | print l.get_normalization(10) 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /test/testBayes.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from collections import Counter 4 | from collections import defaultdict 5 | import sys 6 | 7 | class Bayes(object): 8 | 9 | 10 | def train(self, datas, attr_len, labels, dense=True): 11 | """贝叶斯训练函数 12 | params: 13 | datas 训练数据 
, [[]] 14 | attr_len 属性长度 15 | labels 分类数组 , 与datas对应 16 | dense 是否为稀疏矩阵,现在只支持dense=True 17 | return 18 | None 19 | raise 20 | None 21 | """ 22 | self.label_status = Counter(labels) 23 | self.default_prob = defaultdict(float) 24 | self.attr_status = { 25 | i: defaultdict(lambda: defaultdict(float)) for i in range(attr_len)} 26 | self.base_count = len(datas) 27 | self.attr_range = range(attr_len) 28 | for i in range(len(datas)): 29 | for j in range(attr_len): 30 | attr_val = datas[i][j] 31 | # 统计每个属性对应 p(I | C) , I < (v1 , v2 ,v3....,vn) 32 | self.attr_status[j][attr_val][labels[i]] += 1. 33 | # 计算每个属性出现val 时 , P(v1|I,C) 34 | for feature, attr_label in self.attr_status.items(): 35 | for attr_val, label in self.attr_status[feature].items(): 36 | for cl in label.keys(): 37 | self.attr_status[feature][attr_val][ 38 | cl] /= ( self.label_status[cl] + self.base_count) 39 | for label in self.label_status.keys(): 40 | self.default_prob[label] = 1. / ( self.label_status[label] + self.base_count) 41 | # 计算所有类别出现的概率 P(C) = sum(Cn) / sum(C) , n < (1,2,3,4,5....n) 42 | labels_count = float(sum(self.label_status.values())) 43 | for label, count in self.label_status.items(): 44 | self.label_status[label] /= labels_count 45 | def _predict(self , data , label): 46 | prob = 1. 47 | for i in self.attr_range: 48 | if data[i] == 0: 49 | continue 50 | prob *= self.get_prob(i , data[i], label) 51 | return prob * self.label_status[label] 52 | 53 | def get_prob(self , attr_index , value ,label ): 54 | """得到在指定序号下value在特定类别下发生概率 55 | params 56 | attr_index 暂定为属性序号 57 | value 属性值 58 | label 类别 59 | return 60 | prob 发生概率 61 | raise 62 | None 63 | """ 64 | if value in self.attr_status[attr_index]: 65 | if label in self.attr_status[attr_index][value]: 66 | return self.attr_status[attr_index][value][label] 67 | return self.default_prob[label] 68 | 69 | 70 | 71 | def predict(self, data): 72 | """对输入数据进行预测 73 | params: 74 | data 75 | return 76 | label 数据标记 77 | raise 78 | None 79 | """ 80 | probs = [( self._predict(data , label),label ) for label in self.label_status.keys() ] 81 | return sorted(probs, key = lambda x:x[0] , reverse = True)[0] 82 | 83 | def predict_old(self, data): 84 | 85 | """对输入数据进行预测 86 | params: 87 | data 88 | return 89 | label 数据标记 90 | raise 91 | None 92 | """ 93 | return sorted([( 94 | reduce(lambda x, y:x * y, 95 | [ 96 | self.attr_status[i][data[i]][label] 97 | for i in range(len(data)) 98 | if self.attr_status[i][data[i]].has_key(label) 99 | ] 100 | ) 101 | * 102 | self.label_status[label], label 103 | ) 104 | for label in self.label_status.keys() 105 | ], reverse=True) 106 | 107 | if __name__ == "__main__": 108 | b = Bayes() 109 | datas = [ [ 0 , 0 ] , [0 , 1] , [1 , 1] ,[1 , 0]] 110 | labels = [ 0 , 1 , 0 , 1] 111 | b.train(datas , 2 ,labels = labels) 112 | print b.predict([ 2 , 1]) 113 | -------------------------------------------------------------------------------- /test/testBp.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | import math 6 | import random 7 | import string 8 | from math import exp 9 | 10 | random.seed(0) 11 | 12 | class Layer(object): 13 | 14 | 15 | def __init__(self , layer_count , default = 1.): 16 | self.layer = [ default ] * layer_count 17 | self.count = layer_count 18 | 19 | def __len__(self): 20 | return self.count 21 | 22 | class RatioArray(object): 23 | 24 | 25 | def __init__(self, line , row , min_value , max_value ): 26 | self.ratios = [ 27 | [ 28 | random.random(min_value , max_value) 29 | for _ 
in range(line) 30 | ] 31 | for i in range(row)] 32 | 33 | 34 | 35 | 36 | class Bp(object): 37 | ''' 38 | 算法实现逻辑: 39 | 输入层 隐藏层 输出层 40 | 算法流程: 41 | input -> hidden -> output 42 | calc error every layer 43 | output -> hidden -> input 44 | 主要是逻辑回灌 神经元 45 | 46 | ''' 47 | pass 48 | 49 | 50 | 51 | 52 | 53 | # 生成区间[a, b)内的随机数 54 | def rand(a, b): 55 | return (b-a)*random.random() + a 56 | 57 | # 生成大小 I*J 的矩阵,默认零矩阵 (当然,亦可用 NumPy 提速) 58 | def makeMatrix(I, J, fill=0.0): 59 | m = [] 60 | for i in range(I): 61 | m.append([fill]*J) 62 | return m 63 | 64 | #定义sigmoid函数 , 一个简单神经元输出 65 | def sigmoid(x): 66 | return 1.0 / (1 + exp(-x)) 67 | 68 | 69 | # 函数 sigmoid 的派生函数, 为了得到输出 (即:y) 70 | def dsigmoid(y): 71 | return 1.0 - y**2 72 | 73 | class NN: 74 | ''' 三层反向传播神经网络 ''' 75 | def __init__(self, ni, nh, no): 76 | # 输入层、隐藏层、输出层的节点(数) 77 | self.ni = ni + 1 # 增加一个偏差节点 78 | self.nh = nh 79 | self.no = no 80 | 81 | self.ai = [1.0]*self.ni #当前输入层节点信息 82 | self.ah = [1.0]*self.nh #当前隐藏层节点信息 83 | self.ao = [1.0]*self.no #当前输出层节点信息 84 | 85 | # 建立权重(矩阵) 86 | self.wi = makeMatrix(self.ni, self.nh) #输入层到隐藏层的参数信息 87 | self.wo = makeMatrix(self.nh, self.no) #隐藏层到输出层的参数信息 88 | 89 | # 设为随机值 90 | for i in range(self.ni): 91 | for j in range(self.nh): 92 | self.wi[i][j] = rand(-0.2, 0.2) 93 | for j in range(self.nh): 94 | for k in range(self.no): 95 | self.wo[j][k] = rand(-2.0, 2.0) 96 | 97 | # 最后建立动量因子(矩阵) 98 | self.ci = makeMatrix(self.ni, self.nh) 99 | self.co = makeMatrix(self.nh, self.no) 100 | 101 | def update(self, inputs): 102 | if len(inputs) != self.ni-1: 103 | raise ValueError('与输入层节点数不符!') 104 | 105 | # 激活输入层 106 | for i in range(self.ni-1): 107 | #self.ai[i] = sigmoid(inputs[i]) 108 | self.ai[i] = inputs[i] 109 | 110 | # 激活隐藏层 111 | for j in range(self.nh): 112 | sum = 0.0 113 | for i in range(self.ni): 114 | sum = sum + self.ai[i] * self.wi[i][j] 115 | self.ah[j] = sigmoid(sum) 116 | 117 | # 激活输出层 118 | for k in range(self.no): 119 | sum = 0.0 120 | for j in range(self.nh): 121 | sum = sum + self.ah[j] * self.wo[j][k] 122 | self.ao[k] = sigmoid(sum) 123 | 124 | return self.ao[:] 125 | 126 | def backPropagate(self, targets, N, M): 127 | ''' 反向传播 ''' 128 | if len(targets) != self.no: 129 | raise ValueError('与输出层节点数不符!') 130 | 131 | # 计算输出层的误差 132 | output_deltas = [0.0] * self.no 133 | for k in range(self.no): 134 | error = targets[k]-self.ao[k] 135 | output_deltas[k] = dsigmoid(self.ao[k]) * error 136 | 137 | # 计算隐藏层的误差 138 | hidden_deltas = [0.0] * self.nh 139 | for j in range(self.nh): 140 | error = 0.0 141 | for k in range(self.no): 142 | error = error + output_deltas[k]*self.wo[j][k] 143 | hidden_deltas[j] = dsigmoid(self.ah[j]) * error 144 | 145 | # 更新输出层权重 146 | for j in range(self.nh): 147 | for k in range(self.no): 148 | change = output_deltas[k]*self.ah[j] 149 | self.wo[j][k] = self.wo[j][k] + N*change + M*self.co[j][k] 150 | self.co[j][k] = change 151 | #print(N*change, M*self.co[j][k]) 152 | 153 | # 更新输入层权重 154 | for i in range(self.ni): 155 | for j in range(self.nh): 156 | change = hidden_deltas[j]*self.ai[i] 157 | self.wi[i][j] = self.wi[i][j] + N*change + M*self.ci[i][j] 158 | self.ci[i][j] = change 159 | 160 | # 计算误差 161 | error = 0.0 162 | for k in range(len(targets)): 163 | error = error + 0.5*(targets[k]-self.ao[k])**2 164 | return error 165 | 166 | def test(self, patterns): 167 | for p in patterns: 168 | print(p[0], '->', self.update(p[0])) 169 | 170 | def weights(self): 171 | print('输入层权重:') 172 | for i in range(self.ni): 173 | print(self.wi[i]) 174 | print() 175 | print('输出层权重:') 176 | for j 
in range(self.nh): 177 | print(self.wo[j]) 178 | 179 | def train(self, patterns, iterations=1000, N=0.5, M=0.1): 180 | # N: 学习速率(learning rate) 181 | # M: 动量因子(momentum factor) 182 | for i in range(iterations): 183 | error = 0.0 184 | for p in patterns: 185 | inputs = p[0] 186 | targets = p[1] 187 | self.update(inputs) 188 | error = error + self.backPropagate(targets, N, M) 189 | if i % 100 == 0: 190 | print('误差 %-.5f' % error) 191 | 192 | 193 | def demo(): 194 | # 一个演示:教神经网络学习逻辑异或(XOR)------------可以换成你自己的数据试试 195 | pat = [ 196 | [[0,0], [0]], 197 | [[0,1], [1]], 198 | [[1,0], [1]], 199 | [[1,1], [0]] 200 | ] 201 | 202 | # 创建一个神经网络:输入层有两个节点、隐藏层有两个节点、输出层有一个节点 203 | n = NN(2, 2, 1) 204 | # 用一些模式训练它 205 | n.train(pat) 206 | # 测试训练的成果(不要吃惊哦) 207 | n.test(pat) 208 | # 看看训练好的权重(当然可以考虑把训练好的权重持久化) 209 | #n.weights() 210 | 211 | 212 | if __name__ == '__main__': 213 | demo() 214 | -------------------------------------------------------------------------------- /test/testBp1.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import math 4 | import random 5 | 6 | 7 | class Neroun(object): 8 | 9 | 10 | def __init__(self , weight_len , learn_rate = 0.1 , delta = random.uniform(1 , -1)): 11 | self.weights = self.init_weights(weight_len) 12 | self.delta = delta 13 | self.weight_len = weight_len 14 | self.weight_range = xrange(weight_len) 15 | self.learn_rate = learn_rate 16 | 17 | def init_weights(self , weight_len , weight_max = 0.5 , weight_min = -0.5): 18 | return [ random.uniform(weight_max , weight_min) for i in range(weight_len)] 19 | 20 | def predict(self , inputs): 21 | return self.simgod( sum( value * weight for value , weight in zip(inputs ,self.weights)) + self.delta) 22 | 23 | def simgod(self , value): 24 | return 1. / ( 1 + math.exp(-value)) 25 | 26 | def disgod(self , target): 27 | return target * (1 - target) 28 | 29 | def __len__(self): 30 | return self.weight_len 31 | 32 | def __getitem__(self , index): 33 | return self.weights[index] 34 | 35 | def __setitem__(self , index , value): 36 | self.weights[index] = self.weights[index] + self.learn_rate * value 37 | 38 | def update(self , target, predict): 39 | error = target - predict 40 | for i in self.weight_range: 41 | self.weights[i] += self.rate * error 42 | return error 43 | 44 | class Layer(object): 45 | 46 | 47 | 48 | def __init__(self , inputs_len ,neroun_len , learn_rate = 0.1): 49 | """神经元层初始化 50 | params: 51 | inputs_len 神经元输入数目 52 | neroun_len 神经元个数 53 | learn_rate 学习率 54 | return 55 | None 56 | raise 57 | None 58 | """ 59 | self.nerouns = [ Neroun(inputs_len , learn_rate = learn_rate) for i in range(neroun_len)] 60 | self.nerouns_len = neroun_len 61 | self.nerouns_range = xrange(self.nerouns_len) 62 | self.output = [ 0.] 
* self.nerouns_len 63 | 64 | def predict(self , inputs ): 65 | return [self.nerouns[i].predict(inputs) for i in self.nerouns_range ] 66 | 67 | def train_predict(self , inputs): 68 | for i in self.nerouns_range: 69 | self.output[i] = self.nerouns[i].predict(inputs) 70 | return self.output[:] 71 | 72 | def update(self , deltas): 73 | for i in self.nerouns_range: 74 | for j in xrange(len(self.nerouns[i])): 75 | self.nerouns[i][j] = deltas[i] 76 | 77 | def get_delta(self ,errors): 78 | raise NotImplemetion 79 | 80 | class OutPutLayer(Layer): 81 | 82 | def get_delta(self , errors ): 83 | return [self.output[i] * (1 - self.output[i]) * errors[i] for i in self.nerouns_range ] 84 | 85 | class HiddenLayer(Layer): 86 | 87 | def __init__(self , inputs_len ,neroun_len , next_layer, learn_rate = 0.1): 88 | super(HiddenLayer , self).__init__(inputs_len ,neroun_len , learn_rate) 89 | self.next_layer = next_layer 90 | 91 | def get_delta(self , errors ): 92 | delta = [0.] * self.nerouns_len 93 | for i in self.nerouns_range: 94 | error = sum( errors[j] * self.next_layer.nerouns[j][i] for j in self.next_layer.nerouns_range) 95 | delta[i] = self.output[i] * (1 - self.output[i])*error 96 | return delta 97 | 98 | class Bp(object): 99 | 100 | 101 | 102 | def __init__(self , inputs_len , hidden_len , outputs_len): 103 | self.input_layer_len = inputs_len 104 | self.hidden_layer_len = hidden_len 105 | self.output_layer_len = outputs_len 106 | self.output_layer = OutPutLayer(hidden_len , outputs_len) 107 | self.hidden_layer = HiddenLayer(inputs_len , hidden_len , self.output_layer) 108 | 109 | 110 | def predict(self , inputs): 111 | if len(inputs) != self.input_layer_len: 112 | raise Exception 113 | hidden_outputs = self.hidden_layer.predict(inputs) 114 | outputs = self.output_layer.predict(hidden_outputs) 115 | return outputs 116 | 117 | def _train_predict(self , inputs): 118 | if len(inputs) != self.input_layer_len: 119 | raise Exception 120 | hidden_outputs = self.hidden_layer.train_predict(inputs) 121 | outputs = self.output_layer.train_predict(hidden_outputs) 122 | return outputs 123 | 124 | 125 | def train(self , inputs ,targets ): 126 | 127 | #calc output errors 128 | predicts = self._train_predict(inputs) 129 | errors = [ pre - tar for pre , tar in zip(predicts , targets)] 130 | output_deltas = self.output_layer.get_delta(errors) 131 | hidden_deltas = self.hidden_layer.get_delta(output_deltas) 132 | self.output_layer.update(output_deltas) 133 | self.hidden_layer.update(hidden_deltas) 134 | return sum((pre - tar) ** 2 for pre , tar in zip(predicts , targets)) 135 | 136 | 137 | if __name__ == "__main__": 138 | 139 | bp = Bp(2 , 2 , 1) 140 | print bp.train([ 1 , 0 ] , [1. , 1. 
,1.,1.]) 141 | -------------------------------------------------------------------------------- /test/testCanopy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #!/usr/bin/env python 3 | 4 | from random import randint 5 | from random import sample 6 | 7 | class Canopy(object): 8 | 9 | ''' 10 | 一个类 11 | ''' 12 | 13 | def __init__(self, centre): 14 | self.centre = centre # 中心点 15 | self.datas = [] # 涵盖的数据 , 如果在class下直接声明 , 会造成多个类公用一个list 16 | 17 | def __str__(self): 18 | return '%s : [ %s ]' % ( 19 | str(self.centre), ','.join([str(data) for data in self.datas])) 20 | 21 | 22 | class CanopyCluster(object): 23 | 24 | ''' 25 | canopy 是一个粗聚类算法 26 | 主要是两个值确定: 27 | t1 外围圈子 28 | t2 内部圈子 29 | 过程 : 30 |    判断数据list是否为空 31 |      随机一个数据元素作为中心 , 建立canopy 32 | 删除这个元素 33 | 循环每个数据每个元素 , 计算它与canopy中心的距离 34 |   如果 距离小于 < t1 35 |   canopy 加入此数据 36 |   如果 距离小于 < t2 37 |   在数据中删除这个元素  38 |   将canopy 加入到聚类中心处 39 | 思想: 40 |   减少计算 , 通过两个半径有效的去除元素 41 |   可以为kmeans方法 , 提供k值参考 42 | ''' 43 | 44 | def __init__(self, t1, t2): 45 | if t1 <= t2: 46 | raise TypeError, 't1 must be bigger than t2' 47 | self.t1 = t1 48 | self.t2 = t2 49 | 50 | def cluster(self, datas): 51 | ''' 52 | function: 53 | datas 数据数组 54 | return 55 | 56 | ''' 57 | canopys = [] 58 | while len(datas) > 0: 59 | rand_center = randint(0, len(datas) - 1) 60 | canopy = Canopy(datas[rand_center]) 61 | del datas[rand_center] 62 | index = 0 63 | # 这里有个操作 , 因为for i in range(9) 这样是在一个list,删除元素无用 64 | while index < len(datas): 65 | distance = self.distance(canopy.centre, datas[index]) 66 | if distance < self.t1: 67 | canopy.datas.append(datas[index]) 68 | if distance < self.t2: 69 | del datas[index] 70 | index = index - 1 71 | index = index + 1 72 | canopys.append(canopy) 73 | return canopys 74 | 75 | def distance(self, data1, data2): 76 | raise NotImplementedError 77 | 78 | 79 | class SimpleCanopyCluster(CanopyCluster): 80 | 81 | def distance(self, data1, data2): 82 | return abs(data1 - data2) 83 | 84 | 85 | if __name__ == '__main__': 86 | s = SimpleCanopyCluster(120, 100) 87 | datas = [randint(0, 1000) for i in range(100)] 88 | for i in s.cluster(datas): 89 | print i 90 | -------------------------------------------------------------------------------- /test/testCart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | from collections import Counter 5 | from collections import defaultdict 6 | import cPickle 7 | import copy 8 | 9 | 10 | class Node(object): 11 | 12 | def __init__(self, split_attr, split_value): 13 | self.split_attr = split_attr 14 | self.split_value = split_value 15 | self.left_tree = None 16 | self.right_tree = None 17 | 18 | def __str__(self): 19 | return '[split_attr : %s split_value : %s ] [left tree %s] [right tree %s ] ' % (self.split_attr, self.split_value , self.left_tree , self.right_tree) 20 | 21 | 22 | class CartTree(object): 23 | 24 | def __init__(self): 25 | self.tree = None 26 | self.attrs = None 27 | self.attrs_dict = {} 28 | def load_model(self, file_path): 29 | ''' 30 | 加载模型 31 | file_path : 模型加载地址 32 | 功能 : 不管是否成功都会覆盖model 33 | ''' 34 | with open(file_path) as f: 35 | self.tree = cPickle.loads(f.readline().strip()) 36 | 37 | def save(self, model_path): 38 | if not self.tree: 39 | raise ValueError, 'no model can save!' 
40 | with open(model_path, 'w') as f: 41 | f.write(cPickle.dumps(self.tree)) 42 | 43 | def __train(self, datas, labels, attrs, threshold=0.01): 44 | """train函数调用的真正训练函数 45 | """ 46 | label_dict = Counter(labels) 47 | if len(label_dict.keys()) == 1: 48 | return float(label_dict.keys()[0]) 49 | if len(attrs) == 0: 50 | return sum(label for label in labels) / float(len(labels)) 51 | attr_gain, attr, split_value, left_data, left_label, right_data, right_label = self.get_best_feature( 52 | datas, labels, attrs) # 得到最好信息增益的属性 53 | if attr_gain < threshold: 54 | return sum(label for label in labels) / float(len(labels)) 55 | node = Node(attr, split_value) 56 | child_attr = self.get_split_attr(# 为下轮切割属性 57 | attrs, attr 58 | ) 59 | #创建左子树 60 | node.left_tree = self.__train( 61 | self.split_data_by_attr(left_data , attrs , attr) , left_label, child_attr, threshold) 62 | #创建 63 | node.right_tree = self.__train( 64 | self.split_data_by_attr(right_data , attrs , attr), right_label, child_attr, threshold) 65 | return node 66 | 67 | 68 | 69 | 70 | def split_data_by_attr(self, datas, attrs, attr_name): 71 | ''' 72 | 切割训练集为了下一步 73 | datas :训练的数据 [[data]] 74 | attrs 属性名称列表 75 | attr_val 属性值 76 | dense_data 是否是密集型数据 , 暂时废弃 77 | ''' 78 | dump_datas = [] 79 | index = attrs.index(attr_name) 80 | for data in datas: 81 | dump = [] 82 | dump = data[:index] 83 | dump.extend(data[index + 1:]) 84 | dump_datas.append(dump) 85 | return dump_datas 86 | 87 | def train(self, datas, attrs, labels, threshold=0.01): 88 | self.attrs = attrs 89 | self.attrs_dict = {attr : index for index , attr in enumerate(attrs)} 90 | self.tree = self.__train(datas, labels, attrs, threshold) 91 | 92 | def get_split_attr(self, attrs, attr): 93 | split_attrs = [] 94 | index = attrs.index(attr) 95 | split_attrs.extend(attrs[:index]) 96 | split_attrs.extend(attrs[index + 1:]) 97 | return split_attrs 98 | 99 | def get_split_value(self, datas, split_index): 100 | ''' 101 | 得到cart树,分割数据点index,平均数 102 | ''' 103 | if len(datas): 104 | return sum(data[split_index] for data in datas) / float(len(datas)) 105 | raise ValueError 106 | 107 | def calc_gini(self, datas, labels, split_index, split_value): 108 | """计算属性值gini值 109 | 参数: 110 | datas 数据集合 111 | labels label集合,与datas集合对应 112 | 113 | """ 114 | labels_dict = Counter(labels) 115 | label_dist_dict = { 116 | label: defaultdict(int) for label in labels_dict.keys()} 117 | left_data = [] 118 | left_label = [] 119 | right_data = [] 120 | right_label = [] 121 | for i in range(len(labels)): 122 | if datas[i][split_index] > split_value: 123 | label_dist_dict[labels[i]][1] += 1. 124 | right_data.append(datas[i]) 125 | right_label.append(labels[i]) 126 | else: 127 | label_dist_dict[labels[i]][0] += 1. 128 | left_data.append(datas[i]) 129 | left_label.append(labels[i]) 130 | gini = 0. 
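        # 编辑者注: 下面按类别累加二元基尼值 , prob 是该类别占全部样本的比例 ,
        # prob_label 是该类别划入右子树的样本占全部样本的比例 ,
        # 2 * p * (1 - p) 即二分类的 Gini 公式 , 加权求和后值越小表示切分越纯。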
131 | for label in labels_dict.keys(): 132 | prob = labels_dict[label] / float(len(labels)) 133 | prob_label = label_dist_dict[label][1] / float(len(labels)) 134 | gini += (prob * 2 * prob_label * (1 - prob_label)) 135 | return gini, left_data, left_label, right_data, right_label 136 | 137 | def get_best_feature(self, datas, labels, attrs): 138 | """得到datas数据中最好的分割属性 139 | params: 140 | datas 训练数据 eg,[[1 , 3, ,4]] 141 | labels 根据训练数据对应的label 142 | attrs 训练属性列表 143 | return 144 | 145 | raise: 146 | None 147 | """ 148 | gini_min = float('inf') 149 | left_data = None 150 | left_label = None 151 | right_data = None 152 | right_label = None 153 | split_attr = None 154 | split_value = None 155 | for split_index in range(len(attrs)): 156 | _split_value = self.get_split_value(datas, split_index) 157 | gini, _left_data, _left_label, _right_data, _right_label = self.calc_gini( 158 | datas, labels, split_index, _split_value) 159 | if gini < gini_min: 160 | gini_min = gini 161 | split_attr = attrs[split_index] 162 | left_data = _left_data 163 | left_label = _left_label 164 | right_data = _right_data 165 | right_label = _right_label 166 | split_value = _split_value 167 | return gini_min, split_attr, split_value, left_data, left_label, right_data, right_label 168 | 169 | def _classify(self, data, attrs, node): 170 | if isinstance(node, Node) is False: 171 | return node 172 | value = data[self.attrs_dict[node.split_attr]] 173 | if node.left_tree is None and node.right_tree is None: 174 | return value 175 | if value > node.split_value: 176 | return self._classify(data, attrs, node.right_tree) 177 | else: 178 | return self._classify(data, attrs, node.left_tree) 179 | 180 | def classify(self, data): 181 | """对输入的数据进行打分 182 | params 183 | data 输入数据,输入类型为list 184 | return 185 | value cart树返回数值 186 | raise 187 | Exception 188 | """ 189 | if data and isinstance(data , (list,tuple)) and len(data) and len(self.attrs): 190 | return self._classify(copy.copy(data), self.attrs , self.tree) 191 | else: 192 | raise ValueError,"data is list , eg [0 , 1 ,3 ...] 
; data length is equal train attrs"
193 | 
194 | if __name__ == '__main__':
195 |     datas = [[1, 0, 0, 0],
196 |              [1, 0, 0, 1],
197 |              [1, 1, 0, 1],
198 |              [1, 1, 1, 0],
199 |              [1, 0, 0, 0],
200 |              [2, 0, 0, 0],
201 |              [2, 0, 0, 1],
202 |              [2, 1, 1, 1],
203 |              [2, 0, 1, 2],
204 |              [2, 0, 1, 2],
205 |              [3, 0, 1, 2],
206 |              [3, 0, 1, 1],
207 |              [3, 1, 0, 1],
208 |              [3, 1, 0, 2],
209 |              [3, 0, 0, 0]]
210 |     labels = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
211 |     d = CartTree()
212 |     d.train(datas, [1, 2, 3, 4], labels)
213 |     print d.tree
214 |     print d.classify([3, 1 , 0 , 0])
215 |     print d.attrs
--------------------------------------------------------------------------------
/test/testDDistance.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import math
 4 | 
 5 | 
 6 | class DDistance(object):
 7 | 
 8 |     def distance(self, data1, data2):
 9 |         raise NotImplementedError
10 | 
11 | 
12 | class Manhattan(DDistance):
13 | 
14 |     """
15 |     算法实现曼哈顿距离
16 |     """
17 | 
18 |     def distance(self, data1, data2):
19 |         if len(data1) != len(data2):
20 |             raise ValueError
21 |         return sum([abs(data1[i] - data2[i]) for i in range(len(data1))])
22 | 
23 | 
24 | class DefaultDistance(DDistance):
25 | 
26 |     def distance(self, data1, data2):
27 |         return math.sqrt(
28 |             sum([
29 |                 (data1[i] - data2[i]) ** 2
30 |                 for i in range(len(data1))
31 |             ])
32 |         )
33 | 
34 | 
35 | class Chebyshev(DDistance):
36 | 
37 |     """
38 |     切比雪夫距离
39 |     """
40 | 
41 |     def distance(self, data1, data2):
42 |         if len(data1) != len(data2):
43 |             raise ValueError
44 |         return max([abs(data1[i] - data2[i]) for i in range(len(data1))])
45 | 
46 | 
47 | class Cosine(DDistance):
48 | 
49 |     """
50 |     余弦距离 , 返回 1 - 余弦相似度 , 值越小越相似
51 |     """
52 | 
53 |     def distance(self, data1, data2):
54 |         if len(data1) != len(data2):
55 |             raise ValueError
56 |         # 余弦相似度的分母是两个向量模长的乘积 , 原实现误写成了求和
57 |         return 1 - sum([data1[i] * data2[i] for i in range(len(data1))]) / (
58 |             math.sqrt(sum([data ** 2 for data in data1])) *
59 |             math.sqrt(sum([data ** 2 for data in data2]))
60 |         )
61 | 
62 | 
63 | class Hamming(DDistance):
64 | 
65 |     """
66 |     海明距离 , 统计取值不同的位置所占比例
67 |     """
68 | 
69 |     def distance(self, data1, data2):
70 |         return sum([1 if data1[i] != data2[i] else 0 for i in range(len(data1))]) / float(len(data1))
71 | 
72 | 
73 | class Euclidean(DDistance):
74 | 
75 |     """
76 |     欧式距离
77 |     """
78 | 
79 |     def distance(self, data1, data2):
80 |         return math.sqrt(sum([(data1[i] - data2[i]) ** 2 for i in range(len(data1))]))
81 | 
--------------------------------------------------------------------------------
/test/testDataSet.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | __ALL__ = ["DList" , "DenseData" , "SparseData" , "DataSet"]
 5 | 
 6 | class DList(list):
 7 | 
 8 |     def items(self):
 9 |         return enumerate(self)
10 | 
11 |     def has_key(self , value):
12 |         if value is not None and isinstance(value , (int , long)):
13 |             if value >= 0 and value < len(self):
14 |                 return True
15 |             return False
16 |         raise TypeError
17 | 
18 | 
19 |     def update(self , data):
20 |         if data and hasattr(data , "items"):
21 |             for index , value in data.items():
22 |                 if index >= len(self):
23 |                     self.append(value)
24 |                 else:
25 |                     self[index] = value
26 |         elif data and isinstance(data , (list , tuple)):
27 |             for index , value in enumerate(data):
28 |                 if index >= len(self):
29 |                     self.append(value)
30 |                 else:
31 |                     self[index] = value
32 | 
33 |     def keys(self):
34 |         return xrange(len(self))
35 | 
36 |     def order_key(self):
37 |         return xrange(len(self))
38 | 
39 |     def values(self):
40 |         return self
41 | 
42 | class DenseData(DList):
43 |     """多维数组实现
44 | 
""" 45 | 46 | def __init__(self , data , default_value , data_len , *argv , **kw): 47 | if data is None: 48 | for i in xrange(data_len): 49 | self.append(default_value) 50 | elif isinstance(data , (list , tuple)) and len(data) == data_len: 51 | self.extend(data) 52 | elif isinstance(data , dict): 53 | for i in xrange(data_len): 54 | self.append(data.get( i , default_value)) 55 | else: 56 | raise TypeError("data type must be in [list , tuple , dict]") 57 | self._data_len = data_len 58 | 59 | def __len__(self): 60 | return self._data_len 61 | 62 | def __setitem__(self , index , value ): 63 | if index and index < self._data_len: 64 | super(DeseData , self).__setitem__(index , value) 65 | else: 66 | raise IndexError 67 | 68 | class SparseData(dict): 69 | """稀疏矩阵实现,利用dict实现;在存储空间上不占优势;;一般用于文本向量 70 | 计算中使用 71 | """ 72 | def __init__(self ,data , default_value , data_len , *argv , **kw): 73 | super(SparseData , self).__init__(*argv , **kw) 74 | self._default = default_value 75 | self.data_len = data_len 76 | if data is not None: 77 | self.update(data) 78 | 79 | def __getitem__(self , index ): 80 | return super(SparseData , self).__getitem__(index) if index in self else self._default 81 | 82 | def __setitem__(self , index , value): 83 | if index is not None: 84 | if index < self.data_len: 85 | super(SparseData , self).__setitem__(index , value ) 86 | else: 87 | raise IndexError 88 | else: 89 | raise ValueError 90 | 91 | def __len__(self): 92 | return self.data_len 93 | 94 | def order_key(self): 95 | return sorted(self.keys() , reverse = False) 96 | 97 | class DataSet(DList): 98 | """实现稀疏/非稀疏矩阵保存结构 99 | """ 100 | 101 | def __init__(self ,data_len, dense_data , *argv , **kw): 102 | """初始化矩阵 103 | params: 104 | data_len 矩阵维数 105 | dense_data 是否为稀疏矩阵 106 | return 107 | None 108 | raise 109 | None 110 | """ 111 | super(DataSet , self).__init__(*argv , **kw) 112 | self._type = dense_data 113 | self._data_len = data_len 114 | self._data_class = DeseData if dense_data is False else SparseData 115 | self._range = xrange(self._data_len) 116 | 117 | def append(self , data = None): 118 | """增加数据 119 | params: 120 | data 需要增加的数据;类型:tuple,list,dict 121 | return 122 | True 123 | raise: 124 | data 类型不符合需求,抛出TypeError 125 | """ 126 | if data is None or isinstance(data , (list , tuple , dict)) : 127 | super(DataSet , self).append(self._data_class(data ,-1 , self._data_len )) 128 | return True 129 | else: 130 | raise TypeError 131 | 132 | def extend(self , data): 133 | """增加同类型数据 134 | """ 135 | if isinstance(data , DataSet): 136 | if data._data_len == self._data_len and data._type == self._type: 137 | super(DataSet , self).extend(data) 138 | else: 139 | raise ValueError 140 | return 141 | raise TypeError 142 | 143 | 144 | def shape(self): 145 | return len(self) , self._data_len 146 | 147 | def data_range(self): 148 | return self._range 149 | -------------------------------------------------------------------------------- /test/testDbScan.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from random import randint 4 | from testBaseStrut import WeightArray 5 | class ClusterItem(object): 6 | 7 | 8 | def __init__(self , data): 9 | self.data = data 10 | self.neighbours = [] 11 | self.visited = False 12 | self.cluster = 0 13 | 14 | 15 | class DbScan(object): 16 | 17 | 18 | def cluster(self , datas , radius , minPoint): 19 | ''' 20 | 算法:DBSCAN 21 | 参数: 22 | radius 半径 23 | minPoint 给定点在radius领域内成为核心对象的最小领域点数 24 | 输出:目标类簇集合 25 | 方法: 26 | repeat 27 | 1) 
判断输入点是否为核心对象 28 | 2) 找出核心对象的E领域中的所有直接密度可达点 29 | util 所有输入点都判断完毕 30 | 31 | repeat 32 | 针对所有核心对象的E领域所有直接密度可达点找到最大密度相连对象集合, 33 | 中间涉及到一些密度可达对象的合并。 34 | Util 所有核心对象的E领域都遍历完毕 35 | ''' 36 | cluters = [] 37 | weight_map = WeightArray(datas , self.distance) 38 | items = [ ClusterItem(data) for data in datas ] 39 | k = 1 40 | for i in range(items): 41 | if items[i].visited == False: 42 | neighbours = [ items[j] for j in range(items) if i != j and weight_map[(i,j)] < radius ] 43 | if len(neighbours) >= minPoint: 44 | items[i].visited = True 45 | items[i].cluster = k 46 | for neighbour in neighbours: 47 | if neighbour.visited == False or neighbour.cluster == -1: 48 | neighbour.cluster = k 49 | neighbour.visited = True 50 | items[i].data.append(neighbour) 51 | 52 | elif neighbour.visited == True and neighbour.cluster != -1: 53 | neighbour.cluster = k 54 | for item in neighbour.data: 55 | item.cluster = k 56 | items[i].data.extend(neighbour.data) 57 | del neighbour.data[:] 58 | k += 1 59 | else: 60 | items[i].visited = True 61 | items[i].cluster = -1 62 | 63 | 64 | 65 | 66 | def distance(self , data1 , data2 ): 67 | raise NotImplementedError 68 | 69 | 70 | if __name__ == '__main__': 71 | t = DbScan() 72 | 73 | 74 | datas = [[ _ , randint(0, 20) * 1.0, randint(0, 20) * 1.0] for _ in range(100)] 75 | t.cluster() 76 | print datas 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /test/testDecisionTree.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | from collections import defaultdict 5 | from collections import Counter 6 | from math import log 7 | 8 | 9 | class Node(object): 10 | 11 | def __init__(self, attr_name=None, label=None): 12 | self.attr_name = attr_name 13 | self.label = label 14 | self.child = {} 15 | 16 | def __str__(self): 17 | child = '\t'.join(['%s %s' % (val , str(node)) for val , node in self.child.items()]) 18 | return 'attr: %s \t label : %s \t childs : [ %s ] ' % (self.attr_name, self.label, self.child) 19 | 20 | 21 | 22 | 23 | class DecisionTree(object): 24 | 25 | def __init__(self): 26 | self.tree = None 27 | self.attrs = None 28 | 29 | def train(self, datas, attrs, threshold=0.01, denseData=True, tree=None): 30 | if self.attrs == None: 31 | self.attrs = attrs 32 | node = Node() 33 | if self.tree == None: 34 | self.tree = node 35 | label_dict = Counter([data[-1] for data in datas]) 36 | if len(label_dict.keys()) == 1: 37 | node.label = datas[0][-1] 38 | return node # 如果都输于同一类 , 则返回树 39 | # return Node(label=datas[0][-1]) 40 | if len(attrs) == 0: 41 | node.label = label_dict.most_common()[0][0] 42 | return node # 如果属性为空 , 则返回绝大数的类标记 43 | # return Node(label=label_dict.most_common()[0][0]) 44 | attr, attr_gain, attr_val = self.getBestFeature( 45 | datas, attrs, denseData)[0] # 得到最好信息增益的属性 46 | if attr_gain < threshold: 47 | node.label = label_dict.most_common()[0][0] 48 | return node 49 | node.attr_name = attr 50 | for val in attr_val: 51 | #按照属性不同value 区分这个 52 | #取得最好分类属性 , 按照不同该属性不同val 区分数据 ; 53 | node.child[val] = self.train( 54 | self.splitDataByAttr( 55 | datas, attrs, attr, val 56 | ), 57 | self.getSplitAttrs( 58 | attrs, attr 59 | ), 60 | threshold, 61 | denseData, node) 62 | return node 63 | 64 | @staticmethod 65 | def entropy(probs): 66 | if probs: 67 | if isinstance(probs, (list, tuple)): 68 | return sum([-prob * log(prob, 2) for prob in probs]) 69 | elif isinstance(probs, (int, float)): 70 | return -probs * log(probs, 2) 71 | 72 | def 
getSplitAttrs(self, attrs, attr): 73 | split_attrs = [] 74 | index = attrs.index(attr) 75 | split_attrs.extend(attrs[:index]) 76 | split_attrs.extend(attrs[index + 1:]) 77 | return split_attrs 78 | 79 | def getBestFeature(self, datas, attrs, denseData): 80 | ''' 81 | 通过算法获得最好分类的属性 ; 82 | 思想: 83 | 1. 信息增益 84 | 2. 信息增益率 85 | 参数: 86 | datas 训练的数据 87 | attrs 属性列表 88 | deseData 是否为密集型数据 , == True [[v1 , v2 ,v3 ,v4 .... vn , label]] 89 | == False [({f1 : v1 , f2:v2...fn:vn} , label1)] 90 | ''' 91 | label_dict = defaultdict(float) 92 | for data in datas: 93 | label_dict[data[-1]] += 1 94 | data_num = len(datas) # 计算此次计算信息增益的数据长度 , 样本大小 95 | label_entropy = DecisionTree.entropy( 96 | [label_count / data_num for label_count in label_dict.values()]) # 计算整个系统的熵 97 | # 计算每个属性的熵 98 | # 声明一个属性列表 , {属性 : {属性值 : 出现的次数}} 99 | attr_value_count = {attr: defaultdict(float) for attr in attrs} 100 | # 声明属性->属性值->类别->数量 101 | attr_value_class_count = {attr: defaultdict(dict) for attr in attrs} 102 | iter_index = range(len(attrs)) 103 | for data in datas: 104 | if denseData: 105 | for i in iter_index: 106 | # 计算每个属性下不同值数量 , 此处必要转换为离散变量 107 | attr_value_count[attrs[i]][data[i]] += 1 108 | if not attr_value_class_count[attrs[i]][data[i]].has_key(data[-1]): 109 | attr_value_class_count[attrs[i]][data[i]][ 110 | data[-1]] = 0. 111 | attr_value_class_count[attrs[i]][data[i]][data[-1]] += 1.0 112 | # 信息增益计算公式分析 113 | # H(D) - H(D|A) 114 | # 系统熵 - 每个属性下 , 存在这个类别的信息熵 115 | # 116 | # gains = [(属性名称 , 信息增益 , (属性值))......(属性名称n , 信息增益n , (f1 ...fn))] 117 | gains = [(attr, 118 | label_entropy - 119 | sum( 120 | [ 121 | attr_value_count[attr][value] / data_num * 122 | DecisionTree.entropy( 123 | [ 124 | # 计算每个属性在特定属性值时 , 发生的概率 125 | # p(DA1)/A 126 | attr_value_class_count[attr][value][ 127 | label] / attr_value_count[attr][value] 128 | # 循环每个属性值在特定label产生 129 | for label in attr_value_class_count[attr][value].keys() 130 | ] 131 | ) 132 | for value in attr_value_count[attr].values() if attr_value_class_count[attr].has_key(value)] 133 | ), 134 | attr_value_count[attr].keys()) 135 | for attr in attr_value_count.keys()] 136 | return sorted(gains, key=lambda x: x[1], reverse=True) 137 | 138 | def splitDataByAttr(self, datas, attrs, attr_name, attr_value, denseData=True): 139 | ''' 140 | 切割训练集为了下一步 141 | datas :训练的数据 [[data]] 142 | attrs 属性名称列表 143 | attr_val 属性值 144 | denseData 是否是密集型数据 , 暂时废弃 145 | ''' 146 | dump_datas = [] 147 | index = attrs.index(attr_name) 148 | for data in datas: 149 | dump = [] 150 | if data[index] == attr_value: 151 | dump = data[:index] 152 | dump.extend(data[index + 1:]) 153 | dump_datas.append(dump) 154 | return dump_datas 155 | 156 | def classify(self, data): 157 | ''' 158 | 功能: 用于分类模型 159 | 参数 : 160 | data 待分析的数据 , list 161 | 返回: 162 | 返回决策树的label 163 | ''' 164 | if self.tree == None: 165 | raise Exception, 'no model !' 
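        # 编辑者注: 从根节点出发 , 取当前节点分裂属性对应的数据取值 ,
        # 沿 child 词典逐层下行 , 直到碰到带 label 的叶子节点为止。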
166 |         node = self.tree
167 |         if node.label != None:
168 |             return node.label
169 |         for _ in range(len(data)):
170 |             index = self.attrs.index(node.attr_name)
171 |             node = node.child[data[index]]
172 |             if node.label != None:
173 |                 return node.label
174 |         return None
175 | 
176 | if __name__ == '__main__':
177 |     # 测试数据
178 |     # 是否必须水里    是否有脚蹼    属于鱼类
179 |     data = [ [1, 0, 1], [0, 1, 0], [1, 1, 1]]
180 |     d = DecisionTree()
181 |     d.train(data, [1, 2])
182 |     print d.classify([1, 0])
--------------------------------------------------------------------------------
/test/testDefaultValue.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from collections import defaultdict
 3 | 
 4 | class MissingValue(object):
 5 | 
 6 |     def get_value(self, feature):
 7 |         raise NotImplementedError
 8 | 
 9 | 
10 | class ArvgMissingValue(MissingValue):
11 | 
12 |     def __init__(self, feature_len):
13 |         self.default_values = [None for i in range(feature_len)]
14 | 
15 |     def add(self, feature, value):
16 |         self.default_values[feature] = value
17 |         #if self.default_values[feature] is None
18 |         #else (self.default_values[feature] + value)
--------------------------------------------------------------------------------
/test/testDict.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | 
 6 | import collections
 7 | import testDataSet
 8 | 
 9 | class Dictionary(dict):
10 | 
11 |     def __init__(self , **kw):
12 |         if "dict_path" in kw:
13 |             self.open_dict(kw["dict_path"])
14 |         elif "words" in kw:
15 |             self.update({word:word_seq for word_seq , word in enumerate(kw["words"])})
16 | 
17 |     def open_dict(self , dict_path):
18 |         with open(dict_path) as f:
19 |             for seq , word in enumerate(f.readlines()):
20 |                 self[word.strip()] = seq
21 | 
22 |     def __setitem__(self ,key, value):
23 |         # 词典只追加: 新词的下标取当前词典大小 , 已有的词保持原下标不变
24 |         if key and key not in self:
25 |             super(Dictionary , self).__setitem__(key , len(self))
26 | 
27 | 
28 | 
29 |     def to_vector(self , words):
30 |         word_counter = collections.Counter(words.split())
31 |         vector = [0] * len(self)
32 |         for word,count in word_counter.items():
33 |             if word in self:
34 |                 vector[self[word]] = count
35 |         return vector
36 | 
37 |     def to_one_hot(self , words):
38 |         words = set(words.split())
39 |         vector = [0] * len(self)
40 |         for word in words:
41 |             if word in self:
42 |                 vector[self[word]] = 1
43 |         return vector
44 | 
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     d = Dictionary(words = ["a" , "b" ,"c"])
49 |     d['c'] = 5
50 |     d['d'] = 6
51 |     print d.to_vector("b b a")
52 | 
--------------------------------------------------------------------------------
/test/testEmm.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | class Emm(object):
10 | 
11 | 
12 | 
13 |     def __init__(self ):
14 |         pass
15 | 
16 | 
17 | 
18 | if __name__ == '__main__':
19 |     pass
--------------------------------------------------------------------------------
/test/testFeature.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | import bisect
 6 | 
 7 | 
 8 | class Binning(object):
 9 | 
10 | 
11 | 
12 |     def __init__(self,k,box = None):
13 |         self.k = k
14 |         if box is None:
15 |             self._box = [0.]
* (k - 1) 16 | elif isinstance(box,(list),tuple): 17 | if len(box) != self.k - 1: 18 | raise ValueError("input box number not equal k, please recheck") 19 | self._box = box 20 | 21 | 22 | def train(self,features): 23 | pass 24 | 25 | 26 | def predict(self,feature): 27 | if feature is None: 28 | raise TypeError 29 | return bisect.bisect(self._box,feature) 30 | 31 | 32 | def _sort(self,array): 33 | return sorted(array,reversed = False) 34 | 35 | 36 | 37 | class EqualRate(Binning): 38 | 39 | 40 | def train(self,features): 41 | features = self._sort(features) 42 | for i in range(0,len(features),len(features) / self.k): 43 | pass 44 | 45 | class EqualLength(Binning): 46 | """equal length excute feature 47 | 48 | >>> f = EqualLength(5) 49 | >>> from random import randint 50 | >>> array = [randint(0,100000) for i in range(10000)] 51 | >>> f.train(array) 52 | >>> f.predict(899) 53 | 0 54 | """ 55 | 56 | def train(self,features): 57 | min_value = min(features) 58 | max_value = max(features) 59 | length = (max_value - min_value) / self.k 60 | for i in range(self.k - 1): 61 | self._box[i] = min_value + length * (i + 1) 62 | -------------------------------------------------------------------------------- /test/testGRTree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intohole/moodstyle/1d06fc565c0df4bf07196854f3efb94bbefd1bfb/test/testGRTree.py -------------------------------------------------------------------------------- /test/testHCluster.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | import math 5 | from testBaseStrut import WeightArray 6 | 7 | 8 | 9 | class HierarchicalClustering(object): 10 | 11 | def __init__(self): 12 | pass 13 | 14 | def cluster(self, datas, cluster_num, threshold=0.03): 15 | ''' 16 | 17 | ''' 18 | 19 | no_change = False 20 | 21 | # 创建数据距离词典 22 | distance_map = WeightArray(datas, self.distance) 23 | # 创建一个cluster,每个数据都是一个cluster 24 | clusters = [[datas[i]] for i in range(len(datas))] 25 | 26 | # 如果聚类不小于要求聚类数目继续 27 | while len(clusters) > cluster_num: 28 | min_distance = None #最短距离保存值 29 | min_cluster_pair = None #最短距离所对应的数据 30 | for i in range(len(clusters)): 31 | for j in range(i + 1, len(clusters)): 32 | d = self.get_cluster_distance( 33 | clusters[i], clusters[j], distance_map) 34 | if d < threshold and (min_distance is None or d < min_distance): 35 | min_distance = d 36 | min_cluster_pair = (i, j) 37 | if min_cluster_pair: 38 | clusters[min_cluster_pair[0]].extend( 39 | clusters[min_cluster_pair[1]]) 40 | del clusters[min_cluster_pair[1]] 41 | else: 42 | break 43 | return clusters 44 | 45 | def distance(self, data1, data2): 46 | ''' 47 | function: 48 | 计算两个数据的距离 49 | params: 50 | data1 第一个数据 51 | data2 第二个数据 52 | return 53 | distance 两个数据的距离 54 | ''' 55 | 56 | return math.sqrt(sum([(data1[i] - data2[i]) ** 2 for i in range(1, len(data1))])) 57 | 58 | def get_cluster_distance(self, cluster1, cluster2, distance_map): 59 | ''' 60 | function 61 | 实现类之间平均距离 62 | params: 63 | cluster1 簇1 64 | cluster2 簇2 65 | distance_map DataDistance实例 66 | return 67 | 两个类之间平均距离 68 | ''' 69 | raise NotImplementedError 70 | 71 | 72 | class ALHierarchicalClustering(HierarchicalClustering): 73 | ''' 74 | 主要算法: 75 | 计算cluster之间平均距离 76 | ''' 77 | def get_cluster_distance(self, cluster1, cluster2, distance_map): 78 | return sum([sum(distance_map[(data1[0], data2[0])]for data2 in cluster2) for data1 in cluster1]) / float(len(cluster1) * len(cluster2)) 79 | 80 | 
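# 编辑者补充的说明(示意 , 非原仓库代码): 平均距离的定义是
#     d(C1, C2) = sum_{x in C1} sum_{y in C2} d(x, y) / (|C1| * |C2|)
# 例如 C1 = {a, b} , C2 = {c} 时 , d(C1, C2) = (d(a,c) + d(b,c)) / 2 ;
# 下面的单连接/全连接则分别取最近/最远的一对点 , 不再除以簇大小。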
81 | class SLHierarchicalClustering(HierarchicalClustering):
82 |     '''
83 |     主要算法:
84 |         两个cluster中最小的两个数据之间距离
85 |     '''
86 | 
87 |     def get_cluster_distance(self, cluster1, cluster2, distance_map):
88 |         # 单连接: 取两个簇之间最近的一对数据点的距离 , 不需要再除以簇大小
89 |         return min([min(distance_map[(data1[0], data2[0])] for data2 in cluster2) for data1 in cluster1])
90 | 
91 | 
92 | class CLHierarchicalClustering(HierarchicalClustering):
93 |     '''
94 |     主要算法:
95 |         两个cluster中距离最大两个数据距离
96 |     '''
97 | 
98 |     def get_cluster_distance(self, cluster1, cluster2, distance_map):
99 |         # 全连接: 取两个簇之间最远的一对数据点的距离
100 |         return max([max(distance_map[(data1[0], data2[0])] for data2 in cluster2) for data1 in cluster1])
101 | 
102 | 
103 | if __name__ == '__main__':
104 | 
105 |     hc = ALHierarchicalClustering()
106 |     from random import randint
107 |     datas = [[i, randint(1, 20), randint(1, 20)] for i in range(10)]
108 |     clusters = hc.cluster(datas, 4, 100)
109 |     for cluster in clusters:
110 |         print cluster
--------------------------------------------------------------------------------
/test/testHmm.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | from collections import defaultdict
 6 | import sys
 7 | reload(sys)
 8 | sys.setdefaultencoding('utf-8')
 9 | from cPickle import load
10 | from cPickle import dump
11 | 
12 | class HmmItem(object):
13 | 
14 |     __slots__ = ('obs', 'hide')
15 | 
16 |     def __init__(self, obs, hide):
17 |         self.obs = obs
18 |         self.hide = hide
19 | 
20 |     def __str__(self):
21 |         return 'obs_state: %s \t hide_state: %s' % (self.obs, self.hide)
22 | 
23 | 
24 | class HmmItems(list):
25 | 
26 |     '''
27 |     主要为了存储序列性观察与隐藏相对应状态 ;
28 |     主要方法:
29 |         t[1]=(1,2)
30 |         t.append(HmmItem) or t.append((obs , hide))
31 |     '''
32 | 
33 |     def __check(self, value):
34 |         if not value:
35 |             raise ValueError, 'value is nothing , keep it out'
36 | 
37 |     def __setitem__(self, key, value):
38 |         self.__check(value)
39 |         if isinstance(value, HmmItem):
40 |             super(HmmItems, self).__setitem__(key, value)
41 |         elif isinstance(value, (tuple, list)) and len(value) == 2:
42 |             super(HmmItems, self).__setitem__(key, HmmItem(value[0], value[1]))
43 |         else:
44 |             raise TypeError, 'HmmItems append accept type only ( HmmItem , tuple or list which is first item is obs state and second is hide state!) '
45 | 
46 |     def append(self, value):
47 |         self.__check(value)
48 |         if isinstance(value, HmmItem):
49 |             super(HmmItems, self).append(value)
50 |         elif isinstance(value, (tuple, list)) and len(value) == 2:
51 |             super(HmmItems, self).append(HmmItem(value[0], value[1]))
52 |         else:
53 |             raise TypeError, 'HmmItems append accept type only ( HmmItem , tuple or list which is first item is obs state and second is hide state!) 
' 54 | 55 | def __str__(self): 56 | return ' '.join(['%s' % str(i) for i in self]) 57 | 58 | class HmmModel(object): 59 | 60 | def __init__(self, states): 61 | if not (states and isinstance(states, (list, tuple))): 62 | raise ValueError 63 | self.states_count = defaultdict(float) 64 | self.obs_state = defaultdict(float) # obs state {obs_state : count } 65 | self.states = states # obs state list 66 | self.start_states = self.create_start_states(states) # start probability 67 | self.transition_probability = self.create_transition_probability(states) 68 | self.emission_probability = self.create_emission_probability(states) 69 | 70 | def create_start_states(self, states, init_value=0.): 71 | start_states = defaultdict(float) 72 | for state in states: 73 | start_states[state] += init_value 74 | return start_states 75 | 76 | def create_transition_probability(self, states, init_value=0.): 77 | transition_probability = {} 78 | for state in states: 79 | transition_probability[state] = defaultdict(float) 80 | for after_state in states: 81 | transition_probability[state][after_state] += init_value 82 | return transition_probability 83 | 84 | def create_emission_probability(self, states): 85 | emission_probability = {} 86 | for state in states: 87 | emission_probability[state] = defaultdict(float) 88 | return emission_probability 89 | 90 | 91 | 92 | 93 | class Hmm(object): 94 | 95 | def __init__(self, model_path): 96 | model = self.load(model_path) 97 | if model and isinstance(model, HmmModel): 98 | self.model = model 99 | else: 100 | raise TypeError, 'model file not have right hmm model! : %s' % model_path 101 | 102 | def load(self, model_path): 103 | with open(model_path, 'rb') as f: 104 | return load(f) 105 | 106 | def viterbi(self, obs): 107 | ''' 108 | 特比算法 摘自wiki 维特比算法 109 | ''' 110 | V = [{}] 111 | path = {} 112 | for y in self.model.states: 113 | V[0][y] = self.model.start_states[y] * \ 114 | self.model.emission_probability[y][obs[0]] 115 | path[y] = [y] 116 | for t in range(1, len(obs)): 117 | V.append({}) 118 | newpath = {} 119 | for y in self.model.states: 120 | (prob, state) = max( 121 | [(V[t - 1][y0] * self.model.transition_probability[y0][y] * self.model.emission_probability[y][obs[t]], y0) for y0 in self.model.states]) 122 | V[t][y] = prob 123 | newpath[y] = path[state] + [y] 124 | path = newpath 125 | (prob, state) = max([(V[len(obs) - 1][y], y) 126 | for y in self.model.states]) 127 | return (prob, path[state]) 128 | 129 | 130 | class TrainHmm(object): 131 | 132 | def __init__(self, states): 133 | self.hmm = HmmModel(states) 134 | 135 | def save(self, model_path): 136 | with open(model_path, 'wb') as f: 137 | dump(self.hmm, f) 138 | 139 | def add_items(self, hmmitems): 140 | """将序列转换为hmmitems ,添加hmm训练器 141 | params 142 | hmmitems HmmItems 143 | return 144 | None 145 | raise 146 | None 147 | """ 148 | for i in range(len(hmmitems) - 1): 149 | self.hmm.transition_probability[hmmitems[i].hide][ 150 | hmmitems[i + 1].hide] += 1 151 | self.hmm.start_states[hmmitems[0].hide] += 1 152 | for item in hmmitems: 153 | self.hmm.obs_state[item.obs] += 1 154 | self.hmm.states_count[item.hide] += 1 155 | self.hmm.emission_probability[item.hide][item.obs] += 1 156 | 157 | def translate(self): 158 | 159 | startsCount = sum(self.hmm.start_states.values()) 160 | # 计算开始状态概率 161 | for state in self.hmm.start_states.keys(): 162 | self.hmm.start_states[state] = self.hmm.start_states[ 163 | state] / startsCount 164 | # 转移矩阵 165 | hide_state_keys = self.hmm.transition_probability.keys() 166 | hide_stats_count 
= sum(self.hmm.states_count.values())
167 |         for hide_state in hide_state_keys:
168 |             for after_hide_state in hide_state_keys:
169 |                 self.hmm.transition_probability[hide_state][after_hide_state] = ( self.hmm.transition_probability[hide_state][
170 |                     after_hide_state] + 1.0) / ( self.hmm.states_count[hide_state] + hide_stats_count)
171 |         # 可观察状态下的隐藏状态发生概率
172 |         for hide_state in self.hmm.emission_probability.keys():
173 |             for obs_state in self.hmm.obs_state.keys():
174 |                 # 注释下 : 在这个观察状态下 , 隐藏状态发生的概率 , 如果是 ( 可观察状态 in 此隐藏状态 ) / 可观察状态
175 |                 # in this obs state , hide state will
176 |                 # p(hide_state | obs_state)
177 |                 # p(A|B) = P(AB) / P(B) = Count(AB) / count(B)
178 |                 self.hmm.emission_probability[hide_state][obs_state] = (
179 |                     self.hmm.emission_probability[hide_state][obs_state] + 1.) / ( self.hmm.states_count[hide_state] + hide_stats_count )
180 | 
181 | 
182 | class TrainSeg(object):
183 | 
184 |     def __init__(self , states = ['s' , 'e' , 'm' ,'b']):
185 |         self.model = TrainHmm(states)
186 | 
187 | 
188 |     def add_line(self , line):
189 |         if len(line) != 0:
190 |             words = line.decode("utf-8").split()
191 |             hmmitems = []
192 |             for word in words:
193 |                 for item in self.word_state(word):
194 |                     hmmitems.append(item)
195 |             self.model.add_items(hmmitems)
196 |             return True
197 | 
198 |     def word_state(self , word):
199 |         if len(word) == 0:
200 |             return
201 |         elif len(word) == 1:
202 |             yield HmmItem(word , 's')
203 |         elif len(word) == 2:
204 |             yield HmmItem(word[0], 'b')
205 |             yield HmmItem(word[1] , 'e')
206 |         elif len(word) >=3:
207 |             yield HmmItem(word[0] , 'b')
208 |             for _word in word[1:-1]:
209 |                 yield HmmItem(_word , 'm')
210 |             yield HmmItem(word[-1] , 'e')
211 | 
212 |     def train(self):
213 |         self.model.translate()
214 | 
215 | if __name__ == '__main__':
216 | 
217 |     t = TrainSeg()
218 |     t.add_line('我 爱 中国 !')
219 |     t.train()
220 | 
221 | 
--------------------------------------------------------------------------------
/test/testInterface.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | class Classify(object):
 8 | 
 9 | 
10 | 
11 | 
12 |     def classify(self , data , *argv , **kw):
13 |         raise NotImplementedError
14 | 
15 | 
16 | class Regression(object):
17 | 
18 | 
19 | 
20 | 
21 |     def predict(self , data , *argv , **kw):
22 |         raise NotImplementedError
--------------------------------------------------------------------------------
/test/testKmeans.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | #!/usr/bin/env python
 3 | 
 4 | from random import randint
 5 | from random import sample
 6 | from collections import Counter
 7 | from collections import defaultdict
 8 | from math import sqrt
 9 | import re
10 | from testDDistance import Manhattan
11 | from testDDistance import Hamming
12 | from testDDistance import Cosine
13 | from testDDistance import Euclidean
14 | from copy import copy
15 | import sys
16 | 
17 | '''
18 | 处理数据的格式 [数据1,数据2]
19 | 但是必须要改写 def distance(data1,data2) 数据距离函数
20 | 数据转换格式 {分类:{数据的位置:数据距离}}
21 | '''
22 | 
23 | 
24 | class Center(object):
25 | 
26 |     def __init__(self, label, center_vector, distance_fun=None):
27 |         if not isinstance(label, (int, long, basestring)):
28 |             raise TypeError
29 |         if 
--------------------------------------------------------------------------------
/test/testKmeansPlusPlus.py:
--------------------------------------------------------------------------------
# coding=utf-8

from testKmeans import Kmeans
from testKmeans import DKmeans
from random import randint
from random import random
from copy import copy


class KmeansPlusPlus(Kmeans):

    def rand_seed(self, datas, k):
        '''
        function:
            kmeans++ differs from kmeans only in how the seeds are generated
        params:
            datas  the data to cluster
            k      the number of clusters
        '''
        seeds = [(
            copy(datas[randint(0, len(datas) - 1)]), 0
        )]  # initialise the seed pool with one random point
        # pick the remaining k - 1 seeds
        for k_iter in range(k - 1):
            ds = []  # distance of every point to its nearest seed
            for data in datas:
                ds.append(
                    min(self.distance(seed[0], data)
                        for seed in seeds))
            # roulette-wheel selection: a point is picked with probability
            # proportional to its distance to the nearest existing seed
            sum_distance = sum(ds)
            rand_distance = random() * sum_distance
            for i in range(len(ds)):
                rand_distance -= ds[i]
                if rand_distance <= 0:
                    seeds.append((copy(datas[i]), k_iter + 1))
                    break
        return seeds


class DKmeansPlusPlus(KmeansPlusPlus, DKmeans):

    pass


if __name__ == '__main__':
    k = DKmeansPlusPlus()
    datas = [[randint(0, 20) * 1.0, randint(0, 20) * 1.0] for _ in range(200)]
    labels = k.cluster(datas, 5, 200, diff=0.00001)
    print labels
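A toy 1-d walk-through of that roulette step, with purely illustrative numbers. Note that canonical kmeans++ weights by squared distance, while the code above weights by whatever the distance mixin returns:

from random import random

points = [0.0, 1.0, 2.0, 10.0]
ds = [abs(p - 0.0) for p in points]    # distance to the only seed, 0.0
r = random() * sum(ds)                 # sum(ds) == 13.0
for i, d in enumerate(ds):
    r -= d
    if r <= 0:
        print 'next seed:', points[i]  # 10.0 wins with probability 10/13
        break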
--------------------------------------------------------------------------------
/test/testKnn.py:
--------------------------------------------------------------------------------
# coding=utf-8

from collections import Counter


class ClassItem(object):

    pass


class KdNode(object):

    def __init__(self, split, left_child, right_child, data, parent_node):
        self.split = split        # index of the splitting dimension
        self.left = left_child    # left subtree
        self.right = right_child  # right subtree
        self.data = data          # the data point stored at this node
        self.parent = parent_node


class KdTree(object):

    def create_kd_tree(self, datas, k, feature_len, depth):
        if datas is None or len(datas) == 0:
            return None
        split_index = self.get_split_index(datas, k, feature_len, depth)
        datas = sorted(datas, key=lambda x: x[split_index])
        split_data_index = len(datas) / 2
        data = datas[split_data_index]
        # store the median point here and recurse on the two halves
        node = KdNode(split_index, None, None, data, None)
        node.left = self.create_kd_tree(datas[:split_data_index], k, feature_len, depth + 1)
        node.right = self.create_kd_tree(datas[split_data_index + 1:], k, feature_len, depth + 1)
        if node.left is not None:
            node.left.parent = node
        if node.right is not None:
            node.right.parent = node
        return node

    def get_split_index(self, datas, k, feature_len, depth):
        data_sum = [0] * feature_len
        # pick the column with the largest variance: the more spread out the
        # values, the more discriminative the split
        for data in datas:
            for i in range(feature_len):
                data_sum[i] += data[i]
        data_avg = [data_sum[i] / float(len(datas)) for i in range(feature_len)]
        data_chi = [0] * feature_len
        for data in datas:
            for i in range(len(data)):
                data_chi[i] += (data[i] - data_avg[i]) ** 2
        return sorted([(data_chi[i], i) for i in range(feature_len)], key=lambda x: x[0], reverse=True)[0][1]


class Knn(object):

    # expects a distance(data1, data2) method from a subclass or mixin,
    # like the Kmeans variants in testKmeans.py

    def __init__(self, train_data, labels, top_n):
        self.train_data = train_data
        self.labels = labels
        self.top_n = top_n

    def classify(self, data):
        # sort neighbours by distance, then majority-vote over the top_n labels
        label_orders = sorted([
            (
                self.distance(data, self.train_data[i]),
                self.labels[i]
            )
            for i in range(len(self.train_data))
        ], key=lambda x: x[0])
        return Counter(label for distance, label in label_orders[:self.top_n]).most_common(1)[0][0]


if __name__ == '__main__':
    kd = KdTree()
    datas = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]
    print kd.get_split_index(datas, 3, 2, 1)
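Knn defines no distance of its own, so a concrete subclass is needed to run it. A minimal sketch; the EKnn class and the toy data are illustrative, not part of the repo:

from math import sqrt

class EKnn(Knn):

    def distance(self, a, b):
        return sqrt(sum((a[i] - b[i]) ** 2 for i in range(len(a))))

train = [(1.0, 1.0), (1.2, 0.8), (8.0, 9.0), (9.0, 8.5)]
labels = ['small', 'small', 'big', 'big']
knn = EKnn(train, labels, top_n=3)
print knn.classify((8.5, 9.2))    # -> 'big'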
--------------------------------------------------------------------------------
/test/testLinerModel.py:
--------------------------------------------------------------------------------
#coding=utf-8


class LinerModel(object):

    def train(self, datas, labels, item_len, learn_rate):
        self.weights = [1.] * item_len
        self.offset = 1.
        for i in range(len(labels)):
            l = self.predict(datas[i])
            self.update_weight(l, labels[i], datas[i], learn_rate)

    def update_weight(self, l, target, data, learn_rate):
        # one stochastic gradient step on the squared error (l - target) ** 2 / 2:
        # w_i <- w_i - rate * (pred - target) * x_i
        for i in range(len(self.weights)):
            self.weights[i] = self.weights[i] - learn_rate * (l - target) * data[i]
        self.offset = self.offset - learn_rate * (l - target)

    def predict(self, data):
        if data and len(data) == len(self.weights):
            return sum([data[i] * self.weights[i] for i in range(len(self.weights))]) + self.offset


if __name__ == '__main__':
    l = LinerModel()
    from random import random
    datas = [[random() * 10] for i in range(10000)]
    labels = [1 if data[0] >= 5 else 0 for data in datas]
    l.train(datas, labels, 1, 0.01)
    print l.weights
    print l.offset
    print l.predict([6.])
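One hand-checkable step of that update rule, with made-up numbers:

m = LinerModel()
m.weights, m.offset = [1.0], 1.0
# prediction for x = [2.0] is 1*2 + 1 = 3.0; with target 0 the error is 3.0
print m.predict([2.0])             # 3.0
m.update_weight(3.0, 0, [2.0], 0.1)
print m.weights, m.offset          # [0.4] 0.7  (1 - 0.1*3*2 and 1 - 0.1*3)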
--------------------------------------------------------------------------------
/test/testLogistic.py:
--------------------------------------------------------------------------------
# coding=utf-8

from math import exp


class Logistic(object):

    def train(self, datas, labels, alpha=0.001):
        self.params = [1. for _ in range(len(datas[0]))]
        self.labels = set(labels)
        for i in range(len(datas)):
            h = self.sigmoid(self.score(datas[i]))
            # gradient ascent on the log-likelihood: the residual (y - h)
            # drives the update of every weight
            error = (labels[i] - h)
            for j in range(len(self.params)):
                self.params[j] += (alpha * datas[i][j] * error)

    def score(self, data):
        return sum([data[i] * self.params[i] for i in range(len(self.params))])

    def classify(self, data):
        # squash the raw score and return the nearest known label
        p = self.sigmoid(self.score(data))
        return min([(abs(label - p), label) for label in self.labels])[1]

    def sigmoid(self, x):
        return 1. / (1 + exp(-x))


if __name__ == '__main__':
    data = []
    labels = []
    from random import randint
    for _ in range(10000):
        x = randint(1, 10)
        y = randint(1, 10)
        data.append((x, y))
        if x <= y:
            labels.append(1)
        else:
            labels.append(0)
    b = Logistic()
    b.train(data, labels)
    print b.classify([2, 5])
--------------------------------------------------------------------------------
/test/testMiniBatchKMeans.py:
--------------------------------------------------------------------------------
# coding=utf-8

import random
from random import randint
from copy import copy
from testKmeans import Kmeans
from testDDistance import DefaultDistance


class MiniBatchKmeans(Kmeans):

    def cluster(self, datas, k, iter_count=10000, diff=0.00001):
        """
        k is the number of centers; diff is accepted for interface parity
        with Kmeans but is not used here
        """
        if k > len(datas):
            return datas
        centers = self.rand_seed(datas, k)
        center_range = range(k)
        sample_rate = 0.3
        sample_data_count = int(len(datas) * sample_rate)
        sample_data_range = range(sample_data_count)
        for _ in range(iter_count):
            sample_data = random.sample(datas, sample_data_count)
            distance_vector = [-1] * sample_data_count
            center_counts = [0] * k
            # assign every sampled point to its nearest center
            for i in sample_data_range:
                mini_distance, bestlabel = min(
                    [
                        (
                            self.distance(
                                sample_data[i],
                                centers[j]
                            ), j
                        )
                        for j in center_range
                    ]
                )
                distance_vector[i] = bestlabel
            # move each winning center towards its point with a per-center
            # learning rate eta = 1 / (points seen so far)
            for i in sample_data_range:
                data_label = distance_vector[i]
                center_counts[data_label] += 1
                eta = 1.0 / center_counts[data_label]
                centers[data_label] = self.add(
                    centers[data_label],
                    sample_data[i],
                    eta,
                    len(sample_data[i])
                )
        return centers

    def rand_seed(self, datas, k):
        return [copy(data) for data in random.sample(datas, k)]

    def add(self, center, data, eta, data_len):
        _center = [i * (1.0 - eta) for i in center]
        _data = [eta * i for i in data]
        return [_center[i] + _data[i] for i in range(data_len)]


class DMiniBatchKmeans(MiniBatchKmeans, DefaultDistance):
    pass


if __name__ == '__main__':
    k = DMiniBatchKmeans()
    datas = [[randint(1, 20), randint(1, 20), randint(
        1, 20), randint(1, 20), randint(1, 20)] for _ in range(100)]
    labels = k.cluster(datas, 5, 200, diff=0.00001)
    print labels
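The eta = 1/count schedule makes each center a running mean of the points assigned to it. A hand-checkable 1-d toy of the add() step:

m = MiniBatchKmeans()
center, count = [0.0], 0
for x in ([2.0], [4.0]):
    count += 1
    center = m.add(center, x, 1.0 / count, 1)
print center    # [3.0] -- the mean of 2.0 and 4.0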
--------------------------------------------------------------------------------
/test/testNgram.py:
--------------------------------------------------------------------------------
#coding=utf-8


def ngram(content, splitor=" ", n=2):
    """Word-level ngrams for text processing.
    test:
    >>> ngram("a b c d")
    ['a b', 'b c', 'c d']
    >>> ngram("a b c d", n=1)
    ['a', 'b', 'c', 'd']
    >>> ngram("a b c d", splitor=",")
    ['a b c d']
    """
    if content is None:
        return []
    words = content.split(splitor)
    if len(words) <= 1:
        return [content]
    result = []
    for i in range(0, len(words) - n + 1):
        result.append(splitor.join([words[i + j] for j in range(n)]))
    return result
--------------------------------------------------------------------------------
/test/testOneHotCode.py:
--------------------------------------------------------------------------------
#coding=utf-8

from b2 import math2
import math


class OneHotCode(object):

    """compact binary codes for categorical features
    (despite the name, this is a binary code, not a true one-hot vector)
    Test:
    >>> ohc = OneHotCode()
    >>> ohc.train(0)
    >>> ohc.train(1)
    >>> ohc.train(2)
    >>> ohc.train(3)
    >>> ohc.train(4)
    >>> ohc.train(7)
    >>> ohc.predict(0)
    >>> ohc.predict(3)
    >>> ohc.predict(7)
    """

    def __init__(self):
        self._feature_map = {}

    def train(self, data):
        # the first sighting of a value assigns it the next integer id
        if data in self._feature_map:
            return
        self._feature_map[data] = len(self._feature_map)

    def predict(self, data):
        # code width: ceil(log2(N)) bits are enough for N distinct ids
        # (the original used sqrt here, which under-allocates for large N)
        array_len = int(math.ceil(math.log(max(len(self._feature_map), 2), 2)))
        index = self._feature_map[data]
        features = math2.bitfield(index)
        features[:0] = [0 for _ in range(array_len - len(features))]
        return features
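For readers without the b2 dependency, a dependency-free sketch of the same idea. The local bitfield below is a stand-in for b2.math2.bitfield, whose exact output format is an assumption here (a big-endian list of bits):

def bitfield(n):
    # stand-in for b2.math2.bitfield: big-endian bit list (assumption only)
    return [int(c) for c in bin(n)[2:]]

ids = {'red': 0, 'green': 1, 'blue': 2}
width = 2                          # ceil(log2(3)) bits
for name, idx in sorted(ids.items(), key=lambda kv: kv[1]):
    bits = bitfield(idx)
    bits[:0] = [0] * (width - len(bits))
    print name, bits               # red [0, 0] / green [0, 1] / blue [1, 0]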
--------------------------------------------------------------------------------
/test/testPageRank.py:
--------------------------------------------------------------------------------
#coding=utf-8


from testDataSet import DataSet
from collections import Counter
from collections import defaultdict
import copy
import sys


class Graph(object):

    def __init__(self, point_len, dense=True):
        self.weights = [1.] * point_len
        self.data = DataSet(point_len, dense)
        for i in range(point_len):
            self.data.append()
        self._keys = xrange(point_len)
        self._len = point_len
        self.outs_counter = Counter()
        self.point_ins = defaultdict(set)

    def __len__(self):
        return self._len

    def add_edge(self, point_a, point_b):
        """add a directed edge from point_a to point_b
        param: point_a source point (a -> b)
        param: point_b target point
        return: None
        """
        self.data[point_a][point_b] = 1
        self.outs_counter[point_a] += 1
        self.point_ins[point_b].add(point_a)

    def keys(self):
        return self._keys

    def ins(self, point):
        return self.point_ins[point]

    def outs(self, point):
        # test the type only -- `if point` would wrongly reject point 0
        if isinstance(point, (int, long)):
            if point >= 0 and point < self._len:
                for index in self.data[point].keys():
                    if self.data[point][index] > 0:
                        yield index

    def outs_count(self, point):
        return self.outs_counter[point]

    def update(self, weights):
        if weights:
            error = 0.
            for i in self.keys():
                error = max(error, abs(weights[i] - self.weights[i]))
                self.weights[i] = weights[i]
            return error


class GraphV2(object):

    """Graph without the adjacency matrix: PageRank only needs the in-edge
    sets and the out-degree counts, so this version stores just those."""

    def __init__(self, point_len):
        self.weights = [1.] * point_len
        self._keys = xrange(point_len)
        self._len = point_len
        self.outs_counter = Counter()
        self.point_ins = defaultdict(set)

    def __len__(self):
        return self._len

    def add_edge(self, point_a, point_b):
        self.outs_counter[point_a] += 1
        self.point_ins[point_b].add(point_a)

    def keys(self):
        return self._keys

    def outs_count(self, point):
        return self.outs_counter[point]

    def update(self, weights):
        if weights:
            error = 0.
            for i in self.keys():
                error = max(error, abs(weights[i] - self.weights[i]))
                self.weights[i] = weights[i]
            return error

    def ins(self, point):
        return self.point_ins[point]


class PageRank(object):

    def rank(self, graph, iter_count=1000, d=0.85, min_error=0.01):
        # PR(i) = (1 - d) + d * sum(PR(j) / outs(j) for j in ins(i));
        # note the sweep reuses weights already updated in this iteration
        # (Gauss-Seidel style), which still converges
        for _iter in xrange(iter_count):
            weights = copy.copy(graph.weights)
            for i in graph.keys():
                weights[i] = (1 - d) + d * sum([weights[point_in] / graph.outs_count(point_in) for point_in in graph.ins(i)])
            error = graph.update(weights)
            sys.stderr.write("iter : %s error:%s\n" % (_iter, error))
            if error < min_error:
                break
        return copy.copy(graph.weights)

    def calc_error(self, weights, graph):
        return max(abs(weights[i] - graph.weights[i]) for i in graph.keys())


if __name__ == "__main__":
    graph = GraphV2(10)
    graph.add_edge(1, 9)
    graph.add_edge(3, 4)
    graph.add_edge(6, 8)
    graph.add_edge(7, 8)
    graph.add_edge(0, 1)
    pagerank = PageRank()
    print pagerank.rank(graph)
--------------------------------------------------------------------------------
/test/testRandomForst.py:
--------------------------------------------------------------------------------
#coding=utf-8


class RF(object):

    # placeholder: the random forest classifier is not implemented yet
    pass
--------------------------------------------------------------------------------
/test/testRegressionTree.py:
--------------------------------------------------------------------------------
#coding=utf-8


class TreeNode(object):

    def __init__(self, value=None):
        self.value = value
        self.left_leaf = None
        self.right_leaf = None
        self.split_value = None


class RegressionTree(object):

    def __init__(self):
        self.tree = TreeNode()

    def train(self, datasets, targets):
        # only the first step -- collecting candidate split values -- exists;
        # growing the tree is left unfinished in this file
        feature_split_values = self.init_split_value(datasets, targets)
        return feature_split_values

    def init_split_value(self, datasets, targets):
        """Candidate split values per attribute: the midpoints between
        consecutive distinct values observed in that column."""
        split_values = []
        for attr in xrange(len(datasets[0])):
            values = sorted(set(data[attr] for data in datasets
                                if data[attr] is not None))
            split_values.append([(values[i] + values[i + 1]) / 2.
                                 for i in xrange(len(values) - 1)])
        return split_values

    def loss(self, datasets, labels, attr, split_value):
        """Loss function of the regression tree.
        param:datasets:class    training data set
        param:labels:list       target values of the training data
        param:attr:int          index of the attribute being split
        param:split_value:float value the attribute is split on
        return:error            the squared-error loss, or None if the split leaves one side empty
        raise:None
        """
        c1, c2 = self.get_target_avg(datasets, labels, attr, split_value)
        if c1 is None or c2 is None:
            return None
        error = None
        for i in xrange(len(datasets)):
            if datasets[i][attr] is None:
                continue
            if error is None:
                error = 0.
            if datasets[i][attr] > split_value:
                error += (labels[i] - c1) ** 2
            else:
                error += (labels[i] - c2) ** 2
        return error

    def get_target_avg(self, datasets, targets, attr, split_value):
        """Average target value on each side of a split point.
        param:datasets:class    training data set
        param:targets:list      target values of the training data
        param:attr:int          index of the attribute being split
        param:split_value:float value the attribute is split on
        return: c1, the mean target of rows with attr > split_value, and c2,
                the mean target of rows with attr <= split_value;
                (None, None) if either side is empty
        raise:None
        """
        c1, c2 = 0., 0.
        c1_count, c2_count = 0, 0
        for i in xrange(len(datasets)):
            if datasets[i][attr] is None:
                continue
            if datasets[i][attr] > split_value:
                c1 += targets[i]
                c1_count += 1
            else:
                c2 += targets[i]
                c2_count += 1
        if c1_count == 0 or c2_count == 0:
            return None, None
        return c1 / c1_count, c2 / c2_count
--------------------------------------------------------------------------------
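To make loss() and get_target_avg() concrete, a tiny hand-checkable run (data values purely illustrative):

tree = RegressionTree()
datasets = [[1.0], [2.0], [8.0], [9.0]]
targets = [1.0, 1.0, 5.0, 5.0]
print tree.get_target_avg(datasets, targets, 0, 5.0)   # (5.0, 1.0)
print tree.loss(datasets, targets, 0, 5.0)             # 0.0 -- a perfect split
print tree.loss(datasets, targets, 0, 1.5)             # ~10.67 -- a worse split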