├── 1.perceptron ├── README.md ├── model.py └── train.py ├── 2.KNN ├── README.md └── model.py ├── 3.naive_bayes ├── README.md └── model.py ├── 4.DecisionTree ├── README.md ├── model.py └── treePlotter.py ├── 5. Adaboost ├── README.md ├── adaboost.py ├── logicalRegression.py ├── result.png ├── test.txt ├── train.txt └── utils.py ├── Optimization_method ├── 1.Gradient_decent.ipynb ├── 2. 牛顿法.ipynb └── 3.拟牛顿法.ipynb ├── README.md └── RNN ├── LSTM.ipynb └── RNN.ipynb /1.perceptron/README.md: -------------------------------------------------------------------------------- 1 | # 感知机 2 | 3 | ## 使用方式 4 | 5 | X = np.asarray([[1, 1], [3, 3], [4, 3]]) 6 | y = np.asarray([[-1], [1], [1]]) 7 | model = Perceptron() 8 | model.fit( X, y) 9 | 10 | ## 模型定义 11 | 12 | y = sign(w * x + b) 13 | 14 | ## 损失函数 15 | 利用分错的点到感知机的距离来当作损失函数 16 | 17 | ## 优化方式 18 | 19 | w += 学习率 * y * x 20 | b += 学习率 * y 21 | 22 | 以此方式来来回迭代,完成w和b的修改 -------------------------------------------------------------------------------- /1.perceptron/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2018/3/17 下午8:07 4 | # @Author : zhanzecheng 5 | # @File : model.py 6 | # @Software: PyCharm 7 | """ 8 | import numpy as np 9 | import random 10 | 11 | class Perceptron: 12 | ''' 13 | 感知机的定义为 f(x) = sign(w * x + b) 14 | 其中 w是n维向量(n为训练数据的维度) 15 | b是一维 16 | ''' 17 | def __init__(self): 18 | 19 | self._W = [] 20 | self._b = 0 21 | self._epochs = 100 22 | self._learningRate = 1 23 | 24 | 25 | def _forword(self, X): 26 | ''' 27 | 感知机的前向传播 28 | :param X: 训练数据 29 | :return: 感知机的分类结果 30 | ''' 31 | logit = np.zeros((len(X), 1)) 32 | 33 | for count, x in enumerate(X): 34 | tmp = 1 if np.matmul(self._W , x) + self._b > 0 else -1 35 | logit[count] = tmp 36 | return logit 37 | 38 | def _loss(self, logit, label): 39 | ''' 40 | 感知机分错的个数和索引 41 | :param logit: 感知机预测的结果 42 | :param label: 训练集真实的结果 43 | :return: 感知机分错的个数和索引 44 | ''' 45 | count = 0 46 | wrong = [] 47 | for index, (a, b )in enumerate(zip(logit, label)): 48 | if a != b: 49 | count += 1 50 | wrong.append(index) 51 | 52 | self._wrong = count 53 | return wrong 54 | 55 | def _backford(self, X_wrong, y): 56 | ''' 57 | 更新感知机的参数 58 | :param X_wrong: 随机选取的分错的点 59 | :param y: 随机选取的分错的真实坐标 60 | :return: 61 | ''' 62 | self._W = self._W + y * X_wrong * self._learningRate 63 | self._b = self._b + y * self._learningRate 64 | 65 | def fit(self, X_train, y_train): 66 | ''' 67 | 训练感知机 68 | :param X_train: 训练集 69 | :param y_train: 标签 70 | :return: 71 | ''' 72 | self._W = np.zeros(shape=(1, X_train.shape[1])) 73 | for epoch in range(self._epochs): 74 | # 1. First we should do the forward 75 | logit = self._forword(X_train) 76 | 77 | # 2. Then we should use stochastic gradient descent to optimize our W & b 78 | wrong = self._loss(logit=logit, label=y_train) 79 | 80 | if len(wrong) == 0: 81 | print('we succeed the training') 82 | break 83 | 84 | X_wrong_index = random.choice(wrong) 85 | self._backford(X_train[X_wrong_index], y_train[X_wrong_index]) 86 | 87 | print('Now we have ', str(self._wrong), ' wrong points') 88 | print('Now we finish our training') 89 | 90 | def predict(self, X_test): 91 | ''' 92 | 利用感知机来预测 93 | :param X_test: 测试集 94 | :return: 95 | ''' 96 | return self._forword(X_test) 97 | 98 | def test(): 99 | X = np.asarray([[1, 1], [3, 3], [4, 3]]) 100 | y = np.asarray([[-1], [1], [1]]) 101 | model = Perceptron() 102 | model.fit( X, y) 103 | 104 | -------------------------------------------------------------------------------- /1.perceptron/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2018/3/17 下午9:09 4 | # @Author : zhanzecheng 5 | # @File : train.py 6 | # @Software: PyCharm 7 | """ 8 | 9 | import numpy as np 10 | 11 | from model import Perceptron 12 | 13 | def main(): 14 | X = np.asarray([[1, 1], [3, 3], [4, 3]]) 15 | y = np.asarray([[-1], [1], [1]]) 16 | model = Perceptron() 17 | model.fit( X, y) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /2.KNN/README.md: -------------------------------------------------------------------------------- 1 | # KD树实现的K邻近算法 2 | 3 | ## 使用方式 4 | 5 | data = [[1, 2], [2, 8], [3, 4], [4, 7], [2, 6], [6, 22], [7, 8]] 6 | kdtree = KDTree(data) 7 | distance, point = kdtree.findNN([1,1], 3) 8 | 9 | ## 模型定义 10 | 11 | 利用kd树来划分特征空间 12 | 13 | ## 损失函数 14 | 不含显性的学习过程 15 | 16 | -------------------------------------------------------------------------------- /2.KNN/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2018/5/21 上午9:50 4 | # @Author : zhanzecheng 5 | # @File : model.py 6 | # @Software: PyCharm 7 | """ 8 | import numpy as np 9 | 10 | class KD_node(): 11 | #定义的kd树节点 12 | def __init__(self, point=None, split=None, LL=None, RR=None): 13 | #节点值 14 | self.point = point 15 | #节点分割维度 16 | self.split = split 17 | #节点左孩子 18 | self.left = LL 19 | #节点右孩子 20 | self.right = RR 21 | 22 | 23 | class KDTree: 24 | def __init__(self, data=None): 25 | ''' 26 | 建树 27 | :param data: 28 | ''' 29 | self.root = None 30 | self.root = self._createNode(self.root, split=self._maxVar(data), data=data) 31 | 32 | pass 33 | 34 | def _createNode(self, root, split=None, data=None): 35 | ''' 36 | 创建kd树 37 | :param split: 38 | :param data: 39 | :return: 40 | ''' 41 | if len(data) == 0: 42 | return None 43 | # 在以split划分的维度上找到中位数 44 | data = list(data) 45 | data.sort(key= lambda x : x[split]) 46 | data = np.array(data) 47 | # 下面用来求中位数 48 | median = len(data) // 2 49 | # 下面递归建立左右子树 50 | root = KD_node(data[median], split) 51 | 52 | root.left = self._createNode(root.left, 53 | split=self._maxVar(data[:median]), 54 | data=data[:median]) 55 | root.right = self._createNode(root.left, 56 | split=self._maxVar(data[median+1:]), 57 | data=data[median+1:]) 58 | return root 59 | 60 | 61 | def _maxVar(self, data=None): 62 | ''' 63 | 用来求数据方差最大的维度 64 | :param data: 65 | :return: 66 | ''' 67 | if len(data) == 0: 68 | return 0 69 | # 按列求均值 70 | data_mean = np.mean(data, axis=0) 71 | # numpy 按列减是直接减的 72 | mean_diff = data - data_mean 73 | # 求得方差 74 | data_var = np.sum(mean_diff ** 2, axis=0) / len(data) 75 | # 求得方差最大位置, 为所要划分的维度 76 | re = np.where(data_var == np.max(data_var)) 77 | return re[0][0] 78 | 79 | 80 | # 下面是kdtree的查找 81 | def _computeDist(self, pt1, pt2): 82 | ''' 83 | 计算两个实例点的特征距离 84 | :param pt1: first 85 | :param pt2: second 86 | :return: float 87 | ''' 88 | pt1 = np.array(pt1) 89 | pt2 = np.array(pt2) 90 | return np.sqrt(np.sum(np.square((pt1 - pt2)))) 91 | 92 | def findNN(self, query, k): 93 | ''' 94 | 查看目标点的最近k个点 95 | :param query: 需要查询的点 96 | :param k: 需要多少个临近点 97 | :return: 98 | ''' 99 | node_K = [] 100 | nodeList = [] 101 | result = [] 102 | temp_root = self.root 103 | # 为了方便,在找到叶子节点同时,把所走过的父节点的距离都保存下来,下一次回溯访问就只需要访问子节点,不需要再访问一遍父节点。 104 | # 下面是为了找到目标点在KD树的划分 105 | while temp_root: 106 | nodeList.append(temp_root) 107 | dd = self._computeDist(query, temp_root.point) 108 | if len(node_K) < k: 109 | node_K.append(dd) 110 | result.append(temp_root.point) 111 | else: 112 | # 选出队列里面最大的元素 113 | max_dist = max(node_K) 114 | if dd < max_dist: 115 | # 类似于优先队列 把该元素pop出来 116 | # TODO: 换成优先队列来实现 117 | index = node_K.index(max_dist) 118 | del node_K[index], result[index] 119 | node_K.append(dd) 120 | result.append(temp_root.point) 121 | ss = temp_root.split 122 | # 找到最靠近的叶子节点 123 | if query[ss] <= temp_root.point[ss]: 124 | temp_root = temp_root.left 125 | else: 126 | temp_root = temp_root.right 127 | 128 | # 回溯访问父节点 129 | while nodeList: 130 | back_point = nodeList.pop() 131 | ss = back_point.split 132 | print('父亲节点 : ', back_point.point, '维度 :', back_point.split) 133 | max_dist = max(node_K) 134 | # 若满足进入该父节点的另外一个子节点的条件 135 | if len(node_K) < k or abs(query[ss] - back_point.point[ss]) < max_dist: 136 | # 进入另外一个子节点 137 | if query[ss] <= back_point.point[ss]: 138 | temp_root = back_point.right 139 | else: 140 | temp_root = back_point.left 141 | # 若不是叶子节点 142 | if temp_root: 143 | nodeList.append(temp_root) 144 | curDist = self._computeDist(temp_root.point, query) 145 | 146 | if max_dist > curDist and len(node_K) == k: 147 | index = node_K.index(max_dist) 148 | del node_K[index], result[index] 149 | node_K.append(curDist) 150 | result.append(temp_root.point) 151 | elif len(node_K) < k: 152 | node_K.append(curDist) 153 | result.append(temp_root.point) 154 | 155 | return node_K, result 156 | 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | data = [[1, 2], [2, 8], [3, 4], [4, 7], [2, 6], [6, 22], [7, 8]] 162 | 163 | kdtree = KDTree(data) 164 | distance, point = kdtree.findNN([1,1], 3) 165 | print('----> distance: ', distance) 166 | print('----> point: ', point) 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /3.naive_bayes/README.md: -------------------------------------------------------------------------------- 1 | # 朴素贝叶斯实现 2 | 3 | ## 使用方式 4 | 5 | cls = NBayes() 6 | x = [[1, 2], [2, 3], [2, 1]] 7 | y = [0, 1, 1] 8 | cls.train(x, y) 9 | print(cls.predict([2, 1])) 10 | 11 | ## 模型定义 12 | 13 | 利用贝叶斯定理于特征条件独立假设 14 | 15 | ## 损失函数 16 | 0-1损失函数 17 | 18 | -------------------------------------------------------------------------------- /3.naive_bayes/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2018/5/22 上午10:00 4 | # @Author : zhanzecheng 5 | # @File : model.py 6 | # @Software: PyCharm 7 | """ 8 | import numpy as np 9 | 10 | class NBayes: 11 | def __init__(self): 12 | pass 13 | 14 | def _getProbality(self, trainData, trainLabel): 15 | """ 16 | 目前实现的是二分类的朴素贝叶斯 17 | :param trainData: 18 | :param trainLabel: 19 | :return: 20 | """ 21 | if type(trainData) != np.ndarray: 22 | print('---> convert train type to array') 23 | dataLen = len(trainData) 24 | # 得到总共有多少特征 25 | # 计算类别1发生的概率 26 | pAbusive = sum(trainLabel) / dataLen 27 | p0 = [] 28 | p1 = [] 29 | 30 | featureNum = len(trainData[0]) 31 | for feature in range(featureNum): 32 | numsWord = len(trainData[0][feature]) 33 | # 1初始化,防止概率为0的情况. 这里使用的是拉普拉斯平滑方式 34 | p0Num = np.ones(numsWord) 35 | p1Num = np.ones(numsWord) 36 | # 以拉普拉斯平滑方式来初始化分母 37 | p0Denom = 2.0 38 | p1Denom = 2.0 39 | for i in range(dataLen): 40 | if trainLabel[i] == 1: 41 | # 利用了numpy的矩阵相加便捷性 42 | p1Num += trainData[i][feature] 43 | p1Denom += sum(trainData[i][feature]) 44 | else: 45 | p0Num += trainData[i][feature] 46 | p0Denom += sum(trainData[i][feature]) 47 | # 这里利用log的性质进行变换,可以把原来相乘的表达式变成相加 48 | p1Vect = np.log(p1Num / p1Denom) 49 | p0Vect = np.log(p0Num / p0Denom) 50 | p0.append(p0Vect) 51 | p1.append(p1Vect) 52 | return p0, p1, pAbusive 53 | 54 | 55 | def _to_categorical(self, y, num_classes=None): 56 | """ 57 | Converts a class vector (integers) to binary class matrix. 58 | """ 59 | y = np.array(y, dtype='int') 60 | input_shape = y.shape 61 | if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: 62 | input_shape = tuple(input_shape[:-1]) 63 | y = y.ravel() 64 | if not num_classes: 65 | num_classes = np.max(y) + 1 66 | n = y.shape[0] 67 | categorical = np.zeros((n, num_classes), dtype=np.float32) 68 | categorical[np.arange(n), y] = 1 69 | output_shape = input_shape + (num_classes,) 70 | categorical = np.reshape(categorical, output_shape) 71 | return categorical 72 | 73 | def train(self, x, y=None): 74 | x = self._to_categorical(x) 75 | self.x = x 76 | self.p0Vec, self.p1Vec, self.pA = self._getProbality(x, y) 77 | 78 | def predict(self, test): 79 | """ 80 | 这里仅实现了对于batch_size为1的predict 81 | :param test: 82 | :return: 83 | """ 84 | x = self.x[0] 85 | m, n = x.shape 86 | predict = np.zeros((m, n)) 87 | for row, d in enumerate(test): 88 | # TODO: 这里对于没有见过的特征值,仅是简单的赋值为固定值 89 | if d > n: 90 | d = 0 91 | predict[row][int(d)] = 1 92 | 93 | p1 = np.log(self.pA) 94 | for vec2Classify, p1Vec in zip(predict, self.p1Vec): 95 | p1 += sum(vec2Classify * p1Vec) 96 | 97 | p0 = np.log(1.0 - self.pA) 98 | for vec2Classify, p0Vec in zip(predict, self.p0Vec): 99 | p0 += sum(vec2Classify * p0Vec) 100 | 101 | if p1 > p0: 102 | return 1 103 | else: 104 | return 0 105 | 106 | if __name__ == '__main__': 107 | cls = NBayes() 108 | x = [[1, 2], [2, 3], [2, 1]] 109 | y = [0, 1, 1] 110 | cls.train(x, y) 111 | print(cls.predict([2, 1])) -------------------------------------------------------------------------------- /4.DecisionTree/README.md: -------------------------------------------------------------------------------- 1 | # C4.5方式生成的决策树 2 | 3 | ## 使用方式 4 | 5 | data = [ 6 | [1, 2, 3], 7 | [1, 2, 4], 8 | [1, 2, 5], 9 | [0, 2, 1], 10 | [0, 11, 2] 11 | ] 12 | label = [0, 0, 0, 1, 1] 13 | cls = DecisionTree() 14 | # 训练决策树 15 | cls.train(data, label) 16 | 17 | data = [1, 1, 1] 18 | # 测试决策树 19 | print(cls.predict(data)) 20 | 21 | ## 模型定义 22 | 23 | 利用C4.5算法来生成决策树 24 | 25 | ## 损失函数 26 | 不含剪枝过程 27 | 28 | ## TODO: 29 | 增加正则化: 30 | 信息增益最小限度 31 | 剪枝过程 32 | 33 | -------------------------------------------------------------------------------- /4.DecisionTree/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2018/5/23 上午10:13 4 | # @Author : zhanzecheng 5 | # @File : model.py 6 | # @Software: PyCharm 7 | """ 8 | import math 9 | 10 | class DecisionTree: 11 | def __init__(self): 12 | pass 13 | 14 | def _calcShannonEnt(self, dataSet): 15 | """ 16 | 该函数是用来计算 label 熵 H(x) = -sum(p * log(p)) 17 | :param dataSet: 18 | :return: 19 | """ 20 | dataLen = len(dataSet) 21 | labelCount = {} 22 | for featVec in dataSet: 23 | label = featVec[-1] 24 | if label not in labelCount.keys(): 25 | labelCount[label] = 1 26 | else: 27 | labelCount[label] += 1 28 | shannonEnt = 0.0 29 | # 这里用这个循环来做公式中的sum 30 | for key in labelCount: 31 | prob = labelCount[key] / dataLen 32 | shannonEnt -= prob * math.log(prob, 2) 33 | 34 | return shannonEnt 35 | 36 | def _chooseBestFeatureToSplit(self, dataSet): 37 | """ 38 | 利用[信息增益比]来选择最佳划分维度, 信息增益比相对于信息增益可以优化有较多种类的特征 39 | :param dataSet: 40 | :return: 41 | """ 42 | # 其中- 1是要减去标签 43 | numFeatures = len(dataSet[0]) - 1 44 | baseEntropy = self._calcShannonEnt(dataSet) 45 | bestInfoGainRation = 0.0 46 | bestFeature = -1 47 | for i in range(numFeatures): 48 | # 取出该种特征对应的值 49 | featList = [example[i] for example in dataSet] 50 | uniqueVals = set(featList) 51 | newEntropy = 0 52 | splitInfo = 0 53 | for value in uniqueVals: 54 | subDataSet = self._splitDataSet(dataSet, i, value) # 每个唯一值对应的剩余feature的组成子集 55 | prob = len(subDataSet) / float(len(dataSet)) 56 | newEntropy += prob * self._calcShannonEnt(subDataSet) 57 | splitInfo += -prob * math.log(prob, 2) 58 | infoGain = baseEntropy - newEntropy 59 | if (splitInfo == 0): # fix the overflow bug 60 | continue 61 | infoGainRatio = infoGain / splitInfo #这个feature的infoGainRatio 62 | if (infoGainRatio > bestInfoGainRation): #选择最大的gain ratio 63 | bestInfoGainRation = infoGainRatio 64 | bestFeature = i #选择最大的gain ratio对应的feature 65 | return bestFeature 66 | 67 | def _majorityCnt(self, classList): 68 | """ 69 | 以投票机制来选出max类别 70 | :param classList: 71 | :return: 72 | """ 73 | classCount = {} 74 | for vote in classList: 75 | if vote not in classCount.keys(): 76 | classCount[vote] = 0 77 | classCount[vote] += 1 78 | 79 | sortedClassCount = sorted(classCount.items(), key=lambda x: x[1], reverse=True) 80 | return sortedClassCount[0][0] 81 | 82 | def _splitDataSet(self, dataSet, axis, value): 83 | """ 84 | 输入:数据集,选择维度,选择值 85 | 输出:划分数据集 86 | 描述:按照给定特征划分数据集;去除选择维度中等于选择值的项 87 | :param dataSet: 88 | :param axos: 89 | :param value: 90 | :return: 91 | """ 92 | retDataSet = [] 93 | for featVec in dataSet: 94 | if featVec[axis] == value: # 只看当第i列的值=value时的item 95 | reduceFeatVec = featVec[:].copy() 96 | del reduceFeatVec[axis] 97 | retDataSet.append(reduceFeatVec) 98 | return retDataSet 99 | 100 | def _createTree(self, dataSet, labels): 101 | """ 102 | 以递归的方式来构造决策树 103 | 伪代码: 104 | 1)若数据集中所有实例属于同一类C,则T为单节点树,并将类C作为该节点的类标记,返回T 105 | 2)若特征(A)为空,则返回D中实例中出现最多的类别作为该节点的类别,返回T 106 | 3)否则,计算A中各特征对D的信息增益,选择信息增益最大的特征Ag 107 | 4)[可选]如果Ag小于阈值e,则置T为单节点树,并将D中实例数最大的类作为该节点的标记类,返回T 108 | 5)否则,对Ag的每一可能值ai,依Ag=ai将D分割为若干非空子集Di,对其递归的进行调用 109 | :param dataSet: 110 | :param labels: 111 | :return: 112 | """ 113 | 114 | # 得到数据集中各类别 115 | classList = [example[-1] for example in dataSet] 116 | # 如果所有实例都属于同一类C, 则停止划分 117 | if classList.count(classList[0]) == len(classList): 118 | return classList[0] 119 | # 如果特征为空,则返回出现次数最多的类 120 | if len(dataSet[0]) == 1: 121 | return self._majorityCnt(classList) 122 | 123 | # 否则,计算信息熵增益,并且选取最大的作为分类标准 124 | # TODO: implement the function 125 | bestFeat = self._chooseBestFeatureToSplit(dataSet) 126 | 127 | # 获得特征名 128 | bestFeatName = labels[bestFeat] 129 | # 用字典的方式来建立树 130 | myTree = {bestFeatName:{}} 131 | 132 | # 剔除该特征 133 | del labels[bestFeat] 134 | 135 | # 得到该特征所有的可能取值 136 | featValues = [example[bestFeat] for example in dataSet] 137 | uniqueVals = set(featValues) 138 | for value in uniqueVals: 139 | subLabels = labels[:] 140 | # 递归的建造树 141 | myTree[bestFeatName][value] = self._createTree(self._splitDataSet(dataSet, bestFeat, value), subLabels) 142 | 143 | return myTree 144 | 145 | def _classify(self, inputTree, featLabels, testVec): 146 | """ 147 | 递归的找出测试数据所属于的类别 148 | :param inputTree: 149 | :param featLabels: 150 | :param testVec: 151 | :return: 152 | """ 153 | firstStr = list(inputTree.keys())[0] 154 | secondDict = inputTree[firstStr] 155 | featIndex = featLabels.index(firstStr) 156 | # 递归的访问分类树 157 | for key in secondDict.keys(): 158 | if testVec[featIndex] == key: 159 | if type(secondDict[key]).__name__ == 'dict': 160 | # 如果secondDict[key]仍然是字典,则继续向下层走 161 | classLabel = self._classify(secondDict[key], featLabels, testVec) 162 | else: 163 | # 如果secondDict[key]已经只是分类标签了,则返回这个类别标签 164 | classLabel = secondDict[key] 165 | return classLabel 166 | 167 | 168 | 169 | def train(self,dataSet, label): 170 | """ 171 | 该函数用来训练一个决策树 172 | :param dataSet: 173 | :param label: 174 | :return: 175 | """ 176 | featureNum = len(dataSet[0]) 177 | featureList = [] 178 | # 得到特征名 179 | for i in range(featureNum): 180 | featureList.append(str(i)) 181 | # 联合label和train 182 | for count, d in enumerate(dataSet): 183 | d.extend([label[count]]) 184 | 185 | self.tree = self._createTree(dataSet, featureList) 186 | 187 | def predict(self, dataSet): 188 | """ 189 | 利用训练好的决策树来进行分类 190 | :param data: 191 | :return: 192 | """ 193 | featureNum = len(dataSet) 194 | featureList = [] 195 | # 得到特征名 196 | for i in range(featureNum): 197 | featureList.append(str(i)) 198 | 199 | return self._classify(self.tree, featLabels=featureList, testVec=dataSet) 200 | 201 | 202 | 203 | 204 | if __name__ == '__main__': 205 | data = [ 206 | [1, 2, 3], 207 | [1, 2, 4], 208 | [1, 2, 5], 209 | [0, 2, 1], 210 | [0, 11, 2] 211 | ] 212 | label = [0, 0, 0, 1, 1] 213 | cls = DecisionTree() 214 | # 训练决策树 215 | cls.train(data, label) 216 | print(cls.tree) 217 | data = [1, 1, 1] 218 | # 测试决策树 219 | print(cls.predict(data)) -------------------------------------------------------------------------------- /4.DecisionTree/treePlotter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2018/5/23 上午10:27 4 | # @Author : zhanzecheng 5 | # @File : treePlotter.py 6 | # @Software: PyCharm 7 | """ 8 | import matplotlib.pyplot as plt 9 | 10 | decisionNode = dict(boxstyle="sawtooth", fc="0.8") 11 | leafNode = dict(boxstyle="round4", fc="0.8") 12 | arrow_args = dict(arrowstyle="<-") 13 | 14 | def plotNode(nodeTxt, centerPt, parentPt, nodeType): 15 | createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction', \ 16 | xytext=centerPt, textcoords='axes fraction', \ 17 | va="center", ha="center", bbox=nodeType, arrowprops=arrow_args) 18 | 19 | def getNumLeafs(myTree): 20 | numLeafs = 0 21 | firstStr = list(myTree.keys())[0] 22 | secondDict = myTree[firstStr] 23 | for key in secondDict.keys(): 24 | if type(secondDict[key]).__name__ == 'dict': 25 | numLeafs += getNumLeafs(secondDict[key]) 26 | else: 27 | numLeafs += 1 28 | return numLeafs 29 | 30 | def getTreeDepth(myTree): 31 | maxDepth = 0 32 | firstStr = list(myTree.keys())[0] 33 | secondDict = myTree[firstStr] 34 | for key in secondDict.keys(): 35 | if type(secondDict[key]).__name__ == 'dict': 36 | thisDepth = getTreeDepth(secondDict[key]) + 1 37 | else: 38 | thisDepth = 1 39 | if thisDepth > maxDepth: 40 | maxDepth = thisDepth 41 | return maxDepth 42 | 43 | def plotMidText(cntrPt, parentPt, txtString): 44 | xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0] 45 | yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1] 46 | createPlot.ax1.text(xMid, yMid, txtString) 47 | 48 | def plotTree(myTree, parentPt, nodeTxt): 49 | numLeafs = getNumLeafs(myTree) 50 | depth = getTreeDepth(myTree) 51 | firstStr = list(myTree.keys())[0] 52 | cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalw, plotTree.yOff) 53 | plotMidText(cntrPt, parentPt, nodeTxt) 54 | plotNode(firstStr, cntrPt, parentPt, decisionNode) 55 | secondDict = myTree[firstStr] 56 | plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD 57 | for key in secondDict.keys(): 58 | if type(secondDict[key]).__name__ == 'dict': 59 | plotTree(secondDict[key], cntrPt, str(key)) 60 | else: 61 | plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalw 62 | plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode) 63 | plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key)) 64 | plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD 65 | 66 | def createPlot(inTree): 67 | fig = plt.figure(1, facecolor='white') 68 | fig.clf() 69 | axprops = dict(xticks=[], yticks=[]) 70 | createPlot.ax1 = plt.subplot(111, frameon=False, **axprops) 71 | plotTree.totalw = float(getNumLeafs(inTree)) 72 | plotTree.totalD = float(getTreeDepth(inTree)) 73 | plotTree.xOff = -0.5 / plotTree.totalw 74 | plotTree.yOff = 1.0 75 | plotTree(inTree, (0.5, 1.0), '') 76 | plt.show() -------------------------------------------------------------------------------- /5. Adaboost/README.md: -------------------------------------------------------------------------------- 1 | # Adaboost 2 | 使用 逻辑回归(Logical Regression) 作为Adaboost的基分类器 3 | 4 | ## 使用方式 5 | 6 | # define the base classify 7 | clf = LogisticRegression() 8 | 9 | # using the adaboost method 10 | er_i = adaboost_clf(y, X, y_test, X_test, i, clf) 11 | 12 | 13 | ## 任务说明 14 | 15 | 使用adaboost来做分类任务 16 | -------------------------------------------------------------------------------- /5. Adaboost/adaboost.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from logicalRegression import LogisticRegression 4 | import matplotlib.pyplot as plt 5 | 6 | """ HELPER FUNCTION: GET ERROR RATE =========================================""" 7 | 8 | 9 | def get_error_rate(pred, Y): 10 | return sum(pred != Y) / float(len(Y)) 11 | 12 | 13 | """ HELPER FUNCTION: PRINT ERROR RATE =======================================""" 14 | 15 | 16 | def print_error_rate(err): 17 | print('Error rate: Training: %.4f - Test: %.4f' % err) 18 | 19 | 20 | """ HELPER FUNCTION: GENERIC CLASSIFIER =====================================""" 21 | 22 | 23 | def generic_clf(Y_train, X_train, Y_test, X_test, clf): 24 | clf.fit(X_train, Y_train) 25 | pred_train = clf.predict(X_train) 26 | pred_test = clf.predict(X_test) 27 | return get_error_rate(pred_train, Y_train), \ 28 | get_error_rate(pred_test, Y_test) 29 | 30 | 31 | """ ADABOOST IMPLEMENTATION =================================================""" 32 | 33 | def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf): 34 | n_train, n_test = len(X_train), len(X_test) 35 | # Initialize weights 36 | w = np.ones(n_train) / n_train 37 | pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)] 38 | 39 | for i in range(M): 40 | # Fit a classifier with the specific weights 41 | weight = np.expand_dims(w, axis=1) 42 | 43 | clf.fit(X_train, Y_train, weight) 44 | pred_train_i = clf.predict(X_train) 45 | pred_test_i = clf.predict(X_test) 46 | # Indicator function 47 | miss = [int(x) for x in (pred_train_i != Y_train)] 48 | # Error 49 | err_m = np.dot(w, miss) / sum(w) 50 | # Alpha 51 | alpha_m = 0.5 * np.log((1 - err_m) / float(err_m+1e-15)) 52 | # New weights 53 | # print(miss[0], Y_train[0]) 54 | # quit() 55 | w = np.multiply(w, np.exp([float(pred_train_i[x]) * (-1) * Y_train[x] * alpha_m for x in range(len(miss))])) 56 | # Add to prediction 57 | pred_train = [sum(x) for x in zip(pred_train, 58 | [x * alpha_m for x in pred_train_i])] 59 | pred_test = [sum(x) for x in zip(pred_test, 60 | [x * alpha_m for x in pred_test_i])] 61 | 62 | pred_train, pred_test = np.sign(pred_train), np.sign(pred_test) 63 | # Return error rate in train and test set 64 | return get_error_rate(pred_train, Y_train), \ 65 | get_error_rate(pred_test, Y_test) 66 | 67 | 68 | """ PLOT FUNCTION ===========================================================""" 69 | 70 | 71 | def plot_error_rate(er_train, er_test): 72 | df_error = pd.DataFrame([er_train, er_test]).T 73 | df_error.columns = ['Training', 'Test'] 74 | plot1 = df_error.plot(linewidth=3, figsize=(8, 6), 75 | color=['lightblue', 'darkblue'], grid=True) 76 | plot1.set_xlabel('Number of iterations', fontsize=12) 77 | plot1.set_xticklabels(range(0, 50, 5)) 78 | plot1.set_ylabel('Error rate', fontsize=12) 79 | plot1.set_title('Error rate vs number of ensemble', fontsize=16) 80 | plt.axhline(y=er_test[0], linewidth=1, color='red', ls='dashed') 81 | plt.show() 82 | plt.savefig('./result.png') 83 | 84 | def load_dataset(): 85 | y = [] 86 | X = [] 87 | with open('./train.txt', 'r') as f: 88 | lines = f.readlines() 89 | for line in lines: 90 | line = line.strip().split(',') 91 | X.append(np.array([eval(x) for x in line[:4]])) 92 | y.append(eval(line[4])) 93 | X = np.array(X) 94 | y = np.array(y) 95 | 96 | X_test = [] 97 | y_test = [] 98 | with open('./test.txt', 'r') as f: 99 | lines = f.readlines() 100 | for line in lines: 101 | line = line.strip().split(',') 102 | X_test.append(np.array([eval(x) for x in line[:4]])) 103 | y_test.append(eval(line[4])) 104 | X_test = np.array(X_test) 105 | y_test = np.array(y_test) 106 | return X, y, X_test, y_test 107 | 108 | 109 | if __name__ == "__main__": 110 | 111 | X, y, X_test, y_test = load_dataset() 112 | 113 | clf = LogisticRegression() 114 | 115 | # Test with different number of iterations 116 | er_train, er_test = [], [] 117 | x_range = [1, 10, 25, 50] 118 | for i in x_range: 119 | print('The ensemble size is %s' % (i)) 120 | er_i = adaboost_clf(y, X, y_test, X_test, i, clf) 121 | # quit() 122 | er_train.append(er_i[0]) 123 | er_test.append(er_i[1]) 124 | 125 | # plot the error rate of trainset and testset 126 | plot_error_rate(er_train, er_test) -------------------------------------------------------------------------------- /5. Adaboost/logicalRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def sigmoid(x): 5 | return 1 / (1 + np.exp(-x)) 6 | 7 | class LogisticRegression(): 8 | """ 9 | Parameters: 10 | ----------- 11 | n_iterations: int 12 | 梯度下降的轮数 13 | learning_rate: float 14 | 梯度下降学习率 15 | """ 16 | def __init__(self, learning_rate=.1, n_iterations=90): 17 | self.learning_rate = learning_rate 18 | self.n_iterations = n_iterations 19 | 20 | def initialize_weights(self, n_features): 21 | # 初始化参数 22 | # 参数范围[-1/sqrt(N), 1/sqrt(N)] 23 | limit = np.sqrt(1 / n_features) 24 | w = np.random.uniform(-limit, limit, (n_features, 1)) 25 | b = 0 26 | self.w = np.insert(w, 0, b, axis=0) 27 | 28 | def fit(self, X, y, weight): 29 | m_samples, n_features = X.shape 30 | self.initialize_weights(n_features) 31 | # 为X增加一列特征x1,x1 = 0 32 | X = np.insert(X, 0, 1, axis=1) 33 | y = np.reshape(y, (m_samples, 1)) 34 | # weight = np.ones(shape=(67, 1)) + 1 35 | # 梯度训练n_iterations轮 36 | for i in range(self.n_iterations): 37 | h_x = X.dot(self.w) 38 | y_pred = sigmoid(h_x) 39 | # print(y.shape) 40 | # print((y_pred - y) * weight) 41 | # quit() 42 | w_grad = X.T.dot((y_pred - y) * weight) 43 | self.w = self.w - self.learning_rate * w_grad 44 | 45 | def predict(self, X): 46 | X = np.insert(X, 0, 1, axis=1) 47 | h_x = X.dot(self.w) 48 | y_pred = np.round([x[0] for x in sigmoid(h_x)]) 49 | return y_pred.astype(int) 50 | 51 | 52 | -------------------------------------------------------------------------------- /5. Adaboost/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanzecheng/Implement_of_Statistical_learning/d708cac1d19e5d829918efb622f343bfa51020c3/5. Adaboost/result.png -------------------------------------------------------------------------------- /5. Adaboost/test.txt: -------------------------------------------------------------------------------- 1 | 3.6216,8.6661,-2.8073,-0.44699,0 2 | 4.5459,8.1674,-2.4586,-1.4621,0 3 | 3.866,-2.6383,1.9242,0.10645,0 4 | 3.4566,9.5228,-4.0112,-3.5944,0 5 | 0.32924,-4.4552,4.5718,-0.9888,0 6 | 4.3684,9.6718,-3.9606,-3.1625,0 7 | 3.5912,3.0129,0.72888,0.56421,0 8 | 2.0922,-6.81,8.4636,-0.60216,0 9 | 3.2032,5.7588,-0.75345,-0.61251,0 10 | 1.5356,9.1772,-2.2718,-0.73535,0 11 | 1.2247,8.7779,-2.2135,-0.80647,0 12 | 3.9899,-2.7066,2.3946,0.86291,0 13 | 1.8993,7.6625,0.15394,-3.1108,0 14 | -1.5768,10.843,2.5462,-2.9362,0 15 | 3.404,8.7261,-2.9915,-0.57242,0 16 | 4.6765,-3.3895,3.4896,1.4771,0 17 | 2.6719,3.0646,0.37158,0.58619,0 18 | 0.80355,2.8473,4.3439,0.6017,0 19 | 1.4479,-4.8794,8.3428,-2.1086,0 20 | 5.2423,11.0272,-4.353,-4.1013,0 21 | 1.4896,3.4288,-4.0309,-1.4259,1 22 | 0.11592,3.2219,-3.4302,-2.8457,1 23 | -3.3924,3.3564,-0.72004,-3.5233,1 24 | -6.1632,8.7096,-0.21621,-3.6345,1 25 | -4.0786,2.9239,0.87026,-0.65389,1 26 | -2.5899,-0.3911,0.93452,0.42972,1 27 | -1.0116,-0.19038,-0.90597,0.003003,1 28 | 0.066129,2.4914,-2.9401,-0.62156,1 29 | -0.24745,1.9368,-2.4697,-0.80518,1 30 | -1.5732,1.0636,-0.71232,-0.8388,1 31 | -2.1668,1.5933,0.045122,-1.678,1 32 | -1.1667,-1.4237,2.9241,0.66119,1 33 | -2.8391,-6.63,10.4849,-0.42113,1 34 | -4.5046,-5.8126,10.8867,-0.52846,1 35 | -2.41,3.7433,-0.40215,-1.2953,1 36 | 0.40614,1.3492,-1.4501,-0.55949,1 37 | -1.3887,-4.8773,6.4774,0.34179,1 38 | -3.7503,-13.4586,17.5932,-2.7771,1 39 | -3.5637,-8.3827,12.393,-1.2823,1 40 | -2.5419,-0.65804,2.6842,1.1952,1 41 | 5.7867,7.8902,-2.6196,-0.48708,0 42 | -0.24811,-0.17797,4.9068,0.15429,0 43 | 0.3292,-4.4552,4.5718,-0.9888,0 44 | 3.9362,10.1622,-3.8235,-4.0172,0 45 | 0.93584,8.8855,-1.6831,-1.6599,0 46 | 4.4338,9.887,-4.6795,-3.7483,0 47 | 0.7057,-5.4981,8.3368,-2.8715,0 48 | 1.1432,-3.7413,5.5777,-0.63578,0 49 | -0.38214,8.3909,2.1624,-3.7405,0 50 | 6.5633,9.8187,-4.4113,-3.2258,0 51 | -1.7479,-5.823,5.8699,1.212,1 52 | -0.95923,-6.7128,4.9857,0.32886,1 53 | 1.3451,0.23589,-1.8785,1.3258,1 54 | 2.2279,4.0951,-4.8037,-2.1112,1 55 | 1.2572,4.8731,-5.2861,-5.8741,1 56 | -5.3857,9.1214,-0.41929,-5.9181,1 57 | -2.9786,2.3445,0.52667,-0.40173,1 58 | -1.5851,-2.1562,1.7082,0.9017,1 59 | -0.21888,-2.2038,-0.0954,0.56421,1 60 | 1.3183,1.9017,-3.3111,0.065071,1 61 | -------------------------------------------------------------------------------- /5. Adaboost/train.txt: -------------------------------------------------------------------------------- 1 | 4.8906,-3.3584,3.4202,1.0905,0 2 | 1.4884,3.6274,3.308,0.48921,0 3 | 4.2969,7.617,-2.3874,-0.96164,0 4 | -0.96511,9.4111,1.7305,-4.8629,0 5 | -1.6162,0.80908,8.1628,0.60817,0 6 | 2.4391,6.4417,-0.80743,-0.69139,0 7 | 2.6881,6.0195,-0.46641,-0.69268,0 8 | 3.6289,0.81322,1.6277,0.77627,0 9 | 4.5679,3.1929,-2.1055,0.29653,0 10 | 3.4805,9.7008,-3.7541,-3.4379,0 11 | 4.1711,8.722,-3.0224,-0.59699,0 12 | -0.2062,9.2207,-3.7044,-6.8103,0 13 | -0.0068919,9.2931,-0.41243,-1.9638,0 14 | 0.96441,5.8395,2.3235,0.066365,0 15 | 2.8561,6.9176,-0.79372,0.48403,0 16 | -0.7869,9.5663,-3.7867,-7.5034,0 17 | 2.0843,6.6258,0.48382,-2.2134,0 18 | -0.7869,9.5663,-3.7867,-7.5034,0 19 | 3.9102,6.065,-2.4534,-0.68234,0 20 | 1.6349,3.286,2.8753,0.087054,0 21 | 4.3239,-4.8835,3.4356,-0.5776,0 22 | 5.262,3.9834,-1.5572,1.0103,0 23 | 3.1452,5.825,-0.51439,-1.4944,0 24 | 2.549,6.1499,-1.1605,-1.2371,0 25 | 4.9264,5.496,-2.4774,-0.50648,0 26 | 4.8265,0.80287,1.6371,1.1875,0 27 | 2.5635,6.7769,-0.61979,0.38576,0 28 | 5.807,5.0097,-2.2384,0.43878,0 29 | 3.1377,-4.1096,4.5701,0.98963,0 30 | -0.78289,11.3603,-0.37644,-7.0495,0 31 | 2.888,0.44696,4.5907,-0.24398,0 32 | 0.49665,5.527,1.7785,-0.47156,0 33 | 4.2586,11.2962,-4.0943,-4.3457,0 34 | 1.7939,-1.1174,1.5454,-0.26079,0 35 | 5.4021,3.1039,-1.1536,1.5651,0 36 | 2.5367,2.599,2.0938,0.20085,0 37 | 4.6054,-4.0765,2.7587,0.31981,0 38 | 2.4235,9.5332,-3.0789,-2.7746,0 39 | 1.0009,7.7846,-0.28219,-2.6608,0 40 | 0.12326,8.9848,-0.9351,-2.4332,0 41 | 3.9529,-2.3548,2.3792,0.48274,0 42 | 4.1373,0.49248,1.093,1.8276,0 43 | 4.7181,10.0153,-3.9486,-3.8582,0 44 | 4.1654,-3.4495,3.643,1.0879,0 45 | 4.4069,10.9072,-4.5775,-4.4271,0 46 | 2.3066,3.5364,0.57551,0.41938,0 47 | 3.7935,7.9853,-2.5477,-1.872,0 48 | 0.049175,6.1437,1.7828,-0.72113,0 49 | 0.24835,7.6439,0.9885,-0.87371,0 50 | 1.1317,3.9647,3.3979,0.84351,0 51 | 2.8033,9.0862,-3.3668,-1.0224,0 52 | 4.4682,2.2907,0.95766,0.83058,0 53 | 5.0185,8.5978,-2.9375,-1.281,0 54 | 1.8664,7.7763,-0.23849,-2.9634,0 55 | 3.245,6.63,-0.63435,0.86937,0 56 | 4.0296,2.6756,0.80685,0.71679,0 57 | -1.1313,1.9037,7.5339,1.022,0 58 | 0.87603,6.8141,0.84198,-0.17156,0 59 | 4.1197,-2.7956,2.0707,0.67412,0 60 | 3.8027,0.81529,2.1041,1.0245,0 61 | 1.4806,7.6377,-2.7876,-1.0341,0 62 | 4.0632,3.584,0.72545,0.39481,0 63 | 4.3064,8.2068,-2.7824,-1.4336,0 64 | 2.4486,-6.3175,7.9632,0.20602,0 65 | 3.2718,1.7837,2.1161,0.61334,0 66 | -0.64472,-4.6062,8.347,-2.7099,0 67 | 2.9543,1.076,0.64577,0.89394,0 68 | 2.1616,-6.8804,8.1517,-0.081048,0 69 | 3.82,10.9279,-4.0112,-5.0284,0 70 | -2.7419,11.4038,2.5394,-5.5793,0 71 | 3.3669,-5.1856,3.6935,-1.1427,0 72 | 4.5597,-2.4211,2.6413,1.6168,0 73 | 5.1129,-0.49871,0.62863,1.1189,0 74 | 3.3397,-4.6145,3.9823,-0.23751,0 75 | 4.2027,0.22761,0.96108,0.97282,0 76 | 3.5438,1.2395,1.997,2.1547,0 77 | 2.3136,10.6651,-3.5288,-4.7672,0 78 | -1.8584,7.886,-1.6643,-1.8384,0 79 | 3.106,9.5414,-4.2536,-4.003,0 80 | 2.9163,10.8306,-3.3437,-4.122,0 81 | 3.9922,-4.4676,3.7304,-0.1095,0 82 | 1.518,5.6946,0.094818,-0.026738,0 83 | 3.2351,9.647,-3.2074,-2.5948,0 84 | 4.2188,6.8162,-1.2804,0.76076,0 85 | 1.7819,6.9176,-1.2744,-1.5759,0 86 | 2.5331,2.9135,-0.822,-0.12243,0 87 | 3.8969,7.4163,-1.8245,0.14007,0 88 | 2.108,6.7955,-0.1708,0.4905,0 89 | 2.8969,0.70768,2.29,1.8663,0 90 | 0.9297,-3.7971,4.6429,-0.2957,0 91 | 3.4642,10.6878,-3.4071,-4.109,0 92 | 4.0713,10.4023,-4.1722,-4.7582,0 93 | -1.4572,9.1214,1.7425,-5.1241,0 94 | -1.5075,1.9224,7.1466,0.89136,0 95 | -0.91718,9.9884,1.1804,-5.2263,0 96 | 2.994,7.2011,-1.2153,0.3211,0 97 | -2.343,12.9516,3.3285,-5.9426,0 98 | 3.7818,-2.8846,2.2558,-0.15734,0 99 | 4.6689,1.3098,0.055404,1.909,0 100 | 3.4663,1.1112,1.7425,1.3388,0 101 | 3.2697,-4.3414,3.6884,-0.29829,0 102 | 5.1302,8.6703,-2.8913,-1.5086,0 103 | 2.0139,6.1416,0.37929,0.56938,0 104 | 0.4339,5.5395,2.033,-0.40432,0 105 | -1.0401,9.3987,0.85998,-5.3336,0 106 | 4.1605,11.2196,-3.6136,-4.0819,0 107 | 5.438,9.4669,-4.9417,-3.9202,0 108 | 5.032,8.2026,-2.6256,-1.0341,0 109 | 5.2418,10.5388,-4.1174,-4.2797,0 110 | -0.2062,9.2207,-3.7044,-6.8103,0 111 | 2.0911,0.94358,4.5512,1.234,0 112 | 1.7317,-0.34765,4.1905,-0.99138,0 113 | 4.1736,3.3336,-1.4244,0.60429,0 114 | 3.9232,-3.2467,3.4579,0.83705,0 115 | 3.8481,10.1539,-3.8561,-4.2228,0 116 | 0.5195,-3.2633,3.0895,-0.9849,0 117 | 3.8584,0.78425,1.1033,1.7008,0 118 | 1.7496,-0.1759,5.1827,1.2922,0 119 | 3.6277,0.9829,0.68861,0.63403,0 120 | 2.7391,7.4018,0.071684,-2.5302,0 121 | 4.5447,8.2274,-2.4166,-1.5875,0 122 | -1.7599,11.9211,2.6756,-3.3241,0 123 | 5.0691,0.21313,0.20278,1.2095,0 124 | 3.4591,11.112,-4.2039,-5.0931,0 125 | 1.9358,8.1654,-0.023425,-2.2586,0 126 | 2.486,-0.99533,5.3404,-0.15475,0 127 | 2.4226,-4.5752,5.947,0.21507,0 128 | 3.9479,-3.7723,2.883,0.019813,0 129 | 2.2634,-4.4862,3.6558,-0.61251,0 130 | 1.3566,4.2358,2.1341,0.3211,0 131 | 5.0452,3.8964,-1.4304,0.86291,0 132 | 3.5499,8.6165,-3.2794,-1.2009,0 133 | 0.17346,7.8695,0.26876,-3.7883,0 134 | 2.4008,9.3593,-3.3565,-3.3526,0 135 | 4.8851,1.5995,-0.00029081,1.6401,0 136 | 4.1927,-3.2674,2.5839,0.21766,0 137 | 1.1166,8.6496,-0.96252,-1.8112,0 138 | 1.0235,6.901,-2.0062,-2.7125,0 139 | -1.803,11.8818,2.0458,-5.2728,0 140 | 0.11739,6.2761,-1.5495,-2.4746,0 141 | 0.5706,-0.0248,1.2421,-0.5621,0 142 | 4.0552,-2.4583,2.2806,1.0323,0 143 | -1.6952,1.0657,8.8294,0.94955,0 144 | -1.1193,10.7271,2.0938,-5.6504,0 145 | 1.8799,2.4707,2.4931,0.37671,0 146 | 3.583,-3.7971,3.4391,-0.12501,0 147 | 0.19081,9.1297,-3.725,-5.8224,0 148 | 3.6582,5.6864,-1.7157,-0.23751,0 149 | -0.13144,-1.7775,8.3316,0.35214,0 150 | 2.3925,9.798,-3.0361,-2.8224,0 151 | 1.6426,3.0149,0.22849,-0.147,0 152 | -0.11783,-1.5789,8.03,-0.028031,0 153 | -0.69572,8.6165,1.8419,-4.3289,0 154 | 2.9421,7.4101,-0.97709,-0.88406,0 155 | -1.7559,11.9459,3.0946,-4.8978,0 156 | -1.2537,10.8803,1.931,-4.3237,0 157 | 3.2585,-4.4614,3.8024,-0.15087,0 158 | 1.8314,6.3672,-0.036278,0.049554,0 159 | 4.5645,-3.6275,2.8684,0.27714,0 160 | 2.7365,-5.0325,6.6608,-0.57889,0 161 | 0.9297,-3.7971,4.6429,-0.2957,0 162 | 3.9663,10.1684,-4.1131,-4.6056,0 163 | 1.4578,-0.08485,4.1785,0.59136,0 164 | 4.8272,3.0687,0.68604,0.80731,0 165 | -2.341,12.3784,0.70403,-7.5836,0 166 | -1.8584,7.886,-1.6643,-1.8384,0 167 | 4.1454,7.257,-1.9153,-0.86078,0 168 | 1.9157,6.0816,0.23705,-2.0116,0 169 | 4.0215,-2.1914,2.4648,1.1409,0 170 | 5.8862,5.8747,-2.8167,-0.30087,0 171 | -2.0897,10.8265,2.3603,-3.4198,0 172 | 4.0026,-3.5943,3.5573,0.26809,0 173 | -0.78689,9.5663,-3.7867,-7.5034,0 174 | 4.1757,10.2615,-3.8552,-4.3056,0 175 | 0.83292,7.5404,0.65005,-0.92544,0 176 | 4.8077,2.2327,-0.26334,1.5534,0 177 | 5.3063,5.2684,-2.8904,-0.52716,0 178 | 2.5605,9.2683,-3.5913,-1.356,0 179 | 2.1059,7.6046,-0.47755,-1.8461,0 180 | 2.1721,-0.73874,5.4672,-0.72371,0 181 | 4.2899,9.1814,-4.6067,-4.3263,0 182 | 3.5156,10.1891,-4.2759,-4.978,0 183 | 2.614,8.0081,-3.7258,-1.3069,0 184 | 0.68087,2.3259,4.9085,0.54998,0 185 | 4.1962,0.74493,0.83256,0.753,0 186 | 6.0919,2.9673,-1.3267,1.4551,0 187 | 1.3234,3.2964,0.2362,-0.11984,0 188 | 1.3264,1.0326,5.6566,-0.41337,0 189 | -0.16735,7.6274,1.2061,-3.6241,0 190 | -1.3,10.2678,-2.953,-5.8638,0 191 | -2.2261,12.5398,2.9438,-3.5258,0 192 | 2.4196,6.4665,-0.75688,0.228,0 193 | 1.0987,0.6394,5.989,-0.58277,0 194 | 4.6464,10.5326,-4.5852,-4.206,0 195 | -0.36038,4.1158,3.1143,-0.37199,0 196 | 1.3562,3.2136,4.3465,0.78662,0 197 | 0.5706,-0.0248,1.2421,-0.5621,0 198 | -2.6479,10.1374,-1.331,-5.4707,0 199 | 3.1219,-3.137,1.9259,-0.37458,0 200 | 5.4944,1.5478,0.041694,1.9284,0 201 | -2.5961,-9.349,9.7942,-0.28018,1 202 | -1.5228,-6.4789,5.7568,0.87325,1 203 | -0.53072,-0.097265,-0.21793,1.0426,1 204 | -0.49081,2.8452,-3.6436,-3.1004,1 205 | -6.5773,6.8017,0.85483,-7.5344,1 206 | -2.4621,2.7645,-0.62578,-2.8573,1 207 | -1.3995,-1.9162,2.5154,0.59912,1 208 | -2.3221,-9.3304,9.233,-0.79871,1 209 | -3.73,-12.9723,12.9817,-2.684,1 210 | -1.6988,-7.1163,5.7902,0.16723,1 211 | -0.26654,-0.64562,-0.42014,0.89136,1 212 | 0.33325,3.3108,-4.5081,-4.012,1 213 | -4.2091,4.7283,-0.49126,-5.2159,1 214 | -2.3142,-0.68494,1.9833,-0.44829,1 215 | -2.4835,-7.4494,6.8964,-0.64484,1 216 | -2.7611,-10.5099,9.0239,-1.9547,1 217 | -0.36025,-4.449,2.1067,0.94308,1 218 | 1.0117,0.9022,-2.3506,0.42714,1 219 | 0.96708,3.8426,-4.9314,-4.1323,1 220 | -5.2049,7.259,0.070827,-7.3004,1 221 | -3.3203,-0.02691,2.9618,-0.44958,1 222 | -2.565,-5.7899,6.0122,0.046968,1 223 | -1.5951,-6.572,4.7689,-0.94354,1 224 | 0.7049,0.17174,-1.7859,0.36119,1 225 | 1.7331,3.9544,-4.7412,-2.5017,1 226 | 0.6818,4.8504,-5.2133,-6.1043,1 227 | -6.3364,9.2848,0.014275,-6.7844,1 228 | -3.8053,2.4273,0.6809,-1.0871,1 229 | -2.1979,-2.1252,1.7151,0.45171,1 230 | -0.87874,-2.2121,-0.051701,0.099985,1 231 | 0.74067,1.7299,-3.1963,-0.1457,1 232 | 0.98296,3.4226,-3.9692,-1.7116,1 233 | -0.3489,3.1929,-3.4054,-3.1832,1 234 | -3.8552,3.5219,-0.38415,-3.8608,1 235 | -6.9599,8.9931,0.2182,-4.572,1 236 | -4.7462,3.1205,1.075,-1.2966,1 237 | -3.2051,-0.14279,0.97565,0.045675,1 238 | -1.7549,-0.080711,-0.75774,-0.3707,1 239 | -0.59587,2.4811,-2.8673,-0.89828,1 240 | -0.89542,2.0279,-2.3652,-1.2746,1 241 | -2.0754,1.2767,-0.64206,-1.2642,1 242 | -3.2778,1.8023,0.1805,-2.3931,1 243 | -2.2183,-1.254,2.9986,0.36378,1 244 | -3.5895,-6.572,10.5251,-0.16381,1 245 | -5.0477,-5.8023,11.244,-0.3901,1 246 | -3.5741,3.944,-0.07912,-2.1203,1 247 | -0.7351,1.7361,-1.4938,-1.1582,1 248 | -2.2617,-4.7428,6.3489,0.11162,1 249 | -4.244,-13.0634,17.1116,-2.8017,1 250 | -4.0218,-8.304,12.555,-1.5099,1 251 | -3.0201,-0.67253,2.7056,0.85774,1 252 | -2.4941,3.5447,-1.3721,-2.8483,1 253 | -0.83121,0.039307,0.05369,-0.23105,1 254 | -2.5665,-6.8824,7.5416,0.70774,1 255 | -4.4018,-12.9371,15.6559,-1.6806,1 256 | -3.7573,-8.2916,10.3032,0.38059,1 257 | -2.4725,-0.40145,1.4855,1.1189,1 258 | -1.9725,2.8825,-2.3086,-2.3724,1 259 | -2.0149,3.6874,-1.9385,-3.8918,1 260 | -0.82053,0.65181,-0.48869,-0.52716,1 261 | -1.7886,-6.3486,5.6154,0.42584,1 262 | -2.9138,-9.4711,9.7668,-0.60216,1 263 | -1.8343,-6.5907,5.6429,0.54998,1 264 | -0.8734,-0.033118,-0.20165,0.55774,1 265 | -0.70346,2.957,-3.5947,-3.1457,1 266 | -6.7387,6.9879,0.67833,-7.5887,1 267 | -2.7723,3.2777,-0.9351,-3.1457,1 268 | -1.6641,-1.3678,1.997,0.52283,1 269 | -2.4349,-9.2497,8.9922,-0.50001,1 270 | -3.793,-12.7095,12.7957,-2.825,1 271 | -1.9551,-6.9756,5.5383,-0.12889,1 272 | -0.69078,-0.50077,-0.35417,0.47498,1 273 | 0.025013,3.3998,-4.4327,-4.2655,1 274 | -4.3967,4.9601,-0.64892,-5.4719,1 275 | -2.456,-0.24418,1.4041,-0.45863,1 276 | -2.62,-6.8555,6.2169,-0.62285,1 277 | -2.9662,-10.3257,8.784,-2.1138,1 278 | -0.71494,-4.4448,2.2241,0.49826,1 279 | 0.6005,0.99945,-2.2126,0.097399,1 280 | 0.61652,3.8944,-4.7275,-4.3948,1 281 | -5.4414,7.2363,0.10938,-7.5642,1 282 | -3.5798,0.45937,2.3457,-0.45734,1 283 | -2.7769,-5.6967,5.9179,0.37671,1 284 | -1.8356,-6.7562,5.0585,-0.55044,1 285 | 0.30081,0.17381,-1.7542,0.48921,1 286 | 1.3403,4.1323,-4.7018,-2.5987,1 287 | 0.26877,4.987,-5.1508,-6.3913,1 288 | -6.5235,9.6014,-0.25392,-6.9642,1 289 | -4.0679,2.4955,0.79571,-1.1039,1 290 | -2.564,-1.7051,1.5026,0.32757,1 291 | -1.3414,-1.9162,-0.15538,-0.11984,1 292 | 0.23874,2.0879,-3.3522,-0.66553,1 293 | 0.6212,3.6771,-4.0771,-2.0711,1 294 | -0.77848,3.4019,-3.4859,-3.5569,1 295 | -4.1244,3.7909,-0.6532,-4.1802,1 296 | -7.0421,9.2,0.25933,-4.6832,1 297 | -4.9462,3.5716,0.82742,-1.4957,1 298 | -3.5359,0.30417,0.6569,-0.2957,1 299 | -2.0662,0.16967,-1.0054,-0.82975,1 300 | -0.88728,2.808,-3.1432,-1.2035,1 301 | -1.0941,2.3072,-2.5237,-1.4453,1 302 | -2.4458,1.6285,-0.88541,-1.4802,1 303 | -3.551,1.8955,0.1865,-2.4409,1 304 | -2.2811,-0.85669,2.7185,0.044382,1 305 | -3.6053,-5.974,10.0916,-0.82846,1 306 | -5.0676,-5.1877,10.4266,-0.86725,1 307 | -3.9204,4.0723,-0.23678,-2.1151,1 308 | -1.1306,1.8458,-1.3575,-1.3806,1 309 | -2.4561,-4.5566,6.4534,-0.056479,1 310 | -4.4775,-13.0303,17.0834,-3.0345,1 311 | -4.1958,-8.1819,12.1291,-1.6017,1 312 | -3.38,-0.7077,2.5325,0.71808,1 313 | -2.4365,3.6026,-1.4166,-2.8948,1 314 | -0.77688,0.13036,-0.031137,-0.35389,1 315 | -2.7083,-6.8266,7.5339,0.59007,1 316 | -4.5531,-12.5854,15.4417,-1.4983,1 317 | -3.8894,-7.8322,9.8208,0.47498,1 318 | -2.5084,-0.22763,1.488,1.2069,1 319 | -2.1652,3.0211,-2.4132,-2.4241,1 320 | -1.8974,3.5074,-1.7842,-3.8491,1 321 | -0.62043,0.5587,-0.38587,-0.66423,1 322 | -1.8387,-6.301,5.6506,0.19567,1 323 | -3,-9.1566,9.5766,-0.73018,1 324 | -1.9116,-6.1603,5.606,0.48533,1 325 | -1.005,0.084831,-0.2462,0.45688,1 326 | -0.87834,3.257,-3.6778,-3.2944,1 327 | -6.651,6.7934,0.68604,-7.5887,1 328 | -2.5463,3.1101,-0.83228,-3.0358,1 329 | -1.4377,-1.432,2.1144,0.42067,1 330 | -2.4554,-9.0407,8.862,-0.86983,1 331 | -3.9411,-12.8792,13.0597,-3.3125,1 332 | -2.1241,-6.8969,5.5992,-0.47156,1 333 | -0.74324,-0.32902,-0.42785,0.23317,1 334 | -0.071503,3.7412,-4.5415,-4.2526,1 335 | -4.2333,4.9166,-0.49212,-5.3207,1 336 | -2.3675,-0.43663,1.692,-0.43018,1 337 | -2.5526,-7.3625,6.9255,-0.66811,1 338 | -3.0986,-10.4602,8.9717,-2.3427,1 339 | -0.89809,-4.4862,2.2009,0.50731,1 340 | 0.56232,1.0015,-2.2726,-0.0060486,1 341 | 0.53936,3.8944,-4.8166,-4.3418,1 342 | -5.3012,7.3915,0.029699,-7.3987,1 343 | -3.3553,0.35591,2.6473,-0.37846,1 344 | -2.7908,-5.7133,5.953,0.45946,1 345 | -1.9983,-6.6072,4.8254,-0.41984,1 346 | 0.15423,0.11794,-1.6823,0.59524,1 347 | 1.208,4.0744,-4.7635,-2.6129,1 348 | 0.2952,4.8856,-5.149,-6.2323,1 349 | -6.4247,9.5311,0.022844,-6.8517,1 350 | -3.9933,2.6218,0.62863,-1.1595,1 351 | -2.659,-1.6058,1.3647,0.16464,1 352 | -1.4094,-2.1252,-0.10397,-0.19225,1 353 | 0.11032,1.9741,-3.3668,-0.65259,1 354 | 0.52374,3.644,-4.0746,-1.9909,1 355 | -0.76794,3.4598,-3.4405,-3.4276,1 356 | -3.9698,3.6812,-0.60008,-4.0133,1 357 | -7.0364,9.2931,0.16594,-4.5396,1 358 | -4.9447,3.3005,1.063,-1.444,1 359 | -3.5933,0.22968,0.7126,-0.3332,1 360 | -2.1674,0.12415,-1.0465,-0.86208,1 361 | -0.9607,2.6963,-3.1226,-1.3121,1 362 | -1.0802,2.1996,-2.5862,-1.2759,1 363 | -2.3277,1.4381,-0.82114,-1.2862,1 364 | -3.7244,1.9037,-0.035421,-2.5095,1 365 | -2.5724,-0.95602,2.7073,-0.16639,1 366 | -3.9297,-6.0816,10.0958,-1.0147,1 367 | -5.2943,-5.1463,10.3332,-1.1181,1 368 | -3.8953,4.0392,-0.3019,-2.1836,1 369 | -1.2244,1.7485,-1.4801,-1.4181,1 370 | -2.6406,-4.4159,5.983,-0.13924,1 371 | -4.6338,-12.7509,16.7166,-3.2168,1 372 | -4.2887,-7.8633,11.8387,-1.8978,1 373 | -3.3458,-0.50491,2.6328,0.53705,1 374 | -1.1188,3.3357,-1.3455,-1.9573,1 375 | 0.55939,-0.3104,0.18307,0.44653,1 376 | -1.5078,-7.3191,7.8981,1.2289,1 377 | -3.506,-12.5667,15.1606,-0.75216,1 378 | -2.9498,-8.273,10.2646,1.1629,1 379 | -1.6029,-0.38903,1.62,1.9103,1 380 | -1.2667,2.8183,-2.426,-1.8862,1 381 | -0.49281,3.0605,-1.8356,-2.834,1 382 | 0.66365,-0.045533,-0.18794,0.23447,1 383 | -0.72068,-6.7583,5.8408,0.62369,1 384 | -1.9966,-9.5001,9.682,-0.12889,1 385 | -0.97325,-6.4168,5.6026,1.0323,1 386 | -0.025314,-0.17383,-0.11339,1.2198,1 387 | 0.062525,2.9301,-3.5467,-2.6737,1 388 | -5.525,6.3258,0.89768,-6.6241,1 389 | -1.2943,2.6735,-0.84085,-2.0323,1 390 | -0.24037,-1.7837,2.135,1.2418,1 391 | -1.3968,-9.6698,9.4652,-0.34872,1 392 | -2.9672,-13.2869,13.4727,-2.6271,1 393 | -1.1005,-7.2508,6.0139,0.36895,1 394 | 0.22432,-0.52147,-0.40386,1.2017,1 395 | 0.90407,3.3708,-4.4987,-3.6965,1 396 | -2.8619,4.5193,-0.58123,-4.2629,1 397 | -1.0833,-0.31247,1.2815,0.41291,1 398 | -1.5681,-7.2446,6.5537,-0.1276,1 399 | -2.0545,-10.8679,9.4926,-1.4116,1 400 | 0.2346,-4.5152,2.1195,1.4448,1 401 | -------------------------------------------------------------------------------- /5. Adaboost/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | # @Time : 2019/4/5 下午3:03 4 | # @Author : zhanzecheng 5 | # @File : utils.py 6 | # @Software: PyCharm 7 | """ 8 | from itertools import combinations_with_replacement 9 | import numpy as np 10 | import math 11 | 12 | def calculate_entropy(y): 13 | """ Calculate the entropy of label array y """ 14 | log2 = lambda x: math.log(x) / math.log(2) 15 | unique_labels = np.unique(y) 16 | entropy = 0 17 | for label in unique_labels: 18 | count = len(y[y == label]) 19 | p = count / len(y) 20 | entropy += -p * log2(p) 21 | return entropy 22 | 23 | 24 | def mean_squared_error(y_true, y_pred): 25 | """ Returns the mean squared error between y_true and y_pred """ 26 | mse = np.mean(np.power(y_true - y_pred, 2)) 27 | return mse 28 | 29 | 30 | def calculate_variance(X): 31 | """ Return the variance of the features in dataset X """ 32 | mean = np.ones(np.shape(X)) * X.mean(0) 33 | n_samples = np.shape(X)[0] 34 | variance = (1 / n_samples) * np.diag((X - mean).T.dot(X - mean)) 35 | 36 | return variance 37 | 38 | 39 | def calculate_std_dev(X): 40 | """ Calculate the standard deviations of the features in dataset X """ 41 | std_dev = np.sqrt(calculate_variance(X)) 42 | return std_dev 43 | 44 | 45 | def euclidean_distance(x1, x2): 46 | """ Calculates the l2 distance between two vectors """ 47 | distance = 0 48 | # Squared distance between each coordinate 49 | for i in range(len(x1)): 50 | distance += pow((x1[i] - x2[i]), 2) 51 | return math.sqrt(distance) 52 | 53 | 54 | def accuracy_score(y_true, y_pred): 55 | """ Compare y_true to y_pred and return the accuracy """ 56 | accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true) 57 | return accuracy 58 | 59 | 60 | def calculate_covariance_matrix(X, Y=None): 61 | """ Calculate the covariance matrix for the dataset X """ 62 | if Y is None: 63 | Y = X 64 | n_samples = np.shape(X)[0] 65 | covariance_matrix = (1 / (n_samples - 1)) * (X - X.mean(axis=0)).T.dot(Y - Y.mean(axis=0)) 66 | 67 | return np.array(covariance_matrix, dtype=float) 68 | 69 | 70 | def calculate_correlation_matrix(X, Y=None): 71 | """ Calculate the correlation matrix for the dataset X """ 72 | if Y is None: 73 | Y = X 74 | n_samples = np.shape(X)[0] 75 | covariance = (1 / n_samples) * (X - X.mean(0)).T.dot(Y - Y.mean(0)) 76 | std_dev_X = np.expand_dims(calculate_std_dev(X), 1) 77 | std_dev_y = np.expand_dims(calculate_std_dev(Y), 1) 78 | correlation_matrix = np.divide(covariance, std_dev_X.dot(std_dev_y.T)) 79 | 80 | return np.array(correlation_matrix, dtype=float) 81 | 82 | def shuffle_data(X, y, seed=None): 83 | """ Random shuffle of the samples in X and y """ 84 | if seed: 85 | np.random.seed(seed) 86 | idx = np.arange(X.shape[0]) 87 | np.random.shuffle(idx) 88 | return X[idx], y[idx] 89 | 90 | 91 | def batch_iterator(X, y=None, batch_size=64): 92 | """ Simple batch generator """ 93 | n_samples = X.shape[0] 94 | for i in np.arange(0, n_samples, batch_size): 95 | begin, end = i, min(i + batch_size, n_samples) 96 | if y is not None: 97 | yield X[begin:end], y[begin:end] 98 | else: 99 | yield X[begin:end] 100 | 101 | 102 | def divide_on_feature(X, feature_i, threshold): 103 | """ Divide dataset based on if sample value on feature index is larger than 104 | the given threshold """ 105 | split_func = None 106 | if isinstance(threshold, int) or isinstance(threshold, float): 107 | split_func = lambda sample: sample[feature_i] >= threshold 108 | else: 109 | split_func = lambda sample: sample[feature_i] == threshold 110 | 111 | X_1 = np.array([sample for sample in X if split_func(sample)]) 112 | X_2 = np.array([sample for sample in X if not split_func(sample)]) 113 | 114 | return np.array([X_1, X_2]) 115 | 116 | 117 | def polynomial_features(X, degree): 118 | n_samples, n_features = np.shape(X) 119 | 120 | def index_combinations(): 121 | combs = [combinations_with_replacement(range(n_features), i) for i in range(0, degree + 1)] 122 | flat_combs = [item for sublist in combs for item in sublist] 123 | return flat_combs 124 | 125 | combinations = index_combinations() 126 | n_output_features = len(combinations) 127 | X_new = np.empty((n_samples, n_output_features)) 128 | 129 | for i, index_combs in enumerate(combinations): 130 | X_new[:, i] = np.prod(X[:, index_combs], axis=1) 131 | 132 | return X_new 133 | 134 | 135 | def get_random_subsets(X, y, n_subsets, replacements=True): 136 | """ Return random subsets (with replacements) of the data """ 137 | n_samples = np.shape(X)[0] 138 | # Concatenate x and y and do a random shuffle 139 | X_y = np.concatenate((X, y.reshape((1, len(y))).T), axis=1) 140 | np.random.shuffle(X_y) 141 | subsets = [] 142 | 143 | # Uses 50% of training samples without replacements 144 | subsample_size = int(n_samples // 2) 145 | if replacements: 146 | subsample_size = n_samples # 100% with replacements 147 | 148 | for _ in range(n_subsets): 149 | idx = np.random.choice( 150 | range(n_samples), 151 | size=np.shape(range(subsample_size)), 152 | replace=replacements) 153 | X = X_y[idx][:, :-1] 154 | y = X_y[idx][:, -1] 155 | subsets.append([X, y]) 156 | return subsets 157 | 158 | 159 | def normalize(X, axis=-1, order=2): 160 | """ Normalize the dataset X """ 161 | l2 = np.atleast_1d(np.linalg.norm(X, order, axis)) 162 | l2[l2 == 0] = 1 163 | return X / np.expand_dims(l2, axis) 164 | 165 | 166 | def standardize(X): 167 | """ Standardize the dataset X """ 168 | X_std = X 169 | mean = X.mean(axis=0) 170 | std = X.std(axis=0) 171 | for col in range(np.shape(X)[1]): 172 | if std[col]: 173 | X_std[:, col] = (X_std[:, col] - mean[col]) / std[col] 174 | # X_std = (X - X.mean(axis=0)) / X.std(axis=0) 175 | return X_std 176 | 177 | 178 | def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None): 179 | """ Split the data into train and test sets """ 180 | if shuffle: 181 | X, y = shuffle_data(X, y, seed) 182 | # Split the training data from test data in the ratio specified in 183 | # test_size 184 | split_i = len(y) - int(len(y) // (1 / test_size)) 185 | X_train, X_test = X[:split_i], X[split_i:] 186 | y_train, y_test = y[:split_i], y[split_i:] 187 | 188 | return X_train, X_test, y_train, y_test 189 | 190 | 191 | def k_fold_cross_validation_sets(X, y, k, shuffle=True): 192 | """ Split the data into k sets of training / test data """ 193 | if shuffle: 194 | X, y = shuffle_data(X, y) 195 | 196 | n_samples = len(y) 197 | left_overs = {} 198 | n_left_overs = (n_samples % k) 199 | if n_left_overs != 0: 200 | left_overs["X"] = X[-n_left_overs:] 201 | left_overs["y"] = y[-n_left_overs:] 202 | X = X[:-n_left_overs] 203 | y = y[:-n_left_overs] 204 | 205 | X_split = np.split(X, k) 206 | y_split = np.split(y, k) 207 | sets = [] 208 | for i in range(k): 209 | X_test, y_test = X_split[i], y_split[i] 210 | X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0) 211 | y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0) 212 | sets.append([X_train, X_test, y_train, y_test]) 213 | 214 | # Add left over samples to last set as training samples 215 | if n_left_overs != 0: 216 | np.append(sets[-1][0], left_overs["X"], axis=0) 217 | np.append(sets[-1][2], left_overs["y"], axis=0) 218 | 219 | return np.array(sets) 220 | 221 | 222 | def to_categorical(x, n_col=None): 223 | """ One-hot encoding of nominal values """ 224 | if not n_col: 225 | n_col = np.amax(x) + 1 226 | one_hot = np.zeros((x.shape[0], n_col)) 227 | one_hot[np.arange(x.shape[0]), x] = 1 228 | return one_hot 229 | 230 | 231 | def to_nominal(x): 232 | """ Conversion from one-hot encoding to nominal """ 233 | return np.argmax(x, axis=1) 234 | 235 | 236 | def make_diagonal(x): 237 | """ Converts a vector into an diagonal matrix """ 238 | m = np.zeros((len(x), len(x))) 239 | for i in range(len(m[0])): 240 | m[i, i] = x[i] 241 | return m -------------------------------------------------------------------------------- /Optimization_method/1.Gradient_decent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 这里我们使用梯度下降法来拟合房屋价格曲线" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "scrolled": true 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "image/png": "\n", 31 | "text/plain": [ 32 | "" 33 | ] 34 | }, 35 | "metadata": {}, 36 | "output_type": "display_data" 37 | } 38 | ], 39 | "source": [ 40 | "spaces = [45, 73, 89, 120, 140, 163]\n", 41 | "prices = [80, 150, 198, 230, 280, 360]\n", 42 | "spaces, prices = np.array(spaces), np.array(prices)\n", 43 | "plt.scatter(spaces, prices, c='g')\n", 44 | "plt.xlabel('house space')\n", 45 | "plt.ylabel('house price')\n", 46 | "plt.show()\n", 47 | "\n", 48 | "## 显示房屋面积和房屋价格的散点图" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "使用梯度下降法我们需要假设$ h(x)$\n", 63 | "这里我们假设 $$ h(x) = \\theta_0 + \\theta_1 * x \\tag{1}$$ (只含有一个特征)\n", 64 | "另损失函数为 $$ J(\\theta) = \\frac{1}{2*6}\\sum_{i=0}^{6}\\{(h_\\theta(X_i)-y_i)^{2} \\tag{2} $$\n", 65 | "假设步长为 $\\lambda$,则每一次的更新公式为:$$\\theta_j = \\theta_j - \\lambda * \\frac{1}{6}\\sum_{i=1}^6(h_\\lambda(X_i) - y_i)*X_{ij} \\tag{3} $$\n", 66 | "下面来用代码实现" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | " h(x) = 0.016206 + 2.078464 * x\n" 79 | ] 80 | }, 81 | { 82 | "data": { 83 | "image/png": "\n", 84 | "text/plain": [ 85 | "" 86 | ] 87 | }, 88 | "metadata": {}, 89 | "output_type": "display_data" 90 | } 91 | ], 92 | "source": [ 93 | "## theta 初始值\n", 94 | "theta0 = 0\n", 95 | "theta1 = 0\n", 96 | "\n", 97 | "## 如果步长选择不对,则 theta 参数更新结果会不对\n", 98 | "step = 0.00005\n", 99 | "\n", 100 | "x_i0 = np.ones((len(spaces)))\n", 101 | "\n", 102 | "# 假设函数\n", 103 | "def h(x) :\n", 104 | " return theta0 + theta1 * x\n", 105 | "\n", 106 | "# 损失函数\n", 107 | "def calc_error() :\n", 108 | " return np.sum(np.power((h(spaces) - prices),2)) / 6\n", 109 | "\n", 110 | "# 损失函数偏导数( theta 0)\n", 111 | "def calc_delta0() :\n", 112 | " return step * np.sum((h(spaces) - prices) * x_i0) / 6\n", 113 | "\n", 114 | "# 损失函数偏导数( theta 1)\n", 115 | "def calc_delta1() :\n", 116 | " return step * np.sum((h(spaces) - prices) * spaces) / 6\n", 117 | "\n", 118 | "# 循环更新 theta 值并计算误差,停止条件为\n", 119 | "# 1. 误差小于某个值\n", 120 | "# 2. 循环次数控制\n", 121 | "k = 0\n", 122 | "while True :\n", 123 | " delta0 = calc_delta0()\n", 124 | " delta1 = calc_delta1()\n", 125 | " theta0 = theta0 - delta0\n", 126 | " theta1 = theta1 - delta1\n", 127 | " error = calc_error()\n", 128 | " # print(\"delta [%f, %f], theta [%f, %f], error %f\" % (delta0, delta1, theta0, theta1, error))\n", 129 | " k = k + 1\n", 130 | " if (k > 10 or error < 200) : \n", 131 | " break\n", 132 | "\n", 133 | "\n", 134 | "print(\" h(x) = %f + %f * x\" % (theta0, theta1))\n", 135 | " \n", 136 | "# 使用假设函数计算出来的价格,用于画拟合曲线\n", 137 | "y_out = h(spaces)\n", 138 | "\n", 139 | "plt.scatter(spaces, prices, c='g')\n", 140 | "plt.plot(spaces, y_out, c='b')\n", 141 | "plt.xlabel('house space')\n", 142 | "plt.ylabel('house price')\n", 143 | "plt.show()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.6.5" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 2 175 | } 176 | -------------------------------------------------------------------------------- /Optimization_method/2. 牛顿法.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 这一次介绍牛顿法,优点是二阶收敛,收敛快。缺点是需要求海森矩阵的逆,计算量大,且不一定有解" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "牛顿法的核心便是选取极值点的临近点作为初始点,以此点切线于x轴的交点作为$x$的更新点,不断反复直至收敛,用公式来表示如下:$$ x_{k+1}=x_k - H^{-1}(x)\\bigtriangledown f(x)$$ \n", 26 | "其中$H^{-1}(x)$为海森矩阵\n", 27 | "选取函数 $$ y=100(x_2-x_1)^2+(1-x_1)^2 \\tag{1}$$\n", 28 | "于是可得函数的梯度$$g(x)=\\bigtriangledown f(x)=(-400(x_2 - x_1^2)x_1-2(1-x_1), 200(x_2-x_1^2))^T$$\n", 29 | "函数$f(x)$的Hesse矩阵为$$\n", 30 | "\\left \\{\\begin{matrix}\n", 31 | "-400(x_2-dx_1^2)+2 & -400x_1 \\\\\n", 32 | "-400x_1 & 200 \\\\\n", 33 | "\\end{matrix}\n", 34 | "\\right \\}\n", 35 | "$$" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 6, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "def jacobian(x):\n", 45 | " \"\"\"\n", 46 | " 返回函数的梯度\n", 47 | " \"\"\"\n", 48 | " return np.array([-400*x[0]*(x[1]-x[0]**2)-2*(1-x[0]),200*(x[1]-x[0]**2)])\n", 49 | "\n", 50 | "def hessian(x):\n", 51 | " \"\"\"\n", 52 | " 返回函数的海森矩阵\n", 53 | " \"\"\"\n", 54 | " return np.array([[-400*(x[1]-3*x[0]**2)+2,-400*x[0]],[-400*x[0],200]])" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "则现在开始实现函数" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 11, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "def newton(x0):\n", 71 | " \"\"\"\n", 72 | " 使用牛顿法来求极值\n", 73 | " \"\"\"\n", 74 | " print('初始点为:')\n", 75 | " print(x0,'\\n')\n", 76 | " W=np.zeros((2,10**3))\n", 77 | " i = 1\n", 78 | " imax = 1000\n", 79 | " W[:,0] = x0 \n", 80 | " x = x0\n", 81 | " # 定义容忍误差\n", 82 | " delta = 1\n", 83 | "\n", 84 | " while i10**(-5):\n", 85 | " p = -np.dot(np.linalg.inv(hessian(x)),jacobian(x))\n", 86 | " x0 = x\n", 87 | " x = x + p\n", 88 | " W[:,i] = x\n", 89 | " # 计算容忍误差\n", 90 | " delta = sum((x-x0)**2)\n", 91 | " print('第',i,'次迭代结果:')\n", 92 | " print(x,'\\n')\n", 93 | " i=i+1\n", 94 | " W=W[:,0:i] # 记录迭代点\n", 95 | " return W" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 12, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "初始点为:\n", 108 | "[-1.2 1. ] \n", 109 | "\n", 110 | "第 1 次迭代结果:\n", 111 | "[-1.1752809 1.38067416] \n", 112 | "\n", 113 | "第 2 次迭代结果:\n", 114 | "[ 0.76311487 -3.17503385] \n", 115 | "\n", 116 | "第 3 次迭代结果:\n", 117 | "[0.76342968 0.58282478] \n", 118 | "\n", 119 | "第 4 次迭代结果:\n", 120 | "[0.99999531 0.94402732] \n", 121 | "\n", 122 | "第 5 次迭代结果:\n", 123 | "[0.9999957 0.99999139] \n", 124 | "\n", 125 | "第 6 次迭代结果:\n", 126 | "[1. 1.] \n", 127 | "\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "image/png": "\n", 133 | "text/plain": [ 134 | "" 135 | ] 136 | }, 137 | "metadata": {}, 138 | "output_type": "display_data" 139 | } 140 | ], 141 | "source": [ 142 | "X1=np.arange(-1.5,1.5+0.05,0.05)\n", 143 | "X2=np.arange(-3.5,2+0.05,0.05)\n", 144 | "[x1,x2]=np.meshgrid(X1,X2)\n", 145 | "f=100*(x2-x1**2)**2+(1-x1)**2; # 给定的函数\n", 146 | "plt.contour(x1,x2,f,20) # 画出函数的20条轮廓线\n", 147 | "x0 = np.array([-1.2,1])\n", 148 | "W=newton(x0)\n", 149 | "\n", 150 | "plt.plot(W[0,:],W[1,:],'g*',W[0,:],W[1,:]) # 画出迭代点收敛的轨迹\n", 151 | "plt.show()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.6.5" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /Optimization_method/3.拟牛顿法.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "拟牛顿法的算法步骤如下:\n", 8 | "\n", 9 | "   1) 给出$x_0\\in R^n,H_0\\in R^{n*n},0\\le \\epsilon \\le1,k:=0$;\n", 10 | " \n", 11 | "   2) 若$\\bigtriangledown f(x^k)\\le \\epsilon$,迭代停止;否则求方向:$d_k=-J_k\\bigtriangledown f(x^k)$\n", 12 | " \n", 13 | "   3) 沿着方向做线性搜索$a_k>0$,令$x_{k+1}=x_{k}+a_kd_k$\n", 14 | " \n", 15 | "   4) 校正$H_K$产生$H_{k+1}$,使得牛顿条件成立\n", 16 | " \n", 17 | "   5) k=k+1,转第二步\n", 18 | "\n", 19 | "### 仅需一阶导数,就能完整整个迭代过程" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [] 35 | } 36 | ], 37 | "metadata": { 38 | "kernelspec": { 39 | "display_name": "Python 3", 40 | "language": "python", 41 | "name": "python3" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": { 45 | "name": "ipython", 46 | "version": 3 47 | }, 48 | "file_extension": ".py", 49 | "mimetype": "text/x-python", 50 | "name": "python", 51 | "nbconvert_exporter": "python", 52 | "pygments_lexer": "ipython3", 53 | "version": "3.6.5" 54 | } 55 | }, 56 | "nbformat": 4, 57 | "nbformat_minor": 2 58 | } 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python3 代码实现《统计学习方法》 2 | 3 | ## 实现说明 4 | 5 | 6 | ## 实现进度 7 | 8 | | 模型 | 是否实现 | 9 | | ----------- | ---------------------------------------- | 10 | | 感知机 | 是 | 11 | | kd树-KNN | 是 | 12 | | 朴素贝叶斯方法 | 是 | 13 | | 决策树 | 是 | 14 | | Adaboost | 是 | 15 | 16 | 17 | 18 | | 优化方法 | 是否实现 | 19 | | ----------- | ---------------------------------------- | 20 | | 梯度下降 | 是 | 21 | | 牛顿法 | 是 | 22 | | 拟牛顿法 | 是 | 23 | -------------------------------------------------------------------------------- /RNN/LSTM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import numpy as np\n", 20 | "import tensorflow as tf\n", 21 | "import matplotlib.pyplot as plt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "num_steps = 10\n", 31 | "batch_size = 200\n", 32 | "num_classes = 2\n", 33 | "state_size = 16\n", 34 | "learning_rate = 0.1" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def gen_data(size=1000000):\n", 44 | " X = np.array(np.random.choice(2, size=(size,)))\n", 45 | " Y = []\n", 46 | " '''根据规则生成Y'''\n", 47 | " for i in range(size): \n", 48 | " threshold = 0.5\n", 49 | " if X[i-3] == 1:\n", 50 | " threshold += 0.5\n", 51 | " if X[i-8] == 1:\n", 52 | " threshold -=0.25\n", 53 | " if np.random.rand() > threshold:\n", 54 | " Y.append(0)\n", 55 | " else:\n", 56 | " Y.append(1)\n", 57 | " return X, np.array(Y)\n", 58 | "\n", 59 | "\n", 60 | "'''生成batch数据'''\n", 61 | "def gen_batch(raw_data, batch_size, num_step):\n", 62 | " raw_x, raw_y = raw_data\n", 63 | " data_length = len(raw_x)\n", 64 | " batch_patition_length = data_length // batch_size # ->5000\n", 65 | " data_x = np.zeros([batch_size, batch_patition_length], dtype=np.int32) # ->(200, 5000)\n", 66 | " data_y = np.zeros([batch_size, batch_patition_length], dtype=np.int32) # ->(200, 5000)\n", 67 | " '''填到矩阵的对应位置'''\n", 68 | " for i in range(batch_size):\n", 69 | " data_x[i] = raw_x[batch_patition_length*i:batch_patition_length*(i+1)]# 每一行取batch_patition_length个数,即5000\n", 70 | " data_y[i] = raw_y[batch_patition_length*i:batch_patition_length*(i+1)]\n", 71 | " epoch_size = batch_patition_length // num_steps # ->5000/5=1000 就是每一轮的大小\n", 72 | " for i in range(epoch_size): # 抽取 epoch_size 个数据\n", 73 | " x = data_x[:, i * num_steps:(i + 1) * num_steps] # ->(200, 5)\n", 74 | " y = data_y[:, i * num_steps:(i + 1) * num_steps]\n", 75 | " yield (x, y) # yield 是生成器,生成器函数在生成值后会自动挂起并暂停他们的执行和状态(最后就是for循环结束后的结果,共有1000个(x, y))\n", 76 | "def gen_epochs(n, num_steps):\n", 77 | " for i in range(n):\n", 78 | " yield gen_batch(gen_data(), batch_size, num_steps)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "----> numclass 2 16\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "\n", 96 | "'''定义placeholder'''\n", 97 | "x = tf.placeholder(tf.int32, [batch_size, num_steps], name=\"x\")\n", 98 | "y = tf.placeholder(tf.int32, [batch_size, num_steps], name='y')\n", 99 | "init_state = tf.zeros([batch_size, state_size])\n", 100 | "init_c = tf.zeros([batch_size, state_size])\n", 101 | "'''RNN输入'''\n", 102 | "x_one_hot = tf.one_hot(x, num_classes)\n", 103 | "rnn_inputs = tf.unstack(x_one_hot, axis=1)\n", 104 | "\n", 105 | "print('----> numclass', num_classes, state_size)\n", 106 | "'''定义RNN cell'''\n", 107 | "# Input gate: input, previous output, and bias.\n", 108 | "ix = tf.Variable(tf.truncated_normal([num_classes, state_size], -0.1, 0.1))\n", 109 | "im = tf.Variable(tf.truncated_normal([state_size, state_size], -0.1, 0.1))\n", 110 | "ib = tf.Variable(tf.zeros([1, state_size]))\n", 111 | "# Forget gate: input, previous output, and bias.\n", 112 | "fx = tf.Variable(tf.truncated_normal([num_classes, state_size], -0.1, 0.1))\n", 113 | "fm = tf.Variable(tf.truncated_normal([state_size, state_size], -0.1, 0.1))\n", 114 | "fb = tf.Variable(tf.zeros([1, state_size]))\n", 115 | "# Memory cell: input, state and bias. \n", 116 | "cx = tf.Variable(tf.truncated_normal([num_classes, state_size], -0.1, 0.1))\n", 117 | "cm = tf.Variable(tf.truncated_normal([state_size, state_size], -0.1, 0.1))\n", 118 | "cb = tf.Variable(tf.zeros([1, state_size]))\n", 119 | "# Output gate: input, previous output, and bias.\n", 120 | "ox = tf.Variable(tf.truncated_normal([num_classes, state_size], -0.1, 0.1))\n", 121 | "om = tf.Variable(tf.truncated_normal([state_size, state_size], -0.1, 0.1))\n", 122 | "ob = tf.Variable(tf.zeros([1, state_size]))\n", 123 | "# Variables saving state across unrollings.\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "def lstm_cell(rnn_input, h, c):\n", 133 | " # a = tf.matmul(rnn_input, ix)\n", 134 | " # print('--a', a.get_shape())\n", 135 | " # a = tf.matmul(h, im)\n", 136 | " # print('--a', a.get_shape())\n", 137 | " input_gate = tf.sigmoid(tf.matmul(rnn_input, ix) + tf.matmul(h, im) + ib)\n", 138 | " forget_gate = tf.sigmoid(tf.matmul(rnn_input, fx) + tf.matmul(h, fm) + fb)\n", 139 | " update = tf.matmul(rnn_input, cx) + tf.matmul(c, cm) + cb\n", 140 | " state = forget_gate * c + input_gate * tf.tanh(update)\n", 141 | " output_gate = tf.sigmoid(tf.matmul(rnn_input, ox) + tf.matmul(h, om) + ob)\n", 142 | " return output_gate * tf.tanh(state), state\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "----> fianl state (200, 16)\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "state = init_state\n", 160 | "c = init_c\n", 161 | "rnn_outputs = []\n", 162 | "for rnn_input in rnn_inputs:\n", 163 | " state, c = lstm_cell(rnn_input, state, c) # state会重复使用,循环\n", 164 | " rnn_outputs.append(state)\n", 165 | "final_state = rnn_outputs[-1] # 得到最后的state\n", 166 | "print('----> fianl state', final_state.get_shape())\n", 167 | "\n", 168 | "# cell = tf.contrib.rnn.BasicRNNCell(num_units=state_size)\n", 169 | "# rnn_outputs, final_state = tf.contrib.rnn.static_rnn(cell=cell, inputs=rnn_inputs,\n", 170 | "# initial_state=init_state)\n", 171 | "# rnn_outputs, final_state = tf.nn.dynamic_rnn(cell=cell, inputs=rnn_inputs,\n", 172 | "# initial_state=init_state)\n", 173 | "\n", 174 | "\n", 175 | "'''预测,损失,优化'''\n", 176 | "with tf.variable_scope('softmax'):\n", 177 | " W = tf.get_variable('W', [state_size, num_classes])\n", 178 | " b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))\n", 179 | "logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]\n", 180 | "predictions = [tf.nn.softmax(logit) for logit in logits]\n", 181 | "\n", 182 | "y_as_list = tf.unstack(y, num=num_steps, axis=1)\n", 183 | "losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logit) for logit, label in\n", 184 | " zip(logits, y_as_list)]\n", 185 | "total_loss = tf.reduce_mean(losses)\n", 186 | "train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.5" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /RNN/RNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import numpy as np\n", 20 | "import tensorflow as tf\n", 21 | "import matplotlib.pyplot as plt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "接下来定义超参数" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "num_steps = 10\n", 38 | "batch_size = 200\n", 39 | "num_classes = 2\n", 40 | "state_size = 16\n", 41 | "learning_rate = 0.1" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "接下来生成数据" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "def gen_data(size=1000000):\n", 58 | " X = np.array(np.random.choice(2, size=(size,)))\n", 59 | " Y = []\n", 60 | " '''根据规则生成Y'''\n", 61 | " for i in range(size): \n", 62 | " threshold = 0.5\n", 63 | " if X[i-3] == 1:\n", 64 | " threshold += 0.5\n", 65 | " if X[i-8] == 1:\n", 66 | " threshold -=0.25\n", 67 | " if np.random.rand() > threshold:\n", 68 | " Y.append(0)\n", 69 | " else:\n", 70 | " Y.append(1)\n", 71 | " return X, np.array(Y)\n", 72 | "\n", 73 | "\n", 74 | "'''生成batch数据'''\n", 75 | "def gen_batch(raw_data, batch_size, num_step):\n", 76 | " raw_x, raw_y = raw_data\n", 77 | " data_length = len(raw_x)\n", 78 | " batch_patition_length = data_length // batch_size # ->5000\n", 79 | " data_x = np.zeros([batch_size, batch_patition_length], dtype=np.int32) # ->(200, 5000)\n", 80 | " data_y = np.zeros([batch_size, batch_patition_length], dtype=np.int32) # ->(200, 5000)\n", 81 | " '''填到矩阵的对应位置'''\n", 82 | " for i in range(batch_size):\n", 83 | " data_x[i] = raw_x[batch_patition_length*i:batch_patition_length*(i+1)]# 每一行取batch_patition_length个数,即5000\n", 84 | " data_y[i] = raw_y[batch_patition_length*i:batch_patition_length*(i+1)]\n", 85 | " epoch_size = batch_patition_length // num_steps # ->5000/5=1000 就是每一轮的大小\n", 86 | " for i in range(epoch_size): # 抽取 epoch_size 个数据\n", 87 | " x = data_x[:, i * num_steps:(i + 1) * num_steps] # ->(200, 5)\n", 88 | " y = data_y[:, i * num_steps:(i + 1) * num_steps]\n", 89 | " yield (x, y) # yield 是生成器,生成器函数在生成值后会自动挂起并暂停他们的执行和状态(最后就是for循环结束后的结果,共有1000个(x, y))\n", 90 | "def gen_epochs(n, num_steps):\n", 91 | " for i in range(n):\n", 92 | " yield gen_batch(gen_data(), batch_size, num_steps)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "接下来定义网络结构" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# 先定义placeholder\n", 109 | "x = tf.placeholder(tf.int32, [batch_size, num_steps], name='x')\n", 110 | "y = tf.placeholder(tf.int32, [batch_size, num_steps], name='y')\n", 111 | "init_state = tf.zeros([batch_size, state_size], name='init_state')\n", 112 | "'''RNN输入'''\n", 113 | "x_one_hot = tf.one_hot(x, num_classes)\n", 114 | "rnn_inputs = tf.unstack(x_one_hot, axis=1)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "这里rnn_cell里面的W和b代表着共用着同一个(就是在时间步下来是共享参数的),这也就是为什么要分开定义rnn_cell和w、b" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "'''接下来定义RNN_cell'''\n", 131 | "with tf.variable_scope('rnn_cell'):\n", 132 | " W = tf.get_variable('W', [num_classes + state_size, state_size]) # 这里其实是 [W, U]\n", 133 | " b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))\n", 134 | "\n", 135 | "# 从这里开始写RNN的公式,当然tf可以自动算梯度\n", 136 | "def rnn_cell(rnn_input, state):\n", 137 | " with tf.variable_scope('rnn_cell', reuse=True):\n", 138 | " W = tf.get_variable('W', [num_classes + state_size, state_size])\n", 139 | " b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))\n", 140 | " return tf.tanh(tf.matmul(tf.concat([rnn_input, state], axis=1), W) + b)\n", 141 | "\n", 142 | " " 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "需要注意的是这里的init_state十分重要" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "'''这里开始做循环的操作'''\n", 159 | "state = init_state\n", 160 | "rnn_outputs = []\n", 161 | "for rnn_input in rnn_inputs:\n", 162 | " state = rnn_cell(rnn_input, state)\n", 163 | " rnn_outputs.append(state)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 7, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "with tf.variable_scope('project'):\n", 173 | " # 这一块是吧多个神经元映射到了一个上面\n", 174 | " W = tf.get_variable('W', [state_size, num_classes])\n", 175 | " b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))\n", 176 | "\n", 177 | "logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]\n", 178 | "# 接下来接一个softmax层\n", 179 | "pred = [tf.nn.softmax(logit) for logit in logits]\n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "y_as_list = tf.unstack(y, num=num_steps, axis=1)\n", 189 | "losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label,logits=logit) for logit, label in zip(logits, y_as_list)]\n", 190 | "total_loss = tf.reduce_mean(losses)\n", 191 | "train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.6.5" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 2 223 | } 224 | --------------------------------------------------------------------------------