├── README.md
└── code
    ├── knn.py
    └── mnist_keras.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine-Learning-Algorithms

------

> * [Project repository](https://github.com/huxiaoman7/Machine-Learning-Algorithms.git): https://github.com/huxiaoman7/Machine-Learning-Algorithms.git
> * [Companion blog](http://www.cnblogs.com/charlotte77/): http://www.cnblogs.com/charlotte77/

------
### Requirements

> * Python 2.7
> * NumPy
> * matplotlib
> * SciPy
> * scikit-learn
> * pandas
------
### Posts
#### 1. [Machine Learning Basics and Practice (1): Data Cleaning](http://www.cnblogs.com/charlotte77/p/5606926.html)
#### 2. [Machine Learning Basics and Practice (2): Data Transformation](http://www.cnblogs.com/charlotte77/p/5622325.html)
#### 3. [Machine Learning Basics and Practice (3): Dimensionality Reduction with PCA](http://www.cnblogs.com/charlotte77/p/5625984.html) (a minimal PCA sketch follows this list)
#### 4. To be added
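As a taste of post 3's topic, here is a minimal sketch of PCA with scikit-learn on the iris data (illustration only; this code is not taken from the posts):

```python
# Minimal PCA sketch: project the 4-D iris features onto the two
# directions of largest variance.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
pca = PCA(n_components=2)
X2 = pca.fit_transform(iris.data)      # shape (150, 2)
print(pca.explained_variance_ratio_)   # variance captured by each component
```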
------
This series was originally planned for June through October 2016, but after three posts it stalled because of a heavy workload and health issues. The current plan is to update the deep learning series first, with occasional updates to this machine learning series interleaved; once the deep learning series is finished, this one will get dedicated time. The first three posts were written last year and may contain mistakes. If you find any, please point them out. Thanks! ^_^
--------------------------------------------------------------------------------
/code/knn.py:
--------------------------------------------------------------------------------
#coding:utf-8
# k-nearest neighbours on two iris classes, using a hand-built kd-tree.
import numpy as np
from math import sqrt
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

# Keep the first two features of the first two classes (setosa vs. versicolor),
# so the problem is easy to plot.
data = np.array(df.iloc[:100, [0, 1, -1]])
train, test = train_test_split(data, test_size=0.1)
x0 = np.array([x for x in train if x[-1] == 0])
x1 = np.array([x for x in train if x[-1] == 1])


def show_train():
    plt.scatter(x0[:, 0], x0[:, 1], c='pink', label='[0]')
    plt.scatter(x1[:, 0], x1[:, 1], c='orange', label='[1]')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')


class Node:
    def __init__(self, data, depth=0, lchild=None, rchild=None):
        self.data = data
        self.depth = depth
        self.lchild = lchild
        self.rchild = rchild


class KdTree:
    def __init__(self):
        self.KdTree = None
        self.n = 0
        self.nearest = None

    def create(self, dataSet, depth=0):
        # Build the tree recursively: at each depth, split on one coordinate
        # axis (cycling through the features) at the median point.
        if len(dataSet) > 0:
            m, n = np.shape(dataSet)
            self.n = n - 1                 # number of features (last column is the label)
            axis = depth % self.n          # splitting axis cycles with depth
            mid = m // 2
            dataSetcopy = sorted(dataSet, key=lambda x: x[axis])
            node = Node(dataSetcopy[mid], depth)
            if depth == 0:
                self.KdTree = node
            node.lchild = self.create(dataSetcopy[:mid], depth + 1)
            node.rchild = self.create(dataSetcopy[mid + 1:], depth + 1)
            return node
        return None

    def preOrder(self, node):
        if node is not None:
            print(node.depth, node.data)
            self.preOrder(node.lchild)
            self.preOrder(node.rchild)
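    # k-nearest-neighbour query: descend toward the leaf that would contain x,
    # then backtrack, keeping the `count` smallest distances seen so far in
    # self.nearest (sorted ascending; -1 marks a still-empty slot). A subtree
    # on the far side of a split is visited only when the splitting plane lies
    # closer to x than the current worst of the saved distances.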
    def search(self, x, count=1):
        nearest = []
        for i in range(count):
            nearest.append([-1, None])
        self.nearest = np.array(nearest)

        def recurve(node):
            if node is not None:
                axis = node.depth % self.n
                daxis = x[axis] - node.data[axis]  # signed distance to the splitting plane
                if daxis < 0:
                    recurve(node.lchild)
                else:
                    recurve(node.rchild)

                # zip() stops at the shorter sequence, so node.data's label column is ignored
                dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(x, node.data)))
                for i, d in enumerate(self.nearest):
                    if d[0] < 0 or dist < d[0]:
                        self.nearest = np.insert(self.nearest, i, [dist, node], axis=0)
                        self.nearest = self.nearest[:-1]
                        break

                n = list(self.nearest[:, 0]).count(-1)  # number of still-empty slots
                if self.nearest[-n - 1, 0] > abs(daxis):
                    if daxis < 0:
                        recurve(node.rchild)
                    else:
                        recurve(node.lchild)

        recurve(self.KdTree)

        # majority vote over the labels of the k nearest points
        knn = self.nearest[:, 1]
        belong = []
        for i in knn:
            belong.append(i.data[-1])
        b = max(set(belong), key=belong.count)

        return self.nearest, b


kdt = KdTree()
kdt.create(train)
kdt.preOrder(kdt.KdTree)

score = 0
for x in test:
    input('press Enter to show next:')
    show_train()
    plt.scatter(x[0], x[1], c='red', marker='x')  # test point
    near, belong = kdt.search(x[:-1], 5)  # number of neighbours to use
    if belong == x[-1]:
        score += 1
    print("test:")
    print(x, "predict:", belong)
    print("nearest:")
    for n in near:
        print(n[1].data, "dist:", n[0])
        plt.scatter(n[1].data[0], n[1].data[1], c='green', marker='+')  # the k nearest neighbours
    plt.legend()
    plt.show()

score /= len(test)
print("score:", score)
--------------------------------------------------------------------------------
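A quick cross-check of the kd-tree classifier above against scikit-learn's built-in k-NN (a sketch, assuming the `train` and `test` arrays from code/knn.py are in scope; the two accuracies should roughly agree):

```python
# Cross-check the hand-built kd-tree with scikit-learn's k-NN (sketch;
# assumes `train`/`test` of shape (n, 3) with the class label last).
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)  # same k as kdt.search(x, 5)
clf.fit(train[:, :-1], train[:, -1])       # features, labels
print("sklearn score:", clf.score(test[:, :-1], test[:, -1]))
```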
--------------------------------------------------------------------------------
/code/mnist_keras.py:
--------------------------------------------------------------------------------
# Larger CNN for the MNIST dataset (written against the Keras 1.x API)
import numpy
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# load data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# reshape to [samples][channels][rows][cols] (channels-first ordering)
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32')
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255
# one-hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]

# define the larger model
def larger_model():
    # create model: two conv/pool blocks, then three dense layers,
    # with dropout after every block to curb overfitting
    model = Sequential()
    model.add(Convolution2D(30, 5, 5, border_mode='valid', input_shape=(1, 28, 28), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.4))
    model.add(Convolution2D(15, 3, 3, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.4))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# build the model
model = larger_model()
# fit the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=200, batch_size=200, verbose=2)
# final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Large CNN Error: %.2f%%" % (100 - scores[1] * 100))
--------------------------------------------------------------------------------
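mnist_keras.py targets the Keras 1.x API (`Convolution2D` with positional kernel sizes, `border_mode`, `nb_epoch`, `np_utils`). Under Keras 2 the same architecture would read roughly as below; this is an untested sketch that switches to the channels-last layout Keras 2 defaults to, so the input reshape becomes `X.reshape(-1, 28, 28, 1)`:

```python
# Keras 2 sketch of larger_model() (assumes channels-last data:
# X = X.reshape(-1, 28, 28, 1), labels via keras.utils.to_categorical).
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense

model = Sequential([
    Conv2D(30, (5, 5), padding='valid', activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    Conv2D(15, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(50, activation='relu'),
    Dropout(0.4),
    Dense(10, activation='softmax'),  # MNIST has 10 classes
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(..., epochs=..., batch_size=200)  # nb_epoch became epochs in Keras 2
```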