├── README.md
└── code
    ├── knn.py
    └── mnist_keras.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine-Learning-Algorithms

------

> * [Project repository](https://github.com/huxiaoman7/Machine-Learning-Algorithms.git): https://github.com/huxiaoman7/Machine-Learning-Algorithms.git
> * [Companion blog](http://www.cnblogs.com/charlotte77/): http://www.cnblogs.com/charlotte77/

------
### Requirements

> * Python 2.7
> * NumPy
> * matplotlib
> * SciPy
> * scikit-learn
> * pandas
------
### Posts
#### 1. [Machine Learning Basics and Practice (1): Data Cleaning](http://www.cnblogs.com/charlotte77/p/5606926.html)
#### 2. [Machine Learning Basics and Practice (2): Data Transformation](http://www.cnblogs.com/charlotte77/p/5622325.html)
#### 3. [Machine Learning Basics and Practice (3): Dimensionality Reduction with PCA](http://www.cnblogs.com/charlotte77/p/5625984.html) (a minimal PCA sketch follows this list)
#### 4. To be added
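As a taste of post 3's topic, here is a minimal sketch of PCA with scikit-learn on the iris data (illustration only; this code is not taken from the posts):

```python
# Minimal PCA sketch: project the 4-D iris features onto the two
# directions of largest variance.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
pca = PCA(n_components=2)
X2 = pca.fit_transform(iris.data)      # shape (150, 2)
print(pca.explained_variance_ratio_)   # variance captured by each component
```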
------
This series was originally planned for June through October 2016, but after three posts it stalled because of a heavy workload and health issues. The current plan is to update the deep learning series first, with occasional updates to this machine learning series interleaved; once the deep learning series is finished, this one will get dedicated time. The first three posts were written last year and may contain mistakes. If you find any, please point them out. Thanks! ^_^
--------------------------------------------------------------------------------
/code/knn.py:
--------------------------------------------------------------------------------
#coding:utf-8
# k-nearest neighbours on two iris classes, using a hand-built kd-tree.
import numpy as np
from math import sqrt
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

# Keep the first two features of the first two classes (setosa vs. versicolor),
# so the problem is easy to plot.
data = np.array(df.iloc[:100, [0, 1, -1]])
train, test = train_test_split(data, test_size=0.1)
x0 = np.array([x for x in train if x[-1] == 0])
x1 = np.array([x for x in train if x[-1] == 1])


def show_train():
    plt.scatter(x0[:, 0], x0[:, 1], c='pink', label='[0]')
    plt.scatter(x1[:, 0], x1[:, 1], c='orange', label='[1]')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')


class Node:
    def __init__(self, data, depth=0, lchild=None, rchild=None):
        self.data = data
        self.depth = depth
        self.lchild = lchild
        self.rchild = rchild


class KdTree:
    def __init__(self):
        self.KdTree = None
        self.n = 0
        self.nearest = None

    def create(self, dataSet, depth=0):
        # Build the tree recursively: at each depth, split on one coordinate
        # axis (cycling through the features) at the median point.
        if len(dataSet) > 0:
            m, n = np.shape(dataSet)
            self.n = n - 1                 # number of features (last column is the label)
            axis = depth % self.n          # splitting axis cycles with depth
            mid = m // 2
            dataSetcopy = sorted(dataSet, key=lambda x: x[axis])
            node = Node(dataSetcopy[mid], depth)
            if depth == 0:
                self.KdTree = node
            node.lchild = self.create(dataSetcopy[:mid], depth + 1)
            node.rchild = self.create(dataSetcopy[mid + 1:], depth + 1)
            return node
        return None

    def preOrder(self, node):
        if node is not None:
            print(node.depth, node.data)
            self.preOrder(node.lchild)
            self.preOrder(node.rchild)
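    # k-nearest-neighbour query: descend toward the leaf that would contain x,
    # then backtrack, keeping the `count` smallest distances seen so far in
    # self.nearest (sorted ascending; -1 marks a still-empty slot). A subtree
    # on the far side of a split is visited only when the splitting plane lies
    # closer to x than the current worst of the saved distances.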
    def search(self, x, count=1):
        nearest = []
        for i in range(count):
            nearest.append([-1, None])
        self.nearest = np.array(nearest)

        def recurve(node):
            if node is not None:
                axis = node.depth % self.n
                daxis = x[axis] - node.data[axis]  # signed distance to the splitting plane
                if daxis < 0:
                    recurve(node.lchild)
                else:
                    recurve(node.rchild)

                # zip() stops at the shorter sequence, so node.data's label column is ignored
                dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(x, node.data)))
                for i, d in enumerate(self.nearest):
                    if d[0] < 0 or dist < d[0]:
                        self.nearest = np.insert(self.nearest, i, [dist, node], axis=0)
                        self.nearest = self.nearest[:-1]
                        break

                n = list(self.nearest[:, 0]).count(-1)  # number of still-empty slots
                if self.nearest[-n - 1, 0] > abs(daxis):
                    if daxis < 0:
                        recurve(node.rchild)
                    else:
                        recurve(node.lchild)

        recurve(self.KdTree)

        # majority vote over the labels of the k nearest points
        knn = self.nearest[:, 1]
        belong = []
        for i in knn:
            belong.append(i.data[-1])
        b = max(set(belong), key=belong.count)

        return self.nearest, b


kdt = KdTree()
kdt.create(train)
kdt.preOrder(kdt.KdTree)

score = 0
for x in test:
    input('press Enter to show next:')
    show_train()
    plt.scatter(x[0], x[1], c='red', marker='x')  # test point
    near, belong = kdt.search(x[:-1], 5)  # number of neighbours to use
    if belong == x[-1]:
        score += 1
    print("test:")
    print(x, "predict:", belong)
    print("nearest:")
    for n in near:
        print(n[1].data, "dist:", n[0])
        plt.scatter(n[1].data[0], n[1].data[1], c='green', marker='+')  # the k nearest neighbours
    plt.legend()
    plt.show()

score /= len(test)
print("score:", score)
--------------------------------------------------------------------------------
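A quick cross-check of the kd-tree classifier above against scikit-learn's built-in k-NN (a sketch, assuming the `train` and `test` arrays from code/knn.py are in scope; the two accuracies should roughly agree):

```python
# Cross-check the hand-built kd-tree with scikit-learn's k-NN (sketch;
# assumes `train`/`test` of shape (n, 3) with the class label last).
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)  # same k as kdt.search(x, 5)
clf.fit(train[:, :-1], train[:, -1])       # features, labels
print("sklearn score:", clf.score(test[:, :-1], test[:, -1]))
```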
--------------------------------------------------------------------------------
/code/mnist_keras.py:
--------------------------------------------------------------------------------
# Larger CNN for the MNIST dataset (written against the Keras 1.x API)
import numpy
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# load data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# reshape to [samples][channels][rows][cols] (channels-first ordering)
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32')
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255
# one-hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]

# define the larger model
def larger_model():
    # create model: two conv/pool blocks, then three dense layers,
    # with dropout after every block to curb overfitting
    model = Sequential()
    model.add(Convolution2D(30, 5, 5, border_mode='valid', input_shape=(1, 28, 28), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.4))
    model.add(Convolution2D(15, 3, 3, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.4))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# build the model
model = larger_model()
# fit the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=200, batch_size=200, verbose=2)
# final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Large CNN Error: %.2f%%" % (100 - scores[1] * 100))
--------------------------------------------------------------------------------
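mnist_keras.py targets the Keras 1.x API (`Convolution2D` with positional kernel sizes, `border_mode`, `nb_epoch`, `np_utils`). Under Keras 2 the same architecture would read roughly as below; this is an untested sketch that switches to the channels-last layout Keras 2 defaults to, so the input reshape becomes `X.reshape(-1, 28, 28, 1)`:

```python
# Keras 2 sketch of larger_model() (assumes channels-last data:
# X = X.reshape(-1, 28, 28, 1), labels via keras.utils.to_categorical).
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense

model = Sequential([
    Conv2D(30, (5, 5), padding='valid', activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    Conv2D(15, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(50, activation='relu'),
    Dropout(0.4),
    Dense(10, activation='softmax'),  # MNIST has 10 classes
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(..., epochs=..., batch_size=200)  # nb_epoch became epochs in Keras 2
```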