├── .gitignore ├── CNN ├── Libs │ ├── AlexNet.py │ ├── InceptionNet.py │ ├── SimpleCNN.py │ ├── __init__.py │ └── __pycache__ │ │ ├── AlexNet.cpython-35.pyc │ │ ├── SimpleCNN.cpython-35.pyc │ │ └── __init__.cpython-35.pyc ├── README.md ├── Units │ ├── CNNUnit.py │ ├── YCFUnit.py │ ├── __init__.py │ └── __pycache__ │ │ ├── CNNUnit.cpython-35.pyc │ │ ├── __init__.cpython-35.pyc │ │ ├── model.cpython-35.pyc │ │ └── test1.cpython-35.pyc └── Utils │ ├── DataUtil.py │ ├── __init__.py │ └── __pycache__ │ ├── DataUtil.cpython-35.pyc │ └── __init__.cpython-35.pyc ├── DecisionTree ├── Lib │ ├── DecisionTreeLib.py │ ├── RFLib.py │ ├── __init__.py │ └── __pycache__ │ │ ├── DecisionTreeLib.cpython-36.pyc │ │ ├── RFLib.cpython-36.pyc │ │ └── __init__.cpython-36.pyc ├── README.md ├── Unit │ ├── DecisionTreeUnit.py │ └── __init__.py └── Util │ ├── DataUtil.py │ ├── RandomUtil.py │ ├── __init__.py │ └── __pycache__ │ ├── DataUtil.cpython-36.pyc │ ├── RandomUtil.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── FacePlus ├── Libs │ ├── AverageFace.py │ ├── __init__.py │ └── __pycache__ │ │ ├── AverageFace.cpython-35.pyc │ │ └── __init__.cpython-35.pyc ├── README.md ├── Unit │ ├── CaptureUnit.py │ ├── __init__.py │ └── ghostdriver.log ├── Utils │ ├── LandMarkUtil.py │ ├── ZhiHuUtil.py │ ├── __init__.py │ └── ghostdriver.log ├── dataset │ └── presidents │ │ ├── barak-obama.jpg │ │ ├── bill-clinton.jpg │ │ ├── george-h-bush.jpg │ │ ├── george-w-bush.jpg │ │ ├── jimmy-carter.jpg │ │ └── ronald-regan.jpg └── result │ ├── example.jpg │ └── example_1.jpg ├── FaceReplace ├── Lib │ ├── AGNModel.py │ ├── AutoEncoder.py │ ├── __init__.py │ └── __pycache__ │ │ ├── AutoEncoder.cpython-35.pyc │ │ └── __init__.cpython-35.pyc ├── Main │ ├── FaceAverage.py │ ├── FaceReplace.py │ └── __init__.py ├── README.md └── Tools │ ├── DataObject.py │ ├── Detector.py │ ├── PhotoScrawler.py │ ├── __init__.py │ └── __pycache__ │ ├── DataObject.cpython-35.pyc │ ├── DataObject.cpython-36.pyc │ ├── Detector.cpython-35.pyc │ ├── __init__.cpython-35.pyc │ └── __init__.cpython-36.pyc ├── KNN ├── Lib │ ├── KNNLib.py │ ├── __init__.py │ └── __pycache__ │ │ ├── KNNLib.cpython-36.pyc │ │ └── __init__.cpython-36.pyc ├── README.md └── Unit │ ├── KNNUnit.py │ └── __init__.py ├── LICENSE ├── LinearRegression ├── Lib │ ├── LogisticLib.py │ ├── RFLib.py │ ├── __init__.py │ └── __pycache__ │ │ ├── LogisticLib.cpython-35.pyc │ │ ├── LogisticLib.cpython-36.pyc │ │ ├── RFLib.cpython-35.pyc │ │ ├── RFLib.cpython-36.pyc │ │ ├── __init__.cpython-35.pyc │ │ └── __init__.cpython-36.pyc ├── README.md ├── Unit │ ├── LRUnit.py │ └── __init__.py └── Util │ ├── DataUtil.py │ ├── RandomUtil.py │ ├── __init__.py │ └── __pycache__ │ ├── DataUtil.cpython-35.pyc │ ├── DataUtil.cpython-36.pyc │ ├── RandomUtil.cpython-35.pyc │ ├── RandomUtil.cpython-36.pyc │ ├── __init__.cpython-35.pyc │ └── __init__.cpython-36.pyc ├── README.md ├── RNN ├── Lib │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── model.cpython-35.pyc │ └── model.py ├── README.md └── Units │ ├── RNNUnit.py │ ├── Word2VecUnit.py │ ├── __init__.py │ ├── __pycache__ │ ├── Word2VecUnit.cpython-35.pyc │ ├── __init__.cpython-35.pyc │ ├── handWrittenUnit.cpython-35.pyc │ └── test.cpython-35.pyc │ ├── handWrittenUnit.py │ ├── handWrittenUnit_2.py │ └── tempTest.py ├── RandomForest ├── Lib │ ├── RFLib.py │ ├── __init__.py │ └── __pycache__ │ │ ├── RFLib.cpython-36.pyc │ │ └── __init__.cpython-36.pyc ├── README.md ├── Unit │ ├── RFUnit.py │ └── __init__.py └── Util │ ├── DataUtil.py │ ├── 
RandomUtil.py │ ├── __init__.py │ └── __pycache__ │ ├── DataUtil.cpython-36.pyc │ ├── RandomUtil.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── SVM ├── Lib │ ├── RFLib.py │ ├── SVMLib.py │ ├── __init__.py │ └── __pycache__ │ │ ├── RFLib.cpython-36.pyc │ │ ├── SVMLib.cpython-36.pyc │ │ └── __init__.cpython-36.pyc ├── README.md ├── Unit │ ├── SVMUnit.py │ └── __init__.py └── Util │ ├── DataUtil.py │ ├── RandomUtil.py │ ├── __init__.py │ └── __pycache__ │ ├── DataUtil.cpython-36.pyc │ ├── RandomUtil.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── notebooks ├── RNN Study Notes.ipynb ├── Singular value decomposition.ipynb ├── Softmax Regression.ipynb ├── Tensorflow document.ipynb └── Word2Vector.ipynb ├── pics ├── fa-1.jpg ├── fr-1.jpg ├── fr-2.jpg ├── fr-3.jpg ├── fr-4.jpg ├── fr-7.jpg ├── fr-8.jpg ├── fr-9.jpg ├── rnn-1.png ├── rnn-2.png ├── rnn.png ├── svd.jpg ├── svd1.jpg ├── svd2.jpg └── svd3.jpg └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # for idea 2 | */.idea/ 3 | 4 | # for jupyter notebook 5 | */.ipynb_checkpoints/ -------------------------------------------------------------------------------- /CNN/Libs/AlexNet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月2日 3 | 4 | @author: IL MARE 5 | ''' 6 | from tensorflow.examples.tutorials.mnist import input_data 7 | import tensorflow as tf 8 | import numpy as np 9 | from Utils.DataUtil import ImageObject 10 | from matplotlib import pyplot as plt 11 | import matplotlib as mpl 12 | 13 | save_path = r"G:/Machine-Learning/python/CNN/modelFile/AlexNet/dogandcat/" 14 | 15 | def weight_variable(shape): 16 | initial = tf.truncated_normal(shape, dtype=tf.float32, stddev=0.1) 17 | return tf.Variable(initial) 18 | 19 | def bias_variable(shape): 20 | initial = tf.constant(0.1, shape=shape) 21 | return tf.Variable(initial, dtype=tf.float32) 22 | 23 | def conv2d(x, W, strides, padding="VALID"): 24 | return tf.nn.conv2d(x, W, strides=strides, padding=padding) 25 | 26 | class AlexNet: 27 | def __init__(self, lr, k, classify, maxIter, imageObject): 28 | self._imageObject = imageObject 29 | self._maxIter = maxIter 30 | self._k = k 31 | self._lr = lr 32 | self._classify = classify 33 | self.defineNetwork() 34 | self.defineLoss() 35 | @property 36 | def classify(self): 37 | return self._classify 38 | @property 39 | def keep_prob(self): 40 | return self._keep_prob 41 | @property 42 | def lr(self): 43 | return self._lr 44 | def defineNetwork(self): 45 | self._x = tf.placeholder(dtype=tf.float32, shape=[None, 224, 224, 3]) 46 | image = self._x / 255.0 47 | # image = tf.reshape(self._x, [-1, 28, 28, self._k]) 48 | self._y = tf.placeholder(tf.float32, [None, self._classify]) 49 | self._keep_prob = tf.placeholder(dtype=tf.float32) 50 | with tf.name_scope("conv1") as scope: 51 | kernel = tf.Variable(tf.truncated_normal([11, 11, self._k, 96], 52 | stddev=0.1, dtype=tf.float32)) 53 | h_conv1 = conv2d(image, kernel, [1, 4, 4, 1]) 54 | biases = tf.Variable(tf.constant(0.0, shape=[96], dtype=tf.float32)) 55 | conv1 = tf.nn.relu(h_conv1 + biases) 56 | pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], 57 | strides=[1, 2, 2, 1], padding="SAME") 58 | with tf.name_scope("conv2") as scope: 59 | kernel = tf.Variable(tf.truncated_normal([5, 5, 96, 256], 60 | stddev=0.1, dtype=tf.float32)) 61 | h_conv2 = conv2d(pool1, kernel, strides=[1, 1, 1, 1], padding="SAME") 62 | biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32)) 63 
| conv2 = tf.nn.relu(h_conv2 + biases) 64 | pool2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], 65 | strides=[1, 2, 2, 1], padding="SAME") 66 | with tf.name_scope("conv3") as scope: 67 | kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 384], 68 | stddev=0.1, dtype=tf.float32)) 69 | h_conv3 = conv2d(pool2, kernel, [1, 1, 1, 1], "SAME") 70 | biases = tf.Variable(tf.constant(0.0, dtype=tf.float32, shape=[384])) 71 | conv3 = tf.nn.relu(h_conv3 + biases) 72 | with tf.name_scope("conv4") as scope: 73 | kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 384], 74 | dtype=tf.float32, stddev=0.1)) 75 | h_conv4 = conv2d(conv3, kernel, [1, 1, 1, 1], "SAME") 76 | biases= tf.Variable(tf.constant(0.0, tf.float32, shape=[384]), dtype=tf.float32) 77 | conv4 = tf.nn.relu(h_conv4 + biases) 78 | with tf.name_scope("conv5") as scope: 79 | kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256], 80 | dtype=tf.float32, stddev=0.1)) 81 | h_conv5 = conv2d(conv4, kernel, [1, 1, 1, 1], "SAME") 82 | biases = tf.Variable(tf.constant(0.0, dtype=tf.float32, shape=[256])) 83 | conv5 = tf.nn.relu(h_conv5 + biases) 84 | pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], 85 | strides=[1, 2, 2, 1], padding="SAME") 86 | self._dim = 1 87 | var = pool5.get_shape().as_list() 88 | for i in range(len(var) - 1): 89 | self._dim *= var[i + 1] 90 | pool5 = tf.reshape(pool5, [-1, self._dim]) 91 | with tf.name_scope("link1") as scope: 92 | kernel = tf.Variable(tf.truncated_normal([self._dim, 4096], stddev=0.1, dtype=tf.float32)) 93 | biases = tf.Variable(tf.constant(0.0, dtype=tf.float32, shape=[4096])) 94 | h_fc = tf.nn.dropout(tf.matmul(pool5, kernel) + biases, keep_prob=self._keep_prob) 95 | with tf.name_scope("link2") as scope: 96 | kernel = tf.Variable(tf.truncated_normal([4096, 4096], stddev=0.1, dtype=tf.float32)) 97 | biases = tf.Variable(tf.constant(0.0, dtype=tf.float32, shape=[4096])) 98 | h_fc1 = tf.nn.dropout(tf.matmul(h_fc, kernel) + biases, keep_prob=self._keep_prob) 99 | with tf.name_scope("link3") as scope: 100 | kernel = tf.Variable(tf.truncated_normal([4096, self._classify], stddev=0.1, dtype=tf.float32)) 101 | biases = tf.Variable(tf.constant(0.0, dtype=tf.float32, shape=[self._classify])) 102 | self._out = tf.matmul(h_fc1, kernel) + biases 103 | self._pre = tf.nn.softmax(self._out) 104 | def defineLoss(self): 105 | self._cross_entry = tf.reduce_mean( 106 | tf.nn.softmax_cross_entropy_with_logits(logits=self._out, labels=self._y)) 107 | self._accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self._y, 1), 108 | tf.argmax(self._pre, 1)), dtype=tf.float32)) 109 | # vars = tf.trainable_variables() 110 | # grads, _ = tf.clip_by_global_norm(tf.gradients(self._cross_entry, vars), 5) 111 | # optimizer = tf.train.AdamOptimizer(self._lr) 112 | # self._train = optimizer.apply_gradients(zip(grads, vars)) 113 | self._train = tf.train.AdamOptimizer(self._lr).minimize(self._cross_entry) 114 | def train(self): 115 | try: 116 | # mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 117 | fig = plt.figure("cross-entropy") 118 | mpl.rcParams['xtick.labelsize'] = 8 119 | mpl.rcParams['ytick.labelsize'] = 8 120 | ax = fig.add_subplot(111) 121 | ax.grid(True) 122 | ac = [] 123 | aac = [] 124 | with tf.Session() as sess: 125 | sess.run(tf.global_variables_initializer()) 126 | for i in range(self._maxIter): 127 | # train, label = mnist.train.next_batch(50) 128 | train, label = self._imageObject.nextBatch(24) 129 | _, accuracy, loss = sess.run([self._train, self._accuracy, self._cross_entry], feed_dict={self._x: train, 130 
| self._y: label, self._keep_prob: 0.5}) 131 | ac.append(accuracy) 132 | aac.append(np.mean(np.array(ac))) 133 | ax.plot(np.arange(len(ac)), np.array(ac), linewidth=0.8, color="b") 134 | ax.plot(np.arange(len(aac)), np.array(aac), linewidth=0.8, color="r") 135 | plt.pause(0.1) 136 | if i % 10 == 0: 137 | print("step {0:d}/{1:d},accuracy: {2:.3f}, loss: {3:.3f}".format(i, self._maxIter, accuracy, loss)) 138 | if i % 100 == 0: 139 | tf.train.Saver().save(sess, "{0}model".format(save_path), global_step=i) 140 | except Exception as e: 141 | print(e) 142 | finally: 143 | plt.show() 144 | def loadModel(self): 145 | self._sess = tf.Session() 146 | tf.train.Saver().restore(self._sess, tf.train.latest_checkpoint(save_path)) 147 | def testCatAndDog(self): 148 | result = [] 149 | for img, label in self._imageObject.generateTestBatch(50): 150 | accuracy = self._sess.run(self._accuracy, 151 | feed_dict={self._x: img, self._y: label, self._keep_prob:1.0}) 152 | result.append(accuracy) 153 | print("step:{0:d}, accuracy: {1:.3f}".format(len(result), accuracy)) 154 | print("average accuracy:", np.mean(np.array(result))) 155 | def test(self): 156 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 157 | count = 0 158 | i = 0 159 | for img, label in zip(mnist.test.images, mnist.test.labels): 160 | img = np.reshape(img, [1, 784]) 161 | label = np.reshape(label, [1, 10]) 162 | pre = self._sess.run(self._pre, 163 | feed_dict={self._x: img, self._y: label, self._keep_prob:1.0}) 164 | if np.equal(np.argmax(pre, 1), np.argmax(label, 1)): 165 | count += 1 166 | i += 1 167 | if i % 100 == 0: 168 | print("step: {0:d}/{1:d}, accuracy: {2:.3f}".format(i, len(mnist.test.images), count / i)) 169 | print("accuracy: ", (count / i)) 170 | 171 | file_path = r"G:/研究生课件/人工神经网络/神经网络/dataset_cat_dog_classification/dataset/" 172 | 173 | if __name__ == "__main__": 174 | obj = ImageObject(file_path) 175 | alex = AlexNet(0.0001, 3, 2, 2000, obj) 176 | alex.train() 177 | # alex.loadModel() 178 | # alex.testCatAndDog() -------------------------------------------------------------------------------- /CNN/Libs/InceptionNet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月2日 3 | 4 | @author: IL MARE 5 | ''' 6 | from tensorflow.examples.tutorials.mnist import input_data 7 | import tensorflow as tf 8 | import numpy as np 9 | import sys 10 | import os 11 | sys.path.append(os.getcwd()) 12 | from Utils.DataUtil import ImageObject 13 | from matplotlib import pyplot as plt 14 | import matplotlib as mpl 15 | from PIL import Image 16 | 17 | save_path = r"G:/Machine-Learning/python/CNN/modelFile/AlexNet/dogandcat/" 18 | file_path = r"G:/研究生课件/人工神经网络/神经网络/dataset_cat_dog_classification/dataset/" 19 | 20 | def weight_variable(shape): 21 | initial = tf.truncated_normal(shape, dtype=tf.float32, stddev=0.1) 22 | return tf.Variable(initial) 23 | 24 | def conv2d(x, W, strides, padding="VALID"): 25 | return tf.nn.conv2d(x, W, strides=strides, padding=padding) 26 | 27 | class AlexNet: 28 | def __init__(self, lr, k, classify, maxIter, imageObject): 29 | self._imageObject = imageObject 30 | self._maxIter = maxIter 31 | self._k = k 32 | self._lr = lr 33 | self._classify = classify 34 | self.defineNetwork() 35 | self.defineLoss() 36 | @property 37 | def classify(self): 38 | return self._classify 39 | @property 40 | def keep_prob(self): 41 | return self._keep_prob 42 | @property 43 | def lr(self): 44 | return self._lr 45 | def defineNetwork(self): 46 | self._x = 
tf.placeholder(dtype=tf.float32, shape=[None, 224, 224, 3]) 47 | image = self._x / 255.0 48 | # image = tf.reshape(self._x, [-1, 28, 28, self._k]) 49 | self._y = tf.placeholder(tf.float32, [None, self._classify]) 50 | self._keep_prob = tf.placeholder(dtype=tf.float32) 51 | with tf.name_scope("conv1") as scope: 52 | kernel = tf.Variable(tf.truncated_normal([5, 5, self._k, 32], 53 | stddev=0.1, dtype=tf.float32)) 54 | h_conv1 = conv2d(image, kernel, [1, 4, 4, 1]) 55 | # biases = tf.Variable(tf.constant(0.1, shape=[32], dtype=tf.float32)) 56 | biases = tf.Variable(tf.random_normal(shape=[32], stddev=0.1, dtype=tf.float32)) 57 | conv1 = tf.nn.relu(h_conv1 + biases) 58 | pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], 59 | strides=[1, 2, 2, 1], padding="SAME") 60 | with tf.name_scope("conv2") as scope: 61 | kernel = tf.Variable(tf.truncated_normal([5, 5, 32, 64], 62 | dtype=tf.float32, stddev=0.1)) 63 | h_conv5 = conv2d(pool1, kernel, [1, 1, 1, 1], "SAME") 64 | biases = tf.Variable(tf.random_normal(shape=[64], stddev=0.1, dtype=tf.float32)) 65 | # biases = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[64])) 66 | conv5 = tf.nn.relu(h_conv5 + biases) 67 | pool5 = tf.nn.max_pool(conv5, ksize=[1, 2, 2, 1], 68 | strides=[1, 2, 2, 1], padding="SAME") 69 | self._dim = 1 70 | var = pool5.get_shape().as_list() 71 | for i in range(len(var) - 1): 72 | self._dim *= var[i + 1] 73 | pool5 = tf.reshape(pool5, [-1, self._dim]) 74 | with tf.name_scope("link1") as scope: 75 | kernel = tf.Variable(tf.truncated_normal([self._dim, 1024], stddev=0.1, dtype=tf.float32)) 76 | biases = tf.Variable(tf.random_normal(shape=[1024], stddev=0.1, dtype=tf.float32)) 77 | # biases = tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[1024])) 78 | h_fc = tf.nn.dropout(tf.matmul(pool5, kernel) + biases, keep_prob=self._keep_prob) 79 | with tf.name_scope("link3") as scope: 80 | kernel = tf.Variable(tf.truncated_normal([1024, self._classify], stddev=0.1, dtype=tf.float32)) 81 | biases = tf.Variable(tf.random_normal(shape=[self._classify], stddev=0.1, dtype=tf.float32)) 82 | self._out = tf.matmul(h_fc, kernel) + biases 83 | self._pre = tf.nn.softmax(self._out) 84 | def defineLoss(self): 85 | self._cross_entry = tf.reduce_mean( 86 | tf.nn.softmax_cross_entropy_with_logits(logits=self._out, labels=self._y)) 87 | self._accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self._y, 1), 88 | tf.argmax(self._pre, 1)), dtype=tf.float32)) 89 | # vars = tf.trainable_variables() 90 | # grads, _ = tf.clip_by_global_norm(tf.gradients(self._cross_entry, vars), 5) 91 | # optimizer = tf.train.AdamOptimizer(self._lr) 92 | # self._train = optimizer.apply_gradients(zip(grads, vars)) 93 | self._train = tf.train.AdamOptimizer(self._lr).minimize(self._cross_entry) 94 | def train_1(self): 95 | try: 96 | fig = plt.figure("cross-entropy") 97 | mpl.rcParams['xtick.labelsize'] = 8 98 | mpl.rcParams['ytick.labelsize'] = 8 99 | ax = fig.add_subplot(111) 100 | ax.grid(True) 101 | ac = [] 102 | aac = [] 103 | for i in range(self._maxIter): 104 | train, label = self._imageObject.nextBatch(24) 105 | _, accuracy, loss = self._sess.run([self._train, self._accuracy, self._cross_entry], feed_dict={self._x: train, 106 | self._y: label, self._keep_prob: 0.5}) 107 | ac.append(accuracy) 108 | aac.append(np.mean(np.array(ac))) 109 | ax.plot(np.arange(len(ac)), np.array(ac), linewidth=0.8, color="b") 110 | ax.plot(np.arange(len(aac)), np.array(aac), linewidth=0.8, color="r") 111 | plt.pause(0.1) 112 | if i % 10 == 0: 113 | print("step {0:d}/{1:d},accuracy: 
{2:.3f}, loss: {3:.3f}".format(i, self._maxIter, accuracy, loss)) 114 | if i % 250 == 0: 115 | tf.train.Saver().save(self._sess, "{0}model".format(save_path), global_step=i) 116 | except Exception as e: 117 | print(e) 118 | finally: 119 | plt.show() 120 | def train(self): 121 | try: 122 | # mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 123 | # fig = plt.figure("cross-entropy") 124 | # mpl.rcParams['xtick.labelsize'] = 8 125 | # mpl.rcParams['ytick.labelsize'] = 8 126 | # ax = fig.add_subplot(111) 127 | # ax.grid(True) 128 | ac = [] 129 | aac = [] 130 | with tf.Session() as sess: 131 | sess.run(tf.global_variables_initializer()) 132 | for i in range(self._maxIter): 133 | # train, label = mnist.train.next_batch(50) 134 | train, label = self._imageObject.nextBatch(24) 135 | _, accuracy, loss = sess.run([self._train, self._accuracy, self._cross_entry], feed_dict={self._x: train, 136 | self._y: label, self._keep_prob: 0.5}) 137 | ac.append(accuracy) 138 | aac.append(np.mean(np.array(ac))) 139 | # ax.plot(np.arange(len(ac)), np.array(ac), linewidth=0.8, color="b") 140 | # ax.plot(np.arange(len(aac)), np.array(aac), linewidth=0.8, color="r") 141 | # plt.pause(0.1) 142 | if i % 10 == 0: 143 | print("step {0:d}/{1:d},accuracy: {2:.3f}, loss: {3:.3f}".format(i, self._maxIter, accuracy, loss)) 144 | if i % 250 == 0: 145 | tf.train.Saver().save(sess, "{0}model".format(save_path), global_step=i) 146 | except Exception as e: 147 | print(e) 148 | finally: 149 | fig = plt.figure("cross-entropy") 150 | mpl.rcParams['xtick.labelsize'] = 8 151 | mpl.rcParams['ytick.labelsize'] = 8 152 | ax = fig.add_subplot(111) 153 | ax.plot(np.arange(len(ac)), np.array(ac), linewidth=0.8, color="b") 154 | ax.plot(np.arange(len(aac)), np.array(aac), linewidth=0.8, color="r") 155 | plt.show() 156 | def loadModel(self): 157 | self._sess = tf.Session() 158 | print(save_path) 159 | print(tf.train.latest_checkpoint(save_path)) 160 | tf.train.Saver().restore(self._sess, tf.train.latest_checkpoint(save_path)) 161 | def testCatAndDog(self): 162 | result = [] 163 | for img, label in self._imageObject.generateTestBatch(200): 164 | accuracy, pre, loss = self._sess.run([self._accuracy, self._pre, self._cross_entry], 165 | feed_dict={self._x: img, self._y: label, self._keep_prob:1.0}) 166 | # for i in range(len(label)): 167 | # lab = label[i] 168 | # predict = pre[i] 169 | # image = img[i] 170 | # if np.argmax(predict) == 0: 171 | # tmp = Image.fromarray(image) 172 | # if np.argmax(lab) == 0: 173 | # tmp.save("g:/dogandcat/cat/cat-{0}-{1}.jpg".format(len(result), i)) 174 | # else: 175 | # tmp.save("g:/dogandcat/cat/dog-{0}-{1}.jpg".format(len(result), i)) 176 | # else: 177 | # tmp = Image.fromarray(image) 178 | # if np.argmax(lab) == 0: 179 | # tmp.save("g:/dogandcat/dog/cat-{0}-{1}.jpg".format(len(result), i)) 180 | # else: 181 | # tmp.save("g:/dogandcat/dog/dog-{0}-{1}.jpg".format(len(result), i)) 182 | 183 | result.append(accuracy) 184 | print("step:{0:d}, accuracy: {1:.3f}, loss: {2: .3f}".format(len(result), accuracy, loss)) 185 | print("average accuracy: {0:.3f}".format(np.mean(np.array(result)))) 186 | def test(self): 187 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 188 | count = 0 189 | i = 0 190 | for img, label in zip(mnist.test.images, mnist.test.labels): 191 | img = np.reshape(img, [1, 784]) 192 | label = np.reshape(label, [1, 10]) 193 | pre = self._sess.run(self._pre, 194 | feed_dict={self._x: img, self._y: label, self._keep_prob:1.0}) 195 | if np.equal(np.argmax(pre, 1), 
np.argmax(label, 1)): 196 | count += 1 197 | i += 1 198 | if i % 100 == 0: 199 | print("step: {0:d}/{1:d}, accuracy: {2:.3f}".format(i, len(mnist.test.images), count / i)) 200 | print("accuracy: ", (count / i)) 201 | 202 | if __name__ == "__main__": 203 | obj = ImageObject(file_path) 204 | alex = AlexNet(0.0001, 3, 2, 20000, obj) 205 | alex.train() 206 | # alex.loadModel() 207 | # alex.testCatAndDog() -------------------------------------------------------------------------------- /CNN/Libs/SimpleCNN.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月1日 3 | 4 | @author: IL MARE 5 | ''' 6 | from tensorflow.examples.tutorials.mnist import input_data 7 | import tensorflow as tf 8 | import numpy as np 9 | from matplotlib import pyplot as plt 10 | import matplotlib as mpl 11 | 12 | save_path = r"G:/Machine-Learning/python/CNN/modelFile/SimpleCNN/" 13 | 14 | def weight_variable(shape): 15 | initial = tf.truncated_normal(shape, dtype=tf.float32, stddev=0.1) 16 | return tf.Variable(initial, dtype=tf.float32) 17 | 18 | def bias_variable(shape): 19 | initial = tf.constant(0.1, shape=shape) 20 | return tf.Variable(initial, dtype=tf.float32) 21 | 22 | def conv2d(x, W): 23 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") 24 | 25 | def max_pool_2x2(x): 26 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") 27 | 28 | class SimpleCNN: 29 | def __init__(self, lr, maxIter): 30 | self._maxIter = maxIter 31 | self._lr = lr 32 | self.defineNetWork() 33 | self.defineLoss() 34 | @property 35 | def lr(self): 36 | return self._lr 37 | @property 38 | def keep_prob(self): 39 | return self._keep_prob 40 | def defineNetWork(self): 41 | self._x = tf.placeholder(dtype=tf.float32, shape=[None, 784]) 42 | self._y = tf.placeholder(dtype=tf.float32, shape=[None, 10]) 43 | x_image = tf.reshape(self._x, [-1, 28, 28, 1]) 44 | with tf.name_scope("conv1") as scope: 45 | kernal = weight_variable([5, 5, 1, 32]) 46 | biases = bias_variable([32]) 47 | h_conv1 = tf.nn.relu(conv2d(x_image, kernal) + biases) 48 | h_pool1 = max_pool_2x2(h_conv1) 49 | with tf.name_scope("conv2") as scope: 50 | kernal = weight_variable([5, 5, 32, 64]) 51 | biases = bias_variable([64]) 52 | h_conv2 = tf.nn.relu(conv2d(h_pool1, kernal) + biases) 53 | h_pool2 = max_pool_2x2(h_conv2) 54 | with tf.name_scope("link1") as scope: 55 | kernal = weight_variable([7 * 7 * 64, 1024]) 56 | biases = bias_variable([1024]) 57 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) 58 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, kernal) + biases) 59 | self._keep_prob = tf.placeholder(dtype=tf.float32) 60 | h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob=self._keep_prob) 61 | with tf.name_scope("link2") as scope: 62 | kernal = weight_variable([1024, 10]) 63 | biases = bias_variable([10]) 64 | self._out = tf.matmul(h_fc1_drop, kernal) + biases 65 | self._y_conv = tf.nn.softmax(self._out) 66 | def defineLoss(self): 67 | self._cross_entry = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self._y, logits=self._out)) 68 | # self._cross_entry = tf.reduce_mean(-tf.reduce_sum(self._y * tf.log(self._y_conv), 69 | # reduction_indices=[1])) 70 | # vars = tf.trainable_variables() 71 | # grads, _ = tf.clip_by_global_norm(tf.gradients(self._cross_entry, vars), 5) 72 | # optimizer = tf.train.AdamOptimizer(self._lr) 73 | # self._train_step = optimizer.apply_gradients(zip(grads, vars)) 74 | self._train_step = tf.train.AdamOptimizer(self._lr).minimize(self._cross_entry) 
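# Note: the raw logits in self._out are what feed softmax_cross_entropy_with_logits,
# which applies the softmax internally; self._y_conv (the explicit softmax) is only
# needed for the argmax prediction/accuracy below and at test time. The commented-out
# lines above are the gradient-clipping variant of the same update: clip the global
# gradient norm to 5 before letting the Adam optimizer apply the gradients.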
75 | correct_prediction = tf.equal(tf.argmax(self._y_conv, 1), tf.argmax(self._y, 1)) 76 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 77 | def train(self): 78 | try: 79 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 80 | fig = plt.figure("cross-entropy") 81 | mpl.rcParams['xtick.labelsize'] = 8 82 | mpl.rcParams['ytick.labelsize'] = 8 83 | ax = fig.add_subplot(111) 84 | ax.grid(True) 85 | ac = [] 86 | aac = [] 87 | with tf.Session() as sess: 88 | sess.run(tf.global_variables_initializer()) 89 | for i in range(self._maxIter): 90 | train, label = mnist.train.next_batch(50) 91 | _, accuracy, loss = sess.run([self._train_step, self._accuracy, self._cross_entry], feed_dict={self._x: train, 92 | self._y: label, self._keep_prob: 0.5}) 93 | ac.append(accuracy) 94 | aac.append(np.mean(np.array(ac))) 95 | ax.plot(np.arange(len(ac)), np.array(ac), linewidth=0.8, color="b") 96 | ax.plot(np.arange(len(aac)), np.array(aac), linewidth=0.8, color="r") 97 | plt.pause(0.1) 98 | if i % 10 == 0: 99 | print("step {0:d}/{1:d},accuracy: {2:.3f}, loss: {3:.3f}".format(i, self._maxIter, accuracy, loss)) 100 | if i % 100 == 0: 101 | tf.train.Saver().save(sess, "{0}model".format(save_path), global_step=i) 102 | except Exception as e: 103 | print(e) 104 | finally: 105 | plt.show() 106 | def loadModel(self): 107 | self._sess = tf.Session() 108 | tf.train.Saver().restore(self._sess, tf.train.latest_checkpoint(save_path)) 109 | def test(self): 110 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 111 | count = 0 112 | i = 0 113 | for img, label in zip(mnist.test.images, mnist.test.labels): 114 | img = np.reshape(img, [1, 784]) 115 | label = np.reshape(label, [1, 10]) 116 | pre = self._sess.run(self._y_conv, 117 | feed_dict={self._x: img, self._y: label, self._keep_prob:1.0}) 118 | if np.equal(np.argmax(pre, 1), np.argmax(label, 1)): 119 | count += 1 120 | i += 1 121 | if i % 100 == 0: 122 | print("step: {0:d}/{1:d}, accuracy: {2:.3f}".format(i, len(mnist.test.images), count / i)) 123 | print("accuracy: ", (count / i)) 124 | 125 | file_path = r"G:/研究生课件/人工神经网络/神经网络/dataset_cat_dog_classification/dataset/" 126 | 127 | if __name__ == "__main__": 128 | cnn = SimpleCNN(0.001, 2000) 129 | cnn.loadModel() 130 | cnn.test() -------------------------------------------------------------------------------- /CNN/Libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Libs/__init__.py -------------------------------------------------------------------------------- /CNN/Libs/__pycache__/AlexNet.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Libs/__pycache__/AlexNet.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Libs/__pycache__/SimpleCNN.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Libs/__pycache__/SimpleCNN.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Libs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Libs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/README.md -------------------------------------------------------------------------------- /CNN/Units/CNNUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年6月29日 3 | 4 | @author: IL MARE 5 | ''' 6 | 7 | import Libs.SimpleCNN as SimpleCNN 8 | import Libs.AlexNet as AlexNet 9 | 10 | def testSimpleCNN(): 11 | CNN = SimpleCNN.SimpleCNN(0.001, 2000) 12 | CNN.loadModel() 13 | CNN.test() 14 | 15 | def testAlexNet(): 16 | alex = AlexNet.AlexNet(0.001, 1, 10, 2000) 17 | alex.loadModel() 18 | alex.test() 19 | 20 | if __name__ == "__main__": 21 | testAlexNet() 22 | # testSimpleCNN() -------------------------------------------------------------------------------- /CNN/Units/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Units/__init__.py -------------------------------------------------------------------------------- /CNN/Units/__pycache__/CNNUnit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Units/__pycache__/CNNUnit.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Units/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Units/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Units/__pycache__/model.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Units/__pycache__/model.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Units/__pycache__/test1.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Units/__pycache__/test1.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Utils/DataUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月2日 3 | 4 | @author: IL MARE 5 | ''' 6 | import os 7 | import numpy as np 8 | from PIL import Image, ImageEnhance 9 | import re 10 | import time 11 | import random 12 | from matplotlib import pyplot as plt 13 | 14 | #file_path = r"G:/研究生课件/人工神经网络/神经网络/dataset_cat_dog_classification/dataset/" 15 | 16 | class ImageObject: 17 | def __init__(self, filePath, shape=(224, 224)): 18 | self._shape = shape 19 | self._filePath = filePath 20 | self.generateDataSet() 21 | def generateDataSet(self): 22 | 
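# Splits the raw cat/dog images into train/ and test/ subfolders. If both folders are
# already populated, it only records the existing file-name sets and returns. Otherwise
# it draws random indices with replacement over the file list (so roughly 1 - 1/e,
# about 63% of the images, land in the training set in expectation), resizes each
# picked image to self._shape with PIL and saves it under train/, then writes every
# remaining image to test/. Class labels are derived later from the file-name prefix
# in nextBatch()/generateTestBatch(): "cat..." -> [1, 0], otherwise [0, 1].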
list = os.listdir(self._filePath) 23 | self._train_path = "{0}{1}".format(self._filePath, "train/") 24 | self._test_path = "{0}{1}".format(self._filePath, "test/") 25 | if not os.path.exists(self._train_path) or not os.path.exists(self._test_path): 26 | os.mkdir(self._train_path) 27 | os.mkdir(self._test_path) 28 | if os.listdir(self._train_path) and os.listdir(self._test_path): 29 | self._trainSet = set(os.listdir(self._train_path)) 30 | self._testSet = set(os.listdir(self._test_path)) 31 | return 32 | print("正在初始化训练集和测试集。。。") 33 | self._trainSet = set() 34 | self._testSet = set(list) - set(["train", "test"]) 35 | for i in range(len(list)): 36 | if i % 500 == 0: 37 | print(i) 38 | index = np.random.randint(0, len(list), 1)[0] 39 | item = list[index] 40 | if item == "test" or item == "train": 41 | continue 42 | if item not in self._trainSet: 43 | self._trainSet.add(item) 44 | image = Image.open("{0}{1}".format(self._filePath, item)) 45 | image = image.resize(self._shape) 46 | image.save("{0}{1}".format(self._train_path, item)) 47 | self._testSet = self._testSet - self._trainSet 48 | i = 0 49 | for name in self._testSet: 50 | i += 1 51 | if i % 500 == 0: 52 | print(i) 53 | image = Image.open("{0}{1}".format(self._filePath, name)) 54 | image = image.resize(self._shape) 55 | image.save("{0}{1}".format(self._test_path, name)) 56 | def nextBatch(self, num=50): 57 | random.seed(time.time()) 58 | list = random.sample(self._trainSet, num) 59 | train = [] 60 | label = [] 61 | for name in list: 62 | image = Image.open("{0}{1}".format(self._train_path, name)) 63 | train.append(np.asarray(image)) 64 | if re.match(r"^cat.*$", name): 65 | label.append(np.array([1, 0])) 66 | else: 67 | label.append(np.array([0, 1])) 68 | return np.array(train), np.array(label) 69 | def generateTestBatch(self, num=100): 70 | test = [] 71 | label = [] 72 | for name in self._testSet: 73 | image = Image.open("{0}{1}".format(self._test_path, name)) 74 | test.append(np.asarray(image)) 75 | if re.match(r"^cat.*$", name): 76 | label.append(np.array([1, 0])) 77 | else: 78 | label.append(np.array([0, 1])) 79 | if len(test) % num == 0: 80 | yield np.array(test), np.array(label) 81 | test = [] 82 | label = [] 83 | yield np.array(test), np.array(label) 84 | 85 | # if __name__ == "__main__": 86 | # start = time.clock() 87 | # obj = ImageObject(file_path) 88 | # train, label = obj.nextBatch(50) 89 | # print(train.shape, label.shape) 90 | # image = Image.open("{0}{1}".format(file_path, "cat.1.jpg")) 91 | # image = ImageEnhance.Color(image).enhance(5.0) 92 | # image = np.asarray(image) 93 | # figure = plt.figure("test") 94 | # ax = figure.add_subplot(111) 95 | # ax.imshow(image) 96 | # plt.show() 97 | # print(time.clock() - start) -------------------------------------------------------------------------------- /CNN/Utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Utils/__init__.py -------------------------------------------------------------------------------- /CNN/Utils/__pycache__/DataUtil.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Utils/__pycache__/DataUtil.cpython-35.pyc -------------------------------------------------------------------------------- /CNN/Utils/__pycache__/__init__.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/CNN/Utils/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /DecisionTree/Lib/DecisionTreeLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | 8 | def calShannonEnt(trainLabel): 9 | m = len(trainLabel) 10 | uniqueVal = set(trainLabel) 11 | countDict = {} 12 | shannonNum = 0.0 13 | for label in trainLabel: 14 | countDict[label] = countDict.get(label, 0) + 1 15 | for label in uniqueVal: 16 | p = (countDict[label] / m) 17 | shannonNum -= p * np.log2(p) 18 | return shannonNum 19 | 20 | def splitDataMatrix(dataMatrix, label, axis, value): 21 | returnMat = [] 22 | labelMat = [] 23 | for row, row1 in zip(dataMatrix, label): 24 | if row[axis] == value: 25 | tmp_lst = row[0: axis] 26 | tmp_lst.extend(row[axis + 1:]) 27 | returnMat.append(tmp_lst) 28 | labelMat.append(row1) 29 | return returnMat, labelMat 30 | 31 | def chooseBestFeature(trainSet, label): 32 | m = len(trainSet) 33 | maxGain = -1 34 | baseShannonEnt = calShannonEnt(label) 35 | index = -1 36 | for i in range(len(trainSet[0])): 37 | uniqueAttr = set([example[i] for example in trainSet]) 38 | tmp_Ent = 0 39 | for attr in uniqueAttr: 40 | subSet, labelMat = splitDataMatrix(trainSet, label, i, attr) 41 | newShannonEnt = calShannonEnt(labelMat) 42 | tmp_Ent += float(len(subSet) / m) * newShannonEnt 43 | gain = baseShannonEnt - tmp_Ent 44 | if gain > maxGain: 45 | maxGain = gain 46 | index = i 47 | return index 48 | 49 | def createDecisionTree(trainSet, trainLabel): 50 | if trainLabel.count(trainLabel[0]) == len(trainLabel): 51 | return trainLabel[0] 52 | if len(trainSet[0]) == 0: 53 | return "no" if trainLabel.count("no") > trainLabel.count("yes") else "yes" 54 | index = chooseBestFeature(trainSet, trainLabel) 55 | Tree = {index:{}} 56 | uniqueVal = set([elt[index] for elt in trainSet]) 57 | for value in uniqueVal: 58 | subSet, label = splitDataMatrix(trainSet, trainLabel, index, value) 59 | Tree[index][value] = createDecisionTree(subSet, label) 60 | return Tree 61 | 62 | def predictByDTModel(data, model): 63 | if type(model) == str: 64 | return model 65 | key = iter(model.keys()).__next__() 66 | value = data[key] 67 | res = model[key].get(value, None) 68 | if res != None: 69 | return predictByDTModel(data, res) 70 | else: 71 | tmp_lst = [item for item in model[key].keys()] 72 | return predictByDTModel(data, model[key][np.random.choice(tmp_lst, 1)[0]]) 73 | 74 | def testDTModel(testData, testLabel, model): 75 | predictLabel = [] 76 | for row in testData: 77 | predictLabel.append(predictByDTModel(row, model)) 78 | errorCount = 0 79 | for val, val1 in zip(predictLabel, testLabel): 80 | if val != val1: 81 | errorCount += 1 82 | ratio = float(errorCount) / len(testLabel) 83 | print("DT:total error ratio is %.3f, correct ratio is %.3f" % (ratio, 1 - ratio)) 84 | return ratio -------------------------------------------------------------------------------- /DecisionTree/Lib/RFLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | import Util.RandomUtil as RandomUtil 8 | 9 | ''' 10 | 计算香农墒 11 | ''' 12 | def calShannonEnt(trainLabel): 13 | m = 
len(trainLabel) 14 | uniqueVal = set(trainLabel) 15 | countDict = {} 16 | shannonNum = 0.0 17 | for label in trainLabel: 18 | countDict[label] = countDict.get(label, 0) + 1 19 | for label in uniqueVal: 20 | p = (countDict[label] / m) 21 | shannonNum -= p * np.log2(p) 22 | return shannonNum 23 | ''' 24 | 切分数据集 25 | ''' 26 | def splitDataMatrix(dataMatrix, label, axis, value): 27 | returnMat = [] 28 | labelMat = [] 29 | for row, row1 in zip(dataMatrix, label): 30 | if row[axis] == value: 31 | tmp_lst = row[0: axis] 32 | tmp_lst.extend(row[axis + 1:]) 33 | returnMat.append(tmp_lst) 34 | labelMat.append(row1) 35 | return returnMat, labelMat 36 | ''' 37 | 由信息增益最大化计算出需要切分的属性索引值 38 | ''' 39 | def chooseBestFeature(trainSet, label): 40 | tmp = int(np.log2(len(trainSet[0]))) 41 | k = 1 if tmp == 0 else tmp 42 | indexSet = RandomUtil.generateRandom(0, len(trainSet[0]), k) 43 | m = len(trainSet) 44 | maxGain = -1 45 | baseShannonEnt = calShannonEnt(label) 46 | index = -1 47 | for i in indexSet: 48 | uniqueAttr = set([example[i] for example in trainSet]) 49 | tmp_Ent = 0 50 | for attr in uniqueAttr: 51 | subSet, labelMat = splitDataMatrix(trainSet, label, i, attr) 52 | newShannonEnt = calShannonEnt(labelMat) 53 | tmp_Ent += float(len(subSet) / m) * newShannonEnt 54 | gain = baseShannonEnt - tmp_Ent 55 | if gain > maxGain: 56 | maxGain = gain 57 | index = i 58 | return index 59 | ''' 60 | 训练随机森林所需要的弱分类器 61 | ''' 62 | def generateWeakLearner(trainSet, trainLabel): 63 | if trainLabel.count(trainLabel[0]) == len(trainLabel): 64 | return trainLabel[0] 65 | if len(trainSet[0]) == 0: 66 | return "no" if trainLabel.count("no") > trainLabel.count("yes") else "yes" 67 | index = chooseBestFeature(trainSet, trainLabel) 68 | Tree = {index:{}} 69 | uniqueVal = set([elt[index] for elt in trainSet]) 70 | for value in uniqueVal: 71 | subSet, label = splitDataMatrix(trainSet, trainLabel, index, value) 72 | Tree[index][value] = generateWeakLearner(subSet, label) 73 | return Tree 74 | 75 | def generateRandomForest(trainSet, trainLabel, T): 76 | forest = [] 77 | for i in range(T): 78 | model = generateWeakLearner(trainSet, trainLabel) 79 | forest.append(model) 80 | return forest 81 | 82 | def classfyData(data, model): 83 | if type(model) == str: 84 | return model 85 | key = iter(model.keys()).__next__() 86 | value = data[key] 87 | res = model[key].get(value, None) 88 | if res != None: 89 | return classfyData(data, res) 90 | else: 91 | tmp_lst = [item for item in model[key].keys()] 92 | return classfyData(data, model[key][np.random.choice(tmp_lst, 1)[0]]) 93 | 94 | def predictByRandomForest(models, data): 95 | tmp_lst = [] 96 | for model in models: 97 | predict_label = classfyData(data, model) 98 | tmp_lst.append(predict_label) 99 | tmp_set = set(tmp_lst) 100 | res_lst = [] 101 | for res in tmp_set: 102 | res_lst.append((tmp_lst.count(res), res)) 103 | res_lst = sorted(res_lst, key=lambda index:index[0], reverse=True) 104 | if len(res_lst) == 1: 105 | return res_lst[0][1] 106 | else: 107 | tmp_res = res_lst[0][0] 108 | return_lst = [res_lst[0][1]] 109 | for i in range(1, len(res_lst)): 110 | if res_lst[i][0] == tmp_res: 111 | return_lst.append(res_lst[i][1]) 112 | if len(return_lst) == 1: 113 | return return_lst[0] 114 | else: 115 | return np.random.choice(return_lst, 1)[0] -------------------------------------------------------------------------------- /DecisionTree/Lib/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Lib/__init__.py -------------------------------------------------------------------------------- /DecisionTree/Lib/__pycache__/DecisionTreeLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Lib/__pycache__/DecisionTreeLib.cpython-36.pyc -------------------------------------------------------------------------------- /DecisionTree/Lib/__pycache__/RFLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Lib/__pycache__/RFLib.cpython-36.pyc -------------------------------------------------------------------------------- /DecisionTree/Lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /DecisionTree/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/README.md -------------------------------------------------------------------------------- /DecisionTree/Unit/DecisionTreeUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月7日 3 | 4 | @author: IL MARE 5 | ''' 6 | import Util.DataUtil as DataUtil 7 | import Lib.DecisionTreeLib as DTLib 8 | import time 9 | from matplotlib import pyplot as plt 10 | import numpy as np 11 | 12 | def loadDataSet(filename): 13 | print("Loading data...") 14 | dataSet, labelSet = DataUtil.loadDataForRMOrDTModel(filename) 15 | print("Loaded data!") 16 | print("Undersampling data...") 17 | dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, "yes", "no") 18 | print("Undersampled data!") 19 | return dataSet, labelSet 20 | 21 | if __name__ == "__main__": 22 | start = time.clock() 23 | dataSet, labelSet = loadDataSet("bank-additional") 24 | tmp_lst = [] 25 | for i in range(100): 26 | trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(dataSet, labelSet) 27 | model = DTLib.createDecisionTree(trainSet, trainLabel) 28 | errorRatio = DTLib.testDTModel(testSet, testLabel, model) 29 | tmp_lst.append(1 - errorRatio) 30 | y = np.array(tmp_lst, dtype=np.float) 31 | print("the avg correct ratio is %.3f, the std is %.3f" % (y.mean(), y.std())) 32 | x = np.arange(0, len(tmp_lst)) 33 | fig = plt.figure("test") 34 | ax = fig.add_subplot(111) 35 | ax.plot(x, y) 36 | ax.set_ylim([0, 1]) 37 | ax.set_ylabel("correct ratio of DT") 38 | ax.set_xlabel("count of exp") 39 | plt.show() -------------------------------------------------------------------------------- /DecisionTree/Unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Unit/__init__.py -------------------------------------------------------------------------------- 
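A minimal usage sketch for the decision-tree and random-forest libraries above: the toy weather-style rows below are invented for illustration, and the imports assume the script runs from the DecisionTree project root (the same Lib/ and Util/ package layout that DecisionTreeUnit.py relies on). Each training row is a plain Python list of categorical feature values, the label list uses the "yes"/"no" strings hard-coded into the majority-vote fallback, and the fitted model is a nested dict keyed by feature index.

import Lib.DecisionTreeLib as DTLib
import Lib.RFLib as RFLib

# Invented toy data: two categorical features, labels depend only on the first one.
trainSet = [["sunny", "hot"], ["sunny", "cool"], ["rainy", "hot"], ["rainy", "cool"]]
trainLabel = ["yes", "yes", "no", "no"]

model = DTLib.createDecisionTree(trainSet, trainLabel)
print(model)                                            # e.g. {0: {'sunny': 'yes', 'rainy': 'no'}}
print(DTLib.predictByDTModel(["rainy", "hot"], model))  # -> 'no'
DTLib.testDTModel(trainSet, trainLabel, model)          # prints a 0.000 error ratio on this toy set

forest = RFLib.generateRandomForest(trainSet, trainLabel, 15)    # 15 weak learners
print(RFLib.predictByRandomForest(forest, ["sunny", "cool"]))    # -> 'yes' by majority vote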
/DecisionTree/Util/RandomUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | ''' 8 | 该函数用来在一个集合中随机抽取size个互不相同的随机值 9 | ''' 10 | def generateRandomIndex(a, size): 11 | if len(a) < size: 12 | return None 13 | elif len(a) == size: 14 | return set(a) 15 | returnMat = set() 16 | while True: 17 | returnMat.add(np.random.choice(list(a), 1)[0]) 18 | if len(returnMat) == size: 19 | break 20 | return returnMat 21 | ''' 22 | 在指定范围内产生指定数目的不重复的随机数 23 | ''' 24 | def generateRandom(low, high, size): 25 | returnSet = set() 26 | while True: 27 | returnSet.add(np.random.randint(low, high, 1)[0]) 28 | if len(returnSet) == size: 29 | break 30 | return returnSet -------------------------------------------------------------------------------- /DecisionTree/Util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Util/__init__.py -------------------------------------------------------------------------------- /DecisionTree/Util/__pycache__/DataUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Util/__pycache__/DataUtil.cpython-36.pyc -------------------------------------------------------------------------------- /DecisionTree/Util/__pycache__/RandomUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Util/__pycache__/RandomUtil.cpython-36.pyc -------------------------------------------------------------------------------- /DecisionTree/Util/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/DecisionTree/Util/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /FacePlus/Libs/AverageFace.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月8日 3 | 4 | @author: IL MARE 5 | ''' 6 | 7 | import numpy as np 8 | import os 9 | import cv2 10 | 11 | class AverageFace: 12 | def __init__(self, filePath, shape=(600, 600), keyPoint=(26, 14)): 13 | if filePath.endswith("/") or filePath.endswith("\\"): 14 | self._filePath = filePath 15 | else: 16 | self._filePath = "{0}/".format(filePath) 17 | self._keyPoint = keyPoint 18 | self._shape = shape 19 | self.read_landmark() 20 | self.readImage() 21 | self.generateImage() 22 | def read_landmark(self): 23 | self._pointsArray = [] 24 | for file in os.listdir(self._filePath): 25 | if file.endswith("txt"): 26 | with open(r"{0}\{1}".format(self._filePath, file)) as fp: 27 | landMark = fp.read().split() 28 | landMark = np.reshape(np.array(landMark, dtype=np.int32), [-1, 2]) 29 | self._pointsArray.append(landMark) 30 | def readImage(self): 31 | self._imageList = [] 32 | for file in os.listdir(self._filePath): 33 | if file.endswith("jpg"): 34 | image = cv2.imread(r"{0}\{1}".format(self._filePath, file)) 35 | image = image / 255.0 36 | self._imageList.append(image) 37 | 
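# similarityTransform (below) aligns a face by its two eye landmarks: it synthesizes
# a third correspondence by rotating one eye point 60 degrees about the other (the
# s60/c60 terms), because cv2.estimateRigidTransform is called with fullAffine=True
# and a full 2D affine needs at least three non-collinear point pairs. The resulting
# transform maps each face's detected eye points onto the fixed eyePoint positions
# defined in generateImage before the images are warped and averaged.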
@staticmethod 38 | def similarityTransform(input, output): 39 | s60 = np.sin(60 * np.pi / 180.0) 40 | c60 = np.cos(60 * np.pi / 180.0) 41 | inPts = np.copy(input).tolist() 42 | outPts = np.copy(output).tolist() 43 | xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0] 44 | yin = s60 * (inPts[0][0] - inPts[1][0]) - c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1] 45 | inPts.append([np.int(xin), np.int(yin)]) 46 | xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0] 47 | yout = s60 * (outPts[0][0] - outPts[1][0]) - c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1] 48 | outPts.append([np.int(xout), np.int(yout)]) 49 | return cv2.estimateRigidTransform(np.array([inPts], dtype=np.int32), 50 | np.array([outPts], dtype=np.int32), True) 51 | @staticmethod 52 | def calculateDelaunayTriangles(rect, points): 53 | def rectContains(point, rect): 54 | if point[0] < rect[0] or point[0] > rect[2]: 55 | return False 56 | elif point[1] < rect[1] or point[1] > rect[3]: 57 | return False 58 | return True 59 | subDiv = cv2.Subdiv2D(rect) 60 | for point in points: 61 | subDiv.insert((point[0], point[1])) 62 | triangleList = subDiv.getTriangleList() 63 | return_mat = [] 64 | for triangle in triangleList: 65 | pt = [] 66 | pt.append((triangle[0], triangle[1])) 67 | pt.append((triangle[2], triangle[3])) 68 | pt.append((triangle[4], triangle[5])) 69 | if rectContains(pt[0], rect) and rectContains(pt[1], rect) and rectContains(pt[2], rect): 70 | ind = [] 71 | for i in range(3): 72 | for j in range(len(points)): 73 | if np.abs(pt[i][0] - points[j][0]) < 1.0 and np.abs(pt[i][1] - points[j][1]) < 1.0: 74 | ind.append(j) 75 | if len(ind) == 3: 76 | return_mat.append(ind) 77 | return return_mat 78 | @staticmethod 79 | def constrainPoint(p, w, h): 80 | p = (min(max(p[0], 0), w - 1), min(max(p[1], 0), h - 1)) 81 | return p 82 | @staticmethod 83 | def warpTriangle(img1, img2, t1, t2): 84 | def applyAffineTransform(src, srcTri, dstTri, size) : 85 | warpMat = cv2.getAffineTransform(np.float32(srcTri), np.float32(dstTri)) 86 | dst = cv2.warpAffine(src, warpMat, (size[0], size[1]), None, 87 | flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT_101) 88 | return dst 89 | r1 = cv2.boundingRect(np.float32([t1])) 90 | r2 = cv2.boundingRect(np.float32([t2])) 91 | t1Rect = [] 92 | t2Rect = [] 93 | t2RectInt = [] 94 | for i in range(0, 3): 95 | t1Rect.append(((t1[i][0] - r1[0]),(t1[i][1] - r1[1]))) 96 | t2Rect.append(((t2[i][0] - r2[0]),(t2[i][1] - r2[1]))) 97 | t2RectInt.append(((t2[i][0] - r2[0]),(t2[i][1] - r2[1]))) 98 | mask = np.zeros((r2[3], r2[2], 3), dtype = np.float32) 99 | cv2.fillConvexPoly(mask, np.int32(t2RectInt), (1, 1, 1)); 100 | img1Rect = img1[r1[1]:r1[1] + r1[3], r1[0]:r1[0] + r1[2]] 101 | size = (r2[2], r2[3]) 102 | img2Rect = applyAffineTransform(img1Rect, t1Rect, t2Rect, size) 103 | img2Rect = img2Rect * mask 104 | img2[r2[1]: r2[1] + r2[3], r2[0]: r2[0] + r2[2]] = img2[r2[1]: r2[1] + r2[3], 105 | r2[0]: r2[0] + r2[2]] * ((1.0, 1.0, 1.0) - mask) 106 | img2[r2[1]: r2[1] + r2[3], r2[0]: r2[0] + r2[2]] = img2[r2[1]: r2[1] + r2[3], 107 | r2[0]: r2[0] + r2[2]] + img2Rect 108 | def generateImage(self): 109 | width = self._shape[0] 110 | height = self._shape[1] 111 | eyePoint = [(0.34 * width, height / 2.2), (0.66 * width, height / 2.2)] 112 | boundPoint = [(0, 0), (width / 2.0, 0), (width - 1, 0), (width - 1, height / 2.0), (width - 1, height - 1), 113 | (width / 2.0, height - 1), (0, height - 1), (0, height / 2.0)] 114 | pointsAvg = 
np.array([(0, 0)] * (len(self._pointsArray[0]) + len(boundPoint)), np.float32) 115 | numImages = len(self._imageList) 116 | pointsNorm = [] 117 | imagesNorm = [] 118 | for point, image in zip(self._pointsArray, self._imageList): 119 | eyePointSrc = [point[self._keyPoint[0]], point[self._keyPoint[1]]] 120 | transform = AverageFace.similarityTransform(eyePointSrc, eyePoint) 121 | img = cv2.warpAffine(image, transform, (width, height)) 122 | points = np.reshape(point, [len(self._pointsArray[0]), 1, 2]) 123 | points = np.reshape(cv2.transform(points, transform), [len(self._pointsArray[0]), 2]) 124 | points = np.append(points, boundPoint, 0) 125 | pointsAvg = pointsAvg + points / numImages 126 | pointsNorm.append(points) 127 | imagesNorm.append(img) 128 | rect = (0, 0, width, height) 129 | triangleList = AverageFace.calculateDelaunayTriangles(rect, pointsAvg) 130 | output = np.zeros((width, height, 3), dtype=np.float32) 131 | for i in range(len(imagesNorm)): 132 | img = np.zeros([width, height, 3], dtype=np.float32) 133 | for j in range(len(triangleList)): 134 | tin = [] 135 | tout = [] 136 | for k in range(3): 137 | pIn = pointsNorm[i][triangleList[j][k]] 138 | pIn = AverageFace.constrainPoint(pIn, width, height) 139 | pOut = pointsAvg[triangleList[j][k]] 140 | pOut = AverageFace.constrainPoint(pOut, width, height) 141 | tin.append(pIn) 142 | tout.append(pOut) 143 | AverageFace.warpTriangle(imagesNorm[i], img, tin, tout) 144 | output = output + img 145 | self._output = output / len(imagesNorm) 146 | @property 147 | def averageImage(self): 148 | return self._output 149 | def showImage(self): 150 | cv2.imshow("image", self._output) 151 | cv2.waitKey(0) 152 | def saveImage(self, path): 153 | cv2.imwrite(path, np.int32(self._output * 255.0)) 154 | 155 | 156 | if __name__ == "__main__": 157 | obj = AverageFace(r"G:\python\sources\nwpu\dectImage") 158 | obj.showImage() 159 | obj.saveImage(r"g:/aaa.jpg") 160 | -------------------------------------------------------------------------------- /FacePlus/Libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/Libs/__init__.py -------------------------------------------------------------------------------- /FacePlus/Libs/__pycache__/AverageFace.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/Libs/__pycache__/AverageFace.cpython-35.pyc -------------------------------------------------------------------------------- /FacePlus/Libs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/Libs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /FacePlus/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/README.md -------------------------------------------------------------------------------- /FacePlus/Unit/CaptureUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月5日 3 | 4 | 
@author: IL MARE 5 | ''' 6 | from Libs.AverageFace import AverageFace 7 | 8 | data_path = r"G:\python\sources\nwpu\dectImage" 9 | 10 | if __name__ == "__main__": 11 | obj = AverageFace(data_path, (700, 700)) 12 | obj.showImage() -------------------------------------------------------------------------------- /FacePlus/Unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/Unit/__init__.py -------------------------------------------------------------------------------- /FacePlus/Unit/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2018-07-05T12:48:19.252Z] GhostDriver - Main - running on port 56431 2 | [INFO - 2018-07-05T12:48:21.312Z] Session [b2a9f830-8051-11e8-ad53-e1b1ea2e6b85] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2018-07-05T12:48:21.312Z] Session [b2a9f830-8051-11e8-ad53-e1b1ea2e6b85] - page.customHeaders: - {} 4 | [INFO - 2018-07-05T12:48:21.313Z] Session [b2a9f830-8051-11e8-ad53-e1b1ea2e6b85] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-8.1-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2018-07-05T12:48:21.313Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: b2a9f830-8051-11e8-ad53-e1b1ea2e6b85 6 | -------------------------------------------------------------------------------- /FacePlus/Utils/LandMarkUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年7月7日 3 | 4 | @author: IL MARE 5 | ''' 6 | import requests 7 | import json 8 | import os 9 | 10 | landMarkPoints=["mouth_left_corner","contour_right1","right_eyebrow_upper_left_quarter", 11 | "nose_left","left_eyebrow_upper_right_quarter","contour_right8","left_eye_top", 12 | "right_eyebrow_upper_middle","contour_chin","contour_right9","nose_contour_left2", 13 | "contour_right4","left_eye_lower_right_quarter","left_eyebrow_lower_middle", 14 | "right_eye_right_corner","left_eye_pupil","left_eye_bottom", 15 | "nose_contour_lower_middle","right_eye_upper_right_quarter", 16 | "nose_contour_left1","left_eye_right_corner","nose_contour_right2", 17 | "nose_contour_left3","right_eye_bottom","contour_left2","right_eye_center", 18 | "left_eye_left_corner","mouth_upper_lip_bottom","contour_right5", 19 | "contour_left7","mouth_lower_lip_bottom","nose_right", 20 | "mouth_lower_lip_left_contour2","left_eyebrow_lower_left_quarter", 21 | "contour_left5","mouth_upper_lip_top","right_eyebrow_lower_right_quarter", 22 | "mouth_upper_lip_right_contour3","mouth_lower_lip_left_contour1", 23 | "right_eyebrow_upper_right_quarter","right_eyebrow_right_corner", 24 | "left_eyebrow_right_corner","left_eyebrow_upper_middle", 25 | 
"right_eyebrow_lower_middle","mouth_upper_lip_left_contour3", 26 | "nose_tip","contour_left8","mouth_lower_lip_right_contour1", 27 | "left_eye_center","mouth_lower_lip_right_contour2", 28 | "mouth_lower_lip_right_contour3","nose_contour_right3", 29 | "right_eye_top","contour_left1","contour_right2","contour_right3", 30 | "right_eye_lower_right_quarter","right_eyebrow_lower_left_quarter", 31 | "mouth_upper_lip_right_contour1","contour_left3","mouth_lower_lip_top", 32 | "right_eye_upper_left_quarter","contour_right6","mouth_upper_lip_left_contour2", 33 | "right_eye_pupil","contour_left6","right_eye_lower_left_quarter", 34 | "left_eye_upper_right_quarter","right_eye_left_corner","mouth_right_corner", 35 | "contour_left4","left_eyebrow_lower_right_quarter","mouth_upper_lip_left_contour1", 36 | "left_eyebrow_left_corner","nose_contour_right1","contour_left9", 37 | "left_eye_upper_left_quarter","left_eyebrow_upper_left_quarter", 38 | "right_eyebrow_left_corner","contour_right7","mouth_upper_lip_right_contour2", 39 | "left_eye_lower_left_quarter","mouth_lower_lip_left_contour3"] 40 | 41 | def generateLandMark(imagePath): 42 | try: 43 | Url = "https://api-cn.faceplusplus.com/facepp/v3/detect" 44 | parameters = { 45 | "api_key": "DVTXIboHVgISfgkHXD77WfX2q609WFfe", 46 | "api_secret": "DRTArnHV2GHACBT1qXK3gip6Ub7Wn8UH", 47 | "return_landmark": 1 48 | } 49 | for filename in os.listdir(imagePath): 50 | if filename.endswith("jpg"): 51 | if os.path.exists(r"{0}{1}.txt".format(imagePath, filename[0 : 10])): 52 | continue 53 | files = { 54 | "image_file": open("{0}{1}".format(imagePath, filename), "rb") 55 | } 56 | resp = requests.post(Url, timeout=15, data=parameters, files=files) 57 | obj = json.loads(resp.text) 58 | landMarks = [] 59 | for name in landMarkPoints: 60 | value = obj["faces"][0]["landmark"][name] 61 | landMarks.append(str(value["x"])) 62 | landMarks.append(str(value["y"])) 63 | try: 64 | fp = open(r"{0}{1}.txt".format(imagePath, filename[0 : 10]), "w") 65 | fp.write(" ".join(landMarks)) 66 | print(r"{0}{1}.txt".format(imagePath, filename[0: 10])) 67 | except Exception as e: 68 | print(e) 69 | finally: 70 | fp.close() 71 | except Exception as e: 72 | print(e) 73 | 74 | if __name__ == "__main__": 75 | imagePath = r"G:/python/sources/nwpu/dectImage/" 76 | generateLandMark(imagePath) -------------------------------------------------------------------------------- /FacePlus/Utils/ZhiHuUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2017年5月4日 3 | 4 | @author: IL MARE 5 | ''' 6 | from selenium import webdriver 7 | import time 8 | from bs4 import BeautifulSoup 9 | from urllib.request import urlretrieve 10 | import re 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions 13 | from selenium.webdriver.common.by import By 14 | import os 15 | 16 | def getAllFriendList(broswer, userID, pageNum, friendSet, currentLevel): 17 | try: 18 | broswer.get('https://www.zhihu.com%s?page=%s' % (userID, pageNum)) 19 | WebDriverWait(broswer, 10).until( 20 | expected_conditions.presence_of_all_elements_located((By.CSS_SELECTOR, '.UserLink-link'))) 21 | except: 22 | print('getAllFriendList异常') 23 | else: 24 | bsObj = BeautifulSoup(broswer.page_source, 'html.parser') 25 | elts = bsObj.findAll('a', {'class':'UserLink-link'}) 26 | for elt in elts: 27 | img = elt.find('img') 28 | if img: 29 | friendSet.add(elt) 30 | print('......*' * currentLevel, 'https://www.zhihu.com%s' % 
(elt.attrs.get('href', 'no data'))) 31 | 32 | def getFriendList(broswer, userID, currentLevel=1): 33 | try: 34 | if currentLevel > totalLevel: 35 | return 36 | if userID == 'no data': 37 | raise Exception() 38 | nameTemp = userID.split('/')[2] 39 | if not nameTemp in alreadyParse: 40 | alreadyParse.add(nameTemp) 41 | else: 42 | return 43 | print('......*' * currentLevel ,'正在解析用户:', nameTemp, '知乎首页:https://www.zhihu.com%s' % (userID), sep=' ') 44 | friendSet = set() 45 | broswer.get('https://www.zhihu.com%s' % (userID)) 46 | WebDriverWait(broswer, 10).until( 47 | expected_conditions.presence_of_all_elements_located((By.CSS_SELECTOR, '.UserLink-link'))) 48 | elt = WebDriverWait(broswer, 10).until( 49 | expected_conditions.presence_of_element_located((By.CSS_SELECTOR, '.Avatar.Avatar--large.UserAvatar-inner'))) 50 | res = re.match('^(https://.*)[0-9]x$', elt.get_attribute('srcset')) 51 | if res: 52 | if not nameTemp in alreadyDownload: 53 | alreadyDownload.add(nameTemp) 54 | url = res.group(1) 55 | writeToFile(url, '%s.%s' % (nameTemp ,url.split('.')[-1])) 56 | print('......*' * currentLevel, '已经下载', nameTemp, '的用户头像', '知乎首页:https://www.zhihu.com%s' % (userID), sep=' ') 57 | except: 58 | print('......*' * currentLevel, 'getFriendList异常') 59 | else: 60 | print('......*' * currentLevel, '正在获取用户', nameTemp, '的关注列表...', sep=' ') 61 | bsObj = BeautifulSoup(broswer.page_source, 'html.parser') 62 | elts = bsObj.findAll('a', {'class':'UserLink-link'}) 63 | for elt in elts: 64 | img = elt.find('img') 65 | if img: 66 | friendSet.add(elt) 67 | print('......*' * currentLevel, 'https://www.zhihu.com%s' % (elt.attrs.get('href', 'no data'))) 68 | elts = bsObj.findAll('button', {'class':'Button PaginationButton Button--plain'}) 69 | if len(elts) != 0: 70 | count = elts[len(elts) - 1].get_text() 71 | for i in range(2, int(count) + 1): 72 | getAllFriendList(broswer, userID, i, friendSet, currentLevel) 73 | print('......*' * currentLevel, '用户', nameTemp, '的关注列表获取完毕', sep=' ') 74 | for elt in friendSet: 75 | href = elt.attrs.get('href', 'no data') 76 | if currentLevel == totalLevel: 77 | img = elt.find('img') 78 | if img: 79 | res = re.match('^(https://.*)[0-9]x$', img.attrs.get('srcset', 'no data')) 80 | if res: 81 | if not href.split('/')[2] in alreadyDownload: 82 | alreadyDownload.add(href.split('/')[2]) 83 | url = res.group(1).replace('_xl', '_xll') 84 | writeToFile(url, '%s.%s' % (href.split('/')[2] ,url.split('.')[-1])) 85 | print('......*' * (currentLevel + 1), '已经下载用户',nameTemp, '的关注用户', href.split('/')[2], '的头像', sep=' ') 86 | getFriendList(broswer, '%s/%s' % (href, userID.split('/')[3]), currentLevel + 1) 87 | 88 | totalLevel = 5#递归层数 89 | defaultPath = 'h:\\zhihu\\'#默认目录 90 | currentPath = '%s%s' % (defaultPath, 'pic\\')#当前目录 91 | alreadyDownload = set()#已经下载的用户头像 92 | alreadyParse = set()#已经解析过的用户 93 | totalUse = 0#文件写入次数 94 | 95 | def writeToFile(url, fileName): 96 | try: 97 | global currentPath, totalUse, defaultPath 98 | totalUse = totalUse + 1 99 | if totalUse % 500 == 0: 100 | tempPath = '{0}pic-{1}\\'.format(defaultPath, totalUse) 101 | if not os.path.exists(tempPath): 102 | os.mkdir(tempPath) 103 | currentPath = '%s' % (tempPath) 104 | if not os.path.exists(currentPath): 105 | os.mkdir(currentPath) 106 | urlretrieve(url, '%s%s' % (currentPath, fileName)) 107 | except: 108 | print('writeToFile异常') 109 | 110 | if __name__ == "__main__": 111 | try: 112 | start = time.clock() 113 | time.sleep(5) 114 | broswer = webdriver.PhantomJS(executable_path= 115 | 
r"C:\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe") 116 | getFriendList(broswer, r'/people/tu-si-ji-63/following') 117 | except: 118 | print('顶层调用异常') 119 | finally: 120 | broswer.quit() 121 | print('******', '共运行 {0:.3f}秒'.format(time.clock() - start), '一共扫描%d位用户的好友列表' % (len(alreadyParse)), '一共下载%d张用户头像' % (len(alreadyDownload)), sep=' ') -------------------------------------------------------------------------------- /FacePlus/Utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/Utils/__init__.py -------------------------------------------------------------------------------- /FacePlus/Utils/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2018-07-08T08:56:07.054Z] GhostDriver - Main - running on port 60551 2 | [INFO - 2018-07-08T08:56:11.586Z] Session [c3240470-828c-11e8-bf96-69a17efcd03f] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2018-07-08T08:56:11.586Z] Session [c3240470-828c-11e8-bf96-69a17efcd03f] - page.customHeaders: - {} 4 | [INFO - 2018-07-08T08:56:11.586Z] Session [c3240470-828c-11e8-bf96-69a17efcd03f] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-8.1-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2018-07-08T08:56:11.586Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: c3240470-828c-11e8-bf96-69a17efcd03f 6 | -------------------------------------------------------------------------------- /FacePlus/dataset/presidents/barak-obama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/dataset/presidents/barak-obama.jpg -------------------------------------------------------------------------------- /FacePlus/dataset/presidents/bill-clinton.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/dataset/presidents/bill-clinton.jpg -------------------------------------------------------------------------------- /FacePlus/dataset/presidents/george-h-bush.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/dataset/presidents/george-h-bush.jpg -------------------------------------------------------------------------------- /FacePlus/dataset/presidents/george-w-bush.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/dataset/presidents/george-w-bush.jpg -------------------------------------------------------------------------------- /FacePlus/dataset/presidents/jimmy-carter.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/dataset/presidents/jimmy-carter.jpg -------------------------------------------------------------------------------- /FacePlus/dataset/presidents/ronald-regan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/dataset/presidents/ronald-regan.jpg -------------------------------------------------------------------------------- /FacePlus/result/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/result/example.jpg -------------------------------------------------------------------------------- /FacePlus/result/example_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FacePlus/result/example_1.jpg -------------------------------------------------------------------------------- /FaceReplace/Lib/AGNModel.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created By ILMARE 3 | @Date 2019-3-6 4 | ''' 5 | import tensorflow as tf 6 | import numpy as np 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | import sklearn 9 | import cv2 10 | 11 | def xavier_init(input_count, output_count, constant=1): 12 | low = -constant * np.sqrt(6.0 / (input_count + output_count)) 13 | high = constant * np.sqrt(6.0 / (input_count + output_count)) 14 | return tf.random_uniform((input_count, output_count), 15 | minval=low, maxval=high, 16 | dtype=tf.float32) 17 | 18 | def standard_scale(X_train, X_test): 19 | pre = sklearn.preprocessing.StandardScaler().fit(X_train) 20 | return pre.transform(X_train), pre.transform(X_test) 21 | 22 | def get_random_block_from_data(data, batch_size): 23 | start_idx = np.random.randint(0, len(data) - batch_size) 24 | return data[start_idx: start_idx + batch_size, :] 25 | 26 | class AGNAutoEncoder: 27 | def __init__(self, input_size, hidden_size, 28 | transfer_function=tf.nn.softplus,learning_rate=0.001, 29 | scale=0.1, batch_size=128, max_step=20): 30 | self._learning_tate = learning_rate 31 | self._max_step = max_step 32 | self._batch_size = batch_size 33 | self._input_size = input_size 34 | self._hidden_size = hidden_size 35 | self._transfer = transfer_function 36 | self._scale = tf.placeholder(tf.float32) 37 | self._training_scale = scale 38 | self._weight = self.init_weights() 39 | self._sess = tf.Session() 40 | self.defineNetwork() 41 | def defineNetwork(self): 42 | self._x = tf.placeholder(shape=[None, self._input_size], dtype=tf.float32) 43 | self._hidden = self._transfer(tf.add(tf.matmul( 44 | self._x + self._training_scale * tf.random_normal((self._input_size, )), 45 | self._weight['w1']), self._weight['b1'])) 46 | self._reconstruction = tf.add(tf.matmul(self._hidden, 47 | self._weight['w2']), 
self._weight['b2']) 48 | self._loss = 0.5 * tf.reduce_sum(tf.pow( 49 | tf.subtract(self._reconstruction, self._x), 2.0)) 50 | self._optimizer = tf.train.AdamOptimizer(self._learning_tate).minimize(self._loss) 51 | def init_weights(self): 52 | all_weight = dict() 53 | all_weight['w1'] = tf.Variable(initial_value=xavier_init(self._input_size, 54 | self._hidden_size), dtype=tf.float32) 55 | all_weight['b1'] = tf.Variable(initial_value=tf.zeros(shape=[self._hidden_size], 56 | dtype=tf.float32)) 57 | all_weight['w2'] = tf.Variable(initial_value=xavier_init(self._hidden_size, 58 | self._input_size), dtype=tf.float32) 59 | all_weight['b2'] = tf.Variable(initial_value=tf.zeros(shape=[self._input_size], 60 | dtype=tf.float32)) 61 | return all_weight 62 | def part_fit(self, X): 63 | _loss, _ = self._sess.run([self._loss, self._optimizer], 64 | feed_dict={self._x: X, self._scale: self._training_scale}) 65 | return _loss 66 | def calculate_total_cost(self, X): 67 | _loss = self._sess.run([self._loss], 68 | feed_dict={self._x: X, self._scale: self._training_scale}) 69 | return _loss 70 | def transform(self, X): 71 | return self._sess.run(self._hidden, feed_dict={self._x: X, 72 | self._scale: self._training_scale}) 73 | def generate(self, hidden=None): 74 | if hidden is None: 75 | hidden = np.random.normal(size=self._weight['b1']) 76 | return self._sess.run(self._reconstruction, feed_dict={self._hidden: hidden}) 77 | def reconstrct(self, X): 78 | return self._sess.run(self._reconstruction, feed_dict={self._x: X, 79 | self._scale: self._training_scale}) 80 | def getWeight(self): 81 | return self._sess.run(self._weight['w1']) 82 | def getBiases(self): 83 | return self._sess.run(self._weight['b1']) 84 | def train(self): 85 | self._sess.run(tf.global_variables_initializer()) 86 | mnist = input_data.read_data_sets("/home/ilmare/dataSet/mnist", one_hot=True) 87 | train, _ = standard_scale(mnist.train.images, mnist.test.images) 88 | n_examples = int(mnist.train.num_examples) 89 | for idx in range(self._max_step): 90 | avg_cost = 0 91 | total_batch = n_examples // self._batch_size 92 | for batch in range(0, total_batch): 93 | train_tmp = get_random_block_from_data(train, self._batch_size) 94 | _cost = self.part_fit(train_tmp) 95 | avg_cost += _cost / n_examples * self._batch_size 96 | if (idx + 1) % 5 == 0: 97 | print("avg_cost: %.3f" % avg_cost, " step: ", idx) 98 | tf.train.Saver().save(self._sess, save_path="{0}autoencoder".format("/home/ilmare/Desktop/FaceReplace/model/")) 99 | def load_model(self): 100 | tf.train.Saver().restore(self._sess, tf.train.latest_checkpoint("/home/ilmare/Desktop/FaceReplace/model/")) 101 | mnist = input_data.read_data_sets("/home/ilmare/dataSet/mnist", one_hot=True) 102 | source = np.reshape(mnist.train.images[0], [1, 784]) 103 | dest = self.reconstrct(source) 104 | source = np.reshape(source, [28, 28]) 105 | dest = np.reshape(dest, [28, 28]) 106 | print(source.shape, dest.shape) 107 | # fig = plt.figure("test") 108 | # ax = fig.add_subplot(121) 109 | # ax.imshow(source) 110 | # bx = fig.add_subplot(122) 111 | # bx.imshow(dest) 112 | # plt.show() 113 | cv2.imshow("test", dest) 114 | cv2.waitKey(0) -------------------------------------------------------------------------------- /FaceReplace/Lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Lib/__init__.py 
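A minimal usage sketch for the `AGNAutoEncoder` defined in `AGNModel.py` above (this snippet is not a file in the repository: the 784/200 layer sizes and the random batch are purely illustrative, and the variables are initialized by hand because the class only does so inside `train()`, which is hard-coded to local MNIST paths):

```python
import numpy as np
import tensorflow as tf
from Lib.AGNModel import AGNAutoEncoder

if __name__ == "__main__":
    # an additive-Gaussian-noise autoencoder with 784 inputs and 200 hidden units
    encoder = AGNAutoEncoder(input_size=784, hidden_size=200,
                             learning_rate=0.001, scale=0.1,
                             batch_size=128, max_step=20)
    # train() normally runs the initializer; do it explicitly for a single step here
    encoder._sess.run(tf.global_variables_initializer())
    fake_batch = np.random.rand(128, 784).astype(np.float32)  # stand-in for real image rows
    loss = encoder.part_fit(fake_batch)  # one optimization step, returns the squared-error loss
    print("loss after one step: %.3f" % loss)
```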
-------------------------------------------------------------------------------- /FaceReplace/Lib/__pycache__/AutoEncoder.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Lib/__pycache__/AutoEncoder.cpython-35.pyc -------------------------------------------------------------------------------- /FaceReplace/Lib/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Lib/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /FaceReplace/Main/FaceAverage.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created By ILMARE 3 | @Date:2019-2-26 4 | ''' 5 | 6 | import os 7 | import cv2 8 | import numpy as np 9 | import math 10 | from matplotlib import pyplot as plt 11 | from dlib import get_frontal_face_detector as detector 12 | from dlib import shape_predictor as predictor 13 | 14 | def readImages(path): 15 | imagesArray = []; 16 | for filePath in os.listdir(path): 17 | if filePath.endswith(".jpg"): 18 | img = cv2.imread(os.path.join(path, filePath)); 19 | imagesArray.append(img); 20 | 21 | return imagesArray; 22 | 23 | def similarityTransform(inPoints, outPoints): 24 | s60 = math.sin(60 * math.pi / 180); 25 | c60 = math.cos(60 * math.pi / 180); 26 | 27 | inPts = np.copy(inPoints).tolist(); 28 | outPts = np.copy(outPoints).tolist(); 29 | 30 | xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]; 31 | yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]; 32 | 33 | inPts.append([np.int(xin), np.int(yin)]); 34 | 35 | xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]; 36 | yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]; 37 | 38 | outPts.append([np.int(xout), np.int(yout)]); 39 | 40 | return cv2.getAffineTransform(np.array(inPts, dtype=np.float32), np.array(outPts, dtype=np.float32)) 41 | 42 | def rectContains(rect, point): 43 | if point[0] < rect[0]: 44 | return False 45 | elif point[1] < rect[1]: 46 | return False 47 | elif point[0] > rect[2]: 48 | return False 49 | elif point[1] > rect[3]: 50 | return False 51 | return True 52 | 53 | 54 | def calculateDelaunayTriangles(rect, points): 55 | subdiv = cv2.Subdiv2D(rect); 56 | for p in points: 57 | subdiv.insert((p[0], p[1])); 58 | triangleList = subdiv.getTriangleList(); 59 | delaunayTri = [] 60 | 61 | for t in triangleList: 62 | pt = [] 63 | pt.append((t[0], t[1])) 64 | pt.append((t[2], t[3])) 65 | pt.append((t[4], t[5])) 66 | 67 | pt1 = (t[0], t[1]) 68 | pt2 = (t[2], t[3]) 69 | pt3 = (t[4], t[5]) 70 | 71 | if rectContains(rect, pt1) and rectContains(rect, pt2) and rectContains(rect, pt3): 72 | ind = [] 73 | for j in range(0, 3): 74 | for k in range(0, len(points)): 75 | if (abs(pt[j][0] - points[k][0]) < 1.0 and abs(pt[j][1] - points[k][1]) < 1.0): 76 | ind.append(k) 77 | if len(ind) == 3: 78 | delaunayTri.append((ind[0], ind[1], ind[2])) 79 | 80 | return delaunayTri 81 | 82 | 83 | def constrainPoint(p, w, h): 84 | p = (min(max(p[0], 0), w - 1), min(max(p[1], 0), h - 1)) 85 | return p; 86 | 87 | def applyAffineTransform(src, srcTri, 
dstTri, size): 88 | warpMat = cv2.getAffineTransform(np.float32(srcTri), np.float32(dstTri)) 89 | dst = cv2.warpAffine(src, warpMat, (size[0], size[1]), None, flags=cv2.INTER_LINEAR, 90 | borderMode=cv2.BORDER_REFLECT_101) 91 | return dst 92 | 93 | def warpTriangle(img1, img2, t1, t2): 94 | r1 = cv2.boundingRect(np.float32([t1])) 95 | r2 = cv2.boundingRect(np.float32([t2])) 96 | t1Rect = [] 97 | t2Rect = [] 98 | t2RectInt = [] 99 | 100 | for i in range(0, 3): 101 | t1Rect.append(((t1[i][0] - r1[0]), (t1[i][1] - r1[1]))) 102 | t2Rect.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1]))) 103 | t2RectInt.append(((t2[i][0] - r2[0]), (t2[i][1] - r2[1]))) 104 | mask = np.zeros((r2[3], r2[2], 3), dtype=np.float32) 105 | cv2.fillConvexPoly(mask, np.int32(t2RectInt), (1.0, 1.0, 1.0)); 106 | img1Rect = img1[r1[1]:r1[1] + r1[3], r1[0]:r1[0] + r1[2]] 107 | 108 | size = (r2[2], r2[3]) 109 | 110 | img2Rect = applyAffineTransform(img1Rect, t1Rect, t2Rect, size) 111 | 112 | img2Rect = img2Rect * mask 113 | img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] * ( 114 | (1.0, 1.0, 1.0) - mask) 115 | 116 | img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] = img2[r2[1]:r2[1] + r2[3], r2[0]:r2[0] + r2[2]] + img2Rect 117 | 118 | 119 | if __name__ == '__main__': 120 | 121 | path = '/home/ilmare/Desktop/FaceAverage/presidents/' 122 | w = 600; 123 | h = 600; 124 | allPoints = [] 125 | images = readImages(path); 126 | detector_obj = detector() 127 | predictor_obj = predictor(r"/home/ilmare/Desktop/FaceReplace/shape_predictor_68_face_landmarks.dat") 128 | fig = plt.figure("test") 129 | for img, idx in zip(images, range(len(images))): 130 | point = detector_obj(img, 1) 131 | marks = predictor_obj(img, point[0]).parts() 132 | tmp = [] 133 | for p in marks: 134 | tmp.append([p.x, p.y]) 135 | allPoints.append(tmp) 136 | eyecornerDst = [(np.int(0.3 * w), np.int(h / 3)), (np.int(0.7 * w), np.int(h / 3))]; 137 | 138 | imagesNorm = []; 139 | pointsNorm = []; 140 | 141 | boundaryPts = np.array( 142 | [(0, 0), (w / 2, 0), (w - 1, 0), (w - 1, h / 2), (w - 1, h - 1), (w / 2, h - 1), (0, h - 1), (0, h / 2)]); 143 | 144 | pointsAvg = np.array([(0, 0)] * (len(allPoints[0]) + len(boundaryPts)), np.float32()); 145 | 146 | n = len(allPoints[0]); 147 | 148 | numImages = len(images) 149 | 150 | for i in range(0, numImages): 151 | points1 = allPoints[i]; 152 | eyecornerSrc = [allPoints[i][36], allPoints[i][45]]; 153 | tform = similarityTransform(eyecornerSrc, eyecornerDst); 154 | print(tform) 155 | img = np.zeros((h, w, 3), dtype=np.float32) 156 | img = cv2.warpAffine(np.float32(images[i]) / 255.0, tform, (w, h)); 157 | points2 = np.reshape(np.array(points1), (68, 1, 2)); 158 | 159 | points = cv2.transform(points2, tform); 160 | 161 | points = np.float32(np.reshape(points, (68, 2))); 162 | 163 | points = np.append(points, boundaryPts, axis=0) 164 | pointsAvg = pointsAvg + points / numImages; 165 | 166 | pointsNorm.append(points); 167 | imagesNorm.append(img); 168 | 169 | rect = (0, 0, w, h); 170 | dt = calculateDelaunayTriangles(rect, np.array(pointsAvg)); 171 | 172 | output = np.zeros((h, w, 3), np.float32()); 173 | 174 | for i in range(0, len(imagesNorm)): 175 | img = np.zeros((h, w, 3), np.float32()); 176 | for j in range(0, len(dt)): 177 | tin = []; 178 | tout = []; 179 | 180 | for k in range(0, 3): 181 | pIn = pointsNorm[i][dt[j][k]]; 182 | pIn = constrainPoint(pIn, w, h); 183 | 184 | pOut = pointsAvg[dt[j][k]]; 185 | pOut = constrainPoint(pOut, w, h); 186 | 187 | tin.append(pIn); 188 | 
tout.append(pOut); 189 | 190 | warpTriangle(imagesNorm[i], img, tin, tout); 191 | 192 | output = output + img; 193 | output = output / numImages; 194 | cv2.imshow('image', output); 195 | cv2.waitKey(0); -------------------------------------------------------------------------------- /FaceReplace/Main/FaceReplace.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created By ilmare 3 | @date 2019-2-25 4 | ''' 5 | 6 | import sys 7 | import os 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from Tools.Detector import PhotoParser 12 | from Lib.AutoEncoder import AutoEncoder 13 | 14 | modelFile = r"/home/ilmare/Desktop/FaceReplace/shape_predictor_68_face_landmarks.dat" 15 | 16 | if __name__ == "__main__": 17 | videoPath1 = r"/home/yanghang/faceswap/video/source.mp4" 18 | videoPath2 = r"/home/yanghang/faceswap/video-1/source.mp4" 19 | parser1 = PhotoParser(videoPath1, modelFile, (128, 128)) 20 | parser2 = PhotoParser(videoPath2, modelFile, (128, 128)) 21 | obj = AutoEncoder(0.005, 400, 1, "/disk/model/", 3) 22 | obj.train(parser1.trainImagePath, parser2.trainImagePath) -------------------------------------------------------------------------------- /FaceReplace/Main/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Main/__init__.py -------------------------------------------------------------------------------- /FaceReplace/README.md: -------------------------------------------------------------------------------- 1 | # FaceReplace (Face Swap) 2 | 3 | The original motivation for this project came three months ago, when I saw online that the face of Huang Rong, played by Athena Chu in the 1994 version of "The Legend of the Condor Heroes", had been swapped with Yang Mi's. I have to say the AI-generated face of Yang Mi looked nearly perfect in the video, in terms of expression, lighting and how well it blended with the original face. 4 | 5 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-3.jpg) 6 | 7 | After watching that video, besides marveling at how fast AI technology is advancing, I was also struck by how well today's engineers can apply it; I would never in a million years have thought of using a neural network for face swapping. My first intuition was that AI face swapping was built on a `GAN`, but after some searching I found it was done with the `deepfakes` tool, which was extremely popular when it first appeared; many people used it to make celebrity face-swap videos whose content was, let's say, unmentionable. Although `deepfakes` stirred up quite a bit of controversy, the technology itself is blameless. Combining material found online with my own analysis, I discovered that the earliest version of `deepfakes` was not based on a `GAN` but on a relatively simple model, the `AutoEncoder`. That made me very happy: my current hardware could never train a network like a `GAN`, but running an `AutoEncoder` is no problem, so I set out to reproduce the project. 8 | 9 | ## Obtaining the Training Data 10 | 11 | I chose the Douyu streamer Liu Fei'er as the person whose face would be replaced and Liu Yifei, a star I am quite fond of, as the provider of the target face. First I downloaded one of Liu Fei'er's live-stream videos and used a small script to split it into individual frames: 12 | 13 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-2.jpg) 14 | 15 | Of course, many of these raw frames contain no face at all or only a blurry one; to keep training stable I removed all of them from the data set. The next step was to crop the faces out of the remaining frames, which I did with the `dlib` framework, extracting faces from several thousand raw images to build the training set: 16 | 17 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-4.jpg) 18 | 19 | ## Model Structure 20 | 21 | The model itself is very simple: a slightly modified autoencoder, expressed mathematically as follows: 22 | 23 | ```shell 24 | A'=Encoder1(A) 25 | B'=Encoder2(B) 26 | A''=Decoder(A') 27 | B''=Decoder(B') 28 | ``` 29 | 30 | What sets this model apart from an ordinary autoencoder is that it consists of two different encoders and a single shared decoder. The two encoders encode face A and face B separately, each extracting the full set of features of its own face, and the decoder reconstructs a face from whichever feature vector it is given. Sharing one decoder between the two encoders forces it to be able to reconstruct a face from the features produced by either encoder. Put informally, it is like asking a painter to paint face A until he has "memorized" every detail of it, then asking him to paint face B until he has "memorized" all of its details too. In the end, whichever face he paints will carry the features and details of the other one. 31 | 32 | The concrete network structure is as follows: 33 | 34 | ```Python 35 | Encoder: 64x64x3->8x8x512 36 | x = input_ 37 | x = conv(128)(x) 38 | x = conv(256)(x) 39 | x = conv(512)(x) 40 | x = conv(1024)(x) 41 | x = Dense(ENCODER_DIM)(Flatten()(x)) 42 | x = Dense(4 * 4 * 1024)(x) 43 | x = Reshape((4, 4, 1024))(x) 44 | x = upscale(512)(x) 45 | 46 | Decoder:8x8x512->64x64x3 47 | x = input_ 48 | x = upscale(256)(x) 49 | x = upscale(128)(x) 50 | x = upscale(64)(x) 51 | x = Conv2D(3,
kernel_size=5, padding='same', activation='sigmoid')(x) 52 | ``` 53 | 54 | ## Training Trick 55 | 56 | Training directly on the raw data without any preprocessing gives unsatisfactory results. A small training trick is to apply a certain amount of transformation and distortion to the faces in the data set, which teaches the autoencoder to generalize better; for example: 57 | 58 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-7.jpg) 59 | 60 | Slightly warped images like these improve the model's ability to generalize. 61 | 62 | ## Results 63 | 64 | Since the lab server has no GPU, training took quite a long time, and the final result can only be called barely passable: 65 | 66 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-8.jpg) 67 | 68 | Fortunately, Baidu image search gave me a little confidence back (laughing through tears): 69 | 70 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-9.jpg) 71 | 72 | The next step is to paste the AI-generated face back onto the original frame, which is another big piece of work; I will write up the details once it is finished. 73 | -------------------------------------------------------------------------------- /FaceReplace/Tools/DataObject.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created By ILMARE 3 | @Date 2019-3-3 4 | ''' 5 | import re 6 | import os 7 | import numpy as np 8 | import cv2 9 | from matplotlib import pyplot as plt 10 | 11 | random_transform_args = { 12 | 'rotation_range': 10, 13 | 'zoom_range': 0.05, 14 | 'shift_range': 0.05, 15 | 'random_flip': 0.4, 16 | } 17 | 18 | def umeyama(src, dst, estimate_scale): 19 | num = src.shape[0] 20 | dim = src.shape[1] 21 | src_mean = src.mean(axis=0) 22 | dst_mean = dst.mean(axis=0) 23 | src_demean = src - src_mean 24 | dst_demean = dst - dst_mean 25 | A = np.dot(dst_demean.T, src_demean) / num 26 | d = np.ones((dim,), dtype=np.double) 27 | if np.linalg.det(A) < 0: 28 | d[dim - 1] = -1 29 | T = np.eye(dim + 1, dtype=np.double) 30 | U, S, V = np.linalg.svd(A) 31 | rank = np.linalg.matrix_rank(A) 32 | if rank == 0: 33 | return np.nan * T 34 | elif rank == dim - 1: 35 | if np.linalg.det(U) * np.linalg.det(V) > 0: 36 | T[:dim, :dim] = np.dot(U, V) 37 | else: 38 | s = d[dim - 1] 39 | d[dim - 1] = -1 40 | T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V)) 41 | d[dim - 1] = s 42 | else: 43 | T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V.T)) 44 | if estimate_scale: 45 | scale = 1.0 / src_demean.var(axis=0).sum() * np.dot(S, d) 46 | else: 47 | scale = 1.0 48 | T[:dim, dim] = dst_mean - scale * np.dot(T[:dim, :dim], src_mean.T) 49 | T[:dim, :dim] *= scale 50 | return T 51 | 52 | def random_transform(image, rotation_range, zoom_range, shift_range, random_flip): 53 | h, w = image.shape[0:2] 54 | rotation = np.random.uniform(-rotation_range, rotation_range) 55 | scale = np.random.uniform(1 - zoom_range, 1 + zoom_range) 56 | tx = np.random.uniform(-shift_range, shift_range) * w 57 | ty = np.random.uniform(-shift_range, shift_range) * h 58 | mat = cv2.getRotationMatrix2D((w // 2, h // 2), rotation, scale) 59 | mat[:, 2] += (tx, ty) 60 | result = cv2.warpAffine(image, mat, (w, h), borderMode=cv2.BORDER_REPLICATE) 61 | if np.random.random() < random_flip: 62 | result = result[:, ::-1] 63 | return result 64 | 65 | def random_warp(image): 66 | assert image.shape == (128, 128, 3) 67 | range_ = np.linspace(64 - 64, 64 + 64, 9) 68 | mapx = np.broadcast_to(range_, (9, 9)) 69 | mapy = mapx.T 70 | mapx = mapx + np.random.normal(size=(9, 9), scale=2.5) 71 | mapy = mapy + np.random.normal(size=(9, 9), scale=2.5) 72 | interp_mapx = cv2.resize(mapx, (80, 80))[8:72, 8:72].astype('float32') 73 | interp_mapy = cv2.resize(mapy, (80, 80))[8:72, 8:72].astype('float32') 74 | warped_image = cv2.remap(image, interp_mapx, interp_mapy, cv2.INTER_LINEAR) 75 | src_points = np.stack([mapx.ravel(), mapy.ravel()], axis=-1) 76 | dst_points = np.mgrid[0:65:8, 0:65:8].T.reshape(-1, 2) 
77 | mat = umeyama(src_points, dst_points, True)[0:2] 78 | target_image = cv2.warpAffine(image, mat, (64, 64)) 79 | return warped_image, target_image 80 | 81 | 82 | def get_training_data(images, batch_size): 83 | indices = np.random.randint(len(images), size=batch_size) 84 | for i, index in enumerate(indices): 85 | image = images[index] 86 | image = random_transform(image, **random_transform_args) 87 | warped_img, target_img = random_warp(image) 88 | if i == 0: 89 | warped_images = np.empty((batch_size,) + warped_img.shape, warped_img.dtype) 90 | target_images = np.empty((batch_size,) + target_img.shape, warped_img.dtype) 91 | warped_images[i] = warped_img 92 | target_images[i] = target_img 93 | return warped_images, target_images 94 | 95 | class ImageTrainObject: 96 | def __init__(self, filePath, batchSize): 97 | self._filePath = filePath 98 | self._batchSize = batchSize 99 | # if re.match(r"^/.+/[^.]+$", self._filePath) is None: 100 | # raise Exception("filePath is invalid") 101 | if self._filePath[len(self._filePath) - 1] != '/': 102 | self._filePath += '/' 103 | self._fileItems = os.listdir(self._filePath) 104 | if batchSize >= self.DataCount: 105 | raise Exception("Too big batchSize") 106 | @property 107 | def DataCount(self): 108 | return len(self._fileItems) 109 | def generateBatch(self): 110 | beginIdx = np.random.randint(0, self.DataCount - self._batchSize) 111 | destFile = self._fileItems[beginIdx: beginIdx + self._batchSize] 112 | return_mat = [] 113 | for file in destFile: 114 | img = cv2.imread("{0}{1}".format(self._filePath, file)) 115 | return_mat.append(img) 116 | return get_training_data(np.array(return_mat, dtype=np.uint8), self._batchSize) 117 | 118 | if __name__ == "__main__": 119 | img = cv2.imread(r"F:\tensorflow\automodel\scrawler\video\trainImg\18.jpg") 120 | img = np.array([img], dtype=np.uint8) 121 | warp, target = get_training_data(img, 1) 122 | fig = plt.figure("compare") 123 | ax = fig.add_subplot(121) 124 | b, g, r = cv2.split(warp[0]) 125 | source = cv2.merge([r, g, b]) 126 | ax.imshow(source) 127 | ax.axis("off") 128 | bx = fig.add_subplot(122) 129 | bx.axis("off") 130 | b, g, r = cv2.split(target[0]) 131 | dest = cv2.merge([r, g, b]) 132 | bx.imshow(dest) 133 | plt.show() 134 | # filePath = r"F:/tensorflow/automodel/scrawler/video/trainImg/" 135 | # batchSize = 64 136 | # obj = ImageTrainObject(filePath, batchSize) 137 | # obj.generateBatch() 138 | # print(obj.DataCount) -------------------------------------------------------------------------------- /FaceReplace/Tools/Detector.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created By ILMARE 3 | @Date 2019-3-2 4 | ''' 5 | 6 | # from dlib import shape_predictor as predictor 7 | # from dlib import get_frontal_face_detector as detector 8 | import cv2 9 | import os 10 | from matplotlib import pyplot as plt 11 | import math 12 | import numpy as np 13 | import re 14 | import datetime 15 | 16 | modelFile = r"/home/ilmare/Desktop/FaceReplace/shape_predictor_68_face_landmarks.dat" 17 | 18 | def transformationFormPoints(sourcePoints, destPoints): 19 | sourcePoints = np.asmatrix(sourcePoints, dtype=np.float32) 20 | destPoints = np.asmatrix(destPoints, dtype=np.float32) 21 | sourceMean = np.mean(sourcePoints, 0) 22 | destMean = np.mean(destPoints, 0) 23 | sourcePoints -= sourceMean 24 | destPoints -= destMean 25 | sourceStd = np.std(sourcePoints) 26 | destStd = np.std(destPoints) 27 | sourcePoints /= sourceStd 28 | destPoints /= destStd 29 | U, S, Vt = 
np.linalg.svd(destPoints.T * sourcePoints) 30 | R = (U * Vt).T 31 | return np.vstack([np.hstack(((sourceStd / destStd) * R, 32 | sourceMean.T - (sourceStd / destStd) * R * destMean.T)), 33 | np.matrix([0., 0., 1.])]) 34 | 35 | def similarityTransform(inPoints, outPoints): 36 | s60 = math.sin(60 * math.pi / 180); 37 | c60 = math.cos(60 * math.pi / 180); 38 | 39 | inPts = np.copy(inPoints).tolist(); 40 | outPts = np.copy(outPoints).tolist(); 41 | 42 | xin = c60 * (inPts[0][0] - inPts[1][0]) - s60 * (inPts[0][1] - inPts[1][1]) + inPts[1][0]; 43 | yin = s60 * (inPts[0][0] - inPts[1][0]) + c60 * (inPts[0][1] - inPts[1][1]) + inPts[1][1]; 44 | 45 | inPts.append([np.int(xin), np.int(yin)]); 46 | 47 | xout = c60 * (outPts[0][0] - outPts[1][0]) - s60 * (outPts[0][1] - outPts[1][1]) + outPts[1][0]; 48 | yout = s60 * (outPts[0][0] - outPts[1][0]) + c60 * (outPts[0][1] - outPts[1][1]) + outPts[1][1]; 49 | 50 | outPts.append([np.int(xout), np.int(yout)]); 51 | 52 | return cv2.getAffineTransform(np.array(inPts, dtype=np.float32), np.array(outPts, dtype=np.float32)) 53 | 54 | class PhotoParser: 55 | def __init__(self, videoPath, modelFile, destShape): 56 | self._modelFile = modelFile 57 | self._videoPath = videoPath 58 | res = re.match(r"^/.+/\w+\.\w+$", self._videoPath) 59 | if res is None: 60 | raise Exception("video path is invalid") 61 | res = re.search(r"/.+/(?=\w+\.\w+)", self._videoPath) 62 | self._savePath = "{0}{1}/".format(res.group(), "parseImg") 63 | if not os.path.exists(self._savePath): 64 | os.mkdir(self._savePath) 65 | self._trainPath = "{0}{1}/".format(res.group(), "trainImg") 66 | if not os.path.exists(self._trainPath): 67 | os.mkdir(self._trainPath) 68 | self._destShape = destShape 69 | self._photoCount = 0 70 | @property 71 | def trainImagePath(self): 72 | return self._trainPath 73 | def getPhotoFromVideo(self): 74 | vc = cv2.VideoCapture(self._videoPath) 75 | while True: 76 | rval, frame = vc.read() 77 | if rval: 78 | cv2.imwrite("{0}{1}.jpg".format(self._savePath, self._photoCount), frame) 79 | self._photoCount += 1 80 | if (self._photoCount % 100) == 0: 81 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "- Total Parsed Photo:", self._photoCount) 82 | else: 83 | break 84 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "- Total Parsed Photo: ", self._photoCount) 85 | vc.release() 86 | # def detectorPhotoFace(self): 87 | # if len(os.listdir(self._savePath)) == 0: 88 | # raise Exception("There is no photo at {0}".format(self._savePath)) 89 | # detectorObj = detector() 90 | # predictorObj = predictor(self._modelFile) 91 | # imageShape = (640, 360) 92 | # destEyePoint = [(316, 92), (385, 92)] 93 | # try: 94 | # fileList = os.listdir(self._savePath) 95 | # for file, idx in zip(fileList, range(len(fileList))): 96 | # filePath = "{0}{1}".format(self._savePath, file) 97 | # img = cv2.imread(filePath) 98 | # rects = detectorObj(img, 1) 99 | # if len(rects) > 1 or len(rects) == 0: 100 | # print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "-", filePath) 101 | # continue 102 | # img = self.__warpPhoto(predictorObj, destEyePoint, imageShape, img, rects[0]) 103 | # rects = detectorObj(img, 1) 104 | # if len(rects) > 1 or len(rects) == 0: 105 | # print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "-", filePath) 106 | # continue 107 | # rect = rects[0] 108 | # left, top, width, height = rect.left(), rect.top(), rect.right() - rect.left(), rect.bottom() - rect.top() 109 | # img = img[top:top + height, left:left + width, :] 110 | # img = 
self.__resizePhoto(img) 111 | # cv2.imwrite("{0}{1}.jpg".format(self._trainPath, idx), img) 112 | # if (idx % 100) == 0: 113 | # print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "- current index: ", idx) 114 | # except Exception as e: 115 | # print(e) 116 | def __warpPhoto(self, predictorObj , destEyePoint, imageShape, img, rect): 117 | points = predictorObj(img, rect) 118 | inPts = [(points.parts()[36].x, points.parts()[36].y), (points.parts()[45].x, points.parts()[45].y)] 119 | warpMatrix = similarityTransform(inPts, destEyePoint) 120 | return cv2.warpAffine(img, warpMatrix, dsize=imageShape) 121 | def __resizePhoto(self, img): 122 | try: 123 | height = img.shape[0] 124 | width = img.shape[1] 125 | interval = abs(width - height) 126 | margin = interval // 2 127 | if width > height: 128 | if (interval % 2) == 0: 129 | img = img[:, margin: width - margin, :] 130 | else: 131 | img = img[:, margin + 1: width - margin, :] 132 | elif height > width: 133 | if (interval % 2) == 0: 134 | img = img[margin: height - margin, :, :] 135 | else: 136 | img = img[margin + 1: height - margin, :, :] 137 | return cv2.resize(img, self._destShape) 138 | except Exception as e: 139 | print(e) 140 | return None 141 | 142 | 143 | 144 | if __name__ == "__main__": 145 | videoPath = r"F:/tensorflow/automodel/scrawler/video-1/dest2.mp4" 146 | obj = PhotoParser(videoPath, modelFile, (128, 128)) 147 | obj.getPhotoFromVideo() 148 | 149 | -------------------------------------------------------------------------------- /FaceReplace/Tools/PhotoScrawler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created By ILMARE 3 | @Date 2019-3-1 4 | ''' 5 | 6 | from bs4 import BeautifulSoup 7 | from urllib.request import urlretrieve 8 | import requests 9 | from PIL import Image 10 | import os 11 | import re 12 | 13 | totalCount = 0 14 | pre_path = r"/home/ilmare/Desktop/FaceReplace/data/image/" 15 | 16 | class PhotoScrawler: 17 | def __init__(self, savePath, destUrl, maxPage): 18 | self._savePath = savePath 19 | self._destUrl = destUrl 20 | self._maxPage = maxPage 21 | def get_title_list(self, destUrl, pageNum=0): 22 | Url = "{0}&ie=utf-8&pn={1}".format(destUrl, pageNum * 50) 23 | print("Parsing page: ", Url) 24 | try: 25 | resp = requests.get(Url) 26 | bsObj = BeautifulSoup(resp.text, "html.parser") 27 | elts = bsObj.find_all("li", {"class": ["j_thread_list", "clearfix"]}) 28 | print(len(elts)) 29 | return_mat = [] 30 | for elt in elts: 31 | repNum = int(elt.find("span", {"class": "threadlist_rep_num center_text"}).text) 32 | a = elt.find("a", {"class": "j_th_tit"}) 33 | link = a.attrs.get("href") 34 | title = a.attrs.get("title") 35 | return_mat.append((title, "{0}{1}".format("http://tieba.baidu.com", link), repNum)) 36 | return return_mat 37 | except Exception as e: 38 | print(e) 39 | return None 40 | def parse_page(self, fronted_Url, pageNum=1): 41 | Url = "{0}?pn={1}".format(fronted_Url, pageNum) 42 | global totalCount 43 | try: 44 | resp = requests.get(Url) 45 | bsObj = BeautifulSoup(resp.text, "html.parser") 46 | ul = bsObj.find("ul", {"class": "l_posts_num"}) 47 | totalPage = int(ul.find("li", {"class": "l_reply_num"}).find_all("span", {"class": "red"})[1].text) 48 | print("----", "Parsing page: ", Url, ", pageNum: ", pageNum, ", totalPage: ", totalPage) 49 | elts = bsObj.find_all("div", {"class": ["l_post", "j_l_post", "l_post_bright", "noborder"]}) 50 | for elt, idx in zip(elts, range(len(elts))): 51 | div = elt.find("div", {"class": "d_post_content 
j_d_post_content clearfix"}) 52 | imgs = div.find_all("img") 53 | if imgs is not None: 54 | for img in imgs: 55 | src = img.attrs.get("src") 56 | res = re.match(r"^http.*/(image_emoticon)[0-9]+.(png|jpg|jpeg|gif)$", src) 57 | if res is None: 58 | ret = re.search(r"(?<=\.)(png|jpg|jpeg|gif)$", src) 59 | format = None 60 | if ret is not None: 61 | format = ret.group() 62 | if format is None: 63 | urlretrieve(src, "{0}{1}".format(pre_path, totalCount)) 64 | img = Image.open("{0}{1}".format(pre_path, totalCount)) 65 | format = img.format 66 | img.save("{0}{1}.{2}".format(pre_path, totalCount, format.lower())) 67 | os.remove("{0}{1}".format(pre_path, totalCount)) 68 | print("-------- ", idx, ": ", src) 69 | else: 70 | urlretrieve(src, "{0}{1}.{2}".format(pre_path, totalCount, format)) 71 | print("-------- ", idx, ": ", src, "format: ", format) 72 | totalCount += 1 73 | except Exception as e: 74 | print(e) 75 | finally: 76 | if pageNum < totalPage: 77 | self.parse_page(fronted_Url, pageNum + 1) 78 | else: 79 | return 80 | def get_photo_from_tieba(self): 81 | for i in range(self._maxPage): 82 | return_mat = self.get_title_list(self._destUrl, i) 83 | if return_mat is None: 84 | continue 85 | for (title, link, repNum), page in zip(return_mat, range(len(return_mat))): 86 | if repNum <= 3000: 87 | print("===>", title, ", current page: ", i + 1, ", current item: ", page) 88 | self.parse_page(link) 89 | 90 | if __name__ == "__main__": 91 | obj = PhotoScrawler(pre_path, 92 | "http://tieba.baidu.com/f?kw=%E6%9D%A8%E5%B9%82", 1) 93 | obj.get_photo_from_tieba() -------------------------------------------------------------------------------- /FaceReplace/Tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Tools/__init__.py -------------------------------------------------------------------------------- /FaceReplace/Tools/__pycache__/DataObject.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Tools/__pycache__/DataObject.cpython-35.pyc -------------------------------------------------------------------------------- /FaceReplace/Tools/__pycache__/DataObject.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Tools/__pycache__/DataObject.cpython-36.pyc -------------------------------------------------------------------------------- /FaceReplace/Tools/__pycache__/Detector.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Tools/__pycache__/Detector.cpython-35.pyc -------------------------------------------------------------------------------- /FaceReplace/Tools/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Tools/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- 
/FaceReplace/Tools/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/FaceReplace/Tools/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /KNN/Lib/KNNLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018-03-28 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | import os 8 | 9 | def classify0(intX:"the sample to be classified", dataSet:"training data set", labels:"labels of the data set", k:"number of neighbours")->tuple: 10 | dataSetSize = dataSet.shape[0] 11 | diffMat = np.tile(intX, (dataSetSize, 1)) - dataSet 12 | sqDiffMat = diffMat ** 2 13 | sqDistance = sqDiffMat.sum(axis=1) 14 | distance = sqDistance ** 0.5 15 | sortedDistanceIndex = distance.argsort() 16 | classCount = {} 17 | for i in range(k): 18 | votelabel = labels[sortedDistanceIndex[i]] 19 | classCount[votelabel] = classCount.get(votelabel, 0) + 1 20 | tmp_count = -1 21 | tmp_flag = "-1" 22 | for item in classCount.items(): 23 | if tmp_count < item[1]: 24 | tmp_flag = item[0] 25 | tmp_count = item[1] 26 | return tmp_flag if tmp_flag != "-1" else None 27 | 28 | def file2matrix(filename):# read the data set from a tab-separated text file 29 | try: 30 | fp = open(filename, "r") 31 | arrayLine = fp.readlines() 32 | numberOfLine = len(arrayLine) 33 | returnMat = np.zeros((numberOfLine, 3)) 34 | classLabelVector = [] 35 | index = 0 36 | for line in arrayLine: 37 | line = line.strip() 38 | listFromLine = line.split("\t") 39 | returnMat[index, :] = listFromLine[0:3] 40 | classLabelVector.append(int(listFromLine[-1])) 41 | index += 1 42 | return returnMat, classLabelVector 43 | except Exception as e: 44 | print(e) 45 | finally: 46 | fp.close() 47 | 48 | def autoNormal(dataSet):# min-max normalize the data 49 | minVal = dataSet.min(0) 50 | maxVal = dataSet.max(0) 51 | rangeVal = maxVal - minVal 52 | normalDataSet = np.zeros(dataSet.shape, dtype=float) 53 | m = dataSet.shape[0] 54 | normalDataSet = dataSet - np.tile(minVal, (m, 1)) 55 | normalDataSet /= np.tile(rangeVal, (m, 1)) 56 | return normalDataSet, rangeVal, minVal -------------------------------------------------------------------------------- /KNN/Lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/KNN/Lib/__init__.py -------------------------------------------------------------------------------- /KNN/Lib/__pycache__/KNNLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/KNN/Lib/__pycache__/KNNLib.cpython-36.pyc -------------------------------------------------------------------------------- /KNN/Lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/KNN/Lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /KNN/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/KNN/README.md
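A minimal usage sketch of `classify0` and `autoNormal` from `KNNLib.py` above, imported the same way `KNNUnit.py` imports the library (this snippet is not a repository file; the four sample points, their labels and the query are invented for illustration):

```python
import numpy as np
import Lib.KNNLib as kNN

dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0.0, 0.0], [0.0, 0.1]])
labels = ["A", "A", "B", "B"]
normalSet, rangeVal, minVal = kNN.autoNormal(dataSet)   # scale every feature into [0, 1]
query = (np.array([0.2, 0.1]) - minVal) / rangeVal      # normalize the query the same way
print(kNN.classify0(query, normalSet, labels, 3))       # majority vote of the 3 nearest -> "B"
```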
-------------------------------------------------------------------------------- /KNN/Unit/KNNUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | from urllib.request import urlopen 7 | from PIL import Image 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import os 11 | import Lib.KNNLib as kNN 12 | 13 | default_path = r"g:/machine/kNN_dataSet/" 14 | dest_vertical_dim = 18 15 | def getToken(): 16 | filePath = default_path + "token" 17 | for i in range(100): 18 | try: 19 | resp = urlopen("http://jiaowu.swjtu.edu.cn/servlet/GetRandomNumberToJPEG", timeout=10) 20 | fp = open("{0}/{1:d}.jpg".format(filePath, i), "wb") 21 | fp.write(resp.read()) 22 | except Exception as e: 23 | print(e) 24 | finally: 25 | fp.close() 26 | 27 | def parseImg(): 28 | filePath = default_path + "token" 29 | for filename in os.listdir(filePath): 30 | print("processing the {0}".format(filename)) 31 | img = Image.open(filePath + "/" + filename) 32 | img = img.convert("L") 33 | img = img.point(lambda i : 255 if i > 110 else 0) 34 | img = img.crop((1, 1, 55, 22)) 35 | img.save("{0}new/{1}".format(default_path, filename.split(".")[0] + ".png")) 36 | 37 | def crop(img, rect):#剪裁图片 38 | size = img.shape 39 | left = rect[0] 40 | top = rect[1] 41 | width = rect[2] 42 | height = rect[3] 43 | if (left + width) > size[1]: 44 | return None 45 | elif (top + height) > size[0]: 46 | return None; 47 | return img[top:(top + height), left:(left + width)] 48 | 49 | def getAlphabet(): 50 | filePath = default_path + "new" 51 | fileNames = os.listdir(filePath) 52 | print(len(fileNames)) 53 | for j in range(len(fileNames)): 54 | name = fileNames[j] 55 | if name == ".DS_Store": 56 | continue 57 | img = Image.open(filePath + "/" + name) 58 | img = np.asarray(img) 59 | size = img.shape 60 | print("processing the image {0}".format(name)) 61 | del_lst = [] 62 | for i in range(size[1]): 63 | tmp_lst = img[:, i] 64 | if tmp_lst.sum() / (size[0] * 255) >= 0.9: 65 | if len(del_lst) != 0 and i == (del_lst[-1] + 1): 66 | del_lst.remove(i - 1) 67 | del_lst.append(i) 68 | else: 69 | del_lst.append(i) 70 | if len(del_lst) == 5: 71 | break 72 | for i in range(len(del_lst) - 1): 73 | index = del_lst[i] 74 | width = del_lst[i + 1] - index 75 | sub_img = crop(img, (index, 0, width, size[0])) 76 | sub_img = Image.fromarray(resize_pic(sub_img, dest_vertical_dim)) 77 | sub_img = sub_img.convert("L") 78 | sub_img.save(r"{0}/{2}_{1}_{3}.png".format(default_path + "alphabet", name.split(".")[0], name.split(".")[0][i], i)) 79 | 80 | def resize_pic(img, dim): 81 | size = img.shape 82 | if size[1] < dim: 83 | range_val = dim - size[1] 84 | left = 0 85 | right = 0 86 | if range_val % 2 == 0: 87 | left = right = range_val // 2 88 | else: 89 | left = range_val // 2 + 1 90 | right = left - 1 91 | tmp_matrix = np.zeros((21, left + right + size[1])) 92 | for i in range(tmp_matrix.shape[1]): 93 | if i <= (left - 1): 94 | tmp_matrix[:, i] = np.tile(255, size[0]) 95 | elif i > (left - 1) and i < (left + size[1]): 96 | tmp_matrix[:, i] = img[:, i - left] 97 | else: 98 | tmp_matrix[:, i] = np.tile(255, size[0]) 99 | return tmp_matrix 100 | else: 101 | range_val = size[1] - dim 102 | left = 0 103 | right = 0 104 | if range_val % 2 == 0: 105 | left = right = range_val // 2 106 | else: 107 | left = range_val // 2 + 1 108 | right = left - 1 109 | tmp_matrix = crop(img, (left, 0, dim, 21)) 110 | return tmp_matrix 111 | 112 | def analyzeImg(filename=default_path 
+ "new\ACSS.png"): 113 | img = Image.open(filename) 114 | img = np.asarray(img) 115 | print(img.shape) 116 | size = img.shape 117 | tmp = [] 118 | del_lst = [] 119 | for i in range(size[1]): 120 | tmp_lst = img[:, i] 121 | tmp.append(tmp_lst.sum() / size[0]) 122 | if tmp_lst.sum() / (size[0] * 255) >= 0.9: 123 | if len(del_lst) != 0 and i == (del_lst[-1] + 1): 124 | del_lst.remove(i - 1) 125 | del_lst.append(i) 126 | else: 127 | del_lst.append(i) 128 | if len(del_lst) == 5: 129 | break 130 | fig = plt.figure("test") 131 | ax = fig.add_subplot(321) 132 | ax.imshow(img) 133 | for item in del_lst: 134 | ax.plot([item,item], [0, size[0]]) 135 | cx = fig.add_subplot(322) 136 | x = np.arange(0, len(tmp)) 137 | cx.plot(x, tmp) 138 | for i in range(len(del_lst) - 1): 139 | index = del_lst[i] 140 | width = del_lst[i + 1] - index 141 | temp = 323 + i 142 | dx = fig.add_subplot(temp) 143 | sub_img = crop(img, (index, 0, width, size[0])) 144 | dx.imshow(sub_img) 145 | plt.show() 146 | 147 | def resizeTheImg():#该方法废弃不用,被resize_pic代替 148 | fileName = default_path + "alphabet" 149 | for name in os.listdir(fileName): 150 | filePath = fileName + "/" + name 151 | img = Image.open(filePath) 152 | img = np.asarray(img) 153 | print("resizing the picture {0}, ".format(name), end="") 154 | size = img.shape 155 | if size[1] < dest_vertical_dim: 156 | range_val = dest_vertical_dim - size[1] 157 | left = 0 158 | right = 0 159 | if range_val % 2 == 0: 160 | left = right = range_val // 2 161 | else: 162 | left = range_val // 2 + 1 163 | right = left - 1 164 | tmp_matrix = np.zeros((21, left + right + size[1])) 165 | for i in range(tmp_matrix.shape[1]): 166 | if i <= (left - 1): 167 | tmp_matrix[:, i] = np.tile(255, size[0]) 168 | elif i > (left - 1) and i < (left + size[1]): 169 | tmp_matrix[:, i] = img[:, i - left] 170 | else: 171 | tmp_matrix[:, i] = np.tile(255, size[0]) 172 | img = Image.fromarray(tmp_matrix) 173 | img = img.convert("L") 174 | print("the picture size is {0}, {1}".format(img.size[0], img.size[1])) 175 | img.save(filePath) 176 | else: 177 | range_val = size[1] - dest_vertical_dim 178 | left = 0 179 | right = 0 180 | if range_val % 2 == 0: 181 | left = right = range_val // 2 182 | else: 183 | left = range_val // 2 + 1 184 | right = left - 1 185 | tmp_matrix = crop(img, (left, 0, dest_vertical_dim, 21)) 186 | img = Image.fromarray(tmp_matrix) 187 | img = img.convert("L") 188 | print("the picture size is {0}, {1}".format(img.size[0], img.size[1])) 189 | img.save(filePath) 190 | 191 | def readDataSet():#从本地读出数据集 192 | filePath = default_path + "alphabet" 193 | dirPath = os.listdir(filePath) 194 | m = len(dirPath) 195 | returnMat = np.zeros((m, dest_vertical_dim * 21)) 196 | labels = [] 197 | for i in range(m): 198 | name = dirPath[i] 199 | img = Image.open(filePath + "/" + name) 200 | matrix = np.asarray(img) 201 | tmp_matrix = np.zeros((1, dest_vertical_dim * 21)) 202 | for j in range(matrix.shape[0]): 203 | for k in range(matrix.shape[1]): 204 | tmp_matrix[0, j * dest_vertical_dim + k] = matrix[j, k] 205 | returnMat[i, :] = tmp_matrix 206 | labels.append(name.split("_")[0]) 207 | return returnMat, labels 208 | 209 | def kNNidentify(dataSet, labels, filename=default_path + "834.jpg", k=10):#传入数据集,标签以及图片名称 210 | img = Image.open(filename) 211 | img = img.convert("L") 212 | img = img.point(lambda i : 255 if i > 110 else 0) 213 | img = img.crop((1, 1, 55, 22)) 214 | img = np.asarray(img) 215 | img = resize_pic(img, 57) 216 | size = img.shape 217 | tmp = [] 218 | del_lst = [] 219 | for i in 
range(size[1]): 220 | tmp_lst = img[:, i] 221 | tmp.append(tmp_lst.sum() / size[0]) 222 | if tmp_lst.sum() / (size[0] * 255) >= 0.9: 223 | if len(del_lst) != 0 and i == (del_lst[-1] + 1): 224 | del_lst.remove(i - 1) 225 | del_lst.append(i) 226 | else: 227 | del_lst.append(i) 228 | if len(del_lst) == 5: 229 | break 230 | resultMat = np.zeros((len(del_lst) - 1, dest_vertical_dim * 21)) 231 | for i in range(len(del_lst) - 1): 232 | index = del_lst[i] 233 | width = del_lst[i + 1] - index 234 | sub_img = crop(img, (index, 0, width, size[0])) 235 | sub_img = resize_pic(sub_img, dest_vertical_dim) 236 | tmp_matrix = np.zeros((1, dest_vertical_dim * 21)) 237 | for j in range(sub_img.shape[0]): 238 | for k in range(sub_img.shape[1]): 239 | tmp_matrix[0, j * dest_vertical_dim + k] = sub_img[j, k] 240 | resultMat[i, :] = tmp_matrix 241 | pattern = [] 242 | for item in resultMat: 243 | pattern.append(kNN.classify0(item, dataSet, labels, 10)) 244 | return "".join(pattern) 245 | 246 | if __name__ == "__main__": 247 | dataSet, labels = readDataSet() 248 | # print(kNNidentify(dataSet, labels, default_path + "33.jpg")) 249 | path = default_path + "testData" 250 | error = 0 251 | dirPath = os.listdir(path) 252 | m = len(dirPath) 253 | for name in dirPath: 254 | filePath = path + "\\" + name 255 | pattern = kNNidentify(dataSet, labels, filePath) 256 | name = name.split(".")[0] 257 | print("the classfier came back with {0}, the real answer is {1}".format(pattern, name)) 258 | if pattern != name: 259 | error += 1 260 | print("the total error ratio is {0:.3f}".format(error / m)) 261 | #==================准备数据集用到的代码============================================= 262 | # getToken()#从远端获得验证码 263 | # parseImg()#处理图片,将图片转化二值化,转化成灰度图 264 | # getAlphabet()#将图片转化成字母图片,注意这一步之前要将上一步得到的图片名称改为验证码中的所代表的字符 265 | # analyzeImg()#分析具体的一张图 -------------------------------------------------------------------------------- /KNN/Unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/KNN/Unit/__init__.py -------------------------------------------------------------------------------- /LinearRegression/Lib/LogisticLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | 9 | def sigmod(intX): 10 | return 1.0 / (1 + np.exp(-intX)) 11 | 12 | def gradDescent(dataMatrix, classLabels, maxCycle=10000):#原始梯度下降算法 13 | dataMatrix = np.matrix(dataMatrix, dtype=np.float) 14 | classLabels = np.matrix(classLabels, dtype=np.float).transpose() 15 | alpha = 0.001 16 | m, n = dataMatrix.shape 17 | weight = np.ones((n, 1)) 18 | res = np.zeros((maxCycle, n)) 19 | for i in range(maxCycle): 20 | h = sigmod(dataMatrix * weight) 21 | error = h - classLabels 22 | res[i] = weight.transpose() 23 | weight = weight - alpha * dataMatrix.transpose() * error 24 | return np.matrix(weight, dtype=np.float), res 25 | 26 | def stocGradDescent(dataSetIn, labels, numIter=150):#改进版随机梯度下降算法 27 | dataSetIn = np.matrix(dataSetIn, dtype=np.float) 28 | labels = np.matrix(labels, dtype=np.float).transpose() 29 | m, n = dataSetIn.shape 30 | weight = np.ones((n, 1)) 31 | res = np.ones((numIter * m, n)) 32 | for j in range(numIter): 33 | dataIndex = np.random.randint(0, m, m) 34 | for i in range(m): 35 | alpha = 4 / (1.0 + j + i) + 0.01 36 | h = 
sigmod(dataSetIn[dataIndex[i]] * weight) 37 | error = h - labels[dataIndex[i]] 38 | res[j * m + i] = weight.transpose() 39 | weight = weight - alpha * dataSetIn[dataIndex[i]].transpose() * error 40 | return np.matrix(weight, dtype=np.float), res 41 | 42 | def plotWeightFig(res, ranges): 43 | if len(ranges) > 6: 44 | return None 45 | fig = plt.figure("Test") 46 | x = np.arange(0, res.shape[0]) 47 | for i in range(len(ranges)): 48 | ax = fig.add_subplot(321 + i) 49 | ax.set_ylabel("w%d" % (ranges[i])) 50 | ax.plot(x, res[:, ranges[i]]) 51 | plt.show() 52 | 53 | def classifyVector(intX, weight): 54 | intX = np.matrix(intX, dtype=np.float) 55 | prob = sigmod(float(intX * weight)) 56 | if prob > 0.5: 57 | return 1 58 | else: 59 | return 0 -------------------------------------------------------------------------------- /LinearRegression/Lib/RFLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | import Util.RandomUtil as RandomUtil 8 | 9 | ''' 10 | 计算香农墒 11 | ''' 12 | def calShannonEnt(trainLabel): 13 | m = len(trainLabel) 14 | uniqueVal = set(trainLabel) 15 | countDict = {} 16 | shannonNum = 0.0 17 | for label in trainLabel: 18 | countDict[label] = countDict.get(label, 0) + 1 19 | for label in uniqueVal: 20 | p = (countDict[label] / m) 21 | shannonNum -= p * np.log2(p) 22 | return shannonNum 23 | ''' 24 | 切分数据集 25 | ''' 26 | def splitDataMatrix(dataMatrix, label, axis, value): 27 | returnMat = [] 28 | labelMat = [] 29 | for row, row1 in zip(dataMatrix, label): 30 | if row[axis] == value: 31 | tmp_lst = row[0: axis] 32 | tmp_lst.extend(row[axis + 1:]) 33 | returnMat.append(tmp_lst) 34 | labelMat.append(row1) 35 | return returnMat, labelMat 36 | ''' 37 | 由信息增益最大化计算出需要切分的属性索引值 38 | ''' 39 | def chooseBestFeature(trainSet, label): 40 | tmp = int(np.log2(len(trainSet[0]))) 41 | k = 1 if tmp == 0 else tmp 42 | indexSet = RandomUtil.generateRandom(0, len(trainSet[0]), k) 43 | m = len(trainSet) 44 | maxGain = -1 45 | baseShannonEnt = calShannonEnt(label) 46 | index = -1 47 | for i in indexSet: 48 | uniqueAttr = set([example[i] for example in trainSet]) 49 | tmp_Ent = 0 50 | for attr in uniqueAttr: 51 | subSet, labelMat = splitDataMatrix(trainSet, label, i, attr) 52 | newShannonEnt = calShannonEnt(labelMat) 53 | tmp_Ent += float(len(subSet) / m) * newShannonEnt 54 | gain = baseShannonEnt - tmp_Ent 55 | if gain > maxGain: 56 | maxGain = gain 57 | index = i 58 | return index 59 | ''' 60 | 训练随机森林所需要的弱分类器 61 | ''' 62 | def generateWeakLearner(trainSet, trainLabel): 63 | if trainLabel.count(trainLabel[0]) == len(trainLabel): 64 | return trainLabel[0] 65 | if len(trainSet[0]) == 0: 66 | return "no" if trainLabel.count("no") > trainLabel.count("yes") else "yes" 67 | index = chooseBestFeature(trainSet, trainLabel) 68 | Tree = {index:{}} 69 | uniqueVal = set([elt[index] for elt in trainSet]) 70 | for value in uniqueVal: 71 | subSet, label = splitDataMatrix(trainSet, trainLabel, index, value) 72 | Tree[index][value] = generateWeakLearner(subSet, label) 73 | return Tree 74 | 75 | def generateRandomForest(trainSet, trainLabel, T): 76 | forest = [] 77 | for i in range(T): 78 | model = generateWeakLearner(trainSet, trainLabel) 79 | forest.append(model) 80 | return forest 81 | 82 | def classfyData(data, model): 83 | if type(model) == str: 84 | return model 85 | key = iter(model.keys()).__next__() 86 | value = data[key] 87 | res = model[key].get(value, None) 88 | if res != None: 89 | return 
classfyData(data, res) 90 | else: 91 | tmp_lst = [item for item in model[key].keys()] 92 | return classfyData(data, model[key][np.random.choice(tmp_lst, 1)[0]]) 93 | 94 | def predictByRandomForest(models, data): 95 | tmp_lst = [] 96 | for model in models: 97 | predict_label = classfyData(data, model) 98 | tmp_lst.append(predict_label) 99 | tmp_set = set(tmp_lst) 100 | res_lst = [] 101 | for res in tmp_set: 102 | res_lst.append((tmp_lst.count(res), res)) 103 | res_lst = sorted(res_lst, key=lambda index:index[0], reverse=True) 104 | if len(res_lst) == 1: 105 | return res_lst[0][1] 106 | else: 107 | tmp_res = res_lst[0][0] 108 | return_lst = [res_lst[0][1]] 109 | for i in range(1, len(res_lst)): 110 | if res_lst[i][0] == tmp_res: 111 | return_lst.append(res_lst[i][1]) 112 | if len(return_lst) == 1: 113 | return return_lst[0] 114 | else: 115 | return np.random.choice(return_lst, 1)[0] -------------------------------------------------------------------------------- /LinearRegression/Lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__init__.py -------------------------------------------------------------------------------- /LinearRegression/Lib/__pycache__/LogisticLib.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__pycache__/LogisticLib.cpython-35.pyc -------------------------------------------------------------------------------- /LinearRegression/Lib/__pycache__/LogisticLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__pycache__/LogisticLib.cpython-36.pyc -------------------------------------------------------------------------------- /LinearRegression/Lib/__pycache__/RFLib.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__pycache__/RFLib.cpython-35.pyc -------------------------------------------------------------------------------- /LinearRegression/Lib/__pycache__/RFLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__pycache__/RFLib.cpython-36.pyc -------------------------------------------------------------------------------- /LinearRegression/Lib/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /LinearRegression/Lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Lib/__pycache__/__init__.cpython-36.pyc 
-------------------------------------------------------------------------------- /LinearRegression/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/README.md -------------------------------------------------------------------------------- /LinearRegression/Unit/LRUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月4日 3 | 4 | @author: IL MARE 5 | ''' 6 | import Util.DataUtil as DataUtil 7 | from Lib import LogisticLib as LRLib 8 | import time 9 | 10 | if __name__ == "__main__": 11 | start = time.clock() 12 | # dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional")#正统方法 13 | dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel("bank-addtional-format-lr") 14 | trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(dataSet, labelSet) 15 | weight, logList = LRLib.stocGradDescent(trainSet, trainLabel) 16 | errorCount = 0 17 | for data, label in zip(testSet, testLabel): 18 | predict_label = LRLib.classifyVector(data, weight) 19 | if predict_label != label: 20 | errorCount += 1 21 | ratio = errorCount / len(testLabel) 22 | print("the error ratio is %.3f, the correct ratio is %.3f -- %.3fs" % (ratio, 1 - ratio, time.clock() - start)) 23 | LRLib.plotWeightFig(logList, [i for i in range(0, 6)]) -------------------------------------------------------------------------------- /LinearRegression/Unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Unit/__init__.py -------------------------------------------------------------------------------- /LinearRegression/Util/RandomUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | ''' 8 | 该函数用来在一个集合中随机抽取size个互不相同的随机值 9 | ''' 10 | def generateRandomIndex(a, size): 11 | if len(a) < size: 12 | return None 13 | elif len(a) == size: 14 | return set(a) 15 | returnMat = set() 16 | while True: 17 | returnMat.add(np.random.choice(list(a), 1)[0]) 18 | if len(returnMat) == size: 19 | break 20 | return returnMat 21 | ''' 22 | 在指定范围内产生指定数目的不重复的随机数 23 | ''' 24 | def generateRandom(low, high, size): 25 | returnSet = set() 26 | while True: 27 | returnSet.add(np.random.randint(low, high, 1)[0]) 28 | if len(returnSet) == size: 29 | break 30 | return returnSet -------------------------------------------------------------------------------- /LinearRegression/Util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__init__.py -------------------------------------------------------------------------------- /LinearRegression/Util/__pycache__/DataUtil.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__pycache__/DataUtil.cpython-35.pyc -------------------------------------------------------------------------------- /LinearRegression/Util/__pycache__/DataUtil.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__pycache__/DataUtil.cpython-36.pyc -------------------------------------------------------------------------------- /LinearRegression/Util/__pycache__/RandomUtil.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__pycache__/RandomUtil.cpython-35.pyc -------------------------------------------------------------------------------- /LinearRegression/Util/__pycache__/RandomUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__pycache__/RandomUtil.cpython-36.pyc -------------------------------------------------------------------------------- /LinearRegression/Util/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /LinearRegression/Util/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/LinearRegression/Util/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 机器学习实践与笔记 2 | 3 | 机器学习是时下计算机科学的一个非常火热的分枝,笔者在进入研究生阶段的学习后也渐渐开始了解一些相关的算法。创建这个仓库的目的就是为了把笔者在机器学习领域遇到的问题,学到的知识,编写的代码分享给大家。 4 | 5 | 这个仓库中主要包含`java`和`python`两种类型的代码。包括svm,对数回归,随机森林和决策树等等。目前用java做机器学习研究学习的人不是很多,但是笔者作为一个JavaWeb开发者,一直想在一些web应用上集成机器学习的算法,因此创建了这个目录。java目录下目前还没有什么东西,笔者现在主要在python目录下更新,之后可能会逐渐添加一些java的代码。 6 | 7 | 除此之外,笔者还会发表一些学习笔记。这些笔记可能写的不是那么全面和正规,希望读者见谅,这些笔记主要以jupyter notebook的形式发表。这些笔记主要在`notebooks`目录中收录。 8 | 9 | ## 计算机视觉 10 | 11 | 在这里,可以看到与计算机视觉相关的小demo。比如前段时间的很火的`deepfakes`,笔者根据开源代码复现了核心代码使得程序能够根据一个人的脸生成另外一个人的脸,而这两张脸拥有相同的表情,光照: 12 | 13 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fr-8.jpg) 14 | 15 | 在这里,你也可以看到根据数百张人脸合成一张平均脸的demo: 16 | 17 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/fa-1.jpg) 18 | 19 | ## 自然语言处理 20 | 21 | 在这里,可以看到使用循环神经网络生成的小说与诗词: 22 | 23 | ``` 24 | 程心 心问:“我不是一个标人的一种 及源尘 的一切都没有关系,那一次都是一个世妄的时间,但也能看到,在这些 系观的存在,人们在 28 年前,他们在这里,他们也不知道,这是他们的时间是一个小时的时间,但也没有文段的。” 25 | 26 |   “我的!” 程心想起了一种声冷,“我的!” AA 的话学她说,“我的我也是 一切都有人的东西。这时,他们也没见过来。”关一帆走过程心的话。他说,“那是在我的人都不会,我们不知道那话的,他们不是这十的话。 他们在这个范像是 一种死票的 辨V,在她的 几睛, 她们也不是为了一个标座的。 27 | 28 |   在他的手指上了, “我想起来,她的几睛就是这十。” 29 | 30 |   “我不是我们的时间里,她在那个世界是 一个世界,她的几睛,她就在这个时代的一个人就是 一个世 界的世界。 他的目光中是一个人, 她不知道,我们在这种 话的话,他们的目光,但她的一切都不是 一个人。” 31 | 32 |   “我不知道,我们的生活,那个时代,他也不是一个人的那个世界的,她不是一个人,不管她是我。” 程心灯灯头。” 程心灯灯头,“你的目光中,那是一个 一个 方面的 草穷来时 代,在那一次,我的人都没有看,他们 都不再 是一个人的那种, 33 | ``` 34 | 还可以看到对`word2vec`的实现以及相关的数学推导: 35 | 36 | ``` 37 | Nearst to "three" : four - <0.740>, five - <0.706>, two - <0.706>, zero - <0.703>, six - <0.695>, eight - 
<0.691>, seven - <0.661>, one - <0.633> 38 | Nearst to "would" : can - <0.426>, to - <0.424>, transformation - <0.421>, says - <0.421>, adults - <0.416>, had - <0.398>, instrumental - <0.396>, like - <0.388> 39 | Nearst to "known" : repeatedly - <0.413>, regarded - <0.394>, falls - <0.394>, joint - <0.375>, available - <0.369>, sir - <0.365>, UNK - <0.364>, select - <0.362> 40 | Nearst to "world" : holding - <0.423>, died - <0.415>, holy - <0.394>, first - <0.393>, wayne - <0.391>, sea - <0.390>, mathbf - <0.382>, focus - <0.379> 41 | Nearst to "history" : UNK - <0.424>, identical - <0.419>, state - <0.418>, orbital - <0.415>, next - <0.413>, boston - <0.411>, list - <0.410>, frequent - <0.408> 42 | Nearst to "but" : however - <0.503>, that - <0.462>, ties - <0.462>, sky - <0.456>, exposed - <0.452>, generally - <0.441>, because - <0.430>, landscape - <0.414> 43 | Nearst to "most" : supporters - <0.419>, earned - <0.406>, friendly - <0.405>, anarchist - <0.397>, absence - <0.394>, articles - <0.389>, apparent - <0.386>, difference - <0.385> 44 | Nearst to "states" : co - <0.477>, symbolic - <0.428>, market - <0.412>, by - <0.408>, skills - <0.396>, legs - <0.395>, in - <0.392>, alphabet - <0.382> 45 | Nearst to "only" : respectively - <0.434>, powerful - <0.404>, warming - <0.400>, transform - <0.392>, showing - <0.391>, theory - <0.389>, georgia - <0.383>, adventures - <0.382> 46 | Nearst to "while" : and - <0.411>, enlightenment - <0.409>, started - <0.405>, conduct - <0.403>, convention - <0.396>, academy - <0.390>, lived - <0.390>, novels - <0.384> 47 | Nearst to "other" : additional - <0.418>, fall - <0.415>, recovery - <0.398>, caught - <0.391>, gene - <0.390>, egyptian - <0.386>, different - <0.382>, some - <0.371> 48 | Nearst to "which" : that - <0.490>, this - <0.435>, max - <0.424>, and - <0.410>, images - <0.408>, success - <0.407>, benefit - <0.404>, five - <0.403> 49 | Nearst to "they" : why - <0.401>, assassination - <0.401>, he - <0.400>, occasions - <0.400>, southwest - <0.391>, inherited - <0.383>, democrats - <0.382>, norwegian - <0.378> 50 | Nearst to "as" : detail - <0.459>, constitutional - <0.432>, error - <0.416>, cd - <0.412>, compiler - <0.407>, million - <0.405>, certain - <0.404>, creation - <0.397> 51 | Nearst to "use" : deal - <0.438>, maintain - <0.406>, sphere - <0.392>, office - <0.390>, energy - <0.388>, effects - <0.385>, anarchist - <0.384>, every - <0.382> 52 | Nearst to "has" : had - <0.468>, have - <0.455>, is - <0.439>, representative - <0.415>, was - <0.398>, kind - <0.395>, jordan - <0.385>, speech - <0.378> 53 | ``` 54 | 55 | ## 结语 56 | 57 | 总之,笔者将在该项目中持续更新我关于机器学习的项目和理解。至于更深层次的文章请关注我的[ILMARE的博客](http://www.ilmareblog.com),我会在上面发表一些文章来阐述一些机器学习的细节。 58 | -------------------------------------------------------------------------------- /RNN/Lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Lib/__init__.py -------------------------------------------------------------------------------- /RNN/Lib/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Lib/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /RNN/Lib/__pycache__/model.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Lib/__pycache__/model.cpython-35.pyc -------------------------------------------------------------------------------- /RNN/Lib/model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年5月8日 3 | 4 | @author: IL MARE 5 | ''' 6 | import collections 7 | import numpy as np 8 | from tensorflow.contrib import rnn 9 | import tensorflow as tf 10 | import time 11 | 12 | def pick_top_n(preds, vocab_size, top_n=5): 13 | p = np.squeeze(preds) 14 | p[np.argsort(p)[:-top_n]] = 0 15 | p = p / np.sum(p) 16 | c = np.random.choice(vocab_size, 1, p=p)[0] 17 | return c 18 | 19 | class lstm_model: 20 | def __init__(self, hidden_size, num_layer, 21 | corpus, keep_prob, 22 | embedding_size, lr, max_step, 23 | save_path, sampling=False): 24 | if not sampling: 25 | self._num_seq = corpus.num_seq 26 | self._num_step = corpus.num_step 27 | else: 28 | self._num_seq = 1 29 | self._num_step = 1 30 | self._save_path = save_path 31 | self._lr = lr 32 | self._max_step = max_step 33 | self._embedding_size = embedding_size 34 | self._hidden_size = hidden_size 35 | self._num_layer = num_layer 36 | self._corpus = corpus 37 | self._keep_prob = keep_prob 38 | tf.reset_default_graph() 39 | self.init_inputs() 40 | self.build_lstm() 41 | self.define_loss() 42 | self.define_gradients() 43 | def init_inputs(self): 44 | self._x = tf.placeholder(dtype=tf.int32, 45 | shape=[self._num_seq, self._num_step]) 46 | with tf.device("/cpu:0"): 47 | embedding = tf.get_variable("embedding", 48 | shape=[self._corpus.word_num, 49 | self._embedding_size], dtype=tf.float32) 50 | self._inputs = tf.nn.embedding_lookup(embedding, self._x) 51 | def build_lstm(self): 52 | def build_cell(): 53 | cell = rnn.BasicLSTMCell(self._hidden_size, forget_bias=1.0, state_is_tuple=True) 54 | cell = rnn.DropoutWrapper(cell, output_keep_prob=self._keep_prob) 55 | return cell 56 | mul_cell = rnn.MultiRNNCell([build_cell() for _ in range(self._num_layer)], 57 | state_is_tuple=True) 58 | self._init_state = mul_cell.zero_state(self._num_seq, dtype=tf.float32) 59 | outputs, self._final_state = tf.nn.dynamic_rnn(mul_cell, self._inputs, 60 | initial_state=self._init_state) 61 | outputs = tf.reshape(outputs, [-1, self._hidden_size]) 62 | W = tf.Variable(tf.truncated_normal([self._hidden_size, self._corpus.word_num], 63 | stddev=0.1, dtype=tf.float32)) 64 | bais = tf.Variable(tf.zeros([1, self._corpus.word_num], 65 | dtype=tf.float32), dtype=tf.float32) 66 | self._prediction = tf.nn.softmax(tf.matmul(outputs, W) + bais) 67 | def define_loss(self): 68 | self._y = tf.placeholder(dtype=tf.int32, shape=[self._num_seq, self._num_step]) 69 | y_one_hot = tf.reshape(tf.one_hot(self._y, self._corpus.word_num), self._prediction.shape) 70 | self._loss = -tf.reduce_mean(tf.reduce_sum(y_one_hot * tf.log(self._prediction), 71 | reduction_indices=[1])) 72 | def define_gradients(self): 73 | vars = tf.trainable_variables() 74 | grads, _ = tf.clip_by_global_norm(tf.gradients(self._loss, vars), 3) 75 | optimizer = tf.train.AdamOptimizer(self._lr) 76 | self._optimizer = optimizer.apply_gradients(zip(grads, vars)) 77 | def train(self): 78 | with tf.Session() as sess: 79 | return_mat = [] 80 | sess.run(tf.global_variables_initializer()) 81 | state = sess.run(self._init_state) 82 | step = 0 83 | start = time.clock() 84 | for x, y in 
self._corpus.generate_batch(): 85 | feed = {self._x: x, 86 | self._y: y, 87 | self._init_state:state} 88 | loss, _, state = sess.run([self._loss, self._optimizer, 89 | self._final_state], feed_dict = feed) 90 | return_mat.append(loss) 91 | step += 1 92 | if step % 10 == 0: 93 | end = time.clock() 94 | interval = end - start 95 | yield return_mat 96 | print("迭代次数:{0:d}/{2:d},当前损失:{1:.3f},迭代速度:{3:.3f} 秒/十次,约需要{4:.3f}秒" 97 | .format(step, loss, self._max_step, interval, ((self._max_step - step) / 10) * interval)) 98 | start = time.clock() 99 | if step == self._max_step: 100 | break 101 | tf.train.Saver().save(sess, "{0}model".format(self._save_path), global_step=step) 102 | def load_model(self): 103 | sess = tf.Session() 104 | tf.train.Saver().restore(sess, tf.train.latest_checkpoint(self._save_path)) 105 | self._sess = sess 106 | def sampling(self, init_str, max_sample=30): 107 | sample = [c for c in init_str] 108 | pre = np.ones((self._corpus.word_num, )) 109 | state = self._sess.run(self._init_state) 110 | for c in sample: 111 | feed = {self._x: np.reshape(c, [1, 1]), 112 | self._init_state: state} 113 | pre, state = self._sess.run([self._prediction, self._final_state], 114 | feed_dict=feed) 115 | c = pick_top_n(pre, self._corpus.word_num) 116 | sample.append(c) 117 | for count in range(max_sample): 118 | x = np.zeros([1, 1]) 119 | x[0][0] = c 120 | feed = {self._x: x, 121 | self._init_state: state} 122 | pre, state = self._sess.run([self._prediction, self._final_state], 123 | feed_dict=feed) 124 | c = pick_top_n(pre, self._corpus.word_num) 125 | sample.append(c) 126 | return sample 127 | 128 | class corpus: 129 | ''' 130 | 该对象用于构造语料库,参数解释: 131 | file_path:语料库所在位置 132 | num_seq:一个batch中所包含的句子数 133 | num_step:一个句子中包含的词的数目 134 | max_size:统计语料库中出现频度前max_size的字或词 135 | ''' 136 | def __init__(self, file_path, num_seq=10, num_step=10, max_size=3500): 137 | self._file_path = file_path 138 | with open(self._file_path, "r", encoding="utf_8") as fp: 139 | self._buffer = fp.read() 140 | self._count = collections.Counter(self._buffer).most_common(max_size) 141 | self._word_to_int = dict() 142 | for word, _ in self._count: 143 | self._word_to_int[word] = len(self._word_to_int) 144 | self._int_to_word = dict(zip(self._word_to_int.values(), self._word_to_int.keys())) 145 | self._batch_size = num_seq * num_step 146 | num_batch = len(self._buffer) // self._batch_size 147 | self._buffer = self._buffer[: num_batch * self._batch_size] 148 | self._num_seq = num_seq 149 | self._num_step = num_step 150 | @property 151 | def num_seq(self): 152 | return self._num_seq 153 | @property 154 | def num_step(self): 155 | return self._num_step 156 | @property 157 | def file_path(self): 158 | return self._file_path 159 | @property 160 | def word_num(self): 161 | return len(self._int_to_word) + 1 162 | @property 163 | def batch_size(self): 164 | return self._batch_size 165 | @property 166 | def words(self): 167 | return self._buffer 168 | def sentence_to_int(self, sentence): 169 | return_mat = [] 170 | for word in sentence: 171 | return_mat.append(self.word_to_int(word)) 172 | return np.array(return_mat) 173 | def int_to_sentence(self, row): 174 | return_mat = [] 175 | for index in row: 176 | return_mat.append(self.int_to_word(index)) 177 | return "".join(return_mat) 178 | def word_to_int(self, word): 179 | return self._word_to_int.get(word, len(self._int_to_word)) 180 | def int_to_word(self, index): 181 | return self._int_to_word.get(index, "") 182 | def text_to_attr(self): 183 | return_mat = [] 184 | for _word in 
self._buffer: 185 | return_mat.append(self.word_to_int(_word)) 186 | return np.array(return_mat) 187 | def attr_to_text(self, attr): 188 | return_mat = [] 189 | for _attr in attr: 190 | return_mat.append(self.int_to_word(_attr)) 191 | return return_mat 192 | def generate_batch(self): 193 | attrs = self.text_to_attr() 194 | attrs = np.reshape(attrs, [self.num_seq, -1]) 195 | while True: 196 | np.random.shuffle(attrs) 197 | for index in range(0, attrs.shape[1], self.num_step): 198 | x = attrs[:, index: index + self.num_step] 199 | y = np.zeros_like(x) 200 | y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0] 201 | yield x, y 202 | 203 | if __name__ == "__main__": 204 | _x = tf.placeholder(dtype=tf.int32, 205 | shape=[100, 50]) 206 | embedding = tf.get_variable("embedding", 207 | shape=[5000, 128], dtype=tf.float32) 208 | _inputs = tf.nn.embedding_lookup(embedding, _x) 209 | print(_inputs.shape) 210 | -------------------------------------------------------------------------------- /RNN/README.md: -------------------------------------------------------------------------------- 1 | # 循环神经网络与`word2vec` 2 | 3 | 循环神经网络是一类非常特殊的神经网络,通常的神经网络并不会考虑数据与数据之间的耦合联系而是把它们当成独立同分布的样本。但事实是很多数据都是存在相互耦合的。比如在NLP(自然语言处理)上,一句话的词语和词语之间是存在很强的时间上的耦合关系的。为了处理这种耦合关系循环神经网络在原有神经网络的基础上进行了改进,让神经元的状态能够在时间序列上传递。当然,这样的传递是存在衰减的,通常并不能传递太远的距离,这也是传统的循环神经网络的一大问题。 4 | 5 | ![](https://github.com/yhswjtuILMARE/Machine-Learning-Study-Notes/blob/master/pics/rnn.png) 6 | 7 | 上图是最基础的循环神经网络的神经元示例,其中仅仅包含一个双曲正切作为激活函数。如果想要了解更多请移步我的博客[ILMARE的博客](https://www.ilmareblog.com),参看其中的文章`循环神经网络(RNN)浅析`。 8 | 9 | `word2vec`是另一类自然语言处理问题。通常的有监督机器学习问题都会将数据转化成多维向量和标记的形式,这些多维向量用于表述数据的种种特征,这样的方式在自然语言处理上遇到了问题,因为组成自然语言的最小单位——词组很难用特征去表示,如果使用`one-hot`编码来向量化词组就会不可避免地造成向量过于庞大,而且完全无法体现出自然语言单词与单词之间的关系。基于以上的考虑谷歌公司提出了`word2vec`技术,这是一种基于概率学的词组编码技术,非常复杂。具体的推导以及证明请移步我的博客[ILMARE的博客](https://www.ilmareblog.com),请参阅文章`对Word2Vec的一点理解`。 10 | 11 | 这个仓库主要包含我对循环神经网络和`word2vec`的一些实践:我基于RNN制作了一个可以自动生成文本的程序,这个Demo的灵感来源于另一个开源项目[Char-RNN-TensorFlow](https://github.com/hzy46/Char-RNN-TensorFlow);还有就是实现了`word2vec`; 12 | 13 | ## 循环神经网络 14 | 15 | 这个小Demo是用于生成文本的,通俗地说就是让AI来写文章。这个Demo的基本思想是这样的:首先将语料库中的词或字进行提取,对其中最频繁出现的前N个进行编号。在进行训练时我们以句子作为训练的基本单位,首先假设M个单词或字构成一个句子然后将这些词按照其在句子中出现的先后顺序随机编码成向量(之所以是随机的是因为在这里我们仅仅是想学习词或字出现的先后规律而并不关心这个词或字本身的意义)后输入到神经网络中进行训练,训练的目标或是预测的label就是这个句子出现的下一个字。 16 | 17 | 模型训练好之后仅仅需要提供一个字,词或者句子作为种子(甚至不需要提供)就可以让网络生成文本,这样做的原理就是基于我们输入的文本预测下一个即将出现的文本,然后再根据预测出的文本进行下一次预测,直到某一个终止条件(例如我们给出的最大生成字或词的数目)停止。 18 | 19 | 在这里,我使用刘慈欣的三体小说作为语料库,进行了一些迭代。由于时间关系并没有训练太久,但是可以观察到模型的的确确是在收敛中,最终在训练了5000个epoch后终止。我们使用`程心`作为种子看看那能生成什么样的文字: 20 | 21 | ``` 22 | 程心 心问:“我不是一个标人的一种 及源尘 的一切都没有关系,那一次都是一个世妄的时间,但也能看到,在这些 系观的存在,人们在 28 年前,他们在这里,他们也不知道,这是他们的时间是一个小时的时间,但也没有文段的。” 23 | 24 |   “我的!” 程心想起了一种声冷,“我的!” AA 的话学她说,“我的我也是 一切都有人的东西。这时,他们也没见过来。”关一帆走过程心的话。他说,“那是在我的人都不会,我们不知道那话的,他们不是这十的话。 他们在这个范像是 一种死票的 辨V,在她的 几睛, 她们也不是为了一个标座的。 25 | 26 |   在他的手指上了, “我想起来,她的几睛就是这十。” 27 | 28 |   “我不是我们的时间里,她在那个世界是 一个世界,她的几睛,她就在这个时代的一个人就是 一个世 界的世界。 他的目光中是一个人, 她不知道,我们在这种 话的话,他们的目光,但她的一切都不是 一个人。” 29 | 30 |   “我不知道,我们的生活,那个时代,他也不是一个人的那个世界的,她不是一个人,不管她是我。” 程心灯灯头。” 程心灯灯头,“你的目光中,那是一个 一个 方面的 草穷来时 代,在那一次,我的人都没有看,他们 都不再 是一个人的那种, 31 | ``` 32 | 可以看到:程序生成的文本还是十分生硬和不通顺,这可能是由于训练不足导致的。不过有意思的是:以“程心”作为种子生成的文本中几乎都出现了“关一帆”的名字,这也说明了在小说中程心和关一帆的联系真的很紧密,以至于网络都将他们视为了拥有很强耦合关系的部分。 33 | 34 | ## `word2vec` 35 | 36 | 这一部分有很强的数学基础,很不容易进行表述,如果想要了解更多请访问我的博客,地址在上面给出了。不过本例实现的`word2vec`原理用最简明的语句进行表述就是:假如有一个句子`I want to play basketball`,我们对其中的单词进行编码时要考虑它和上下文的关系,比如``,``都是对`want`进行编码时需要考虑的。在这样的设定下,如果有另一个语义用法和`want`相近的词,那么其上下文也必然和`want`相似,这样那个词获得的编码向量和`want`的向量就会很接近。 37 | 38 | 
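To make this lookup concrete, here is a minimal sketch (the helper function and the loading step are illustrative assumptions, not code taken from this repository) of how the nearest words can be read off the embedding matrix that `Word2VecUnit.py` saves with `shelve`: L2-normalize the word vectors, take the dot product of the query word's vector with every other vector, and keep the top-k highest scores.

```
# Hypothetical helper: rank the words closest to `query` by the dot product
# of L2-normalized embedding vectors (i.e. cosine similarity).
import numpy as np

def nearest_words(embeddings, dic, reverse_dic, query, top_k=8):
    # embeddings: (vocab_size, embedding_size) matrix of learned word vectors
    norm = np.sqrt(np.sum(np.square(embeddings), axis=1, keepdims=True))
    normalized = embeddings / norm
    vec = normalized[dic[query]]            # vector of the query word
    sim = normalized @ vec                  # similarity of every word to the query
    order = np.argsort(-sim)[1: top_k + 1]  # index 0 is the query word itself
    return [(reverse_dic[i], float(sim[i])) for i in order]

# Usage sketch, assuming the shelve file written at the end of Word2VecUnit.py:
# import shelve
# with shelve.open("parameter") as fp:
#     print(nearest_words(fp["word2vec"], fp["dic"], fp["reverse_dic"], "three"))
```

Since the stored `word2vec` matrix is already normalized before it is saved, the normalization step above is redundant there, but it keeps the helper usable on raw embeddings as well.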
以下是实验得出的结果,我随机选取了16个单词并打印出了在编码上和它们最接近的8个词,`<>`中即是词向量的欧氏距离: 39 | 40 | ``` 41 | Nearst to "three" : four - <0.740>, five - <0.706>, two - <0.706>, zero - <0.703>, six - <0.695>, eight - <0.691>, seven - <0.661>, one - <0.633> 42 | Nearst to "would" : can - <0.426>, to - <0.424>, transformation - <0.421>, says - <0.421>, adults - <0.416>, had - <0.398>, instrumental - <0.396>, like - <0.388> 43 | Nearst to "known" : repeatedly - <0.413>, regarded - <0.394>, falls - <0.394>, joint - <0.375>, available - <0.369>, sir - <0.365>, UNK - <0.364>, select - <0.362> 44 | Nearst to "world" : holding - <0.423>, died - <0.415>, holy - <0.394>, first - <0.393>, wayne - <0.391>, sea - <0.390>, mathbf - <0.382>, focus - <0.379> 45 | Nearst to "history" : UNK - <0.424>, identical - <0.419>, state - <0.418>, orbital - <0.415>, next - <0.413>, boston - <0.411>, list - <0.410>, frequent - <0.408> 46 | Nearst to "but" : however - <0.503>, that - <0.462>, ties - <0.462>, sky - <0.456>, exposed - <0.452>, generally - <0.441>, because - <0.430>, landscape - <0.414> 47 | Nearst to "most" : supporters - <0.419>, earned - <0.406>, friendly - <0.405>, anarchist - <0.397>, absence - <0.394>, articles - <0.389>, apparent - <0.386>, difference - <0.385> 48 | Nearst to "states" : co - <0.477>, symbolic - <0.428>, market - <0.412>, by - <0.408>, skills - <0.396>, legs - <0.395>, in - <0.392>, alphabet - <0.382> 49 | Nearst to "only" : respectively - <0.434>, powerful - <0.404>, warming - <0.400>, transform - <0.392>, showing - <0.391>, theory - <0.389>, georgia - <0.383>, adventures - <0.382> 50 | Nearst to "while" : and - <0.411>, enlightenment - <0.409>, started - <0.405>, conduct - <0.403>, convention - <0.396>, academy - <0.390>, lived - <0.390>, novels - <0.384> 51 | Nearst to "other" : additional - <0.418>, fall - <0.415>, recovery - <0.398>, caught - <0.391>, gene - <0.390>, egyptian - <0.386>, different - <0.382>, some - <0.371> 52 | Nearst to "which" : that - <0.490>, this - <0.435>, max - <0.424>, and - <0.410>, images - <0.408>, success - <0.407>, benefit - <0.404>, five - <0.403> 53 | Nearst to "they" : why - <0.401>, assassination - <0.401>, he - <0.400>, occasions - <0.400>, southwest - <0.391>, inherited - <0.383>, democrats - <0.382>, norwegian - <0.378> 54 | Nearst to "as" : detail - <0.459>, constitutional - <0.432>, error - <0.416>, cd - <0.412>, compiler - <0.407>, million - <0.405>, certain - <0.404>, creation - <0.397> 55 | Nearst to "use" : deal - <0.438>, maintain - <0.406>, sphere - <0.392>, office - <0.390>, energy - <0.388>, effects - <0.385>, anarchist - <0.384>, every - <0.382> 56 | Nearst to "has" : had - <0.468>, have - <0.455>, is - <0.439>, representative - <0.415>, was - <0.398>, kind - <0.395>, jordan - <0.385>, speech - <0.378> 57 | ``` 58 | 结果中`<>`内部的数字是相似程度。观察结果的第一条: 59 | 60 | ``` 61 | Nearst to "three" : four - <0.740>, five - <0.706>, two - <0.706>, zero - <0.703>, six - <0.695>, eight - <0.691>, seven - <0.661>, one - <0.633> 62 | ``` 63 | 可以看出,和“three”语义相近的词汇均是英文中的数字词汇,且相似度都在60%以上说明这个模型的效果还是比较不错的。 -------------------------------------------------------------------------------- /RNN/Units/RNNUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年4月25日 3 | 4 | @author: IL MARE 5 | ''' 6 | import os 7 | import sys 8 | sys.path.append(os.getcwd()) 9 | from Lib.model import lstm_model 10 | from Lib.model import corpus 11 | import shelve 12 | from matplotlib import pyplot as plt 13 | import matplotlib as mpl 14 | import numpy 
as np 15 | 16 | path = r"F:/tensorflow/rnn/data/novel.txt" 17 | save_path = r"F:/tensorflow/rnn/model/" 18 | 19 | def train_model(): 20 | obj = corpus(path, 50, 50, 3000) 21 | with shelve.open("{0}corpus".format(save_path)) as fp: 22 | fp["obj"] = obj 23 | model = lstm_model(hidden_size=128, num_layer=2, 24 | corpus=obj, keep_prob=1.0, 25 | embedding_size=128, max_step=5000, 26 | lr=0.005, save_path=save_path) 27 | result = [] 28 | fig = plt.figure("cross-entropy") 29 | mpl.rcParams['xtick.labelsize'] = 8 30 | mpl.rcParams['ytick.labelsize'] = 8 31 | ax = fig.add_subplot(111) 32 | # ax.grid(True) 33 | for return_mat in model.train(): 34 | result.extend(return_mat) 35 | # x = np.arange((len(return_mat))) 36 | # y = np.array(return_mat) 37 | # ax.plot(x, y, linewidth=0.8, color="b") 38 | # plt.pause(0.1) 39 | x = np.arange((len(return_mat))) 40 | y = np.array(return_mat) 41 | ax.plot(x, y, linewidth=0.8, color="b") 42 | plt.show() 43 | 44 | def test_model(init_str, max_sample=200): 45 | obj = None 46 | assert os.path.exists("{0}corpus.bak".format(save_path)), "找不到文件" 47 | with shelve.open("{0}corpus".format(save_path)) as fp: 48 | obj = fp["obj"] 49 | model = lstm_model(hidden_size=128, num_layer=2, 50 | corpus=obj, keep_prob=1.0, 51 | embedding_size=128, max_step=5000, 52 | lr=0.005, save_path=save_path, sampling=True) 53 | model.load_model() 54 | sample = model.sampling(obj.sentence_to_int(init_str), max_sample) 55 | print(obj.int_to_sentence(sample)) 56 | 57 | if __name__ == "__main__": 58 | # train_model() 59 | test_model("程心", max_sample=500) -------------------------------------------------------------------------------- /RNN/Units/Word2VecUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年4月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | from urllib.request import urlretrieve 7 | import os 8 | import zipfile 9 | import tensorflow as tf 10 | import collections 11 | import numpy as np 12 | import random 13 | import math 14 | import shelve 15 | 16 | url = "http://mattmahoney.net/dc/" 17 | vocabularySize = 5000 18 | 19 | def download_file(fileName, expectedSize): 20 | if not os.path.exists(fileName): 21 | try: 22 | urlretrieve(url + fileName, fileName) 23 | except Exception as e: 24 | return 25 | statinfo = os.stat(fileName) 26 | if statinfo.st_size == expectedSize: 27 | print("Found and Verified", fileName) 28 | else: 29 | print(statinfo.st_size) 30 | raise Exception("Failed to verify " + fileName) 31 | return fileName 32 | 33 | def read_data(fileName): 34 | with zipfile.ZipFile(fileName) as fp: 35 | data = tf.compat.as_str(fp.read(fp.namelist()[0])).split() 36 | return data 37 | 38 | def build_dataSet(words): 39 | ''' 40 | 该函数返回四个值,第一个值data用来表示该篇文章中所有词出现的频度, 41 | 所一个词没有排在最常出现的前5000名则该位上置0,否则 42 | 置这个词出现频度的排位,排位越靠前说明出现的频度越大。 43 | count用来表示出现频度前5000名的词的出现次数。 44 | dic表示出现频度最高的前5000个词的排序,排序序号越小则出现频度越高, 45 | 以单词为索引reverse_dic是dic的键值倒置字典,以出现频度为索引 46 | ''' 47 | count = [["UNK", -1]] 48 | count.extend(collections.Counter(words).most_common(vocabularySize - 1)) 49 | dic = dict() 50 | for word, num in count: 51 | dic[word] = len(dic) 52 | data = [] 53 | unk_count = 0 54 | for word in words: 55 | index = dic.get(word, 0) 56 | unk_count += 1 if index == 0 else 0 57 | data.append(index) 58 | count[0][1] = unk_count 59 | reverse_dic = dict(zip(dic.values(), dic.keys())) 60 | return data, count, dic, reverse_dic 61 | 62 | dataIndex = 0 63 | def generate_batch(batchSize, skipWindow, numSkip, data): 64 | global dataIndex 65 | assert not 
batchSize % numSkip, "样本规模大小必须为numSkip的整数倍" 66 | assert numSkip <= skipWindow * 2, "numSkip的大小必须不大于skipWindow的两倍" 67 | batch = np.ndarray(shape=(batchSize), dtype=np.int32) 68 | labels = np.ndarray(shape=(batchSize, 1), dtype=np.int32) 69 | span = 2 * skipWindow + 1 70 | buffer = collections.deque(maxlen=span) 71 | for _ in range(span): 72 | buffer.append(data[dataIndex]) 73 | dataIndex = (dataIndex + 1) % len(data) 74 | for i in range(batchSize // numSkip): 75 | target = skipWindow 76 | target_to_void = [target] 77 | for j in range(numSkip): 78 | while target in target_to_void: 79 | target = random.randint(0, span - 1) 80 | target_to_void.append(target) 81 | batch[i * numSkip + j] = buffer[skipWindow] 82 | labels[i * numSkip + j, 0] = buffer[target] 83 | buffer.append(data[dataIndex]) 84 | dataIndex = (dataIndex + 1) % len(data) 85 | return batch, labels 86 | 87 | batchSize = 128#选取的样本规模大小为128 88 | embeddingSize = 128#生成稠密向量的维度为128 89 | skipWindow = 2#单词关联程度为1 90 | numSkip = 4#与目标单词关联的单词数 91 | 92 | vaildSize = 16#用来测试的单词规模 93 | vaildWindow = 100#抽取前100个出现频率最高的词汇 94 | vaildExamples = np.random.choice(vaildWindow, vaildSize, replace=False)#随机抽取vaildSize个单词索引 95 | numSampled = 64#噪声词汇的数目 96 | 97 | if __name__ == "__main__": 98 | fileName = r"F:\tensorflow\rnn\data\w2v\text8.zip" 99 | download_file(fileName, 31344016) 100 | words = read_data(fileName) 101 | data, count, dic, reverse_dic = build_dataSet(words) 102 | print("Most Common Words (+UNK): ", count[:5]) 103 | print("Sample data: ", data[: 10], [reverse_dic[i] for i in data[: 10]]) 104 | del words 105 | graph = tf.Graph() 106 | with graph.as_default(): 107 | trainInputs = tf.placeholder(tf.int32, [batchSize]) 108 | trainLabels = tf.placeholder(tf.int32, [batchSize, 1]) 109 | vaildDataSet = tf.constant(vaildExamples, tf.int32) 110 | with tf.device("/cpu:0"): 111 | embeddings = tf.Variable( 112 | tf.random_uniform([vocabularySize, embeddingSize], -1.0, 1.0)) 113 | embed = tf.nn.embedding_lookup(embeddings, trainInputs) 114 | nceWeight = tf.Variable(tf.truncated_normal 115 | ([vocabularySize, embeddingSize], stddev=1.0 / math.sqrt(embeddingSize))) 116 | nceBiases = tf.Variable(tf.zeros([vocabularySize])) 117 | nceLoss = tf.reduce_mean(tf.nn.nce_loss(nceWeight, 118 | nceBiases, 119 | trainLabels, 120 | embed, 121 | numSampled, 122 | vocabularySize)) 123 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(nceLoss) 124 | normal = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True)) 125 | normalized_embeddings = embeddings / normal 126 | vaild_embeddings = tf.nn.embedding_lookup(normalized_embeddings, vaildDataSet) 127 | similarity = tf.matmul(vaild_embeddings, tf.transpose(normalized_embeddings)) 128 | init = tf.global_variables_initializer() 129 | num_step = 100000 130 | with tf.Session(graph=graph) as sess: 131 | init.run() 132 | print("initialized...") 133 | average_loss = 0 134 | for step in range(num_step): 135 | batch, labels = generate_batch(batchSize, skipWindow, numSkip, data) 136 | feed_dict = {trainInputs: batch, trainLabels: labels} 137 | _, loss_val = sess.run([optimizer, nceLoss], feed_dict=feed_dict) 138 | average_loss += loss_val 139 | if step % 2000 == 0: 140 | average_loss /= 2000.0 141 | print("Average loss at step ", step, " is " , average_loss) 142 | average_loss = 0 143 | if step % 10000 == 0: 144 | sim = similarity.eval() 145 | for i in range(vaildSize): 146 | vaildWord = reverse_dic[vaildExamples[i]] 147 | top_k = 8 148 | nearst = (-sim[i, :]).argsort()[1: top_k + 1] 149 | log_str = "Nearst to %s: 
" % (vaildWord) 150 | for j in range(top_k): 151 | close_word = reverse_dic[nearst[j]] 152 | possible = -sim[i, :][nearst[j]] 153 | log_str = "%s %s - <%.3f>," % (log_str, close_word, -possible) 154 | print(log_str) 155 | final_embeddings = normalized_embeddings.eval() 156 | print(final_embeddings, final_embeddings.shape) 157 | sim = similarity.eval() 158 | for i in range(vaildSize): 159 | row = -sim[i, :] 160 | log_str = "Nearst to %s :" % (reverse_dic[vaildExamples[i]]) 161 | for index in row.argsort()[1: 9]: 162 | possible = row[index] 163 | str = reverse_dic[index] 164 | log_str = "%s %s - <%.3f>," % (log_str, str, -possible) 165 | print(log_str) 166 | with shelve.open("parameter") as fp: 167 | fp["word2vec"] = final_embeddings 168 | fp["reverse_dic"] = reverse_dic 169 | fp["dic"] = dic -------------------------------------------------------------------------------- /RNN/Units/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Units/__init__.py -------------------------------------------------------------------------------- /RNN/Units/__pycache__/Word2VecUnit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Units/__pycache__/Word2VecUnit.cpython-35.pyc -------------------------------------------------------------------------------- /RNN/Units/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Units/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /RNN/Units/__pycache__/handWrittenUnit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Units/__pycache__/handWrittenUnit.cpython-35.pyc -------------------------------------------------------------------------------- /RNN/Units/__pycache__/test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RNN/Units/__pycache__/test.cpython-35.pyc -------------------------------------------------------------------------------- /RNN/Units/handWrittenUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年5月5日 3 | 4 | @author: IL MARE 5 | ''' 6 | from tensorflow.examples.tutorials.mnist import input_data 7 | import tensorflow as tf 8 | from tensorflow.contrib import rnn 9 | import numpy as np 10 | 11 | 12 | hidden_size = 64 13 | class_num = 10 14 | time_step = 28 15 | layer_num = 3 16 | batch_size = 16 17 | keep_prob = tf.placeholder(tf.float32, name="keep_prob") 18 | 19 | 20 | def func(): 21 | lstm_cell = rnn.BasicLSTMCell(num_units=hidden_size, forget_bias=1.0, state_is_tuple=True) 22 | lstm_cell = rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob) 23 | return lstm_cell 24 | 25 | if __name__ == "__main__": 26 | mnist = input_data.read_data_sets(r"F:\tensorflow\MNIST_DATA", one_hot=True) 27 | mul_cell = rnn.MultiRNNCell([func() for _ 
in range(layer_num)], state_is_tuple=True) 28 | sess = tf.InteractiveSession() 29 | _x = tf.placeholder(dtype=tf.float32, shape=[batch_size, 784], name="x") 30 | x = tf.nn.dropout(tf.reshape(_x, [-1, 28, 28]), keep_prob=keep_prob) 31 | y = tf.placeholder(dtype=tf.float32, shape=[batch_size, 10], name="y") 32 | state = init_state = mul_cell.zero_state(batch_size, dtype=tf.float32) 33 | # outputs, state = tf.nn.dynamic_rnn(mul_cell, x, initial_state=state, dtype=tf.float32) 34 | outputs = [] 35 | for step in range(28): 36 | output, state = mul_cell(x[:, step, :], state) 37 | outputs.append(output) 38 | h_state = outputs[-1] 39 | W = tf.Variable(tf.random_uniform(shape=[hidden_size, class_num], dtype=tf.float32), dtype=tf.float32) 40 | b = tf.Variable(tf.zeros(shape=[1, 10]), dtype=tf.float32) 41 | y_pre = tf.nn.softmax(tf.matmul(h_state, W) + b, name="predict") 42 | cross_entry = - y * tf.log(y_pre) 43 | loss = tf.reduce_mean(tf.reduce_sum(cross_entry, reduction_indices=[1])) 44 | 45 | train_step = tf.train.AdamOptimizer(0.001).minimize(loss) 46 | sess.run(tf.global_variables_initializer()) 47 | 48 | accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_pre, 1), tf.argmax(y, 1)), dtype=tf.float32)) 49 | for i in range(500): 50 | batch_x, batch_y = mnist.train.next_batch(batch_size) 51 | sess.run(train_step, feed_dict={_x: batch_x, y: batch_y, keep_prob: 1.0}) 52 | if i % 200 == 0: 53 | print(accuracy.eval(feed_dict={_x: batch_x, y: batch_y, keep_prob: 1.0})) 54 | # saver = tf.train.Saver() 55 | # saver.save(sess, r"G:/Machine-Learning-Study-Notes/python/RNN/model.ckpt") 56 | # graph_def = tf.get_default_graph().as_graph_def() 57 | # output_graph_def = graph_util.convert_variables_to_constants(sess, graph_def, ["predict", "x", "keep_prob"]) 58 | # with tf.gfile.GFile(r"G:/Machine-Learning-Study-Notes/python/RNN/model.pb", "wb") as fp: 59 | # fp.write(output_graph_def.SerializeToString()) -------------------------------------------------------------------------------- /RNN/Units/handWrittenUnit_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年5月6日 3 | 4 | @author: IL MARE 5 | ''' 6 | from tensorflow.python.framework import graph_util 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | import tensorflow as tf 9 | from tensorflow.contrib import rnn 10 | 11 | hidden_size = 64 12 | class_num = 10 13 | time_step = 28 14 | layer_num = 3 15 | batch_size = 16 16 | keep_prob = tf.placeholder(tf.float32, name="keep_prob") 17 | 18 | def func(): 19 | lstm_cell = rnn.BasicLSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True) 20 | lstm_cell = rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob) 21 | return lstm_cell 22 | 23 | if __name__ == "__main__": 24 | path = r"G:/Machine-Learning-Study-Notes/python/RNN/modelFile/model.ckpt" 25 | mul_cell = rnn.MultiRNNCell([func() for _ in range(layer_num)], state_is_tuple=True) 26 | x = tf.placeholder(dtype=tf.float32, shape=[1, 784]) 27 | input = tf.reshape(x, [-1, 28, 28]) 28 | y = tf.placeholder(dtype=tf.float32, shape=[1, class_num]) 29 | state = mul_cell.zero_state(1, dtype=tf.float32) 30 | outputs = [] 31 | for i in range(28): 32 | output, state = mul_cell(input[:, i, :], state) 33 | outputs.append(output) 34 | h_state = outputs[-1] 35 | W = tf.Variable(tf.random_uniform([hidden_size, class_num] 36 | , -1, 1, dtype=tf.float32), 37 | dtype=tf.float32) 38 | bais = tf.Variable(tf.zeros([1, class_num], dtype=tf.float32), dtype=tf.float32) 39 | y_pre = 
tf.nn.softmax(tf.matmul(h_state, W) + bais) 40 | sess = tf.InteractiveSession() 41 | tf.train.Saver().restore(sess, path) 42 | mnist = input_data.read_data_sets("/MNIST_data", one_hot=True) 43 | count = mnist.test.images.shape[0] 44 | print(count) 45 | res = 0 46 | for i in range(count): 47 | batch_x, batch_y = mnist.test.images[i: i + 1], mnist.test.labels[i: i + 1] 48 | pre = sess.run(y_pre, feed_dict={x: batch_x, keep_prob: 1.0}) 49 | accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(pre, 1), tf.argmax(batch_y, 1)), tf.float32)) 50 | res += accuracy.eval() 51 | if i % 500 == 0: 52 | print(res / (i + 1)) 53 | print(res / count) 54 | # path = r"G:/Machine-Learning-Study-Notes/python/RNN/model.pb" 55 | # with tf.gfile.GFile(path, "rb") as fp: 56 | # graph_def = tf.GraphDef() 57 | # graph_def.ParseFromString(fp.read()) 58 | # tf.import_graph_def(graph_def) 59 | # sess = tf.InteractiveSession() 60 | # y_pre = sess.graph.get_tensor_by_name("import/predict:0") 61 | # _x = sess.graph.get_tensor_by_name("import/x:0") 62 | # keep_prob = sess.graph.get_tensor_by_name("import/keep_prob:0") 63 | # mnist = input_data.read_data_sets("/MNIST_data", one_hot=True) 64 | # count = mnist.test.images.shape[0] // 16 65 | # res = 0 66 | # for i in range(count): 67 | # batch_x, batch_y = mnist.test.images[i * 16: i * 16 + 16], mnist.test.labels[i * 16: i * 16 + 16] 68 | # pre = sess.run(y_pre, feed_dict={_x: batch_x, keep_prob: 1.0}) 69 | # accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(pre, 1), tf.argmax(batch_y, 1)), tf.float32)) 70 | # res += accuracy.eval() 71 | # print(res / count) -------------------------------------------------------------------------------- /RNN/Units/tempTest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年5月20日 3 | 4 | @author: IL MARE 5 | ''' 6 | n_queen = 8 7 | result_mat = [] 8 | 9 | def backTrace(column, vaildSet, tagIndex, tag): 10 | global result_mat 11 | for index, col in enumerate(column): 12 | if col != -1: 13 | if index != tagIndex and abs(tag - col) == abs(tagIndex - index): 14 | return False 15 | if len(vaildSet) == 0: 16 | result_mat.append(column.copy()) 17 | return True 18 | for vaild in vaildSet: 19 | column[tagIndex + 1] = vaild 20 | flag = backTrace(column, vaildSet - set([vaild]), tagIndex + 1, vaild) 21 | if not flag: 22 | column[tagIndex + 1] = -1 23 | 24 | 25 | if __name__ == "__main__": 26 | column = [ -1 for i in range(n_queen)] 27 | vaildSet = set(range(n_queen)) 28 | backTrace(column, vaildSet, -1, -1) 29 | for row in result_mat: 30 | print(row) 31 | -------------------------------------------------------------------------------- /RandomForest/Lib/RFLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | import Util.RandomUtil as RandomUtil 8 | 9 | ''' 10 | 计算香农墒 11 | ''' 12 | def calShannonEnt(trainLabel): 13 | m = len(trainLabel) 14 | uniqueVal = set(trainLabel) 15 | countDict = {} 16 | shannonNum = 0.0 17 | for label in trainLabel: 18 | countDict[label] = countDict.get(label, 0) + 1 19 | for label in uniqueVal: 20 | p = (countDict[label] / m) 21 | shannonNum -= p * np.log2(p) 22 | return shannonNum 23 | ''' 24 | 切分数据集 25 | ''' 26 | def splitDataMatrix(dataMatrix, label, axis, value): 27 | returnMat = [] 28 | labelMat = [] 29 | for row, row1 in zip(dataMatrix, label): 30 | if row[axis] == value: 31 | tmp_lst = row[0: axis] 32 | tmp_lst.extend(row[axis + 1:]) 
33 | returnMat.append(tmp_lst) 34 | labelMat.append(row1) 35 | return returnMat, labelMat 36 | ''' 37 | 由信息增益最大化计算出需要切分的属性索引值 38 | ''' 39 | def chooseBestFeature(trainSet, label): 40 | tmp = int(np.log2(len(trainSet[0]))) 41 | k = 1 if tmp == 0 else tmp 42 | indexSet = RandomUtil.generateRandom(0, len(trainSet[0]), k) 43 | m = len(trainSet) 44 | maxGain = -1 45 | baseShannonEnt = calShannonEnt(label) 46 | index = -1 47 | for i in indexSet: 48 | uniqueAttr = set([example[i] for example in trainSet]) 49 | tmp_Ent = 0 50 | for attr in uniqueAttr: 51 | subSet, labelMat = splitDataMatrix(trainSet, label, i, attr) 52 | newShannonEnt = calShannonEnt(labelMat) 53 | tmp_Ent += float(len(subSet) / m) * newShannonEnt 54 | gain = baseShannonEnt - tmp_Ent 55 | if gain > maxGain: 56 | maxGain = gain 57 | index = i 58 | return index 59 | ''' 60 | 训练随机森林所需要的弱分类器 61 | ''' 62 | def generateWeakLearner(trainSet, trainLabel): 63 | if trainLabel.count(trainLabel[0]) == len(trainLabel): 64 | return trainLabel[0] 65 | if len(trainSet[0]) == 0: 66 | return "no" if trainLabel.count("no") > trainLabel.count("yes") else "yes" 67 | index = chooseBestFeature(trainSet, trainLabel) 68 | Tree = {index:{}} 69 | uniqueVal = set([elt[index] for elt in trainSet]) 70 | for value in uniqueVal: 71 | subSet, label = splitDataMatrix(trainSet, trainLabel, index, value) 72 | Tree[index][value] = generateWeakLearner(subSet, label) 73 | return Tree 74 | 75 | def generateRandomForest(trainSet, trainLabel, T): 76 | forest = [] 77 | for i in range(T): 78 | model = generateWeakLearner(trainSet, trainLabel) 79 | forest.append(model) 80 | return forest 81 | 82 | def classfyData(data, model): 83 | if type(model) == str: 84 | return model 85 | key = iter(model.keys()).__next__() 86 | value = data[key] 87 | res = model[key].get(value, None) 88 | if res != None: 89 | return classfyData(data, res) 90 | else: 91 | tmp_lst = [item for item in model[key].keys()] 92 | return classfyData(data, model[key][np.random.choice(tmp_lst, 1)[0]]) 93 | 94 | def predictByRandomForest(models, data): 95 | tmp_lst = [] 96 | for model in models: 97 | predict_label = classfyData(data, model) 98 | tmp_lst.append(predict_label) 99 | tmp_set = set(tmp_lst) 100 | res_lst = [] 101 | for res in tmp_set: 102 | res_lst.append((tmp_lst.count(res), res)) 103 | res_lst = sorted(res_lst, key=lambda index:index[0], reverse=True) 104 | if len(res_lst) == 1: 105 | return res_lst[0][1] 106 | else: 107 | tmp_res = res_lst[0][0] 108 | return_lst = [res_lst[0][1]] 109 | for i in range(1, len(res_lst)): 110 | if res_lst[i][0] == tmp_res: 111 | return_lst.append(res_lst[i][1]) 112 | if len(return_lst) == 1: 113 | return return_lst[0] 114 | else: 115 | return np.random.choice(return_lst, 1)[0] -------------------------------------------------------------------------------- /RandomForest/Lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Lib/__init__.py -------------------------------------------------------------------------------- /RandomForest/Lib/__pycache__/RFLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Lib/__pycache__/RFLib.cpython-36.pyc -------------------------------------------------------------------------------- 
/RandomForest/Lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /RandomForest/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/README.md -------------------------------------------------------------------------------- /RandomForest/Unit/RFUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月4日 3 | 4 | @author: IL MARE 5 | ''' 6 | import Util.DataUtil as DataUtil 7 | import Lib.RFLib as RFLib 8 | import time 9 | from matplotlib import pyplot as plt 10 | import numpy as np 11 | 12 | def loadDataSet(filename): 13 | print("Loading data...") 14 | dataSet, labelSet = DataUtil.loadDataForRMOrDTModel(filename) 15 | print("Loaded data!") 16 | print("Undersampling data...") 17 | dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, "yes", "no") 18 | print("Undersampled data!") 19 | return dataSet, labelSet 20 | 21 | def testRFModel(dataSet, labelSet, T=20): 22 | trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(dataSet, labelSet) 23 | forest = RFLib.generateRandomForest(trainSet, trainLabel, T) 24 | errorCount = 0 25 | for data, label in zip(testSet, testLabel): 26 | predict_label = RFLib.predictByRandomForest(forest, data) 27 | if predict_label != label: 28 | errorCount += 1 29 | RFratio = float(errorCount) / len(testLabel) 30 | print("RF:total error ratio is %.3f, correct ratio is %.3f" % (RFratio, 1 - RFratio)) 31 | return RFratio 32 | 33 | if __name__ == "__main__": 34 | start = time.clock() 35 | dataSet, labelSet = loadDataSet("bank-additional") 36 | tmp_lst = [] 37 | for T in range(20, 0, -1): 38 | totalError = 0 39 | errorList = [] 40 | for i in range(5): 41 | errorRatio = testRFModel(dataSet, labelSet, T) 42 | errorList.append("%.3f" % (1 - errorRatio)) 43 | totalError += errorRatio 44 | print(errorList, "%.3f -- %.3fs" % (1 - totalError / 5.0, time.clock() - start)) 45 | tmp_lst.append((T, errorList, 1 - totalError / 5.0)) 46 | for item in tmp_lst: 47 | print(item) 48 | y = np.array([item[2] for item in tmp_lst], dtype=np.float) 49 | x = np.arange(y.shape[0] + 1, 1, -1) 50 | fig = plt.figure("test") 51 | ax = fig.add_subplot(111) 52 | ax.plot(x, y) 53 | ax.set_ylabel("correct ratio of RF") 54 | ax.set_xlabel("count of basic leaner") 55 | plt.show() -------------------------------------------------------------------------------- /RandomForest/Unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Unit/__init__.py -------------------------------------------------------------------------------- /RandomForest/Util/RandomUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | ''' 8 | 该函数用来在一个集合中随机抽取size个互不相同的随机值 9 | ''' 10 | def generateRandomIndex(a, size): 11 | if len(a) < size: 12 | return None 13 | elif len(a) == size: 14 | return set(a) 15 | 
returnMat = set() 16 | while True: 17 | returnMat.add(np.random.choice(list(a), 1)[0]) 18 | if len(returnMat) == size: 19 | break 20 | return returnMat 21 | ''' 22 | 在指定范围内产生指定数目的不重复的随机数 23 | ''' 24 | def generateRandom(low, high, size): 25 | returnSet = set() 26 | while True: 27 | returnSet.add(np.random.randint(low, high, 1)[0]) 28 | if len(returnSet) == size: 29 | break 30 | return returnSet -------------------------------------------------------------------------------- /RandomForest/Util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Util/__init__.py -------------------------------------------------------------------------------- /RandomForest/Util/__pycache__/DataUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Util/__pycache__/DataUtil.cpython-36.pyc -------------------------------------------------------------------------------- /RandomForest/Util/__pycache__/RandomUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Util/__pycache__/RandomUtil.cpython-36.pyc -------------------------------------------------------------------------------- /RandomForest/Util/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/RandomForest/Util/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /SVM/Lib/RFLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | import Util.RandomUtil as RandomUtil 8 | 9 | ''' 10 | 计算香农墒 11 | ''' 12 | def calShannonEnt(trainLabel): 13 | m = len(trainLabel) 14 | uniqueVal = set(trainLabel) 15 | countDict = {} 16 | shannonNum = 0.0 17 | for label in trainLabel: 18 | countDict[label] = countDict.get(label, 0) + 1 19 | for label in uniqueVal: 20 | p = (countDict[label] / m) 21 | shannonNum -= p * np.log2(p) 22 | return shannonNum 23 | ''' 24 | 切分数据集 25 | ''' 26 | def splitDataMatrix(dataMatrix, label, axis, value): 27 | returnMat = [] 28 | labelMat = [] 29 | for row, row1 in zip(dataMatrix, label): 30 | if row[axis] == value: 31 | tmp_lst = row[0: axis] 32 | tmp_lst.extend(row[axis + 1:]) 33 | returnMat.append(tmp_lst) 34 | labelMat.append(row1) 35 | return returnMat, labelMat 36 | ''' 37 | 由信息增益最大化计算出需要切分的属性索引值 38 | ''' 39 | def chooseBestFeature(trainSet, label): 40 | tmp = int(np.log2(len(trainSet[0]))) 41 | k = 1 if tmp == 0 else tmp 42 | indexSet = RandomUtil.generateRandom(0, len(trainSet[0]), k) 43 | m = len(trainSet) 44 | maxGain = -1 45 | baseShannonEnt = calShannonEnt(label) 46 | index = -1 47 | for i in indexSet: 48 | uniqueAttr = set([example[i] for example in trainSet]) 49 | tmp_Ent = 0 50 | for attr in uniqueAttr: 51 | subSet, labelMat = splitDataMatrix(trainSet, label, i, attr) 52 | newShannonEnt = calShannonEnt(labelMat) 53 | tmp_Ent += float(len(subSet) / m) * newShannonEnt 54 | gain = 
baseShannonEnt - tmp_Ent 55 | if gain > maxGain: 56 | maxGain = gain 57 | index = i 58 | return index 59 | ''' 60 | 训练随机森林所需要的弱分类器 61 | ''' 62 | def generateWeakLearner(trainSet, trainLabel): 63 | if trainLabel.count(trainLabel[0]) == len(trainLabel): 64 | return trainLabel[0] 65 | if len(trainSet[0]) == 0: 66 | return "no" if trainLabel.count("no") > trainLabel.count("yes") else "yes" 67 | index = chooseBestFeature(trainSet, trainLabel) 68 | Tree = {index:{}} 69 | uniqueVal = set([elt[index] for elt in trainSet]) 70 | for value in uniqueVal: 71 | subSet, label = splitDataMatrix(trainSet, trainLabel, index, value) 72 | Tree[index][value] = generateWeakLearner(subSet, label) 73 | return Tree 74 | 75 | def generateRandomForest(trainSet, trainLabel, T): 76 | forest = [] 77 | for i in range(T): 78 | model = generateWeakLearner(trainSet, trainLabel) 79 | forest.append(model) 80 | return forest 81 | 82 | def classfyData(data, model): 83 | if type(model) == str: 84 | return model 85 | key = iter(model.keys()).__next__() 86 | value = data[key] 87 | res = model[key].get(value, None) 88 | if res != None: 89 | return classfyData(data, res) 90 | else: 91 | tmp_lst = [item for item in model[key].keys()] 92 | return classfyData(data, model[key][np.random.choice(tmp_lst, 1)[0]]) 93 | 94 | def predictByRandomForest(models, data): 95 | tmp_lst = [] 96 | for model in models: 97 | predict_label = classfyData(data, model) 98 | tmp_lst.append(predict_label) 99 | tmp_set = set(tmp_lst) 100 | res_lst = [] 101 | for res in tmp_set: 102 | res_lst.append((tmp_lst.count(res), res)) 103 | res_lst = sorted(res_lst, key=lambda index:index[0], reverse=True) 104 | if len(res_lst) == 1: 105 | return res_lst[0][1] 106 | else: 107 | tmp_res = res_lst[0][0] 108 | return_lst = [res_lst[0][1]] 109 | for i in range(1, len(res_lst)): 110 | if res_lst[i][0] == tmp_res: 111 | return_lst.append(res_lst[i][1]) 112 | if len(return_lst) == 1: 113 | return return_lst[0] 114 | else: 115 | return np.random.choice(return_lst, 1)[0] -------------------------------------------------------------------------------- /SVM/Lib/SVMLib.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import numpy as np 7 | import Lib.RFLib as RFLib 8 | 9 | def kernalTransfrom(dataMatrix, vector, kTup): 10 | if kTup[0] == "lin": 11 | return vector * dataMatrix.transpose() 12 | elif kTup[0] == "rbf": 13 | delta = dataMatrix - vector 14 | K = np.matrix(np.diag(delta * delta.transpose()), dtype=np.float) 15 | K = np.exp(K / (-2 * kTup[1] ** 2)) 16 | return K 17 | else: 18 | raise NameError("Kernal Name Error") 19 | 20 | class osStruct: 21 | def __init__(self, dataMatIn, classlabels, C , toler, kTup): 22 | self.dataMatrix = np.matrix(dataMatIn, dtype=np.float) 23 | self.labelMatrix = np.matrix(classlabels, dtype=np.float).transpose() 24 | self.C = C 25 | self.toler = toler 26 | self.m = self.dataMatrix.shape[0] 27 | self.b = 0 28 | self.alphas = np.matrix(np.zeros((self.m, 1)), dtype=np.float) 29 | self.eCache = np.matrix(np.zeros((self.m, 2)), dtype=np.float) 30 | self.K = np.matrix(np.zeros((self.m, self.m)), dtype=np.float) 31 | for i in range(self.m): 32 | self.K[i] = kernalTransfrom(self.dataMatrix, self.dataMatrix[i, :], kTup) 33 | 34 | def selectJRand(i, m): 35 | j = i 36 | while j == i: 37 | j = np.random.randint(0, m, 1)[0] 38 | return j 39 | 40 | def clipAlpha(alpha, L, H): 41 | if alpha >= H: 42 | return H 43 | elif alpha <= L: 44 | return L 45 | else: 
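# alpha already lies inside the box constraint [L, H]; return it unchanged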
46 | return alpha 47 | 48 | def calEi(obj, i): 49 | fxi = float(np.multiply(obj.alphas, obj.labelMatrix).transpose() * \ 50 | obj.K[:, i]) + obj.b 51 | Ek = fxi - obj.labelMatrix[i, 0] 52 | return float(Ek) 53 | 54 | def updateEi(obj, i): 55 | Ei = calEi(obj, i) 56 | obj.eCache[i] = [1, Ei] 57 | 58 | def selectJIndex(obj, i, Ei): 59 | maxJ = -1 60 | maxdelta = -1 61 | Ek = -1 62 | obj.eCache[i] = [1, Ei] 63 | vaildEiList = np.nonzero(obj.eCache[:, 0].A)[0] 64 | if len(vaildEiList) > 1: 65 | for j in vaildEiList: 66 | if j == i: 67 | continue 68 | Ej = calEi(obj, j) 69 | delta = np.abs(Ei - Ej) 70 | if delta > maxdelta: 71 | maxdelta = delta 72 | maxJ = j 73 | Ek = Ej 74 | else: 75 | maxJ = selectJRand(i, obj.m) 76 | Ek = calEi(obj, maxJ) 77 | return Ek, maxJ 78 | 79 | def innerLoop(obj, i): 80 | Ei = calEi(obj, i) 81 | if (obj.labelMatrix[i, 0] * Ei < -obj.toler and obj.alphas[i, 0] < obj.C) or \ 82 | (obj.labelMatrix[i, 0] * Ei > obj.toler and obj.alphas[i, 0] > 0): 83 | Ej, j = selectJIndex(obj, i, Ei) 84 | alphaIold = obj.alphas[i, 0].copy() 85 | alphaJold = obj.alphas[j, 0].copy() 86 | if obj.labelMatrix[i, 0] == obj.labelMatrix[j, 0]: 87 | L = max(0, obj.alphas[i, 0] + obj.alphas[j, 0] - obj.C) 88 | H = min(obj.C , obj.alphas[i, 0] + obj.alphas[j, 0]) 89 | else: 90 | L = max(0, obj.alphas[j, 0] - obj.alphas[i, 0]) 91 | H = min(obj.C, obj.C - obj.alphas[i, 0] + obj.alphas[j, 0]) 92 | if L == H: 93 | return 0 94 | eta = obj.K[i, i] + obj.K[j, j] - 2 * obj.K[i, j] 95 | if eta <= 0: 96 | return 0 97 | obj.alphas[j, 0] += obj.labelMatrix[j, 0] * (Ei - Ej) / eta 98 | obj.alphas[j, 0] = clipAlpha(obj.alphas[j, 0], L, H) 99 | updateEi(obj, j) 100 | if np.abs(obj.alphas[j, 0] - alphaJold) < 0.00001: 101 | return 0 102 | obj.alphas[i, 0] += obj.labelMatrix[i, 0] * obj.labelMatrix[j, 0] * (alphaJold - obj.alphas[j, 0]) 103 | updateEi(obj, i) 104 | b1 = -Ei - obj.labelMatrix[i, 0] * obj.K[i, i] * (obj.alphas[i, 0] - alphaIold) \ 105 | - obj.labelMatrix[j, 0] * obj.K[i, j] * (obj.alphas[j, 0] - alphaJold) + obj.b 106 | b2 = -Ej - obj.labelMatrix[i, 0] * obj.K[i, j] * (obj.alphas[i, 0] - alphaIold) \ 107 | - obj.labelMatrix[j, 0] * obj.K[j, j] * (obj.alphas[j, 0] - alphaJold) + obj.b 108 | if obj.alphas[i, 0] > 0 and obj.alphas[i, 0] < obj.C: 109 | obj.b = b1 110 | elif obj.alphas[j, 0] > 0 and obj.alphas[j, 0] < obj.C: 111 | obj.b = b2 112 | else: 113 | obj.b = (b1 + b2) / 2.0 114 | return 1 115 | else: 116 | return 0 117 | 118 | def realSMO(trainSet, trainLabels, C, toler, kTup=('lin', 1.3), maxIter=40): 119 | obj = osStruct(trainSet, trainLabels, C, toler, kTup) 120 | entrySet = True 121 | iterNum = 0 122 | alphapairschanged = 0 123 | while (iterNum < maxIter) and (alphapairschanged > 0 or entrySet): 124 | print(iterNum) 125 | alphapairschanged = 0 126 | if entrySet: 127 | for i in range(obj.m): 128 | alphapairschanged += innerLoop(obj, i) 129 | if i % 100 == 0: 130 | print("full set loop, iter: %d, alphapairschanged: %d, iterNum: %d" % (i, alphapairschanged, iterNum)) 131 | iterNum += 1 132 | else: 133 | vaildalphsaList = np.nonzero((obj.alphas.A > 0) * (obj.alphas.A < C))[0] 134 | for i in vaildalphsaList: 135 | alphapairschanged += innerLoop(obj, i) 136 | if i % 100 == 0: 137 | print("non-bound set loop, iter: %d, alphapairschanged: %d, iterNum: %d" % (i, alphapairschanged, iterNum)) 138 | iterNum += 1 139 | if entrySet: 140 | entrySet = False 141 | elif alphapairschanged == 0: 142 | entrySet = True 143 | print("iter num: %d" % (iterNum)) 144 | return obj.alphas, obj.b 145 | 146 | def 
getSupportVectorandSupportLabel(trainSet, trainLabel, alphas): 147 | vaildalphaList = np.nonzero(alphas.A)[0] 148 | dataMatrix = np.matrix(trainSet, dtype=np.float) 149 | labelMatrix = np.matrix(trainLabel, dtype=np.float).transpose() 150 | sv = dataMatrix[vaildalphaList]#得到支持向量 151 | svl = labelMatrix[vaildalphaList] 152 | return sv, svl 153 | 154 | def predictLabel(data, sv, svl, alphas, b, kTup): 155 | kernal = kernalTransfrom(sv, np.matrix(data, dtype=np.float), kTup).transpose() 156 | fxi = np.multiply(svl.T, alphas[alphas != 0]) * kernal + b 157 | return np.sign(float(fxi)) -------------------------------------------------------------------------------- /SVM/Lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Lib/__init__.py -------------------------------------------------------------------------------- /SVM/Lib/__pycache__/RFLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Lib/__pycache__/RFLib.cpython-36.pyc -------------------------------------------------------------------------------- /SVM/Lib/__pycache__/SVMLib.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Lib/__pycache__/SVMLib.cpython-36.pyc -------------------------------------------------------------------------------- /SVM/Lib/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Lib/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /SVM/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/README.md -------------------------------------------------------------------------------- /SVM/Unit/SVMUnit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月4日 3 | 4 | @author: IL MARE 5 | ''' 6 | import time 7 | from Lib import SVMLib as SVMLib 8 | from Util import DataUtil as DataUtil 9 | 10 | if __name__ == "__main__": 11 | start = time.clock() 12 | # dataSet, labelSet = DataUtil.loadDataForSVMOrLRModel("bank-additional", "svm")#正统方法 13 | dataSet, labelSet = DataUtil.loadTempDataForSVMOrLRModel("bank-addtional-format-svm") 14 | dataSet, labelSet = DataUtil.underSampling(dataSet, labelSet, 1, -1) 15 | trainSet, trainLabel, testSet, testLabel = DataUtil.generateTrainSet(dataSet, labelSet) 16 | kTup = ("lin", 1.2) 17 | alphas, b = SVMLib.realSMO(trainSet, trainLabel, 0.6, 0.01, kTup, 10) 18 | errorCount = 0 19 | sv, svl = SVMLib.getSupportVectorandSupportLabel(trainSet, trainLabel, alphas) 20 | for data, label in zip(testSet, testLabel): 21 | predict_label = SVMLib.predictLabel(data, *[sv, svl, alphas, b, kTup]) 22 | if predict_label != label: 23 | errorCount += 1 24 | ratio = errorCount / len(testLabel) 25 | print("the error ratio is %.3f, the correct ratio is %.3f -- %.3fs" % (ratio, 1 - ratio, time.clock() - 
start)) 26 | -------------------------------------------------------------------------------- /SVM/Unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Unit/__init__.py -------------------------------------------------------------------------------- /SVM/Util/DataUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | import csv 7 | import numpy as np 8 | import re 9 | import Util.RandomUtil as RandomUtil 10 | from Lib import RFLib as RFLib 11 | 12 | filePath = r"G:\研究生课件\数据挖掘\实验数据" 13 | 14 | ''' 15 | ==========================与随机森林及决策树有关的工具类============================== 16 | ''' 17 | ''' 18 | 为随机森林模型及决策树模型产生数据,函数接受一个csv文件的文件名,具体路径在filePath中写明 19 | ''' 20 | def loadDataForRMOrDTModel(filename): 21 | try: 22 | fp = open("{0}/{1}.csv".format(filePath, filename), "r") 23 | reader = csv.reader(fp) 24 | trainSet = [] 25 | trainLabel = [] 26 | reader.__next__() 27 | for line in reader: 28 | tmp_lst = [] 29 | for msg in line[0].split(";"): 30 | tmp_lst.append(re.search(r"[0-9a-zA-Z.-]+", msg)[0]) 31 | trainSet.append(tmp_lst[0: -1]) 32 | trainLabel.append(tmp_lst[-1]) 33 | return processData(trainSet), trainLabel 34 | except Exception as e: 35 | print(e) 36 | finally: 37 | fp.close() 38 | ''' 39 | 该函数为随机森林服务,将原始数据集中的连续值离散化,便于随机森林处理 40 | ''' 41 | def processData(trainSet): 42 | for row in trainSet: 43 | if float(row[0]) < 20: 44 | row[0] = "1" 45 | elif float(row[0]) >= 20 and float(row[0]) < 30: 46 | row[0] = "2" 47 | elif float(row[0]) >= 30 and float(row[0]) < 40: 48 | row[0] = "3" 49 | elif float(row[0]) >= 40 and float(row[0]) < 50: 50 | row[0] = "4" 51 | elif float(row[0]) >= 50 and float(row[0]) < 60: 52 | row[0] = "5" 53 | elif float(row[0]) >= 60 and float(row[0]) < 70: 54 | row[0] = "6" 55 | else: 56 | row[0] = "7" 57 | row[10] = str(float(row[10]) // 30 + 1) 58 | row[-2] = str(float(row[-2]) // 0.1 + 1) 59 | return trainSet 60 | 61 | ''' 62 | ==========================与SVM及对数回归有关的工具类============================== 63 | ''' 64 | ''' 65 | 为SVM及对数回归模型产生数据,函数接受一个csv文件的文件名,具体路径在filePath中写明 66 | ''' 67 | global_var = {1:['blue-collar', 'entrepreneur', 'unemployed', 'admin.', 'retired', 'services', \ 68 | 'technician', 'self-employed', 'management', 'housemaid', 'student'], 69 | 2:['single', 'married', 'divorced'], 70 | 4:['yes', 'no'], 71 | 5:['yes', 'no'], 72 | 6:['yes', 'no'], 73 | 7:['cellular', 'telephone'], 74 | 14:['failure', 'success', 'nonexistent']} 75 | 76 | global_var_order = {3:['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree'], 77 | 8:['mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'], 78 | 9:['mon', 'tue', 'wed', 'thu', 'fri']} 79 | ''' 80 | 读取处理好的数据,加快模型测试速度 81 | ''' 82 | def loadTempDataForSVMOrLRModel(filename): 83 | try: 84 | fp = open("{0}/{1}.csv".format(filePath, filename), "r") 85 | reader = csv.reader(fp) 86 | trainSet = [] 87 | trainLabel = [] 88 | for line in reader: 89 | trainSet.append(line[0: -1]) 90 | trainLabel.append(int(line[-1])) 91 | return trainSet, trainLabel 92 | except Exception as e: 93 | print(e) 94 | finally: 95 | fp.close() 96 | ''' 97 | 为SVM和对数回归模型生成数据,重点的功能是将数据量化处理 98 | ''' 99 | def loadDataForSVMOrLRModel(filename, modelType="lr"): 100 | try: 101 | fp = open("{0}/{1}.csv".format(filePath, filename), "r") 102 | 
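# Each record of the bank-additional CSV arrives as a single ';'-separated field; the loop below splits it and strips quoting/punctuation with a regex before use.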
reader = csv.reader(fp) 103 | trainSet = [] 104 | trainLabel = [] 105 | reader.__next__() 106 | for line in reader: 107 | tmp_lst = [] 108 | for msg in line[0].split(";"): 109 | tmp_lst.append(re.search(r"[0-9a-zA-Z.-]+", msg)[0]) 110 | trainSet.append(tmp_lst[0: -1]) 111 | trainLabel.append(tmp_lst[-1]) 112 | fullfilltheUnknownValue(trainSet, trainLabel) 113 | quantizedData(trainSet, trainLabel, modelType) 114 | normalData(trainSet, modelType) 115 | return trainSet, trainLabel 116 | except Exception as e: 117 | print(e) 118 | finally: 119 | fp.close() 120 | ''' 121 | 正规化数据 122 | ''' 123 | def normalData(trainSet, modelType): 124 | tmp_lst = [] 125 | for i in range(len(trainSet[0])): 126 | tmp_lst.append(np.array([item[i] for item in trainSet], dtype=np.float)) 127 | for i in range(len(tmp_lst)): 128 | item = tmp_lst[i] 129 | tmp_lst[i] = (item.min(), item.max(), item.mean(), item.std()) 130 | for i in range(len(trainSet)): 131 | for j in range(len(trainSet[i])): 132 | val = tmp_lst[j] 133 | if modelType == "lr": 134 | trainSet[i][j] = (float(trainSet[i][j]) - val[0]) / (val[1] - val[0]) 135 | else: 136 | trainSet[i][j] = (float(trainSet[i][j]) - val[2]) / val[3] 137 | ''' 138 | 为随机森林预测模型产生数据,该函数的作用是删除数据集中unknown的数据 139 | ''' 140 | def formatTrainSet(trainSet, trainLabel, axis): 141 | dataSet = [] 142 | for item in trainSet: 143 | dataSet.append(item.copy()) 144 | labelSet = trainLabel.copy() 145 | value_set = set() 146 | del_index = [] 147 | for i in range(len(dataSet)): 148 | temp = dataSet[i][axis] 149 | if temp == "unknown": 150 | del_index.append(i) 151 | else: 152 | value_set.add(temp) 153 | dataSet[i][axis] = labelSet[i] 154 | labelSet[i] = temp 155 | for i in range(len(del_index)): 156 | index = del_index[i] - i 157 | del dataSet[index] 158 | del labelSet[index] 159 | return dataSet, labelSet, value_set 160 | ''' 161 | 训练随即森林模型用于预测缺失值 162 | ''' 163 | def trainPredictRandomForest(trainSet, trainLabel, axis): 164 | dataSet, labelSet, value_set = formatTrainSet(trainSet, trainLabel, axis) 165 | dataSet1, labelSet1 = underSampling(dataSet, labelSet, *value_set) 166 | forest = RFLib.generateRandomForest(dataSet1, labelSet1, 19) 167 | return forest 168 | ''' 169 | 遍历数据集,将原始数据集中的缺失值补上 170 | ''' 171 | def predictValue(dataSet, labelSet, axis): 172 | forest = trainPredictRandomForest(dataSet, labelSet, axis) 173 | for item in zip(dataSet, labelSet): 174 | if item[0][axis] == "unknown": 175 | tmp_lst = item[0][0:axis] 176 | tmp_lst.extend([item[1]]) 177 | tmp_lst.extend(item[0][axis + 1:]) 178 | predict = RFLib.predictByRandomForest(forest, tmp_lst) 179 | item[0][axis] = predict 180 | ''' 181 | 该函数用于将数据集中为unknown的属性值都用随机森林预测值来补上 182 | ''' 183 | def fullfilltheUnknownValue(dataSet, labelSet): 184 | predict_set = set() 185 | for data, label in zip(dataSet, labelSet): 186 | for i in range(len(data)): 187 | if data[i] == "unknown": 188 | predict_set.add(i) 189 | for index in predict_set: 190 | predictValue(dataSet, labelSet, index) 191 | ''' 192 | 将原始数据集中的离散值量化 193 | ''' 194 | def quantizedData(dataSet, labelSet, modelType="lr"): 195 | if modelType == "lr": 196 | for i in range(len(labelSet)): 197 | if labelSet[i] == "no": 198 | labelSet[i] = 0 199 | else: 200 | labelSet[i] = 1 201 | else: 202 | for i in range(len(labelSet)): 203 | if labelSet[i] == "no": 204 | labelSet[i] = -1 205 | else: 206 | labelSet[i] = 1 207 | global global_var_order, global_var 208 | index_lst = [index for index in global_var_order.keys()] 209 | index_lst.extend([index for index in global_var.keys()]) 210 | index_lst = 
sorted(index_lst) 211 | for i in range(len(dataSet)): 212 | item = dataSet[i] 213 | tmp_lst = [] 214 | for index in index_lst: 215 | variable = generateDummyVar(item[index], index) if generateDummyVar(item[index], index) \ 216 | else generateOrderVar(item[index], index) 217 | if variable == None: 218 | raise NameError("变量量化失败") 219 | tmp_lst.append((index, variable)) 220 | dataSet[i] = generateNewList(item, tmp_lst) 221 | ''' 222 | 根据量化值扩展远列表 223 | ''' 224 | def generateNewList(oldList, tmp_lst): 225 | return_mat = [] 226 | index_set = list() 227 | for item in tmp_lst: 228 | index_set.append(item[0]) 229 | for i in range(len(oldList)): 230 | if i in index_set: 231 | for item in tmp_lst[0][1]: 232 | return_mat.append(item) 233 | del tmp_lst[0] 234 | else: 235 | return_mat.append(oldList[i]) 236 | return return_mat 237 | ''' 238 | 对无序离散值生成哑变量 239 | ''' 240 | def generateDummyVar(variable, index): 241 | global global_var 242 | var_list = global_var.get(index, None) 243 | if var_list == None: 244 | return None 245 | num_dumm = len(var_list) - 1 246 | retrun_mat = [0] * num_dumm 247 | for i in range(num_dumm): 248 | var = var_list[i] 249 | if var == variable: 250 | retrun_mat[i] = 1 251 | return retrun_mat 252 | return retrun_mat 253 | ''' 254 | 对有序离散值生成连续变量 255 | ''' 256 | def generateOrderVar(variable, index): 257 | global global_var_order 258 | var_list = global_var_order.get(index, None) 259 | if var_list == None: 260 | return None 261 | for i in range(len(var_list)): 262 | if variable == var_list[i]: 263 | return [i + 1] 264 | return None 265 | ''' 266 | =================================通用工具函数========================================== 267 | ''' 268 | ''' 269 | 该函数用语欠抽样原始数据集,由于原始数据集中类别不平衡,正例只有反例的十分之一 270 | 为了模型的泛化能力,需要欠抽样来保证正例和反例数目相同 271 | ''' 272 | 273 | def underSampling(dataSet, labelSet, *args): 274 | trainSet = dataSet.copy() 275 | trainLabel = labelSet.copy() 276 | labelcount_lst = [] 277 | for label in args: 278 | labelcount_lst.append((trainLabel.count(label), label)) 279 | labelcount_lst = sorted(labelcount_lst, key=lambda item:item[0]) 280 | min_val, labelName = labelcount_lst[0] 281 | label_set = set(args) - set([labelName]) 282 | for label in label_set: 283 | tmp_set = set() 284 | for item in enumerate(trainLabel): 285 | if item[1] == label: 286 | tmp_set.add(item[0]) 287 | indexSet = RandomUtil.generateRandomIndex(tmp_set, min_val) 288 | del_set = tmp_set - indexSet 289 | del_set = sorted(list(del_set)) 290 | for i in range(len(del_set)): 291 | index = del_set[i] - i 292 | del trainSet[index] 293 | del trainLabel[index] 294 | return trainSet, trainLabel 295 | 296 | ''' 297 | 该方法在欠抽样后的数据集上工作,使用自助法产生训练集和测试集,训练集大小为愿数据集大小的62% 298 | 测试集大小为原始数据集大小的32% 299 | ''' 300 | def generateTrainSet(dataSet, labelSet): 301 | trainSet = [] 302 | trainLabel = [] 303 | testSet = [] 304 | testLabel = [] 305 | m = len(labelSet) 306 | trainIndex = set() 307 | totalIndex = set() 308 | for i in range(m): 309 | index = np.random.randint(0, m, 1)[0] 310 | trainIndex.add(index) 311 | totalIndex.add(i) 312 | trainSet.append(dataSet[index]) 313 | trainLabel.append(labelSet[index]) 314 | for item in totalIndex - trainIndex: 315 | testSet.append(dataSet[item]) 316 | testLabel.append(labelSet[item]) 317 | return trainSet, trainLabel, testSet, testLabel -------------------------------------------------------------------------------- /SVM/Util/RandomUtil.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2018年3月28日 3 | 4 | @author: IL MARE 5 | ''' 6 | 
import numpy as np 7 | ''' 8 | 该函数用来在一个集合中随机抽取size个互不相同的随机值 9 | ''' 10 | def generateRandomIndex(a, size): 11 | if len(a) < size: 12 | return None 13 | elif len(a) == size: 14 | return set(a) 15 | returnMat = set() 16 | while True: 17 | returnMat.add(np.random.choice(list(a), 1)[0]) 18 | if len(returnMat) == size: 19 | break 20 | return returnMat 21 | ''' 22 | 在指定范围内产生指定数目的不重复的随机数 23 | ''' 24 | def generateRandom(low, high, size): 25 | returnSet = set() 26 | while True: 27 | returnSet.add(np.random.randint(low, high, 1)[0]) 28 | if len(returnSet) == size: 29 | break 30 | return returnSet -------------------------------------------------------------------------------- /SVM/Util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Util/__init__.py -------------------------------------------------------------------------------- /SVM/Util/__pycache__/DataUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Util/__pycache__/DataUtil.cpython-36.pyc -------------------------------------------------------------------------------- /SVM/Util/__pycache__/RandomUtil.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Util/__pycache__/RandomUtil.cpython-36.pyc -------------------------------------------------------------------------------- /SVM/Util/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/SVM/Util/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /notebooks/Tensorflow document.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tensorflow部分方法介绍\n", 8 | "\n", 9 | "* **random_uniform** 函数用来产生一个随机生成的矩阵\n", 10 | "\n", 11 | "```python\n", 12 | "tf.random_uniform(shape, dtype, seed, seed2, name)\n", 13 | "```\n", 14 | "\n", 15 | " 参数说明:\n", 16 | "\n", 17 | " * shape:生成随机矩阵的大小,通常以列表或元组形式给出。例如:shape=(size1, size2)或shape=[size1, size2]\n", 18 | "\n", 19 | " * dtype:生成随机矩阵中值的类型。\n", 20 | "\n", 21 | " * seed:随机数生成的下界。\n", 22 | "\n", 23 | " * seed2:随机数生成的的上界。\n", 24 | "\n", 25 | "\n", 26 | "* **embedding_lookup** 函数用于截取矩阵的某几行\n", 27 | "\n", 28 | "```python\n", 29 | "tf.nn.embedding_lookup(params, ids)\n", 30 | "```\n", 31 | "\n", 32 | " 参数说明:\n", 33 | " \n", 34 | " * params:截取的目标矩阵。\n", 35 | " \n", 36 | " * ids:截取行数,通常用列表形式给出。例如要截取params矩阵的第1,2,3行,就应该如下写:ids=[1,2,3]\n", 37 | "\n", 38 | "* **truncated_normal** 用于产生一个服从正态分布的矩阵\n", 39 | "\n", 40 | "```python\n", 41 | "tf.truncated_normal(shape, mean, stddev, dtype, seed, name)\n", 42 | "```\n", 43 | "\n", 44 | " 参数说明:\n", 45 | " \n", 46 | " * shape:矩阵的大小,通常用元组形式给出。例如:shape=(size1, size2)\n", 47 | " \n", 48 | " * mean:矩阵中均值的大小。\n", 49 | " \n", 50 | " * stddev:矩阵中标准差。\n", 51 | "\n", 52 | "* **variable_scope**用于产生一个类似命名空间的效果,可以起到一个变量隔离的作用\n", 53 | "\n", 54 | "```\n", 55 | "with tf.variable_scope(\"name\", reuse=False) as scope:\n", 
56 | " a = tf.get_variable(\"var_name\")\n", 57 | " scope.reuse_variables()\n", 58 | " b = tf.get_variable(\"var_name\")\n", 59 | " assert a == b\n", 60 | "```\n", 61 | "\n", 62 | " 参数说明:\n", 63 | " \n", 64 | " * reuse:在此命名空间中,变量是否可重用,默认为不可重用,若强行重用(无论是在一个scope上下文管理器中,还是在两个同名的上下文管理器中)会引发异常。\n", 65 | " \n", 66 | "* **trainable_variables()**返回所有的variable型的变量,若变量声明中trainable为False的除外\n", 67 | "\n", 68 | "```\n", 69 | "a = tf.Variable(tf.float32, [1])\n", 70 | "b = tf.get_variable(\"b\", tf.float32, [1], trainable=False)\n", 71 | "c = tf.trainable_variables()\n", 72 | "print(c)\n", 73 | "#\n", 74 | "```\n", 75 | "\n", 76 | "* **tf.gradients(y, x)**用于求函数y关于x的梯度,函数y的表达式中一定要有x项,否则会报错。\n", 77 | "\n", 78 | "```\n", 79 | "sess = tf.InteractiveSession()\n", 80 | " with tf.variable_scope(\"foo\") as scope:\n", 81 | " a = tf.get_variable(\"a\", shape=(10), dtype=tf.int32)\n", 82 | " x = tf.constant([1,2,3,4,5,6,7,8,9,10])\n", 83 | " b = tf.constant([2,2,2,2,2,2,2,2,2,2])\n", 84 | " y = a * x + b * a\n", 85 | " sess.run(y, feed_dict={a:[3,3,3,3,3,3,3,3,3,3]})\n", 86 | " for item in tf.gradients(y, a):\n", 87 | " print(item.eval())\n", 88 | "#[ 3 4 5 6 7 8 9 10 11 12]\n", 89 | "```\n", 90 | " \n", 91 | " 参数说明:\n", 92 | " \n", 93 | " * y:函数表达式变量名。\n", 94 | " \n", 95 | " * x:需要求梯度的变量。\n", 96 | " \n", 97 | "* **clip_by_global_norm**求一个合适的梯度,以防止梯度爆炸等不好的情况\n", 98 | "\n", 99 | "```\n", 100 | "tf.clip_by_global_norm(grad, max_grad_norm)\n", 101 | "```\n", 102 | "\n", 103 | " 参数说明:\n", 104 | " \n", 105 | " * grad:梯度,注意这里是tf.gradients求出的梯度张量,该函数在这个张量的基础上做调整。\n", 106 | " \n", 107 | " * max_grad_norm:一个截取比率。\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.5.2" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /pics/fa-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fa-1.jpg -------------------------------------------------------------------------------- /pics/fr-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-1.jpg -------------------------------------------------------------------------------- /pics/fr-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-2.jpg -------------------------------------------------------------------------------- /pics/fr-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-3.jpg 
-------------------------------------------------------------------------------- /pics/fr-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-4.jpg -------------------------------------------------------------------------------- /pics/fr-7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-7.jpg -------------------------------------------------------------------------------- /pics/fr-8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-8.jpg -------------------------------------------------------------------------------- /pics/fr-9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/fr-9.jpg -------------------------------------------------------------------------------- /pics/rnn-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/rnn-1.png -------------------------------------------------------------------------------- /pics/rnn-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/rnn-2.png -------------------------------------------------------------------------------- /pics/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/rnn.png -------------------------------------------------------------------------------- /pics/svd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/svd.jpg -------------------------------------------------------------------------------- /pics/svd1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/svd1.jpg -------------------------------------------------------------------------------- /pics/svd2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/svd2.jpg -------------------------------------------------------------------------------- /pics/svd3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yhilmare/machine-learning-notes/fab6178303f3f1d3475df5736cfc70f3062e7514/pics/svd3.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.0.2 2 | numpy==1.15.4 3 | opencv-python==3.4.5.20 4 | sklearn==0.0 
5 | tensorboard==1.12.2 6 | tensorflow==1.12.0 --------------------------------------------------------------------------------
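As a convenience (not part of the repository), one might sanity-check an environment against a few of these pins before running the units. A minimal sketch, assuming the listed packages expose a standard `__version__` attribute and that opencv-python is imported as `cv2`:

import importlib

# Expected versions for a few of the pins in requirements.txt (hypothetical helper, not part of this repo).
PINS = {"matplotlib": "3.0.2", "numpy": "1.15.4", "cv2": "3.4.5.20", "tensorflow": "1.12.0"}

for module_name, expected in PINS.items():
    try:
        module = importlib.import_module(module_name)
        installed = getattr(module, "__version__", "unknown")
        status = "OK" if installed == expected else "MISMATCH"
        print("%-12s installed=%-10s expected=%-10s %s" % (module_name, installed, expected, status))
    except ImportError:
        print("%-12s not installed; run: pip install -r requirements.txt" % module_name)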