├── README.md
└── 机器学习实验准备
    ├── 参考代码
    │   ├── LinearRegression
    │   │   ├── Data
    │   │   │   └── Folds5x2_pp.csv
    │   │   └── LinearRegression.py
    │   ├── SVM
    │   │   └── SupportVectorMachine.py
    │   ├── bayes_classify_demo
    │   │   ├── test.csv
    │   │   ├── watermelon3_0_En.csv
    │   │   └── watermelon_clf.py
    │   └── cnn-text-classification-tf
    │       ├── data
    │       │   └── rt-polaritydata
    │       │       ├── rt-polarity.neg
    │       │       └── rt-polarity.pos
    │       ├── data_helpers.py
    │       ├── eval.py
    │       ├── readme.txt
    │       ├── text_cnn.py
    │       └── train.py
    ├── 实验一 线性回归实验指导书.doc
    ├── 实验一 线性回归模型实验指导.pptx
    ├── 实验三 贝叶斯分类实验指导书.doc
    ├── 实验三 贝叶斯分类解决西瓜问题.pptx
    ├── 实验二 支持向量机实验指导书.doc
    ├── 实验二 支持向量机模型实验指导.pptx
    ├── 实验四 基于tensorflow实现CNN文本分类.pptx
    ├── 实验四 基于tensorflow实现cnn文本处理实验指导书.doc
    └── 文件说明.txt

/README.md:
--------------------------------------------------------------------------------
# ML-Experiments
This repository collects the four machine learning experiments I designed while working as a teaching assistant for the course. They cover simple linear regression, the naive Bayes classifier, support vector machines, and CNN-based text classification.
Each experiment comes with a lab handbook, lecture slides, and reference code. Discussion and feedback are welcome.

# File Description
1. Lecture slides (PPT)
- 实验一 线性回归模型实验指导 (Experiment 1: the linear regression model)
- 实验二 支持向量机模型实验指导 (Experiment 2: the support vector machine model)
- 实验三 贝叶斯分类解决西瓜问题 (Experiment 3: Bayes classification on the watermelon problem)
- 实验四 基于tensorflow实现CNN文本分类 (Experiment 4: CNN text classification with TensorFlow)

2. Lab handbooks
- 实验一 线性回归实验指导书 (Experiment 1: linear regression)
- 实验二 支持向量机实验指导书 (Experiment 2: support vector machine)
- 实验三 贝叶斯分类实验指导书 (Experiment 3: Bayes classification)
- 实验四 基于tensorflow实现cnn文本处理实验指导书 (Experiment 4: CNN text processing with TensorFlow)

3. Reference code
- Experiment 1: LinearRegression
- Experiment 2: SVM
- Experiment 3: bayes_classify_demo
- Experiment 4: cnn-text-classification-tf

# Acknowledgments
- Thanks to the university's teaching-assistant program for the chance to TA a course I am interested in and to grow from the experience;
- Thanks to my advisor for the TA opportunity, which let me put the machine learning theory I have learned into practice;
- Thanks to my labmates for helping revise the experiment documents and for assisting with the Q&A sessions.

### Note: the experiments are fairly basic and there is still room to improve them; comments and suggestions are welcome.
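# Quick Sanity Check
The reference code assumes Python 3 with scikit-learn 0.20+ (for `sklearn.model_selection`) and, for Experiment 4, TensorFlow 1.x. The sketch below is not part of the original reference code; it only mirrors the Experiment 1 pipeline (`train_test_split`, `LinearRegression`, MSE/RMSE) on synthetic data from `make_regression`, so it runs without the Folds5x2_pp.csv dataset. The sample count and noise level are illustrative assumptions.

```python
# Minimal sketch of the Experiment 1 workflow on synthetic data (illustrative only).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Four features standing in for AT, V, AP, RH; the target stands in for PE.
X, y = make_regression(n_samples=1000, n_features=4, noise=10.0, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # 25% test split by default

linreg = LinearRegression()
linreg.fit(X_train, y_train)
print("coefficients (W):", linreg.coef_)
print("intercept (b):", linreg.intercept_)

y_pred = linreg.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE :", mse)
print("RMSE:", np.sqrt(mse))
```
If this runs cleanly, LinearRegression.py should behave the same way once its `DataPath` points at `Data/Folds5x2_pp.csv`.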
--------------------------------------------------------------------------------
/机器学习实验准备/参考代码/LinearRegression/LinearRegression.py:
--------------------------------------------------------------------------------
#encoding=utf-8
import matplotlib.pyplot as plt   # plotting
import numpy as np
import pandas as pd               # pandas is used to read the csv data
from sklearn import datasets, linear_model
#from sklearn.cross_validation import train_test_split   # for scikit-learn versions before 0.18
from sklearn.model_selection import train_test_split      # for scikit-learn 0.20 and later
from sklearn.linear_model import LinearRegression
from sklearn import metrics       # scikit-learn's model-evaluation tools

"""
Read the sample data and split it into a training set and a test set.
"""
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    X = data[['AT','V','AP','RH']]   # the four columns AT, V, AP and RH are the input features
    y = data[['PE']]                 # PE is the target
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)   # random split; by default 25% of the data becomes the test set
    # inspect the dimensions of the training and test sets
    print("Dimensions of the training and test sets:")
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    return X_train,X_test,y_train,y_test

"""
Train the Linear Regression model and obtain the fitted parameters.
"""
def TrainLinearRegression(X_train,y_train):
    linreg = LinearRegression()   # untrained model
    linreg.fit(X_train,y_train)   # fit the model on the training inputs X_train and outputs y_train
    """
    Print the intercept and coefficients of the fitted model, which give the regression function:
    PE = 447.06297099 - 1.97376045*AT - 0.23229086*V + 0.0693515*AP - 0.15806957*RH
    """
    print("Linear regression coefficients (W):", linreg.coef_)
    print("Linear regression intercept (b):", linreg.intercept_)
    return linreg

"""
Evaluate the model on the test set with the mean squared error (MSE)
and the root mean squared error (RMSE).
"""
def EvaluationModel(linreg,X_test,y_test):
    y_pred = linreg.predict(X_test)
    # MSE computed with scikit-learn
    print("MSE:", metrics.mean_squared_error(y_test,y_pred))
    # RMSE computed with scikit-learn
    print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
    return y_pred

"""
Visualize how well the model fits.
For the output y, the true values and the predictions are both one-dimensional and correspond one to one;
the smaller their difference, the more accurate the prediction. If prediction equals truth the difference
is minimal, which is the black dashed line in the plot. The x-axis shows the measured values and the
y-axis the predictions, so the closer a point lies to the black line, the more accurate that prediction.
"""
def Visualization(y_test,y_pred):
    fig,ax = plt.subplots()
    ax.scatter(y_test,y_pred)
    ax.plot([y_test.min(),y_test.max()],[y_test.min(),y_test.max()],'k--',lw=5)   # 'k--': black dashed line; lw sets the line width
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")
    plt.show()

if __name__ == "__main__":
    DataPath = "./Data/Folds5x2_pp.csv"   # relative path to the data
    X_train, X_test, y_train, y_test = getTrainSetAndTestSet(DataPath)
    linreg = TrainLinearRegression(X_train,y_train)
    y_pred = EvaluationModel(linreg, X_test, y_test)
    Visualization(y_test, y_pred)
--------------------------------------------------------------------------------
/机器学习实验准备/参考代码/SVM/SupportVectorMachine.py:
--------------------------------------------------------------------------------
#encoding=utf-8
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import make_blobs

"""
-------- Get the data --------
Randomly generate 40 independent points belonging to two classes.
X looks like [[6.37734541, -10.61510727], [6.50072722, -3.82403586], ...]
y looks like [1 0 1 0 1 ... 1 0 1], where 0 and 1 are the class labels.
"""
X,y = make_blobs(n_samples=40,centers=2,random_state=6)   # generate 40 samples from 2 clusters
"""
print(X)
print(y)
"""

"""
-------- Train the SVM model --------
kernel='linear'  selects the linear kernel, mainly for linearly separable data; few parameters and fast.
C=1000           is the penalty parameter. A large C penalizes the slack variables heavily, pushing them
                 towards 0, i.e. misclassification is punished more and the model tries to classify the
                 whole training set correctly; training accuracy is high but generalization is weaker.
                 A small C tolerates misclassified points, treating them as noise, and generalizes better.
"""
clf = svm.SVC(kernel='linear',C=1000)   # SVC is the SVM variant used for classification
clf.fit(X,y)                            # feed the data to the model and train it
print("Model parameter W:", clf.coef_)            # weight vector W
print("Model parameter b:", clf.intercept_)       # bias b
print("Support vectors:", clf.support_vectors_)   # print the support vectors
xpredict = [10,-10]
xpredict = np.array(xpredict).reshape(1,-1)   # required since sklearn 0.17; earlier versions accept xpredict directly
print(xpredict, "is predicted as class", clf.predict(xpredict))
xpredict = [10,-2]
xpredict = np.array(xpredict).reshape(1,-1)
print(xpredict, "is predicted as class", clf.predict(xpredict))

"""
-------- Visualization (optional, for reference) --------
"""

plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.Paired)   # X[:,0] is the first coordinate of every sample, X[:,1] the second
# plot the decision function
ax = plt.gca()
xlim = ax.get_xlim()   # range of the x axis
ylim = ax.get_ylim()   # range of the y axis
# create grid to evaluate model
xx = np.linspace(xlim[0],xlim[1],30)   # 30 evenly spaced values in [xlim[0], xlim[1]]
yy = np.linspace(ylim[0],ylim[1],30)
YY,XX = np.meshgrid(yy,xx)
xy = np.vstack([XX.ravel(),YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)
# draw the decision boundary and the margins (straight lines in 2D)
ax.contour(XX,YY,Z,colors='k',levels=[-1,0,1],alpha=0.5,linestyles=['--','-','--'])
# highlight the support vectors in red
ax.scatter(clf.support_vectors_[:,0],clf.support_vectors_[:,1],linewidth=1,facecolors='red')
plt.show()
--------------------------------------------------------------------------------
/机器学习实验准备/参考代码/bayes_classify_demo/test.csv:
--------------------------------------------------------------------------------
No.,Color,Root,Knocks,Texture,Umbilicus,Touch,Density,SugerRatio,Label
1,1,1,1,1,1,1,0.697,0.46,1
--------------------------------------------------------------------------------
/机器学习实验准备/参考代码/bayes_classify_demo/watermelon3_0_En.csv:
--------------------------------------------------------------------------------
No.,Color,Root,Knocks,Texture,Umbilicus,Touch,Density,SugerRatio,Label 2 | 1,1,1,1,1,1,1,0.697,0.46,1 3 | 2,2,1,2,1,1,1,0.774,0.376,1 4 | 3,2,1,1,1,1,1,0.634,0.264,1 5 | 4,1,1,2,1,1,1,0.608,0.318,1 6 | 5,3,1,1,1,1,1,0.556,0.215,1 7 | 6,1,2,1,1,2,2,0.403,0.237,1 8 | 7,2,2,1,2,2,2,0.481,0.149,1 9 | 8,2,2,1,2,2,1,0.437,0.211,1 10 | 9,2,2,2,1,2,1,0.666,0.091,0 11 | 10,1,3,3,2,3,2,0.243,0.267,0 12 | 11,3,3,3,3,3,1,0.245,0.057,0 13 | 12,3,1,1,3,3,2,0.343,0.099,0 14 | 13,1,2,1,2,1,1,0.639,0.161,0 15 | 14,3,2,2,2,1,1,0.657,0.198,0 16 | 15,2,2,1,1,2,2,0.36,0.37,0 17 | 16,3,1,1,3,3,1,0.593,0.042,0 18 | 17,1,1,2,2,2,1,0.719,0.103,0 19 | -------------------------------------------------------------------------------- /机器学习实验准备/参考代码/bayes_classify_demo/watermelon_clf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import csv 3 | import numpy as np 4 | from math import sqrt 5 | 6 | attr_num = [3, 3, 3, 3, 3, 2] 7 | 8 | 9 | def loadCsv(filename): 10 | lines = csv.reader(open(filename, "r")) 11 | dataset = list(lines) 12 | for i in range(1, len(dataset)): 13 | dataset[i] = [float(x) for x in dataset[i]] 14 | result = np.array(dataset[1:]) 15 | return result[:, 1:] 16 | 17 | 18 | def pre_problity(datasets): 19 | pos_prob = 1.0 * (np.sum(datasets[:, -1] == 1.0) + 1) / (np.shape(datasets)[0] + 2) 20 | neg_prob = 1.0 * (np.sum(datasets[:, -1] == 0.0) + 1) / (np.shape(datasets)[0] + 2) 21 | return [pos_prob, neg_prob] 22 | 23 | 24 | def cond_attr_problity(datasets, testdata): 25 | cond_result = np.zeros([np.shape(datasets)[1] - 1, 2]) 26 | pos_data = datasets[datasets[:, -1] == 1.0, :] 27 | neg_data = datasets[datasets[:, -1] == 0.0, :] 28 | for i in range(len(attr_num)): 29 | cond_result[i, 0] = 1.0 * (np.sum(pos_data[:, i] == testdata[0, i]) + 1) / ( 30 | np.sum(datasets[:, -1] == 1.0) + attr_num[i]) 31 | cond_result[i, 1] = 1.0 * (np.sum(neg_data[:, i] == testdata[0, i]) + 1) / ( 32 | np.sum(datasets[:, -1] == 0.0) + attr_num[i]) 33 | 34 | for j in range(6, 8): 35 | # mean,std computation 36 | pos_mean = np.mean(datasets[(datasets[:, -1] == 1.0), j]) 37 | pos_std = np.std(datasets[(datasets[:, -1] == 1.0), j]) 38 | neg_mean = np.mean(datasets[(datasets[:, -1] == 0.0), j]) 39 | neg_std = np.std(datasets[(datasets[:, -1] == 0.0), j]) 40 | cond_result[j, 0] = 1.0 / (sqrt(2 * np.pi) * pos_std) * np.exp( 41 | -1 * (testdata[0, j] - pos_mean) ** 2 / (2 * pos_std ** 2)) 42 | cond_result[j, 1] = 1.0 / (sqrt(2 * np.pi) * neg_std) * np.exp( 43 | -1 * (testdata[0, j] - neg_mean) ** 2 / (2 * neg_std ** 2)) 44 | return cond_result 45 | 46 | 47 | def classify_data(cond_result, pre_result): 48 | pos_result = pre_result[0] 49 | neg_result = pre_result[1] 50 | for i in range(np.shape(cond_result)[0]): 51 | pos_result *= cond_result[i, 0] 52 | neg_result *= cond_result[i, 1] 53 | if pos_result > neg_result: 54 | print('好瓜') 55 | print(pos_result) 56 | else: 57 | print('坏瓜') 58 | print(neg_result) 59 | 60 | 61 | def main(): 62 | filename = 'watermelon3_0_En.csv' 63 | dataset = loadCsv(filename) 64 | testname = 'test.csv' 65 | testdata = loadCsv(testname) 66 | pre_result = pre_problity(dataset) 67 | cond_result = cond_attr_problity(dataset, testdata) 68 | classify_data(cond_result, pre_result) 69 | 70 | 71 | main() -------------------------------------------------------------------------------- /机器学习实验准备/参考代码/cnn-text-classification-tf/data_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 
3 | import itertools 4 | from collections import Counter 5 | 6 | 7 | def clean_str(string): 8 | """ 9 | Tokenization/string cleaning for all datasets except for SST. 10 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 11 | """ 12 | #清理数据替换无词义的符号 13 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 14 | string = re.sub(r"\'s", " \'s", string) 15 | string = re.sub(r"\'ve", " \'ve", string) 16 | string = re.sub(r"n\'t", " n\'t", string) 17 | string = re.sub(r"\'re", " \'re", string) 18 | string = re.sub(r"\'d", " \'d", string) 19 | string = re.sub(r"\'ll", " \'ll", string) 20 | string = re.sub(r",", " , ", string) 21 | string = re.sub(r"!", " ! ", string) 22 | string = re.sub(r"\(", " \( ", string) 23 | string = re.sub(r"\)", " \) ", string) 24 | string = re.sub(r"\?", " \? ", string) 25 | string = re.sub(r"\s{2,}", " ", string) 26 | return string.strip().lower() 27 | 28 | 29 | def load_data_and_labels(positive_data_file, negative_data_file): 30 | """ 31 | Loads MR polarity data from files, splits the data into words and generates labels. 32 | Returns split sentences and labels. 33 | 从文件加载MRpolarity数据,将数据拆分成单词并生成标签。返回分离的句子和标签。 34 | """ 35 | # Load data from files 36 | #加载数据 37 | positive_examples = list(open(positive_data_file, "r").readlines()) 38 | positive_examples = [s.strip() for s in positive_examples]#去空格 39 | negative_examples = list(open(negative_data_file, "r").readlines()) 40 | negative_examples = [s.strip() for s in negative_examples]#去空格 41 | # Split by words 42 | x_text = positive_examples + negative_examples 43 | x_text = [clean_str(sent) for sent in x_text]#字符过滤,实现函数见clean_str() 44 | # Generate labels 45 | #生成标签 46 | positive_labels = [[0, 1] for _ in positive_examples] 47 | negative_labels = [[1, 0] for _ in negative_examples] 48 | y = np.concatenate([positive_labels, negative_labels], 0)#将两种label连在一起 49 | return [x_text, y] 50 | 51 | 52 | #创建batch迭代模块 53 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 54 | """ 55 | Generates a batch iterator for a dataset. 56 | """ 57 | #每次只输出shuffled_data[start_index:end_index]这么多 58 | data = np.array(data) 59 | data_size = len(data) 60 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1# 每一个epoch有多少个batch_size 61 | for epoch in range(num_epochs): 62 | # Shuffle the data at each epoch 63 | #每一代都清理数据 64 | if shuffle: 65 | shuffle_indices = np.random.permutation(np.arange(data_size)) 66 | shuffled_data = data[shuffle_indices] 67 | else: 68 | shuffled_data = data 69 | for batch_num in range(num_batches_per_epoch): 70 | start_index = batch_num * batch_size #当前batch的索引开始 71 | end_index = min((batch_num + 1) * batch_size, data_size) # 判断下一个batch是不是超过最后一个数据了 72 | yield shuffled_data[start_index:end_index] 73 | -------------------------------------------------------------------------------- /机器学习实验准备/参考代码/cnn-text-classification-tf/eval.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | from tensorflow.contrib import learn 11 | import csv 12 | 13 | # Parameters 14 | # ================================================== 15 | 16 | # Data Parameters 17 | tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.") 18 | tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the positive data.") 19 | 20 | # Eval Parameters 21 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 22 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 23 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 24 | 25 | # Misc Parameters 26 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 27 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 28 | 29 | 30 | FLAGS = tf.flags.FLAGS 31 | FLAGS._parse_flags() 32 | print("\nParameters:") 33 | for attr, value in sorted(FLAGS.__flags.items()): 34 | print("{}={}".format(attr.upper(), value)) 35 | print("") 36 | 37 | # CHANGE THIS: Load data. Load your own data here 38 | if FLAGS.eval_train: 39 | x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) 40 | y_test = np.argmax(y_test, axis=1) 41 | else: 42 | x_raw = ["a masterpiece four years in the making", "everything is off."] 43 | y_test = [1, 0] 44 | 45 | # Map data into vocabulary 46 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 47 | vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) 48 | x_test = np.array(list(vocab_processor.transform(x_raw))) 49 | 50 | print("\nEvaluating...\n") 51 | 52 | # Evaluation 53 | # ================================================== 54 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 55 | graph = tf.Graph() 56 | with graph.as_default(): 57 | session_conf = tf.ConfigProto( 58 | allow_soft_placement=FLAGS.allow_soft_placement, 59 | log_device_placement=FLAGS.log_device_placement) 60 | sess = tf.Session(config=session_conf) 61 | with sess.as_default(): 62 | # Load the saved meta graph and restore variables 63 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 64 | saver.restore(sess, checkpoint_file) 65 | 66 | # Get the placeholders from the graph by name 67 | input_x = graph.get_operation_by_name("input_x").outputs[0] 68 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 69 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 70 | 71 | # Tensors we want to evaluate 72 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 73 | 74 | # Generate batches for one epoch 75 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 76 | 77 | # Collect the predictions here 78 | all_predictions = [] 79 | 80 | for x_test_batch in batches: 81 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 82 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 83 | 84 | # Print accuracy if y_test is defined 85 | if y_test is not None: 86 | correct_predictions = float(sum(all_predictions == y_test)) 87 | print("Total number of test examples: 
{}".format(len(y_test))) 88 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 89 | 90 | # Save the evaluation to a csv 91 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 92 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 93 | print("Saving evaluation to {0}".format(out_path)) 94 | with open(out_path, 'w') as f: 95 | csv.writer(f).writerows(predictions_human_readable) -------------------------------------------------------------------------------- /机器学习实验准备/参考代码/cnn-text-classification-tf/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/参考代码/cnn-text-classification-tf/readme.txt -------------------------------------------------------------------------------- /机器学习实验准备/参考代码/cnn-text-classification-tf/text_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | #定义CNN网络实现的类 5 | class TextCNN(object): 6 | """ 7 | A CNN for text classification. 8 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 9 | 使用embedding,其次是卷积,最大池和softmax层。 10 | """ 11 | #sequence_length: 句子的长度,我们把所有的句子都填充成了相同的长度(该数据集是59)。 12 | #num_classes: 输出层的类别数,我们这个例子是2(正向和负向)。 13 | #vocab_size: 我们词汇表的大小。定义 embedding 层的大小的时候需要这个参数,embedding层的形状是[vocabulary_size, embedding_size]。 14 | #embedding_size: 嵌入的维度。 15 | #filter_sizes: 我们想要 convolutional filters 覆盖的words的个数,对于每个size,我们会有 num_filters 个 filters。比如 [3,4,5] 表示我们有分别滑过3,4,5个 words 的 filters,总共是3 * num_filters 个 filters。 16 | #num_filters: 每一个filter size的filters数量(见上面)。 17 | def __init__( 18 | self, sequence_length, num_classes, vocab_size, 19 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):#将train.py中textCNN里定义的参数传进来 20 | 21 | # Placeholders for input, output and dropout 22 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") # input_x输入语料,待训练的内容,维度是sequence_length,"N个词构成的N维向量" 23 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") # input_y输入语料,待训练的内容标签,维度是num_classes,"正面 || 负面" 24 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")# dropout_keep_prob dropout参数,防止过拟合,训练时用 25 | 26 | # Keeping track of l2 regularization loss (optional) 27 | l2_loss = tf.constant(0.0) 28 | 29 | # Embedding layer 30 | # 指定运算结构的运行位置在cpu非gpu,因为"embedding"无法运行在gpu 31 | # 通过tf.name_scope指定"embedding" 32 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 33 | self.W = tf.Variable( 34 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 35 | name="W") 36 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 37 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 38 | 39 | # Create a convolution + maxpool layer for each filter size 40 | pooled_outputs = [] 41 | # filter_sizes卷积核尺寸,枚举后遍历 42 | for i, filter_size in enumerate(filter_sizes): 43 | with tf.name_scope("conv-maxpool-%s" % filter_size): 44 | # Convolution Layer 45 | filter_shape = [filter_size, embedding_size, 1, num_filters]# 4个参数分别为filter_size高h,embedding_size宽w,channel为1,filter个数 46 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")# W进行高斯初始化 47 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")# b给初始化为一个常量 48 | conv = tf.nn.conv2d( 49 | self.embedded_chars_expanded, 50 | W, 51 | strides=[1, 1, 1, 1], 
52 | padding="VALID", # 这里不需要padding 53 | name="conv") 54 | # Apply nonlinearity 激活函数 55 | # 可以理解为,正面或者负面评价有一些标志词汇,这些词汇概率被增强,即一旦出现这些词汇,倾向性分类进正或负面评价, 56 | # 该激励函数可加快学习进度,增加稀疏性,因为让确定的事情更确定,噪声的影响就降到了最低。 57 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 58 | # Maxpooling over the outputs 59 | #池化 60 | pooled = tf.nn.max_pool( 61 | h, 62 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 63 | strides=[1, 1, 1, 1], 64 | padding='VALID',# 这里不需要padding 65 | name="pool") 66 | pooled_outputs.append(pooled) 67 | 68 | # Combine all the pooled features 69 | #聚合所有池特征 70 | num_filters_total = num_filters * len(filter_sizes) 71 | self.h_pool = tf.concat(pooled_outputs, 3) 72 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])# 扁平化数据,跟全连接层相连 73 | 74 | # Add dropout 75 | # drop层,防止过拟合,参数为dropout_keep_prob 76 | # 过拟合的本质是采样失真,噪声权重影响了判断,如果采样足够多,足够充分,噪声的影响可以被量化到趋近事实,也就无从过拟合。 77 | # 即数据越大,drop和正则化就越不需要。 78 | with tf.name_scope("dropout"): 79 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) 80 | 81 | # Final (unnormalized) scores and predictions 82 | #输出层 83 | with tf.name_scope("output"): 84 | W = tf.get_variable( 85 | "W", 86 | shape=[num_filters_total, num_classes],#前面连扁平化后的池化操作 87 | initializer=tf.contrib.layers.xavier_initializer())# 定义初始化方式 88 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 89 | # 损失函数导入 90 | l2_loss += tf.nn.l2_loss(W) 91 | l2_loss += tf.nn.l2_loss(b) 92 | #xw+b 93 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")#得分函数 94 | self.predictions = tf.argmax(self.scores, 1, name="predictions")#预测结果 95 | 96 | # CalculateMean cross-entropy loss 97 | with tf.name_scope("loss"): 98 | #loss,交叉熵损失函数 99 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y) 100 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 101 | 102 | # Accuracy 103 | with tf.name_scope("accuracy"): 104 | #准确率,求和计算算数平均值 105 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 106 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 107 | -------------------------------------------------------------------------------- /机器学习实验准备/参考代码/cnn-text-classification-tf/train.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import time 7 | import datetime 8 | import data_helpers 9 | from text_cnn import TextCNN 10 | from tensorflow.contrib import learn 11 | 12 | # Parameters 13 | # ================================================== 14 | 15 | # Data loading params 16 | #语料文件路径定义 17 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 18 | tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.") 19 | tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.") 20 | 21 | # Model Hyperparameters 22 | #定义网络超参数 23 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") 24 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 25 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 26 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 27 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 28 | 29 | # Training parameters 30 | #训练参数 31 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") #批大小 32 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")#总训练次数 33 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")#每训练一百次测试一下 34 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")#保存一次模型 35 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")#保存模型参数 36 | # Misc Parameters 37 | # 38 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")#加一个布尔类型的参数,要不要自动分配 39 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")#要不要打印日志 40 | #打印一下相关初始参数 41 | FLAGS = tf.flags.FLAGS 42 | FLAGS._parse_flags() 43 | print("\nParameters:") 44 | for attr, value in sorted(FLAGS.__flags.items()): 45 | print("{}={}".format(attr.upper(), value)) 46 | print("") 47 | 48 | 49 | # Data Preparation 50 | # ================================================== 51 | 52 | # Load data 53 | #数据准备,加载数据 54 | print("Loading data...") 55 | x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) 56 | 57 | # Build vocabulary 58 | #建立词汇 59 | max_document_length = max([len(x.split(" ")) for x in x_text])#计算最长的词汇长度 60 | vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)#tf提供的工具,将数据填充为最大长度,默认0填充 61 | x = np.array(list(vocab_processor.fit_transform(x_text))) 62 | 63 | # Randomly shuffle data 64 | #随机清洗数据 65 | np.random.seed(10) 66 | shuffle_indices = np.random.permutation(np.arange(len(y)))#np.arange生成随机序列 67 | x_shuffled = x[shuffle_indices] 68 | y_shuffled = y[shuffle_indices] 69 | 70 | # Split train/test set 71 | # 将数据按训练train和测试dev分块 72 | # TODO: This is very crude, should use cross-validation 73 | dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 74 | x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] 75 | y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 76 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) 77 | print("Train/Dev split: 
{:d}/{:d}".format(len(y_train), len(y_dev))) 78 | 79 | 80 | # Training 81 | #训练开始 82 | # ================================================== 83 | 84 | with tf.Graph().as_default(): 85 | session_conf = tf.ConfigProto( 86 | allow_soft_placement=FLAGS.allow_soft_placement, 87 | log_device_placement=FLAGS.log_device_placement) 88 | sess = tf.Session(config=session_conf) 89 | with sess.as_default(): 90 | #卷积池化网络导入 91 | cnn = TextCNN( 92 | sequence_length=x_train.shape[1], 93 | num_classes=y_train.shape[1],#分几类 94 | vocab_size=len(vocab_processor.vocabulary_), 95 | embedding_size=FLAGS.embedding_dim, 96 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),# 上面定义的filter_sizes拿过来,"3,4,5"按","分割 97 | num_filters=FLAGS.num_filters,#一共有几个filter 98 | l2_reg_lambda=FLAGS.l2_reg_lambda)#L2正则化项 99 | 100 | # Define Training procedure 101 | #定义训练程序 102 | global_step = tf.Variable(0, name="global_step", trainable=False) 103 | optimizer = tf.train.AdamOptimizer(1e-3)#定义优化器 104 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 105 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 106 | 107 | # Keep track of gradient values and sparsity (optional) 108 | #跟踪梯度值和稀疏即tensorboard 109 | grad_summaries = [] 110 | for g, v in grads_and_vars: 111 | if g is not None: 112 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 113 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 114 | grad_summaries.append(grad_hist_summary) 115 | grad_summaries.append(sparsity_summary) 116 | grad_summaries_merged = tf.summary.merge(grad_summaries) 117 | 118 | # Output directory for models and summaries 119 | #模型和summaries的输出目录 120 | timestamp = str(int(time.time())) 121 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 122 | print("Writing to {}\n".format(out_dir)) 123 | 124 | # Summaries for loss and accuracy 125 | #损失函数和准确率的参数保存 126 | loss_summary = tf.summary.scalar("loss", cnn.loss) 127 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) 128 | 129 | # Train Summaries 130 | #训练数据保存 131 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) 132 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 133 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 134 | 135 | # Dev summaries 136 | #测试数据保存 137 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 138 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 139 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 140 | 141 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 142 | #模型保存目录,如果没有则创建 143 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 144 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 145 | if not os.path.exists(checkpoint_dir): 146 | os.makedirs(checkpoint_dir) 147 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 148 | 149 | # Write vocabulary 150 | # 151 | vocab_processor.save(os.path.join(out_dir, "vocab")) 152 | 153 | # Initialize all variables 154 | #初始化所有变量 155 | sess.run(tf.global_variables_initializer()) 156 | 157 | #定义训练函数 158 | def train_step(x_batch, y_batch): 159 | """ 160 | A single training step 161 | #单个训练步骤 162 | """ 163 | feed_dict = { 164 | cnn.input_x: x_batch, 165 | cnn.input_y: y_batch, 166 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 167 | } 168 | _, step, summaries, loss, accuracy = sess.run( 169 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 170 | feed_dict) 171 | time_str = datetime.datetime.now().isoformat()#取当前时间,Python的函数 172 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 173 | train_summary_writer.add_summary(summaries, step) 174 | 175 | #定义测试函数 176 | def dev_step(x_batch, y_batch, writer=None): 177 | """ 178 | Evaluates model on a dev set 179 | #用测试集评估模型 180 | """ 181 | feed_dict = { 182 | cnn.input_x: x_batch, 183 | cnn.input_y: y_batch, 184 | cnn.dropout_keep_prob: 1.0#神经元全部保留 185 | } 186 | step, summaries, loss, accuracy = sess.run( 187 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy], 188 | feed_dict) 189 | time_str = datetime.datetime.now().isoformat() 190 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 191 | if writer: 192 | writer.add_summary(summaries, step) 193 | 194 | # Generate batches 195 | batches = data_helpers.batch_iter( 196 | list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) 197 | # Training loop. For each batch... 
        for batch in batches:
            x_batch, y_batch = zip(*batch)                          # unpack one batch of (x, y) pairs
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)  # read the current global step from the session
            if current_step % FLAGS.evaluate_every == 0:            # evaluate on the dev set every FLAGS.evaluate_every (default 100) steps
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:          # save a checkpoint every FLAGS.checkpoint_every steps
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)  # path where the checkpoint was written
                print("Saved model checkpoint to {}\n".format(path))
--------------------------------------------------------------------------------
/机器学习实验准备/实验一 线性回归实验指导书.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验一 线性回归实验指导书.doc
--------------------------------------------------------------------------------
/机器学习实验准备/实验一 线性回归模型实验指导.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验一 线性回归模型实验指导.pptx
--------------------------------------------------------------------------------
/机器学习实验准备/实验三 贝叶斯分类实验指导书.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验三 贝叶斯分类实验指导书.doc
--------------------------------------------------------------------------------
/机器学习实验准备/实验三 贝叶斯分类解决西瓜问题.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验三 贝叶斯分类解决西瓜问题.pptx
--------------------------------------------------------------------------------
/机器学习实验准备/实验二 支持向量机实验指导书.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验二 支持向量机实验指导书.doc
--------------------------------------------------------------------------------
/机器学习实验准备/实验二 支持向量机模型实验指导.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验二 支持向量机模型实验指导.pptx
--------------------------------------------------------------------------------
/机器学习实验准备/实验四 基于tensorflow实现CNN文本分类.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验四 基于tensorflow实现CNN文本分类.pptx
--------------------------------------------------------------------------------
/机器学习实验准备/实验四 基于tensorflow实现cnn文本处理实验指导书.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/实验四 基于tensorflow实现cnn文本处理实验指导书.doc
--------------------------------------------------------------------------------
/机器学习实验准备/文件说明.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/digfound/ML-Experiments/66a1d81ae82b17c66dc29a686e230098a892062a/机器学习实验准备/文件说明.txt
--------------------------------------------------------------------------------