├── .gitignore ├── README.md ├── character_dnn ├── README.md ├── __init__.py ├── character_eval.py ├── character_inference.py ├── character_train.py └── input_data.py ├── character_svm ├── svm_action.py ├── svm_dbow_test.py └── svm_tfidf_test.py ├── data ├── essay_data │ ├── essays.csv │ ├── vocab1_test.txt │ └── vocab1_train.txt ├── label │ ├── test_label.npy │ └── train_label.npy └── vec │ ├── doc2vec_test_vec_dbow.txt │ ├── doc2vec_train_vec_dbow.txt │ ├── emotion_test_vec.npy │ ├── emotion_train_vec.npy │ ├── textmind_test_vec.npy │ └── textmind_train_vec.npy ├── features ├── __init__.py ├── crawl_textmind_data │ ├── README.md │ ├── __init__.py │ ├── crawler.py │ ├── input_textmind_data.py │ ├── test_label.npy │ ├── textmind_test_vec.npy │ ├── textmind_train_vec.npy │ └── train_label.npy ├── doc2vec │ ├── __init__.py │ ├── doc2vec_action.py │ ├── doc2vec_test_vec_dm.npy │ ├── doc2vec_train_vec_dm.npy │ ├── test_label.npy │ └── train_label.npy ├── emotion_lexicon │ ├── Emotion_Lexicon.csv │ ├── README.md │ ├── __init__.py │ ├── data_helper.py │ ├── emotion_test_label.npy │ ├── emotion_test_vec.npy │ ├── emotion_train_label.npy │ └── emotion_train_vec.npy ├── process_data1.py └── tfidf │ ├── test_label.npy │ ├── tfidf_action.py │ ├── tfidf_test_vec_tfidf.npy │ ├── tfidf_train_vec_tfidf.npy │ └── train_label.npy ├── model └── README.md └── utils ├── __init__.py └── logger.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | .idea 103 | .gitignore~ 104 | *.bak -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于深层神经网络的面向社交媒体的用户性格分类 2 | 性格分类依据大五人格模型, 数据源采用开放的数据集essay。 在本文的实现中, 3 | 提取用户文本的 tfidf 特征、 LIWC 心理学特征和情感特征、 以及基于 doc2vec 的文本特征等, 4 | 建立 SVM 和深层神经网络的分类模型。 5 | 6 | [参考](https://github.com/SenticNet/personality-detection) 7 | 8 | version: 9 | python 3.6 10 | tensorflow 1.10 -------------------------------------------------------------------------------- /character_dnn/README.md: -------------------------------------------------------------------------------- 1 | # DNN 神经网络 2 | 运行character_train.py 3 | 修改神经网络相关参数:如下 4 | 5 | INPUT_NODE = 11 # 用户的特征维度 6 | OUTPUT_NODE = 5 # 输出5个类别的性格 7 | /# LAYER1_NODE = 8 隱藏层的节点数 根据经验公式lgn 8 | expr = 0.43 * INPUT_NODE * 5 + 0.12 * 5 * 5 + 2.54 * INPUT_NODE + 0.77 * 5 + 0.35 9 | LAYER1_NODE = int(math.sqrt(expr) + 0.51) 10 | -------------------------------------------------------------------------------- /character_dnn/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'gu' 2 | -------------------------------------------------------------------------------- /character_dnn/character_eval.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 测试过程 4 | """ 5 | __author__ = 'gu' 6 | 7 | import os 8 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 9 | import time 10 | import tensorflow as tf 11 | from character_dnn import character_inference 12 | import numpy as np 13 | from character_dnn import input_data 14 | 15 | MOVING_AVERAGE_DECAY = 0.99 # 活动平均衰减率 16 | MODEL_SAVE_PATH = "model/" 17 | MODEL_NAME = "character_model" 18 | print(MODEL_SAVE_PATH) 19 | # 加载的时间间隔。 20 | EVAL_INTERVAL_SECS = 2 21 | 22 | # 加载d2v 和 tfidf的数据 23 | train_list_side, train_list_tag, text_list_side, text_list_tag = input_data.load_data_label('') 24 | 25 | def evaluate(): 26 | with tf.Graph().as_default() as g: 27 | x = tf.placeholder(tf.float32, [None, character_inference.INPUT_NODE], name='x-input') 28 | y_ = tf.placeholder(tf.int64, name='y-input') 29 | validate_feed = {x: text_list_side, y_: text_list_tag} 30 | 31 | y = character_inference.inference(x, None) 32 | # y = character_inference.inference_nlayer(x, None) 33 | 34 | # correct_prediction = tf.equal(tf.argmax(y, 1), y_) 35 | # accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 36 | 37 | variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY) 38 | variables_to_restore = variable_averages.variables_to_restore() 39 | saver = tf.train.Saver(variables_to_restore) 40 | dict_acc = {} 41 | dict_precision = {} 42 | dict_recall = {} 43 | dict_f1 = {} 44 | dict_acc_lsit = {} 45 | 46 | while True: 47 | with tf.Session() as sess: 48 | # tf.train.get_checkpoint_state 会根据checkpoint文件自动找到目录中最新模型的文件名 49 | ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH) 50 | if ckpt and ckpt.model_checkpoint_path: 51 | saver.restore(sess, ckpt.model_checkpoint_path) 52 | global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] 53 | # accuracy_score = sess.run(accuracy, feed_dict=validate_feed) 54 | 55 | # accuracy_score = get_acc(sess,true_y, pred_y) 56 | # print("After %s training step(s), validation accuracy = %g" % (global_step, accuracy_score)) 57 | 58 | # print("the input data are \n%s" % test_list_side) 59 | # print("the truly answer are \n%s" % test_list_tag) 60 | eval_aws = sess.run(y, feed_dict=validate_feed) 61 | # print("the evaluate answer are \n%s" % eval_aws) 62 | 63 | accuracy_score, acc_list = get_acc(sess, text_list_tag, eval_aws) 64 | print("After %s training step(s), all validation accuracy = %g" % (global_step, accuracy_score)) 65 | print("After %s training step(s), 5 validation accuracy = %s" % (global_step, acc_list)) 66 | 67 | precision_list = get_precision(text_list_tag, eval_aws) 68 | print("After %s training step(s), 5 precision = %s" % (global_step, precision_list)) 69 | 70 | recall_list = get_recall(text_list_tag, eval_aws) 71 | print("After %s training step(s), 5 recall = %s" % (global_step, recall_list)) 72 | 73 | f1_list = get_f1(precision_list, recall_list) 74 | print("After %s training step(s), 5 f1 = %s" % (global_step, f1_list)) 75 | print("==========================================") 76 | 77 | if int(global_step) > 1: 78 | dict_acc[global_step] = accuracy_score 79 | dict_precision[global_step] = precision_list 80 | dict_recall[global_step] = recall_list 81 | dict_f1[global_step] = f1_list 82 | dict_acc_lsit[global_step] = acc_list 83 | if int(global_step) == 29001: 84 | # print("================全部准确率===================") 85 | # sort_dict(dict_acc) 86 | print("================5个准确率===================") 87 | sort_dict(dict_acc_lsit) 88 | print("================5个精准率===================") 89 | sort_dict(dict_precision) 90 | print("================5个召回率===================") 91 | sort_dict(dict_recall) 92 | print("================5个f1===================") 93 | sort_dict(dict_f1) 94 | break 95 | 96 | else: 97 | print('No checkpoint file found') 98 | return 99 | time.sleep(EVAL_INTERVAL_SECS) 100 | 101 | 102 | def get_acc(sess, true_y, pred_y): 103 | """ 104 | 计算总的准确率和5个标签的准确率 105 | :param sess: 106 | :param true_y: 107 | :param pred_y: 108 | :return: 109 | """ 110 | pred_y_ = np.where(pred_y > 0, 1, 0) 111 | correct_prediction = tf.equal(true_y, pred_y_) 112 | accuracy = sess.run(tf.reduce_mean(tf.cast(correct_prediction, tf.float32))) 113 | acc_list = [] 114 | for clazz in range(5): 115 | true_class1 = true_y[:, clazz] 116 | pred_class1 = pred_y[:, clazz] 117 | pred_class1_ = np.where(pred_class1 > 0, 1, 0) 118 | acc = 0 119 | for i in range(len(true_class1)): 120 | if true_class1[i] == pred_class1_[i]: 121 | acc += 1 122 | acc_list.append(acc * 1.0 / len(true_class1)) 123 | return accuracy, acc_list 124 | 125 | 126 | def get_precision(true_y, pred_y): 127 | """ 128 | 返回五个标签的精确率 129 | :param true_y: 130 | :param pred_y: 131 | :return: 132 | """ 133 | precison_list = [] 134 | for clazz in range(5): 135 | true_class1 = true_y[:, clazz] 136 | pred_class1 = pred_y[:, clazz] 137 | pred_class1_ = np.where(pred_class1 > 0, 1, 0) 138 | precison = 0 139 | for i in range(len(true_class1)): 140 | if true_class1[i] == 1 and pred_class1_[i] == 1: 141 | precison += 1 142 | precison_list.append(precison * 1.0 / np.sum(pred_class1_)) 143 | return precison_list 144 | 145 | 146 | def get_recall(true_y, pred_y): 147 | """ 148 | 返回5个标签的召回率 149 | :param true_y: 150 | :param pred_y: 151 | :return: 152 | """ 153 | recall_list = [] 154 | for clazz in range(5): 155 | true_class1 = true_y[:, clazz] 156 | pred_class1 = pred_y[:, clazz] 157 | pred_class1_ = np.where(pred_class1 > 0, 1, 0) 158 | precison = 0 159 | for i in range(len(true_class1)): 160 | if true_class1[i] == 1 and pred_class1_[i] == 1: 161 | precison += 1 162 | recall_list.append(precison * 1.0 / np.sum(true_class1)) 163 | return recall_list 164 | 165 | 166 | def get_f1(precison_list, recall_list): 167 | """ 168 | 返回5个标签的f1值 169 | :param precison: 170 | :param recall: 171 | :return: 172 | """ 173 | f1_list = [] 174 | for i in range(5): 175 | precison = precison_list[i] 176 | recall = recall_list[i] 177 | f1_list.append((2 * precison * recall) / (precison + recall)) 178 | return f1_list 179 | 180 | 181 | def mymean(acc_list): 182 | acc_set = set(acc_list[1:]) 183 | mean_acc = np.average(list(acc_set)) 184 | print('After 20091 training steps mean_acc', mean_acc) 185 | 186 | 187 | def sort_dict(dict): 188 | sorted_dict = sorted(dict.items(), key=lambda e: e[0], reverse=False) 189 | print(sorted_dict) 190 | item0 = 0 191 | item1 = 0 192 | item2 = 0 193 | item3 = 0 194 | item4 = 0 195 | for ke in sorted_dict: 196 | k = ke[1] 197 | # print(k) 198 | item0 = item0 + k[0] 199 | item1 = item1 + k[1] 200 | item2 = item2 + k[2] 201 | item3 = item3 + k[3] 202 | item4 = item4 + k[4] 203 | le = len(sorted_dict) 204 | print([item0 / le, item1 / le, item2 / le, item3 / le, item4 / le]) 205 | 206 | 207 | def main(argv=None): 208 | evaluate() 209 | if __name__ == '__main__': 210 | tf.app.run() 211 | -------------------------------------------------------------------------------- /character_dnn/character_inference.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 定义看前向传播的过程以及神经网络中的参数 4 | """ 5 | 6 | import tensorflow as tf 7 | import math 8 | 9 | # 神经网络相关参数 10 | INPUT_NODE = 11 # 用户的特征维度 11 | OUTPUT_NODE = 5 # 输出5个类别的性格 12 | # LAYER1_NODE = 8 # 隱藏层的节点数 根据经验公式lgn 13 | expr = 0.43 * INPUT_NODE * 5 + 0.12 * 5 * 5 + 2.54 * INPUT_NODE + 0.77 * 5 + 0.35 14 | LAYER1_NODE = int(math.sqrt(expr) + 0.51) 15 | 16 | 17 | def get_weight_variable(shape, regularizer): 18 | # 通过 tf.get_variable获取变量 和Variable 一样,在测试的时候会通过保存的模型来加载这些变量的取值。 19 | # 滑动平均变量重命名(影子变量),所以可以直接通过同样的变量名字取到变量本身 20 | weights = tf.get_variable("weights", shape, initializer=tf.truncated_normal_initializer(stddev=0.1)) 21 | if regularizer != None: 22 | # 加入损失集合 23 | tf.add_to_collection('losses', regularizer(weights)) 24 | return weights 25 | 26 | 27 | def inference(input_tensor, regularizer): 28 | """ 29 | 一层隱藏层神经网络前向传播算法 30 | :param input_tensor: 31 | :param regularizer: 32 | :return: 33 | """ 34 | # 声明第一层神经网络的变量并完成前向传播 35 | with tf.variable_scope('layer1'): 36 | # 生成隱藏层的参数 37 | weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer) 38 | # 偏置设置为0.1 39 | biases = tf.get_variable("biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.1)) 40 | # 使用ReLU的激活函数 去线性化 41 | layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases) 42 | 43 | # 声明第二层神经网络的变量并完成前向传播 44 | with tf.variable_scope('layer2'): 45 | # 生成输出层的参数 46 | weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer) 47 | biases = tf.get_variable("biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.1)) 48 | layer2 = tf.matmul(layer1, weights) + biases 49 | 50 | # 返回最后的前向传播的结果 51 | return layer2 52 | 53 | 54 | def get_weight(shape, regularizer): 55 | """ 56 | 获取一层神经网络边上的权重,并将这个权重的L2正则化损失加入名称为’losses‘的集合中 57 | :param shape: 维度——对应多少个输入和多少个输出 58 | :param lamd: 正则化项的权重 59 | :return: 神经网络边上的权重 60 | """ 61 | # 生成一个变量 代表权重 62 | var = tf.Variable(tf.random_normal(shape=shape), dtype=tf.float32) 63 | if regularizer != None: 64 | # 加入损失集合 65 | # 将这个权重的L2正则化损失加入名称为’losses‘的集合中 66 | tf.add_to_collection('losses', regularizer(var)) 67 | # 返回一层神经网络边上的权重 68 | return var 69 | 70 | 71 | def inference_nlayer(input_tensor, regularizer): 72 | """ 73 | n层神经网络前向传播算法 74 | :param input_tensor: 75 | :param regularizer: 76 | :return: 77 | """ 78 | # 定义没一层网络中的节点数 79 | layer_dimension = [INPUT_NODE, 100, 100, 100, OUTPUT_NODE] 80 | # 神经网络的层数 81 | n_layers = len(layer_dimension) 82 | 83 | # 这个变量维护前向传播时最深的层,开始时就是输入层 84 | cur_layer = input_tensor 85 | # 当前层的节点数 86 | in_dimension = layer_dimension[0] 87 | 88 | # 通过循环来生成5层全连接的神经网络结构 89 | for i in range(1, n_layers): 90 | # layer_dimension[i]为下一层的节点个数 91 | out_dimension = layer_dimension[i] 92 | # 生成当前层中权重的变量,并把这个变量的L2正则化损失加入计算图上的集合 93 | weight = get_weight([in_dimension, out_dimension], regularizer) 94 | bias = tf.Variable(tf.constant(0.1, shape=[out_dimension])) 95 | 96 | # 使用ReLU激活函数 97 | cur_layer = tf.nn.relu(tf.matmul(cur_layer, weight) + bias) 98 | # 进入下一层之前将下一层的节点个数更新为当前层节点个数 99 | in_dimension = layer_dimension[i] 100 | return cur_layer 101 | -------------------------------------------------------------------------------- /character_dnn/character_train.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 定义了神经网络的训练过程 4 | """ 5 | 6 | import tensorflow as tf 7 | from character_dnn import character_inference 8 | import os 9 | from character_dnn import input_data 10 | 11 | # 1. 定义神经网络结构相关的参数。 12 | BATCH_SIZE = 50 # 一个训练batch中的训练数据个数,数字越小,训练过程越接近随机梯度下降 13 | LEARNING_RATE_BASE = 0.8 # 基础的学习率 14 | LEARNING_RATE_DECAY = 0.99 # 学习率衰减率 15 | REGULARIZATION_RATE = 0.0001 # 描述模型复杂度的正则化在损失函数的系数 16 | TRAINING_STEPS = 30000 # 训练轮数 17 | MOVING_AVERAGE_DECAY = 0.99 # 活动平均衰减率 18 | MODEL_SAVE_PATH = "model/" 19 | MODEL_NAME = "character_model" 20 | 21 | # 加载d2v 和 tfidf的数据 22 | train_list_side, train_list_tag, text_list_side, text_list_tag = input_data.load_data_label('') 23 | TRAIN_NUM_EXAMPLES = DATASET_SIZE = len(train_list_side) # 训练数据的总数 24 | 25 | # 2. 定义训练过程。 26 | def train(): 27 | # 定义输入输出placeholder。 28 | x = tf.placeholder(tf.float32, [None, character_inference.INPUT_NODE], name='x-input') 29 | y_ = tf.placeholder(tf.float32, name='y-input') 30 | # L2正则化 31 | regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE) 32 | # 计算在当前参数下神经网络前向传播的结果 33 | y = character_inference.inference(x, regularizer) 34 | # y = character_inference.inference_nlayer(x,regularizer) 35 | # 定义存储训练轮数的便利那个。这个变量不需要计算滑动平均值,所以这里指定这个变量为不可训练的变量 36 | global_step = tf.Variable(0, trainable=False) 37 | 38 | # ///////////////====定义损失函数、学习率、滑动平均操作以及训练过程。=====////////////// 39 | # 初始化滑动平均类 40 | variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step) 41 | # 在所有代表神经网络参数的变量上使用滑动平均 42 | variables_averages_op = variable_averages.apply(tf.trainable_variables()) 43 | 44 | # 计算交叉熵作为刻画预测值和真实值之间茶军的损失函数 45 | """ 46 | // 参考损失函数的计算 http://blog.csdn.net/u013250416/article/details/78230464 47 | sigmoid_cross_entropy_with_logits 应用于多标签或者二分类 48 | """ 49 | # 多目标损失函数 50 | cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_) 51 | cross_entropy_mean = tf.reduce_mean(cross_entropy) 52 | # 总损失等于交叉熵损失和正则化损失的和 53 | loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses')) 54 | 55 | # 指数衰减设置学习率 56 | learning_rate = tf.train.exponential_decay( 57 | LEARNING_RATE_BASE, 58 | global_step, 59 | DATASET_SIZE / BATCH_SIZE, 60 | LEARNING_RATE_DECAY, 61 | staircase=True) 62 | # 优化损失函数,在minimize中传入global_step将自动更新global_step,从而更新学习率 63 | train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step) 64 | 65 | # 在训练神经网络模型时既需要通过反向传播来更新神经网络的参数,又要更新每一个参数的滑动平均值。 66 | with tf.control_dependencies([train_step, variables_averages_op]): 67 | train_op = tf.no_op(name='train') 68 | 69 | # 初始化TensorFlow持久化类。 70 | saver = tf.train.Saver() 71 | with tf.Session() as sess: 72 | tf.initialize_all_variables().run() 73 | 74 | for i in range(TRAINING_STEPS): 75 | 76 | # # 每次选取batch_size样本进行训练 77 | # start = (i * BATCH_SIZE) % DATASET_SIZE 78 | # end = min(start + BATCH_SIZE, DATASET_SIZE) 79 | # _, loss_value, step = sess.run([train_op, loss, global_step], 80 | # feed_dict={x: train_list_side[start:end], 81 | # y_: train_list_tag[start:end]}) 82 | 83 | # 每次选取all_size样本进行训练 84 | print(train_list_side.shape) 85 | print(train_list_tag.shape) 86 | _, loss_value, step = sess.run([train_op, loss, global_step], 87 | feed_dict={x: train_list_side, 88 | y_: train_list_tag}) 89 | if i % 1000 == 0: 90 | print("After %d training step(s), loss on training batch is %g." % (step, loss_value)) 91 | saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=global_step) 92 | 93 | 94 | def main(argv=None): 95 | train() 96 | if __name__ == '__main__': 97 | tf.app.run() 98 | -------------------------------------------------------------------------------- /character_dnn/input_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """Functions for reading character_dnn data.""" 3 | import os 4 | 5 | import numpy as np 6 | 7 | 8 | def load_data_label(base_model_dir): 9 | train_vec_filename = os.path.join(base_model_dir, "../data/vec/emotion_train_vec.npy") 10 | train_label_filename = os.path.join(base_model_dir, '../data/label/train_label.npy') 11 | test_vec_filename = os.path.join(base_model_dir, '../data/vec/emotion_test_vec.npy') 12 | test_label_filename = os.path.join(base_model_dir, '../data/label/test_label.npy') 13 | 14 | X_train = np.load(train_vec_filename) 15 | print('X_train', X_train.shape) 16 | Y_train = np.load(train_label_filename) 17 | print('Y_train', Y_train.shape) 18 | X_test = np.load(test_vec_filename) 19 | print('X_test', X_test.shape) 20 | Y_test = np.load(test_label_filename) 21 | print('Y_test', Y_test.shape) 22 | return X_train, Y_train, X_test, Y_test 23 | 24 | def load_data_label_combine(X_train, X_test, X1_train, X1_test): 25 | """ 26 | 列向合并矩阵 27 | combine two arr into one 28 | :return: 29 | """ 30 | X_train_all = np.hstack((X_train, X1_train)) 31 | X_test_all = np.hstack((X_test, X1_test)) 32 | return X_train_all, X_test_all 33 | 34 | 35 | if __name__ == '__main__': 36 | X_train, Y_train, X_test, Y_test = load_data_label('') 37 | print(X_test) 38 | print(Y_test) 39 | -------------------------------------------------------------------------------- /character_svm/svm_action.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | svm tfidf_d2v_dm_dbow_textmind_emotion for character 4 | calculate their acc precision recall f1 value 5 | """ 6 | 7 | from __future__ import division 8 | 9 | from sklearn import svm 10 | from numpy import * 11 | import numpy as np 12 | import os 13 | from features.crawl_textmind_data import input_textmind_data 14 | from utils import logger 15 | 16 | LOG = logger.get_logger() 17 | 18 | 19 | class SVMCharacterPredict: 20 | def myAcc(self, y_true, y_pred): 21 | """ 22 | 准确值计算 23 | :param y_true: 24 | :param y_pred: 25 | :return: 26 | """ 27 | true_num = 0 28 | # for i in range(y_true.__len__()): 29 | # # print y_true[i] 30 | for i in range(y_pred.__len__()): 31 | if y_true[i] == y_pred[i]: 32 | true_num += 1 33 | return true_num 34 | 35 | def mymean(self, list_predict_score, array_test): 36 | """ 37 | my mean count 38 | :param list_predict_score: 39 | :param array_test: 40 | :return: 41 | """ 42 | num_total = 0 43 | num_total = array_test.shape[0] * 5 44 | # print "total numbers : " + str(num_total) 45 | return list_predict_score / (num_total) 46 | 47 | def train_eval(self, X_train, y_train, X_text, y_text): 48 | """ 49 | 输入矩阵 训练模型并计算准确率 50 | :param X_text: 51 | :param X_train: 52 | :param y_text: 53 | :param y_train: 54 | :return: 55 | """ 56 | pred_y = [] 57 | true_acc = 0 58 | for i in range(5): 59 | list_train_tags = [] 60 | list_test_tags = [] 61 | # # print "第" + str(i) + "个分类器训练" 62 | # first build train tag 63 | for line in y_train: 64 | list_train_tags.append(line[i]) 65 | # first build text tag 66 | for line in y_text: 67 | list_test_tags.append(line[i]) 68 | clf = svm.SVC(probability=True) 69 | clf = svm.SVC(kernel='linear', probability=True) 70 | # 逻辑回归训练模型 71 | clf.fit(X_train, list_train_tags) 72 | # 用模型预测 73 | y_pred_te = clf.predict_proba(X_text) 74 | # # print np.argmax(y_pred_te, axis=1) 75 | # # print "**" * 50 76 | # # print list_test_tags 77 | # #获取准确的个数 78 | # # print self.myAcc(list_test_tags, y_pred_te) 79 | 80 | # 最大数的索引 81 | y_pred = np.argmax(y_pred_te, axis=1) 82 | true_acc += self.myAcc(list_test_tags, y_pred) 83 | pred_y.append(y_pred) 84 | 85 | # print "true acc numbers: " + str(true_acc) 86 | pred_y_ = map(list, zip(*pred_y)) 87 | pred_y_ = mat(pred_y_) 88 | return self.mymean(true_acc, X_text), pred_y_ 89 | 90 | def predict_by_textmind(self): 91 | """ 92 | svm 文心特征 93 | :return: 94 | """ 95 | X_train, Y_train, X_test, Y_test = input_textmind_data.load_textmind_data_label_with_normalization( 96 | '../crawl_textmind_data') 97 | mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test) 98 | print("textmind+支持向量机 准确率平均值为: " + str(mymean)) 99 | # LOG.info("textmind+支持向量机 准确率平均值为: " + str(mymean)) 100 | 101 | acc_list = self.get_acc(Y_test, pred_y) 102 | print("After training step(s), 5 validation accuracy = %s" % acc_list) 103 | precision_list = self.get_precision(Y_test, pred_y) 104 | print("After training step(s), 5 precision = %s" % precision_list) 105 | recall_list = self.get_recall(Y_test, pred_y) 106 | print("After training step(s), 5 recall = %s" % recall_list) 107 | f1_list = self.get_f1(precision_list, recall_list) 108 | print("After training step(s), 5 f1 = %s" % f1_list) 109 | print("==========================================") 110 | return X_train, Y_train, X_test, Y_test 111 | 112 | def predict_by_d2v_dm(self): 113 | """ 114 | d2v_dm 训练 115 | :return: 116 | """ 117 | base_model_dir = '' 118 | train_vec_filename = os.path.join(base_model_dir, "doc2vec_train_vec_dm.npy") 119 | train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy') 120 | test_vec_filename = os.path.join(base_model_dir, 'doc2vec_test_vec_dm.npy') 121 | test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy') 122 | 123 | X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename, 124 | train_vec_filename) 125 | mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test) 126 | # print "d2v_dm+支持向量机 准确率平均值为: " + str(mymean) 127 | LOG.info("d2v_dm+支持向量机 准确率平均值为: " + str(mymean)) 128 | return X_train, Y_train, X_test, Y_test 129 | 130 | def predict_by_d2v_dbow(self): 131 | """ 132 | d2v_dbow 训练 133 | :return: 134 | """ 135 | base_model_dir = 'E:\\Koo\\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\features\\doc2vec' 136 | train_vec_filename = os.path.join(base_model_dir, "doc2vec_train_vec_dm.npy") 137 | train_label_filename = os.path.join(base_model_dir, 'train_label.npy') 138 | test_vec_filename = os.path.join(base_model_dir, 'doc2vec_test_vec_dm.npy') 139 | test_label_filename = os.path.join(base_model_dir, 'test_label.npy') 140 | 141 | X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename, 142 | train_vec_filename) 143 | mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test) 144 | print("d2v_dbow+支持向量机 准确率平均值为: " + str(mymean)) 145 | # LOG.info("d2v_dbow+支持向量机 准确率平均值为: " + str(mymean)) 146 | print(pred_y.shape) 147 | acc_list = self.get_acc(Y_test, pred_y) 148 | print("After training step(s), 5 validation accuracy = %s" % acc_list) 149 | precision_list = self.get_precision(Y_test, pred_y) 150 | print("After training step(s), 5 precision = %s" % precision_list) 151 | recall_list = self.get_recall(Y_test, pred_y) 152 | print("After training step(s), 5 recall = %s" % recall_list) 153 | f1_list = self.get_f1(precision_list, recall_list) 154 | print("After training step(s), 5 f1 = %s" % f1_list) 155 | print("==========================================") 156 | 157 | return X_train, Y_train, X_test, Y_test 158 | 159 | def predict_by_tfidf(self): 160 | """ 161 | tfidf 训练 162 | :return: 163 | """ 164 | base_model_dir = 'E:\\Koo\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\features\\tfidf' 165 | train_vec_filename = os.path.join(base_model_dir, "tfidf_train_vec_tfidf.npy") 166 | train_label_filename = os.path.join(base_model_dir, 'train_label.npy') 167 | test_vec_filename = os.path.join(base_model_dir, 'tfidf_test_vec_tfidf.npy') 168 | test_label_filename = os.path.join(base_model_dir, 'test_label.npy') 169 | 170 | X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename, 171 | train_vec_filename) 172 | mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test) 173 | print("tfidf+支持向量机 准确率平均值为: " + str(mymean)) 174 | # LOG.info("tfidf + 停用词 +支持向量机 准确率平均值为: " + str(mymean)) 175 | 176 | print(Y_test.shape) 177 | print(pred_y.shape) 178 | acc_list = self.get_acc(Y_test, pred_y) 179 | print("After training step(s), 5 validation accuracy = %s" % acc_list) 180 | precision_list = self.get_precision(Y_test, pred_y) 181 | print("After training step(s), 5 precision = %s" % precision_list) 182 | recall_list = self.get_recall(Y_test, pred_y) 183 | print("After training step(s), 5 recall = %s" % recall_list) 184 | f1_list = self.get_f1(precision_list, recall_list) 185 | print("After training step(s), 5 f1 = %s" % f1_list) 186 | print("==========================================") 187 | 188 | return X_train, Y_train, X_test, Y_test 189 | 190 | def predict_by_tfidf_stopword(self): 191 | """ 192 | tfidf 训练 193 | :return: 194 | """ 195 | base_model_dir = '' 196 | train_vec_filename = os.path.join(base_model_dir, "tfidf_train_vec_tfidf_stopword.npy") 197 | train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy') 198 | test_vec_filename = os.path.join(base_model_dir, 'tfidf_test_vec_tfidf_stopword.npy') 199 | test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy') 200 | 201 | X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename, 202 | train_vec_filename) 203 | mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test) 204 | print("tfidf+stopword+支持向量机 准确率平均值为: " + str(mymean)) 205 | # LOG.info("tfidf+支持向量机 准确率平均值为: " + str(mymean)) 206 | 207 | acc_list = self.get_acc(Y_test, pred_y) 208 | print("After training step(s), 5 validation accuracy = %s" % acc_list) 209 | precision_list = self.get_precision(Y_test, pred_y) 210 | print("After training step(s), 5 precision = %s" % precision_list) 211 | recall_list = self.get_recall(Y_test, pred_y) 212 | print("After training step(s), 5 recall = %s" % recall_list) 213 | f1_list = self.get_f1(precision_list, recall_list) 214 | print("After training step(s), 5 f1 = %s" % f1_list) 215 | print("==========================================") 216 | 217 | return X_train, Y_train, X_test, Y_test 218 | 219 | def load_arr(self, test_label_filename, test_vec_filename, train_label_filename, train_vec_filename): 220 | X_train = np.load(train_vec_filename) 221 | # # print('X_train', X_train.shape) 222 | Y_train = np.load(train_label_filename) 223 | # # print('Y_train', Y_train.shape) 224 | X_test = np.load(test_vec_filename) 225 | # # print('X_test', X_test.shape) 226 | Y_test = np.load(test_label_filename) 227 | # # print('Y_test', Y_test.shape) 228 | return X_train, Y_train, X_test, Y_test 229 | 230 | def predict_by_emotion(self): 231 | """ 232 | 情感特征 233 | :return: 234 | """ 235 | from features.emotion_lexicon import data_helper 236 | 237 | X_train, Y_train, X_test, Y_test = data_helper.load_emotion_data_label('../Emotion_Lexicon') 238 | mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test) 239 | # print "情感特征+支持向量机 准确率平均值为: " + str(mymean) 240 | LOG.info("情感特征+支持向量机 准确率平均值为: " + str(mymean)) 241 | return X_train, Y_train, X_test, Y_test 242 | 243 | def predict_by_combine(self): 244 | """ 245 | 组合特征训练 246 | :return: 247 | """ 248 | from character_dnn import input_data 249 | 250 | base_model_dir = '' 251 | train_vec_filename = os.path.join(base_model_dir, "tfidf_train_vec_tfidf.npy") 252 | train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy') 253 | test_vec_filename = os.path.join(base_model_dir, 'tfidf_test_vec_tfidf.npy') 254 | test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy') 255 | 256 | X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename, 257 | train_vec_filename) 258 | train_vec_filename = os.path.join(base_model_dir, "doc2vec_train_vec_dbow.npy") 259 | train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy') 260 | test_vec_filename = os.path.join(base_model_dir, 'doc2vec_test_vec_dbow.npy') 261 | test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy') 262 | 263 | X1_train, Y1_train, X1_test, Y1_test = self.load_arr(test_label_filename, test_vec_filename, 264 | train_label_filename, 265 | train_vec_filename) 266 | 267 | train_list_side, text_list_side = input_data.load_data_label_combine(X_train, X_test, X1_train, X1_test) 268 | mymean, pred_y = self.train_eval(train_list_side, Y_train, text_list_side, Y_test) 269 | # print "综合特征+支持向量机 准确率平均值为: " + str(mymean) 270 | LOG.info("tfidf+dbow+综合特征+支持向量机 准确率平均值为: " + str(mymean)) 271 | 272 | acc_list = self.get_acc(Y_test, pred_y) 273 | print("After training step(s), 5 validation accuracy = %s" % acc_list) 274 | precision_list = self.get_precision(Y_test, pred_y) 275 | print("After training step(s), 5 precision = %s" % precision_list) 276 | recall_list = self.get_recall(Y_test, pred_y) 277 | print("After training step(s), 5 recall = %s" % recall_list) 278 | f1_list = self.get_f1(precision_list, recall_list) 279 | print("After training step(s), 5 f1 = %s" % f1_list) 280 | print("==========================================") 281 | 282 | def predict_by_combine_two(self, fun1, fun2, fun1name, fun2name): 283 | from character_dnn import input_data 284 | 285 | X_train, Y_train, X_test, Y_test = fun1 286 | X1_train, Y1_train, X1_test, Y1_test = fun2 287 | train_list_side, text_list_side = input_data.load_data_label_combine(X_train, X_test, X1_train, X1_test) 288 | mymean, pred_y = self.train_eval(train_list_side, Y_train, text_list_side, Y_test) 289 | print("综合特征+支持向量机 准确率平均值为: " + str(mymean)) 290 | LOG.info(fun1name + " + " + fun2name + " 综合特征+支持向量机 准确率平均值为: " + str(mymean)) 291 | 292 | acc_list = self.get_acc(Y_test, pred_y) 293 | print("After training step(s), 5 validation accuracy = %s" % acc_list) 294 | precision_list = self.get_precision(Y_test, pred_y) 295 | print("After training step(s), 5 precision = %s" % precision_list) 296 | recall_list = self.get_recall(Y_test, pred_y) 297 | print("After training step(s), 5 recall = %s" % recall_list) 298 | f1_list = self.get_f1(precision_list, recall_list) 299 | print("After training step(s), 5 f1 = %s" % f1_list) 300 | print("==========================================") 301 | 302 | def predict_by_combine_three(self): 303 | from character_dnn import input_data 304 | 305 | X_train, Y_train, X_test, Y_test = self.predict_by_tfidf() 306 | X1_train, Y1_train, X1_test, Y1_test = self.predict_by_d2v_dbow() 307 | X2_train, Y2_train, X2_test, Y2_test = self.predict_by_emotion() 308 | X3_train, X3_test = input_data.load_data_label_combine(X_train, X_test, X1_train, X1_test) 309 | train_list_side, text_list_side = input_data.load_data_label_combine(X3_train, X3_test, X2_train, X2_test) 310 | mymean, pred_y = self.train_eval(train_list_side, Y_train, text_list_side, Y_test) 311 | # print "综合特征+支持向量机 准确率平均值为: " + str(mymean) 312 | LOG.info(" tiidf + d2v_dbow + emotion 综合特征+支持向量机 准确率平均值为: " + str(mymean)) 313 | 314 | def get_acc(self, true_y, pred_y): 315 | """ 316 | 计算总的准确率和5个标签的准确率 317 | :param sess: 318 | :param true_y: 319 | :param pred_y: 320 | :return: 321 | """ 322 | acc_list = [] 323 | for clazz in range(5): 324 | true_class1 = true_y[:, clazz] 325 | pred_class1 = pred_y[:, clazz] 326 | acc = 0 327 | for i in range(len(true_class1)): 328 | if true_class1[i] == pred_class1[i]: 329 | acc += 1 330 | acc_list.append(acc * 1.0 / len(true_class1)) 331 | return acc_list 332 | 333 | def get_precision(self, true_y, pred_y): 334 | """ 335 | 返回五个标签的精确率 336 | :param true_y: 337 | :param pred_y: 338 | :return: 339 | """ 340 | precison_list = [] 341 | for clazz in range(5): 342 | true_class1 = true_y[:, clazz] 343 | pred_class1 = pred_y[:, clazz] 344 | precison = 0 345 | for i in range(len(true_class1)): 346 | if true_class1[i] == 1 and pred_class1[i] == 1: 347 | precison += 1 348 | precison_list.append(precison * 1.0 / np.sum(pred_class1)) 349 | return precison_list 350 | 351 | def get_recall(self, true_y, pred_y): 352 | """ 353 | 返回5个标签的召回率 354 | :param true_y: 355 | :param pred_y: 356 | :return: 357 | """ 358 | recall_list = [] 359 | for clazz in range(5): 360 | true_class1 = true_y[:, clazz] 361 | pred_class1 = pred_y[:, clazz] 362 | precison = 0 363 | for i in range(len(true_class1)): 364 | if true_class1[i] == 1 and pred_class1[i] == 1: 365 | precison += 1 366 | recall_list.append(precison * 1.0 / np.sum(true_class1)) 367 | return recall_list 368 | 369 | def get_f1(self, precison_list, recall_list): 370 | """ 371 | 返回5个标签的f1值 372 | :param precison: 373 | :param recall: 374 | :return: 375 | """ 376 | f1_list = [] 377 | for i in range(5): 378 | precison = precison_list[i] 379 | recall = recall_list[i] 380 | f1_list.append((2 * precison * recall) / (precison + recall)) 381 | return f1_list 382 | 383 | 384 | 385 | 386 | if __name__ == '__main__': 387 | user_predict = SVMCharacterPredict() 388 | # user_predict.predict_by_combine() 389 | # for _ in range(2): 390 | # user_predict.predict_by_combine_three() 391 | 392 | # # 训练10次 393 | for _ in range(10): 394 | LOG.info("=========开始第" + str(_ + 1) + "轮训练组合===========") 395 | # fun2 = user_predict.predict_by_textmind() 396 | fun3 = user_predict.predict_by_d2v_dbow() 397 | # fun5 = user_predict.predict_by_tfidf() 398 | # user_predict.predict_by_tfidf_stopword() 399 | # f2name = 'textmind' 400 | # f3name = 'dbow' 401 | # f5name = 'tfidf' 402 | # user_predict.predict_by_combine_two(fun2, fun3, f2name, f3name) 403 | # user_predict.predict_by_combine_two(fun2, fun5, f2name, f5name) 404 | # user_predict.predict_by_combine_two(fun3, fun5, f3name, f5name) 405 | # user_predict.predict_by_combine() 406 | -------------------------------------------------------------------------------- /character_svm/svm_dbow_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | ''' doc2vc-svm stack for gender''' 3 | 4 | from __future__ import division 5 | 6 | import codecs 7 | from sklearn import svm 8 | from sklearn.externals import joblib 9 | 10 | import gensim 11 | from numpy import * 12 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence 13 | import os 14 | import numpy as np 15 | 16 | 17 | class user_predict: 18 | def __init__(self, train_document, text_document): 19 | self.train_document = train_document 20 | self.text_document = text_document 21 | 22 | # -----------------------准确值计算----------------------- 23 | def myAcc(self, y_true, y_pred): 24 | true_num = 0 25 | # 最大数的索引 26 | y_pred = np.argmax(y_pred, axis=1) 27 | 28 | # for i in range(y_true.__len__()): 29 | # print y_true[i] 30 | for i in range(y_pred.__len__()): 31 | if y_true[i] == y_pred[i]: 32 | true_num += 1 33 | return true_num 34 | 35 | # -----------------------load data----------------------- 36 | def load_data(self, doc): 37 | 38 | list_name = [] 39 | list_total = [] 40 | list_gender = [] 41 | # 对应标签导入词典 42 | f = codecs.open(doc) 43 | temp = f.readlines() 44 | print(len(temp)) 45 | 46 | for i in range(len(temp)): 47 | temp[i] = temp[i].split(" ") 48 | user_name = temp[i][0] 49 | tags = temp[i][1:6] 50 | 51 | query = temp[i][6:] 52 | query = " ".join(query).strip().replace("\n", "") 53 | list_total.append(query) 54 | list_gender.append(tags) 55 | 56 | print(list_total.__len__()) 57 | print(list_gender.__len__()) 58 | # 标签转化,男:0,女:1 59 | list_tag = [] 60 | for line in list_gender: 61 | list_t = [] 62 | for j in line: 63 | j = int(j) 64 | list_t.append(j) 65 | list_tag.append(list_t) 66 | 67 | print("data have read ") 68 | return list_total, list_tag 69 | 70 | # -------------------------prepare d2w svd ----------------------- 71 | def prepare_lsi(self, doc): 72 | 73 | list_total, list_tag = self.load_data(doc) 74 | 75 | stop_word = [] 76 | 77 | # 构建语料库 78 | X_doc = [] 79 | TaggededDocument = gensim.models.doc2vec.TaggedDocument 80 | for i in range(list_total.__len__()): 81 | word_list = list_total[i] 82 | document = TaggededDocument(word_list, tags=[i]) 83 | X_doc.append(document) 84 | 85 | return X_doc, list_total, list_tag 86 | 87 | def train_lsi_model(self, doc): 88 | 89 | X_doc, list_total, list_tag = self.prepare_lsi(doc) 90 | # 训练模型 91 | model_dm = Doc2Vec(X_doc, dm=0, size=300, negative=5, hs=0, min_count=1, window=30, sample=1e-5, workers=8, 92 | alpha=0.04, min_alpha=0.025) 93 | joblib.dump(model_dm, "model_d2v_dbow.model") 94 | print("d2w模型训练完成") 95 | 96 | return model_dm 97 | 98 | def write_d2v(self, X_sp, doc_name): 99 | """ 100 | 保存doc2vec的特征向量 101 | :param X_sp: 102 | :param doc_name: 103 | :return: 104 | """ 105 | np.save("doc2vec_" + doc_name + ".npy",X_sp) 106 | 107 | print("*****************write done over *****************") 108 | 109 | def train_lsi(self, doc, str_vec): 110 | 111 | if (os.path.exists("model_d2v_dbow.model")): 112 | 113 | # load train model 114 | model_dm = joblib.load("model_d2v_dbow.model") 115 | else: 116 | # load train model 117 | model_dm = self.train_lsi_model(doc) 118 | 119 | # prepare data 120 | X_doc, list_total, list_tag = self.prepare_lsi(doc) 121 | 122 | for i in range(10): 123 | # 一个用户作为一个文件去进行d2v的计算 124 | model_dm.train(X_doc, total_examples=model_dm.corpus_count, epochs=2) 125 | X_d2v = np.array([model_dm.docvecs[i] for i in range(len(list_total))]) 126 | 127 | print(X_d2v.shape) 128 | 129 | list_side = X_d2v 130 | 131 | self.write_d2v(list_side, str_vec) 132 | print(" doc2vec 矩阵构建完成----------------") 133 | 134 | return list_total, list_tag, list_side 135 | 136 | # ------------------------my mean count------------------ 137 | 138 | def mymean(self, list_predict_score, array_test): 139 | num_total = 0 140 | num_total = array_test.shape[0] * 5 141 | print("total numbers : " + str(num_total)) 142 | return list_predict_score / (num_total) 143 | 144 | # ------------------------------begin to predict------------ 145 | def predict(self): 146 | str1 = "train_vec_dbow" 147 | str2 = "test_vec_dbow" 148 | train_list_total, train_list_tag, train_list_side = self.train_lsi(self.train_document, str1) 149 | print("train model done -------------------") 150 | 151 | text_list_total, text_list_tag, text_list_side = self.train_lsi(self.text_document, str2) 152 | print("text model done -------------------") 153 | 154 | TR = train_list_total.__len__() 155 | TE = text_list_total.__len__() 156 | n = 5 157 | 158 | train_list_side = mat(train_list_side) 159 | text_list_side = mat(text_list_side) 160 | 161 | X_train = train_list_side[:TR] 162 | y_train = train_list_tag[:TR] 163 | y_train = np.array(y_train) 164 | 165 | print("train shape :---------------------") 166 | print(X_train.shape) 167 | 168 | X_text = text_list_side[:TE] 169 | y_text = text_list_tag[:TE] 170 | y_text = np.array(y_text) 171 | 172 | print("text shape :---------------------") 173 | print(X_text.shape) 174 | 175 | # kfold折叠交叉验证 176 | list_myAcc = [] 177 | true_acc = 0 178 | 179 | for i in range(5): 180 | list_train_tags = [] 181 | list_test_tags = [] 182 | print("第" + str(i) + "个分类器训练") 183 | 184 | # first build train tag 185 | for line in y_train: 186 | list_train_tags.append(line[i]) 187 | 188 | # first build text tag 189 | for line in y_text: 190 | list_test_tags.append(line[i]) 191 | 192 | clf = svm.SVC(probability=True) 193 | 194 | clf = svm.SVC(kernel='linear', probability=True) 195 | 196 | # 逻辑回归训练模型 197 | clf.fit(X_train, list_train_tags) 198 | # 用模型预测 199 | y_pred_te = clf.predict_proba(X_text) 200 | 201 | print(np.argmax(y_pred_te, axis=1)) 202 | print("**" * 50) 203 | print(list_test_tags) 204 | 205 | # #获取准确的个数 206 | print(self.myAcc(list_test_tags, y_pred_te)) 207 | true_acc += self.myAcc(list_test_tags, y_pred_te) 208 | 209 | print("true acc numbers: " + str(true_acc)) 210 | 211 | print("d2w_dbow + 支持向量机 准确率平均值为: ") 212 | print(self.mymean(true_acc, X_text)) 213 | 214 | 215 | if __name__ == '__main__': 216 | base_dir = 'E:\\Koo\\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\data\essay_data' 217 | user_predict = user_predict(os.path.join(base_dir, "vocab1_train.txt"), os.path.join(base_dir, "vocab1_test.txt")) 218 | user_predict.predict() -------------------------------------------------------------------------------- /character_svm/svm_tfidf_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | tfidf模型构建特征并计算运算结果 4 | """ 5 | from __future__ import division 6 | 7 | import codecs 8 | import os 9 | 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | from sklearn import svm 12 | from sklearn.externals import joblib 13 | from numpy import * 14 | from gensim import models, corpora 15 | import numpy as np 16 | 17 | INPUT_SIZE = 300 # 训练特征维度参数 18 | 19 | 20 | class user_predict: 21 | def __init__(self, train_document, text_document): 22 | self.train_document = train_document 23 | self.text_document = text_document 24 | 25 | # -----------------------准确值计算----------------------- 26 | def myAcc(self, y_true, y_pred): 27 | true_num = 0 28 | # 最大数的索引 29 | y_pred = np.argmax(y_pred, axis=1) 30 | 31 | # for i in range(y_true.__len__()): 32 | # print y_true[i] 33 | for i in range(y_pred.__len__()): 34 | if y_true[i] == y_pred[i]: 35 | true_num += 1 36 | return true_num 37 | 38 | # -----------------------load data----------------------- 39 | def load_data(self, doc): 40 | 41 | list_name = [] 42 | list_total = [] 43 | list_gender = [] 44 | # 对应标签导入词典 45 | f = codecs.open(doc) 46 | temp = f.readlines() 47 | print(len(temp)) 48 | 49 | for i in range(len(temp)): 50 | temp[i] = temp[i].split(" ") 51 | user_name = temp[i][0] 52 | tags = temp[i][1:6] 53 | 54 | query = temp[i][6:] 55 | query = " ".join(query).strip().replace("\n", "") 56 | list_total.append(query) 57 | list_gender.append(tags) 58 | 59 | print(list_total.__len__()) 60 | print(list_gender.__len__()) 61 | list_tag = [] 62 | for line in list_gender: 63 | list_t = [] 64 | for j in line: 65 | j = int(j) 66 | list_t.append(j) 67 | list_tag.append(list_t) 68 | 69 | print("data have read ") 70 | return list_total, list_tag 71 | 72 | def load_stopword(self): 73 | """ 74 | 加载停用词语 75 | :param stopworddoc: 76 | :return: 77 | """ 78 | stop_word = [] 79 | return stop_word 80 | # with open('EN_Stopword.txt') as f: 81 | # lines = f.readlines() 82 | # for line in lines: 83 | # word = line.replace('\n', '') 84 | # if word != '': 85 | # stop_word.append(word) 86 | # with open('ENstopwords.txt') as f: 87 | # lines = f.readlines() 88 | # for line in lines: 89 | # word = line.replace('\n', '') 90 | # if word != '': 91 | # stop_word.append(word) 92 | # 93 | # return list(set(stop_word)) 94 | 95 | # -------------------------prepare lsi svd ----------------------- 96 | def prepare_lsi(self, doc): 97 | 98 | # 给训练集用的 99 | list_total, list_tag = self.load_data(doc) 100 | 101 | stop_word = self.load_stopword() 102 | 103 | texts = [[word for word in document.lower().split() if word not in stop_word] 104 | for document in list_total] 105 | 106 | # train dictionary done# 抽取一个bag-of-words,将文档的token映射为id 107 | dictionary = corpora.Dictionary(texts) # 生成词典 # {'a': 0, 'damaged': 1, 'gold': 3, 'fire': 2} 108 | # print dictionary.token2id 109 | # 产生文档向量,将用字符串表示的文档转换为用id和词频表示的文档向量 110 | corpus = [dictionary.doc2bow(text) for text in texts] 111 | # [[(0, 1), (6, 1)], [(0, 1), (9, 2), (10, 1)], [(0, 1), (3, 1)]] 112 | # 例如(9,2)这个元素代表第二篇文档中id为9的单词出现了2次 113 | 114 | # 用TFIDF的方法计算词频,sublinear_tf 表示学习率 115 | tfv = TfidfVectorizer(min_df=1, max_df=0.95, sublinear_tf=True, stop_words=stop_word) 116 | # 对文本中所有的用户对应的所有的评论里面的单词进行TFIDF的计算,找出每个词对应的tfidf值 117 | X_sp = tfv.fit_transform(list_total) 118 | # train model done基于这些“训练文档”计算一个TF-IDF模型 119 | tfidf_model = models.TfidfModel(corpus) 120 | joblib.dump(tfidf_model, "tfidf_model.model") 121 | 122 | # 转化文档向量,将用词频表示的文档向量表示为一个用tf-idf值表示的文档向量 123 | corpus_tfidf = tfidf_model[corpus] 124 | # [[(1, 0.6633689723434505), (2, 0.6633689723434505)],[(7, 0.16073253746956623), (8, 0.4355066251613605)]] 125 | 126 | # 训练LSI模型 即将训练文档向量组成的矩阵SVD分解,并做一个秩为2的近似SVD分解 127 | lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=INPUT_SIZE) 128 | joblib.dump(dictionary, "tfidf_dictionary.dict") 129 | print("训练集lsi -----") 130 | joblib.dump(lsi_model, "tfidf_lsi.model") 131 | 132 | return tfidf_model, dictionary 133 | 134 | def train_lsi(self, doc, str_doc): 135 | if not (os.path.exists("tfidf_model.model")): 136 | print("prepare model") 137 | # load train model 138 | tfidf_model, dictionary = self.prepare_lsi(doc) 139 | # load data 140 | list_total, list_tag = self.load_data(doc) 141 | stop_word = self.load_stopword() 142 | texts = [[word for word in document.lower().split() if word not in stop_word] 143 | for document in list_total] 144 | corpus = [dictionary.doc2bow(text) for text in texts] 145 | 146 | else: 147 | print("use model") 148 | # load train valid text 149 | tfidf_model = joblib.load("tfidf_model.model") 150 | dictionary = joblib.load("tfidf_dictionary.dict") 151 | # load data 152 | list_total, list_tag = self.load_data(doc) 153 | stop_word = self.load_stopword() 154 | texts = [[word for word in document.lower().split() if word not in stop_word] 155 | for document in list_total] 156 | corpus = [dictionary.doc2bow(text) for text in texts] 157 | lsi_model = joblib.load("tfidf_lsi.model") 158 | corpus_tfidf = tfidf_model[corpus] 159 | list_side = [] 160 | corpus_lsi = lsi_model[corpus_tfidf] 161 | nodes = list(corpus_lsi) 162 | 163 | for i in range(len(nodes)): 164 | list_d = [] 165 | for j in range(INPUT_SIZE): 166 | # print nodes[i][j] 167 | list_d.append(nodes[i][j][1]) 168 | list_side.append(list_d) 169 | 170 | list_vec = mat(list_side) 171 | self.write_d2v(list_vec, str_doc) 172 | print("lsi 矩阵构建完成----------------") 173 | return list_total, list_tag, list_side 174 | 175 | # -----------------------write vec-------------------- 176 | def write_d2v(self, X_sp, doc_name): 177 | file_name = "tfidf_" + doc_name + ".npy" 178 | np.save(file_name, X_sp) 179 | print("*****************write done over *****************") 180 | 181 | # ------------------------my mean count------------------ 182 | def mymean(self, list_predict_score, array_test): 183 | num_total = 0 184 | num_total = array_test.shape[0] * 5 185 | print("total numbers : " + str(num_total)) 186 | return list_predict_score / (num_total) 187 | 188 | # ------------------------------begin to predict------------ 189 | def predict(self): 190 | str1 = "train_vec_tfidf" 191 | str2 = "test_vec_tfidf" 192 | train_list_total, train_list_tag, train_list_side = self.train_lsi(self.train_document, str1) 193 | print("train model done -------------------") 194 | text_list_total, text_list_tag, text_list_side = self.train_lsi(self.text_document, str2) 195 | print("text model done -------------------") 196 | TR = train_list_total.__len__() 197 | TE = text_list_total.__len__() 198 | n = 5 199 | train_list_side = mat(train_list_side) 200 | text_list_side = mat(text_list_side) 201 | X_train = train_list_side[:TR] 202 | y_train = train_list_tag[:TR] 203 | y_train = np.array(y_train) 204 | 205 | print("train shape :---------------------") 206 | print(X_train.shape) 207 | 208 | X_text = text_list_side[:TE] 209 | y_text = text_list_tag[:TE] 210 | y_text = np.array(y_text) 211 | 212 | print("text shape :---------------------") 213 | print(X_text.shape) 214 | 215 | # kfold折叠交叉验证 216 | list_myAcc = [] 217 | self.train_eval(X_train, y_train, X_text, y_text) 218 | 219 | def train_eval(self, X_train, y_train, X_text, y_text): 220 | true_acc = 0 221 | for i in range(5): 222 | list_train_tags = [] 223 | list_test_tags = [] 224 | print("第" + str(i) + "个分类器训练") 225 | 226 | # first build train tag 227 | for line in y_train: 228 | list_train_tags.append(line[i]) 229 | 230 | # first build text tag 231 | for line in y_text: 232 | list_test_tags.append(line[i]) 233 | 234 | clf = svm.SVC(probability=True) 235 | 236 | clf = svm.SVC(kernel='linear', probability=True) 237 | # 逻辑回归训练模型 238 | clf.fit(X_train, list_train_tags) 239 | # 用模型预测 240 | y_pred_te = clf.predict_proba(X_text) 241 | 242 | print(np.argmax(y_pred_te, axis=1)) 243 | print("**" * 50) 244 | print(list_test_tags) 245 | 246 | # #获取准确的个数 247 | print(self.myAcc(list_test_tags, y_pred_te)) 248 | true_acc += self.myAcc(list_test_tags, y_pred_te) 249 | print("true acc numbers: " + str(true_acc)) 250 | print("LSI + 支持向量机 准确率平均值为: ") 251 | print(self.mymean(true_acc, X_text)) 252 | 253 | 254 | if __name__ == '__main__': 255 | base_dir = 'E:\\Koo\\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\data\essay_data' 256 | user_predict = user_predict(os.path.join(base_dir, "vocab1_train.txt"), 257 | os.path.join(base_dir, "vocab1_test.txt")) 258 | # for _ in range(9): 259 | # print('训练次数', _) 260 | user_predict.predict() 261 | -------------------------------------------------------------------------------- /data/essay_data/essays.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/essay_data/essays.csv -------------------------------------------------------------------------------- /data/label/test_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/label/test_label.npy -------------------------------------------------------------------------------- /data/label/train_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/label/train_label.npy -------------------------------------------------------------------------------- /data/vec/emotion_test_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/emotion_test_vec.npy -------------------------------------------------------------------------------- /data/vec/emotion_train_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/emotion_train_vec.npy -------------------------------------------------------------------------------- /data/vec/textmind_test_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/textmind_test_vec.npy -------------------------------------------------------------------------------- /data/vec/textmind_train_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/textmind_train_vec.npy -------------------------------------------------------------------------------- /features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/__init__.py -------------------------------------------------------------------------------- /features/crawl_textmind_data/README.md: -------------------------------------------------------------------------------- 1 | # [请求文心系统](http://ccpl.psych.ac.cn/textmind/)获取文本特征 -------------------------------------------------------------------------------- /features/crawl_textmind_data/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'gu' 2 | -------------------------------------------------------------------------------- /features/crawl_textmind_data/crawler.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | """ 4 | 爬取文心系统的数据,提取特征 5 | """ 6 | 7 | import http.cookiejar 8 | import json 9 | import urllib.request 10 | import urllib.parse 11 | import numpy as np 12 | 13 | from features.crawl_textmind_data import input_textmind_data 14 | 15 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:35.0) Gecko/20100101 Firefox/35.0'} 16 | 17 | 18 | class Crawler: 19 | def __init__(self): 20 | self.cj = http.cookiejar.LWPCookieJar() 21 | # 将一个保存cookie对象,和一个HTTP的cookie的处理器绑定 22 | self.cookie_processor = urllib.request.HTTPCookieProcessor(self.cj) 23 | # 创建一个opener,将保存了cookie的http处理器,还有设置一个handler用于处理http的URL的打开 24 | self.opener = urllib.request.build_opener(self.cookie_processor, 25 | urllib.request.HTTPHandler) # 将包含了cookie、http处理器、http的handler的资源和urllib2对象绑定在一起 26 | urllib.request.install_opener(self.opener) 27 | 28 | def doPost(self, text): 29 | print("正在请求文心...") 30 | PostData = { 31 | "str": text 32 | } 33 | PostData = urllib.parse.urlencode(PostData).encode("utf-8") 34 | request = urllib.request.Request('http://ccpl.psych.ac.cn/textmind/analysis', headers=headers) 35 | with urllib.request.urlopen(request, data=PostData) as f: 36 | resp = f.read() 37 | print(resp) 38 | return resp 39 | 40 | def parse_textmind_feature(self, json_str): 41 | feature_list = [] 42 | json_dict = json.loads(json_str) 43 | print(json_dict) 44 | if json_dict['status'] == 'success': 45 | result_list = json_dict['result'] 46 | for elem in result_list: 47 | name = elem['name'] 48 | value = elem['value'] 49 | feature_list.append(value) 50 | else: 51 | raise ValueError('文心系统分析返回数据异常') 52 | return feature_list 53 | 54 | def save_arr(self, filename, X_sp): 55 | """ 56 | 特征向量保存 57 | """ 58 | np.save(filename, X_sp) 59 | print("*****************write done over *****************") 60 | 61 | def textmind_action(self, train_lines, test_lines): 62 | """ 63 | 输入文本[] 保存特征 64 | :param train_lines: 65 | :param test_lines: 66 | :return: 67 | """ 68 | X_train, y_train = self.get_input_output(train_lines) 69 | X_test, y_test = self.get_input_output(test_lines) 70 | 71 | textmind_train_vec_dm = "textmind_train_vec.npy" 72 | textmind_train_label_dm = "train_label.npy" 73 | textmind_test_vec_dm = "textmind_test_vec.npy" 74 | textmind_test_label_dm = "test_label.npy" 75 | 76 | self.save_arr(textmind_train_vec_dm, np.array(X_train)) 77 | self.save_arr(textmind_train_label_dm, np.array(y_train)) 78 | self.save_arr(textmind_test_vec_dm, np.array(X_test)) 79 | self.save_arr(textmind_test_label_dm, np.array(y_test)) 80 | 81 | def get_input_output(self, lines): 82 | """ 83 | 输入文本的lines 返回每行对应的文心特征 和 对应的标签 84 | :param lines: 85 | :return: 86 | """ 87 | list_input_feature = [] 88 | list_output_tag = [] 89 | for t_line in lines: 90 | temp = t_line.split() 91 | user_name = temp[0] 92 | tags = temp[1:6] 93 | query = temp[6:] 94 | print(query) 95 | query = " ".join(query).strip().replace("\n", "") 96 | json_str = self.doPost(query) 97 | feature_list = self.parse_textmind_feature(json_str) 98 | list_input_feature.append(feature_list) 99 | 100 | list_tag = [] 101 | for tag in tags: 102 | j = int(tag) 103 | list_tag.append(j) 104 | list_output_tag.append(list_tag) 105 | return list_input_feature, list_output_tag 106 | 107 | 108 | if __name__ == '__main__': 109 | craw = Crawler() 110 | train_lines, test_lines = input_textmind_data.load_corpus() 111 | craw.textmind_action(train_lines, test_lines) 112 | -------------------------------------------------------------------------------- /features/crawl_textmind_data/input_textmind_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """读取文件""" 3 | import os 4 | 5 | from tensorflow.python.platform import gfile 6 | 7 | __author__ = 'gu' 8 | 9 | import numpy as np 10 | 11 | 12 | def load_corpus(): 13 | """ 14 | 加载训练数据 15 | :return: 16 | """ 17 | base_dir = 'E:\Koo\Projects\PycharmProjects\TensorFlow_DNN_Character_Classification\data\essay_data' 18 | 19 | train_txt_path = os.path.join(base_dir, "vocab1_train.txt") 20 | test_txt_path = os.path.join(base_dir, "vocab1_test.txt") 21 | 22 | return read_lines(train_txt_path), read_lines(test_txt_path) 23 | 24 | 25 | def read_lines(train_txt_path): 26 | with gfile.Open(train_txt_path, 'r') as f: 27 | lines = f.readlines() 28 | for line in lines: 29 | print(line) 30 | print('txt lines length', len(lines)) 31 | return lines 32 | 33 | 34 | def load_textmind_data_label(base_model_dir): 35 | """ 36 | 加载textmind矩阵 37 | :param base_model_dir: 38 | :return: 39 | """ 40 | textmind_train_vec = "textmind_train_vec.npy" 41 | textmind_train_label = "train_label.npy" 42 | textmind_test_vec = "textmind_test_vec.npy" 43 | textmind_test_label = "test_label.npy" 44 | 45 | train_vec_filename = os.path.join(base_model_dir, textmind_train_vec) 46 | train_label_filename = os.path.join(base_model_dir, textmind_train_label) 47 | test_vec_filename = os.path.join(base_model_dir, textmind_test_vec) 48 | test_label_filename = os.path.join(base_model_dir, textmind_test_label) 49 | 50 | X_train = np.load(train_vec_filename) 51 | print('X_train', X_train.shape) 52 | Y_train = np.load(train_label_filename) 53 | print('Y_train', Y_train.shape) 54 | X_test = np.load(test_vec_filename) 55 | print('X_test', X_test.shape) 56 | Y_test = np.load(test_label_filename) 57 | print('Y_test', Y_test.shape) 58 | return X_train, Y_train, X_test, Y_test 59 | 60 | 61 | def load_textmind_data_label_with_normalization(base_model_dir): 62 | """ 63 | 加载textmind矩阵 并进行数据平滑 64 | :param base_model_dir: 65 | :return: 66 | """ 67 | textmind_train_vec = "textmind_train_vec.npy" 68 | textmind_train_label = "train_label.npy" 69 | textmind_test_vec = "textmind_test_vec.npy" 70 | textmind_test_label = "test_label.npy" 71 | 72 | train_vec_filename = os.path.join(base_model_dir, textmind_train_vec) 73 | train_label_filename = os.path.join(base_model_dir, textmind_train_label) 74 | test_vec_filename = os.path.join(base_model_dir, textmind_test_vec) 75 | test_label_filename = os.path.join(base_model_dir, textmind_test_label) 76 | 77 | X_train = np.load(train_vec_filename) 78 | print('X_train', X_train.shape) 79 | Y_train = np.load(train_label_filename) 80 | print('Y_train', Y_train.shape) 81 | X_test = np.load(test_vec_filename) 82 | print('X_test', X_test.shape) 83 | Y_test = np.load(test_label_filename) 84 | print('Y_test', Y_test.shape) 85 | X_train_1 = np.where(X_train >= 0, np.log(X_train + 1), 0) 86 | X_test_1 = np.where(X_test >= 0, np.log(X_test + 1), 0) 87 | return X_train_1, Y_train, X_test_1, Y_test 88 | 89 | 90 | if __name__ == '__main__': 91 | X_train, Y_train, X_test, Y_test = load_textmind_data_label_with_normalization('') 92 | print(X_test) 93 | print(Y_test) 94 | -------------------------------------------------------------------------------- /features/crawl_textmind_data/test_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/test_label.npy -------------------------------------------------------------------------------- /features/crawl_textmind_data/textmind_test_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/textmind_test_vec.npy -------------------------------------------------------------------------------- /features/crawl_textmind_data/textmind_train_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/textmind_train_vec.npy -------------------------------------------------------------------------------- /features/crawl_textmind_data/train_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/train_label.npy -------------------------------------------------------------------------------- /features/doc2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/__init__.py -------------------------------------------------------------------------------- /features/doc2vec/doc2vec_action.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | 基于doc2vec的提取文本的特征特征 4 | """ 5 | import codecs 6 | import os 7 | from sklearn.externals import joblib 8 | import gensim 9 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence 10 | from numpy import mat 11 | 12 | import numpy as np 13 | 14 | __author__ = 'gu' 15 | 16 | 17 | class D2vAction: 18 | def __init__(self, base_model_dir, train_document, text_document): 19 | """ 20 | 初始化路径 21 | :param base_model_dir: 存放模型的目录 22 | :param train_document: 训练样本 23 | :param text_document: 测试样本 24 | :return: 25 | """ 26 | self.base_model_dir = base_model_dir 27 | self.train_document = train_document 28 | self.text_document = text_document 29 | 30 | def train_lsi(self, doc): 31 | model_d2v_dm_path = os.path.join(self.base_model_dir, "model_d2v_dm.model") 32 | if os.path.exists(model_d2v_dm_path): 33 | print("已经存在模型!") 34 | model_dm = joblib.load(model_d2v_dm_path) 35 | else: 36 | model_dm = self.train_lsi_model(doc) 37 | X_doc, list_total, list_tag = self.prepare_lsi(doc) 38 | for i in range(10): 39 | # 一个用户作为一个文件去进行d2v的计算 40 | model_dm.train(X_doc, total_examples=model_dm.corpus_count, epochs=2) 41 | X_d2v = np.array([model_dm.docvecs[i] for i in range(len(list_total))]) 42 | print(X_d2v.shape) 43 | list_side = X_d2v # doc2vec 矩阵 44 | print(" doc2vec 矩阵构建完成----------------") 45 | return list_total, list_tag, list_side 46 | 47 | def train_lsi_model(self, doc): 48 | X_doc, list_total, list_tag = self.prepare_lsi(doc) 49 | # 训练模型 50 | model_dm = Doc2Vec(X_doc, dm=1, size=300, negative=5, hs=0, min_count=5, window=8, sample=1e-5, workers=4, 51 | alpha=0.025, min_alpha=0.025) 52 | joblib.dump(model_dm, "model_d2v_dm.model") 53 | print("d2w模型训练完成") 54 | return model_dm 55 | 56 | def prepare_lsi(self, doc): 57 | # 返回文本和标签 58 | list_total, list_tag = self.load_data(doc) 59 | # 构建语料库 60 | X_doc = [] 61 | TaggededDocument = gensim.models.doc2vec.TaggedDocument 62 | for i in range(list_total.__len__()): 63 | word_list = list_total[i] 64 | document = TaggededDocument(word_list, tags=[i]) 65 | X_doc.append(document) 66 | return X_doc, list_total, list_tag 67 | 68 | def load_data(self, doc): 69 | list_name = [] 70 | list_total = [] 71 | list_gender = [] 72 | # 对应标签导入词典 73 | f = codecs.open(doc) 74 | temp = f.readlines() 75 | f.close() 76 | 77 | for i in range(len(temp)): 78 | temp[i] = temp[i].split(" ") 79 | user_name = temp[i][0] 80 | tags = temp[i][1:6] 81 | 82 | query = temp[i][6:] 83 | query = " ".join(query).strip().replace("\n", "") 84 | 85 | list_total.append(query) 86 | list_gender.append(tags) 87 | 88 | list_tag = [] 89 | for line in list_gender: 90 | list_t = [] 91 | for j in line: 92 | j = int(j) 93 | list_t.append(j) 94 | list_tag.append(list_t) 95 | 96 | print("data have read ") 97 | return list_total, list_tag 98 | 99 | def get_d2v_feature(self): 100 | train_list_total, train_list_tag, train_list_side = self.train_lsi(self.train_document) 101 | print("train model done -------------------") 102 | 103 | text_list_total, text_list_tag, text_list_side = self.train_lsi(self.text_document) 104 | print("text model done -------------------") 105 | 106 | TR = train_list_total.__len__() 107 | TE = text_list_total.__len__() 108 | # 将输入解释为矩阵。 109 | train_list_side = mat(train_list_side) 110 | text_list_side = mat(text_list_side) 111 | # train_list_tag = mat(train_list_tag, dtype=float) 112 | # text_list_tag = mat(text_list_tag, dtype=float) 113 | 114 | X_train = train_list_side[:TR] 115 | y_train = train_list_tag[:TR] 116 | y_train = np.array(y_train) 117 | print("train shape :---------------------") 118 | print(X_train.shape) 119 | 120 | X_text = text_list_side[:TE] 121 | y_text = text_list_tag[:TE] 122 | y_text = np.array(y_text) 123 | print("text shape :---------------------") 124 | print(X_text.shape) 125 | print(train_list_side) 126 | print(train_list_tag) 127 | print(text_list_side) 128 | print(text_list_tag) 129 | 130 | return train_list_side, train_list_tag, text_list_side, text_list_tag 131 | 132 | def write_d2v(self, filename, X_sp): 133 | """ 134 | doc2vec的特征向量保存 135 | :param X_sp: 136 | :param doc_name: 137 | :return: 138 | """ 139 | np.save(filename, X_sp) 140 | print("*****************write done over *****************") 141 | 142 | 143 | def load_data_label(): 144 | """ 145 | 加载训练数据 146 | :return: 147 | """ 148 | base_dir = 'E:\Koo\Projects\PycharmProjects\TensorFlow_DNN_Character_Classification\data\essay_data' 149 | base_model_dir = '' 150 | d2vAction = D2vAction(base_model_dir, 151 | os.path.join(base_dir, "vocab1_train.txt"), 152 | os.path.join(base_dir, "vocab1_test.txt")) 153 | train_list_side, train_list_tag, text_list_side, text_list_tag = d2vAction.get_d2v_feature() 154 | str1 = "doc2vec_train_vec_dm.npy" 155 | str1_1 = "train_label.npy" 156 | str2 = "doc2vec_test_vec_dm.npy" 157 | str2_2 = "test_label.npy" 158 | d2vAction.write_d2v(str1, np.array(train_list_side)) 159 | d2vAction.write_d2v(str1_1, np.array(train_list_tag)) 160 | d2vAction.write_d2v(str2, np.array(text_list_side)) 161 | d2vAction.write_d2v(str2_2, np.array(text_list_tag)) 162 | 163 | 164 | if __name__ == '__main__': 165 | load_data_label() 166 | -------------------------------------------------------------------------------- /features/doc2vec/doc2vec_test_vec_dm.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/doc2vec_test_vec_dm.npy -------------------------------------------------------------------------------- /features/doc2vec/doc2vec_train_vec_dm.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/doc2vec_train_vec_dm.npy -------------------------------------------------------------------------------- /features/doc2vec/test_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/test_label.npy -------------------------------------------------------------------------------- /features/doc2vec/train_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/train_label.npy -------------------------------------------------------------------------------- /features/emotion_lexicon/README.md: -------------------------------------------------------------------------------- 1 | # [英文情感词汇](http://www.saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm)获取情感特征 -------------------------------------------------------------------------------- /features/emotion_lexicon/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'gu' 2 | -------------------------------------------------------------------------------- /features/emotion_lexicon/data_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | import re 4 | import csv 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def build_emotion_lexicon_dict(datafile): 11 | """ 12 | load emotion csv 13 | """ 14 | print('loading emotion dict...') 15 | vocab_emotion_dict = defaultdict(float) 16 | with open(datafile, "rb") as csvf: 17 | csvreader = csv.reader(csvf, delimiter=',', quotechar='"') 18 | first_line = True 19 | for line in csvreader: 20 | if first_line: 21 | first_line = False 22 | continue 23 | status = [] 24 | # print(line) 25 | try: 26 | line.remove('') 27 | except ValueError: 28 | None 29 | word = line[0] 30 | orig_rev = word.strip().lower() 31 | status.append(orig_rev) 32 | # print(orig_rev) 33 | word_emotion_value = [] 34 | for value in line[1:]: 35 | word_emotion_value.append(1 if value == '1' else 0) 36 | # print(word_emotion_value) 37 | vocab_emotion_dict[orig_rev] = word_emotion_value 38 | print('emotion word size are %s ' % len(vocab_emotion_dict)) 39 | return vocab_emotion_dict 40 | 41 | 42 | def build_emotion_feature(filename, vocab_emotion_dict): 43 | """ 44 | build emotion feature 45 | """ 46 | X_input = [] 47 | y_output = [] 48 | with open(filename, "rb") as f: 49 | lines = f.readlines() 50 | for line in lines: 51 | text = line.strip().split() 52 | y = [1 if s == '1' else 0 for s in text[1:6]] 53 | emotion_value = [] 54 | for word in text[6:]: 55 | if vocab_emotion_dict.__contains__(word): 56 | word_emotion_values = vocab_emotion_dict[word] 57 | emotion_value.append(word_emotion_values) 58 | value = map(sum, zip(*emotion_value)) 59 | X_input.append(value) 60 | y_output.append(y) 61 | X_input = np.mat(X_input) 62 | y_output = np.mat(y_output) 63 | print('X_input.shape', X_input.shape) 64 | print('y_output.shape', y_output.shape) 65 | return X_input, y_output 66 | 67 | 68 | def save_arr(filename, X_sp): 69 | np.save(filename, X_sp) 70 | print('write done', filename) 71 | 72 | 73 | def load_data_label(): 74 | """ 75 | load and save arr 76 | :return: 77 | """ 78 | base_dir = '' 79 | data_folder = os.path.join(base_dir, 'Emotion_Lexicon.csv') 80 | print("loading data...") 81 | emotion_dict = build_emotion_lexicon_dict(data_folder) 82 | X_train, y_train = build_emotion_feature('../data/essay_data/vocab1_train.txt', emotion_dict) 83 | X_test, y_test = build_emotion_feature('../data/essay_data/vocab1_test.txt', emotion_dict) 84 | save_arr('emotion_train_vec.npy', X_train) 85 | save_arr('emotion_train_label.npy', y_train) 86 | save_arr('emotion_test_vec.npy', X_test) 87 | save_arr('emotion_test_label.npy', y_test) 88 | return X_train, y_train, X_test, y_test 89 | 90 | 91 | def load_emotion_data_label(base_model_dir): 92 | """ 93 | load .npy file for arr 94 | :param base_model_dir: 95 | :return: 96 | """ 97 | train_vec_filename = os.path.join(base_model_dir, "emotion_train_vec.npy") 98 | train_label_filename = os.path.join(base_model_dir, 'emotion_train_label.npy') 99 | test_vec_filename = os.path.join(base_model_dir, 'emotion_test_vec.npy') 100 | test_label_filename = os.path.join(base_model_dir, 'emotion_test_label.npy') 101 | X_train = np.load(train_vec_filename) 102 | print('X_train', X_train.shape) 103 | Y_train = np.load(train_label_filename) 104 | print('Y_train', Y_train.shape) 105 | X_test = np.load(test_vec_filename) 106 | print('X_test', X_test.shape) 107 | Y_test = np.load(test_label_filename) 108 | print('Y_test', Y_test.shape) 109 | return X_train, Y_train, X_test, Y_test 110 | 111 | 112 | if __name__ == "__main__": 113 | # load_data_label() 114 | load_emotion_data_label('') 115 | -------------------------------------------------------------------------------- /features/emotion_lexicon/emotion_test_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_test_label.npy -------------------------------------------------------------------------------- /features/emotion_lexicon/emotion_test_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_test_vec.npy -------------------------------------------------------------------------------- /features/emotion_lexicon/emotion_train_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_train_label.npy -------------------------------------------------------------------------------- /features/emotion_lexicon/emotion_train_vec.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_train_vec.npy -------------------------------------------------------------------------------- /features/process_data1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | import re 4 | import csv 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def build_data_cv(datafile, cv=10, clean_string=True): 11 | """ 12 | Loads data and split into 10 folds. 13 | """ 14 | revs = [] 15 | vocab = defaultdict(float) 16 | 17 | with open(datafile, "rb") as csvf: 18 | csvreader = csv.reader(csvf, delimiter=',', quotechar='"') 19 | first_line = True 20 | for line in csvreader: 21 | if first_line: 22 | first_line = False 23 | continue 24 | status = [] 25 | sentences = re.split(r'[.?]', line[1].strip()) 26 | try: 27 | sentences.remove('') 28 | except ValueError: 29 | None 30 | 31 | for sent in sentences: 32 | if clean_string: 33 | orig_rev = clean_str(sent.strip()) 34 | if orig_rev == '': 35 | continue 36 | words = set(orig_rev.split()) 37 | splitted = orig_rev.split() 38 | if len(splitted) > 150: 39 | orig_rev = [] 40 | splits = int(np.floor(len(splitted) / 20)) 41 | for index in range(splits): 42 | orig_rev.append(' '.join(splitted[index * 20:(index + 1) * 20])) 43 | if len(splitted) > splits * 20: 44 | orig_rev.append(' '.join(splitted[splits * 20:])) 45 | status.extend(orig_rev) 46 | else: 47 | status.append(orig_rev) 48 | else: 49 | orig_rev = sent.strip().lower() 50 | words = set(orig_rev.split()) 51 | status.append(orig_rev) 52 | 53 | for word in words: 54 | vocab[word] += 1 55 | 56 | datum = {"y0": 1 if line[2].lower() == 'y' else 0, 57 | "y1": 1 if line[3].lower() == 'y' else 0, 58 | "y2": 1 if line[4].lower() == 'y' else 0, 59 | "y3": 1 if line[5].lower() == 'y' else 0, 60 | "y4": 1 if line[6].lower() == 'y' else 0, 61 | "text": status, 62 | "user": line[0], 63 | "num_words": np.max([len(sent.split()) for sent in status]), 64 | "split": np.random.randint(0, cv)} 65 | revs.append(datum) 66 | 67 | return revs, vocab 68 | 69 | 70 | def clean_str(string, TREC=False): 71 | """ 72 | Tokenization/string cleaning for all datasets except for SST. 73 | Every dataset is lower cased except for TREC 74 | """ 75 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 76 | string = re.sub(r"\'s", " \'s ", string) 77 | string = re.sub(r"\'ve", " have ", string) 78 | string = re.sub(r"n\'t", " not ", string) 79 | string = re.sub(r"\'re", " are ", string) 80 | string = re.sub(r"\'d", " would ", string) 81 | string = re.sub(r"\'ll", " will ", string) 82 | string = re.sub(r",", " , ", string) 83 | string = re.sub(r"!", " ! ", string) 84 | string = re.sub(r"\(", " ( ", string) 85 | string = re.sub(r"\)", " ) ", string) 86 | string = re.sub(r"\?", " \? ", string) 87 | # string = re.sub(r"[a-zA-Z]{4,}", "", string) 88 | string = re.sub(r"\s{2,}", " ", string) 89 | return string.strip() if TREC else string.strip().lower() 90 | 91 | 92 | def clean_str_sst(string): 93 | """ 94 | Tokenization/string cleaning for the SST dataset 95 | """ 96 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 97 | string = re.sub(r"\s{2,}", " ", string) 98 | return string.strip().lower() 99 | 100 | 101 | if __name__ == "__main__": 102 | base_dir = '/data/essays_data' 103 | 104 | data_folder = os.path.join(base_dir, 'essays.csv') 105 | print("loading data...") 106 | revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True) 107 | num_words = pd.DataFrame(revs)["num_words"] 108 | max_l = np.max(num_words) 109 | print("data loaded!") 110 | print("number of status: " + str(len(revs))) 111 | print("vocab size: " + str(len(vocab))) 112 | print("max sentence length: " + str(max_l)) 113 | -------------------------------------------------------------------------------- /features/tfidf/test_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/test_label.npy -------------------------------------------------------------------------------- /features/tfidf/tfidf_action.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | '''tfidf-lsi-svm stack for gender''' 3 | 4 | from __future__ import division 5 | 6 | import codecs 7 | import os 8 | 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | from sklearn import svm 11 | from sklearn.externals import joblib 12 | from numpy import * 13 | from gensim import models, corpora 14 | import numpy as np 15 | 16 | INPUT_SIZE = 300 # 训练特征维度参数 17 | 18 | 19 | class user_predict: 20 | def __init__(self, train_document, text_document): 21 | self.train_document = train_document 22 | self.text_document = text_document 23 | 24 | # -----------------------准确值计算----------------------- 25 | def myAcc(self, y_true, y_pred): 26 | true_num = 0 27 | # 最大数的索引 28 | y_pred = np.argmax(y_pred, axis=1) 29 | 30 | # for i in range(y_true.__len__()): 31 | # print y_true[i] 32 | for i in range(y_pred.__len__()): 33 | if y_true[i] == y_pred[i]: 34 | true_num += 1 35 | return true_num 36 | 37 | # -----------------------load data----------------------- 38 | def load_data(self, doc): 39 | list_name = [] 40 | list_total = [] 41 | list_gender = [] 42 | # 对应标签导入词典 43 | f = codecs.open(doc) 44 | temp = f.readlines() 45 | print(len(temp)) 46 | 47 | for i in range(len(temp)): 48 | temp[i] = temp[i].split(" ") 49 | user_name = temp[i][0] 50 | tags = temp[i][1:6] 51 | query = temp[i][6:] 52 | query = " ".join(query).strip().replace("\n", "") 53 | list_total.append(query) 54 | list_gender.append(tags) 55 | print(list_total.__len__()) 56 | print(list_gender.__len__()) 57 | list_tag = [] 58 | for line in list_gender: 59 | list_t = [] 60 | for j in line: 61 | j = int(j) 62 | list_t.append(j) 63 | list_tag.append(list_t) 64 | print("data have read ") 65 | return list_total, list_tag 66 | 67 | def load_stopword(self): 68 | """ 69 | 加载停用词语 70 | :param stopworddoc: 71 | :return: 72 | """ 73 | stop_word = [] 74 | return stop_word 75 | # with open('EN_Stopword.txt') as f: 76 | # lines = f.readlines() 77 | # for line in lines: 78 | # word = line.replace('\n', '') 79 | # if word != '': 80 | # stop_word.append(word) 81 | # with open('ENstopwords.txt') as f: 82 | # lines = f.readlines() 83 | # for line in lines: 84 | # word = line.replace('\n', '') 85 | # if word != '': 86 | # stop_word.append(word) 87 | # 88 | # return list(set(stop_word)) 89 | 90 | # -------------------------prepare lsi svd ----------------------- 91 | def prepare_lsi(self, doc): 92 | # 给训练集用的 93 | list_total, list_tag = self.load_data(doc) 94 | stop_word = self.load_stopword() 95 | texts = [[word for word in document.lower().split() if word not in stop_word] 96 | for document in list_total] 97 | # train dictionary done# 抽取一个bag-of-words,将文档的token映射为id 98 | dictionary = corpora.Dictionary(texts) # 生成词典 # {'a': 0, 'damaged': 1, 'gold': 3, 'fire': 2} 99 | # print dictionary.token2id 100 | # 产生文档向量,将用字符串表示的文档转换为用id和词频表示的文档向量 101 | corpus = [dictionary.doc2bow(text) for text in texts] 102 | # [[(0, 1), (6, 1)], [(0, 1), (9, 2), (10, 1)], [(0, 1), (3, 1)]] 103 | # 例如(9,2)这个元素代表第二篇文档中id为9的单词出现了2次 104 | # 用TFIDF的方法计算词频,sublinear_tf 表示学习率 105 | tfv = TfidfVectorizer(min_df=1, max_df=0.95, sublinear_tf=True, stop_words=stop_word) 106 | # 对文本中所有的用户对应的所有的评论里面的单词进行TFIDF的计算,找出每个词对应的tfidf值 107 | X_sp = tfv.fit_transform(list_total) 108 | # train model done基于这些“训练文档”计算一个TF-IDF模型 109 | tfidf_model = models.TfidfModel(corpus) 110 | joblib.dump(tfidf_model, "tfidf_model.model") 111 | # 转化文档向量,将用词频表示的文档向量表示为一个用tf-idf值表示的文档向量 112 | corpus_tfidf = tfidf_model[corpus] 113 | # [[(1, 0.6633689723434505), (2, 0.6633689723434505)],[(7, 0.16073253746956623), (8, 0.4355066251613605)]] 114 | # 训练LSI模型 即将训练文档向量组成的矩阵SVD分解,并做一个秩为2的近似SVD分解 115 | lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=INPUT_SIZE) 116 | joblib.dump(dictionary, "tfidf_dictionary.dict") 117 | print("训练集lsi -----") 118 | joblib.dump(lsi_model, "tfidf_lsi.model") 119 | return tfidf_model, dictionary 120 | 121 | def train_lsi(self, doc, str_doc): 122 | if not (os.path.exists("tfidf_model.model")): 123 | print("prepare model") 124 | tfidf_model, dictionary = self.prepare_lsi(doc) 125 | list_total, list_tag = self.load_data(doc) 126 | stop_word = self.load_stopword() 127 | texts = [[word for word in document.lower().split() if word not in stop_word] 128 | for document in list_total] 129 | corpus = [dictionary.doc2bow(text) for text in texts] 130 | else: 131 | print("use model") 132 | # load train valid text 133 | tfidf_model = joblib.load("tfidf_model.model") 134 | dictionary = joblib.load("tfidf_dictionary.dict") 135 | # load data 136 | list_total, list_tag = self.load_data(doc) 137 | stop_word = self.load_stopword() 138 | texts = [[word for word in document.lower().split() if word not in stop_word] 139 | for document in list_total] 140 | corpus = [dictionary.doc2bow(text) for text in texts] 141 | lsi_model = joblib.load("tfidf_lsi.model") 142 | corpus_tfidf = tfidf_model[corpus] 143 | list_side = [] 144 | corpus_lsi = lsi_model[corpus_tfidf] 145 | nodes = list(corpus_lsi) 146 | for i in range(len(nodes)): 147 | list_d = [] 148 | for j in range(INPUT_SIZE): 149 | # print nodes[i][j] 150 | list_d.append(nodes[i][j][1]) 151 | list_side.append(list_d) 152 | list_vec = mat(list_side) 153 | self.write_mat(list_vec, str_doc) 154 | print("lsi 矩阵构建完成----------------") 155 | return list_total, list_tag, list_side 156 | 157 | def write_mat(self, X_sp, doc_name): 158 | file_name = "tfidf_" + doc_name + ".npy" 159 | np.save(file_name, X_sp) 160 | print("*****************write done over *****************") 161 | 162 | 163 | if __name__ == '__main__': 164 | base_dir = 'E:\Koo\Projects\PycharmProjects\TensorFlow_DNN_Character_Classification\data\essay_data' 165 | user_predict = user_predict(os.path.join(base_dir, "vocab1_train.txt"), 166 | os.path.join(base_dir, "vocab1_test.txt")) 167 | 168 | user_predict.train_lsi(user_predict.train_document,"train_vec_tfidf") 169 | user_predict.train_lsi(user_predict.text_document,"test_vec_tfidf") -------------------------------------------------------------------------------- /features/tfidf/tfidf_test_vec_tfidf.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/tfidf_test_vec_tfidf.npy -------------------------------------------------------------------------------- /features/tfidf/tfidf_train_vec_tfidf.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/tfidf_train_vec_tfidf.npy -------------------------------------------------------------------------------- /features/tfidf/train_label.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/train_label.npy -------------------------------------------------------------------------------- /model/README.md: -------------------------------------------------------------------------------- 1 | # 保存模型 -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'gu' 2 | -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'gu' 3 | import os 4 | import logging 5 | 6 | 7 | def get_logger(): 8 | logging.basicConfig(filename=os.path.join('' '../log.txt'), 9 | format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', 10 | datefmt='%a, %d %b %Y %H:%M:%S', level=logging.INFO) 11 | # 定义一个Handler打印INFO及以上级别的日志到sys.stderr 12 | console = logging.StreamHandler() 13 | console.setLevel(logging.INFO) 14 | logging.getLogger('').addHandler(console) 15 | return logging 16 | --------------------------------------------------------------------------------