├── .gitignore
├── README.md
├── character_dnn
    ├── README.md
    ├── __init__.py
    ├── character_eval.py
    ├── character_inference.py
    ├── character_train.py
    └── input_data.py
├── character_svm
    ├── svm_action.py
    ├── svm_dbow_test.py
    └── svm_tfidf_test.py
├── data
    ├── essay_data
    │   ├── essays.csv
    │   ├── vocab1_test.txt
    │   └── vocab1_train.txt
    ├── label
    │   ├── test_label.npy
    │   └── train_label.npy
    └── vec
    │   ├── doc2vec_test_vec_dbow.txt
    │   ├── doc2vec_train_vec_dbow.txt
    │   ├── emotion_test_vec.npy
    │   ├── emotion_train_vec.npy
    │   ├── textmind_test_vec.npy
    │   └── textmind_train_vec.npy
├── features
    ├── __init__.py
    ├── crawl_textmind_data
    │   ├── README.md
    │   ├── __init__.py
    │   ├── crawler.py
    │   ├── input_textmind_data.py
    │   ├── test_label.npy
    │   ├── textmind_test_vec.npy
    │   ├── textmind_train_vec.npy
    │   └── train_label.npy
    ├── doc2vec
    │   ├── __init__.py
    │   ├── doc2vec_action.py
    │   ├── doc2vec_test_vec_dm.npy
    │   ├── doc2vec_train_vec_dm.npy
    │   ├── test_label.npy
    │   └── train_label.npy
    ├── emotion_lexicon
    │   ├── Emotion_Lexicon.csv
    │   ├── README.md
    │   ├── __init__.py
    │   ├── data_helper.py
    │   ├── emotion_test_label.npy
    │   ├── emotion_test_vec.npy
    │   ├── emotion_train_label.npy
    │   └── emotion_train_vec.npy
    ├── process_data1.py
    └── tfidf
    │   ├── test_label.npy
    │   ├── tfidf_action.py
    │   ├── tfidf_test_vec_tfidf.npy
    │   ├── tfidf_train_vec_tfidf.npy
    │   └── train_label.npy
├── model
    └── README.md
└── utils
    ├── __init__.py
    └── logger.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .ignore support plugin (hsz.mobi)
  2 | ### Python template
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | 
 58 | # Flask stuff:
 59 | instance/
 60 | .webassets-cache
 61 | 
 62 | # Scrapy stuff:
 63 | .scrapy
 64 | 
 65 | # Sphinx documentation
 66 | docs/_build/
 67 | 
 68 | # PyBuilder
 69 | target/
 70 | 
 71 | # Jupyter Notebook
 72 | .ipynb_checkpoints
 73 | 
 74 | # pyenv
 75 | .python-version
 76 | 
 77 | # celery beat schedule file
 78 | celerybeat-schedule
 79 | 
 80 | # SageMath parsed files
 81 | *.sage.py
 82 | 
 83 | # Environments
 84 | .env
 85 | .venv
 86 | env/
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | .idea
103 | .gitignore~
104 | *.bak


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 基于深层神经网络的面向社交媒体的用户性格分类
 2 | 性格分类依据大五人格模型， 数据源采用开放的数据集essay。 在本文的实现中，
 3 | 提取用户文本的 tfidf 特征、 LIWC 心理学特征和情感特征、 以及基于 doc2vec 的文本特征等，
 4 | 建立 SVM 和深层神经网络的分类模型。
 5 | 
 6 | [参考](https://github.com/SenticNet/personality-detection)
 7 | 
 8 | version:
 9 | python 3.6
10 | tensorflow 1.10


--------------------------------------------------------------------------------
/character_dnn/README.md:
--------------------------------------------------------------------------------
 1 | # DNN 神经网络
 2 | 运行character_train.py
 3 | 修改神经网络相关参数：如下
 4 | 
 5 | INPUT_NODE = 11  # 用户的特征维度
 6 | OUTPUT_NODE = 5  # 输出5个类别的性格
 7 | /# LAYER1_NODE = 8  隱藏层的节点数 根据经验公式lgn
 8 | expr = 0.43 * INPUT_NODE * 5 + 0.12 * 5 * 5 + 2.54 * INPUT_NODE + 0.77 * 5 + 0.35
 9 | LAYER1_NODE = int(math.sqrt(expr) + 0.51)
10 | 


--------------------------------------------------------------------------------
/character_dnn/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'gu'
2 | 


--------------------------------------------------------------------------------
/character_dnn/character_eval.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | """
  3 | 测试过程
  4 | """
  5 | __author__ = 'gu'
  6 | 
  7 | import os
  8 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  9 | import time
 10 | import tensorflow as tf
 11 | from character_dnn import character_inference
 12 | import numpy as np
 13 | from character_dnn import input_data
 14 | 
 15 | MOVING_AVERAGE_DECAY = 0.99  # 活动平均衰减率
 16 | MODEL_SAVE_PATH = "model/"
 17 | MODEL_NAME = "character_model"
 18 | print(MODEL_SAVE_PATH)
 19 | # 加载的时间间隔。
 20 | EVAL_INTERVAL_SECS = 2
 21 | 
 22 | # 加载d2v 和 tfidf的数据
 23 | train_list_side, train_list_tag, text_list_side, text_list_tag = input_data.load_data_label('')
 24 | 
 25 | def evaluate():
 26 |     with tf.Graph().as_default() as g:
 27 |         x = tf.placeholder(tf.float32, [None, character_inference.INPUT_NODE], name='x-input')
 28 |         y_ = tf.placeholder(tf.int64, name='y-input')
 29 |         validate_feed = {x: text_list_side, y_: text_list_tag}
 30 | 
 31 |         y = character_inference.inference(x, None)
 32 |         # y = character_inference.inference_nlayer(x, None)
 33 | 
 34 |         # correct_prediction = tf.equal(tf.argmax(y, 1), y_)
 35 |         # accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
 36 | 
 37 |         variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
 38 |         variables_to_restore = variable_averages.variables_to_restore()
 39 |         saver = tf.train.Saver(variables_to_restore)
 40 |         dict_acc = {}
 41 |         dict_precision = {}
 42 |         dict_recall = {}
 43 |         dict_f1 = {}
 44 |         dict_acc_lsit = {}
 45 | 
 46 |         while True:
 47 |             with tf.Session() as sess:
 48 |                 # tf.train.get_checkpoint_state 会根据checkpoint文件自动找到目录中最新模型的文件名
 49 |                 ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
 50 |                 if ckpt and ckpt.model_checkpoint_path:
 51 |                     saver.restore(sess, ckpt.model_checkpoint_path)
 52 |                     global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
 53 |                     # accuracy_score = sess.run(accuracy, feed_dict=validate_feed)
 54 | 
 55 |                     # accuracy_score = get_acc(sess,true_y, pred_y)
 56 |                     # print("After %s training step(s), validation accuracy = %g" % (global_step, accuracy_score))
 57 | 
 58 |                     # print("the input data are \n%s" % test_list_side)
 59 |                     # print("the truly answer are \n%s" % test_list_tag)
 60 |                     eval_aws = sess.run(y, feed_dict=validate_feed)
 61 |                     # print("the evaluate answer are \n%s" % eval_aws)
 62 | 
 63 |                     accuracy_score, acc_list = get_acc(sess, text_list_tag, eval_aws)
 64 |                     print("After %s training step(s), all validation accuracy = %g" % (global_step, accuracy_score))
 65 |                     print("After %s training step(s), 5 validation accuracy = %s" % (global_step, acc_list))
 66 | 
 67 |                     precision_list = get_precision(text_list_tag, eval_aws)
 68 |                     print("After %s training step(s), 5 precision = %s" % (global_step, precision_list))
 69 | 
 70 |                     recall_list = get_recall(text_list_tag, eval_aws)
 71 |                     print("After %s training step(s), 5 recall = %s" % (global_step, recall_list))
 72 | 
 73 |                     f1_list = get_f1(precision_list, recall_list)
 74 |                     print("After %s training step(s), 5 f1 = %s" % (global_step, f1_list))
 75 |                     print("==========================================")
 76 | 
 77 |                     if int(global_step) > 1:
 78 |                         dict_acc[global_step] = accuracy_score
 79 |                         dict_precision[global_step] = precision_list
 80 |                         dict_recall[global_step] = recall_list
 81 |                         dict_f1[global_step] = f1_list
 82 |                         dict_acc_lsit[global_step] = acc_list
 83 |                     if int(global_step) == 29001:
 84 |                         # print("================全部准确率===================")
 85 |                         # sort_dict(dict_acc)
 86 |                         print("================5个准确率===================")
 87 |                         sort_dict(dict_acc_lsit)
 88 |                         print("================5个精准率===================")
 89 |                         sort_dict(dict_precision)
 90 |                         print("================5个召回率===================")
 91 |                         sort_dict(dict_recall)
 92 |                         print("================5个f1===================")
 93 |                         sort_dict(dict_f1)
 94 |                         break
 95 | 
 96 |                 else:
 97 |                     print('No checkpoint file found')
 98 |                     return
 99 |             time.sleep(EVAL_INTERVAL_SECS)
100 | 
101 | 
102 | def get_acc(sess, true_y, pred_y):
103 |     """
104 |     计算总的准确率和5个标签的准确率
105 |     :param sess:
106 |     :param true_y:
107 |     :param pred_y:
108 |     :return:
109 |     """
110 |     pred_y_ = np.where(pred_y > 0, 1, 0)
111 |     correct_prediction = tf.equal(true_y, pred_y_)
112 |     accuracy = sess.run(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))
113 |     acc_list = []
114 |     for clazz in range(5):
115 |         true_class1 = true_y[:, clazz]
116 |         pred_class1 = pred_y[:, clazz]
117 |         pred_class1_ = np.where(pred_class1 > 0, 1, 0)
118 |         acc = 0
119 |         for i in range(len(true_class1)):
120 |             if true_class1[i] == pred_class1_[i]:
121 |                 acc += 1
122 |         acc_list.append(acc * 1.0 / len(true_class1))
123 |     return accuracy, acc_list
124 | 
125 | 
126 | def get_precision(true_y, pred_y):
127 |     """
128 |     返回五个标签的精确率
129 |     :param true_y:
130 |     :param pred_y:
131 |     :return:
132 |     """
133 |     precison_list = []
134 |     for clazz in range(5):
135 |         true_class1 = true_y[:, clazz]
136 |         pred_class1 = pred_y[:, clazz]
137 |         pred_class1_ = np.where(pred_class1 > 0, 1, 0)
138 |         precison = 0
139 |         for i in range(len(true_class1)):
140 |             if true_class1[i] == 1 and pred_class1_[i] == 1:
141 |                 precison += 1
142 |         precison_list.append(precison * 1.0 / np.sum(pred_class1_))
143 |     return precison_list
144 | 
145 | 
146 | def get_recall(true_y, pred_y):
147 |     """
148 |     返回5个标签的召回率
149 |     :param true_y:
150 |     :param pred_y:
151 |     :return:
152 |     """
153 |     recall_list = []
154 |     for clazz in range(5):
155 |         true_class1 = true_y[:, clazz]
156 |         pred_class1 = pred_y[:, clazz]
157 |         pred_class1_ = np.where(pred_class1 > 0, 1, 0)
158 |         precison = 0
159 |         for i in range(len(true_class1)):
160 |             if true_class1[i] == 1 and pred_class1_[i] == 1:
161 |                 precison += 1
162 |         recall_list.append(precison * 1.0 / np.sum(true_class1))
163 |     return recall_list
164 | 
165 | 
166 | def get_f1(precison_list, recall_list):
167 |     """
168 |     返回5个标签的f1值
169 |     :param precison:
170 |     :param recall:
171 |     :return:
172 |     """
173 |     f1_list = []
174 |     for i in range(5):
175 |         precison = precison_list[i]
176 |         recall = recall_list[i]
177 |         f1_list.append((2 * precison * recall) / (precison + recall))
178 |     return f1_list
179 | 
180 | 
181 | def mymean(acc_list):
182 |     acc_set = set(acc_list[1:])
183 |     mean_acc = np.average(list(acc_set))
184 |     print('After 20091 training steps mean_acc', mean_acc)
185 | 
186 | 
187 | def sort_dict(dict):
188 |     sorted_dict = sorted(dict.items(), key=lambda e: e[0], reverse=False)
189 |     print(sorted_dict)
190 |     item0 = 0
191 |     item1 = 0
192 |     item2 = 0
193 |     item3 = 0
194 |     item4 = 0
195 |     for ke in sorted_dict:
196 |         k = ke[1]
197 |         # print(k)
198 |         item0 = item0 + k[0]
199 |         item1 = item1 + k[1]
200 |         item2 = item2 + k[2]
201 |         item3 = item3 + k[3]
202 |         item4 = item4 + k[4]
203 |     le = len(sorted_dict)
204 |     print([item0 / le, item1 / le, item2 / le, item3 / le, item4 / le])
205 | 
206 | 
207 | def main(argv=None):
208 |     evaluate()
209 | if __name__ == '__main__':
210 |     tf.app.run()
211 | 


--------------------------------------------------------------------------------
/character_dnn/character_inference.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | """
  3 | 定义看前向传播的过程以及神经网络中的参数
  4 | """
  5 | 
  6 | import tensorflow as tf
  7 | import math
  8 | 
  9 | # 神经网络相关参数
 10 | INPUT_NODE = 11  # 用户的特征维度
 11 | OUTPUT_NODE = 5  # 输出5个类别的性格
 12 | # LAYER1_NODE = 8  # 隱藏层的节点数 根据经验公式lgn
 13 | expr = 0.43 * INPUT_NODE * 5 + 0.12 * 5 * 5 + 2.54 * INPUT_NODE + 0.77 * 5 + 0.35
 14 | LAYER1_NODE = int(math.sqrt(expr) + 0.51)
 15 | 
 16 | 
 17 | def get_weight_variable(shape, regularizer):
 18 |     # 通过 tf.get_variable获取变量 和Variable 一样，在测试的时候会通过保存的模型来加载这些变量的取值。
 19 |     # 滑动平均变量重命名（影子变量），所以可以直接通过同样的变量名字取到变量本身
 20 |     weights = tf.get_variable("weights", shape, initializer=tf.truncated_normal_initializer(stddev=0.1))
 21 |     if regularizer != None:
 22 |         # 加入损失集合
 23 |         tf.add_to_collection('losses', regularizer(weights))
 24 |     return weights
 25 | 
 26 | 
 27 | def inference(input_tensor, regularizer):
 28 |     """
 29 |     一层隱藏层神经网络前向传播算法
 30 |     :param input_tensor:
 31 |     :param regularizer:
 32 |     :return:
 33 |     """
 34 |     # 声明第一层神经网络的变量并完成前向传播
 35 |     with tf.variable_scope('layer1'):
 36 |         # 生成隱藏层的参数
 37 |         weights = get_weight_variable([INPUT_NODE, LAYER1_NODE], regularizer)
 38 |         # 偏置设置为0.1
 39 |         biases = tf.get_variable("biases", [LAYER1_NODE], initializer=tf.constant_initializer(0.1))
 40 |         # 使用ReLU的激活函数 去线性化
 41 |         layer1 = tf.nn.relu(tf.matmul(input_tensor, weights) + biases)
 42 | 
 43 |     # 声明第二层神经网络的变量并完成前向传播
 44 |     with tf.variable_scope('layer2'):
 45 |         # 生成输出层的参数
 46 |         weights = get_weight_variable([LAYER1_NODE, OUTPUT_NODE], regularizer)
 47 |         biases = tf.get_variable("biases", [OUTPUT_NODE], initializer=tf.constant_initializer(0.1))
 48 |         layer2 = tf.matmul(layer1, weights) + biases
 49 | 
 50 |     # 返回最后的前向传播的结果
 51 |     return layer2
 52 | 
 53 | 
 54 | def get_weight(shape, regularizer):
 55 |     """
 56 |     获取一层神经网络边上的权重，并将这个权重的L2正则化损失加入名称为’losses‘的集合中
 57 |     :param shape: 维度——对应多少个输入和多少个输出
 58 |     :param lamd: 正则化项的权重
 59 |     :return: 神经网络边上的权重
 60 |     """
 61 |     # 生成一个变量 代表权重
 62 |     var = tf.Variable(tf.random_normal(shape=shape), dtype=tf.float32)
 63 |     if regularizer != None:
 64 |         # 加入损失集合
 65 |         # 将这个权重的L2正则化损失加入名称为’losses‘的集合中
 66 |         tf.add_to_collection('losses', regularizer(var))
 67 |     # 返回一层神经网络边上的权重
 68 |     return var
 69 | 
 70 | 
 71 | def inference_nlayer(input_tensor, regularizer):
 72 |     """
 73 |     n层神经网络前向传播算法
 74 |     :param input_tensor:
 75 |     :param regularizer:
 76 |     :return:
 77 |     """
 78 |     # 定义没一层网络中的节点数
 79 |     layer_dimension = [INPUT_NODE, 100, 100, 100, OUTPUT_NODE]
 80 |     # 神经网络的层数
 81 |     n_layers = len(layer_dimension)
 82 | 
 83 |     # 这个变量维护前向传播时最深的层，开始时就是输入层
 84 |     cur_layer = input_tensor
 85 |     # 当前层的节点数
 86 |     in_dimension = layer_dimension[0]
 87 | 
 88 |     # 通过循环来生成5层全连接的神经网络结构
 89 |     for i in range(1, n_layers):
 90 |         # layer_dimension[i]为下一层的节点个数
 91 |         out_dimension = layer_dimension[i]
 92 |         # 生成当前层中权重的变量，并把这个变量的L2正则化损失加入计算图上的集合
 93 |         weight = get_weight([in_dimension, out_dimension], regularizer)
 94 |         bias = tf.Variable(tf.constant(0.1, shape=[out_dimension]))
 95 | 
 96 |         # 使用ReLU激活函数
 97 |         cur_layer = tf.nn.relu(tf.matmul(cur_layer, weight) + bias)
 98 |         # 进入下一层之前将下一层的节点个数更新为当前层节点个数
 99 |         in_dimension = layer_dimension[i]
100 |     return cur_layer
101 | 


--------------------------------------------------------------------------------
/character_dnn/character_train.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | """
 3 | 定义了神经网络的训练过程
 4 | """
 5 | 
 6 | import tensorflow as tf
 7 | from character_dnn import character_inference
 8 | import os
 9 | from character_dnn import input_data
10 | 
11 | # 1. 定义神经网络结构相关的参数。
12 | BATCH_SIZE = 50  # 一个训练batch中的训练数据个数，数字越小，训练过程越接近随机梯度下降
13 | LEARNING_RATE_BASE = 0.8  # 基础的学习率
14 | LEARNING_RATE_DECAY = 0.99  # 学习率衰减率
15 | REGULARIZATION_RATE = 0.0001  # 描述模型复杂度的正则化在损失函数的系数
16 | TRAINING_STEPS = 30000  # 训练轮数
17 | MOVING_AVERAGE_DECAY = 0.99  # 活动平均衰减率
18 | MODEL_SAVE_PATH = "model/"
19 | MODEL_NAME = "character_model"
20 | 
21 | # 加载d2v 和 tfidf的数据
22 | train_list_side, train_list_tag, text_list_side, text_list_tag = input_data.load_data_label('')
23 | TRAIN_NUM_EXAMPLES = DATASET_SIZE = len(train_list_side)  # 训练数据的总数
24 | 
25 | # 2. 定义训练过程。
26 | def train():
27 |     # 定义输入输出placeholder。
28 |     x = tf.placeholder(tf.float32, [None, character_inference.INPUT_NODE], name='x-input')
29 |     y_ = tf.placeholder(tf.float32, name='y-input')
30 |     # L2正则化
31 |     regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
32 |     # 计算在当前参数下神经网络前向传播的结果
33 |     y = character_inference.inference(x, regularizer)
34 |     # y = character_inference.inference_nlayer(x,regularizer)
35 |     # 定义存储训练轮数的便利那个。这个变量不需要计算滑动平均值，所以这里指定这个变量为不可训练的变量
36 |     global_step = tf.Variable(0, trainable=False)
37 | 
38 |     # ///////////////====定义损失函数、学习率、滑动平均操作以及训练过程。=====//////////////
39 |     # 初始化滑动平均类
40 |     variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
41 |     # 在所有代表神经网络参数的变量上使用滑动平均
42 |     variables_averages_op = variable_averages.apply(tf.trainable_variables())
43 | 
44 |     # 计算交叉熵作为刻画预测值和真实值之间茶军的损失函数
45 |     """
46 |     // 参考损失函数的计算 http://blog.csdn.net/u013250416/article/details/78230464
47 |     sigmoid_cross_entropy_with_logits  应用于多标签或者二分类
48 |     """
49 |     # 多目标损失函数
50 |     cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_)
51 |     cross_entropy_mean = tf.reduce_mean(cross_entropy)
52 |     # 总损失等于交叉熵损失和正则化损失的和
53 |     loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
54 | 
55 |     # 指数衰减设置学习率
56 |     learning_rate = tf.train.exponential_decay(
57 |         LEARNING_RATE_BASE,
58 |         global_step,
59 |         DATASET_SIZE / BATCH_SIZE,
60 |         LEARNING_RATE_DECAY,
61 |         staircase=True)
62 |     # 优化损失函数,在minimize中传入global_step将自动更新global_step,从而更新学习率
63 |     train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
64 | 
65 |     # 在训练神经网络模型时既需要通过反向传播来更新神经网络的参数，又要更新每一个参数的滑动平均值。
66 |     with tf.control_dependencies([train_step, variables_averages_op]):
67 |         train_op = tf.no_op(name='train')
68 | 
69 |     # 初始化TensorFlow持久化类。
70 |     saver = tf.train.Saver()
71 |     with tf.Session() as sess:
72 |         tf.initialize_all_variables().run()
73 | 
74 |         for i in range(TRAINING_STEPS):
75 | 
76 |             # # 每次选取batch_size样本进行训练
77 |             # start = (i * BATCH_SIZE) % DATASET_SIZE
78 |             # end = min(start + BATCH_SIZE, DATASET_SIZE)
79 |             # _, loss_value, step = sess.run([train_op, loss, global_step],
80 |             #                                feed_dict={x: train_list_side[start:end],
81 |             #                                           y_: train_list_tag[start:end]})
82 | 
83 |             # 每次选取all_size样本进行训练
84 |             print(train_list_side.shape)
85 |             print(train_list_tag.shape)
86 |             _, loss_value, step = sess.run([train_op, loss, global_step],
87 |                                            feed_dict={x: train_list_side,
88 |                                                       y_: train_list_tag})
89 |             if i % 1000 == 0:
90 |                 print("After %d training step(s), loss on training batch is %g." % (step, loss_value))
91 |                 saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=global_step)
92 | 
93 | 
94 | def main(argv=None):
95 |     train()
96 | if __name__ == '__main__':
97 |     tf.app.run()
98 | 


--------------------------------------------------------------------------------
/character_dnn/input_data.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | """Functions for reading character_dnn data."""
 3 | import os
 4 | 
 5 | import numpy as np
 6 | 
 7 | 
 8 | def load_data_label(base_model_dir):
 9 |     train_vec_filename = os.path.join(base_model_dir, "../data/vec/emotion_train_vec.npy")
10 |     train_label_filename = os.path.join(base_model_dir, '../data/label/train_label.npy')
11 |     test_vec_filename = os.path.join(base_model_dir, '../data/vec/emotion_test_vec.npy')
12 |     test_label_filename = os.path.join(base_model_dir, '../data/label/test_label.npy')
13 | 
14 |     X_train = np.load(train_vec_filename)
15 |     print('X_train', X_train.shape)
16 |     Y_train = np.load(train_label_filename)
17 |     print('Y_train', Y_train.shape)
18 |     X_test = np.load(test_vec_filename)
19 |     print('X_test', X_test.shape)
20 |     Y_test = np.load(test_label_filename)
21 |     print('Y_test', Y_test.shape)
22 |     return X_train, Y_train, X_test, Y_test
23 | 
24 | def load_data_label_combine(X_train, X_test, X1_train, X1_test):
25 |     """
26 |     列向合并矩阵
27 |     combine two arr into one
28 |     :return:
29 |     """
30 |     X_train_all = np.hstack((X_train, X1_train))
31 |     X_test_all = np.hstack((X_test, X1_test))
32 |     return X_train_all, X_test_all
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     X_train, Y_train, X_test, Y_test = load_data_label('')
37 |     print(X_test)
38 |     print(Y_test)
39 | 


--------------------------------------------------------------------------------
/character_svm/svm_action.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | """
  3 | svm tfidf_d2v_dm_dbow_textmind_emotion for character
  4 | calculate their acc precision recall f1 value
  5 | """
  6 | 
  7 | from __future__ import division
  8 | 
  9 | from sklearn import svm
 10 | from numpy import *
 11 | import numpy as np
 12 | import os
 13 | from features.crawl_textmind_data import input_textmind_data
 14 | from utils import logger
 15 | 
 16 | LOG = logger.get_logger()
 17 | 
 18 | 
 19 | class SVMCharacterPredict:
 20 |     def myAcc(self, y_true, y_pred):
 21 |         """
 22 |         准确值计算
 23 |         :param y_true:
 24 |         :param y_pred:
 25 |         :return:
 26 |         """
 27 |         true_num = 0
 28 |         # for i in range(y_true.__len__()):
 29 |         #     # print y_true[i]
 30 |         for i in range(y_pred.__len__()):
 31 |             if y_true[i] == y_pred[i]:
 32 |                 true_num += 1
 33 |         return true_num
 34 | 
 35 |     def mymean(self, list_predict_score, array_test):
 36 |         """
 37 |         my mean count
 38 |         :param list_predict_score:
 39 |         :param array_test:
 40 |         :return:
 41 |         """
 42 |         num_total = 0
 43 |         num_total = array_test.shape[0] * 5
 44 |         # print "total numbers : " + str(num_total)
 45 |         return list_predict_score / (num_total)
 46 | 
 47 |     def train_eval(self, X_train, y_train, X_text, y_text):
 48 |         """
 49 |         输入矩阵 训练模型并计算准确率
 50 |         :param X_text:
 51 |         :param X_train:
 52 |         :param y_text:
 53 |         :param y_train:
 54 |         :return:
 55 |         """
 56 |         pred_y = []
 57 |         true_acc = 0
 58 |         for i in range(5):
 59 |             list_train_tags = []
 60 |             list_test_tags = []
 61 |             # # print "第" + str(i) + "个分类器训练"
 62 |             # first build train tag
 63 |             for line in y_train:
 64 |                 list_train_tags.append(line[i])
 65 |             # first build text tag
 66 |             for line in y_text:
 67 |                 list_test_tags.append(line[i])
 68 |             clf = svm.SVC(probability=True)
 69 |             clf = svm.SVC(kernel='linear', probability=True)
 70 |             # 逻辑回归训练模型
 71 |             clf.fit(X_train, list_train_tags)
 72 |             # 用模型预测
 73 |             y_pred_te = clf.predict_proba(X_text)
 74 |             # # print np.argmax(y_pred_te, axis=1)
 75 |             # # print "**" * 50
 76 |             # # print list_test_tags
 77 |             # #获取准确的个数
 78 |             # # print self.myAcc(list_test_tags, y_pred_te)
 79 | 
 80 |             # 最大数的索引
 81 |             y_pred = np.argmax(y_pred_te, axis=1)
 82 |             true_acc += self.myAcc(list_test_tags, y_pred)
 83 |             pred_y.append(y_pred)
 84 | 
 85 |         # print "true acc numbers: " + str(true_acc)
 86 |         pred_y_ = map(list, zip(*pred_y))
 87 |         pred_y_ = mat(pred_y_)
 88 |         return self.mymean(true_acc, X_text), pred_y_
 89 | 
 90 |     def predict_by_textmind(self):
 91 |         """
 92 |         svm 文心特征
 93 |         :return:
 94 |         """
 95 |         X_train, Y_train, X_test, Y_test = input_textmind_data.load_textmind_data_label_with_normalization(
 96 |             '../crawl_textmind_data')
 97 |         mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test)
 98 |         print("textmind+支持向量机　准确率平均值为: " + str(mymean))
 99 |         # LOG.info("textmind+支持向量机　准确率平均值为: " + str(mymean))
100 | 
101 |         acc_list = self.get_acc(Y_test, pred_y)
102 |         print("After training step(s), 5 validation accuracy = %s" % acc_list)
103 |         precision_list = self.get_precision(Y_test, pred_y)
104 |         print("After training step(s), 5 precision = %s" % precision_list)
105 |         recall_list = self.get_recall(Y_test, pred_y)
106 |         print("After training step(s), 5 recall = %s" % recall_list)
107 |         f1_list = self.get_f1(precision_list, recall_list)
108 |         print("After training step(s), 5 f1 = %s" % f1_list)
109 |         print("==========================================")
110 |         return X_train, Y_train, X_test, Y_test
111 | 
112 |     def predict_by_d2v_dm(self):
113 |         """
114 |         d2v_dm 训练
115 |         :return:
116 |         """
117 |         base_model_dir = ''
118 |         train_vec_filename = os.path.join(base_model_dir, "doc2vec_train_vec_dm.npy")
119 |         train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy')
120 |         test_vec_filename = os.path.join(base_model_dir, 'doc2vec_test_vec_dm.npy')
121 |         test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy')
122 | 
123 |         X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename,
124 |                                                          train_vec_filename)
125 |         mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test)
126 |         # print "d2v_dm+支持向量机　准确率平均值为: " + str(mymean)
127 |         LOG.info("d2v_dm+支持向量机　准确率平均值为: " + str(mymean))
128 |         return X_train, Y_train, X_test, Y_test
129 | 
130 |     def predict_by_d2v_dbow(self):
131 |         """
132 |         d2v_dbow 训练
133 |         :return:
134 |         """
135 |         base_model_dir = 'E:\\Koo\\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\features\\doc2vec'
136 |         train_vec_filename = os.path.join(base_model_dir, "doc2vec_train_vec_dm.npy")
137 |         train_label_filename = os.path.join(base_model_dir, 'train_label.npy')
138 |         test_vec_filename = os.path.join(base_model_dir, 'doc2vec_test_vec_dm.npy')
139 |         test_label_filename = os.path.join(base_model_dir, 'test_label.npy')
140 | 
141 |         X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename,
142 |                                                          train_vec_filename)
143 |         mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test)
144 |         print("d2v_dbow+支持向量机　准确率平均值为: " + str(mymean))
145 |         # LOG.info("d2v_dbow+支持向量机　准确率平均值为: " + str(mymean))
146 |         print(pred_y.shape)
147 |         acc_list = self.get_acc(Y_test, pred_y)
148 |         print("After training step(s), 5 validation accuracy = %s" % acc_list)
149 |         precision_list = self.get_precision(Y_test, pred_y)
150 |         print("After training step(s), 5 precision = %s" % precision_list)
151 |         recall_list = self.get_recall(Y_test, pred_y)
152 |         print("After training step(s), 5 recall = %s" % recall_list)
153 |         f1_list = self.get_f1(precision_list, recall_list)
154 |         print("After training step(s), 5 f1 = %s" % f1_list)
155 |         print("==========================================")
156 | 
157 |         return X_train, Y_train, X_test, Y_test
158 | 
159 |     def predict_by_tfidf(self):
160 |         """
161 |         tfidf 训练
162 |         :return:
163 |         """
164 |         base_model_dir = 'E:\\Koo\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\features\\tfidf'
165 |         train_vec_filename = os.path.join(base_model_dir, "tfidf_train_vec_tfidf.npy")
166 |         train_label_filename = os.path.join(base_model_dir, 'train_label.npy')
167 |         test_vec_filename = os.path.join(base_model_dir, 'tfidf_test_vec_tfidf.npy')
168 |         test_label_filename = os.path.join(base_model_dir, 'test_label.npy')
169 | 
170 |         X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename,
171 |                                                          train_vec_filename)
172 |         mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test)
173 |         print("tfidf+支持向量机　准确率平均值为: " + str(mymean))
174 |         # LOG.info("tfidf + 停用词 +支持向量机　准确率平均值为: " + str(mymean))
175 | 
176 |         print(Y_test.shape)
177 |         print(pred_y.shape)
178 |         acc_list = self.get_acc(Y_test, pred_y)
179 |         print("After training step(s), 5 validation accuracy = %s" % acc_list)
180 |         precision_list = self.get_precision(Y_test, pred_y)
181 |         print("After training step(s), 5 precision = %s" % precision_list)
182 |         recall_list = self.get_recall(Y_test, pred_y)
183 |         print("After training step(s), 5 recall = %s" % recall_list)
184 |         f1_list = self.get_f1(precision_list, recall_list)
185 |         print("After training step(s), 5 f1 = %s" % f1_list)
186 |         print("==========================================")
187 | 
188 |         return X_train, Y_train, X_test, Y_test
189 | 
190 |     def predict_by_tfidf_stopword(self):
191 |         """
192 |         tfidf 训练
193 |         :return:
194 |         """
195 |         base_model_dir = ''
196 |         train_vec_filename = os.path.join(base_model_dir, "tfidf_train_vec_tfidf_stopword.npy")
197 |         train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy')
198 |         test_vec_filename = os.path.join(base_model_dir, 'tfidf_test_vec_tfidf_stopword.npy')
199 |         test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy')
200 | 
201 |         X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename,
202 |                                                          train_vec_filename)
203 |         mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test)
204 |         print("tfidf+stopword+支持向量机　准确率平均值为: " + str(mymean))
205 |         # LOG.info("tfidf+支持向量机　准确率平均值为: " + str(mymean))
206 | 
207 |         acc_list = self.get_acc(Y_test, pred_y)
208 |         print("After training step(s), 5 validation accuracy = %s" % acc_list)
209 |         precision_list = self.get_precision(Y_test, pred_y)
210 |         print("After training step(s), 5 precision = %s" % precision_list)
211 |         recall_list = self.get_recall(Y_test, pred_y)
212 |         print("After training step(s), 5 recall = %s" % recall_list)
213 |         f1_list = self.get_f1(precision_list, recall_list)
214 |         print("After training step(s), 5 f1 = %s" % f1_list)
215 |         print("==========================================")
216 | 
217 |         return X_train, Y_train, X_test, Y_test
218 | 
219 |     def load_arr(self, test_label_filename, test_vec_filename, train_label_filename, train_vec_filename):
220 |         X_train = np.load(train_vec_filename)
221 |         # # print('X_train', X_train.shape)
222 |         Y_train = np.load(train_label_filename)
223 |         # # print('Y_train', Y_train.shape)
224 |         X_test = np.load(test_vec_filename)
225 |         # # print('X_test', X_test.shape)
226 |         Y_test = np.load(test_label_filename)
227 |         # # print('Y_test', Y_test.shape)
228 |         return X_train, Y_train, X_test, Y_test
229 | 
230 |     def predict_by_emotion(self):
231 |         """
232 |         情感特征
233 |         :return:
234 |         """
235 |         from features.emotion_lexicon import data_helper
236 | 
237 |         X_train, Y_train, X_test, Y_test = data_helper.load_emotion_data_label('../Emotion_Lexicon')
238 |         mymean, pred_y = self.train_eval(X_train, Y_train, X_test, Y_test)
239 |         # print "情感特征+支持向量机　准确率平均值为: " + str(mymean)
240 |         LOG.info("情感特征+支持向量机　准确率平均值为: " + str(mymean))
241 |         return X_train, Y_train, X_test, Y_test
242 | 
243 |     def predict_by_combine(self):
244 |         """
245 |         组合特征训练
246 |         :return:
247 |         """
248 |         from character_dnn import input_data
249 | 
250 |         base_model_dir = ''
251 |         train_vec_filename = os.path.join(base_model_dir, "tfidf_train_vec_tfidf.npy")
252 |         train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy')
253 |         test_vec_filename = os.path.join(base_model_dir, 'tfidf_test_vec_tfidf.npy')
254 |         test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy')
255 | 
256 |         X_train, Y_train, X_test, Y_test = self.load_arr(test_label_filename, test_vec_filename, train_label_filename,
257 |                                                          train_vec_filename)
258 |         train_vec_filename = os.path.join(base_model_dir, "doc2vec_train_vec_dbow.npy")
259 |         train_label_filename = os.path.join(base_model_dir, 'doc2vec_train_label_dm.npy')
260 |         test_vec_filename = os.path.join(base_model_dir, 'doc2vec_test_vec_dbow.npy')
261 |         test_label_filename = os.path.join(base_model_dir, 'doc2vec_test_label_dm.npy')
262 | 
263 |         X1_train, Y1_train, X1_test, Y1_test = self.load_arr(test_label_filename, test_vec_filename,
264 |                                                              train_label_filename,
265 |                                                              train_vec_filename)
266 | 
267 |         train_list_side, text_list_side = input_data.load_data_label_combine(X_train, X_test, X1_train, X1_test)
268 |         mymean, pred_y = self.train_eval(train_list_side, Y_train, text_list_side, Y_test)
269 |         # print "综合特征+支持向量机　准确率平均值为: " + str(mymean)
270 |         LOG.info("tfidf+dbow+综合特征+支持向量机　准确率平均值为: " + str(mymean))
271 | 
272 |         acc_list = self.get_acc(Y_test, pred_y)
273 |         print("After training step(s), 5 validation accuracy = %s" % acc_list)
274 |         precision_list = self.get_precision(Y_test, pred_y)
275 |         print("After training step(s), 5 precision = %s" % precision_list)
276 |         recall_list = self.get_recall(Y_test, pred_y)
277 |         print("After training step(s), 5 recall = %s" % recall_list)
278 |         f1_list = self.get_f1(precision_list, recall_list)
279 |         print("After training step(s), 5 f1 = %s" % f1_list)
280 |         print("==========================================")
281 | 
282 |     def predict_by_combine_two(self, fun1, fun2, fun1name, fun2name):
283 |         from character_dnn import input_data
284 | 
285 |         X_train, Y_train, X_test, Y_test = fun1
286 |         X1_train, Y1_train, X1_test, Y1_test = fun2
287 |         train_list_side, text_list_side = input_data.load_data_label_combine(X_train, X_test, X1_train, X1_test)
288 |         mymean, pred_y = self.train_eval(train_list_side, Y_train, text_list_side, Y_test)
289 |         print("综合特征+支持向量机　准确率平均值为: " + str(mymean))
290 |         LOG.info(fun1name + " + " + fun2name + " 综合特征+支持向量机　准确率平均值为: " + str(mymean))
291 | 
292 |         acc_list = self.get_acc(Y_test, pred_y)
293 |         print("After training step(s), 5 validation accuracy = %s" % acc_list)
294 |         precision_list = self.get_precision(Y_test, pred_y)
295 |         print("After training step(s), 5 precision = %s" % precision_list)
296 |         recall_list = self.get_recall(Y_test, pred_y)
297 |         print("After training step(s), 5 recall = %s" % recall_list)
298 |         f1_list = self.get_f1(precision_list, recall_list)
299 |         print("After training step(s), 5 f1 = %s" % f1_list)
300 |         print("==========================================")
301 | 
302 |     def predict_by_combine_three(self):
303 |         from character_dnn import input_data
304 | 
305 |         X_train, Y_train, X_test, Y_test = self.predict_by_tfidf()
306 |         X1_train, Y1_train, X1_test, Y1_test = self.predict_by_d2v_dbow()
307 |         X2_train, Y2_train, X2_test, Y2_test = self.predict_by_emotion()
308 |         X3_train, X3_test = input_data.load_data_label_combine(X_train, X_test, X1_train, X1_test)
309 |         train_list_side, text_list_side = input_data.load_data_label_combine(X3_train, X3_test, X2_train, X2_test)
310 |         mymean, pred_y = self.train_eval(train_list_side, Y_train, text_list_side, Y_test)
311 |         # print "综合特征+支持向量机　准确率平均值为: " + str(mymean)
312 |         LOG.info(" tiidf + d2v_dbow + emotion 综合特征+支持向量机　准确率平均值为: " + str(mymean))
313 | 
314 |     def get_acc(self, true_y, pred_y):
315 |         """
316 |         计算总的准确率和5个标签的准确率
317 |         :param sess:
318 |         :param true_y:
319 |         :param pred_y:
320 |         :return:
321 |         """
322 |         acc_list = []
323 |         for clazz in range(5):
324 |             true_class1 = true_y[:, clazz]
325 |             pred_class1 = pred_y[:, clazz]
326 |             acc = 0
327 |             for i in range(len(true_class1)):
328 |                 if true_class1[i] == pred_class1[i]:
329 |                     acc += 1
330 |             acc_list.append(acc * 1.0 / len(true_class1))
331 |         return acc_list
332 | 
333 |     def get_precision(self, true_y, pred_y):
334 |         """
335 |         返回五个标签的精确率
336 |         :param true_y:
337 |         :param pred_y:
338 |         :return:
339 |         """
340 |         precison_list = []
341 |         for clazz in range(5):
342 |             true_class1 = true_y[:, clazz]
343 |             pred_class1 = pred_y[:, clazz]
344 |             precison = 0
345 |             for i in range(len(true_class1)):
346 |                 if true_class1[i] == 1 and pred_class1[i] == 1:
347 |                     precison += 1
348 |             precison_list.append(precison * 1.0 / np.sum(pred_class1))
349 |         return precison_list
350 | 
351 |     def get_recall(self, true_y, pred_y):
352 |         """
353 |         返回5个标签的召回率
354 |         :param true_y:
355 |         :param pred_y:
356 |         :return:
357 |         """
358 |         recall_list = []
359 |         for clazz in range(5):
360 |             true_class1 = true_y[:, clazz]
361 |             pred_class1 = pred_y[:, clazz]
362 |             precison = 0
363 |             for i in range(len(true_class1)):
364 |                 if true_class1[i] == 1 and pred_class1[i] == 1:
365 |                     precison += 1
366 |             recall_list.append(precison * 1.0 / np.sum(true_class1))
367 |         return recall_list
368 | 
369 |     def get_f1(self, precison_list, recall_list):
370 |         """
371 |         返回5个标签的f1值
372 |         :param precison:
373 |         :param recall:
374 |         :return:
375 |         """
376 |         f1_list = []
377 |         for i in range(5):
378 |             precison = precison_list[i]
379 |             recall = recall_list[i]
380 |             f1_list.append((2 * precison * recall) / (precison + recall))
381 |         return f1_list
382 | 
383 | 
384 | 
385 | 
386 | if __name__ == '__main__':
387 |     user_predict = SVMCharacterPredict()
388 |     # user_predict.predict_by_combine()
389 |     # for _ in range(2):
390 |     #     user_predict.predict_by_combine_three()
391 | 
392 |     # # 训练10次
393 |     for _ in range(10):
394 |         LOG.info("=========开始第" + str(_ + 1) + "轮训练组合===========")
395 |         # fun2 = user_predict.predict_by_textmind()
396 |         fun3 = user_predict.predict_by_d2v_dbow()
397 |         # fun5 = user_predict.predict_by_tfidf()
398 |         # user_predict.predict_by_tfidf_stopword()
399 |         # f2name = 'textmind'
400 |         # f3name = 'dbow'
401 |         # f5name = 'tfidf'
402 |         # user_predict.predict_by_combine_two(fun2, fun3, f2name, f3name)
403 |         # user_predict.predict_by_combine_two(fun2, fun5, f2name, f5name)
404 |         # user_predict.predict_by_combine_two(fun3, fun5, f3name, f5name)
405 |         # user_predict.predict_by_combine()
406 | 


--------------------------------------------------------------------------------
/character_svm/svm_dbow_test.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | ''' doc2vc-svm stack for gender'''
  3 | 
  4 | from __future__ import division
  5 | 
  6 | import codecs
  7 | from sklearn import svm
  8 | from sklearn.externals import joblib
  9 | 
 10 | import gensim
 11 | from numpy import *
 12 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence
 13 | import os
 14 | import numpy as np
 15 | 
 16 | 
 17 | class user_predict:
 18 |     def __init__(self, train_document, text_document):
 19 |         self.train_document = train_document
 20 |         self.text_document = text_document
 21 | 
 22 |     # -----------------------准确值计算-----------------------
 23 |     def myAcc(self, y_true, y_pred):
 24 |         true_num = 0
 25 |         # 最大数的索引
 26 |         y_pred = np.argmax(y_pred, axis=1)
 27 | 
 28 |         # for i in range(y_true.__len__()):
 29 |         #     print y_true[i]
 30 |         for i in range(y_pred.__len__()):
 31 |             if y_true[i] == y_pred[i]:
 32 |                 true_num += 1
 33 |         return true_num
 34 | 
 35 |     # -----------------------load data-----------------------
 36 |     def load_data(self, doc):
 37 | 
 38 |         list_name = []
 39 |         list_total = []
 40 |         list_gender = []
 41 |         # 对应标签导入词典
 42 |         f = codecs.open(doc)
 43 |         temp = f.readlines()
 44 |         print(len(temp))
 45 | 
 46 |         for i in range(len(temp)):
 47 |             temp[i] = temp[i].split(" ")
 48 |             user_name = temp[i][0]
 49 |             tags = temp[i][1:6]
 50 | 
 51 |             query = temp[i][6:]
 52 |             query = " ".join(query).strip().replace("\n", "")
 53 |             list_total.append(query)
 54 |             list_gender.append(tags)
 55 | 
 56 |         print(list_total.__len__())
 57 |         print(list_gender.__len__())
 58 |         # 标签转化,男:0,女:1
 59 |         list_tag = []
 60 |         for line in list_gender:
 61 |             list_t = []
 62 |             for j in line:
 63 |                 j = int(j)
 64 |                 list_t.append(j)
 65 |             list_tag.append(list_t)
 66 | 
 67 |         print("data have read ")
 68 |         return list_total, list_tag
 69 | 
 70 |     # -------------------------prepare d2w svd -----------------------
 71 |     def prepare_lsi(self, doc):
 72 | 
 73 |         list_total, list_tag = self.load_data(doc)
 74 | 
 75 |         stop_word = []
 76 | 
 77 |         # 构建语料库
 78 |         X_doc = []
 79 |         TaggededDocument = gensim.models.doc2vec.TaggedDocument
 80 |         for i in range(list_total.__len__()):
 81 |             word_list = list_total[i]
 82 |             document = TaggededDocument(word_list, tags=[i])
 83 |             X_doc.append(document)
 84 | 
 85 |         return X_doc, list_total, list_tag
 86 | 
 87 |     def train_lsi_model(self, doc):
 88 | 
 89 |         X_doc, list_total, list_tag = self.prepare_lsi(doc)
 90 |         # 训练模型
 91 |         model_dm = Doc2Vec(X_doc, dm=0, size=300, negative=5, hs=0, min_count=1, window=30, sample=1e-5, workers=8,
 92 |                            alpha=0.04, min_alpha=0.025)
 93 |         joblib.dump(model_dm, "model_d2v_dbow.model")
 94 |         print("d2w模型训练完成")
 95 | 
 96 |         return model_dm
 97 | 
 98 |     def write_d2v(self, X_sp, doc_name):
 99 |         """
100 |         保存doc2vec的特征向量
101 |         :param X_sp:
102 |         :param doc_name:
103 |         :return:
104 |         """
105 |         np.save("doc2vec_" + doc_name + ".npy",X_sp)
106 | 
107 |         print("*****************write done over *****************")
108 | 
109 |     def train_lsi(self, doc, str_vec):
110 | 
111 |         if (os.path.exists("model_d2v_dbow.model")):
112 | 
113 |             # load train model
114 |             model_dm = joblib.load("model_d2v_dbow.model")
115 |         else:
116 |             # load train model
117 |             model_dm = self.train_lsi_model(doc)
118 | 
119 |         # prepare data
120 |         X_doc, list_total, list_tag = self.prepare_lsi(doc)
121 | 
122 |         for i in range(10):
123 |             # 一个用户作为一个文件去进行d2v的计算
124 |             model_dm.train(X_doc, total_examples=model_dm.corpus_count, epochs=2)
125 |             X_d2v = np.array([model_dm.docvecs[i] for i in range(len(list_total))])
126 | 
127 |         print(X_d2v.shape)
128 | 
129 |         list_side = X_d2v
130 | 
131 |         self.write_d2v(list_side, str_vec)
132 |         print(" doc2vec 矩阵构建完成----------------")
133 | 
134 |         return list_total, list_tag, list_side
135 | 
136 |     # ------------------------my mean count------------------
137 | 
138 |     def mymean(self, list_predict_score, array_test):
139 |         num_total = 0
140 |         num_total = array_test.shape[0] * 5
141 |         print("total numbers : " + str(num_total))
142 |         return list_predict_score / (num_total)
143 | 
144 |     # ------------------------------begin to predict------------
145 |     def predict(self):
146 |         str1 = "train_vec_dbow"
147 |         str2 = "test_vec_dbow"
148 |         train_list_total, train_list_tag, train_list_side = self.train_lsi(self.train_document, str1)
149 |         print("train model done -------------------")
150 | 
151 |         text_list_total, text_list_tag, text_list_side = self.train_lsi(self.text_document, str2)
152 |         print("text model done  -------------------")
153 | 
154 |         TR = train_list_total.__len__()
155 |         TE = text_list_total.__len__()
156 |         n = 5
157 | 
158 |         train_list_side = mat(train_list_side)
159 |         text_list_side = mat(text_list_side)
160 | 
161 |         X_train = train_list_side[:TR]
162 |         y_train = train_list_tag[:TR]
163 |         y_train = np.array(y_train)
164 | 
165 |         print("train shape :---------------------")
166 |         print(X_train.shape)
167 | 
168 |         X_text = text_list_side[:TE]
169 |         y_text = text_list_tag[:TE]
170 |         y_text = np.array(y_text)
171 | 
172 |         print("text shape :---------------------")
173 |         print(X_text.shape)
174 | 
175 |         # kfold折叠交叉验证
176 |         list_myAcc = []
177 |         true_acc = 0
178 | 
179 |         for i in range(5):
180 |             list_train_tags = []
181 |             list_test_tags = []
182 |             print("第" + str(i) + "个分类器训练")
183 | 
184 |             # first build train tag
185 |             for line in y_train:
186 |                 list_train_tags.append(line[i])
187 | 
188 |             # first build text tag
189 |             for line in y_text:
190 |                 list_test_tags.append(line[i])
191 | 
192 |             clf = svm.SVC(probability=True)
193 | 
194 |             clf = svm.SVC(kernel='linear', probability=True)
195 | 
196 |             # 逻辑回归训练模型
197 |             clf.fit(X_train, list_train_tags)
198 |             # 用模型预测
199 |             y_pred_te = clf.predict_proba(X_text)
200 | 
201 |             print(np.argmax(y_pred_te, axis=1))
202 |             print("**" * 50)
203 |             print(list_test_tags)
204 | 
205 |             # #获取准确的个数
206 |             print(self.myAcc(list_test_tags, y_pred_te))
207 |             true_acc += self.myAcc(list_test_tags, y_pred_te)
208 | 
209 |         print("true acc numbers: " + str(true_acc))
210 | 
211 |         print("d2w_dbow + 支持向量机　准确率平均值为: ")
212 |         print(self.mymean(true_acc, X_text))
213 | 
214 | 
215 | if __name__ == '__main__':
216 |     base_dir = 'E:\\Koo\\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\data\essay_data'
217 |     user_predict = user_predict(os.path.join(base_dir, "vocab1_train.txt"), os.path.join(base_dir, "vocab1_test.txt"))
218 |     user_predict.predict()


--------------------------------------------------------------------------------
/character_svm/svm_tfidf_test.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | """
  3 | tfidf模型构建特征并计算运算结果
  4 | """
  5 | from __future__ import division
  6 | 
  7 | import codecs
  8 | import os
  9 | 
 10 | from sklearn.feature_extraction.text import TfidfVectorizer
 11 | from sklearn import svm
 12 | from sklearn.externals import joblib
 13 | from numpy import *
 14 | from gensim import models, corpora
 15 | import numpy as np
 16 | 
 17 | INPUT_SIZE = 300  # 训练特征维度参数
 18 | 
 19 | 
 20 | class user_predict:
 21 |     def __init__(self, train_document, text_document):
 22 |         self.train_document = train_document
 23 |         self.text_document = text_document
 24 | 
 25 |     # -----------------------准确值计算-----------------------
 26 |     def myAcc(self, y_true, y_pred):
 27 |         true_num = 0
 28 |         # 最大数的索引
 29 |         y_pred = np.argmax(y_pred, axis=1)
 30 | 
 31 |         # for i in range(y_true.__len__()):
 32 |         #     print y_true[i]
 33 |         for i in range(y_pred.__len__()):
 34 |             if y_true[i] == y_pred[i]:
 35 |                 true_num += 1
 36 |         return true_num
 37 | 
 38 |     # -----------------------load data-----------------------
 39 |     def load_data(self, doc):
 40 | 
 41 |         list_name = []
 42 |         list_total = []
 43 |         list_gender = []
 44 |         # 对应标签导入词典
 45 |         f = codecs.open(doc)
 46 |         temp = f.readlines()
 47 |         print(len(temp))
 48 | 
 49 |         for i in range(len(temp)):
 50 |             temp[i] = temp[i].split(" ")
 51 |             user_name = temp[i][0]
 52 |             tags = temp[i][1:6]
 53 | 
 54 |             query = temp[i][6:]
 55 |             query = " ".join(query).strip().replace("\n", "")
 56 |             list_total.append(query)
 57 |             list_gender.append(tags)
 58 | 
 59 |         print(list_total.__len__())
 60 |         print(list_gender.__len__())
 61 |         list_tag = []
 62 |         for line in list_gender:
 63 |             list_t = []
 64 |             for j in line:
 65 |                 j = int(j)
 66 |                 list_t.append(j)
 67 |             list_tag.append(list_t)
 68 | 
 69 |         print("data have read ")
 70 |         return list_total, list_tag
 71 | 
 72 |     def load_stopword(self):
 73 |         """
 74 |         加载停用词语
 75 |         :param stopworddoc:
 76 |         :return:
 77 |         """
 78 |         stop_word = []
 79 |         return stop_word
 80 |         # with open('EN_Stopword.txt') as f:
 81 |         #     lines = f.readlines()
 82 |         #     for line in lines:
 83 |         #         word = line.replace('\n', '')
 84 |         #         if word != '':
 85 |         #             stop_word.append(word)
 86 |         # with open('ENstopwords.txt') as f:
 87 |         #     lines = f.readlines()
 88 |         #     for line in lines:
 89 |         #         word = line.replace('\n', '')
 90 |         #         if word != '':
 91 |         #             stop_word.append(word)
 92 |         #
 93 |         # return list(set(stop_word))
 94 | 
 95 |     # -------------------------prepare lsi svd -----------------------
 96 |     def prepare_lsi(self, doc):
 97 | 
 98 |         # 给训练集用的
 99 |         list_total, list_tag = self.load_data(doc)
100 | 
101 |         stop_word = self.load_stopword()
102 | 
103 |         texts = [[word for word in document.lower().split() if word not in stop_word]
104 |                  for document in list_total]
105 | 
106 |         # train dictionary done# 抽取一个bag-of-words，将文档的token映射为id
107 |         dictionary = corpora.Dictionary(texts)  # 生成词典 # {'a': 0, 'damaged': 1, 'gold': 3, 'fire': 2}
108 |         # print dictionary.token2id
109 |         # 产生文档向量，将用字符串表示的文档转换为用id和词频表示的文档向量
110 |         corpus = [dictionary.doc2bow(text) for text in texts]
111 |         # [[(0, 1), (6, 1)], [(0, 1), (9, 2), (10, 1)], [(0, 1), (3, 1)]]
112 |         # 例如（9，2）这个元素代表第二篇文档中id为9的单词出现了2次
113 | 
114 |         # 用TFIDF的方法计算词频,sublinear_tf 表示学习率
115 |         tfv = TfidfVectorizer(min_df=1, max_df=0.95, sublinear_tf=True, stop_words=stop_word)
116 |         # 对文本中所有的用户对应的所有的评论里面的单词进行ＴＦＩＤＦ的计算，找出每个词对应的tfidf值
117 |         X_sp = tfv.fit_transform(list_total)
118 |         # train model done基于这些“训练文档”计算一个TF-IDF模型
119 |         tfidf_model = models.TfidfModel(corpus)
120 |         joblib.dump(tfidf_model, "tfidf_model.model")
121 | 
122 |         # 转化文档向量，将用词频表示的文档向量表示为一个用tf-idf值表示的文档向量
123 |         corpus_tfidf = tfidf_model[corpus]
124 |         # [[(1, 0.6633689723434505), (2, 0.6633689723434505)],[(7, 0.16073253746956623), (8, 0.4355066251613605)]]
125 | 
126 |         # 训练LSI模型 即将训练文档向量组成的矩阵SVD分解，并做一个秩为2的近似SVD分解
127 |         lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=INPUT_SIZE)
128 |         joblib.dump(dictionary, "tfidf_dictionary.dict")
129 |         print("训练集lsi -----")
130 |         joblib.dump(lsi_model, "tfidf_lsi.model")
131 | 
132 |         return tfidf_model, dictionary
133 | 
134 |     def train_lsi(self, doc, str_doc):
135 |         if not (os.path.exists("tfidf_model.model")):
136 |             print("prepare model")
137 |             # load train model
138 |             tfidf_model, dictionary = self.prepare_lsi(doc)
139 |             # load data
140 |             list_total, list_tag = self.load_data(doc)
141 |             stop_word = self.load_stopword()
142 |             texts = [[word for word in document.lower().split() if word not in stop_word]
143 |                      for document in list_total]
144 |             corpus = [dictionary.doc2bow(text) for text in texts]
145 | 
146 |         else:
147 |             print("use model")
148 |             # load train valid text
149 |             tfidf_model = joblib.load("tfidf_model.model")
150 |             dictionary = joblib.load("tfidf_dictionary.dict")
151 |             # load data
152 |             list_total, list_tag = self.load_data(doc)
153 |             stop_word = self.load_stopword()
154 |             texts = [[word for word in document.lower().split() if word not in stop_word]
155 |                      for document in list_total]
156 |             corpus = [dictionary.doc2bow(text) for text in texts]
157 |         lsi_model = joblib.load("tfidf_lsi.model")
158 |         corpus_tfidf = tfidf_model[corpus]
159 |         list_side = []
160 |         corpus_lsi = lsi_model[corpus_tfidf]
161 |         nodes = list(corpus_lsi)
162 | 
163 |         for i in range(len(nodes)):
164 |             list_d = []
165 |             for j in range(INPUT_SIZE):
166 |                 # print nodes[i][j]
167 |                 list_d.append(nodes[i][j][1])
168 |             list_side.append(list_d)
169 | 
170 |         list_vec = mat(list_side)
171 |         self.write_d2v(list_vec, str_doc)
172 |         print("lsi 矩阵构建完成----------------")
173 |         return list_total, list_tag, list_side
174 | 
175 |     # -----------------------write vec--------------------
176 |     def write_d2v(self, X_sp, doc_name):
177 |         file_name = "tfidf_" + doc_name + ".npy"
178 |         np.save(file_name, X_sp)
179 |         print("*****************write done over *****************")
180 | 
181 |     # ------------------------my mean count------------------
182 |     def mymean(self, list_predict_score, array_test):
183 |         num_total = 0
184 |         num_total = array_test.shape[0] * 5
185 |         print("total numbers : " + str(num_total))
186 |         return list_predict_score / (num_total)
187 | 
188 |     # ------------------------------begin to predict------------
189 |     def predict(self):
190 |         str1 = "train_vec_tfidf"
191 |         str2 = "test_vec_tfidf"
192 |         train_list_total, train_list_tag, train_list_side = self.train_lsi(self.train_document, str1)
193 |         print("train model done -------------------")
194 |         text_list_total, text_list_tag, text_list_side = self.train_lsi(self.text_document, str2)
195 |         print("text model done  -------------------")
196 |         TR = train_list_total.__len__()
197 |         TE = text_list_total.__len__()
198 |         n = 5
199 |         train_list_side = mat(train_list_side)
200 |         text_list_side = mat(text_list_side)
201 |         X_train = train_list_side[:TR]
202 |         y_train = train_list_tag[:TR]
203 |         y_train = np.array(y_train)
204 | 
205 |         print("train shape :---------------------")
206 |         print(X_train.shape)
207 | 
208 |         X_text = text_list_side[:TE]
209 |         y_text = text_list_tag[:TE]
210 |         y_text = np.array(y_text)
211 | 
212 |         print("text shape :---------------------")
213 |         print(X_text.shape)
214 | 
215 |         # kfold折叠交叉验证
216 |         list_myAcc = []
217 |         self.train_eval(X_train, y_train, X_text, y_text)
218 | 
219 |     def train_eval(self, X_train, y_train, X_text, y_text):
220 |         true_acc = 0
221 |         for i in range(5):
222 |             list_train_tags = []
223 |             list_test_tags = []
224 |             print("第" + str(i) + "个分类器训练")
225 | 
226 |             # first build train tag
227 |             for line in y_train:
228 |                 list_train_tags.append(line[i])
229 | 
230 |             # first build text tag
231 |             for line in y_text:
232 |                 list_test_tags.append(line[i])
233 | 
234 |             clf = svm.SVC(probability=True)
235 | 
236 |             clf = svm.SVC(kernel='linear', probability=True)
237 |             # 逻辑回归训练模型
238 |             clf.fit(X_train, list_train_tags)
239 |             # 用模型预测
240 |             y_pred_te = clf.predict_proba(X_text)
241 | 
242 |             print(np.argmax(y_pred_te, axis=1))
243 |             print("**" * 50)
244 |             print(list_test_tags)
245 | 
246 |             # #获取准确的个数
247 |             print(self.myAcc(list_test_tags, y_pred_te))
248 |             true_acc += self.myAcc(list_test_tags, y_pred_te)
249 |         print("true acc numbers: " + str(true_acc))
250 |         print("LSI + 支持向量机　准确率平均值为: ")
251 |         print(self.mymean(true_acc, X_text))
252 | 
253 | 
254 | if __name__ == '__main__':
255 |     base_dir = 'E:\\Koo\\Projects\\PycharmProjects\\TensorFlow_DNN_Character_Classification\\data\essay_data'
256 |     user_predict = user_predict(os.path.join(base_dir, "vocab1_train.txt"),
257 |                                 os.path.join(base_dir, "vocab1_test.txt"))
258 |     # for _ in range(9):
259 |     # print('训练次数', _)
260 |     user_predict.predict()
261 | 


--------------------------------------------------------------------------------
/data/essay_data/essays.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/essay_data/essays.csv


--------------------------------------------------------------------------------
/data/label/test_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/label/test_label.npy


--------------------------------------------------------------------------------
/data/label/train_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/label/train_label.npy


--------------------------------------------------------------------------------
/data/vec/emotion_test_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/emotion_test_vec.npy


--------------------------------------------------------------------------------
/data/vec/emotion_train_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/emotion_train_vec.npy


--------------------------------------------------------------------------------
/data/vec/textmind_test_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/textmind_test_vec.npy


--------------------------------------------------------------------------------
/data/vec/textmind_train_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/data/vec/textmind_train_vec.npy


--------------------------------------------------------------------------------
/features/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/__init__.py


--------------------------------------------------------------------------------
/features/crawl_textmind_data/README.md:
--------------------------------------------------------------------------------
1 | # [请求文心系统](http://ccpl.psych.ac.cn/textmind/)获取文本特征


--------------------------------------------------------------------------------
/features/crawl_textmind_data/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'gu'
2 | 


--------------------------------------------------------------------------------
/features/crawl_textmind_data/crawler.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8
  2 | 
  3 | """
  4 | 爬取文心系统的数据，提取特征
  5 | """
  6 | 
  7 | import http.cookiejar
  8 | import json
  9 | import urllib.request
 10 | import urllib.parse
 11 | import numpy as np
 12 | 
 13 | from features.crawl_textmind_data import input_textmind_data
 14 | 
 15 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:35.0) Gecko/20100101 Firefox/35.0'}
 16 | 
 17 | 
 18 | class Crawler:
 19 |     def __init__(self):
 20 |         self.cj = http.cookiejar.LWPCookieJar()
 21 |         # 将一个保存cookie对象，和一个HTTP的cookie的处理器绑定
 22 |         self.cookie_processor = urllib.request.HTTPCookieProcessor(self.cj)
 23 |         # 创建一个opener，将保存了cookie的http处理器，还有设置一个handler用于处理http的URL的打开
 24 |         self.opener = urllib.request.build_opener(self.cookie_processor,
 25 |                                                   urllib.request.HTTPHandler)  # 将包含了cookie、http处理器、http的handler的资源和urllib2对象绑定在一起
 26 |         urllib.request.install_opener(self.opener)
 27 | 
 28 |     def doPost(self, text):
 29 |         print("正在请求文心...")
 30 |         PostData = {
 31 |             "str": text
 32 |         }
 33 |         PostData = urllib.parse.urlencode(PostData).encode("utf-8")
 34 |         request = urllib.request.Request('http://ccpl.psych.ac.cn/textmind/analysis', headers=headers)
 35 |         with urllib.request.urlopen(request, data=PostData) as f:
 36 |             resp = f.read()
 37 |             print(resp)
 38 |             return resp
 39 | 
 40 |     def parse_textmind_feature(self, json_str):
 41 |         feature_list = []
 42 |         json_dict = json.loads(json_str)
 43 |         print(json_dict)
 44 |         if json_dict['status'] == 'success':
 45 |             result_list = json_dict['result']
 46 |             for elem in result_list:
 47 |                 name = elem['name']
 48 |                 value = elem['value']
 49 |                 feature_list.append(value)
 50 |         else:
 51 |             raise ValueError('文心系统分析返回数据异常')
 52 |         return feature_list
 53 | 
 54 |     def save_arr(self, filename, X_sp):
 55 |         """
 56 |         特征向量保存
 57 |         """
 58 |         np.save(filename, X_sp)
 59 |         print("*****************write done over *****************")
 60 | 
 61 |     def textmind_action(self, train_lines, test_lines):
 62 |         """
 63 |         输入文本[] 保存特征
 64 |         :param train_lines:
 65 |         :param test_lines:
 66 |         :return:
 67 |         """
 68 |         X_train, y_train = self.get_input_output(train_lines)
 69 |         X_test, y_test = self.get_input_output(test_lines)
 70 | 
 71 |         textmind_train_vec_dm = "textmind_train_vec.npy"
 72 |         textmind_train_label_dm = "train_label.npy"
 73 |         textmind_test_vec_dm = "textmind_test_vec.npy"
 74 |         textmind_test_label_dm = "test_label.npy"
 75 | 
 76 |         self.save_arr(textmind_train_vec_dm, np.array(X_train))
 77 |         self.save_arr(textmind_train_label_dm, np.array(y_train))
 78 |         self.save_arr(textmind_test_vec_dm, np.array(X_test))
 79 |         self.save_arr(textmind_test_label_dm, np.array(y_test))
 80 | 
 81 |     def get_input_output(self, lines):
 82 |         """
 83 |         输入文本的lines 返回每行对应的文心特征 和 对应的标签
 84 |         :param lines:
 85 |         :return:
 86 |         """
 87 |         list_input_feature = []
 88 |         list_output_tag = []
 89 |         for t_line in lines:
 90 |             temp = t_line.split()
 91 |             user_name = temp[0]
 92 |             tags = temp[1:6]
 93 |             query = temp[6:]
 94 |             print(query)
 95 |             query = " ".join(query).strip().replace("\n", "")
 96 |             json_str = self.doPost(query)
 97 |             feature_list = self.parse_textmind_feature(json_str)
 98 |             list_input_feature.append(feature_list)
 99 | 
100 |             list_tag = []
101 |             for tag in tags:
102 |                 j = int(tag)
103 |                 list_tag.append(j)
104 |             list_output_tag.append(list_tag)
105 |         return list_input_feature, list_output_tag
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     craw = Crawler()
110 |     train_lines, test_lines = input_textmind_data.load_corpus()
111 |     craw.textmind_action(train_lines, test_lines)
112 | 


--------------------------------------------------------------------------------
/features/crawl_textmind_data/input_textmind_data.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | """读取文件"""
 3 | import os
 4 | 
 5 | from tensorflow.python.platform import gfile
 6 | 
 7 | __author__ = 'gu'
 8 | 
 9 | import numpy as np
10 | 
11 | 
12 | def load_corpus():
13 |     """
14 |     加载训练数据
15 |     :return:
16 |     """
17 |     base_dir = 'E:\Koo\Projects\PycharmProjects\TensorFlow_DNN_Character_Classification\data\essay_data'
18 | 
19 |     train_txt_path = os.path.join(base_dir, "vocab1_train.txt")
20 |     test_txt_path = os.path.join(base_dir, "vocab1_test.txt")
21 | 
22 |     return read_lines(train_txt_path), read_lines(test_txt_path)
23 | 
24 | 
25 | def read_lines(train_txt_path):
26 |     with gfile.Open(train_txt_path, 'r') as f:
27 |         lines = f.readlines()
28 |         for line in lines:
29 |             print(line)
30 |         print('txt lines length', len(lines))
31 |         return lines
32 | 
33 | 
34 | def load_textmind_data_label(base_model_dir):
35 |     """
36 |     加载textmind矩阵
37 |     :param base_model_dir:
38 |     :return:
39 |     """
40 |     textmind_train_vec = "textmind_train_vec.npy"
41 |     textmind_train_label = "train_label.npy"
42 |     textmind_test_vec = "textmind_test_vec.npy"
43 |     textmind_test_label = "test_label.npy"
44 | 
45 |     train_vec_filename = os.path.join(base_model_dir, textmind_train_vec)
46 |     train_label_filename = os.path.join(base_model_dir, textmind_train_label)
47 |     test_vec_filename = os.path.join(base_model_dir, textmind_test_vec)
48 |     test_label_filename = os.path.join(base_model_dir, textmind_test_label)
49 | 
50 |     X_train = np.load(train_vec_filename)
51 |     print('X_train', X_train.shape)
52 |     Y_train = np.load(train_label_filename)
53 |     print('Y_train', Y_train.shape)
54 |     X_test = np.load(test_vec_filename)
55 |     print('X_test', X_test.shape)
56 |     Y_test = np.load(test_label_filename)
57 |     print('Y_test', Y_test.shape)
58 |     return X_train, Y_train, X_test, Y_test
59 | 
60 | 
61 | def load_textmind_data_label_with_normalization(base_model_dir):
62 |     """
63 |     加载textmind矩阵 并进行数据平滑
64 |     :param base_model_dir:
65 |     :return:
66 |     """
67 |     textmind_train_vec = "textmind_train_vec.npy"
68 |     textmind_train_label = "train_label.npy"
69 |     textmind_test_vec = "textmind_test_vec.npy"
70 |     textmind_test_label = "test_label.npy"
71 | 
72 |     train_vec_filename = os.path.join(base_model_dir, textmind_train_vec)
73 |     train_label_filename = os.path.join(base_model_dir, textmind_train_label)
74 |     test_vec_filename = os.path.join(base_model_dir, textmind_test_vec)
75 |     test_label_filename = os.path.join(base_model_dir, textmind_test_label)
76 | 
77 |     X_train = np.load(train_vec_filename)
78 |     print('X_train', X_train.shape)
79 |     Y_train = np.load(train_label_filename)
80 |     print('Y_train', Y_train.shape)
81 |     X_test = np.load(test_vec_filename)
82 |     print('X_test', X_test.shape)
83 |     Y_test = np.load(test_label_filename)
84 |     print('Y_test', Y_test.shape)
85 |     X_train_1 = np.where(X_train >= 0, np.log(X_train + 1), 0)
86 |     X_test_1 = np.where(X_test >= 0, np.log(X_test + 1), 0)
87 |     return X_train_1, Y_train, X_test_1, Y_test
88 | 
89 | 
90 | if __name__ == '__main__':
91 |     X_train, Y_train, X_test, Y_test = load_textmind_data_label_with_normalization('')
92 |     print(X_test)
93 |     print(Y_test)
94 | 


--------------------------------------------------------------------------------
/features/crawl_textmind_data/test_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/test_label.npy


--------------------------------------------------------------------------------
/features/crawl_textmind_data/textmind_test_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/textmind_test_vec.npy


--------------------------------------------------------------------------------
/features/crawl_textmind_data/textmind_train_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/textmind_train_vec.npy


--------------------------------------------------------------------------------
/features/crawl_textmind_data/train_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/crawl_textmind_data/train_label.npy


--------------------------------------------------------------------------------
/features/doc2vec/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/__init__.py


--------------------------------------------------------------------------------
/features/doc2vec/doc2vec_action.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | """
  3 | 基于doc2vec的提取文本的特征特征
  4 | """
  5 | import codecs
  6 | import os
  7 | from sklearn.externals import joblib
  8 | import gensim
  9 | from gensim.models.doc2vec import Doc2Vec, LabeledSentence
 10 | from numpy import mat
 11 | 
 12 | import numpy as np
 13 | 
 14 | __author__ = 'gu'
 15 | 
 16 | 
 17 | class D2vAction:
 18 |     def __init__(self, base_model_dir, train_document, text_document):
 19 |         """
 20 |         初始化路径
 21 |         :param base_model_dir: 存放模型的目录
 22 |         :param train_document: 训练样本
 23 |         :param text_document: 测试样本
 24 |         :return:
 25 |         """
 26 |         self.base_model_dir = base_model_dir
 27 |         self.train_document = train_document
 28 |         self.text_document = text_document
 29 | 
 30 |     def train_lsi(self, doc):
 31 |         model_d2v_dm_path = os.path.join(self.base_model_dir, "model_d2v_dm.model")
 32 |         if os.path.exists(model_d2v_dm_path):
 33 |             print("已经存在模型！")
 34 |             model_dm = joblib.load(model_d2v_dm_path)
 35 |         else:
 36 |             model_dm = self.train_lsi_model(doc)
 37 |         X_doc, list_total, list_tag = self.prepare_lsi(doc)
 38 |         for i in range(10):
 39 |             # 一个用户作为一个文件去进行d2v的计算
 40 |             model_dm.train(X_doc, total_examples=model_dm.corpus_count, epochs=2)
 41 |             X_d2v = np.array([model_dm.docvecs[i] for i in range(len(list_total))])
 42 |         print(X_d2v.shape)
 43 |         list_side = X_d2v  # doc2vec 矩阵
 44 |         print(" doc2vec 矩阵构建完成----------------")
 45 |         return list_total, list_tag, list_side
 46 | 
 47 |     def train_lsi_model(self, doc):
 48 |         X_doc, list_total, list_tag = self.prepare_lsi(doc)
 49 |         # 训练模型
 50 |         model_dm = Doc2Vec(X_doc, dm=1, size=300, negative=5, hs=0, min_count=5, window=8, sample=1e-5, workers=4,
 51 |                            alpha=0.025, min_alpha=0.025)
 52 |         joblib.dump(model_dm, "model_d2v_dm.model")
 53 |         print("d2w模型训练完成")
 54 |         return model_dm
 55 | 
 56 |     def prepare_lsi(self, doc):
 57 |         # 返回文本和标签
 58 |         list_total, list_tag = self.load_data(doc)
 59 |         # 构建语料库
 60 |         X_doc = []
 61 |         TaggededDocument = gensim.models.doc2vec.TaggedDocument
 62 |         for i in range(list_total.__len__()):
 63 |             word_list = list_total[i]
 64 |             document = TaggededDocument(word_list, tags=[i])
 65 |             X_doc.append(document)
 66 |         return X_doc, list_total, list_tag
 67 | 
 68 |     def load_data(self, doc):
 69 |         list_name = []
 70 |         list_total = []
 71 |         list_gender = []
 72 |         # 对应标签导入词典
 73 |         f = codecs.open(doc)
 74 |         temp = f.readlines()
 75 |         f.close()
 76 | 
 77 |         for i in range(len(temp)):
 78 |             temp[i] = temp[i].split(" ")
 79 |             user_name = temp[i][0]
 80 |             tags = temp[i][1:6]
 81 | 
 82 |             query = temp[i][6:]
 83 |             query = " ".join(query).strip().replace("\n", "")
 84 | 
 85 |             list_total.append(query)
 86 |             list_gender.append(tags)
 87 | 
 88 |         list_tag = []
 89 |         for line in list_gender:
 90 |             list_t = []
 91 |             for j in line:
 92 |                 j = int(j)
 93 |                 list_t.append(j)
 94 |             list_tag.append(list_t)
 95 | 
 96 |         print("data have read ")
 97 |         return list_total, list_tag
 98 | 
 99 |     def get_d2v_feature(self):
100 |         train_list_total, train_list_tag, train_list_side = self.train_lsi(self.train_document)
101 |         print("train model done -------------------")
102 | 
103 |         text_list_total, text_list_tag, text_list_side = self.train_lsi(self.text_document)
104 |         print("text model done  -------------------")
105 | 
106 |         TR = train_list_total.__len__()
107 |         TE = text_list_total.__len__()
108 |         # 将输入解释为矩阵。
109 |         train_list_side = mat(train_list_side)
110 |         text_list_side = mat(text_list_side)
111 |         # train_list_tag = mat(train_list_tag, dtype=float)
112 |         # text_list_tag = mat(text_list_tag, dtype=float)
113 | 
114 |         X_train = train_list_side[:TR]
115 |         y_train = train_list_tag[:TR]
116 |         y_train = np.array(y_train)
117 |         print("train shape :---------------------")
118 |         print(X_train.shape)
119 | 
120 |         X_text = text_list_side[:TE]
121 |         y_text = text_list_tag[:TE]
122 |         y_text = np.array(y_text)
123 |         print("text shape :---------------------")
124 |         print(X_text.shape)
125 |         print(train_list_side)
126 |         print(train_list_tag)
127 |         print(text_list_side)
128 |         print(text_list_tag)
129 | 
130 |         return train_list_side, train_list_tag, text_list_side, text_list_tag
131 | 
132 |     def write_d2v(self, filename, X_sp):
133 |         """
134 |         doc2vec的特征向量保存
135 |         :param X_sp:
136 |         :param doc_name:
137 |         :return:
138 |         """
139 |         np.save(filename, X_sp)
140 |         print("*****************write done over *****************")
141 | 
142 | 
143 | def load_data_label():
144 |     """
145 |     加载训练数据
146 |     :return:
147 |     """
148 |     base_dir = 'E:\Koo\Projects\PycharmProjects\TensorFlow_DNN_Character_Classification\data\essay_data'
149 |     base_model_dir = ''
150 |     d2vAction = D2vAction(base_model_dir,
151 |                           os.path.join(base_dir, "vocab1_train.txt"),
152 |                           os.path.join(base_dir, "vocab1_test.txt"))
153 |     train_list_side, train_list_tag, text_list_side, text_list_tag = d2vAction.get_d2v_feature()
154 |     str1 = "doc2vec_train_vec_dm.npy"
155 |     str1_1 = "train_label.npy"
156 |     str2 = "doc2vec_test_vec_dm.npy"
157 |     str2_2 = "test_label.npy"
158 |     d2vAction.write_d2v(str1, np.array(train_list_side))
159 |     d2vAction.write_d2v(str1_1, np.array(train_list_tag))
160 |     d2vAction.write_d2v(str2, np.array(text_list_side))
161 |     d2vAction.write_d2v(str2_2, np.array(text_list_tag))
162 | 
163 | 
164 | if __name__ == '__main__':
165 |     load_data_label()
166 | 


--------------------------------------------------------------------------------
/features/doc2vec/doc2vec_test_vec_dm.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/doc2vec_test_vec_dm.npy


--------------------------------------------------------------------------------
/features/doc2vec/doc2vec_train_vec_dm.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/doc2vec_train_vec_dm.npy


--------------------------------------------------------------------------------
/features/doc2vec/test_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/test_label.npy


--------------------------------------------------------------------------------
/features/doc2vec/train_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/doc2vec/train_label.npy


--------------------------------------------------------------------------------
/features/emotion_lexicon/README.md:
--------------------------------------------------------------------------------
1 | # [英文情感词汇](http://www.saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm)获取情感特征


--------------------------------------------------------------------------------
/features/emotion_lexicon/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'gu'
2 | 


--------------------------------------------------------------------------------
/features/emotion_lexicon/data_helper.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from collections import defaultdict
  3 | import re
  4 | import csv
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | 
 10 | def build_emotion_lexicon_dict(datafile):
 11 |     """
 12 |     load emotion csv
 13 |     """
 14 |     print('loading emotion dict...')
 15 |     vocab_emotion_dict = defaultdict(float)
 16 |     with open(datafile, "rb") as csvf:
 17 |         csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
 18 |         first_line = True
 19 |         for line in csvreader:
 20 |             if first_line:
 21 |                 first_line = False
 22 |                 continue
 23 |             status = []
 24 |             # print(line)
 25 |             try:
 26 |                 line.remove('')
 27 |             except ValueError:
 28 |                 None
 29 |             word = line[0]
 30 |             orig_rev = word.strip().lower()
 31 |             status.append(orig_rev)
 32 |             # print(orig_rev)
 33 |             word_emotion_value = []
 34 |             for value in line[1:]:
 35 |                 word_emotion_value.append(1 if value == '1' else 0)
 36 |             # print(word_emotion_value)
 37 |             vocab_emotion_dict[orig_rev] = word_emotion_value
 38 |     print('emotion word size are %s ' % len(vocab_emotion_dict))
 39 |     return vocab_emotion_dict
 40 | 
 41 | 
 42 | def build_emotion_feature(filename, vocab_emotion_dict):
 43 |     """
 44 |     build emotion feature
 45 |     """
 46 |     X_input = []
 47 |     y_output = []
 48 |     with open(filename, "rb") as f:
 49 |         lines = f.readlines()
 50 |         for line in lines:
 51 |             text = line.strip().split()
 52 |             y = [1 if s == '1' else 0 for s in text[1:6]]
 53 |             emotion_value = []
 54 |             for word in text[6:]:
 55 |                 if vocab_emotion_dict.__contains__(word):
 56 |                     word_emotion_values = vocab_emotion_dict[word]
 57 |                     emotion_value.append(word_emotion_values)
 58 |             value = map(sum, zip(*emotion_value))
 59 |             X_input.append(value)
 60 |             y_output.append(y)
 61 |     X_input = np.mat(X_input)
 62 |     y_output = np.mat(y_output)
 63 |     print('X_input.shape', X_input.shape)
 64 |     print('y_output.shape', y_output.shape)
 65 |     return X_input, y_output
 66 | 
 67 | 
 68 | def save_arr(filename, X_sp):
 69 |     np.save(filename, X_sp)
 70 |     print('write done', filename)
 71 | 
 72 | 
 73 | def load_data_label():
 74 |     """
 75 |     load and save arr
 76 |     :return:
 77 |     """
 78 |     base_dir = ''
 79 |     data_folder = os.path.join(base_dir, 'Emotion_Lexicon.csv')
 80 |     print("loading data...")
 81 |     emotion_dict = build_emotion_lexicon_dict(data_folder)
 82 |     X_train, y_train = build_emotion_feature('../data/essay_data/vocab1_train.txt', emotion_dict)
 83 |     X_test, y_test = build_emotion_feature('../data/essay_data/vocab1_test.txt', emotion_dict)
 84 |     save_arr('emotion_train_vec.npy', X_train)
 85 |     save_arr('emotion_train_label.npy', y_train)
 86 |     save_arr('emotion_test_vec.npy', X_test)
 87 |     save_arr('emotion_test_label.npy', y_test)
 88 |     return X_train, y_train, X_test, y_test
 89 | 
 90 | 
 91 | def load_emotion_data_label(base_model_dir):
 92 |     """
 93 |     load .npy file for arr
 94 |     :param base_model_dir:
 95 |     :return:
 96 |     """
 97 |     train_vec_filename = os.path.join(base_model_dir, "emotion_train_vec.npy")
 98 |     train_label_filename = os.path.join(base_model_dir, 'emotion_train_label.npy')
 99 |     test_vec_filename = os.path.join(base_model_dir, 'emotion_test_vec.npy')
100 |     test_label_filename = os.path.join(base_model_dir, 'emotion_test_label.npy')
101 |     X_train = np.load(train_vec_filename)
102 |     print('X_train', X_train.shape)
103 |     Y_train = np.load(train_label_filename)
104 |     print('Y_train', Y_train.shape)
105 |     X_test = np.load(test_vec_filename)
106 |     print('X_test', X_test.shape)
107 |     Y_test = np.load(test_label_filename)
108 |     print('Y_test', Y_test.shape)
109 |     return X_train, Y_train, X_test, Y_test
110 | 
111 | 
112 | if __name__ == "__main__":
113 |     # load_data_label()
114 |     load_emotion_data_label('')
115 | 


--------------------------------------------------------------------------------
/features/emotion_lexicon/emotion_test_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_test_label.npy


--------------------------------------------------------------------------------
/features/emotion_lexicon/emotion_test_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_test_vec.npy


--------------------------------------------------------------------------------
/features/emotion_lexicon/emotion_train_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_train_label.npy


--------------------------------------------------------------------------------
/features/emotion_lexicon/emotion_train_vec.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/emotion_lexicon/emotion_train_vec.npy


--------------------------------------------------------------------------------
/features/process_data1.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from collections import defaultdict
  3 | import re
  4 | import csv
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | 
 10 | def build_data_cv(datafile, cv=10, clean_string=True):
 11 |     """
 12 |     Loads data and split into 10 folds.
 13 |     """
 14 |     revs = []
 15 |     vocab = defaultdict(float)
 16 | 
 17 |     with open(datafile, "rb") as csvf:
 18 |         csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
 19 |         first_line = True
 20 |         for line in csvreader:
 21 |             if first_line:
 22 |                 first_line = False
 23 |                 continue
 24 |             status = []
 25 |             sentences = re.split(r'[.?]', line[1].strip())
 26 |             try:
 27 |                 sentences.remove('')
 28 |             except ValueError:
 29 |                 None
 30 | 
 31 |             for sent in sentences:
 32 |                 if clean_string:
 33 |                     orig_rev = clean_str(sent.strip())
 34 |                     if orig_rev == '':
 35 |                         continue
 36 |                     words = set(orig_rev.split())
 37 |                     splitted = orig_rev.split()
 38 |                     if len(splitted) > 150:
 39 |                         orig_rev = []
 40 |                         splits = int(np.floor(len(splitted) / 20))
 41 |                         for index in range(splits):
 42 |                             orig_rev.append(' '.join(splitted[index * 20:(index + 1) * 20]))
 43 |                         if len(splitted) > splits * 20:
 44 |                             orig_rev.append(' '.join(splitted[splits * 20:]))
 45 |                         status.extend(orig_rev)
 46 |                     else:
 47 |                         status.append(orig_rev)
 48 |                 else:
 49 |                     orig_rev = sent.strip().lower()
 50 |                     words = set(orig_rev.split())
 51 |                     status.append(orig_rev)
 52 | 
 53 |                 for word in words:
 54 |                     vocab[word] += 1
 55 | 
 56 |             datum = {"y0": 1 if line[2].lower() == 'y' else 0,
 57 |                      "y1": 1 if line[3].lower() == 'y' else 0,
 58 |                      "y2": 1 if line[4].lower() == 'y' else 0,
 59 |                      "y3": 1 if line[5].lower() == 'y' else 0,
 60 |                      "y4": 1 if line[6].lower() == 'y' else 0,
 61 |                      "text": status,
 62 |                      "user": line[0],
 63 |                      "num_words": np.max([len(sent.split()) for sent in status]),
 64 |                      "split": np.random.randint(0, cv)}
 65 |             revs.append(datum)
 66 | 
 67 |     return revs, vocab
 68 | 
 69 | 
 70 | def clean_str(string, TREC=False):
 71 |     """
 72 |     Tokenization/string cleaning for all datasets except for SST.
 73 |     Every dataset is lower cased except for TREC
 74 |     """
 75 |     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
 76 |     string = re.sub(r"\'s", " \'s ", string)
 77 |     string = re.sub(r"\'ve", " have ", string)
 78 |     string = re.sub(r"n\'t", " not ", string)
 79 |     string = re.sub(r"\'re", " are ", string)
 80 |     string = re.sub(r"\'d", " would ", string)
 81 |     string = re.sub(r"\'ll", " will ", string)
 82 |     string = re.sub(r",", " , ", string)
 83 |     string = re.sub(r"!", " ! ", string)
 84 |     string = re.sub(r"\(", " ( ", string)
 85 |     string = re.sub(r"\)", " ) ", string)
 86 |     string = re.sub(r"\?", " \? ", string)
 87 |     #    string = re.sub(r"[a-zA-Z]{4,}", "", string)
 88 |     string = re.sub(r"\s{2,}", " ", string)
 89 |     return string.strip() if TREC else string.strip().lower()
 90 | 
 91 | 
 92 | def clean_str_sst(string):
 93 |     """
 94 |     Tokenization/string cleaning for the SST dataset
 95 |     """
 96 |     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
 97 |     string = re.sub(r"\s{2,}", " ", string)
 98 |     return string.strip().lower()
 99 | 
100 | 
101 | if __name__ == "__main__":
102 |     base_dir = '/data/essays_data'
103 | 
104 |     data_folder = os.path.join(base_dir, 'essays.csv')
105 |     print("loading data...")
106 |     revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
107 |     num_words = pd.DataFrame(revs)["num_words"]
108 |     max_l = np.max(num_words)
109 |     print("data loaded!")
110 |     print("number of status: " + str(len(revs)))
111 |     print("vocab size: " + str(len(vocab)))
112 |     print("max sentence length: " + str(max_l))
113 | 


--------------------------------------------------------------------------------
/features/tfidf/test_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/test_label.npy


--------------------------------------------------------------------------------
/features/tfidf/tfidf_action.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | '''tfidf-lsi-svm stack for gender'''
  3 | 
  4 | from __future__ import division
  5 | 
  6 | import codecs
  7 | import os
  8 | 
  9 | from sklearn.feature_extraction.text import TfidfVectorizer
 10 | from sklearn import svm
 11 | from sklearn.externals import joblib
 12 | from numpy import *
 13 | from gensim import models, corpora
 14 | import numpy as np
 15 | 
 16 | INPUT_SIZE = 300  # 训练特征维度参数
 17 | 
 18 | 
 19 | class user_predict:
 20 |     def __init__(self, train_document, text_document):
 21 |         self.train_document = train_document
 22 |         self.text_document = text_document
 23 | 
 24 |     # -----------------------准确值计算-----------------------
 25 |     def myAcc(self, y_true, y_pred):
 26 |         true_num = 0
 27 |         # 最大数的索引
 28 |         y_pred = np.argmax(y_pred, axis=1)
 29 | 
 30 |         # for i in range(y_true.__len__()):
 31 |         #     print y_true[i]
 32 |         for i in range(y_pred.__len__()):
 33 |             if y_true[i] == y_pred[i]:
 34 |                 true_num += 1
 35 |         return true_num
 36 | 
 37 |     # -----------------------load data-----------------------
 38 |     def load_data(self, doc):
 39 |         list_name = []
 40 |         list_total = []
 41 |         list_gender = []
 42 |         # 对应标签导入词典
 43 |         f = codecs.open(doc)
 44 |         temp = f.readlines()
 45 |         print(len(temp))
 46 | 
 47 |         for i in range(len(temp)):
 48 |             temp[i] = temp[i].split(" ")
 49 |             user_name = temp[i][0]
 50 |             tags = temp[i][1:6]
 51 |             query = temp[i][6:]
 52 |             query = " ".join(query).strip().replace("\n", "")
 53 |             list_total.append(query)
 54 |             list_gender.append(tags)
 55 |         print(list_total.__len__())
 56 |         print(list_gender.__len__())
 57 |         list_tag = []
 58 |         for line in list_gender:
 59 |             list_t = []
 60 |             for j in line:
 61 |                 j = int(j)
 62 |                 list_t.append(j)
 63 |             list_tag.append(list_t)
 64 |         print("data have read ")
 65 |         return list_total, list_tag
 66 | 
 67 |     def load_stopword(self):
 68 |         """
 69 |         加载停用词语
 70 |         :param stopworddoc:
 71 |         :return:
 72 |         """
 73 |         stop_word = []
 74 |         return stop_word
 75 |         # with open('EN_Stopword.txt') as f:
 76 |         #     lines = f.readlines()
 77 |         #     for line in lines:
 78 |         #         word = line.replace('\n', '')
 79 |         #         if word != '':
 80 |         #             stop_word.append(word)
 81 |         # with open('ENstopwords.txt') as f:
 82 |         #     lines = f.readlines()
 83 |         #     for line in lines:
 84 |         #         word = line.replace('\n', '')
 85 |         #         if word != '':
 86 |         #             stop_word.append(word)
 87 |         #
 88 |         # return list(set(stop_word))
 89 | 
 90 |     # -------------------------prepare lsi svd -----------------------
 91 |     def prepare_lsi(self, doc):
 92 |         # 给训练集用的
 93 |         list_total, list_tag = self.load_data(doc)
 94 |         stop_word = self.load_stopword()
 95 |         texts = [[word for word in document.lower().split() if word not in stop_word]
 96 |                  for document in list_total]
 97 |         # train dictionary done# 抽取一个bag-of-words，将文档的token映射为id
 98 |         dictionary = corpora.Dictionary(texts)  # 生成词典 # {'a': 0, 'damaged': 1, 'gold': 3, 'fire': 2}
 99 |         # print dictionary.token2id
100 |         # 产生文档向量，将用字符串表示的文档转换为用id和词频表示的文档向量
101 |         corpus = [dictionary.doc2bow(text) for text in texts]
102 |         # [[(0, 1), (6, 1)], [(0, 1), (9, 2), (10, 1)], [(0, 1), (3, 1)]]
103 |         # 例如（9，2）这个元素代表第二篇文档中id为9的单词出现了2次
104 |         # 用TFIDF的方法计算词频,sublinear_tf 表示学习率
105 |         tfv = TfidfVectorizer(min_df=1, max_df=0.95, sublinear_tf=True, stop_words=stop_word)
106 |         # 对文本中所有的用户对应的所有的评论里面的单词进行ＴＦＩＤＦ的计算，找出每个词对应的tfidf值
107 |         X_sp = tfv.fit_transform(list_total)
108 |         # train model done基于这些“训练文档”计算一个TF-IDF模型
109 |         tfidf_model = models.TfidfModel(corpus)
110 |         joblib.dump(tfidf_model, "tfidf_model.model")
111 |         # 转化文档向量，将用词频表示的文档向量表示为一个用tf-idf值表示的文档向量
112 |         corpus_tfidf = tfidf_model[corpus]
113 |         # [[(1, 0.6633689723434505), (2, 0.6633689723434505)],[(7, 0.16073253746956623), (8, 0.4355066251613605)]]
114 |         # 训练LSI模型 即将训练文档向量组成的矩阵SVD分解，并做一个秩为2的近似SVD分解
115 |         lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=INPUT_SIZE)
116 |         joblib.dump(dictionary, "tfidf_dictionary.dict")
117 |         print("训练集lsi -----")
118 |         joblib.dump(lsi_model, "tfidf_lsi.model")
119 |         return tfidf_model, dictionary
120 | 
121 |     def train_lsi(self, doc, str_doc):
122 |         if not (os.path.exists("tfidf_model.model")):
123 |             print("prepare model")
124 |             tfidf_model, dictionary = self.prepare_lsi(doc)
125 |             list_total, list_tag = self.load_data(doc)
126 |             stop_word = self.load_stopword()
127 |             texts = [[word for word in document.lower().split() if word not in stop_word]
128 |                      for document in list_total]
129 |             corpus = [dictionary.doc2bow(text) for text in texts]
130 |         else:
131 |             print("use model")
132 |             # load train valid text
133 |             tfidf_model = joblib.load("tfidf_model.model")
134 |             dictionary = joblib.load("tfidf_dictionary.dict")
135 |             # load data
136 |             list_total, list_tag = self.load_data(doc)
137 |             stop_word = self.load_stopword()
138 |             texts = [[word for word in document.lower().split() if word not in stop_word]
139 |                      for document in list_total]
140 |             corpus = [dictionary.doc2bow(text) for text in texts]
141 |         lsi_model = joblib.load("tfidf_lsi.model")
142 |         corpus_tfidf = tfidf_model[corpus]
143 |         list_side = []
144 |         corpus_lsi = lsi_model[corpus_tfidf]
145 |         nodes = list(corpus_lsi)
146 |         for i in range(len(nodes)):
147 |             list_d = []
148 |             for j in range(INPUT_SIZE):
149 |                 # print nodes[i][j]
150 |                 list_d.append(nodes[i][j][1])
151 |             list_side.append(list_d)
152 |         list_vec = mat(list_side)
153 |         self.write_mat(list_vec, str_doc)
154 |         print("lsi 矩阵构建完成----------------")
155 |         return list_total, list_tag, list_side
156 | 
157 |     def write_mat(self, X_sp, doc_name):
158 |         file_name = "tfidf_" + doc_name + ".npy"
159 |         np.save(file_name, X_sp)
160 |         print("*****************write done over *****************")
161 | 
162 | 
163 | if __name__ == '__main__':
164 |     base_dir = 'E:\Koo\Projects\PycharmProjects\TensorFlow_DNN_Character_Classification\data\essay_data'
165 |     user_predict = user_predict(os.path.join(base_dir, "vocab1_train.txt"),
166 |                                 os.path.join(base_dir, "vocab1_test.txt"))
167 | 
168 |     user_predict.train_lsi(user_predict.train_document,"train_vec_tfidf")
169 |     user_predict.train_lsi(user_predict.text_document,"test_vec_tfidf")


--------------------------------------------------------------------------------
/features/tfidf/tfidf_test_vec_tfidf.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/tfidf_test_vec_tfidf.npy


--------------------------------------------------------------------------------
/features/tfidf/tfidf_train_vec_tfidf.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/tfidf_train_vec_tfidf.npy


--------------------------------------------------------------------------------
/features/tfidf/train_label.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gugug/TensorFlow_DNN_Character_Classification/c114c88723808ab4ae8c0bc397f0e663373d9a5b/features/tfidf/train_label.npy


--------------------------------------------------------------------------------
/model/README.md:
--------------------------------------------------------------------------------
1 | # 保存模型


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'gu'
2 | 


--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | __author__ = 'gu'
 3 | import os
 4 | import logging
 5 | 
 6 | 
 7 | def get_logger():
 8 |     logging.basicConfig(filename=os.path.join('' '../log.txt'),
 9 |                         format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
10 |                         datefmt='%a, %d %b %Y %H:%M:%S', level=logging.INFO)
11 |     # 定义一个Handler打印INFO及以上级别的日志到sys.stderr
12 |     console = logging.StreamHandler()
13 |     console.setLevel(logging.INFO)
14 |     logging.getLogger('').addHandler(console)
15 |     return logging
16 | 


--------------------------------------------------------------------------------