├── .gitignore
├── README.md
├── cnn_model.py
├── cnn_rnn_model.py
├── config.py
├── data
│   ├── predict_first.csv
│   └── train_first.csv
├── data_helper.py
├── predict.py
├── preprocess.py
├── rnn_cnn_model.py
├── rnn_model.py
├── train.py
└── utils
    ├── __init__.py
    ├── config.py
    ├── log.py
    ├── model_helper.py
    └── nlp_util.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text Classify
2 |
3 | This code comes from a competition on text classification. It implements the common RNN, CNN, and CNN-RNN models; the code has been tidied up here for easy reference.
4 |
5 | ## Model training steps
6 |
7 | ### Environment
8 | tensorflow == 1.3
9 | python == 2.7
10 |
11 | ### Data preprocessing
12 |
13 | - Tokenization
14 | Since the dataset is large, the text is tokenized ahead of time, which avoids repeating the tokenization cost during later training and hyper-parameter tuning.
15 |
16 | - Word-vector training
17 | Word vectors are trained for the downstream models, and the embedding matrix is saved.
18 |
19 | Run: `python preprocess.py`
20 |
21 | ### Training modes
22 |
23 | There are three training modes: single, multi, and kfold; the default is single.
24 | - single trains the model once;
25 | - multi trains the model several times on randomly drawn splits and averages the results;
26 | - kfold trains with k-fold cross-validation and averages the fold results.
27 |
28 | Run: `python train.py --mode=single`
--------------------------------------------------------------------------------
/cnn_model.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #-*- encoding: utf-8 -*-
3 |
4 |
5 | import os
6 | import time
7 | import numpy as np
8 | import tensorflow as tf
9 | from utils import model_helper
10 | import config
11 |
12 |
13 | class ModelParas(object):
14 |     embedding_size = config.embedding_size
15 |     cell_num_units = 256
16 |     num_layers = 1
17 |     batch_size = 64
18 |     cnn_dropout = 0.0
19 |     rnn_dropout = 0.0
20 |     learning_rate = 0.01
21 |     decay = 0.99
22 |     lrshrink = 5
23 |     uniform_init_scale = 0.04
24 |     clip_gradient_norm = 5.0
25 |     filter_sizes = [3, 4, 5]
26 |     l2_reg_lambda = 0.0
27 |     max_pool_size = 4
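    # filter_sizes are the n-gram widths of the convolution kernels; max_pool_size
    # is the pooling window and stride over the time axis, so a batch whose longest
    # sentence has L tokens is reduced to ceil(L / max_pool_size) time steps before
    # the pooled features are handed to the bidirectional LSTM below.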
28 |     num_filters = 32
29 |     epochs = 20
30 |
31 |
32 | class Model(object):
33 |
34 |
35 |     def __init__(self, paras, sess, mode, emb_matrix):
36 |         self.paras = paras
37 |         self.sess = sess
38 |         self.mode = mode
39 |
40 |         # Model variable
41 |         with tf.device('/cpu:0'):
42 |             self.embeddings = tf.get_variable(
43 |                 name = 'embeddings',
44 |                 shape = emb_matrix.shape,
45 |                 dtype = tf.float32,
46 |                 initializer = tf.constant_initializer(emb_matrix))
47 |             self.global_step = tf.get_variable(
48 |                 name = 'global_step',
49 |                 dtype = tf.int32,
50 |                 initializer = 1,
51 |                 trainable = False)
52 |
53 |         self._build_graph()
54 |
55 |
56 |     def _create_placeholder(self):
57 |         self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate')
58 |         self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents')
59 |         with tf.device('/cpu:0'):
60 |             self.emb_sents = tf.nn.embedding_lookup(
61 |                 self.embeddings, self.sents)
62 |         # Expand dimension to meet the input requirement of 2d-conv
63 |         self.emb_expand = tf.expand_dims(self.emb_sents, -1)
64 |         self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths')
65 |         self.pad = tf.placeholder(tf.float32, [None, 1, self.paras.embedding_size, 1], name='pad')
66 |         self.labels = tf.placeholder(tf.int32, [None], name = 'labels')
67 |
68 |
69 |     def _inference(self):
70 |         # Convolution network
71 |         with tf.name_scope('cnn'):
72 |
73 |             # After conv and pooling, the time dimension shrinks to ceil(max_length / max_pool_size)
74 |             max_length = tf.reduce_max(self.sent_lengths)
75 |             div_value = tf.div(tf.cast(max_length, tf.float32), self.paras.max_pool_size)
76 |             reduced_size = tf.cast(tf.ceil(div_value), tf.int32)
77 |
78 |             pooled_concat = []
79 |             for i, filter_size in enumerate(self.paras.filter_sizes):
80 |                 with tf.name_scope('conv-pool-%s' % filter_size):
81 |                     # Pad with zeros so the conv output keeps the same time dimension as the input
82 |                     # shape is: [batch_size, sent_length, emb_size, channel]
83 |                     num_prio = (filter_size - 1) // 2
84 |                     num_post = (filter_size - 1) - num_prio
85 |                     pad_prio = tf.concat([self.pad] * num_prio, 1)
86 |                     pad_post = tf.concat([self.pad] * num_post, 1)
87 |                     emb_pad = tf.concat([pad_prio, self.emb_expand, pad_post], 1)
88 |                     # Prepare filter for conv
89 |                     filter_ = tf.get_variable(
90 |                         name = 'filter-%s' % filter_size,
91 |                         shape = [filter_size, self.paras.embedding_size, 1, self.paras.num_filters])
92 |                     # conv: [batch_size, sent_length, 1, num_filters]
93 |                     conv = tf.nn.conv2d(
94 |                         input = emb_pad,
95 |                         filter = filter_,
96 |                         strides = [1, 1, 1, 1],
97 |                         padding = 'VALID',
98 |                         name = 'conv')
99 |                     # Bias
100 |                     b = tf.get_variable(
101 |                         name = 'bias-%s' % filter_size,
102 |                         shape = [self.paras.num_filters])
103 |                     h = tf.nn.relu(tf.nn.bias_add(conv, b))
104 |                     # Max pooling over the outputs
105 |                     pooled = tf.nn.max_pool(
106 |                         value = h,
107 |                         ksize = [1, self.paras.max_pool_size, 1, 1],
108 |                         strides = [1, self.paras.max_pool_size, 1, 1],
109 |                         padding ='SAME',
110 |                         name ='pool')
111 |                     pooled = tf.reshape(pooled, [-1, reduced_size, self.paras.num_filters])
112 |                     pooled_concat.append(pooled)
113 |             # pooled_concat: [batch_size, reduced_size, filter_sizes * num_filters]
114 |             pooled_concat = tf.concat(pooled_concat, 2)
115 |             if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
116 |                 pooled_concat = tf.nn.dropout(pooled_concat, 1.0 - self.paras.cnn_dropout)
117 |
118 |         # RNN network
119 |         with tf.name_scope('rnn'):
120 |             cells_fw = model_helper.create_rnn_cell(
121 |                 'lstm',
122 |                 self.paras.cell_num_units,
123 |                 self.paras.num_layers,
124 |                 self.paras.rnn_dropout,
125 |                 self.mode)
126 |             cells_bw = model_helper.create_rnn_cell(
127 |                 'lstm',
128 |                 self.paras.cell_num_units,
129 |                 self.paras.num_layers,
130 |                 self.paras.rnn_dropout,
131 |                 self.mode)
132 |             outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
133 |                 cells_fw,
134 |                 cells_bw,
135 |                 inputs = pooled_concat,
136 |                 dtype = tf.float32)
137 |             # states_fw: (batch_size, reduced_size, cell_size)
138 |             states_fw, states_bw = outputs
139 |             concat_states = tf.concat([states_fw, states_bw], axis = 2)
140 |             # sent_states: (batch_size, 2 * cell_size)
141 |             self.sent_states = tf.reduce_max(concat_states, axis = 1)
142 |
143 |         with tf.name_scope('classify'):
144 |             hidden1 = tf.contrib.layers.fully_connected(
145 |                 inputs = self.sent_states,
146 |                 num_outputs = 512)
147 |             hidden2 = tf.contrib.layers.fully_connected(
148 |                 inputs = hidden1,
149 |                 num_outputs = 5)
150 |             self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected(
151 |                 inputs = hidden2,
152 |                 activation_fn = None,
153 |                 num_outputs = 1), axis = 1)
154 |             self.mse = tf.reduce_mean(tf.cast(
155 |                 tf.squared_difference(
156 |                     self.labels,
157 |                     tf.cast(tf.round(self.predicts), tf.int32)),
158 |                 tf.float32))
159 |
160 |         with tf.name_scope('accuracy'):
161 |             correct_prediction = tf.equal(self.labels,
162 |                 tf.cast(tf.round(self.predicts), tf.int32))
163 |             self.accuracy = tf.reduce_mean(tf.cast(
164 |                 correct_prediction, tf.float32))
165 |
166 |
167 |     def _create_loss(self):
168 |         with tf.name_scope('loss'):
169 |             self.loss = tf.reduce_mean(
170 |                 tf.losses.mean_squared_error(
171 |                     labels = tf.cast(self.labels, tf.float32),
172 |                     predictions = self.predicts))
173 |
174 |
175 |     def _create_optimizer(self):
176 |         with tf.name_scope('optimizer'):
177 |             self.optimizer = tf.contrib.layers.optimize_loss(
178 |                 loss = self.loss,
179 |                 global_step = self.global_step,
180 |                 learning_rate = self.lr,
181 |                 optimizer = 'SGD',
182 |                 clip_gradients = self.paras.clip_gradient_norm)
183 |
184 |
185 |     def _create_summary(self):
186 |         log_path = os.path.join(config.model_path, 'tensorboard')
187 |         self.train_writer = tf.summary.FileWriter(
188 |             os.path.join(log_path, 'train'), self.sess.graph)
189 |         self.test_writer = tf.summary.FileWriter(
190 |             os.path.join(log_path, 'test'), self.sess.graph)
191 |         with tf.name_scope('summary') as scope:
192 |             tf.summary.scalar('loss', self.loss)
193 |             tf.summary.scalar('accuracy', self.accuracy)
194 |
195 |
196 |     def _build_graph(self):
197 |         self._create_placeholder()
198 |         self._inference()
199 |         self._create_loss()
200 |         self._create_optimizer()
201 |         self._create_summary()
202 |         print 'Build graph done'
203 |
204 |
205 | def test():
206 |     from data_helper import Helper
207 |     sess = tf.Session()
208 |     paras = ModelParas()
209 |     emb_matrix = Helper.get_emb_matrix()
210 |     Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix)
211 |
212 |
213 | if __name__ == '__main__':
214 |     pass
215 |
--------------------------------------------------------------------------------
/cnn_rnn_model.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #-*- encoding: utf-8 -*-
3 |
4 |
5 | import os
6 | import time
7 | import numpy as np
8 | import tensorflow as tf
9 | from utils import model_helper
10 | import config
11 |
12 |
13 | class ModelParas(object):
14 |     embedding_size = config.embedding_size
15 |     batch_size = 64
16 |     sequence_length = None
17 |     learning_rate = 0.01
18 |     decay = 0.99
19 |     lrshrink = 5
20 |     uniform_init_scale = 0.04
21 |     clip_gradient_norm = 5.0
22 |     l2_reg_lambda = 0.0
23 |     nclasses = 5
24 |     epochs = 20
25 |
26 |     # CNN
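    # In this model the CNN runs first: each filter convolves over the padded word
    # embeddings and is max-pooled over time with max_pool_size, and the pooled
    # feature sequence is then fed to a bidirectional LSTM; rnn_cnn_model.py applies
    # the two blocks in the opposite order.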
27 | cnn_dropout = 0.5 28 | filter_sizes = [3, 4, 5] 29 | max_pool_size = 2 30 | num_filters = 128 31 | 32 | # RNN 33 | rnn_dropout = 0.0 34 | cell_num_units = 256 35 | num_layers = 1 36 | 37 | 38 | class Model(object): 39 | 40 | 41 | def __init__(self, paras, sess, mode, emb_matrix): 42 | self.paras = paras 43 | self.sess = sess 44 | self.mode = mode 45 | self.emb_matrix = emb_matrix 46 | self._build_graph() 47 | 48 | 49 | def _create_placeholder(self): 50 | self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents') 51 | self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths') 52 | self.pad = tf.placeholder(tf.float32, [None, 1, self.paras.embedding_size, 1], name='pad') 53 | self.labels = tf.placeholder(tf.int32, [None], name = 'labels') 54 | self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate') 55 | 56 | 57 | def _create_variable(self): 58 | # Model variable 59 | with tf.device('/cpu:0'): 60 | self.embeddings = tf.get_variable( 61 | name = 'embeddings', 62 | shape = self.emb_matrix.shape, 63 | dtype = tf.float32, 64 | initializer = tf.constant_initializer(self.emb_matrix)) 65 | self.global_step = tf.get_variable( 66 | name = 'global_step', 67 | dtype = tf.int32, 68 | initializer = 1, 69 | trainable = False) 70 | 71 | 72 | def _inference(self): 73 | with tf.device('/cpu:0'): 74 | self.emb_sents = tf.nn.embedding_lookup( 75 | self.embeddings, self.sents) 76 | # Expand dimension so meet input requirement of 2d-conv 77 | self.emb_expand = tf.expand_dims(self.emb_sents, -1) 78 | 79 | # Convolution network 80 | with tf.name_scope('cnn'): 81 | # After conv and pooling, 82 | max_length = tf.reduce_max(self.sent_lengths) 83 | div_value = tf.div(tf.cast(max_length, tf.float32), self.paras.max_pool_size) 84 | reduced_size = tf.cast(tf.ceil(div_value), tf.int32) 85 | pooled_concat = [] 86 | for filter_size in self.paras.filter_sizes: 87 | with tf.name_scope('conv-pool-%s' % filter_size): 88 | # Padding zero to keep conv output has same dimention as input 89 | # shape is : [batch_size, sent_length, emb_size, channel] 90 | num_prio = (filter_size - 1) // 2 91 | num_post = (filter_size - 1) - num_prio 92 | pad_prio = tf.concat([self.pad] * num_prio, 1) 93 | pad_post = tf.concat([self.pad] * num_post, 1) 94 | emb_pad = tf.concat([pad_prio, self.emb_expand, pad_post], 1) 95 | # Prepare filter for conv 96 | filter_ = tf.get_variable( 97 | name = 'filter-%s' % filter_size, 98 | shape = [filter_size, self.paras.embedding_size, 1, self.paras.num_filters]) 99 | # conv: [batch_size, sent_length, 1, num_filters] 100 | conv = tf.nn.conv2d( 101 | input = emb_pad, 102 | filter = filter_, 103 | strides = [1, 1, 1, 1], 104 | padding = 'VALID', 105 | name = 'conv') 106 | # Bias 107 | b = tf.get_variable( 108 | name = 'bias-%s' % filter_size, 109 | shape = [self.paras.num_filters]) 110 | h = tf.nn.relu(tf.nn.bias_add(conv, b)) 111 | # Max pooling over the outputs 112 | pooled = tf.nn.max_pool( 113 | value = h, 114 | ksize = [1, self.paras.max_pool_size, 1, 1], 115 | strides = [1, self.paras.max_pool_size, 1, 1], 116 | padding ='SAME', 117 | name ='pool') 118 | pooled = tf.reshape(pooled, [-1, reduced_size, self.paras.num_filters]) 119 | pooled_concat.append(pooled) 120 | # pooled_concat: (batch_size, reduced_size, filter_sizes * num_filters) 121 | self.pooled_concat = tf.concat(pooled_concat, 2) 122 | if self.mode == tf.contrib.learn.ModeKeys.TRAIN: 123 | self.pooled_concat = tf.nn.dropout(self.pooled_concat, 1.0 - self.paras.cnn_dropout) 124 | 125 | # RNN network 126 | with 
tf.name_scope('rnn'): 127 | cells_fw = model_helper.create_rnn_cell( 128 | 'lstm', 129 | self.paras.cell_num_units, 130 | self.paras.num_layers, 131 | self.paras.rnn_dropout, 132 | self.mode) 133 | cells_bw = model_helper.create_rnn_cell( 134 | 'lstm', 135 | self.paras.cell_num_units, 136 | self.paras.num_layers, 137 | self.paras.rnn_dropout, 138 | self.mode) 139 | outputs, output_states = tf.nn.bidirectional_dynamic_rnn( 140 | cells_fw, 141 | cells_bw, 142 | inputs = self.pooled_concat, 143 | dtype = tf.float32) 144 | # states_fw: (batch_size, reduced_size, cell_size) 145 | states_fw, states_bw = outputs 146 | concat_states = tf.concat([states_fw, states_bw], axis = 2) 147 | # sent_states: (batch_size, 2 * cell_size) 148 | self.sent_states = tf.reduce_max(concat_states, axis = 1) 149 | 150 | with tf.name_scope('classify'): 151 | hidden1 = tf.contrib.layers.fully_connected( 152 | inputs = self.sent_states, 153 | num_outputs = 512) 154 | hidden2 = tf.contrib.layers.fully_connected( 155 | inputs = hidden1, 156 | num_outputs = 5) 157 | self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected( 158 | inputs = hidden2, 159 | activation_fn = None, 160 | num_outputs = 1), axis = 1) 161 | self.mse = tf.reduce_mean(tf.cast( 162 | tf.squared_difference( 163 | self.labels, 164 | tf.cast(tf.round(self.predicts), tf.int32)), 165 | tf.float32)) 166 | 167 | with tf.name_scope('accuracy'): 168 | correct_prediction = tf.equal(self.labels, 169 | tf.cast(tf.round(self.predicts), tf.int32)) 170 | self.accuracy = tf.reduce_mean(tf.cast( 171 | correct_prediction, tf.float32)) 172 | 173 | 174 | def _create_loss(self): 175 | with tf.name_scope('loss'): 176 | self.loss = tf.reduce_mean( 177 | tf.losses.mean_squared_error( 178 | labels = tf.cast(self.labels, tf.float32), 179 | predictions = self.predicts)) 180 | 181 | 182 | def _create_optimizer(self): 183 | with tf.name_scope('optimizer'): 184 | self.optimizer = tf.contrib.layers.optimize_loss( 185 | loss = self.loss, 186 | global_step = self.global_step, 187 | learning_rate = self.lr, 188 | optimizer = 'SGD', 189 | clip_gradients = self.paras.clip_gradient_norm) 190 | 191 | 192 | def _create_summary(self): 193 | log_path = os.path.join(config.model_path, 'tensorboard') 194 | self.train_writer = tf.summary.FileWriter( 195 | os.path.join(log_path, 'train'), self.sess.graph) 196 | self.test_writer = tf.summary.FileWriter( 197 | os.path.join(log_path, 'test'), self.sess.graph) 198 | with tf.name_scope('summary') as scope: 199 | tf.summary.scalar('loss', self.loss) 200 | tf.summary.scalar('accuracy', self.accuracy) 201 | 202 | 203 | def _build_graph(self): 204 | self._create_variable() 205 | self._create_placeholder() 206 | self._inference() 207 | self._create_loss() 208 | self._create_optimizer() 209 | self._create_summary() 210 | print 'Build graph done' 211 | 212 | 213 | def test(): 214 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # so the IDs match nvidia-smi 215 | os.environ["CUDA_VISIBLE_DEVICES"] = "" # "0, 1" for multiple 216 | from data_helper import Helper 217 | sess = tf.Session() 218 | paras = ModelParas() 219 | emb_matrix = Helper.get_emb_matrix() 220 | Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix) 221 | 222 | 223 | if __name__ == '__main__': 224 | test() 225 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | # Data 6 | raw_train_fpath = 
'./data/train_first.csv' 7 | raw_predict_fpath = './data/predict_first.csv' 8 | train_fpath = './data/train.txt' 9 | predict_fpath = './data/predict.txt' 10 | 11 | 12 | # Word2vec 13 | embedding_size = 300 14 | word2vec_fpath = './model/word2vec/w2v_win1_d%d.model' % embedding_size 15 | emb_matrix_fpath = './model/word2vec/emb_matrix_d%d.npy' % embedding_size 16 | word2id_fpath = './model/word2vec/word2id.txt' 17 | 18 | 19 | # Model path 20 | model_path = './model/m0' 21 | 22 | 23 | # Result path 24 | result_path = './data/result.csv' 25 | -------------------------------------------------------------------------------- /data_helper.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | 4 | import codecs 5 | import numpy as np 6 | from utils.nlp_util import NlpUtil 7 | import config 8 | 9 | 10 | class Helper(object): 11 | 12 | 13 | @classmethod 14 | def init(cls): 15 | pass 16 | 17 | 18 | @classmethod 19 | def sort_by_length(cls, sents, labels): 20 | len_array = np.array([len(s) for s in sents]) 21 | len_perm = len_array.argsort() 22 | sents = sents[len_perm] 23 | labels = labels[len_perm] 24 | return sents, labels 25 | 26 | 27 | @classmethod 28 | def get_data(cls, is_train_data = True, partition = None, 29 | sort_flag = True, rand_seed = None): 30 | if rand_seed is not None: 31 | np.random.seed(rand_seed) 32 | 33 | word2id = {} 34 | with codecs.open(config.word2id_fpath, 'r', 'utf-8') as in_f: 35 | for line in in_f: 36 | word, id_ = line.rstrip().split('\t') 37 | word2id[word] = int(id_) 38 | 39 | def split_text(text): 40 | ret = [word2id[w] for w in text.split('|') if w in word2id] 41 | return ret 42 | 43 | if is_train_data: 44 | # Return data for training 45 | if partition is None: 46 | partition = [0.8, 0.1, 0.1] 47 | partition = [0.0] + [sum(partition[:id_+1]) for id_ in range(3)] 48 | with codecs.open(config.train_fpath, 'r', 'utf-8') as in_f: 49 | train_corpus = [line.strip().split('\t') for line in in_f] 50 | train_data = [split_text(item[1]) for item in train_corpus] 51 | labels = np.array([int(item[2]) for item in train_corpus], 52 | dtype = np.int32) - 1 53 | train_length = len(train_data) 54 | perm = np.random.permutation(train_length) 55 | train_data = np.array(train_data)[perm] 56 | labels = labels[perm] 57 | train, dev, test = {}, {}, {} 58 | data_type = ['train', 'dev', 'test'] 59 | part = np.array(partition) * train_length 60 | part = part.astype(np.int32) 61 | for id_, type_ in enumerate(data_type): 62 | sents_ = train_data[part[id_] : part[id_+1]] 63 | labels_ = labels[part[id_] : part[id_+1]] 64 | if sort_flag is True: 65 | sents_, labels_ = cls.sort_by_length(sents_, labels_) 66 | eval(type_)['sents'] = sents_ 67 | eval(type_)['labels'] = labels_ 68 | # print len(train['sents']), len(dev['sents']), len(test['sents']) 69 | # print '|'.join(map(str, test['sents'][-1])), test['labels'][-1] 70 | return train, dev, test 71 | else: 72 | # Return data for prediction 73 | with codecs.open(config.predict_fpath, 'r', 'utf-8') as in_f: 74 | predict_corpus = [line.strip().split('\t') for line in in_f] 75 | predict_ids = np.array([item[0] for item in predict_corpus]) 76 | predict = np.array([split_text(item[1]) for item in predict_corpus]) 77 | if sort_flag: 78 | predict, predict_ids = cls.sort_by_length(predict, predict_ids) 79 | return predict_ids, predict 80 | 81 | 82 | @classmethod 83 | def get_batch(cls, batch, sequence_length = None): 84 | if sequence_length: 85 | lengths = np.array([len(x[:sequence_length]) for x in 
batch]) 86 | else: 87 | lengths = np.array([len(x) for x in batch]) 88 | max_len = np.max(lengths) 89 | batch_len = len(batch) 90 | embed = np.zeros((batch_len, max_len), np.int32) 91 | for i in xrange(batch_len): 92 | for j in xrange(lengths[i]): 93 | embed[i, j] = batch[i][j] 94 | return embed, lengths 95 | 96 | 97 | @classmethod 98 | def get_emb_matrix(cls): 99 | emb_matrix = np.load(config.emb_matrix_fpath) 100 | print 'Load embedding matrix success' 101 | return emb_matrix 102 | 103 | 104 | def test(): 105 | train, dev, test = Helper.get_data(is_train_data = True, 106 | sort_flag = False, 107 | rand_seed = 1234) 108 | 109 | print train['sents'][:3] 110 | batch = Helper.get_batch(train['sents'][:3]) 111 | print batch 112 | Helper.get_emb_matrix() 113 | 114 | 115 | if __name__ == '__main__': 116 | test() 117 | 118 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import math 8 | import numpy as np 9 | from collections import defaultdict, Counter 10 | import codecs 11 | import tensorflow as tf 12 | from rnn_model import Model, ModelParas 13 | from data_helper import Helper 14 | from utils.log import logger 15 | import config 16 | 17 | 18 | def load_model(mode): 19 | tf.reset_default_graph() 20 | paras = ModelParas() 21 | sess = tf.Session() 22 | save_path = os.path.join(config.model_path, 'model/model.ckpt') 23 | emb_matrix = Helper.get_emb_matrix() 24 | with tf.variable_scope('Model'): 25 | model = Model(paras, sess, mode, emb_matrix) 26 | saver = tf.train.Saver() 27 | saver.restore(model.sess, save_path) 28 | return model 29 | 30 | 31 | def predict(save_path): 32 | model = load_model(mode = tf.contrib.learn.ModeKeys.EVAL) 33 | predict_ids, predict = Helper.get_data(is_train_data = False) 34 | batch_size = model.paras.batch_size 35 | steps = int(math.ceil(len(predict_ids) * 1.0 / batch_size)) 36 | with codecs.open(save_path, 'w', 'utf-8') as out_f: 37 | for step in xrange(steps): 38 | begin = step * batch_size 39 | end = (step + 1) * batch_size 40 | ids = predict_ids[begin: end] 41 | batch_sents, batch_lengths = Helper.get_batch( 42 | predict[begin: end], model.paras.sequence_length) 43 | feed_dict = { 44 | model.sents: batch_sents, 45 | model.sent_lengths: batch_lengths} 46 | res = model.sess.run(model.predicts, feed_dict) 47 | ids = ids.tolist() 48 | res = res.tolist() 49 | msgs = predict[begin: end].tolist() 50 | for id_, val, msg in zip(ids, res, msgs): 51 | out_f.write('%s,%f\n' % (id_, val + 1)) 52 | del model, predict_ids, predict 53 | print 'Predict done' 54 | 55 | 56 | def fine_tune_result(): 57 | ratio = np.array([0.00587, 0.00973, 0.09389, 0.28954, 0.60097], np.float32) 58 | part = np.array([np.sum(ratio[:i]) for i in range(6)]) * 30000 59 | part[-1] = 30000 60 | part = part.astype(np.int32) 61 | print part 62 | with codecs.open(config.result_path, 'r', 'utf-8') as in_f, \ 63 | codecs.open('fine_tune.csv', 'w', 'utf-8') as out_f: 64 | id_score_list = [] 65 | for line in in_f: 66 | id_, score = line.rstrip().split(',') 67 | id_score_list.append((id_, float(score))) 68 | id_score_list.sort(key = lambda x: x[1]) 69 | for index, item in enumerate(id_score_list): 70 | for i in range(5): 71 | if part[i] <= index < part[i + 1]: 72 | out_f.write('%s,%d\n' % (item[0], i + 1)) 73 | break 74 | print 'Fine tune result done' 75 | 76 | 77 | def _get_vote_value(array): 78 | 
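    # Majority vote over the ensemble results: each model's score is rounded to an
    # integer class and the most frequent class is returned; ties are broken by
    # whichever tied class comes last in the counter's (arbitrary) dict order.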
array = [int(np.round(x)) for x in array] 79 | cnt_dict = Counter(array) 80 | max_v = max([v for k, v in cnt_dict.items()]) 81 | for k, v in cnt_dict.items()[::-1]: 82 | if v == max_v: 83 | return k 84 | 85 | 86 | def _get_mean_value(array): 87 | return np.mean(array) 88 | 89 | 90 | def fuse_result(fuse_mode = 'mean'): 91 | id2result = defaultdict(list) 92 | total_score = 0.0 93 | file_count = 0 94 | for file_ in os.listdir(config.model_path): 95 | if not file_.startswith('result'): 96 | continue 97 | file_count += 1 98 | file_ = os.path.join(config.model_path, file_) 99 | with codecs.open(file_, 'r', 'utf-8') as in_f: 100 | score = float(file_.split('_')[1]) 101 | total_score += score 102 | for line in in_f: 103 | id_, kind_ = line.strip().split(',') 104 | tuple_ = (float(kind_), score) 105 | id2result[id_].append(tuple_) 106 | with codecs.open(config.result_path, 'w', 'utf-8') as out_f: 107 | for id_, list_ in id2result.iteritems(): 108 | array = [kind_ for kind_, score_ in list_] 109 | if fuse_mode == 'mean': 110 | fuse_kind = _get_mean_value(array) 111 | else: 112 | fuse_kind = _get_vote_value(array) 113 | out_f.write('%s,%f\n' % (id_, fuse_kind)) 114 | print id2result['16866b2f-c7e5-319d-b47b-cc9317812bc9'] 115 | print 'Fuse result done' 116 | 117 | 118 | if __name__ == '__main__': 119 | #predict(config.result_path) 120 | fuse_result() 121 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import numpy as np 6 | import codecs 7 | from utils.nlp_util import NlpUtil 8 | import config 9 | 10 | 11 | def tokenize_corpus(corpus_fpath, save_fpath, is_train_data = True): 12 | 13 | def precess_line(line, is_train_data = True): 14 | try: 15 | line = line.strip() 16 | if is_train_data: 17 | line, flag = line.rsplit(',', 1) 18 | id_, text = line.split(',', 1) 19 | text = text.replace('|', ' ') 20 | text = text.replace('\t', ' ') 21 | text = '|'.join([''] + NlpUtil.tokenize(text, True) + ['']) 22 | #text = '|'.join(NlpUtil.tokenize(text, True)) 23 | return ('\t'.join([id_, text, flag]) + '\n' if is_train_data 24 | else '\t'.join([id_, text]) + '\n') 25 | except Exception as e: 26 | print ('line=%s, errmsg=%s', line, e) 27 | 28 | with codecs.open(corpus_fpath, 'r', 'utf-8') as in_f, \ 29 | codecs.open(save_fpath, 'w', 'utf-8') as out_f: 30 | in_f.readline() 31 | for line in in_f: 32 | out_f.write(precess_line(line, is_train_data)) 33 | print 'Tokenize done' 34 | 35 | 36 | def _get_corpus(): 37 | corpus = [] 38 | for file_ in [config.train_fpath, config.predict_fpath]: 39 | with codecs.open(file_, 'r', 'utf-8') as in_f: 40 | corpus_tmp = [line.strip().split('\t')[1].split('|') 41 | for line in in_f] 42 | corpus.extend(corpus_tmp) 43 | print 'Get corpus done, length is %d' % len(corpus) 44 | return corpus 45 | 46 | 47 | def build_emb_matrix(corpus): 48 | corpus_ = [] 49 | _ = map(lambda x: corpus_.extend(x), corpus) 50 | word2id = NlpUtil.build_word2id(corpus_) 51 | word2vec = NlpUtil.load_word2vec(config.word2vec_fpath) 52 | emb_matrix = NlpUtil.build_emb_matrix(word2vec, 53 | config.embedding_size, word2id) 54 | np.save(config.emb_matrix_fpath, emb_matrix) 55 | with codecs.open(config.word2id_fpath, 'w', 'utf-8') as out_f: 56 | out_f.write('\n'.join(['%s\t%d' % (k, v) for k, v in word2id.iteritems()])) 57 | print 'Build emb_matrix done' 58 | 59 | 60 | if __name__ == '__main__': 61 | # Tokenize data 62 | 
tokenize_corpus(config.raw_train_fpath, config.train_fpath, 63 | is_train_data = True) 64 | tokenize_corpus(config.raw_predict_fpath, config.predict_fpath, 65 | is_train_data = False) 66 | corpus = _get_corpus() 67 | 68 | # Train word2vec 69 | NlpUtil.train_word2vec(corpus, './model/word2vec') 70 | 71 | # Build emb matrix 72 | build_emb_matrix(corpus) 73 | -------------------------------------------------------------------------------- /rnn_cnn_model.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import tensorflow as tf 8 | from utils import model_helper 9 | import config 10 | 11 | 12 | class ModelParas(object): 13 | embedding_size = config.embedding_size 14 | batch_size = 64 15 | sequence_length = 70 16 | learning_rate = 0.01 17 | decay = 0.99 18 | lrshrink = 5 19 | uniform_init_scale = 0.04 20 | clip_gradient_norm = 5.0 21 | l2_reg_lambda = 0.0 22 | nclasses = 5 23 | epochs = 20 24 | 25 | # RNN 26 | cell_num_units = 256 27 | num_layers = 1 28 | rnn_dropout = 0.0 29 | 30 | # CNN 31 | filter_sizes = [3, 4, 5] 32 | num_filters = 32 33 | cnn_dropout = 0.0 34 | 35 | 36 | class Model(object): 37 | 38 | 39 | def __init__(self, paras, sess, mode, emb_matrix): 40 | self.paras = paras 41 | self.sess = sess 42 | self.mode = mode 43 | self.emb_matrix = emb_matrix 44 | self._build_graph() 45 | 46 | 47 | def _create_placeholder(self): 48 | self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate') 49 | self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents') 50 | self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths') 51 | self.labels = tf.placeholder(tf.int32, [None], name = 'labels') 52 | 53 | 54 | def _create_variable(self): 55 | with tf.device('/cpu:0'): 56 | self.embeddings = tf.get_variable( 57 | name = 'embeddings', 58 | shape = self.emb_matrix.shape, 59 | dtype = tf.float32, 60 | initializer = tf.constant_initializer(self.emb_matrix)) 61 | self.global_step = tf.get_variable( 62 | name = 'global_step', 63 | dtype = tf.int32, 64 | initializer = 1, 65 | trainable = False) 66 | self.num_filters_total = self.paras.num_filters * len(self.paras.filter_sizes) 67 | self.w_projection = tf.get_variable( 68 | name = 'w_projection', 69 | shape = [self.num_filters_total, self.paras.nclasses]) 70 | self.b_projection = tf.get_variable( 71 | name = 'b_projection', 72 | shape = [self.paras.nclasses]) 73 | self.l2_loss = tf.constant(0.0) 74 | 75 | 76 | def _inference(self): 77 | paras = self.paras 78 | with tf.device('/cpu:0'): 79 | self.emb_sents = tf.nn.embedding_lookup( 80 | self.embeddings, self.sents) 81 | 82 | # RNN network 83 | with tf.name_scope('RNN'): 84 | cells_fw = model_helper.create_rnn_cell( 85 | 'lstm', 86 | paras.cell_num_units, 87 | paras.num_layers, 88 | paras.rnn_dropout, 89 | self.mode) 90 | cells_bw = model_helper.create_rnn_cell( 91 | 'lstm', 92 | paras.cell_num_units, 93 | paras.num_layers, 94 | paras.rnn_dropout, 95 | self.mode) 96 | outputs, output_states = tf.nn.bidirectional_dynamic_rnn( 97 | cells_fw, 98 | cells_bw, 99 | inputs = self.emb_sents, 100 | sequence_length = self.sent_lengths, 101 | dtype = tf.float32) 102 | # states_fw: (batch_size, sent_len, cell_size) 103 | states_fw, states_bw = outputs 104 | # concat_states: (batch_size, sent_len, cell_size * 2) 105 | concat_states = tf.concat([states_fw, states_bw], axis = 2) 106 | # rnn_states_expand: (batch_size, sent_len, cell_size * 2, 1) 107 | 
self.rnn_states_expand = tf.expand_dims(concat_states, -1)
108 |
109 |         # CNN network
110 |         with tf.name_scope('CNN'):
111 |             pooled_concat = []
112 |             for filter_size in paras.filter_sizes:
113 |                 with tf.name_scope('conv-pool-%s' % filter_size):
114 |                     # filter: (filter_size, cell_size * 2, 1, num_filters)
115 |                     filter_ = tf.get_variable(
116 |                         name = 'filter-%s' % filter_size,
117 |                         shape = [filter_size, paras.cell_num_units * 2, 1, paras.num_filters])
118 |                     # conv: (batch_size, sequence_length - filter_size + 1, 1, num_filters)
119 |                     conv = tf.nn.conv2d(
120 |                         input = self.rnn_states_expand,
121 |                         filter = filter_,
122 |                         strides = [1, 1, 1, 1],
123 |                         padding = 'VALID',
124 |                         name = 'conv')
125 |                     # bias: (num_filters,)
126 |                     b = tf.get_variable(
127 |                         name = 'bias-%s' % filter_size,
128 |                         shape = [paras.num_filters])
129 |                     h = tf.nn.relu(tf.nn.bias_add(conv, b))
130 |                     # pooled: (batch_size, 1, 1, num_filters)
131 |                     pooled = tf.nn.max_pool(
132 |                         value = h,
133 |                         ksize = [1, paras.sequence_length - filter_size + 1, 1, 1],
134 |                         strides = [1, 1, 1, 1],
135 |                         padding ='VALID',
136 |                         name ='pool')
137 |                     pooled_concat.append(pooled)
138 |             # h_pool: (batch_size, 1, 1, num_filters_total)
139 |             h_pool = tf.concat(pooled_concat, 3)
140 |             # h_pool_flat: (batch_size, num_filters_total)
141 |             self.h_pool_flat = tf.reshape(h_pool, [-1, self.num_filters_total])
142 |             # dropout
143 |             if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
144 |                 self.h_pool_flat = tf.nn.dropout(self.h_pool_flat, 1.0 - paras.cnn_dropout)
145 |
146 |         with tf.name_scope('classify'):
147 |             # logits: (batch_size, n_classes)
148 |             logits = tf.nn.xw_plus_b(self.h_pool_flat, self.w_projection, self.b_projection, 'logits')
149 |             # predicts: (batch_size,)
150 |             self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected(
151 |                 inputs = logits,
152 |                 activation_fn = None,
153 |                 num_outputs = 1), axis = 1)
154 |             self.mse = tf.reduce_mean(tf.cast(
155 |                 tf.squared_difference(
156 |                     self.labels,
157 |                     tf.cast(tf.round(self.predicts), tf.int32)),
158 |                 tf.float32))
159 |
160 |         with tf.name_scope('accuracy'):
161 |             correct_prediction = tf.equal(self.labels,
162 |                 tf.cast(tf.round(self.predicts), tf.int32))
163 |             self.accuracy = tf.reduce_mean(tf.cast(
164 |                 correct_prediction, tf.float32))
165 |
166 |
167 |     def _create_loss(self):
168 |         with tf.name_scope('loss'):
169 |             self.loss = tf.reduce_mean(
170 |                 tf.losses.mean_squared_error(
171 |                     labels = tf.cast(self.labels, tf.float32),
172 |                     predictions = self.predicts))
173 |             # Add l2 loss reg
174 |             self.l2_loss += tf.nn.l2_loss(self.w_projection)
175 |             self.l2_loss += tf.nn.l2_loss(self.b_projection)
176 |             self.loss += self.l2_loss * self.paras.l2_reg_lambda
177 |
178 |
179 |     def _create_optimizer(self):
180 |         self.optimizer = tf.contrib.layers.optimize_loss(
181 |             loss = self.loss,
182 |             global_step = self.global_step,
183 |             learning_rate = self.lr,
184 |             optimizer = 'SGD',
185 |             clip_gradients = self.paras.clip_gradient_norm)
186 |
187 |
188 |     def _create_summary(self):
189 |         log_path = os.path.join(config.model_path, 'tensorboard')
190 |         self.train_writer = tf.summary.FileWriter(
191 |             os.path.join(log_path, 'train'), self.sess.graph)
192 |         self.test_writer = tf.summary.FileWriter(
193 |             os.path.join(log_path, 'test'), self.sess.graph)
194 |         with tf.name_scope('summaries') as scope:
195 |             tf.summary.scalar('loss', self.loss)
196 |             tf.summary.scalar('accuracy', self.accuracy)
197 |
198 |
199 |     def _build_graph(self):
200 |         self._create_placeholder()
201 |         self._create_variable()
202 |         self._inference()
203 |         self._create_loss()
204 |         self._create_optimizer()
205 |
self._create_summary() 206 | print 'Build graph done' 207 | 208 | 209 | def test(): 210 | from data_helper import Helper 211 | sess = tf.Session() 212 | paras = ModelParas() 213 | emb_matrix = Helper.get_emb_matrix() 214 | Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix) 215 | 216 | 217 | if __name__ == '__main__': 218 | test() 219 | -------------------------------------------------------------------------------- /rnn_model.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import numpy as np 8 | import tensorflow as tf 9 | from utils import model_helper 10 | import config 11 | 12 | 13 | class ModelParas(object): 14 | embedding_size = config.embedding_size 15 | sequence_length = None 16 | cell_num_units = 512 17 | num_layers = 1 18 | batch_size = 64 19 | dropout = 0.0 20 | learning_rate = 0.01 21 | decay = 0.99 22 | lrshrink = 5 23 | uniform_init_scale = 0.04 24 | clip_gradient_norm = 5.0 25 | epochs = 20 26 | 27 | 28 | class Model(object): 29 | 30 | 31 | def __init__(self, paras, sess, mode, emb_matrix): 32 | self.paras = paras 33 | self.sess = sess 34 | self.mode = mode 35 | 36 | # Model variable 37 | with tf.device('/cpu:0'): 38 | self.embeddings = tf.get_variable( 39 | name = 'embeddings', 40 | shape = emb_matrix.shape, 41 | dtype = tf.float32, 42 | initializer = tf.constant_initializer(emb_matrix)) 43 | self.global_step = tf.get_variable( 44 | name = 'global_step', 45 | dtype = tf.int32, 46 | initializer = 1, 47 | trainable = False) 48 | 49 | self._build_graph() 50 | 51 | 52 | def _create_placeholder(self): 53 | self.lr = tf.placeholder(tf.float32, [], name = 'learning_rate') 54 | self.sents = tf.placeholder(tf.int32, [None, None], name = 'sents') 55 | with tf.device('/cpu:0'): 56 | self.emb_sents = tf.nn.embedding_lookup( 57 | self.embeddings, self.sents) 58 | self.sent_lengths = tf.placeholder(tf.int32, [None], name = 'sent_lengths') 59 | self.labels = tf.placeholder(tf.int32, [None], name = 'labels') 60 | 61 | 62 | def _inference(self): 63 | with tf.variable_scope('encoder') as varscope: 64 | cells_fw = model_helper.create_rnn_cell( 65 | 'lstm', 66 | self.paras.cell_num_units, 67 | self.paras.num_layers, 68 | self.paras.dropout, 69 | self.mode) 70 | cells_bw = model_helper.create_rnn_cell( 71 | 'lstm', 72 | self.paras.cell_num_units, 73 | self.paras.num_layers, 74 | self.paras.dropout, 75 | self.mode) 76 | outputs, output_states = tf.nn.bidirectional_dynamic_rnn( 77 | cells_fw, 78 | cells_bw, 79 | inputs = self.emb_sents, 80 | sequence_length = self.sent_lengths, 81 | dtype = tf.float32, 82 | scope = varscope) 83 | # states_fw: (batch_size, sent_len, cell_size) 84 | states_fw, states_bw = outputs 85 | concat_states = tf.concat([states_fw, states_bw], axis = 2) 86 | # sent_states: (batch_size, 2 * cell_size) 87 | self.sent_states = tf.reduce_max(concat_states, axis = 1) 88 | 89 | with tf.variable_scope('classify_layer') as varscope: 90 | hidden1 = tf.contrib.layers.fully_connected( 91 | inputs = self.sent_states, 92 | num_outputs = 512) 93 | hidden2 = tf.contrib.layers.fully_connected( 94 | inputs = hidden1, 95 | num_outputs = 5) 96 | self.predicts = tf.reduce_max(tf.contrib.layers.fully_connected( 97 | inputs = hidden2, 98 | activation_fn = None, 99 | num_outputs = 1), axis = 1) 100 | self.mse = tf.reduce_mean(tf.cast( 101 | tf.squared_difference( 102 | self.labels, 103 | tf.cast(tf.round(self.predicts), tf.int32)), 104 | tf.float32)) 105 | 
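        # The head above emits one unbounded score per sentence (the last
        # fully-connected layer has a single output and reduce_max squeezes the
        # final axis), so the rating label is treated as a regression target; the
        # accuracy and MSE ops compare the rounded score with the integer label.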
106 |         with tf.variable_scope('accuracy') as varscope:
107 |             correct_prediction = tf.equal(self.labels,
108 |                 tf.cast(tf.round(self.predicts), tf.int32))
109 |             self.accuracy = tf.reduce_mean(tf.cast(
110 |                 correct_prediction, tf.float32))
111 |
112 |
113 |     def _create_loss(self):
114 |         with tf.variable_scope('loss') as varscope:
115 |             self.loss = tf.reduce_mean(
116 |                 tf.losses.mean_squared_error(
117 |                     labels = tf.cast(self.labels, tf.float32),
118 |                     predictions = self.predicts))
119 |
120 |
121 |     def _create_optimizer(self):
122 |         self.optimizer = tf.contrib.layers.optimize_loss(
123 |             loss = self.loss,
124 |             global_step = self.global_step,
125 |             learning_rate = self.lr,
126 |             optimizer = 'SGD',
127 |             clip_gradients = self.paras.clip_gradient_norm)
128 |
129 |
130 |     def _create_summary(self):
131 |         log_path = os.path.join(config.model_path, 'tensorboard')
132 |         self.train_writer = tf.summary.FileWriter(
133 |             os.path.join(log_path, 'train'), self.sess.graph)
134 |         self.test_writer = tf.summary.FileWriter(
135 |             os.path.join(log_path, 'test'), self.sess.graph)
136 |         with tf.name_scope('summaries') as scope:
137 |             tf.summary.scalar('loss', self.loss)
138 |             tf.summary.scalar('accuracy', self.accuracy)
139 |
140 |
141 |     def _build_graph(self):
142 |         self._create_placeholder()
143 |         self._inference()
144 |         self._create_loss()
145 |         self._create_optimizer()
146 |         self._create_summary()
147 |         print 'Build graph done'
148 |
149 |
150 | def test():
151 |     from data_helper import Helper
152 |     sess = tf.Session()
153 |     paras = ModelParas()
154 |     emb_matrix = Helper.get_emb_matrix()
155 |     Model(paras, sess, tf.contrib.learn.ModeKeys.TRAIN, emb_matrix)
156 |
157 |
158 | if __name__ == '__main__':
159 |     pass
160 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #-*- encoding: utf-8 -*-
3 |
4 |
5 | import os
6 | import time
7 | import math
8 | import codecs
9 | import numpy as np
10 | from sklearn.model_selection import KFold
11 | import tensorflow as tf
12 | from rnn_model import Model, ModelParas
13 | from data_helper import Helper
14 | from predict import predict, fuse_result
15 | from utils.log import logger
16 | import config
17 |
18 |
19 | tf.flags.DEFINE_string('model', 'rnn', 'select model, default is rnn')
20 | tf.flags.DEFINE_string('mode', 'single', 'single, multi or kfold, default is single')
21 | flags = tf.flags.FLAGS
22 |
23 |
24 | def run_epoch(model, input_data):
25 |     start_time = time.time()
26 |     paras = model.paras
27 |     average_loss, average_acc, average_mse = 0.0, 0.0, 0.0
28 |     sents, labels = input_data['sents'], input_data['labels']
29 |     data_length = len(sents)
30 |     if data_length == 0:
31 |         return None
32 |     steps = int(math.ceil(data_length * 1.0 / paras.batch_size))
33 |
34 |     for step in xrange(steps):
35 |         begin = step * paras.batch_size
36 |         end = (step + 1) * paras.batch_size
37 |         batch_sents, batch_lengths = Helper.get_batch(
38 |             sents[begin: end], paras.sequence_length)
39 |         batch_labels = labels[begin: end]
40 |         feed_dict = {
41 |             model.sents: batch_sents,
42 |             model.sent_lengths: batch_lengths,
43 |             model.labels: batch_labels.T,
44 |             model.lr: paras.learning_rate}
45 |         if flags.model == 'cnn_rnn':
46 |             feed_dict[model.pad] = np.zeros((
47 |                 len(labels[begin: end]), 1, paras.embedding_size, 1))
48 |         fetches = {
49 |             'b_loss': model.loss,
50 |             'b_acc': model.accuracy,
51 |             'global_step': model.global_step,
52 |             'b_mse': model.mse,
53 |         }
54 |         if model.mode ==
tf.contrib.learn.ModeKeys.TRAIN: 55 | fetches['optimizer'] = model.optimizer 56 | vals = model.sess.run(fetches, feed_dict) 57 | b_loss, b_acc, b_mse, global_step = ( 58 | vals['b_loss'], vals['b_acc'], 59 | vals['b_mse'], vals['global_step']) 60 | b_score = 1.0 / (1.0 + np.sqrt(b_mse)) 61 | average_loss += b_loss 62 | average_acc += b_acc 63 | average_mse += b_mse 64 | if (model.mode == tf.contrib.learn.ModeKeys.TRAIN and global_step % 10 == 0): 65 | logger.debug('step=%d, b_loss=%.4f, b_acc=%.4f, b_mse=%.4f, b_score=%.4f', 66 | global_step, b_loss, b_acc, b_mse, b_score) 67 | 68 | average_loss /= steps 69 | average_acc /= steps 70 | average_mse /= steps 71 | rmse_score = 1.0 / (1.0 + np.sqrt(average_mse)) 72 | logger.debug('average_loss=%.4f, average_acc=%.4f, average_mse=%.4f, rmse_score=%.4f', 73 | average_loss, average_acc, average_mse, rmse_score) 74 | return rmse_score, global_step 75 | 76 | 77 | def train(train_data, valid_data, test_data, emb_matrix): 78 | """Train the model""" 79 | start_time = time.time() 80 | paras = ModelParas() 81 | tf.reset_default_graph() 82 | sess = tf.Session() 83 | # Init initialzer 84 | uniform_initializer = tf.random_uniform_initializer( 85 | minval = -paras.uniform_init_scale, 86 | maxval = paras.uniform_init_scale) 87 | # Define model for train and evaluate 88 | with tf.name_scope('train'): 89 | with tf.variable_scope('Model', reuse = None, 90 | initializer = uniform_initializer): 91 | model_train = Model(paras, 92 | sess, 93 | tf.contrib.learn.ModeKeys.TRAIN, 94 | emb_matrix) 95 | with tf.name_scope('valid'): 96 | with tf.variable_scope('Model', reuse = True, 97 | initializer = uniform_initializer): 98 | model_eval = Model(paras, 99 | sess, 100 | tf.contrib.learn.ModeKeys.EVAL, 101 | emb_matrix) 102 | # Model Train 103 | init_op = tf.global_variables_initializer() 104 | sess.run(init_op) 105 | best_score = -np.inf 106 | saver = tf.train.Saver() 107 | save_path = os.path.join(config.model_path, 'model/model.ckpt') 108 | for epoch in xrange(paras.epochs): 109 | logger.debug('>>> Epoch %d, learning_rate=%.4f', 110 | epoch, paras.learning_rate) 111 | run_epoch(model_train, train_data) 112 | logger.debug('>>> Running Valid') 113 | score, global_step = run_epoch(model_eval, valid_data) 114 | if score > best_score: 115 | best_score = score 116 | saver.save(sess, save_path) 117 | logger.debug('Score improved, save model to %s', save_path) 118 | else: 119 | saver.restore(sess, save_path) 120 | logger.debug('Score not improved, load previous best model') 121 | logger.debug('Epoch %d done, time=%.4f minutes', 122 | epoch, (time.time() - start_time) / 60) 123 | logger.debug('>>> Running Test') 124 | run_epoch(model_eval, test_data) 125 | del model_train 126 | del model_eval 127 | logger.debug('Predict result') 128 | predict(save_path = os.path.join(config.model_path, 129 | 'result_%f' % best_score)) 130 | 131 | 132 | def tmp_predict(model, save_path): 133 | predict_ids, predict = Helper.get_data(is_train_data = False) 134 | batch_size = model.paras.batch_size 135 | steps = int(math.ceil(len(predict_ids) * 1.0 / batch_size)) 136 | with codecs.open(save_path, 'w', 'utf-8') as out_f: 137 | for step in xrange(steps): 138 | begin = step * batch_size 139 | end = (step + 1) * batch_size 140 | ids = predict_ids[begin: end] 141 | batch_sents, batch_lengths = Helper.get_batch( 142 | predict[begin: end], model.paras.sequence_length) 143 | feed_dict = { 144 | model.sents: batch_sents, 145 | model.sent_lengths: batch_lengths} 146 | res = model.sess.run(model.predicts, 
feed_dict) 147 | ids = ids.tolist() 148 | res = res.tolist() 149 | msgs = predict[begin: end].tolist() 150 | for id_, val, msg in zip(ids, res, msgs): 151 | out_f.write('%s,%f\n' % (id_, val)) 152 | del predict_ids, predict 153 | print 'Predict done' 154 | 155 | 156 | 157 | def main(_): 158 | start_time = time.time() 159 | logger.info('Train begin...') 160 | emb_matrix = Helper.get_emb_matrix() 161 | if flags.mode == 'single': 162 | train_data, valid_data, test_data = Helper.get_data( 163 | is_train_data = True, partition = [0.8, 0.2], rand_seed = 666) 164 | train(train_data, valid_data, test_data, emb_matrix) 165 | elif flags.mode == 'multi': 166 | for i in range(10): 167 | print '>>> Multi %d' % i 168 | train_data, valid_data, test_data = Helper.get_data( 169 | is_train_data = True, partition = [0.8, 0.2], rand_seed = None) 170 | train(train_data, valid_data, test_data, emb_matrix) 171 | fuse_result() 172 | elif flags.mode == 'kfold': 173 | data_, _, _ = Helper.get_data( 174 | is_train_data = True, partition = [1.0], sort_flag = False) 175 | sents, labels = data_['sents'], data_['labels'] 176 | kf = KFold(n_splits = 10, shuffle = True, random_state = 123) 177 | train_data, test_data = {}, {} 178 | cnt = 1 179 | for train_index, test_index in kf.split(sents): 180 | print '>>> KFold %d' % cnt 181 | cnt += 1 182 | train_data['sents'] = sents[train_index] 183 | train_data['labels'] = labels[train_index] 184 | test_data['sents'] = sents[test_index] 185 | test_data['labels'] = labels[test_index] 186 | train_data['sents'], train_data['labels'] = Helper.sort_by_length( 187 | train_data['sents'], train_data['labels']) 188 | test_data['sents'], test_data['labels'] = Helper.sort_by_length( 189 | test_data['sents'], test_data['labels']) 190 | train(train_data, test_data, _, emb_matrix) 191 | fuse_result() 192 | else: 193 | raise ValueError('Train mode must be `single | multi | kfold` !') 194 | logger.info('Train done, time=%.4f hours' % ((time.time() - start_time) / 3600)) 195 | 196 | 197 | if __name__ == '__main__': 198 | log_path = './log/train.log' 199 | if os.path.exists(log_path): 200 | os.remove(log_path) 201 | logger.start(log_path, name = __name__) 202 | model_path = config.model_path 203 | if tf.gfile.Exists(model_path): 204 | tf.gfile.DeleteRecursively(model_path) 205 | logger.debug('Remove old model folder.') 206 | tf.app.run() 207 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dikea/Text-Classification/135b5dec09fcd065b88aa4fdb037607aa8340565/utils/__init__.py -------------------------------------------------------------------------------- /utils/config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | stop_word_set = set([u',', u',', u'。', u'.', u'…', u'·', 6 | u'“', u'”', u'"', u'\'', u'(', u')', u'(', u')', u'~', 7 | u'~', u'、', u'\\', u'/', u':', u':', u';', u';', u'!', 8 | u'!', u'?', u'?', u'×', u'=', u'<', u'>', u'[', u']', u'$', 9 | u'@', u'-', u'_', u'│', u'|', u'↑', u'┬', 10 | ]) 11 | -------------------------------------------------------------------------------- /utils/log.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import logging 6 | import logging.handlers 7 | import os 8 | import os.path 9 | 10 | 11 | class 
Logger(object): 12 | _inst = None 13 | _level_dict = { 14 | 'CRITICAL': logging.CRITICAL, 15 | 'ERROR': logging.ERROR, 16 | 'WARNING': logging.WARNING, 17 | 'INFO': logging.INFO, 18 | 'DEBUG': logging.DEBUG, 19 | 'NOTSET': logging.NOTSET, 20 | } 21 | 22 | @classmethod 23 | def start(cls, log_path, name = None, level = None): 24 | if cls._inst is not None: 25 | return cls._inst 26 | 27 | fpath = '/'.join(log_path.split('/')[0 : -1]) 28 | if False == os.path.exists(fpath): 29 | os.mkdir(fpath) 30 | fmt = '[%(levelname)s] %(asctime)s, pid=%(process)d, src=%(filename)s:%(lineno)d, %(message)s' 31 | datefmt = '%Y-%m-%d %H:%M:%S' 32 | cls._inst = logging.getLogger(name) 33 | log_level = Logger._level_dict[level] if level else 'DEBUG' 34 | cls._inst.setLevel(log_level) 35 | 36 | handler = logging.handlers.RotatingFileHandler( 37 | log_path, maxBytes = 500 * (1<<20), backupCount = 8) 38 | fmtter = logging.Formatter(fmt, datefmt) 39 | handler.setFormatter(fmtter) 40 | 41 | cls._inst.addHandler(handler) 42 | 43 | @classmethod 44 | def get(cls): 45 | return cls._inst 46 | 47 | @classmethod 48 | def info(cls, *args): 49 | return cls._inst.info(*args) 50 | 51 | @classmethod 52 | def debug(cls, *args): 53 | return cls._inst.debug(*args) 54 | 55 | @classmethod 56 | def warn(cls, *args): 57 | return cls._inst.warn(*args) 58 | 59 | 60 | global logger 61 | logger = Logger 62 | 63 | -------------------------------------------------------------------------------- /utils/model_helper.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import tensorflow as tf 6 | 7 | 8 | def create_rnn_cell(unit_type, num_units, num_layers, 9 | dropout, mode, forget_bias = 1.0): 10 | """Create multi-layer RNN cell.""" 11 | cell_list = [] 12 | for i in range(num_layers): 13 | single_cell = _single_cell( 14 | unit_type = unit_type, 15 | num_units = num_units, 16 | forget_bias = forget_bias, 17 | dropout = dropout, 18 | mode = mode) 19 | cell_list.append(single_cell) 20 | if len(cell_list) == 1: 21 | return cell_list[0] 22 | else: 23 | return tf.contrib.rnn.MultiRNNCell(cell_list) 24 | 25 | 26 | def _single_cell(unit_type, num_units, dropout, mode, forget_bias = 1.0): 27 | """Create an instance of a single RNN cell.""" 28 | # Dropout (equal 1 - keep_prob) is set to 0 during eval and infer 29 | dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0 30 | 31 | if unit_type == 'lstm': 32 | single_cell = tf.contrib.rnn.BasicLSTMCell( 33 | num_units, 34 | forget_bias = forget_bias) 35 | 36 | if dropout > 0.0: 37 | single_cell = tf.contrib.rnn.DropoutWrapper( 38 | cell = single_cell, 39 | input_keep_prob = (1.0 - dropout)) 40 | 41 | return single_cell 42 | 43 | 44 | def save_model(save_path, sess, inputs, outputs): 45 | """Save model""" 46 | if tf.gfile.Exists(save_path): 47 | tf.gfile.DeleteRecursively(save_path) 48 | builder = tf.saved_model.builder.SavedModelBuilder(save_path) 49 | inputs_ = {k: tf.saved_model.utils.build_tensor_info(v) 50 | for k, v in inputs.iteritems()} 51 | outputs_ = {k: tf.saved_model.utils.build_tensor_info(v) 52 | for k, v in outputs.iteritems()} 53 | signature = tf.saved_model.signature_def_utils.build_signature_def( 54 | inputs_, outputs_, 'signature_') 55 | builder.add_meta_graph_and_variables(sess, ['saved_model'], 56 | signature_def_map = {'signature': signature}) 57 | builder.save() 58 | 59 | 60 | def get_model_tensor(save_path, sess, inputs_fields, outpus_fields): 61 | """Load model""" 62 | 
meta_graph_def = tf.saved_model.loader.load(sess, 63 | ['saved_model'], save_path) 64 | signature = meta_graph_def.signature_def 65 | 66 | 67 | -------------------------------------------------------------------------------- /utils/nlp_util.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #-*- encoding: utf-8 -*- 3 | 4 | 5 | import os 6 | import time 7 | import jieba 8 | import codecs 9 | import collections 10 | import numpy as np 11 | from gensim import models 12 | import config 13 | 14 | 15 | class NlpUtil(object): 16 | 17 | 18 | @classmethod 19 | def tokenize(cls, text, filter_stop_word = False): 20 | if not isinstance(text, unicode): 21 | return [str(text)] 22 | tokens = jieba.lcut(text) 23 | if filter_stop_word: 24 | stop_word_set = config.stop_word_set 25 | tokens = filter(lambda w: w not in stop_word_set, tokens) 26 | return tokens 27 | 28 | 29 | @classmethod 30 | def train_word2vec(cls, corpus, wv_fpath = ''): 31 | time_s = time.time() 32 | vec_size = 300 33 | win_size = 1 34 | print ('begin to train model...') 35 | w2v_model = models.word2vec.Word2Vec(corpus, 36 | size = vec_size, 37 | window = win_size, 38 | min_count = 2, 39 | workers = 4, 40 | sg = 1, 41 | negative = 15, 42 | iter = 7) 43 | w2v_model.train(corpus, total_examples = len(corpus), epochs = w2v_model.iter) 44 | save_fpath = os.path.join(wv_fpath, 45 | 'w2v_win%s_d%s.model' % (win_size, vec_size)) 46 | w2v_model.save(save_fpath) 47 | print ('save model success, model_path=%s, time=%.4f sec.' 48 | % (save_fpath, time.time() - time_s)) 49 | 50 | 51 | @classmethod 52 | def load_word2vec(cls, w2v_fpath): 53 | w2v_model = models.word2vec.Word2Vec.load(w2v_fpath) 54 | print 'load word2vec success' 55 | wv = w2v_model.wv 56 | del w2v_model 57 | return wv 58 | 59 | 60 | @classmethod 61 | def build_word2id(cls, corpus): 62 | """Convert corpus from word to id 63 | Args: 64 | corpus: a list of all words 65 | 66 | Returns: 67 | word_to_id: a dict of word to id 68 | """ 69 | counter = collections.Counter(corpus) 70 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 71 | words, _ = list(zip(*count_pairs)) 72 | word2id = dict(zip(words, range(1, len(words) + 1))) 73 | return word2id 74 | 75 | 76 | @classmethod 77 | def build_emb_matrix(cls, word2vec, emb_size, word2id, 78 | init_scale = 0.25, norm_flag = False): 79 | vocab_size = len(word2id) 80 | emb_matrix = np.zeros((vocab_size + 1, emb_size), np.float32) 81 | for w, id_ in word2id.iteritems(): 82 | if w in word2vec: 83 | emb_matrix[id_] = word2vec[w] 84 | else: 85 | emb_matrix[id_] = np.random.uniform( 86 | -init_scale, init_scale, emb_size) 87 | return emb_matrix 88 | 89 | 90 | def test(): 91 | # Test tokenize 92 | print '|'.join(NlpUtil.tokenize(u'天气很好')).encode('utf-8') 93 | 94 | ''' 95 | # Test word2vec 96 | wv = NlpUtil.load_word2vec('./model/word2vec/w2v_win1_d128.model') 97 | print wv[u'天气'] 98 | print '|'.join([x[0] for x in wv.most_similar(positive = [u'天气'])]).encode('utf-8') 99 | ''' 100 | 101 | 102 | 103 | if __name__ == '__main__': 104 | test() 105 | --------------------------------------------------------------------------------
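A minimal usage sketch (not part of the repository, added for illustration). It shows how the artifacts written by preprocess.py are consumed downstream: word2id.txt maps each token to an integer id, and the saved embedding matrix is indexed by those ids, mirroring what data_helper.Helper does when it loads the tokenized corpus. The example sentence is hypothetical.

import codecs
import numpy as np
import config

# Rebuild the token -> id mapping written by preprocess.py (one "word\tid" per line)
word2id = {}
with codecs.open(config.word2id_fpath, 'r', 'utf-8') as in_f:
    for line in in_f:
        word, id_ = line.rstrip().split('\t')
        word2id[word] = int(id_)

# emb_matrix has shape (vocab_size + 1, embedding_size); row 0 stays zero and is
# what the padding ids produced by Helper.get_batch point to
emb_matrix = np.load(config.emb_matrix_fpath)

# A hypothetical sentence in the '|'-joined token format produced by preprocess.py
tokens = u'天气|很|好'.split('|')
ids = [word2id[w] for w in tokens if w in word2id]  # out-of-vocabulary tokens are dropped
vectors = emb_matrix[ids]                           # (len(ids), embedding_size) float32 array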