├── .gitignore
├── README.md
├── data
│   └── placeholder
├── model.jpeg
└── src
    ├── bilm
    │   ├── __init__.py
    │   ├── data.py
    │   ├── elmo.py
    │   ├── model.py
    │   └── training.py
    ├── config.py
    ├── model
    │   ├── __init__.py
    │   ├── attention.py
    │   ├── bilstm_model.py
    │   ├── capsule_model.py
    │   ├── convlstm_model.py
    │   ├── dpcnn_model.py
    │   ├── han_model.py
    │   ├── hybrid_nn_1.py
    │   ├── lightgbm_model.py
    │   ├── lstmconv_model.py
    │   ├── lstmgru_model.py
    │   ├── ml_models.py
    │   ├── model_basic.py
    │   ├── model_component.py
    │   ├── modeling.py
    │   ├── my_callbacks.py
    │   ├── rcnn_model.py
    │   ├── snapshot.py
    │   ├── textcnn_model.py
    │   └── xgboost_model.py
    ├── pack_sub_dt2.py
    ├── preprocess
    │   ├── .ipynb_checkpoints
    │   │   └── EDA-checkpoint.ipynb
    │   ├── EDA.ipynb
    │   ├── word_tests.txt
    │   └── words.txt
    ├── stacking.py
    ├── tokenization.py
    ├── train_elmo.py
    └── train_predict.py
/.gitignore:
--------------------------------------------------------------------------------
1 | ckpt*/
2 | ./src/bilm/dump/
3 | ./src/bilm/result/
4 | ./src/runs/
5 | data/
6 | backup/
7 | src/loss/
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | bin/
17 | build/
18 | develop-eggs/
19 | dist/
20 | eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 |
30 | # Installer logs
31 | pip-log.txt
32 | pip-delete-this-directory.txt
33 |
34 | # Unit test / coverage reports
35 | .tox/
36 | .coverage
37 | .cache
38 | nosetests.xml
39 | coverage.xml
40 |
41 | # Translations
42 | *.mo
43 |
44 | # Mr Developer
45 | .mr.developer.cfg
46 | .project
47 | .pydevproject
48 |
49 | # Rope
50 | .ropeproject
51 |
52 | # Django stuff:
53 | *.log
54 | *.pot
55 |
56 | # Sphinx documentation
57 | docs/_build/
58 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CCF-BDCI 2018 Automotive-Domain Aspect-level Sentiment Classification (ASC) Challenge
2 |
3 | We had no prior experience with ASC/TSC. At first we went back and forth on whether this was a single-label or multi-label classification problem and took some detours. We eventually came back to the ASC formulation and, following intuition, designed a memory-based LSTM-attention model. It scored around 0.69 online on the semi-final B leaderboard, and the final ensemble reached 0.70. The single-model architecture is shown below:
4 |
5 | 
6 |
7 |
8 | Time was tight toward the end, and our reproductions of this year's ASC papers did not perform well. We finished 6th out of 1701 teams. The approach is exactly what the code shows, and it is quite simple.
9 |
10 | The raw data can be downloaded from the [competition data page](https://www.datafountain.cn/competitions/310/details/data-evaluation). Because we modeled the problem in several different ways and the preprocessing code is correspondingly large, I have uploaded a preprocessed copy of the data (including extracted BERT features and Baidu Baike word vectors) to [Baidu Cloud](https://pan.baidu.com/s/1ZrgQ6Wp_sFRPrZGjZiBPaA); after downloading, please extract it into the `data/` directory.
11 |
12 | Neither HIT's PyTorch-based pretrained ELMo nor the ELMo I pretrained with TensorFlow on the training set worked well, but I have kept the TensorFlow pretraining code.
13 |
14 | We did not fine-tune BERT; we extracted features directly, which performed about as well as the Baidu Baike word vectors.
15 |
16 | If you have any ideas, feel free to open an issue or a pull request, or discuss with me directly over WeChat. Let's learn and improve together.
17 |
18 |
19 | ### 1. Environment
20 |
21 | |Environment / Library|Version|
22 | |:---------:|----------|
23 | |Ubuntu|16.04.5 LTS|
24 | |python|3.6|
25 | |jupyter notebook|4.2.3|
26 | |tensorflow-gpu|1.9.1|
27 | |numpy|1.14.1|
28 | |pandas|0.23.0|
29 | |matplotlib|2.2.2|
30 | |tqdm|4.24.0|
31 |
32 | The most important point is that we use the cuDNN implementation of LSTM, so the TensorFlow version must be newer than 1.4.0; accordingly, CUDA 8.0 cannot be used and CUDA 9.0 or later is required.
33 |
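For reference, this is the call pattern the deep models use for the cuDNN cells (see `src/model/attention.py`); it is only a minimal sketch with arbitrary sizes. `CudnnLSTM` expects time-major input, so the sequence is transposed before and after the call:

```
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn

# batch-major embeddings: [batch, time, embed_size]
inputs = tf.placeholder(tf.float32, shape=[None, 128, 300])

lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=150,
                           direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION,
                           dtype=tf.float32)
x, _ = lstm(tf.transpose(inputs, [1, 0, 2]))  # cuDNN cells are time-major
x = tf.transpose(x, [1, 0, 2])                # back to [batch, time, 2*num_units]
```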
34 |
35 | ### 2. Data preprocessing
36 |
37 | Everything is written in Jupyter: run `src/preprocess/EDA.ipynb` to generate the various files. It is worth reading to understand the approach, but we recommend downloading the preprocessed results from the cloud drive instead.
38 |
39 |
40 | ### 3. Training the deep models
41 |
42 | Once the data is preprocessed, a model can be trained directly on a single GPU. Pick a model from `src/config.py`; see `src/train_predict.py` for the meaning of each argument:
43 |
44 | ```
45 | python train_predict.py --gpu 7 --model aspv0 --feature word --epoch 20 --bs 128 --oe
46 | ```
47 |
48 |
49 | ### 4. Model ensembling and output
50 |
51 | ```
52 | python stacking.py --gpu 1 --data_type 3
53 | ```
54 |
55 | `stacking` and pseudo-labeling are done together here; edit the code to choose whether to use pseudo labels.
56 |
57 | The dataset is a good fit for this, and pseudo labels give a measurable score boost.
58 |
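To make the pseudo-labeling idea concrete, here is a minimal, self-contained sketch of the general recipe (confident first-round test predictions are turned into hard labels and appended to the training set). The array names, shapes, and the 0.9 threshold are illustrative assumptions, not the exact logic in `stacking.py`:

```
import numpy as np

rng = np.random.RandomState(0)
# Illustrative data: 10 aspects, 4 sentiment classes per aspect.
x_train = rng.randint(0, 1000, size=(100, 128))           # token-id sequences
y_train = np.eye(4)[rng.randint(0, 4, size=(100, 10))]    # one-hot labels, [n, 10, 4]
x_test = rng.randint(0, 1000, size=(50, 128))
test_prob = rng.dirichlet(np.ones(4), size=(50, 10))      # first-round predicted probs

conf = test_prob.max(axis=-1).min(axis=-1)    # per-sample confidence: least certain aspect
keep = conf > 0.9                             # illustrative confidence threshold
pseudo_y = np.eye(4)[test_prob[keep].argmax(-1)]           # hard pseudo labels
x_aug = np.concatenate([x_train, x_test[keep]], axis=0)
y_aug = np.concatenate([y_train, pseudo_y], axis=0)
# x_aug / y_aug would then be used to refit the model.
```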
59 | ### 5. Generating the submission
60 |
61 | Edit `pre_path` in `src/pack_sub_dt2.py` to point to the probability file produced by stacking, then run
62 |
63 | ```
64 | python pack_sub_dt2.py
65 | ```
66 |
67 | to generate the submission file.
68 |
69 |
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/data/placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/data/placeholder
--------------------------------------------------------------------------------
/model.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/model.jpeg
--------------------------------------------------------------------------------
/src/bilm/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .data import Batcher, TokenBatcher
3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \
4 | dump_bilm_embeddings
5 | from .elmo import weight_layers
6 |
7 |
--------------------------------------------------------------------------------
/src/bilm/data.py:
--------------------------------------------------------------------------------
1 | # originally based on https://github.com/tensorflow/models/tree/master/lm_1b
2 | import glob
3 | import random
4 |
5 | import numpy as np
6 |
7 | from typing import List
8 |
9 |
10 | class Vocabulary(object):
11 | '''
12 | A token vocabulary. Holds a map from token to ids and provides
13 | a method for encoding text to a sequence of ids.
14 | '''
15 | def __init__(self, filename, validate_file=False):
16 | '''
17 | filename = the vocabulary file. It is a flat text file with one
18 | (normalized) token per line. In addition, the file should also
19 | contain the special tokens <S>, </S>, <UNK> (case sensitive).
20 | '''
21 | self._id_to_word = []
22 | self._word_to_id = {}
23 | self._unk = -1
24 | self._bos = -1
25 | self._eos = -1
26 |
27 | with open(filename) as f:
28 | idx = 0
29 | for line in f:
30 | word_name = line.strip()
31 | if word_name == '<S>':
32 | self._bos = idx
33 | elif word_name == '</S>':
34 | self._eos = idx
35 | elif word_name == '<UNK>':
36 | self._unk = idx
37 | if word_name == '!!!MAXTERMID':
38 | continue
39 |
40 | self._id_to_word.append(word_name)
41 | self._word_to_id[word_name] = idx
42 | idx += 1
43 |
44 | # check to ensure file has special tokens
45 | if validate_file:
46 | if self._bos == -1 or self._eos == -1 or self._unk == -1:
47 | raise ValueError("Ensure the vocabulary file has "
48 | ", , tokens")
49 |
50 | @property
51 | def bos(self):
52 | return self._bos
53 |
54 | @property
55 | def eos(self):
56 | return self._eos
57 |
58 | @property
59 | def unk(self):
60 | return self._unk
61 |
62 | @property
63 | def size(self):
64 | return len(self._id_to_word)
65 |
66 | def word_to_id(self, word):
67 | if word in self._word_to_id:
68 | return self._word_to_id[word]
69 | return self.unk
70 |
71 | def id_to_word(self, cur_id):
72 | return self._id_to_word[cur_id]
73 |
74 | def decode(self, cur_ids):
75 | """Convert a list of ids to a sentence, with space inserted."""
76 | return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])
77 |
78 | def encode(self, sentence, reverse=False, split=True):
79 | """Convert a sentence to a list of ids, with special tokens added.
80 | Sentence is a single string with tokens separated by whitespace.
81 |
82 | If reverse, then the sentence is assumed to be reversed, and
83 | this method will swap the BOS/EOS tokens appropriately."""
84 |
85 | if split:
86 | word_ids = [
87 | self.word_to_id(cur_word) for cur_word in sentence.split()
88 | ]
89 | else:
90 | word_ids = [self.word_to_id(cur_word) for cur_word in sentence]
91 |
92 | if reverse:
93 | return np.array([self.eos] + word_ids + [self.bos], dtype=np.int32)
94 | else:
95 | return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32)
96 |
97 |
98 | class UnicodeCharsVocabulary(Vocabulary):
99 | """Vocabulary containing character-level and word level information.
100 |
101 | Has a word vocabulary that is used to lookup word ids and
102 | a character id that is used to map words to arrays of character ids.
103 |
104 | The character ids are defined by ord(c) for c in word.encode('utf-8')
105 | This limits the total number of possible char ids to 256.
106 | To this we add 5 additional special ids: begin sentence, end sentence,
107 | begin word, end word and padding.
108 |
109 | WARNING: for prediction, we add +1 to the output ids from this
110 | class to create a special padding id (=0). As a result, we suggest
111 | you use the `Batcher`, `TokenBatcher`, and `LMDataset` classes instead
112 | of this lower level class. If you are using this lower level class,
113 | then be sure to add the +1 appropriately, otherwise embeddings computed
114 | from the pre-trained model will be useless.
115 | """
116 | def __init__(self, filename, max_word_length, **kwargs):
117 | super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs)
118 | self._max_word_length = max_word_length
119 |
120 | # char ids 0-255 come from utf-8 encoding bytes
121 | # assign 256-300 to special chars
122 | self.bos_char = 256  # <begin sentence>
123 | self.eos_char = 257  # <end sentence>
124 | self.bow_char = 258  # <begin word>
125 | self.eow_char = 259  # <end word>
126 | self.pad_char = 260  # <padding>
127 |
128 | num_words = len(self._id_to_word)
129 |
130 | self._word_char_ids = np.zeros([num_words, max_word_length],
131 | dtype=np.int32)
132 |
133 | # the character representation of the begin/end of sentence characters
134 | def _make_bos_eos(c):
135 | r = np.zeros([self.max_word_length], dtype=np.int32)
136 | r[:] = self.pad_char
137 | r[0] = self.bow_char
138 | r[1] = c
139 | r[2] = self.eow_char
140 | return r
141 | self.bos_chars = _make_bos_eos(self.bos_char)
142 | self.eos_chars = _make_bos_eos(self.eos_char)
143 |
144 | for i, word in enumerate(self._id_to_word):
145 | self._word_char_ids[i] = self._convert_word_to_char_ids(word)
146 |
147 | self._word_char_ids[self.bos] = self.bos_chars
148 | self._word_char_ids[self.eos] = self.eos_chars
149 | # TODO: properly handle <UNK>
150 |
151 | @property
152 | def word_char_ids(self):
153 | return self._word_char_ids
154 |
155 | @property
156 | def max_word_length(self):
157 | return self._max_word_length
158 |
159 | def _convert_word_to_char_ids(self, word):
160 | code = np.zeros([self.max_word_length], dtype=np.int32)
161 | code[:] = self.pad_char
162 |
163 | word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)]
164 | code[0] = self.bow_char
165 | for k, chr_id in enumerate(word_encoded, start=1):
166 | code[k] = chr_id
167 | code[k + 1] = self.eow_char
168 |
169 | return code
170 |
171 | def word_to_char_ids(self, word):
172 | if word in self._word_to_id:
173 | return self._word_char_ids[self._word_to_id[word]]
174 | else:
175 | return self._convert_word_to_char_ids(word)
176 |
177 | def encode_chars(self, sentence, reverse=False, split=True):
178 | '''
179 | Encode the sentence as a white space delimited string of tokens.
180 | '''
181 | if split:
182 | chars_ids = [self.word_to_char_ids(cur_word)
183 | for cur_word in sentence.split()]
184 | else:
185 | chars_ids = [self.word_to_char_ids(cur_word)
186 | for cur_word in sentence]
187 | if reverse:
188 | return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
189 | else:
190 | return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])
191 |
192 |
193 | class Batcher(object):
194 | '''
195 | Batch sentences of tokenized text into character id matrices.
196 | '''
197 | def __init__(self, lm_vocab_file: str, max_token_length: int):
198 | '''
199 | lm_vocab_file = the language model vocabulary file (one line per
200 | token)
201 | max_token_length = the maximum number of characters in each token
202 | '''
203 | self._lm_vocab = UnicodeCharsVocabulary(
204 | lm_vocab_file, max_token_length
205 | )
206 | self._max_token_length = max_token_length
207 |
208 | def batch_sentences(self, sentences: List[List[str]]):
209 | '''
210 | Batch the sentences as character ids
211 | Each sentence is a list of tokens without <S> or </S>, e.g.
212 | [['The', 'first', 'sentence', '.'], ['Second', '.']]
213 | '''
214 | n_sentences = len(sentences)
215 | max_length = max(len(sentence) for sentence in sentences) + 2
216 |
217 | X_char_ids = np.zeros(
218 | (n_sentences, max_length, self._max_token_length),
219 | dtype=np.int64
220 | )
221 |
222 | for k, sent in enumerate(sentences):
223 | length = len(sent) + 2
224 | char_ids_without_mask = self._lm_vocab.encode_chars(
225 | sent, split=False)
226 | # add one so that 0 is the mask value
227 | X_char_ids[k, :length, :] = char_ids_without_mask + 1
228 |
229 | return X_char_ids
230 |
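# Example usage (a minimal sketch; 'vocab.txt' is a hypothetical vocabulary path):
#   batcher = Batcher('vocab.txt', 50)
#   X = batcher.batch_sentences([['I', 'like', 'cars', '.']])
#   # X.shape == (1, 6, 50): 4 tokens plus <S>/</S>, each as 50 char ids (offset by +1, 0 = mask)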
231 |
232 | class TokenBatcher(object):
233 | '''
234 | Batch sentences of tokenized text into token id matrices.
235 | '''
236 | def __init__(self, lm_vocab_file: str):
237 | '''
238 | lm_vocab_file = the language model vocabulary file (one line per
239 | token)
240 | '''
241 | self._lm_vocab = Vocabulary(lm_vocab_file)
242 |
243 | def batch_sentences(self, sentences: List[List[str]]):
244 | '''
245 | Batch the sentences as character ids
246 | Each sentence is a list of tokens without <S> or </S>, e.g.
247 | [['The', 'first', 'sentence', '.'], ['Second', '.']]
248 | '''
249 | n_sentences = len(sentences)
250 | max_length = max(len(sentence) for sentence in sentences) + 2
251 |
252 | X_ids = np.zeros((n_sentences, max_length), dtype=np.int64)
253 |
254 | for k, sent in enumerate(sentences):
255 | length = len(sent) + 2
256 | ids_without_mask = self._lm_vocab.encode(sent, split=False)
257 | # add one so that 0 is the mask value
258 | X_ids[k, :length] = ids_without_mask + 1
259 |
260 | return X_ids
261 |
262 |
263 | ##### for training
264 | def _get_batch(generator, batch_size, num_steps, max_word_length):
265 | """Read batches of input."""
266 | cur_stream = [None] * batch_size
267 |
268 | no_more_data = False
269 | while True:
270 | inputs = np.zeros([batch_size, num_steps], np.int32)
271 | if max_word_length is not None:
272 | char_inputs = np.zeros([batch_size, num_steps, max_word_length],
273 | np.int32)
274 | else:
275 | char_inputs = None
276 | targets = np.zeros([batch_size, num_steps], np.int32)
277 |
278 | for i in range(batch_size):
279 | cur_pos = 0
280 |
281 | while cur_pos < num_steps:
282 | if cur_stream[i] is None or len(cur_stream[i][0]) <= 1:
283 | try:
284 | cur_stream[i] = list(next(generator))
285 | except StopIteration:
286 | # No more data, exhaust current streams and quit
287 | no_more_data = True
288 | break
289 |
290 | how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos)
291 | next_pos = cur_pos + how_many
292 |
293 | inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many]
294 | if max_word_length is not None:
295 | char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][
296 | :how_many]
297 | targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1]
298 |
299 | cur_pos = next_pos
300 |
301 | cur_stream[i][0] = cur_stream[i][0][how_many:]
302 | if max_word_length is not None:
303 | cur_stream[i][1] = cur_stream[i][1][how_many:]
304 |
305 | if no_more_data:
306 | # There is no more data. Note: this will not return data
307 | # for the incomplete batch
308 | break
309 |
310 | X = {'token_ids': inputs, 'tokens_characters': char_inputs,
311 | 'next_token_id': targets}
312 |
313 | yield X
314 |
315 | class LMDataset(object):
316 | """
317 | Hold a language model dataset.
318 |
319 | A dataset is a list of tokenized files. Each file contains one sentence
320 | per line. Each sentence is pre-tokenized and white space joined.
321 | """
322 | def __init__(self, filepattern, vocab, reverse=False, test=False,
323 | shuffle_on_load=False):
324 | '''
325 | filepattern = a glob string that specifies the list of files.
326 | vocab = an instance of Vocabulary or UnicodeCharsVocabulary
327 | reverse = if True, then iterate over tokens in each sentence in reverse
328 | test = if True, then iterate through all data once then stop.
329 | Otherwise, iterate forever.
330 | shuffle_on_load = if True, then shuffle the sentences after loading.
331 | '''
332 | self._vocab = vocab
333 | self._all_shards = glob.glob(filepattern)
334 | print('Found %d shards at %s' % (len(self._all_shards), filepattern))
335 | self._shards_to_choose = []
336 |
337 | self._reverse = reverse
338 | self._test = test
339 | self._shuffle_on_load = shuffle_on_load
340 | self._use_char_inputs = hasattr(vocab, 'encode_chars')
341 |
342 | self._ids = self._load_random_shard()
343 |
344 | def _choose_random_shard(self):
345 | if len(self._shards_to_choose) == 0:
346 | self._shards_to_choose = list(self._all_shards)
347 | random.shuffle(self._shards_to_choose)
348 | shard_name = self._shards_to_choose.pop()
349 | return shard_name
350 |
351 | def _load_random_shard(self):
352 | """Randomly select a file and read it."""
353 | if self._test:
354 | if len(self._all_shards) == 0:
355 | # we've loaded all the data
356 | # this will propagate up to the generator in get_batch
357 | # and stop iterating
358 | raise StopIteration
359 | else:
360 | shard_name = self._all_shards.pop()
361 | else:
362 | # just pick a random shard
363 | shard_name = self._choose_random_shard()
364 |
365 | ids = self._load_shard(shard_name)
366 | self._i = 0
367 | self._nids = len(ids)
368 | return ids
369 |
370 | def _load_shard(self, shard_name):
371 | """Read one file and convert to ids.
372 |
373 | Args:
374 | shard_name: file path.
375 |
376 | Returns:
377 | list of (id, char_id) tuples.
378 | """
379 | print('Loading data from: %s' % shard_name)
380 | with open(shard_name) as f:
381 | sentences_raw = f.readlines()
382 |
383 | if self._reverse:
384 | sentences = []
385 | for sentence in sentences_raw:
386 | splitted = sentence.split()
387 | splitted.reverse()
388 | sentences.append(' '.join(splitted))
389 | else:
390 | sentences = sentences_raw
391 |
392 | if self._shuffle_on_load:
393 | random.shuffle(sentences)
394 |
395 | ids = [self.vocab.encode(sentence, self._reverse)
396 | for sentence in sentences]
397 | if self._use_char_inputs:
398 | chars_ids = [self.vocab.encode_chars(sentence, self._reverse)
399 | for sentence in sentences]
400 | else:
401 | chars_ids = [None] * len(ids)
402 |
403 | print('Loaded %d sentences.' % len(ids))
404 | print('Finished loading')
405 | return list(zip(ids, chars_ids))
406 |
407 | def get_sentence(self):
408 | while True:
409 | if self._i == self._nids:
410 | self._ids = self._load_random_shard()
411 | ret = self._ids[self._i]
412 | self._i += 1
413 | yield ret
414 |
415 | @property
416 | def max_word_length(self):
417 | if self._use_char_inputs:
418 | return self._vocab.max_word_length
419 | else:
420 | return None
421 |
422 | def iter_batches(self, batch_size, num_steps):
423 | for X in _get_batch(self.get_sentence(), batch_size, num_steps,
424 | self.max_word_length):
425 |
426 | # token_ids = (batch_size, num_steps)
427 | # char_inputs = (batch_size, num_steps, 50) of character ids
428 | # targets = word ID of next word (batch_size, num_steps)
429 | yield X
430 |
431 | @property
432 | def vocab(self):
433 | return self._vocab
434 |
435 | class BidirectionalLMDataset(object):
436 | def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False):
437 | '''
438 | bidirectional version of LMDataset
439 | '''
440 | self._data_forward = LMDataset(
441 | filepattern, vocab, reverse=False, test=test,
442 | shuffle_on_load=shuffle_on_load)
443 | self._data_reverse = LMDataset(
444 | filepattern, vocab, reverse=True, test=test,
445 | shuffle_on_load=shuffle_on_load)
446 |
447 | def iter_batches(self, batch_size, num_steps):
448 | max_word_length = self._data_forward.max_word_length
449 |
450 | for X, Xr in zip(
451 | _get_batch(self._data_forward.get_sentence(), batch_size,
452 | num_steps, max_word_length),
453 | _get_batch(self._data_reverse.get_sentence(), batch_size,
454 | num_steps, max_word_length)
455 | ):
456 |
457 | for k, v in Xr.items():
458 | X[k + '_reverse'] = v
459 |
460 | yield X
461 |
462 |
463 | class InvalidNumberOfCharacters(Exception):
464 | pass
465 |
466 |
--------------------------------------------------------------------------------
/src/bilm/elmo.py:
--------------------------------------------------------------------------------
1 |
2 | import tensorflow as tf
3 |
4 | def weight_layers(name, bilm_ops, l2_coef=None,
5 | use_top_only=False, do_layer_norm=False):
6 | '''
7 | Weight the layers of a biLM with trainable scalar weights to
8 | compute ELMo representations.
9 |
10 | For each output layer, this returns two ops. The first computes
11 | a layer specific weighted average of the biLM layers, and
12 | the second the l2 regularizer loss term.
13 | The regularization terms are also added to tf.GraphKeys.REGULARIZATION_LOSSES
14 |
15 | Input:
16 | name = a string prefix used for the trainable variable names
17 | bilm_ops = the tensorflow ops returned to compute internal
18 | representations from a biLM. This is the return value
19 | from BidirectionalLanguageModel(...)(ids_placeholder)
20 | l2_coef: the l2 regularization coefficient $\lambda$.
21 | Pass None or 0.0 for no regularization.
22 | use_top_only: if True, then only use the top layer.
23 | do_layer_norm: if True, then apply layer normalization to each biLM
24 | layer before normalizing
25 |
26 | Output:
27 | {
28 | 'weighted_op': op to compute weighted average for output,
29 | 'regularization_op': op to compute regularization term
30 | }
31 | '''
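# In equation form (matching the ops built below):
#   ELMo = gamma * sum_k s_k * h_k,   with s = softmax(W + 1/n_lm_layers)
# where h_k is the k-th biLM layer, W is the '{}_ELMo_W' variable and
# gamma is the '{}_ELMo_gamma' variable.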
32 | def _l2_regularizer(weights):
33 | if l2_coef is not None:
34 | return l2_coef * tf.reduce_sum(tf.square(weights))
35 | else:
36 | return 0.0
37 |
38 | # Get ops for computing LM embeddings and mask
39 | lm_embeddings = bilm_ops['lm_embeddings']
40 | mask = bilm_ops['mask']
41 |
42 | n_lm_layers = int(lm_embeddings.get_shape()[1])
43 | lm_dim = int(lm_embeddings.get_shape()[3])
44 |
45 | with tf.control_dependencies([lm_embeddings, mask]):
46 | # Cast the mask and broadcast for layer use.
47 | mask_float = tf.cast(mask, 'float32')
48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1)
49 |
50 | def _do_ln(x):
51 | # do layer normalization excluding the mask
52 | x_masked = x * broadcast_mask
53 | N = tf.reduce_sum(mask_float) * lm_dim
54 | mean = tf.reduce_sum(x_masked) / N
55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2
56 | ) / N
57 | return tf.nn.batch_normalization(
58 | x, mean, variance, None, None, 1E-12
59 | )
60 |
61 | if use_top_only:
62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
63 | # just the top layer
64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1)
65 | # no regularization
66 | reg = 0.0
67 | else:
68 | W = tf.get_variable(
69 | '{}_ELMo_W'.format(name),
70 | shape=(n_lm_layers, ),
71 | initializer=tf.zeros_initializer,
72 | regularizer=_l2_regularizer,
73 | trainable=True,
74 | )
75 |
76 | # normalize the weights
77 | normed_weights = tf.split(
78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers
79 | )
80 | # split LM layers
81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1)
82 |
83 | # compute the weighted, normalized LM activations
84 | pieces = []
85 | for w, t in zip(normed_weights, layers):
86 | if do_layer_norm:
87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1)))
88 | else:
89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1))
90 | sum_pieces = tf.add_n(pieces)
91 |
92 | # get the regularizer
93 | reg = [
94 | r for r in tf.get_collection(
95 | tf.GraphKeys.REGULARIZATION_LOSSES)
96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0
97 | ]
98 | if len(reg) != 1:
99 | raise ValueError
100 |
101 | # scale the weighted sum by gamma
102 | gamma = tf.get_variable(
103 | '{}_ELMo_gamma'.format(name),
104 | shape=(1, ),
105 | initializer=tf.ones_initializer,
106 | regularizer=None,
107 | trainable=True,
108 | )
109 | weighted_lm_layers = sum_pieces * gamma
110 |
111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg}
112 |
113 | return ret
114 |
115 |
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | # from model.lightgbm_model import LightGbmModel
2 | # from model.xgboost_model import XgboostModel
3 | from model.textcnn_model import TextCNNModel
4 | from model.dpcnn_model import DpcnnModel
5 | from model.capsule_model import CapsuleModel
6 | from model.rcnn_model import RCNNModel
7 | from model.attention import AttentionModel
8 | from model.convlstm_model import ConvlstmModel
9 | from model.lstmconv_model import LstmconvModel
10 | from model.lstmgru_model import LstmgruModel
11 | from model.han_model import HANModel
12 | from model.hybrid_nn_1 import HybridNN1Model
13 | from model.ml_models import SVCClassifier
14 | from model.ml_models import Fasttext
15 | from model.bilstm_model import *
16 |
17 |
18 | class Config(object):
19 |
20 | """Docstring for Config. """
21 |
22 | def __init__(self):
23 | """TODO: to be defined1. """
24 | self.model = {
25 | # 'xgboost': XgboostModel,
26 | # 'lightgbm': LightGbmModel,
27 | # 'svc': SVCClassifier,
28 | # 'fasttext': Fasttext,
29 |
30 | # dl model
31 | 'aspv0': BilstmV0,
32 | 'aspv1': BilstmV1,
33 | # 'aspv2': BilstmV2,
34 | 'textcnn': TextCNNModel,
35 | 'lstmgru': LstmgruModel,
36 | 'attention': AttentionModel,
37 | 'convlstm': ConvlstmModel,
38 | 'lstmconv': LstmconvModel,
39 | # 'dpcnn': DpcnnModel,
40 | # 'rcnn': RCNNModel,
41 | # 'capsule': CapsuleModel,
42 | # 'han': HANModel,
43 | # 'hybridnn1': HybridNN1Model,
44 | }
45 | self.CHAR_MAXLEN = 190
46 | self.WORD_MAXLEN = 128
47 |
48 | self.HANN_SENT = 20
49 | self.HANN_WORD_LEN = 40
50 | self.HANN_CHAR_LEN = 70
51 | self.EMBED_SIZE = 300
52 | self.main_feature = 'word'
53 | self.is_debug = True
54 | # self.elmo_word_options_file = './bilm/dump/options.word.json'
55 | # self.elmo_word_weight_file = './bilm/dump/weights.word.hdf5'
56 | # self.elmo_word_embed_file = './bilm/dump/vocab_embedding.word.hdf5'
57 | # self.elmo_word_vocab_file = '../data/word2vec_models/word2vec.word.300d.vocab.txt'
58 |
59 | # self.elmo_char_options_file = './bilm/dump/options.char.json'
60 | # self.elmo_char_weight_file = './bilm/dump/weights.char.hdf5'
61 | # self.elmo_char_embed_file = './bilm/dump/vocab_embedding.char.hdf5'
62 | # self.elmo_char_vocab_file = '../data/word2vec_models/word2vec.char.300d.vocab.txt'
63 |
64 | # self.elmo_qiuqiu_options_file = './bilm/dump/tmp/options.json'
65 | # self.elmo_qiuqiu_weight_file = './bilm/dump/tmp/weight-11-4.hdf5'
66 | # self.elmo_qiuqiu_embed_file = './bilm/dump/tmp/word_embedding.after.elmo-11-4.hdf5'
67 | # self.elmo_qiuqiu_vocab_file = './bilm/dump/tmp/sa_elmo_vocabs.txt'
68 |
69 | self.loss_path = '../data/loss'
70 | self.TEST_X = '../data/csvs/test_public.csv'
71 | self.TRAIN_MULTI_X = '../data/csvs/train_multi.csv'
72 | self.TRAIN_JP = '../data/csvs/round2zh2jp.csv'
73 | self.TRAIN_EN = '../data/csvs/round2zh2en.csv'
74 | # self.SENTIMENT_EMBED_PATH = '../data/sentiment_embedding.pkl'
75 |
76 | # self.BERT_VOCAB_FILES = '../data/chinese_L-12_H-768_A-12/vocab.txt'
77 | # self.BERT_CONFIG_FILES = '../data/chinese_L-12_H-768_A-12/bert_config.json'
78 |
79 | # self.Y_DISTILLATION = '../data/result/oof.pkl'
80 |
81 | # These properties are evaluated lazily on first access, so EMBED_SIZE is set before the matching word vectors are loaded
82 | @property
83 | def char_stoi_file(self):
84 | if self.car:
85 | return '../data/char_item_to_id.cars-home.pkl'
86 | else:
87 | return '../data/char_item_to_id.pkl'
88 |
89 | @property
90 | def word_stoi_file(self):
91 | if self.car:
92 | return '../data/word_item_to_id.cars-home.pkl'
93 | else:
94 | return '../data/word_item_to_id.pkl'
95 |
96 | @property
97 | def char_w2v_file(self):
98 | if self.outer_embed:
99 | return '../data/word2vec_models/sgns.baidubaike.bigram-char'
100 | else:
101 | if not self.car:
102 | return '../data/word2vec_models/word2vec.char.{}d.model.txt'.format(self.EMBED_SIZE)
103 | else:
104 | return '../data/word2vec_models/word2vec.char.{}d.model.cars-home.txt'.format(self.EMBED_SIZE)
105 |
106 |
107 | @property
108 | def word_w2v_file(self):
109 |
110 | if self.outer_embed:
111 | return '../data/word2vec_models/sgns.baidubaike.bigram-char'
112 | else:
113 | if not self.car:
114 | return '../data/word2vec_models/word2vec.word.{}d.model.txt'.format(self.EMBED_SIZE)
115 | else:
116 | return '../data/word2vec_models/word2vec.word.{}d.model.cars-home.txt'.format(self.EMBED_SIZE)
117 |
118 | @property
119 | def TRAIN_X(self):
120 | if self.data_type == 0:
121 | return '../data/csvs/train_single_label.csv'
122 | elif self.data_type == 1:
123 | return '../data/csvs/train_single_label.csv'
124 | elif self.data_type == 2:
125 | return '../data/csvs/train_multi.csv'
126 | elif self.data_type == 3:
127 | return '../data/csvs/train_multi.csv'
128 | elif self.data_type == 4:
129 | return '../data/csvs/train.csv'
130 | elif self.data_type == 5:
131 | return '../data/csvs/multi_train.csv'
132 |
133 | @property
134 | def n_classes(self):
135 | if self.data_type == 0:
136 | return 10
137 | elif self.data_type == 1:
138 | return 3
139 | elif self.data_type == 2:
140 | return 4
141 | elif self.data_type == 3:
142 | return 4
143 | elif self.data_type == 4:
144 | return 3
145 | elif self.data_type == 5:
146 | return 30
147 |
148 |
149 |
150 |
--------------------------------------------------------------------------------
/src/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/src/model/__init__.py
--------------------------------------------------------------------------------
/src/model/attention.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | import tensorflow as tf
3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
4 | from bilm.elmo import weight_layers
5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
6 |
7 | n_sub = 10
8 |
9 |
10 | class AttentionModel(BasicDeepModel):
11 | def __init__(self, name='basicModel', n_folds=5, config=None):
12 | name = 'attention' + config.main_feature
13 | self.hidden_dim = 150
14 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
15 |
16 | def create_model(self, share_dense=True, concat_sub=True):
17 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y')
18 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
19 |
20 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
21 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
22 |
23 | if self.main_feature.lower() in ['word', 'char']:
24 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
25 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
26 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
27 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
28 |
29 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
30 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
31 | if self.main_feature == 'elmo_word':
32 | options_file = self.config.elmo_word_options_file
33 | weight_file = self.config.elmo_word_weight_file
34 | embed_file = self.config.elmo_word_embed_file
35 | elif self.main_feature == 'elmo_char':
36 | options_file = self.config.elmo_char_options_file
37 | weight_file = self.config.elmo_char_weight_file
38 | embed_file = self.config.elmo_char_embed_file
39 | elif self.main_feature == 'elmo_qiuqiu':
40 | options_file = self.config.elmo_qiuqiu_options_file
41 | weight_file = self.config.elmo_qiuqiu_weight_file
42 | embed_file = self.config.elmo_qiuqiu_embed_file
43 | self.bilm = BidirectionalLanguageModel(options_file,
44 | weight_file,
45 | use_character_inputs=False,
46 | embedding_weight_file=embed_file,
47 | max_batch_size=self.batch_size)
48 | bilm_embedding_op = self.bilm(self.input_x)
49 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
50 | self.word_encoding = bilm_embedding['weighted_op']
51 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
52 |
53 | else:
54 | exit('wrong feature')
55 |
56 | c_outputs = []
57 | for c in range(n_sub):
58 | with tf.variable_scope('lstm-{}'.format(c)):
59 | # self.forward = self.LSTM()
60 | # self.backward = self.LSTM()
61 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32)
62 | # x = tf.concat(x, -1)
63 | #### cudnn lstm ####
64 | self.forward_lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32)
65 | self.forward_gru = cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32)
66 | x, _ = self.forward_lstm(tf.transpose(self.word_encoding, [1, 0, 2]))
67 | x, _ = self.forward_gru(x)
68 | x = tf.transpose(x, [1, 0, 2])
69 |
70 | with tf.variable_scope('pooling-{}'.format(c)):
71 | max_pooled = tf.reshape(tf.reduce_max(x, 1), [-1, 2*self.hidden_dim])
72 | avg_pooled = tf.reshape(tf.reduce_mean(x, 1), [-1, 2*self.hidden_dim])
73 |
74 | att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim], name='att_w')
75 | att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b')
76 | att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v')
77 |
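# Per-subject additive attention: score each time step of x with a small
# projection (att_w, att_b, att_v), softmax over positions, then take the
# attention-weighted sum of x alongside the max and average pooling above.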
78 | x_reshape = tf.reshape(x, [-1, 2*self.hidden_dim])
79 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(x_reshape, att_w)) + att_b, att_v), [-1, 1, self.max_len])
80 | alpha = tf.nn.softmax(score, axis=-1)
81 | att_pooled = tf.reshape(tf.matmul(alpha, x), [-1, 2*self.hidden_dim])
82 |
83 | concat_pooled = tf.concat((max_pooled, att_pooled, avg_pooled), -1)
84 |
85 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob)
86 | dense = tf.layers.dense(concat_pooled, 4, activation=None)
87 | c_outputs.append(dense)
88 |
89 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4])
90 | y_ = tf.nn.softmax(self.logits)
91 | self.prob = tf.reshape(y_, [-1, n_sub, 4])
92 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
93 |
94 | if not self.config.balance:
95 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
96 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
97 | else:
98 | # class0_weight = 0.882 * self.n_classes # weight coefficient for class 0
99 | # class1_weight = 0.019 * self.n_classes # weight coefficient for class 1
100 | # class2_weight = 0.080 * self.n_classes # weight coefficient for class 2
101 | # class3_weight = 0.019 * self.n_classes # weight coefficient for class 3
102 | class0_weight = 1 # weight coefficient for class 0
103 | class1_weight = 3 # weight coefficient for class 1
104 | class2_weight = 3 # weight coefficient for class 2
105 | class3_weight = 3 # weight coefficient for class 3
106 | # coe = tf.constant([1., 1., 1., 1.])
107 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
108 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
109 |
110 | y = tf.reshape(self.input_y, [-1, 4])
111 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
112 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
113 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
114 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
115 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
116 |
117 | return self
118 |
119 |
120 |
--------------------------------------------------------------------------------
/src/model/bilstm_model.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | from model import modeling
3 | import tensorflow as tf
4 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
5 | from bilm.elmo import weight_layers
6 |
7 | n_sub = 10
8 |
9 | class BilstmV0(BasicDeepModel):
10 | def __init__(self, name='basicModel', n_folds=5, config=None):
11 | name = 'qiuqiuv0' + config.main_feature
12 | self.hidden_dim = 300
13 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
14 |
15 | def create_model(self):
16 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y')
17 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
18 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
19 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
20 |
21 | if self.main_feature.lower() in ['word', 'char']:
22 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
23 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
24 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
25 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
26 |
27 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
28 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
29 | if self.main_feature == 'elmo_word':
30 | options_file = self.config.elmo_word_options_file
31 | weight_file = self.config.elmo_word_weight_file
32 | embed_file = self.config.elmo_word_embed_file
33 | elif self.main_feature == 'elmo_char':
34 | options_file = self.config.elmo_char_options_file
35 | weight_file = self.config.elmo_char_weight_file
36 | embed_file = self.config.elmo_char_embed_file
37 | elif self.main_feature == 'elmo_qiuqiu':
38 | options_file = self.config.elmo_qiuqiu_options_file
39 | weight_file = self.config.elmo_qiuqiu_weight_file
40 | embed_file = self.config.elmo_qiuqiu_embed_file
41 |
42 | self.bilm = BidirectionalLanguageModel(options_file,
43 | weight_file,
44 | use_character_inputs=False,
45 | embedding_weight_file=embed_file,
46 | max_batch_size=self.batch_size)
47 | bilm_embedding_op = self.bilm(self.input_x)
48 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
49 | self.word_encoding = bilm_embedding['weighted_op']
50 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
51 |
52 | else:
53 | exit('wrong feature')
54 |
55 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding')
56 | # self.layer_embedding = tf.get_variable(initializer=self.sentiment_embed, name='layer_embedding')
57 |
58 | self.forward = self.LSTM()
59 | self.backwad = self.LSTM()
60 | # self.forward2 = self.LSTM()
61 | # self.backwad2 = self.LSTM()
62 |
63 | # add point
64 | self.forward2 = self.GRU()
65 | self.backwad2 = self.GRU()
66 |
67 | with tf.variable_scope('sentence_encode'):
68 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32)
69 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1])
70 | output_sentence = tf.concat(axis=2, values=all_output_words)
71 |
72 | with tf.variable_scope('sentence_encode2'):
73 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,output_sentence,dtype=tf.float32)
74 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1])
75 | output_sentence = tf.concat(axis=2, values=all_output_words)
76 | output_sentence = tf.layers.dense(output_sentence, self.hidden_dim, activation=tf.nn.tanh)
77 | sentence_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len, self.hidden_dim])
78 | sentence_reshape_tile = tf.tile(sentence_reshape, [1, 10, 1, 1]) # replicate the sentence for each of the 10 aspects
79 |
80 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim])
81 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1])
82 |
83 | embed_concat = tf.reshape(tf.concat(axis=3,values=[sentence_reshape_tile,layer_reshape_tile]),[-1,2*self.hidden_dim])
84 |
85 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w')
86 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b')
87 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v')
88 |
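# Additive attention per aspect: score_{c,t} = att_v^T tanh(att_w [h_t ; a_c] + att_b),
# softmax over the time axis, then an attention-weighted sum of the token
# states for each of the 10 aspect (layer) embeddings a_c.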
89 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(embed_concat,self.att_w) + self.att_b),self.att_v),[-1,10,self.max_len])
90 | alpah = tf.nn.softmax(score,axis=2)
91 | layer_sentence = tf.matmul(alpah,output_sentence)
92 |
93 | layer_reshape2 = tf.reshape(self.layer_embedding,[1,10,self.hidden_dim])
94 | layer_reshape2_tile = tf.tile(layer_reshape2,[self.batch_size,1,1])
95 | layer_sentence = tf.concat(axis=2,values=[layer_sentence,layer_reshape2_tile])
96 | layer_sentence = tf.reshape(layer_sentence,[-1,2*self.hidden_dim])
97 |
98 | layer_sentence = tf.layers.dense(layer_sentence,self.hidden_dim,activation=tf.nn.relu)
99 |
100 | # add point
101 | layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob)
102 |
103 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None)
104 | y_ = tf.nn.softmax(self.logits, axis=1)
105 | self.prob = tf.reshape(y_, [-1, 10, 4])
106 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
107 |
108 | if not self.config.balance:
109 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
110 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
111 | else:
112 | # class0_weight = 0.882 * self.n_classes # weight coefficient for class 0
113 | # class1_weight = 0.019 * self.n_classes # weight coefficient for class 1
114 | # class2_weight = 0.080 * self.n_classes # weight coefficient for class 2
115 | # class3_weight = 0.019 * self.n_classes # weight coefficient for class 3
116 | class0_weight = 1 # weight coefficient for class 0
117 | class1_weight = 3 # weight coefficient for class 1
118 | class2_weight = 3 # weight coefficient for class 2
119 | class3_weight = 3 # weight coefficient for class 3
120 | # coe = tf.constant([1., 1., 1., 1.])
121 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
122 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
123 |
124 | y = tf.reshape(self.input_y, [-1, 4])
125 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
126 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
127 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
128 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
129 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
130 |
131 | return self
132 |
133 | def LSTM(self, layers=1):
134 | lstms = []
135 | for num in range(layers):
136 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
137 | print(lstm.name)
138 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
139 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
140 | lstms.append(lstm)
141 |
142 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
143 | return lstms
144 |
145 | def GRU(self, layers=1):
146 | lstms = []
147 | for num in range(layers):
148 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
149 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
150 | print(lstm.name)
151 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
152 | lstms.append(lstm)
153 |
154 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
155 | return lstms
156 |
157 |
158 | class BilstmV1(BasicDeepModel):
159 | def __init__(self, name='basicModel', n_folds=5, config=None):
160 | name = 'qiuqiuv1' + config.main_feature
161 | self.hidden_dim = 300
162 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
163 |
164 | def create_model(self, concat_sub=True):
165 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y')
166 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
167 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
168 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
169 |
170 | if self.main_feature.lower() in ['word', 'char']:
171 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
172 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
173 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
174 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
175 |
176 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
177 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
178 | if self.main_feature == 'elmo_word':
179 | options_file = self.config.elmo_word_options_file
180 | weight_file = self.config.elmo_word_weight_file
181 | embed_file = self.config.elmo_word_embed_file
182 | elif self.main_feature == 'elmo_char':
183 | options_file = self.config.elmo_char_options_file
184 | weight_file = self.config.elmo_char_weight_file
185 | embed_file = self.config.elmo_char_embed_file
186 | elif self.main_feature == 'elmo_qiuqiu':
187 | options_file = self.config.elmo_qiuqiu_options_file
188 | weight_file = self.config.elmo_qiuqiu_weight_file
189 | embed_file = self.config.elmo_qiuqiu_embed_file
190 |
191 | self.bilm = BidirectionalLanguageModel(options_file,
192 | weight_file,
193 | use_character_inputs=False,
194 | embedding_weight_file=embed_file,
195 | max_batch_size=self.batch_size)
196 | bilm_embedding_op = self.bilm(self.input_x)
197 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
198 | self.word_encoding = bilm_embedding['weighted_op']
199 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
200 |
201 | else:
202 | exit('wrong feature')
203 |
204 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding')
205 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim])
206 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1])
207 |
208 | self.forward = self.LSTM()
209 | self.backwad = self.LSTM()
210 | self.forward2 = self.LSTM()
211 | self.backwad2 = self.LSTM()
212 |
213 | with tf.variable_scope('sentence_encode'):
214 | s1_out, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32)
215 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1])
216 | s1_out = tf.concat(axis=2, values=s1_out)
217 | s1_reshape = tf.reshape(s1_out, [-1, 1, self.max_len, 2*self.hidden_dim])
218 | s1_tile = tf.tile(s1_reshape, [1, 10, 1, 1]) # replicate the first-layer LSTM output for each of the 10 aspects
219 |
220 | s2_input = tf.reshape(tf.concat((s1_tile, layer_reshape_tile), -1), [-1, self.max_len, 3*self.hidden_dim])
221 |
222 | with tf.variable_scope('sentence_encode2'):
223 | s2_out, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,s2_input,dtype=tf.float32)
224 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1])
225 | s2_out = tf.reshape(tf.concat(axis=-1, values=s2_out), [-1, 10, self.max_len, 2*self.hidden_dim])
226 | res_out = s2_out + s1_tile
227 | res_dense = tf.layers.dense(res_out, self.hidden_dim, activation=tf.nn.relu)
228 |
229 | res_layer_concat = tf.reshape(tf.concat((res_dense, layer_reshape_tile), -1), [-1, 2*self.hidden_dim])
230 |
231 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w')
232 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b')
233 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v')
234 |
235 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(res_layer_concat, self.att_w) + self.att_b),self.att_v),[-1,1,self.max_len])
236 | alpha = tf.nn.softmax(score)
237 | layer_sentence = tf.reshape(tf.matmul(alpha, tf.reshape(res_out, [-1, self.max_len, 2*self.hidden_dim])), [-1, n_sub, 2*self.hidden_dim])
238 |
239 | if concat_sub:
240 | # whether to concatenate the aspect (layer_sub) embedding
241 | layer_sub = tf.reshape(self.layer_embedding, [1, n_sub, self.hidden_dim])
242 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1])
243 |
244 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1)
245 | outputs = tf.reshape(layer_total, [-1, 3*self.hidden_dim])
246 | else:
247 | outputs = tf.reshape(layer_sentence, [-1, 2*self.hidden_dim])
248 |
249 | self.logits = tf.layers.dense(outputs, 4, activation=None)
250 | y_ = tf.nn.softmax(self.logits)
251 | self.prob = tf.reshape(y_, [-1, 10, 4])
252 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
253 |
254 | if not self.config.balance:
255 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
256 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
257 | else:
258 | # class0_weight = 0.882 * self.n_classes # weight coefficient for class 0
259 | # class1_weight = 0.019 * self.n_classes # weight coefficient for class 1
260 | # class2_weight = 0.080 * self.n_classes # weight coefficient for class 2
261 | # class3_weight = 0.019 * self.n_classes # weight coefficient for class 3
262 | class0_weight = 1 # weight coefficient for class 0
263 | class1_weight = 3 # weight coefficient for class 1
264 | class2_weight = 3 # weight coefficient for class 2
265 | class3_weight = 3 # weight coefficient for class 3
266 | # coe = tf.constant([1., 1., 1., 1.])
267 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
268 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
269 |
270 | y = tf.reshape(self.input_y, [-1, 4])
271 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
272 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
273 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
274 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
275 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
276 |
277 | return self
278 |
279 | def LSTM(self, layers=1):
280 | lstms = []
281 | for num in range(layers):
282 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
283 | print(lstm.name)
284 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
285 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
286 | lstms.append(lstm)
287 |
288 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
289 | return lstms
290 |
291 | def GRU(self, layers=1):
292 | lstms = []
293 | for num in range(layers):
294 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
295 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
296 | print(lstm.name)
297 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
298 | lstms.append(lstm)
299 |
300 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
301 | return lstms
302 |
303 |
304 | class BilstmV2(BasicDeepModel):
305 | def __init__(self, name='basicModel', n_folds=5, config=None):
306 | name = 'qiuqiuv2' + config.main_feature
307 | self.hidden_dim = 300
308 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
309 |
310 | def create_model(self):
311 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y')
312 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
313 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
314 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
315 |
316 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='input_ids')
317 | self.mask_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='mask_ids')
318 | self.type_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='type_ids')
319 | self.is_training = tf.placeholder(dtype=tf.bool, name='is_training')
320 |
321 | # bert_hidden_size = bert_output_layer.shape[-1].value
322 | # hidden_size = output_layer.shape[-1].value
323 |
324 | if self.main_feature.lower() in ['word', 'char']:
325 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
326 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
327 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
328 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
329 |
330 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
331 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
332 | if self.main_feature == 'elmo_word':
333 | options_file = self.config.elmo_word_options_file
334 | weight_file = self.config.elmo_word_weight_file
335 | embed_file = self.config.elmo_word_embed_file
336 | elif self.main_feature == 'elmo_char':
337 | options_file = self.config.elmo_char_options_file
338 | weight_file = self.config.elmo_char_weight_file
339 | embed_file = self.config.elmo_char_embed_file
340 | elif self.main_feature == 'elmo_qiuqiu':
341 | options_file = self.config.elmo_qiuqiu_options_file
342 | weight_file = self.config.elmo_qiuqiu_weight_file
343 | embed_file = self.config.elmo_qiuqiu_embed_file
344 |
345 | self.bilm = BidirectionalLanguageModel(options_file,
346 | weight_file,
347 | use_character_inputs=False,
348 | embedding_weight_file=embed_file,
349 | max_batch_size=self.batch_size)
350 | bilm_embedding_op = self.bilm(self.input_x)
351 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
352 | self.word_encoding = bilm_embedding['weighted_op']
353 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
354 |
355 | else:
356 | exit('wrong feature')
357 |
358 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding')
359 |
360 | self.forward = self.LSTM()
361 | self.backwad = self.LSTM()
362 | # self.forward2 = self.LSTM()
363 | # self.backwad2 = self.LSTM()
364 |
365 | # add point
366 | self.forward2 = self.GRU()
367 | self.backwad2 = self.GRU()
368 |
369 | # BERT setup
370 | bert_config = modeling.BertConfig.from_json_file(self.config.BERT_CONFIG_FILES)
371 |
372 | bert_model = modeling.BertModel(
373 | config=bert_config,
374 | is_training=self.is_training,
375 | input_ids=self.input_ids,
376 | input_mask=self.mask_ids,
377 | token_type_ids=self.type_ids
378 | )
379 | if self.is_training is not None:
380 | print('bert config hidden dropout -- ---', bert_config.hidden_dropout_prob)
381 | print('bert config hidden dropout -- ---', bert_config.attention_probs_dropout_prob)
382 | self.word_encoding = bert_model.get_sequence_output()
383 | all_layer_output = bert_model.get_all_encoder_layers()
384 | self.word_encoding = (all_layer_output[0] + all_layer_output[1] + all_layer_output[2] + all_layer_output[3]) / 4
385 | with tf.variable_scope('sentence_encode'):
386 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32)
387 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1])
388 | output_sentence = tf.concat(axis=2, values=all_output_words)
389 |
390 | with tf.variable_scope('sentence_encode2'):
391 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,output_sentence,dtype=tf.float32)
392 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1])
393 | output_sentence = tf.concat(axis=2, values=all_output_words)
394 | output_sentence = tf.layers.dense(output_sentence, self.hidden_dim, activation=tf.nn.tanh)
395 | sentence_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len, self.hidden_dim])
396 | sentence_reshape_tile = tf.tile(sentence_reshape, [1, 10, 1, 1]) # replicate the sentence for each of the 10 aspects
397 |
398 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim])
399 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1])
400 |
401 | embed_concat = tf.reshape(tf.concat(axis=3,values=[sentence_reshape_tile,layer_reshape_tile]),[-1,2*self.hidden_dim])
402 |
403 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w')
404 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b')
405 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v')
406 |
407 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(embed_concat,self.att_w) + self.att_b),self.att_v),[-1,10,self.max_len])
408 | alpah = tf.nn.softmax(score,axis=2)
409 | layer_sentence = tf.matmul(alpah,output_sentence)
410 |
411 | layer_reshape2 = tf.reshape(self.layer_embedding,[1,10,self.hidden_dim])
412 | layer_reshape2_tile = tf.tile(layer_reshape2,[self.batch_size,1,1])
413 | layer_sentence = tf.concat(axis=2,values=[layer_sentence,layer_reshape2_tile])
414 | layer_sentence = tf.reshape(layer_sentence,[-1,2*self.hidden_dim])
415 |
416 | layer_sentence = tf.layers.dense(layer_sentence,self.hidden_dim,activation=tf.nn.relu)
417 |
418 | # add point
419 | layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob)
420 |
421 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None)
422 | y_ = tf.nn.softmax(self.logits, axis=1)
423 | self.prob = tf.reshape(y_, [-1, 10, 4])
424 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
425 |
426 | if not self.config.balance:
427 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
428 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
429 | else:
430 |             # class0_weight = 0.882 * self.n_classes  # weight coefficient for class 0
431 |             # class1_weight = 0.019 * self.n_classes  # weight coefficient for class 1
432 |             # class2_weight = 0.080 * self.n_classes  # weight coefficient for class 2
433 |             # class3_weight = 0.019 * self.n_classes  # weight coefficient for class 3
434 |             class0_weight = 1  # weight coefficient for class 0
435 |             class1_weight = 3  # weight coefficient for class 1
436 |             class2_weight = 3  # weight coefficient for class 2
437 |             class3_weight = 3  # weight coefficient for class 3
438 | # coe = tf.constant([1., 1., 1., 1.])
439 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
440 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
441 |
442 | y = tf.reshape(self.input_y, [-1, 4])
443 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
444 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
445 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
446 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
447 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
448 |
449 | return self
450 |
451 | def LSTM(self, layers=1):
452 | lstms = []
453 | for num in range(layers):
454 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
455 | print(lstm.name)
456 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
457 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
458 | lstms.append(lstm)
459 |
460 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
461 | return lstms
462 |
463 | def GRU(self, layers=1):
464 | lstms = []
465 | for num in range(layers):
466 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
467 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
468 | print(lstm.name)
469 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
470 | lstms.append(lstm)
471 |
472 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
473 | return lstms
474 |
475 |
--------------------------------------------------------------------------------
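
The attention block above is the heart of this model: the BiLSTM/BiGRU-encoded sentence is tiled once per aspect, each position is scored against a learned aspect embedding through an additive attention (`att_w`, `att_b`, `att_v`), and the softmax-weighted sum gives one sentence summary per aspect. Below is a minimal NumPy sketch of that computation; all sizes and array names are made up for illustration, this is not repo code.

```
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, n_aspect, max_len, hidden = 2, 10, 5, 8          # made-up sizes
sent = np.random.rand(batch, max_len, hidden)           # encoded sentence (output_sentence)
aspects = np.random.rand(n_aspect, hidden)              # layer_embedding: one vector per aspect
att_w = np.random.rand(2 * hidden, hidden)
att_b = np.random.rand(hidden)
att_v = np.random.rand(hidden, 1)

# every (aspect, position) pair sees [sentence_state ; aspect_vector]
sent_tile = np.broadcast_to(sent[:, None], (batch, n_aspect, max_len, hidden))
asp_tile = np.broadcast_to(aspects[None, :, None], (batch, n_aspect, max_len, hidden))
concat = np.concatenate([sent_tile, asp_tile], axis=-1).reshape(-1, 2 * hidden)

score = (np.tanh(concat @ att_w + att_b) @ att_v).reshape(batch, n_aspect, max_len)
alpha = softmax(score, axis=2)        # attention over positions, separately for each aspect
aspect_repr = alpha @ sent            # (batch, n_aspect, hidden): one summary vector per aspect
print(aspect_repr.shape)              # (2, 10, 8); the model then concats the aspect vector and applies dense(4)
```
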
/src/model/capsule_model.py:
--------------------------------------------------------------------------------
1 | from keras.layers import *
2 | from keras.models import *
3 | from model.model_basic import BasicDeepModel
4 | from model.model_component import Capsule
5 | from keras import regularizers
6 |
7 | class CapsuleModel(BasicDeepModel):
8 | def __init__(self, name='basicModel', num_flods=5, config=None):
9 | name = 'capsule' + config.main_feature
10 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config)
11 |
12 | def create_model(self):
13 | Routings = 5
14 | Num_capsule = 10
15 | Dim_capsule = 16
16 | dropout_p = 0.25
17 | rate_drop_dense = 0.28
18 | gru_len = 128
19 | if self.main_feature == 'char':
20 | input = Input(shape=(self.max_len,), name='char')
21 | else:
22 | input = Input(shape=(self.max_len,), name='word')
23 |
24 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding')
25 | x = Masking(mask_value=self.mask_value)(input)
26 | x = embedding(x)
27 |
28 | x = SpatialDropout1D(rate_drop_dense)(x)
29 |
30 | x = Bidirectional(GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))(x)
31 | # x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
32 |
33 | capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings,
34 | share_weights=True)(x)
35 |
36 | capsule = Flatten()(capsule)
37 | capsule = Dropout(dropout_p)(capsule)
38 | dense = Dense(self.n_class, activation="softmax")(capsule)
39 | res_model = Model(inputs=[input], outputs=dense)
40 |
41 | return res_model
42 |
--------------------------------------------------------------------------------
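
For orientation, a shape walk-through of `CapsuleModel.create_model`; the `max_len`, `embed_size` and `n_class` values below are hypothetical stand-ins for what `config` provides, the rest are the constants defined in the method above.

```
# Illustrative shape walk-through of CapsuleModel.create_model (not repo code).
max_len, embed_size, n_class = 400, 300, 4       # hypothetical; the real values come from config
gru_len, Num_capsule, Dim_capsule = 128, 10, 16  # constants defined in create_model above

for name, shape in [
        ('Input / Masking', (max_len,)),
        ('Embedding', (max_len, embed_size)),
        ('Bidirectional GRU', (max_len, 2 * gru_len)),
        ('Capsule(10, 16, routings=5)', (Num_capsule, Dim_capsule)),
        ('Flatten + Dropout', (Num_capsule * Dim_capsule,)),   # a 160-dim vector
        ('Dense softmax', (n_class,))]:
    print(name.ljust(30), shape)
```
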
/src/model/convlstm_model.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | import tensorflow as tf
3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
4 | from bilm.elmo import weight_layers
5 |
6 | n_sub = 10
7 |
8 | class ConvlstmModel(BasicDeepModel):
9 | def __init__(self, name='basicModel', n_folds=5, config=None):
10 | name = 'convlstm' + config.main_feature
11 | self.hidden_dim = 300
12 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
13 |
14 | def LSTM(self, layers=1):
15 | lstms = []
16 | for num in range(layers):
17 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
18 | print(lstm.name)
19 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
20 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
21 | lstms.append(lstm)
22 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
23 | return lstms
24 |
25 | def GRU(self, layers=1):
26 | lstms = []
27 | for num in range(layers):
28 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
29 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
30 | print(lstm.name)
31 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
32 | lstms.append(lstm)
33 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
34 | return lstms
35 |
36 | def create_model(self, share_dense=True, concat_sub=True):
37 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y')
38 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
39 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
40 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
41 |
42 | if self.main_feature.lower() in ['word', 'char']:
43 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
44 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
45 |             self.word_encoding = tf.nn.embedding_lookup(self.word_embedding, self.input_x)  # look up the trainable variable, not the raw numpy matrix
46 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
47 |
48 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
49 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
50 | if self.main_feature == 'elmo_word':
51 | options_file = self.config.elmo_word_options_file
52 | weight_file = self.config.elmo_word_weight_file
53 | embed_file = self.config.elmo_word_embed_file
54 | elif self.main_feature == 'elmo_char':
55 | options_file = self.config.elmo_char_options_file
56 | weight_file = self.config.elmo_char_weight_file
57 | embed_file = self.config.elmo_char_embed_file
58 | elif self.main_feature == 'elmo_qiuqiu':
59 | options_file = self.config.elmo_qiuqiu_options_file
60 | weight_file = self.config.elmo_qiuqiu_weight_file
61 | embed_file = self.config.elmo_qiuqiu_embed_file
62 |
63 | self.bilm = BidirectionalLanguageModel(options_file,
64 | weight_file,
65 | use_character_inputs=False,
66 | embedding_weight_file=embed_file,
67 | max_batch_size=self.batch_size)
68 | bilm_embedding_op = self.bilm(self.input_x)
69 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
70 | self.word_encoding = bilm_embedding['weighted_op']
71 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
72 |
73 | else:
74 | exit('wrong feature')
75 |
76 | inputs_expanded = tf.expand_dims(self.word_encoding, -1)
77 | n_filters = 128
78 | filter_shape = [3, self.embed_size, 1, n_filters]
79 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W')
80 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters]))
81 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d')
82 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
83 | h = tf.reshape(h, [-1, self.max_len-3+1, n_filters])
84 |
85 | self.forward = self.LSTM()
86 | self.backward = self.LSTM()
87 | x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, h, dtype=tf.float32)
88 | x = tf.concat(x, -1)
89 | output_sentence = tf.layers.dense(x, self.hidden_dim, activation=tf.nn.relu)
90 |
91 | x_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len-3+1, self.hidden_dim])
92 |         x_tile = tf.tile(x_reshape, [1, n_sub, 1, 1])  # replicate the sentence n_sub times, once per aspect
93 |
94 | sub_embedding = tf.get_variable(shape=[n_sub, self.hidden_dim], name='sub_embedding')
95 | sub_reshape = tf.reshape(sub_embedding, [1, n_sub, 1, self.hidden_dim])
96 | sub_tile = tf.tile(sub_reshape, [self.batch_size, 1, self.max_len-3+1, 1])
97 |
98 | embed_concat = tf.reshape(tf.concat((x_tile, sub_tile), -1), [-1, 2*self.hidden_dim])
99 |
100 | att_w = tf.get_variable(shape=[2*self.hidden_dim, self.hidden_dim], name='att_w')
101 | att_b = tf.get_variable(shape=[self.hidden_dim], name='att_b')
102 | att_v = tf.get_variable(shape=[self.hidden_dim, 1], name='att_v')
103 |
104 | score = tf.matmul(tf.nn.tanh(tf.matmul(embed_concat, att_w) + att_b), att_v)
105 | score_fit = tf.reshape(score, [-1, n_sub, self.max_len-3+1])
106 | alpha = tf.nn.softmax(score_fit)
107 |
108 | layer_sentence = tf.matmul(alpha, output_sentence)
109 |
110 | if concat_sub:
111 |             # optionally concatenate the aspect (sub) embedding to the attended sentence vector
112 | layer_sub = tf.reshape(sub_embedding, [1, n_sub, self.hidden_dim])
113 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1])
114 |
115 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1)
116 | outputs = tf.reshape(layer_total, [-1, 2*self.hidden_dim])
117 | else:
118 | outputs = tf.reshape(layer_sentence, [-1, self.hidden_dim])
119 |
120 |         self.logits = tf.layers.dense(outputs, 4, activation=None)  # use the flattened per-aspect features built above
121 | y_ = tf.nn.softmax(self.logits)
122 | self.prob = tf.reshape(y_, [-1, 10, 4])
123 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
124 |
125 | if not self.config.balance:
126 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
127 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
128 | else:
129 |             # class0_weight = 0.882 * self.n_classes  # weight coefficient for class 0
130 |             # class1_weight = 0.019 * self.n_classes  # weight coefficient for class 1
131 |             # class2_weight = 0.080 * self.n_classes  # weight coefficient for class 2
132 |             # class3_weight = 0.019 * self.n_classes  # weight coefficient for class 3
133 |             class0_weight = 1  # weight coefficient for class 0
134 |             class1_weight = 3  # weight coefficient for class 1
135 |             class2_weight = 3  # weight coefficient for class 2
136 |             class3_weight = 3  # weight coefficient for class 3
137 | # coe = tf.constant([1., 1., 1., 1.])
138 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
139 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
140 |
141 | y = tf.reshape(self.input_y, [-1, 4])
142 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
143 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
144 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
145 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
146 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
147 |
148 | return self
149 |
150 |
--------------------------------------------------------------------------------
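
The `balance` branch above (repeated almost verbatim in the other TensorFlow models in this repo) swaps the plain softmax cross-entropy for a per-class weighted negative log-likelihood, so the three minority sentiment classes are penalised three times harder than class 0. A small NumPy check of that formula with hypothetical probabilities, not repo code:

```
# NumPy check (not repo code) of the weighted loss: mean over rows of  -sum_c w_c * y_c * log(p_c)
import numpy as np

weights = np.array([1.0, 3.0, 3.0, 3.0])            # class0..class3 weights from the code above
y = np.array([[1, 0, 0, 0],                         # one-hot labels for two aspect slots
              [0, 1, 0, 0]], dtype=float)
p = np.array([[0.70, 0.10, 0.10, 0.10],             # hypothetical softmax outputs y_
              [0.25, 0.40, 0.20, 0.15]])

per_row = -(weights * y * np.log(p)).sum(axis=1)    # only the true class contributes
print(per_row)                                      # ~[0.357, 2.749]: minority classes cost 3x more
print(per_row.mean())                               # the value tf.reduce_mean computes
```
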
/src/model/dpcnn_model.py:
--------------------------------------------------------------------------------
1 | from keras.models import *
2 | from keras.layers import *
3 | from model.model_basic import BasicDeepModel
4 | from keras import regularizers
5 |
6 |
7 | dp = 4
8 | filter_nr = 64
9 | filter_size = 3
10 | max_pool_size = 3
11 | max_pool_strides = 2
12 | dense_nr = 128
13 | spatial_dropout = 0.5
14 | dense_dropout = 0.5
15 |
16 |
17 | class DpcnnModel(BasicDeepModel):
18 | def __init__(self, name='basicModel', num_flods=5, config=None):
19 | name = 'dpcnn' + config.main_feature
20 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config)
21 |
22 | def create_model(self):
23 | if self.main_feature == 'char':
24 | input = Input(shape=(self.max_len,), name='char')
25 | else:
26 | input = Input(shape=(self.max_len,), name='word')
27 |
28 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding')
29 | x = Masking(mask_value=self.mask_value)(input)
30 | x = embedding(x)
31 | x = SpatialDropout1D(0.5)(x)
32 |
33 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x)
34 | block1 = BatchNormalization()(block1)
35 | block1 = PReLU()(block1)
36 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1)
37 | block1 = BatchNormalization()(block1)
38 | block1 = PReLU()(block1)
39 |
40 | # we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output
41 | # if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output
42 | resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(x)
43 | resize_emb = PReLU()(resize_emb)
44 |
45 | block1_output = add([block1, resize_emb])
46 | x = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output)
47 |
48 | for i in range(dp):
49 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x)
50 | block1 = BatchNormalization()(block1)
51 | block1 = PReLU()(block1)
52 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1)
53 | block1 = BatchNormalization()(block1)
54 | block1 = PReLU()(block1)
55 |
56 | block_output = add([block1, x])
57 | if i + 1 != dp:
58 | x = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block_output)
59 |
60 | x = GlobalMaxPooling1D()(block_output)
61 | output = Dense(dense_nr, activation='linear')(x)
62 | output = BatchNormalization()(output)
63 | x = PReLU()(output)
64 |
65 | # output = Dropout(dense_dropout)(output)
66 | if self.config.data_type == 3:
67 | dense = Dense(self.n_class, activation="sigmoid")(x)
68 | else:
69 | dense = Dense(self.n_class, activation="softmax")(x)
70 | res_model = Model(inputs=[input], outputs=dense)
71 |
72 | return res_model
73 |
--------------------------------------------------------------------------------
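
Each repeated block in `DpcnnModel` is followed by `MaxPooling1D(pool_size=3, strides=2)` except the last one, so the sequence length roughly halves at every stage of the pyramid before `GlobalMaxPooling1D`. A quick arithmetic sketch, using a hypothetical `max_len` of 400 (the real value comes from config):

```
# Illustrative only: how the sequence length shrinks through the DPCNN pyramid above.
# Keras MaxPooling1D with the default padding='valid': out = (length - pool_size) // strides + 1
def pooled_length(length, pool_size=3, strides=2):
    return (length - pool_size) // strides + 1

length = 400                        # hypothetical max_len
length = pooled_length(length)      # pooling right after block1
print('after block1 pooling ->', length)
for i in range(4):                  # dp = 4 residual blocks
    if i + 1 != 4:                  # no pooling after the last block, mirroring the loop above
        length = pooled_length(length)
    print('after residual block', i + 1, '->', length)
# 400 -> 199 -> 99 -> 49 -> 24 -> 24, then GlobalMaxPooling1D collapses the sequence to one vector
```
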
/src/model/han_model.py:
--------------------------------------------------------------------------------
1 | from keras.models import *
2 | from keras.layers import *
3 | from model.model_basic import BasicDeepModel
4 | from model.model_component import AttLayer
5 | from model.model_component import AttentionWithContext
6 |
7 |
8 | class HANModel(BasicDeepModel):
9 | def __init__(self, name='basicModel', num_flods=5, config=None):
10 | name = 'han' + config.main_feature
11 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config)
12 |
13 | def create_model(self):
14 |
15 | if self.config.main_feature == 'word':
16 | input = Input(shape=(self.config.HANN_WORD_LEN,), dtype='int32')
17 | else:
18 | input = Input(shape=(self.config.HANN_CHAR_LEN,), dtype='int32')
19 |
20 | mask = Masking(mask_value=self.mask_value)(input)
21 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding')
22 | x = embedding(mask)
23 | x = SpatialDropout1D(0.5)(x)
24 | x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
25 | l_att = AttLayer(100)(x)
26 | # l_att = AttentionWithContext()(x)
27 | sentEncoder = Model(input, l_att)
28 |
29 | if self.config.main_feature == 'word':
30 | word_input = Input(shape=(self.config.HANN_SENT, self.config.HANN_WORD_LEN), name='hann_word')
31 | word_encoder = TimeDistributed(sentEncoder)(word_input)
32 | word_sent_lstm = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_encoder)
33 | # x = AttLayer(100)(word_sent_lstm)
34 | x = AttentionWithContext()(word_sent_lstm)
35 | x = Dropout(0.2)(x)
36 | if self.config.data_type == 3:
37 | dense = Dense(self.n_class, activation="sigmoid")(x)
38 | else:
39 | dense = Dense(self.n_class, activation="softmax")(x)
40 | model = Model(word_input, dense)
41 | else:
42 | char_input = Input(shape=(self.config.HANN_SENT, self.config.HANN_CHAR_LEN), name='hann_char')
43 | char_encoder = TimeDistributed(sentEncoder)(char_input)
44 | char_sent_lstm = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(char_encoder)
45 | x = AttLayer(100)(char_sent_lstm)
46 | # x = AttentionWithContext()(char_sent_lstm)
47 | x = Dropout(0.2)(x)
48 | if self.config.data_type == 3:
49 | dense = Dense(self.n_class, activation="sigmoid")(x)
50 | else:
51 | dense = Dense(self.n_class, activation="softmax")(x)
52 | model = Model(char_input, dense)
53 | return model
54 |
55 |
56 |
--------------------------------------------------------------------------------
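
`HANModel` expects its input already split into sentences: the outer input has shape `(HANN_SENT, HANN_WORD_LEN)` and `TimeDistributed(sentEncoder)` applies the inner attention encoder to each sentence. A minimal NumPy sketch of building such a batch; the sizes and the toy document are hypothetical, the real values come from `config`:

```
# Illustrative only: padding a tokenised document into the (HANN_SENT, HANN_WORD_LEN) grid
# that the TimeDistributed sentence encoder expects. Sizes and ids are hypothetical.
import numpy as np

HANN_SENT, HANN_WORD_LEN = 10, 30                  # stand-ins for config.HANN_SENT / config.HANN_WORD_LEN
doc = [[12, 7, 95, 3], [41, 8], [5, 5, 5, 5, 5]]   # three sentences of word ids

grid = np.zeros((HANN_SENT, HANN_WORD_LEN), dtype='int32')
for i, sent in enumerate(doc[:HANN_SENT]):
    sent = sent[:HANN_WORD_LEN]
    grid[i, :len(sent)] = sent                     # pad / truncate each sentence

batch = grid[None, ...]                            # model input: (batch, HANN_SENT, HANN_WORD_LEN)
print(batch.shape)                                 # (1, 10, 30)
```
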
/src/model/hybrid_nn_1.py:
--------------------------------------------------------------------------------
1 | from keras.models import *
2 | from keras.layers import *
3 | from keras import backend as K
4 | from model.model_basic import BasicDeepModel
5 | from model.model_component import AttLayer
6 | from model.model_component import Capsule
7 |
8 |
9 | class HybridNN1Model(BasicDeepModel):
10 | def __init__(self, name='basicModel', num_flods=5, config=None):
11 | name = 'hybridnn1' + config.main_feature
12 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config)
13 |
14 | def create_model(self):
15 | if self.main_feature == 'char':
16 | input = Input(shape=(self.max_len,), name='char')
17 | else:
18 | input = Input(shape=(self.max_len,), name='word')
19 |
20 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding')
21 | x = Masking(mask_value=self.mask_value)(input)
22 | x = embedding(x)
23 |
24 | x = SpatialDropout1D(0.5)(x)
25 | x = GRU(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(x) # ??
26 | capsule1 = Capsule(19, 17, 5)(x)
27 | capsule1 = Flatten()(capsule1)
28 | capsule2 = Capsule(19, 16, 5)(x)
29 | capsule2 = Flatten()(capsule2)
30 | output = concatenate([capsule1, capsule2])
31 |
32 | output = Dense(256)(output)
33 | output = BatchNormalization()(output)
34 | output = Activation('relu')(output)
35 | output = Dropout(0.2)(output)
36 |
37 | output = Dense(256)(output)
38 | output = BatchNormalization()(output)
39 | output = Activation('relu')(output)
40 | x = Dropout(0.2)(output)
41 |
42 | if self.config.data_type == 3:
43 | dense = Dense(self.n_class, activation="sigmoid")(x)
44 | else:
45 | dense = Dense(self.n_class, activation="softmax")(x)
46 |         model = Model(inputs=[input], outputs=dense)
47 |
48 | return model
49 |
50 |
51 |
--------------------------------------------------------------------------------
/src/model/lightgbm_model.py:
--------------------------------------------------------------------------------
1 | import lightgbm as lgbm
2 | from model.model_basic import BasicStaticModel
3 |
4 | class LightGbmModel(BasicStaticModel):
5 | def __init__(self, num_folds=5, config=None):
6 | lgbm_params = {'objective': 'multiclass',
7 | 'bagging_seed': 10,
8 | 'boosting_type': 'gbdt',
9 | 'feature_fraction': 0.9,
10 | 'feature_fraction_seed': 10,
11 | 'lambda_l1': 0.5,
12 | 'lambda_l2': 0.5,
13 | 'learning_rate': 0.01,
14 | 'metric': 'multi_logloss',
15 | 'min_child_weight': 1,
16 | # 'min_split_gain': 0,
17 | 'device': 'gpu',
18 | 'gpu_platform_id': 0,
19 | 'gpu_device_id': config.gpu,
20 | 'min_sum_hessian_in_leaf': 0.1,
21 | 'num_leaves': 64,
22 | 'num_thread': -1,
23 | 'num_class': config.n_class,
24 | 'verbose': 1}
25 | self.config = config
26 | BasicStaticModel.__init__(self, lgbm_params, num_folds, 'lightGBM', n_class=config.n_class)
27 |
28 | def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):
29 |
30 | dtrain = lgbm.Dataset(kfold_X_train, label=y_train)
31 | dwatch = lgbm.Dataset(kfold_X_valid, label=y_test)
32 |
33 | best = lgbm.train(self.params, dtrain, num_boost_round=300, verbose_eval=10, valid_sets=dwatch,
34 | early_stopping_rounds=10)
35 |         # predict on the validation fold
36 |
37 | pred = best.predict(kfold_X_valid)
38 | results = best.predict(test)
39 |
40 | return pred, results, best
41 |
42 |
--------------------------------------------------------------------------------
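
A rough usage sketch of how the parameter dictionary above feeds `lgbm.train` inside `create_model`, run on random data and without the GPU-specific keys. It assumes the lightgbm 2.x-era API this repo targets (`verbose_eval` / `early_stopping_rounds` keyword arguments); newer lightgbm releases moved these into callbacks.

```
# Sketch only: mirrors LightGbmModel.create_model on random data with CPU-only params,
# assuming the lightgbm 2.x-era train() signature this repo was written against.
import numpy as np
import lightgbm as lgbm

rng = np.random.RandomState(0)
X_tr, y_tr = rng.rand(400, 32), rng.randint(0, 4, 400)
X_va, y_va = rng.rand(100, 32), rng.randint(0, 4, 100)

params = {'objective': 'multiclass', 'num_class': 4, 'metric': 'multi_logloss',
          'learning_rate': 0.01, 'num_leaves': 64, 'lambda_l1': 0.5, 'lambda_l2': 0.5,
          'feature_fraction': 0.9}

dtrain = lgbm.Dataset(X_tr, label=y_tr)
dwatch = lgbm.Dataset(X_va, label=y_va)
booster = lgbm.train(params, dtrain, num_boost_round=300, valid_sets=dwatch,
                     verbose_eval=10, early_stopping_rounds=10)

pred = booster.predict(X_va)       # (100, 4) class probabilities, like the pred/results returned above
print(pred.shape, pred[0].sum())   # each row sums to 1
```
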
/src/model/lstmconv_model.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | import tensorflow as tf
3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
4 | from bilm.elmo import weight_layers
5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
6 |
7 | n_sub = 10
8 | n_filters = 100
9 |
10 |
11 | class LstmconvModel(BasicDeepModel):
12 | def __init__(self, name='basicModel', n_folds=5, config=None):
13 | name = 'lstmconv' + config.main_feature
14 | self.hidden_dim = 300
15 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
16 |
17 | def LSTM(self, layers=1):
18 | lstms = []
19 | for num in range(layers):
20 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0)
21 | print(lstm.name)
22 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim)
23 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob)
24 | lstms.append(lstm)
25 |
26 | lstms = tf.contrib.rnn.MultiRNNCell(lstms)
27 | return lstms
28 |
29 | def create_model(self, share_dense=True, concat_sub=True):
30 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y')
31 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
32 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
33 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
34 |
35 | if self.main_feature.lower() in ['word', 'char']:
36 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
37 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
38 |             self.word_encoding = tf.nn.embedding_lookup(self.word_embedding, self.input_x)  # look up the trainable variable, not the raw numpy matrix
39 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
40 |
41 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
42 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
43 | if self.main_feature == 'elmo_word':
44 | options_file = self.config.elmo_word_options_file
45 | weight_file = self.config.elmo_word_weight_file
46 | embed_file = self.config.elmo_word_embed_file
47 | elif self.main_feature == 'elmo_char':
48 | options_file = self.config.elmo_char_options_file
49 | weight_file = self.config.elmo_char_weight_file
50 | embed_file = self.config.elmo_char_embed_file
51 | elif self.main_feature == 'elmo_qiuqiu':
52 | options_file = self.config.elmo_qiuqiu_options_file
53 | weight_file = self.config.elmo_qiuqiu_weight_file
54 | embed_file = self.config.elmo_qiuqiu_embed_file
55 | self.bilm = BidirectionalLanguageModel(options_file,
56 | weight_file,
57 | use_character_inputs=False,
58 | embedding_weight_file=embed_file,
59 | max_batch_size=self.batch_size)
60 | bilm_embedding_op = self.bilm(self.input_x)
61 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
62 | self.word_encoding = bilm_embedding['weighted_op']
63 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
64 |
65 | else:
66 | exit('wrong feature')
67 |
68 | c_outputs = []
69 | for c in range(n_sub):
70 | with tf.variable_scope('lstm-{}'.format(c)):
71 | # self.forward = self.LSTM()
72 | # self.backward = self.LSTM()
73 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32)
74 | # x = tf.concat(x, -1)
75 | #### cudnn lstm ####
76 | self.forward = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32)
77 | x, _ = self.forward(tf.transpose(self.word_encoding, [1, 0, 2]))
78 | x = tf.transpose(x, [1, 0, 2])
79 |
80 | with tf.variable_scope('conv-{}'.format(c)):
81 | inputs_expanded = tf.expand_dims(x, -1)
82 | filter_shape = [3, 2*self.hidden_dim, 1, n_filters]
83 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W')
84 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters]))
85 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d')
86 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
87 | max_pooled = tf.nn.max_pool(h,
88 | ksize=[1, self.max_len-3+1, 1, 1],
89 | strides=[1, 1, 1, 1],
90 | padding='VALID',
91 | name='max_pool')
92 | avg_pooled = tf.nn.avg_pool(h,
93 | ksize=[1, self.max_len-3+1, 1, 1],
94 | strides=[1, 1, 1, 1],
95 | padding='VALID',
96 | name='avg_pool')
97 | concat_pooled = tf.reshape(tf.concat((max_pooled, avg_pooled), -1), [-1, 2*n_filters])
98 |
99 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob)
100 | dense = tf.layers.dense(concat_pooled, 4, activation=None)
101 | c_outputs.append(dense)
102 |
103 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4])
104 | y_ = tf.nn.softmax(self.logits)
105 | self.prob = tf.reshape(y_, [-1, n_sub, 4])
106 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
107 |
108 | if not self.config.balance:
109 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
110 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
111 | else:
112 |             # class0_weight = 0.882 * self.n_classes  # weight coefficient for class 0
113 |             # class1_weight = 0.019 * self.n_classes  # weight coefficient for class 1
114 |             # class2_weight = 0.080 * self.n_classes  # weight coefficient for class 2
115 |             # class3_weight = 0.019 * self.n_classes  # weight coefficient for class 3
116 |             class0_weight = 1  # weight coefficient for class 0
117 |             class1_weight = 3  # weight coefficient for class 1
118 |             class2_weight = 3  # weight coefficient for class 2
119 |             class3_weight = 3  # weight coefficient for class 3
120 | # coe = tf.constant([1., 1., 1., 1.])
121 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
122 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
123 |
124 | y = tf.reshape(self.input_y, [-1, 4])
125 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
126 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
127 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
128 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
129 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
130 |
131 | return self
132 |
133 | def create_model_v1(self, share_dense=True, concat_sub=True):
134 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y')
135 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
136 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
137 |
138 | if self.main_feature.lower() in ['word', 'char']:
139 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
140 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
141 |             self.word_encoding = tf.nn.embedding_lookup(self.word_embedding, self.input_x)  # look up the trainable variable, not the raw numpy matrix
142 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
143 |
144 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
145 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
146 | if self.main_feature == 'elmo_word':
147 | options_file = self.config.elmo_word_options_file
148 | weight_file = self.config.elmo_word_weight_file
149 | embed_file = self.config.elmo_word_embed_file
150 | elif self.main_feature == 'elmo_char':
151 | options_file = self.config.elmo_char_options_file
152 | weight_file = self.config.elmo_char_weight_file
153 | embed_file = self.config.elmo_char_embed_file
154 | elif self.main_feature == 'elmo_qiuqiu':
155 | options_file = self.config.elmo_qiuqiu_options_file
156 | weight_file = self.config.elmo_qiuqiu_weight_file
157 | embed_file = self.config.elmo_qiuqiu_embed_file
158 |
159 | self.bilm = BidirectionalLanguageModel(options_file,
160 | weight_file,
161 | use_character_inputs=False,
162 | embedding_weight_file=embed_file,
163 | max_batch_size=self.batch_size)
164 | bilm_embedding_op = self.bilm(self.input_x)
165 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
166 | self.word_encoding = bilm_embedding['weighted_op']
167 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
168 |
169 | else:
170 | exit('wrong feature')
171 |
172 | self.forward = self.LSTM()
173 | self.backward = self.LSTM()
174 | x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32)
175 | x = tf.concat(x, -1)
176 |
177 | inputs_expanded = tf.expand_dims(x, -1)
178 | filter_shape = [3, 2*self.hidden_dim, 1, n_filters]
179 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W')
180 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters]))
181 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d')
182 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
183 | output_sentence = tf.reshape(h, [-1, self.max_len-3+1, n_filters])
184 |
185 | # output_sentence = tf.layers.dense(x, self.hidden_dim, activation=tf.nn.relu)
186 |
187 | x_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len-3+1, n_filters])
188 |         x_tile = tf.tile(x_reshape, [1, n_sub, 1, 1])  # replicate the sentence n_sub times, once per aspect
189 |
190 | sub_embedding = tf.get_variable(shape=[n_sub, n_filters], name='sub_embedding')
191 | sub_reshape = tf.reshape(sub_embedding, [1, n_sub, 1, n_filters])
192 | sub_tile = tf.tile(sub_reshape, [self.batch_size, 1, self.max_len-3+1, 1])
193 |
194 | embed_concat = tf.reshape(tf.concat((x_tile, sub_tile), -1), [-1, 2*n_filters])
195 |
196 | att_w = tf.get_variable(shape=[2*n_filters, n_filters], name='att_w')
197 | att_b = tf.get_variable(shape=[n_filters], name='att_b')
198 | att_v = tf.get_variable(shape=[n_filters, 1], name='att_v')
199 |
200 | score = tf.matmul(tf.nn.tanh(tf.matmul(embed_concat, att_w) + att_b), att_v)
201 | score_fit = tf.reshape(score, [-1, n_sub, self.max_len-3+1])
202 | alpha = tf.nn.softmax(score_fit)
203 |
204 | layer_sentence = tf.matmul(alpha, output_sentence)
205 |
206 | if concat_sub:
207 |             # optionally concatenate the aspect (sub) embedding to the attended sentence vector
208 | layer_sub = tf.reshape(sub_embedding, [1, n_sub, n_filters])
209 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1])
210 |
211 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1)
212 | outputs = tf.reshape(layer_total, [-1, 2*n_filters])
213 | else:
214 | outputs = tf.reshape(layer_sentence, [-1, n_filters])
215 |
216 |         self.logits = tf.layers.dense(outputs, 4, activation=None)  # use the flattened per-aspect features built above
217 | y_ = tf.nn.softmax(self.logits)
218 | self.prob = tf.reshape(y_, [-1, 10, 4])
219 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
220 |
221 | if not self.config.balance:
222 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
223 |             # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))  # disabled: input_y2 is not defined in this variant
224 | else:
225 |             # class0_weight = 0.882 * self.n_classes  # weight coefficient for class 0
226 |             # class1_weight = 0.019 * self.n_classes  # weight coefficient for class 1
227 |             # class2_weight = 0.080 * self.n_classes  # weight coefficient for class 2
228 |             # class3_weight = 0.019 * self.n_classes  # weight coefficient for class 3
229 |             class0_weight = 0.7  # weight coefficient for class 0
230 |             class1_weight = 1.3  # weight coefficient for class 1
231 |             class2_weight = 1  # weight coefficient for class 2
232 |             class3_weight = 1.3  # weight coefficient for class 3
233 | # coe = tf.constant([1., 1., 1., 1.])
234 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
235 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
236 |
237 | y = tf.reshape(self.input_y, [-1, 4])
238 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
239 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
240 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
241 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
242 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
243 |
244 | return self
245 |
246 |
247 |
--------------------------------------------------------------------------------
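
One detail worth noting in `LstmconvModel`: `cudnn_rnn.CudnnLSTM` consumes time-major tensors, which is why the input is transposed to `[max_len, batch, features]` before the cell and transposed back afterwards. A shape-only NumPy illustration (not repo code; sizes are made up):

```
# Shape-only illustration (not repo code) of the time-major round trip around CudnnLSTM.
import numpy as np

batch, max_len, embed, hidden = 4, 100, 300, 300    # made-up sizes
x = np.zeros((batch, max_len, embed))               # batch-major, as produced by the embedding lookup

x_time_major = np.transpose(x, (1, 0, 2))           # -> (max_len, batch, embed), what CudnnLSTM expects
# ... a bidirectional CudnnLSTM(num_units=hidden) would map embed -> 2*hidden along the last axis ...
y_time_major = np.zeros((max_len, batch, 2 * hidden))   # stand-in for the cuDNN output

y = np.transpose(y_time_major, (1, 0, 2))           # back to (batch, max_len, 2*hidden) for the conv head
print(x_time_major.shape, y.shape)
```
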
/src/model/lstmgru_model.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | import tensorflow as tf
3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
4 | from bilm.elmo import weight_layers
5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
6 |
7 | n_sub = 10
8 |
9 |
10 | class LstmgruModel(BasicDeepModel):
11 | def __init__(self, name='basicModel', n_folds=5, config=None):
12 | name = 'lstmgru' + config.main_feature
13 | self.hidden_dim = 300
14 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
15 |
16 | def create_model(self, share_dense=True, concat_sub=True):
17 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y')
18 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
19 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
20 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
21 |
22 | if self.main_feature.lower() in ['word', 'char']:
23 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
24 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
25 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
26 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
27 |
28 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
29 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
30 | if self.main_feature == 'elmo_word':
31 | options_file = self.config.elmo_word_options_file
32 | weight_file = self.config.elmo_word_weight_file
33 | embed_file = self.config.elmo_word_embed_file
34 | elif self.main_feature == 'elmo_char':
35 | options_file = self.config.elmo_char_options_file
36 | weight_file = self.config.elmo_char_weight_file
37 | embed_file = self.config.elmo_char_embed_file
38 | elif self.main_feature == 'elmo_qiuqiu':
39 | options_file = self.config.elmo_qiuqiu_options_file
40 | weight_file = self.config.elmo_qiuqiu_weight_file
41 | embed_file = self.config.elmo_qiuqiu_embed_file
42 |
43 | self.bilm = BidirectionalLanguageModel(options_file,
44 | weight_file,
45 | use_character_inputs=False,
46 | embedding_weight_file=embed_file,
47 | max_batch_size=self.batch_size)
48 | bilm_embedding_op = self.bilm(self.input_x)
49 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
50 | self.word_encoding = bilm_embedding['weighted_op']
51 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
52 |
53 | else:
54 | exit('wrong feature')
55 |
56 | c_outputs = []
57 | for c in range(n_sub):
58 | with tf.variable_scope('lstm-{}'.format(c)):
59 | # self.forward = self.LSTM()
60 | # self.backward = self.LSTM()
61 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32)
62 | # x = tf.concat(x, -1)
63 | #### cudnn lstm ####
64 | self.forward_lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32)
65 | self.forward_gru = cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32)
66 | x, _ = self.forward_lstm(tf.transpose(self.word_encoding, [1, 0, 2]))
67 | x, _ = self.forward_gru(x)
68 | x = tf.transpose(x, [1, 0, 2])
69 |
70 | with tf.variable_scope('pooling-{}'.format(c)):
71 | max_pooled = tf.reshape(tf.reduce_max(x, 1), [-1, 2*self.hidden_dim])
72 | avg_pooled = tf.reshape(tf.reduce_mean(x, 1), [-1, 2*self.hidden_dim])
73 | concat_pooled = tf.concat((max_pooled, avg_pooled), -1)
74 |
75 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob)
76 | dense = tf.layers.dense(concat_pooled, 4, activation=None)
77 | c_outputs.append(dense)
78 |
79 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4])
80 | y_ = tf.nn.softmax(self.logits)
81 | self.prob = tf.reshape(y_, [-1, n_sub, 4])
82 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
83 |
84 | if not self.config.balance:
85 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
86 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
87 | else:
88 |             # class0_weight = 0.882 * self.n_classes  # weight coefficient for class 0
89 |             # class1_weight = 0.019 * self.n_classes  # weight coefficient for class 1
90 |             # class2_weight = 0.080 * self.n_classes  # weight coefficient for class 2
91 |             # class3_weight = 0.019 * self.n_classes  # weight coefficient for class 3
92 |             class0_weight = 1  # weight coefficient for class 0
93 |             class1_weight = 3  # weight coefficient for class 1
94 |             class2_weight = 3  # weight coefficient for class 2
95 |             class3_weight = 3  # weight coefficient for class 3
96 | # coe = tf.constant([1., 1., 1., 1.])
97 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
98 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
99 |
100 | y = tf.reshape(self.input_y, [-1, 4])
101 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
102 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
103 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
104 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
105 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
106 |
107 | return self
108 |
109 |
--------------------------------------------------------------------------------
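
`LstmgruModel` gives every one of the 10 aspects its own CuDNN LSTM-then-GRU tower and summarises each sequence with concatenated max- and mean-pooling before a 4-way dense head. A NumPy sketch of that pooling head with illustrative sizes (not repo code):

```
# Illustrative only: the max+mean pooling head applied per aspect in LstmgruModel.
import numpy as np

batch, max_len, hidden = 2, 50, 300                      # made-up sizes
x = np.random.rand(batch, max_len, 2 * hidden)           # bidirectional GRU output for one aspect tower

max_pooled = x.max(axis=1)                               # (batch, 600)
avg_pooled = x.mean(axis=1)                              # (batch, 600)
head_in = np.concatenate([max_pooled, avg_pooled], -1)   # (batch, 1200) -> dropout -> dense(4)
print(head_in.shape)
```
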
/src/model/ml_models.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicStaticModel
2 | from sklearn import svm
3 | from sklearn.svm import SVC
4 | from sklearn.naive_bayes import MultinomialNB
5 | from sklearn.calibration import CalibratedClassifierCV
6 | from sklearn.metrics import f1_score
7 | from skift import FirstColFtClassifier
8 |
9 | import logging
10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s')
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | class SVCClassifier(BasicStaticModel):
15 |
16 | def __init__(self, name='basicModel', n_folds=5, config=None):
17 | BasicStaticModel.__init__(self, name=name, n_folds=n_folds, config=config)
18 |
19 | def create_model(self):
20 |         classifier = SVC(kernel="rbf")                     # earlier experiments, overridden below
21 |         classifier = CalibratedClassifierCV(classifier)
22 |         classifier = SVC(kernel="linear")
23 |         self.classifier = classifier
24 |         self.classifier = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6)  # only this final LinearSVC is actually used
25 | return self.classifier
26 |
27 |
28 | class Fasttext(BasicStaticModel):
29 | def __init__(self, name='basicModel', n_folds=5, config=None):
30 | BasicStaticModel.__init__(self, name=name, n_folds=n_folds, config=config)
31 |
32 | def create_model(self):
33 | sk_clf = FirstColFtClassifier(lr=1.0, epoch=10,
34 | wordNgrams=1,
35 | minCount=5, verbose=2)
36 | return sk_clf
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/src/model/model_component.py:
--------------------------------------------------------------------------------
1 | from keras.layers import *
2 | from keras.models import *
3 |
4 |
5 | class AttLayer(Layer):
6 | def __init__(self, attention_dim):
7 | self.init = initializers.get('normal')
8 | self.supports_masking = True
9 | self.attention_dim = attention_dim
10 | super(AttLayer, self).__init__()
11 |
12 | def build(self, input_shape):
13 | assert len(input_shape) == 3
14 | self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
15 | self.b = K.variable(self.init((self.attention_dim, )))
16 | self.u = K.variable(self.init((self.attention_dim, 1)))
17 | self.trainable_weights = [self.W, self.b, self.u]
18 | super(AttLayer, self).build(input_shape)
19 |
20 | def compute_mask(self, inputs, mask=None):
21 | return mask
22 |
23 | def call(self, x, mask=None):
24 | # size of x :[batch_size, sel_len, attention_dim]
25 | # size of u :[batch_size, attention_dim]
26 | # uit = tanh(xW+b)
27 | uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
28 | ait = K.dot(uit, self.u)
29 | ait = K.squeeze(ait, -1)
30 |
31 | ait = K.exp(ait)
32 |
33 | if mask is not None:
34 | # Cast the mask to floatX to avoid float64 upcasting in theano
35 | ait *= K.cast(mask, K.floatx())
36 | ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
37 | ait = K.expand_dims(ait)
38 | weighted_input = x * ait
39 | output = K.sum(weighted_input, axis=1)
40 |
41 | return output
42 |
43 | def compute_output_shape(self, input_shape):
44 | return (input_shape[0], input_shape[-1])
45 |
46 |
47 | class AttentionWeightedAverage(Layer):
48 | """
49 | Computes a weighted average of the different channels across timesteps.
50 | Uses 1 parameter pr. channel to compute the attention value for a single timestep.
51 | """
52 |
53 | def __init__(self, return_attention=False, **kwargs):
54 | self.init = initializers.get('uniform')
55 | self.supports_masking = True
56 | self.return_attention = return_attention
57 | super(AttentionWeightedAverage, self).__init__(**kwargs)
58 |
59 | def build(self, input_shape):
60 | self.input_spec = [InputSpec(ndim=3)]
61 | assert len(input_shape) == 3
62 |
63 | self.W = self.add_weight(shape=(input_shape[2], 1),
64 | name='{}_W'.format(self.name),
65 | initializer=self.init)
66 | self.trainable_weights = [self.W]
67 | super(AttentionWeightedAverage, self).build(input_shape)
68 |
69 | def call(self, x, mask=None):
70 | # computes a probability distribution over the timesteps
71 | # uses 'max trick' for numerical stability
72 | # reshape is done to avoid issue with Tensorflow
73 | # and 1-dimensional weights
74 | logits = K.dot(x, self.W)
75 | x_shape = K.shape(x)
76 | logits = K.reshape(logits, (x_shape[0], x_shape[1]))
77 | ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
78 |
79 | # masked timesteps have zero weight
80 | if mask is not None:
81 | mask = K.cast(mask, K.floatx())
82 | ai = ai * mask
83 | att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
84 | weighted_input = x * K.expand_dims(att_weights)
85 | result = K.sum(weighted_input, axis=1)
86 | if self.return_attention:
87 | return [result, att_weights]
88 | return result
89 |
90 | def get_output_shape_for(self, input_shape):
91 | return self.compute_output_shape(input_shape)
92 |
93 | def compute_output_shape(self, input_shape):
94 | output_len = input_shape[2]
95 | if self.return_attention:
96 | return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
97 | return (input_shape[0], output_len)
98 |
99 | def compute_mask(self, input, input_mask=None):
100 | if isinstance(input_mask, list):
101 | return [None] * len(input_mask)
102 | else:
103 | return None
104 |
105 | def squash(x, axis=-1):
106 | # s_squared_norm is really small
107 | # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
108 | # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
109 | # return scale * x
110 | s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
111 | scale = K.sqrt(s_squared_norm + K.epsilon())
112 | return x / scale
113 |
114 |
115 | # A Capsule Implement with Pure Keras
116 | class Capsule(Layer):
117 | def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
118 | activation='default', **kwargs):
119 | super(Capsule, self).__init__(**kwargs)
120 | self.num_capsule = num_capsule
121 | self.dim_capsule = dim_capsule
122 | self.routings = routings
123 | self.kernel_size = kernel_size
124 | self.share_weights = share_weights
125 | if activation == 'default':
126 | self.activation = squash
127 | else:
128 | self.activation = Activation(activation)
129 |
130 | def build(self, input_shape):
131 | super(Capsule, self).build(input_shape)
132 | input_dim_capsule = input_shape[-1]
133 | if self.share_weights:
134 | self.W = self.add_weight(name='capsule_kernel',
135 | shape=(1, input_dim_capsule,
136 | self.num_capsule * self.dim_capsule),
137 | # shape=self.kernel_size,
138 | initializer='glorot_uniform',
139 | trainable=True)
140 | else:
141 | input_num_capsule = input_shape[-2]
142 | self.W = self.add_weight(name='capsule_kernel',
143 | shape=(input_num_capsule,
144 | input_dim_capsule,
145 | self.num_capsule * self.dim_capsule),
146 | initializer='glorot_uniform',
147 | trainable=True)
148 |
149 | def call(self, u_vecs):
150 | if self.share_weights:
151 | u_hat_vecs = K.conv1d(u_vecs, self.W)
152 | else:
153 | u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])
154 |
155 | batch_size = K.shape(u_vecs)[0]
156 | input_num_capsule = K.shape(u_vecs)[1]
157 | u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
158 | self.num_capsule, self.dim_capsule))
159 | u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
160 | # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]
161 |
162 | b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule]
163 | for i in range(self.routings):
164 | b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule]
165 | c = K.softmax(b)
166 | c = K.permute_dimensions(c, (0, 2, 1))
167 | b = K.permute_dimensions(b, (0, 2, 1))
168 | outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
169 | if i < self.routings - 1:
170 | b = K.batch_dot(outputs, u_hat_vecs, [2, 3])
171 |
172 | return outputs
173 |
174 | def compute_output_shape(self, input_shape):
175 | return (None, self.num_capsule, self.dim_capsule)
176 |
177 | def dot_product(x, kernel):
178 | """
179 | Wrapper for dot product operation, in order to be compatible with both
180 | Theano and Tensorflow
181 | Args:
182 | x (): input
183 | kernel (): weights
184 |     Returns: the dot product of x and kernel
185 | """
186 | if K.backend() == 'tensorflow':
187 | return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
188 | else:
189 | return K.dot(x, kernel)
190 |
191 |
192 | class AttentionWithContext(Layer):
193 | """
194 | Attention operation, with a context/query vector, for temporal data.
195 | Supports Masking.
196 | Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
197 | "Hierarchical Attention Networks for Document Classification"
198 | by using a context vector to assist the attention
199 | # Input shape
200 | 3D tensor with shape: `(samples, steps, features)`.
201 | # Output shape
202 | 2D tensor with shape: `(samples, features)`.
203 | How to use:
204 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
205 | The dimensions are inferred based on the output shape of the RNN.
206 | Note: The layer has been tested with Keras 2.0.6
207 | Example:
208 | model.add(LSTM(64, return_sequences=True))
209 | model.add(AttentionWithContext())
210 | # next add a Dense layer (for classification/regression) or whatever...
211 | """
212 |
213 | def __init__(self,
214 | W_regularizer=None, u_regularizer=None, b_regularizer=None,
215 | W_constraint=None, u_constraint=None, b_constraint=None,
216 | bias=True, **kwargs):
217 |
218 | self.supports_masking = True
219 | self.init = initializers.get('glorot_uniform')
220 |
221 | self.W_regularizer = regularizers.get(W_regularizer)
222 | self.u_regularizer = regularizers.get(u_regularizer)
223 | self.b_regularizer = regularizers.get(b_regularizer)
224 |
225 | self.W_constraint = constraints.get(W_constraint)
226 | self.u_constraint = constraints.get(u_constraint)
227 | self.b_constraint = constraints.get(b_constraint)
228 |
229 | self.bias = bias
230 | super(AttentionWithContext, self).__init__(**kwargs)
231 |
232 | def build(self, input_shape):
233 | assert len(input_shape) == 3
234 |
235 | self.W = self.add_weight((input_shape[-1], input_shape[-1],),
236 | initializer=self.init,
237 | name='{}_W'.format(self.name),
238 | regularizer=self.W_regularizer,
239 | constraint=self.W_constraint)
240 | if self.bias:
241 | self.b = self.add_weight((input_shape[-1],),
242 | initializer='zero',
243 | name='{}_b'.format(self.name),
244 | regularizer=self.b_regularizer,
245 | constraint=self.b_constraint)
246 |
247 | self.u = self.add_weight((input_shape[-1],),
248 | initializer=self.init,
249 | name='{}_u'.format(self.name),
250 | regularizer=self.u_regularizer,
251 | constraint=self.u_constraint)
252 |
253 | super(AttentionWithContext, self).build(input_shape)
254 |
255 | def compute_mask(self, input, input_mask=None):
256 | # do not pass the mask to the next layers
257 | return None
258 |
259 | def call(self, x, mask=None):
260 | uit = dot_product(x, self.W)
261 |
262 | if self.bias:
263 | uit += self.b
264 |
265 | uit = K.tanh(uit)
266 | ait = K.dot(uit, self.u)
267 |
268 | a = K.exp(ait)
269 |
270 | # apply mask after the exp. will be re-normalized next
271 | if mask is not None:
272 | # Cast the mask to floatX to avoid float64 upcasting in theano
273 | a *= K.cast(mask, K.floatx())
274 |
275 | # in some cases especially in the early stages of training the sum may be almost zero
276 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
277 | # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
278 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
279 |
280 | a = K.expand_dims(a)
281 | weighted_input = x * a
282 | return K.sum(weighted_input, axis=1)
283 |
284 | def compute_output_shape(self, input_shape):
285 | return input_shape[0], input_shape[-1]
286 |
--------------------------------------------------------------------------------
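
Note that the active `squash` above simply L2-normalises each capsule vector along its last axis; the commented-out lines are the scaled squash variant. A small NumPy check (illustrative, not repo code):

```
# NumPy check (illustrative, not repo code): the active squash L2-normalises each capsule vector.
import numpy as np

def squash_np(x, axis=-1, eps=1e-7):
    s_squared_norm = np.sum(np.square(x), axis=axis, keepdims=True)
    return x / np.sqrt(s_squared_norm + eps)

caps = np.random.rand(2, 10, 16) * 5        # (batch, num_capsule, dim_capsule)
out = squash_np(caps)
print(np.linalg.norm(out, axis=-1))         # every capsule vector now has (nearly) unit length
```
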
/src/model/my_callbacks.py:
--------------------------------------------------------------------------------
1 | import keras as keras
2 | from keras import backend as K
3 | import numpy as np
4 | import warnings
5 | import glob
6 | import os
7 | from keras.models import load_model
8 | import pickle
9 |
10 |
11 | class JZTrainCategory(keras.callbacks.Callback):
12 | def __init__(self, filepath, nb_epochs=20, nb_snapshots=1, monitor='val_loss', factor=0.1, verbose=1, patience=1,
13 | save_weights_only=False,
14 | decay_factor_value=1.0,
15 | mode='auto', period=1):
16 | super(JZTrainCategory, self).__init__()
17 | self.nb_epochs = nb_epochs
18 | self.monitor = monitor
19 | self.verbose = verbose
20 | self.filepath = filepath
21 | self.init_factor = factor
22 | self.decay_factor_value = decay_factor_value
23 | self.factor = factor
24 | self.save_weights_only = save_weights_only
25 | self.patience = patience
26 | self.r_patience = 0
27 | self.check = nb_epochs // nb_snapshots
28 | self.monitor_val_list = []
29 | if mode not in ['auto', 'min', 'max']:
30 | warnings.warn('ModelCheckpoint mode %s is unknown, '
31 | 'fallback to auto mode.' % (mode),
32 | RuntimeWarning)
33 | mode = 'auto'
34 | if mode == 'min':
35 | self.monitor_op = np.less
36 | self.init_best = np.Inf
37 | elif mode == 'max':
38 | self.monitor_op = np.greater
39 | self.init_best = -np.Inf
40 | else:
41 | if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):
42 | self.monitor_op = np.greater
43 | self.init_best = -np.Inf
44 | else:
45 | self.monitor_op = np.less
46 | self.init_best = np.Inf
47 |
48 | @staticmethod
49 | def compile_official_f1_score(y_true, y_pred):
50 | y_true = K.reshape(y_true, (-1, 10))
51 | y_true = K.cast(y_true, 'float32')
52 | y_pred = K.round(y_pred)
53 |
54 | tp = K.sum(y_pred * y_true)
55 | fp = K.sum(K.cast(K.greater(y_pred - y_true, 0.), 'float32'))
56 | fn = K.sum(K.cast(K.greater(y_true - y_pred, 0.), 'float32'))
57 | p = tp / (tp + fp)
58 | r = tp / (tp + fn)
59 | f = 2*p*r/(p+r)
60 | return f
61 |
62 | def on_batch_begin(self, batch, logs={}):
63 | return
64 |
65 | def on_batch_end(self, batch, logs={}):
66 | return
67 |
68 | def on_train_end(self, logs={}):
69 | return
70 |
71 | def on_train_begin(self, logs={}):
72 | self.init_lr = K.get_value(self.model.optimizer.lr)
73 | self.best = self.init_best
74 | return
75 |
76 | def on_epoch_begin(self, epoch, logs=None):
77 | return
78 |
79 | def on_epoch_end(self, epoch, logs=None):
80 | logs = logs or {}
81 | logs['lr'] = K.get_value(self.model.optimizer.lr)
82 |
83 | n_recurrent = epoch // self.check
84 | self.save_path = '{}/{}.h5'.format(self.filepath, n_recurrent)
85 | os.makedirs(self.filepath, exist_ok=True)
86 | current = logs.get(self.monitor)
87 | if current is None:
88 | warnings.warn('Can save best model only with %s available, '
89 | 'skipping.' % (self.monitor), RuntimeWarning)
90 |
91 | else:
92 | if self.monitor_op(current, self.best):
93 | # if better result: save model
94 | self.r_patience = 0
95 | if self.verbose > 0:
96 | print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
97 | ' saving model to %s'
98 | % (epoch + 1, self.monitor, self.best,
99 | current, self.save_path))
100 | self.best = current
101 | if self.save_weights_only:
102 | self.model.save_weights(self.save_path)
103 | # pickle.dump(self.model.get_weights(), open('./debug_weight.pkl', 'wb'))
104 | symbolic_weights = getattr(self.model.optimizer, 'weights')
105 | weight_values = K.batch_get_value(symbolic_weights)
106 | with open('{}/optimizer.pkl'.format(self.filepath), 'wb') as f:
107 | pickle.dump(weight_values, f)
108 | else:
109 | self.model.save(self.save_path)
110 |
111 | else:
112 |                 # if worse result: reload the last best saved model
113 | self.r_patience += 1
114 | if self.verbose > 0:
115 | if self.r_patience == self.patience:
116 | print('\nEpoch %05d: %s did not improve from %0.5f' %
117 | (epoch + 1, self.monitor, self.best))
118 | if self.save_weights_only:
119 | self.model.load_weights(self.save_path)
120 | self.model._make_train_function()
121 | with open('{}/optimizer.pkl'.format(self.filepath), 'rb') as f:
122 | weight_values = pickle.load(f)
123 | self.model.optimizer.set_weights(weight_values)
124 | else:
125 | self.model = load_model(self.save_path, custom_objects={'compile_official_f1_score': JZTrainCategory.compile_official_f1_score})
126 | # set new learning rate
127 | old_lr = K.get_value(self.model.optimizer.lr)
128 | new_lr = old_lr * self.factor
129 |                     self.factor *= self.decay_factor_value  # decay the decay factor itself
130 | K.set_value(self.model.optimizer.lr, new_lr)
131 | print('\nReload model and decay learningrate from {} to {}\n'.format(old_lr, new_lr))
132 | self.r_patience = 0
133 |
134 | if (epoch+1) % self.check == 0:
135 | self.monitor_val_list.append(self.best)
136 | self.best = self.init_best
137 | self.factor = self.init_factor
138 |
139 | if (epoch+1) != self.nb_epochs:
140 | K.set_value(self.model.optimizer.lr, self.init_lr)
141 | print('At epoch-{} reset learning rate to mountain-top init lr {}'.format(epoch+1, self.init_lr))
142 |
143 |
--------------------------------------------------------------------------------
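
A hedged usage sketch for `JZTrainCategory`: it acts as a checkpoint plus learning-rate scheduler, saving on improvement, reloading the best weights and shrinking the learning rate after `patience` bad epochs, and restarting from the initial learning rate every `nb_epochs // nb_snapshots` epochs so each snapshot begins a fresh cycle. The toy model, data and checkpoint directory below are placeholders, not from the repo.

```
# Sketch only: wiring JZTrainCategory into a toy Keras model. The model, data and
# checkpoint directory are placeholders; only the callback comes from this repo.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from model.my_callbacks import JZTrainCategory

x = np.random.rand(256, 20)
y = np.random.randint(0, 2, size=(256, 1))

model = Sequential([Dense(32, activation='relu', input_shape=(20,)),
                    Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy')

cb = JZTrainCategory('ckpt_demo',           # directory; one .h5 file is written per snapshot
                     nb_epochs=20,          # should match epochs passed to fit()
                     nb_snapshots=4,        # -> a snapshot boundary (and LR reset) every 5 epochs
                     monitor='val_loss', factor=0.5, patience=1,
                     save_weights_only=True)

model.fit(x, y, validation_split=0.2, epochs=20, batch_size=32, callbacks=[cb])
print(cb.monitor_val_list)                  # best monitored value recorded at each snapshot boundary
```
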
/src/model/rcnn_model.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
3 | from bilm.elmo import weight_layers
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 | from tensorflow.contrib import rnn
8 | import tensorflow.contrib.layers as layers
9 |
10 | filter_sizes = [1, 2, 3, 4]
11 | n_filter = 128
12 | hidden_size = 300
13 | n_sub = 10
14 | n_sent = 4
15 |
16 |
17 | class RCNNModel(BasicDeepModel):
18 | def __init__(self, name='basicModel', n_folds=10, config=None):
19 | name = 'RCNN' + config.main_feature
20 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
21 |
22 | def create_model(self, share_dense=True):
23 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y')
24 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
25 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
26 |
27 | if self.main_feature.lower() in ['word', 'char']:
28 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.max_len], name='input_x')
29 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
30 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
31 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
32 |
33 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
34 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
35 | if self.main_feature == 'elmo_word':
36 | options_file = self.config.elmo_word_options_file
37 | weight_file = self.config.elmo_word_weight_file
38 | embed_file = self.config.elmo_word_embed_file
39 | elif self.main_feature == 'elmo_char':
40 | options_file = self.config.elmo_char_options_file
41 | weight_file = self.config.elmo_char_weight_file
42 | embed_file = self.config.elmo_char_embed_file
43 | elif self.main_feature == 'elmo_qiuqiu':
44 | options_file = self.config.elmo_qiuqiu_options_file
45 | weight_file = self.config.elmo_qiuqiu_weight_file
46 | embed_file = self.config.elmo_qiuqiu_embed_file
47 |
48 | self.bilm = BidirectionalLanguageModel(options_file,
49 | weight_file,
50 | use_character_inputs=False,
51 | embedding_weight_file=embed_file,
52 | max_batch_size=self.batch_size)
53 | bilm_embedding_op = self.bilm(self.input_x)
54 | bilm_embedding = weight_layers('output', bilm_embedding_op, l2_coef=0.0)
55 | self.word_encoding = bilm_embedding['weighted_op']
56 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new
57 |
58 | else:
59 | exit('wrong feature')
60 |
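# One BiGRU + TextCNN branch per subject/aspect (n_sub = 10): every branch reads
# the shared word/char (or ELMo) encoding and produces an
# n_filter * len(filter_sizes) = 512-d vector for its subject.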
61 | rcnn_outputs = []
62 | for i in range(n_sub):
63 | with tf.variable_scope('rcnn_output_%d' % i):
64 | output_bigru = self.bi_gru(self.word_encoding, hidden_size)
65 | output = self.textcnn(output_bigru, self.max_len)
66 | rcnn_outputs.append(output)
67 |
68 | n_filter_total = n_filter * len(filter_sizes)
69 | outputs = tf.reshape(tf.concat(rcnn_outputs, 1), (-1, n_sub, n_filter_total))
70 |
71 | if share_dense:
72 | cnn_outputs = tf.reshape(outputs, (-1, n_filter_total))
73 | W = tf.get_variable('W', shape=[n_filter_total, self.n_classes])
74 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes]))
75 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores')
76 | else:
77 | cnn_outputs = tf.reshape(tf.concat(outputs, 1), (-1, n_sub, n_filter_total))
78 | W = tf.get_variable('W', shape=[self.batch_size, n_filter_total, self.n_classes])
79 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes]))
80 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores')
81 |
82 | y_ = tf.nn.softmax(self.logits)
83 | self.prob = tf.reshape(y_, [-1, n_sub, 4])
84 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
85 |
86 | if not self.config.balance:
87 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
88 | else:
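# Hand-weighted cross entropy: class 0 ("subject not mentioned") dominates the
# labels, so the three sentiment classes are up-weighted by a factor of 3.
# The commented-out values below (0.882/0.019/0.080/0.019 sum to 1) appear to be
# the empirical class distribution.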
89 |             # class0_weight = 0.882 * self.n_classes  # weight for class 0
90 |             # class1_weight = 0.019 * self.n_classes  # weight for class 1
91 |             # class2_weight = 0.080 * self.n_classes  # weight for class 2
92 |             # class3_weight = 0.019 * self.n_classes  # weight for class 3
93 |             class0_weight = 1  # weight for class 0
94 |             class1_weight = 3  # weight for class 1
95 |             class2_weight = 3  # weight for class 2
96 |             class3_weight = 3  # weight for class 3
97 | # coe = tf.constant([1., 1., 1., 1.])
98 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
99 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
100 |
101 | y = tf.reshape(self.input_y, [-1, 4])
102 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
103 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
104 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
105 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
106 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
107 |
108 | return self
109 |
110 | def textcnn(self, cnn_inputs, n_step):
111 | # cnn_inputs = [batch_size, n_step, -1]
112 | inputs = tf.expand_dims(cnn_inputs, -1)
113 | pooled_outputs = []
114 | for i, filter_size in enumerate(filter_sizes):
115 | with tf.variable_scope('conv-maxpool-%s' % filter_size):
116 | filter_shape = [filter_size, hidden_size*2+self.embed_size, 1, n_filter]
117 | W_filter = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W_filter')
118 | beta = tf.get_variable(initializer=tf.constant(0.1, shape=[n_filter]), name='beta')
119 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1]*4, padding='VALID', name='conv')
120 | h = tf.nn.relu(tf.nn.bias_add(conv, beta), name='relu')
121 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
122 | strides=[1]*4, padding='VALID', name='pool')
123 | pooled_outputs.append(pooled)
124 | h_pool = tf.concat(pooled_outputs, 3)
125 | h_pool_flat = tf.reshape(h_pool, [-1, n_filter * len(filter_sizes)])
126 | h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob)
127 | return h_drop
128 |
129 | def gru_cell(self, hidden_size):
130 | cell = rnn.GRUCell(hidden_size, reuse=tf.get_variable_scope().reuse)
131 | return rnn.DropoutWrapper(cell, output_keep_prob=self.output_keep_prob)
132 |
133 | def bi_gru(self, inputs, hidden_size, res_add=True):
134 |         """Build the bi-GRU encoder and return the encoded sequence.
135 |         X_inputs: [batch_size, n_step]
136 |         n_step: number of words per sentence, or number of sentences per document.
137 |         outputs: [batch_size, n_step, hidden_size*2 + embedding_size (if res_add)]
138 | """
139 | cells_fw = [self.gru_cell(hidden_size) for _ in range(1)]
140 | cells_bw = [self.gru_cell(hidden_size) for _ in range(1)]
141 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw]
142 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw]
143 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
144 | initial_states_fw=initial_states_fw,
145 | initial_states_bw=initial_states_bw,
146 | dtype=tf.float32)
147 | if res_add:
148 | outputs = tf.concat([outputs, inputs], axis=2)
149 | return outputs
150 |
151 | # def batchnorm(self, Ylogits, offset, convolutional=False):
152 | # exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, )
153 |
154 |
--------------------------------------------------------------------------------
/src/model/snapshot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 |
4 | import keras.callbacks as callbacks
5 | from keras.callbacks import Callback
6 |
7 | class SnapshotModelCheckpoint(Callback):
8 | """Callback that saves the snapshot weights of the model.
9 | Saves the model weights on certain epochs (which can be considered the
10 | snapshot of the model at that epoch).
11 | Should be used with the cosine annealing learning rate schedule to save
12 | the weight just before learning rate is sharply increased.
13 | # Arguments:
14 | nb_epochs: total number of epochs that the model will be trained for.
15 | nb_snapshots: number of times the weights of the model will be saved.
16 | fn_prefix: prefix for the filename of the weights.
17 | """
18 |
19 | def __init__(self, nb_epochs, nb_snapshots, fn_prefix='Model'):
20 | super(SnapshotModelCheckpoint, self).__init__()
21 |
22 | self.check = nb_epochs // nb_snapshots
23 | self.fn_prefix = fn_prefix
24 |
25 | def on_epoch_end(self, epoch, logs={}):
26 | if epoch != 0 and (epoch + 1) % self.check == 0:
27 | filepath = self.fn_prefix + "-%d.h5" % ((epoch + 1) // self.check)
28 | self.model.save_weights(filepath, overwrite=True)
29 | # if epoch == 1:
30 | # self.model.get_layer('embedding').trainable = True
31 | # self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
32 | # print('now we begin to train our embeding layers')
33 | # self.model.summary()
34 |
35 |
36 | class SnapshotCallbackBuilder:
37 | """Callback builder for snapshot ensemble training of a model.
38 | Creates a list of callbacks, which are provided when training a model
39 | so as to save the model weights at certain epochs, and then sharply
40 | increase the learning rate.
41 | """
42 |
43 | def __init__(self, nb_epochs, nb_snapshots, init_lr=0.1):
44 | """
45 | Initialize a snapshot callback builder.
46 | # Arguments:
47 | nb_epochs: total number of epochs that the model will be trained for.
48 | nb_snapshots: number of times the weights of the model will be saved.
49 | init_lr: initial learning rate
50 | """
51 | self.T = nb_epochs
52 | self.M = nb_snapshots
53 | self.alpha_zero = init_lr
54 |
55 | def get_callbacks(self, model_save_place='./', model_prefix='Model'):
56 | """
57 | Creates a list of callbacks that can be used during training to create a
58 | snapshot ensemble of the model.
59 | Args:
60 | model_prefix: prefix for the filename of the weights.
61 | Returns: list of 3 callbacks [ModelCheckpoint, LearningRateScheduler,
62 | SnapshotModelCheckpoint] which can be provided to the 'fit' function
63 | """
64 | if not os.path.exists(model_save_place):
65 | os.makedirs(model_save_place)
66 |
67 | callback_list = [
68 | callbacks.LearningRateScheduler(schedule=self._cosine_anneal_schedule),
69 | SnapshotModelCheckpoint(self.T, self.M, fn_prefix='%s/%s' % (model_save_place, model_prefix))]
70 |
71 | return callback_list
72 |
73 | def _cosine_anneal_schedule(self, t):
74 | cos_inner = np.pi * (t % (self.T // self.M)) # t - 1 is used when t has 1-based indexing.
75 | cos_inner /= self.T // self.M
76 | cos_out = np.cos(cos_inner) + 1
77 | alpha = float(self.alpha_zero / 2 * cos_out)
78 | print('lr: {}'.format(alpha))
79 | return alpha
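# Illustration (not in the original source), using the values stacking.py passes
# in (nb_epochs=30, nb_snapshots=3, init_lr=1e-3): each cycle lasts T // M = 10
# epochs, so the lr starts at 1e-3, follows the half-cosine down to ~2.4e-5 at
# epoch 9, then jumps back to 1e-3 at epochs 10 and 20; SnapshotModelCheckpoint
# saves a weight snapshot at the end of each 10-epoch cycle.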
80 |
--------------------------------------------------------------------------------
/src/model/textcnn_model.py:
--------------------------------------------------------------------------------
1 | from model.model_basic import BasicDeepModel
2 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings
3 | from bilm.elmo import weight_layers
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 |
8 | filter_sizes = [1, 2, 3, 4]
9 | n_filters = 128
10 | n_sub = 10
11 | n_sent = 4
12 |
13 |
14 | class TextCNNModel(BasicDeepModel):
15 |
16 | def __init__(self, name='basicModel', n_folds=5, config=None):
17 | name = 'textCNN' + config.main_feature
18 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config)
19 |
20 | def create_model(self, share_dense=True):
21 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y')
22 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2')
23 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
24 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
25 |
26 | if self.main_feature.lower() in ['word', 'char']:
27 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
28 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
29 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
30 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
31 |
32 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
33 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
34 | if self.main_feature == 'elmo_word':
35 | options_file = self.config.elmo_word_options_file
36 | weight_file = self.config.elmo_word_weight_file
37 | embed_file = self.config.elmo_word_embed_file
38 | elif self.main_feature == 'elmo_char':
39 | options_file = self.config.elmo_char_options_file
40 | weight_file = self.config.elmo_char_weight_file
41 | embed_file = self.config.elmo_char_embed_file
42 | elif self.main_feature == 'elmo_qiuqiu':
43 | options_file = self.config.elmo_qiuqiu_options_file
44 | weight_file = self.config.elmo_qiuqiu_weight_file
45 | embed_file = self.config.elmo_qiuqiu_embed_file
46 |
47 | self.bilm = BidirectionalLanguageModel(options_file,
48 | weight_file,
49 | use_character_inputs=False,
50 | embedding_weight_file=embed_file,
51 | max_batch_size=self.batch_size)
52 | bilm_embedding_op = self.bilm(self.input_x)
53 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
54 | self.word_encoding = bilm_embedding['weighted_op']
55 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new
56 |
57 | else:
58 | exit('wrong feature')
59 |
60 | all_input_expanded = tf.expand_dims(self.word_encoding, -1)
61 |
62 | c_outputs = []
63 | for c in range(n_sub):
64 | pooled_outputs = []
65 | for i, filter_size in enumerate(filter_sizes):
66 | with tf.variable_scope('conv-maxpool-{}-{}'.format(c, filter_size)):
67 |                     # convolution layer
68 | filter_shape = [filter_size, self.embed_size, 1, n_filters]
69 | W = tf.get_variable('W', initializer=tf.truncated_normal(filter_shape, stddev=0.1))
70 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters]))
71 | conv = tf.nn.conv2d(all_input_expanded, W, strides=[1]*4, padding='VALID', name='conv')
72 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
73 | pooled = tf.nn.max_pool(h,
74 | ksize=[1, self.max_len - filter_size + 1, 1, 1],
75 | strides=[1, 1, 1, 1],
76 | padding='VALID',
77 | name='pool')
78 | pooled_outputs.append(pooled)
79 | num_filters_total = n_filters * len(filter_sizes)
80 | h_pool = tf.concat(pooled_outputs, 3)
81 | h_pool_flatten = tf.reshape(h_pool, [-1, 1, num_filters_total])
82 | h_drop = tf.nn.dropout(h_pool_flatten, self.dropout_keep_prob)
83 | dense = tf.layers.dense(h_drop, 4, activation=None)
84 | c_outputs.append(dense)
85 |
86 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4])
87 | y_ = tf.nn.softmax(self.logits)
88 | self.prob = tf.reshape(y_, [-1, n_sub, 4])
89 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
90 |
91 | if not self.config.balance:
92 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
93 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
94 | else:
95 |             # class0_weight = 0.882 * self.n_classes  # weight for class 0
96 |             # class1_weight = 0.019 * self.n_classes  # weight for class 1
97 |             # class2_weight = 0.080 * self.n_classes  # weight for class 2
98 |             # class3_weight = 0.019 * self.n_classes  # weight for class 3
99 |             class0_weight = 1  # weight for class 0
100 |             class1_weight = 3  # weight for class 1
101 |             class2_weight = 3  # weight for class 2
102 |             class3_weight = 3  # weight for class 3
103 | # coe = tf.constant([1., 1., 1., 1.])
104 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
105 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
106 |
107 | y = tf.reshape(self.input_y, [-1, 4])
108 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
109 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
110 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
111 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
112 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
113 |
114 | return self
115 |
116 | def create_model_v1(self, share_dense=True):
117 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y')
118 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')
119 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob')
120 |
121 | if self.main_feature.lower() in ['word', 'char']:
122 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x')
123 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding')
124 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x)
125 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new
126 |
127 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']:
128 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x')
129 | if self.main_feature == 'elmo_word':
130 | options_file = self.config.elmo_word_options_file
131 | weight_file = self.config.elmo_word_weight_file
132 | embed_file = self.config.elmo_word_embed_file
133 | elif self.main_feature == 'elmo_char':
134 | options_file = self.config.elmo_char_options_file
135 | weight_file = self.config.elmo_char_weight_file
136 | embed_file = self.config.elmo_char_embed_file
137 | elif self.main_feature == 'elmo_qiuqiu':
138 | options_file = self.config.elmo_qiuqiu_options_file
139 | weight_file = self.config.elmo_qiuqiu_weight_file
140 | embed_file = self.config.elmo_qiuqiu_embed_file
141 |
142 | self.bilm = BidirectionalLanguageModel(options_file,
143 | weight_file,
144 | use_character_inputs=False,
145 | embedding_weight_file=embed_file,
146 | max_batch_size=self.batch_size)
147 | bilm_embedding_op = self.bilm(self.input_x)
148 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0)
149 | self.word_encoding = bilm_embedding['weighted_op']
150 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new
151 |
152 | else:
153 | exit('wrong feature')
154 |
155 | all_input_expanded = tf.expand_dims(self.word_encoding, -1)
156 | # all_input_expanded = tf.tile(all_input_expanded, [1,1,1,10])
157 |
158 | c_outputs = []
159 | for c in range(n_sub):
160 | pooled_outputs = []
161 | for i, filter_size in enumerate(filter_sizes):
162 | with tf.variable_scope('conv-maxpool-{}-{}'.format(c, filter_size)):
163 |                     # convolution layer
164 | filter_shape = [filter_size, self.embed_size, 1, n_filters]
165 | W = tf.get_variable('W', initializer=tf.truncated_normal(filter_shape, stddev=0.1))
166 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters]))
167 | conv = tf.nn.conv2d(all_input_expanded, W, strides=[1]*4, padding='VALID', name='conv')
168 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
169 | pooled = tf.nn.max_pool(h,
170 | ksize=[1, self.max_len - filter_size + 1, 1, 1],
171 | strides=[1, 1, 1, 1],
172 | padding='VALID',
173 | name='pool')
174 | pooled_outputs.append(pooled)
175 | num_filters_total = n_filters * len(filter_sizes)
176 | h_pool = tf.concat(pooled_outputs, 3)
177 | h_pool_flatten = tf.reshape(h_pool, [-1, num_filters_total])
178 | h_drop = tf.nn.dropout(h_pool_flatten, self.dropout_keep_prob)
179 | c_outputs.append(h_drop)
180 | cnn_outputs = tf.reshape(tf.concat(c_outputs, 1), (-1, n_sub, num_filters_total))
181 |
182 | if share_dense:
183 | cnn_outputs = tf.reshape(cnn_outputs, (-1, num_filters_total))
184 | W = tf.get_variable('W', shape=[num_filters_total, self.n_classes])
185 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes]))
186 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores')
187 | else:
188 | cnn_outputs = tf.reshape(tf.concat(c_outputs, 1), (-1, n_sub, num_filters_total))
189 | W = tf.get_variable('W', shape=[self.batch_size, num_filters_total, self.n_classes])
190 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes]))
191 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores')
192 |
193 | y_ = tf.nn.softmax(self.logits)
194 | self.prob = tf.reshape(y_, [-1, n_sub, 4])
195 | self.prediction = tf.argmax(self.prob, 2, name="prediction")
196 |
197 | if not self.config.balance:
198 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4])))
199 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4])))
200 | else:
201 |             # class0_weight = 0.882 * self.n_classes  # weight for class 0
202 |             # class1_weight = 0.019 * self.n_classes  # weight for class 1
203 |             # class2_weight = 0.080 * self.n_classes  # weight for class 2
204 |             # class3_weight = 0.019 * self.n_classes  # weight for class 3
205 |             class0_weight = 1  # weight for class 0
206 |             class1_weight = 3  # weight for class 1
207 |             class2_weight = 3  # weight for class 2
208 |             class3_weight = 3  # weight for class 3
209 | # coe = tf.constant([1., 1., 1., 1.])
210 | # y = tf.reshape(self.input_y, [-1, 4]) * coe
211 | # self.loss = -tf.reduce_mean(y * tf.log(y_))
212 |
213 | y = tf.reshape(self.input_y, [-1, 4])
214 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0]))
215 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1]))
216 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2]))
217 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3])))
218 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2]))
219 |
220 | return self
221 |
222 |
--------------------------------------------------------------------------------
/src/model/xgboost_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/src/model/xgboost_model.py
--------------------------------------------------------------------------------
/src/pack_sub_dt2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import pandas as pd
4 | import glob
5 | import numpy as np
6 | from tqdm import tqdm
7 | from sklearn.metrics import f1_score
8 |
9 | test_df = pd.read_csv('../data/csvs/test_public.csv')
10 | train_df = pd.read_csv('../data/csvs/train_multi.csv')
11 | true_labels = train_df.iloc[:, 6:].values
12 |
13 | submit_df = pd.DataFrame(columns=['content_id', 'subject', 'sentiment_value', 'sentiment_word'])
14 | train_oof_df = pd.DataFrame(columns=['content_id', 'subject', 'sentiment_value', 'sentiment_word'])
15 | submit_df['content_id'] = test_df['content_id']
16 | train_oof_df['content_id'] = train_df['content_id']
17 |
18 |
19 |
20 | pre_path = '../data/result/0.807*'
21 | pre_filenames = glob.glob(pre_path)
22 | train_oof_filenames = glob.glob(pre_path.replace('pre', 'oof'))
23 |
24 | pre = np.argmax(pickle.load(open(pre_filenames[0], 'rb')), 2)
25 | train_oof_pred = np.argmax(pickle.load(open(train_oof_filenames[0], 'rb')), 2)
26 |
27 | print(pre_filenames)
28 | label_itos = [s.split('_')[1] for s in pickle.load(open('../data/sub_list.pkl', 'rb'))]
29 | n_none = 0
30 | n_mul_label = {}
31 |
32 | f1s = []
33 |
34 | content_ids = []
35 | subjects = []
36 | sentiment_values = []
37 | lost_ids = []
38 |
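# pre[idx] is a length-10 vector of per-subject classes in {0, 1, 2, 3}:
# 0 means the subject is not mentioned, while 1/2/3 map to sentiment -1/0/+1
# (hence the `-2` below). Samples for which no subject at all is predicted are
# back-filled later from an earlier submission file.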
39 | for idx, c_id in enumerate(test_df['content_id']):
40 | n_label = np.sum(pre[idx] > 0)
41 | if not n_label:
42 | n_none += 1
43 | lost_ids.append(c_id)
44 | else:
45 | n_mul_label[n_label] = n_mul_label.get(n_label, 0) + 1
46 | labels = list(np.where(pre[idx]>0)[0])
47 | for l in labels:
48 | content_ids.append(c_id)
49 | subjects.append(label_itos[l])
50 | sentiment_values.append(pre[idx][l]-2)
51 |
52 | soft_df = pd.read_csv('../data/submit/676.csv')
53 | lost_df = soft_df[soft_df['content_id'].isin(lost_ids)]
54 | submit_df = pd.DataFrame({'content_id': content_ids + list(lost_df['content_id']),
55 | 'subject': subjects + list(lost_df['subject']),
56 | 'sentiment_value': sentiment_values + list(lost_df['sentiment_value']),
57 | # 'subject': subjects + ['']*len(lost_ids),
58 | # 'sentiment_value': sentiment_values + ['']*len(lost_ids),
59 | 'sentiment_word': ['']*(len(lost_df)+len(subjects))})
60 |
61 | print('n_none:', n_none)
62 | print('n_pad:', len(lost_df))
63 | os.makedirs('../data/submit', exist_ok=True)
64 | submit_df.to_csv('../data/submit/dt3_stacking_submission.csv', index=None)
65 |
66 | # for i in range(train_oof_pred.shape[1]):
67 | # pre_label = train_oof_pred[:, i]
68 | # true_label = true_labels[:, i]
69 | # f1 = f1_score(true_label, pre_label, average='macro')
70 | # f1s.append(f1)
71 |
72 | # f1 = np.mean(f1s)
73 | # print('f1s->', f1s)
74 | # print('mean f1', f1)
75 | # print('n_none:', n_none)
76 | # os.makedirs('../data/submit', exist_ok=True)
77 |
78 | # submit_df.to_csv('../data/submit/dt2_{}_submission.csv'.format(f1), index=None)
79 |
80 |
81 |
--------------------------------------------------------------------------------
/src/stacking.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import glob
3 | import pandas as pd
4 | from config import Config
5 | from keras.utils import np_utils
6 | from keras.layers import *
7 | from model.snapshot import SnapshotCallbackBuilder
8 | from model.my_callbacks import JZTrainCategory
9 | from keras.models import *
10 | from sklearn.preprocessing import MinMaxScaler
11 | from sklearn.model_selection import KFold
12 | from sklearn.metrics import accuracy_score, f1_score
13 |
14 | from model.model_basic import BasicModel
15 | import numpy as np
16 | import os
17 |
18 |
19 | def get_f1_score(x, y, verbose=False):
20 | tp = np.sum(np.logical_and(y > 0, x == y))
21 |     fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x))  # spurious or wrong-class predictions
22 |     fn = np.sum(np.logical_and(y > 0, x == 0))  # missed predictions
23 |
24 | P = float(tp) / (float(tp + fp) + 1e-8)
25 | R = float(tp) / (float(tp + fn) + 1e-8)
26 | F = 2 * P * R / (P + R + 1e-8)
27 |
28 | if verbose:
29 | print('P->', P)
30 | print('R->', R)
31 | print('F->', F)
32 | return F
33 |
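# Worked example (hypothetical values): with y = [0, 2, 1, 0] (truth, 0 = subject
# not mentioned) and x = [0, 2, 0, 1] (prediction), the "2" is matched (tp=1),
# the "1" is missed (fn=1) and a spurious "1" is predicted where the truth is 0
# (fp=1), giving P = R = F = 0.5.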
34 |
35 | def data_prepare():
36 | train_df = pd.read_csv(config.TRAIN_X)
37 |
38 | if config.data_type == 0:
39 | train_y = {}
40 | sub_list = pickle.load(open('../data/sub_list.pkl', 'rb'))
41 | for sub in sub_list:
42 | train_y_val = train_df[sub].values
43 | train_y[sub] = np_utils.to_categorical(train_y_val, num_classes=config.n_class)
44 | elif config.data_type == 1:
45 | train_y = train_df['c_numerical'].values
46 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_class)
47 | elif config.data_type == 2:
48 | train_y = {}
49 | train_y['subject'] = train_df['sub_numerical'].values
50 | train_y['subject'] = np_utils.to_categorical(train_y['subject'], num_classes=10)
51 | train_y['sentiment_value'] = train_df['sentiment_value'].values
52 | train_y['sentiment_value'] = np_utils.to_categorical(train_y['sentiment_value'], num_classes=3)
53 |
54 | elif config.data_type == 3:
55 |         # data_type 3 is the main setting we ensemble
56 | train_y = train_df.iloc[:, 6:].values
57 | targets = train_y.reshape(-1)
58 | one_hot_targets = np.eye(config.n_classes)[targets]
59 | train_y = one_hot_targets.reshape(-1, 10, config.n_classes)
60 | elif config.data_type == 4:
61 | train_y = (train_df['sentiment_value']+1).values
62 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_class)
63 | elif config.data_type == 5:
64 | train_y = train_df.iloc[:, 4:].values
65 |
66 | else:
67 |         exit('invalid data type')
68 |
69 |     # Stacking features: each base model contributes its out-of-fold (OOF) train probabilities plus the matching test probabilities
70 | filenames = glob.glob('../data/result-qiuqiu/*oof*')
71 | filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse-distillation/*oof*'))
72 | filenames.extend(glob.glob('../data/11_11_result/*oof*'))
73 | # filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse/*oof*'))
74 | # filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse-enhance/*oof*'))
75 |
76 | # filenames = glob.glob('../data/result-stacking/*oof*'.format(args.data_type))
77 | # def filter(filename, f_value):
78 | # return float(filename.split('_')[-3][1:-4]) > f_value
79 |
80 | # filenames = [e for e in filenames if filter(e, args.f_value)]
81 | # filenames = glob.glob('../data/result-dt{}-op1-embed300-debugFalse-enhance/*oof*'.format(args.data_type))
82 | from pprint import pprint
83 | pprint(filenames)
84 |
85 | oof_filename = []
86 | test_filename = []
87 | for j, filename in enumerate(filenames):
88 | p_filename = filename.replace('_oof_', '_pre_')
89 | oof_filename.append(filename)
90 | test_filename.append(p_filename)
91 |
92 | oof_data = []
93 | test_data = []
94 | for i, (tra, tes) in enumerate(zip(oof_filename, test_filename)):
95 |
96 | oof_feature = pickle.load(open(tra, 'rb'))
97 | print(tra, oof_feature.shape)
98 | oof_data.append(oof_feature)
99 |
100 | oof_feature = pickle.load(open(tes, 'rb'))
101 | print(tes, oof_feature.shape)
102 | test_data.append(oof_feature)
103 |
104 | train_x = np.concatenate(oof_data, axis=-1)
105 | test_x = np.concatenate(test_data, axis=-1)
106 | # train_x = np.reshape(train_x, [-1, train_x.shape[-1]])
107 | # test_x = np.reshape(test_x, [-1, test_x.shape[-1]])
108 | print('train_x shape: ', train_x.shape)
109 | print('train_y shape: ', train_y.shape)
110 | print('test_x shape: ', test_x.shape)
111 |
112 | return train_x, train_y, test_x
113 |
114 |
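# Meta-model for stacking: a small MLP (256 -> 128 -> 4 with dropout) applied
# position-wise over the 10-subject axis of the concatenated OOF probabilities,
# i.e. the same dense weights score every subject.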
115 | def get_model(train_x):
116 | input_x = Input(shape=(train_x.shape[-2], train_x.shape[-1]), name='input')
117 | x = Dense(256, activation='relu')(input_x)
118 | x = Dropout(0.5)(x)
119 | x = Dense(128, activation='relu')(x)
120 | x = Dropout(0.5)(x)
121 | x = Dense(4, activation="softmax")(x)
122 | res_model = Model(inputs=[input_x], outputs=x)
123 | return res_model
124 |
125 |
126 | # first-stage stacking
127 | def stacking_first(train, train_y, test):
128 | savepath = './stack_op{}_dt{}_f_value{}/'.format(args.option, args.data_type, args.f_value)
129 | os.makedirs(savepath, exist_ok=True)
130 |
131 | count_kflod = 0
132 | num_folds = 5
133 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
134 | predict = np.zeros((test.shape[0], 10, 4))
135 | oof_predict = np.zeros((train.shape[0], 10, 4))
136 | scores = []
137 |
138 | for i, (train_index, test_index) in enumerate(kf.split(train)):
139 |         print('Fold {}'.format(i))
140 |
141 | kfold_X_train = {}
142 | kfold_X_valid = {}
143 |
144 | y_train, y_test = train_y[train_index], train_y[test_index]
145 |
146 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index]
147 |
148 | model_prefix = savepath + 'DNN' + str(count_kflod)
149 | if not os.path.exists(model_prefix):
150 | os.mkdir(model_prefix)
151 |
152 | M = 3 # number of snapshots
153 | alpha_zero = 1e-3 # initial learning rate
154 | snap_epoch = 30
155 |
156 | snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero)
157 | # M = 1 # number of snapshots
158 | # snap_epoch = 16
159 | # jz_schedule = JZTrainCategory(model_prefix, snap_epoch, M, save_weights_only=True, monitor='val_loss', factor=0.7, patience=1)
160 |
161 | res_model = get_model(train)
162 | res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
163 | res_model.summary()
164 |
165 | # res_model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCH, verbose=1, class_weight=class_weight)
166 | res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, epochs=snap_epoch, verbose=1,
167 | validation_data=(kfold_X_valid, y_test),
168 | callbacks=snapshot.get_callbacks(model_save_place=model_prefix))
169 |
170 | evaluations = []
171 | for i in os.listdir(model_prefix):
172 | if '.h5' in i:
173 | evaluations.append(i)
174 |
175 | test_pred_ = np.zeros((test.shape[0], 10, 4))
176 | oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4))
177 | for run, i in enumerate(evaluations):
178 | print('loading from {}'.format(os.path.join(model_prefix, i)))
179 | res_model.load_weights(os.path.join(model_prefix, i))
180 | test_pred_ += res_model.predict(test, verbose=1, batch_size=256) / len(evaluations)
181 | oof_pred_ += res_model.predict(kfold_X_valid, batch_size=256) / len(evaluations)
182 |
183 | predict += test_pred_ / num_folds
184 | oof_predict[test_index] = oof_pred_
185 |
186 | f1 = get_f1_score(np.argmax(oof_pred_, -1), np.argmax(y_test, -1), verbose=True)
187 |         print(i, ' kfold cv f1 : ', str(f1))
188 | count_kflod += 1
189 | scores.append(f1)
190 | print('f1 {} -> {}'.format(scores, np.mean(scores)))
191 | return predict, oof_predict, np.mean(scores)
192 |
193 | import lightgbm as lgb
194 | def stacking_lightgbm(train, train_y, test):
195 | train_y = np.argmax(train_y, 1)
196 | count_kflod = 0
197 | num_folds = 5
198 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
199 | predict = np.zeros((test.shape[0], config.n_class))
200 | oof_predict = np.zeros((train.shape[0], config.n_class))
201 | scores = []
202 | f1s = []
203 |
204 | params = {'objective': 'multiclass',
205 | 'bagging_seed': 10,
206 | 'boosting_type': 'gbdt',
207 | 'feature_fraction': 0.9,
208 | 'feature_fraction_seed': 10,
209 | 'lambda_l1': 0.5,
210 | 'lambda_l2': 0.5,
211 | 'learning_rate': 0.01,
212 | 'metric': 'multi_logloss',
213 | 'min_child_weight': 1,
214 | # 'min_split_gain': 0,
215 | 'device': 'gpu',
216 | 'gpu_platform_id': 0,
217 | 'gpu_device_id': config.gpu,
218 | 'min_sum_hessian_in_leaf': 0.1,
219 | 'num_leaves': 64,
220 | 'num_thread': -1,
221 | 'num_class': config.n_class,
222 | 'verbose': 1}
223 |
224 | for train_index, test_index in kf.split(train):
225 |
226 | y_train, y_test = train_y[train_index], train_y[test_index]
227 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index]
228 |
229 | d_train = lgb.Dataset(kfold_X_train, label=y_train)
230 | d_watch = lgb.Dataset(kfold_X_valid, label=y_test)
231 |
232 | best = lgb.train(params, d_train, num_boost_round=100, verbose_eval=5,
233 | valid_sets=d_watch,
234 | early_stopping_rounds=6)
235 |
236 | preds1 = best.predict(test)
237 | preds2 = best.predict(kfold_X_valid)
238 |
239 | predict += preds1 / num_folds
240 | # oof_predict[test_index] = preds2
241 |
242 | accuracy = mb.cal_acc(preds2, y_test)
243 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class)
244 |
245 |         print('the kfold cv is : ', str(accuracy))
246 |         print('the kfold f1 is : ', str(f1))
247 | count_kflod += 1
248 | scores.append(accuracy)
249 | f1s.append(f1)
250 | print('total scores is ', np.mean(scores))
251 | print('total f1 is ', np.mean(f1s))
252 | # return predict, np.mean(scores)
253 | return predict
254 |
255 |
256 | from sklearn.linear_model import LogisticRegression
257 | def stacking_lr(train, train_y, test):
258 | train_y = np.argmax(train_y, 1)
259 | count_kflod = 0
260 | num_folds = 6
261 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
262 | predict = np.zeros((test.shape[0], config.n_class))
263 | scores = []
264 | f1s = []
265 | for train_index, test_index in kf.split(train):
266 |
267 | y_train, y_test = train_y[train_index], train_y[test_index]
268 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index]
269 |
270 |         print('Fitting the data')
271 | best = LogisticRegression(C=4, dual=True)
272 | best.fit(kfold_X_train, y_train)
273 |
274 |         print('Predicting')
275 | preds1 = best.predict_proba(test)
276 | preds2 = best.predict_proba(kfold_X_valid)
277 |
278 | predict += preds1 / num_folds
279 | accuracy = mb.cal_acc(preds2, y_test)
280 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class)
281 |
282 |         print('the kfold cv is : ', str(accuracy))
283 |         print('the kfold f1 is : ', str(f1))
284 | count_kflod += 1
285 | scores.append(accuracy)
286 | f1s.append(f1)
287 | print('total scores is ', np.mean(scores))
288 | print('total f1 is ', np.mean(f1s))
289 | # return predict, np.mean(scores)
290 | return predict
291 |
292 | from sklearn import svm
293 | from sklearn.calibration import CalibratedClassifierCV
294 |
295 | def stacking_svm(train, train_y, test):
296 | train_y = np.argmax(train_y, 1)
297 | count_kflod = 0
298 | num_folds = 6
299 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
300 | predict = np.zeros((test.shape[0], config.n_class))
301 | scores = []
302 | f1s = []
303 | for train_index, test_index in kf.split(train):
304 |
305 | y_train, y_test = train_y[train_index], train_y[test_index]
306 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index]
307 |
308 |         print('Fitting the data')
309 | best = svm.LinearSVC()
310 | best = CalibratedClassifierCV(best)
311 | best.fit(kfold_X_train, y_train)
312 |
313 |         print('Predicting')
314 | preds1 = best.predict_proba(test)
315 | preds2 = best.predict_proba(kfold_X_valid)
316 |
317 | predict += preds1 / num_folds
318 | accuracy = mb.cal_acc(preds2, y_test)
319 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class)
320 |
321 |         print('the kfold cv is : ', str(accuracy))
322 |         print('the kfold f1 is : ', str(f1))
323 | count_kflod += 1
324 | scores.append(accuracy)
325 | f1s.append(f1)
326 | print('total scores is ', np.mean(scores))
327 | print('total f1 is ', np.mean(f1s))
328 | # return predict, np.mean(scores)
329 | return predict
330 |
331 |
332 | # Second-stage stacking with pseudo-labeling: first-round test predictions are added to the training set as hard labels
333 | def stacking_pseudo(train, train_y, test, results):
334 | answer = np.reshape(np.argmax(results, axis=-1), [-1])
335 | answer = np.reshape(np.eye(4)[answer], [-1, 10, 4])
336 |
337 | train_y = np.concatenate([train_y, answer], axis=0)
338 | train = np.concatenate([train, test], axis=0)
339 |
340 | savepath = './pesudo_{}_dt{}/'.format(args.option, args.data_type)
341 | if not os.path.exists(savepath):
342 | os.mkdir(savepath)
343 | count_kflod = 0
344 | num_folds = 5
345 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
346 | predict = np.zeros((test.shape[0], 10, 4))
347 | oof_predict = np.zeros((train.shape[0], 10, 4))
348 | scores = []
349 |
350 | for i, (train_index, test_index) in enumerate(kf.split(train)):
351 |         print('Fold {}'.format(i))
352 |
353 | kfold_X_train = {}
354 | kfold_X_valid = {}
355 |
356 | y_train, y_test = train_y[train_index], train_y[test_index]
357 |
358 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index]
359 |
360 | model_prefix = savepath + 'DNN' + str(count_kflod)
361 | if not os.path.exists(model_prefix):
362 | os.mkdir(model_prefix)
363 |
364 | M = 3 # number of snapshots
365 | alpha_zero = 1e-3 # initial learning rate
366 | snap_epoch = 30
367 |
368 | snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero)
369 | # M = 1 # number of snapshots
370 | # snap_epoch = 16
371 | # jz_schedule = JZTrainCategory(model_prefix, snap_epoch, M, save_weights_only=True, monitor='val_loss', factor=0.7, patience=1)
372 |
373 | res_model = get_model(train)
374 | res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
375 | res_model.summary()
376 |
377 | # res_model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCH, verbose=1, class_weight=class_weight)
378 | res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, epochs=snap_epoch, verbose=1,
379 | validation_data=(kfold_X_valid, y_test),
380 | callbacks=snapshot.get_callbacks(model_save_place=model_prefix))
381 |
382 | evaluations = []
383 | for i in os.listdir(model_prefix):
384 | if '.h5' in i:
385 | evaluations.append(i)
386 |
387 | test_pred_ = np.zeros((test.shape[0], 10, 4))
388 | oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4))
389 | for run, i in enumerate(evaluations):
390 | print('loading from {}'.format(os.path.join(model_prefix, i)))
391 | res_model.load_weights(os.path.join(model_prefix, i))
392 | test_pred_ += res_model.predict(test, verbose=1, batch_size=256) / len(evaluations)
393 | oof_pred_ += res_model.predict(kfold_X_valid, batch_size=256) / len(evaluations)
394 |
395 | predict += test_pred_ / num_folds
396 | oof_predict[test_index] = oof_pred_
397 |
398 | f1 = get_f1_score(np.argmax(oof_pred_, -1), np.argmax(y_test, -1), verbose=True)
399 |         print(i, ' kfold cv f1 : ', str(f1))
400 | count_kflod += 1
401 | scores.append(f1)
402 | print('f1 {} -> {}'.format(scores, np.mean(scores)))
403 | return predict, np.mean(scores)
404 |
405 | def save_result(predict, prefix):
406 | os.makedirs('../data/result', exist_ok=True)
407 | with open('../data/result/{}.pkl'.format(prefix), 'wb') as f:
408 | pickle.dump(predict, f)
409 |
410 | if __name__ == '__main__':
411 | import argparse
412 | parser = argparse.ArgumentParser()
413 | parser.add_argument('--gpu', type=str, default='6')
414 |     parser.add_argument('--model', type=str, help='model name')
415 |     parser.add_argument('--option', type=int, default=1, help='training scheme')
416 |     parser.add_argument('--data_type', type=int, default=1, help='problem formulation: 0 = 4-way classification, 1 = single-label classification, 2 = subject first, then sentiment')
417 |     parser.add_argument('--feature', default='word', type=str, help='use word or char features')
418 | parser.add_argument('--es', default=200, type=int, help='embed size')
419 | parser.add_argument('--debug', default=False, action='store_true')
420 | parser.add_argument('--bs', default=256, type=int, help='batch size')
421 | parser.add_argument('--f_value', default=0.0, type=float)
422 | args = parser.parse_args()
423 |
424 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
425 |
426 | import tensorflow as tf
427 | from keras.backend.tensorflow_backend import set_session
428 | tf_config = tf.ConfigProto()
429 | tf_config.gpu_options.allow_growth=True
430 | set_session(tf.Session(config=tf_config))
431 |
432 | mb = BasicModel()
433 | config = Config()
434 | config.gpu = args.gpu
435 | config.data_type = args.data_type
436 | BATCH_SIZE = args.bs
437 |
438 | # cv_stacking()
439 |
440 | # normal stacking
441 | train, train_y, test = data_prepare()
442 |
443 | predicts, oof_predicts, score = stacking_first(train, train_y, test)
444 | save_result(predicts, prefix=str(score))
445 | # save_result(oof_predicts, prefix='oof')
446 |
447 | # predicts = stacking_lightgbm(train, train_y, test)
448 | # save_result(predicts[:10000], prefix='stacking_lgb_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value))
449 |
450 | # predicts = stacking_lr(train, train_y, test)
451 | # save_result(predicts[:10000], prefix='stacking_lr_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value))
452 |
453 | # predicts = stacking_svm(train, train_y, test)
454 | # save_result(predicts[:10000], prefix='stacking_svm_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value))
455 |
456 |     # pseudo labels
457 | predicts, score = stacking_pseudo(train, train_y, test, predicts)
458 | save_result(predicts, prefix=str(score))
459 |
--------------------------------------------------------------------------------
/src/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import unicodedata
23 | import six
24 | import tensorflow as tf
25 |
26 |
27 | def convert_to_unicode(text):
28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
29 | if six.PY3:
30 | if isinstance(text, str):
31 | return text
32 | elif isinstance(text, bytes):
33 | return text.decode("utf-8", "ignore")
34 | else:
35 | raise ValueError("Unsupported string type: %s" % (type(text)))
36 | elif six.PY2:
37 | if isinstance(text, str):
38 | return text.decode("utf-8", "ignore")
39 | elif isinstance(text, unicode):
40 | return text
41 | else:
42 | raise ValueError("Unsupported string type: %s" % (type(text)))
43 | else:
44 | raise ValueError("Not running on Python2 or Python 3?")
45 |
46 |
47 | def printable_text(text):
48 | """Returns text encoded in a way suitable for print or `tf.logging`."""
49 |
50 | # These functions want `str` for both Python2 and Python3, but in one case
51 | # it's a Unicode string and in the other it's a byte string.
52 | if six.PY3:
53 | if isinstance(text, str):
54 | return text
55 | elif isinstance(text, bytes):
56 | return text.decode("utf-8", "ignore")
57 | else:
58 | raise ValueError("Unsupported string type: %s" % (type(text)))
59 | elif six.PY2:
60 | if isinstance(text, str):
61 | return text
62 | elif isinstance(text, unicode):
63 | return text.encode("utf-8")
64 | else:
65 | raise ValueError("Unsupported string type: %s" % (type(text)))
66 | else:
67 | raise ValueError("Not running on Python2 or Python 3?")
68 |
69 |
70 | def load_vocab(vocab_file):
71 | """Loads a vocabulary file into a dictionary."""
72 | vocab = collections.OrderedDict()
73 | index = 0
74 | with tf.gfile.GFile(vocab_file, "r") as reader:
75 | while True:
76 | token = convert_to_unicode(reader.readline())
77 | if not token:
78 | break
79 | token = token.strip()
80 | vocab[token] = index
81 | index += 1
82 | return vocab
83 |
84 |
85 | def convert_by_vocab(vocab, items):
86 | """Converts a sequence of [tokens|ids] using the vocab."""
87 | output = []
88 | for item in items:
89 | output.append(vocab[item])
90 | return output
91 |
92 |
93 | def convert_tokens_to_ids(vocab, tokens):
94 | return convert_by_vocab(vocab, tokens)
95 |
96 |
97 | def convert_ids_to_tokens(inv_vocab, ids):
98 | return convert_by_vocab(inv_vocab, ids)
99 |
100 |
101 | def whitespace_tokenize(text):
102 |   """Runs basic whitespace cleaning and splitting on a piece of text."""
103 | text = text.strip()
104 | if not text:
105 | return []
106 | tokens = text.split()
107 | return tokens
108 |
109 |
110 | class FullTokenizer(object):
111 |   """Runs end-to-end tokenization."""
112 |
113 | def __init__(self, vocab_file, do_lower_case=True):
114 | self.vocab = load_vocab(vocab_file)
115 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
118 |
119 | def tokenize(self, text):
120 | split_tokens = []
121 | for token in self.basic_tokenizer.tokenize(text):
122 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
123 | split_tokens.append(sub_token)
124 |
125 | return split_tokens
126 |
127 | def convert_tokens_to_ids(self, tokens):
128 | return convert_by_vocab(self.vocab, tokens)
129 |
130 | def convert_ids_to_tokens(self, ids):
131 | return convert_by_vocab(self.inv_vocab, ids)
132 |
133 |
134 | class BasicTokenizer(object):
135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
136 |
137 | def __init__(self, do_lower_case=True):
138 | """Constructs a BasicTokenizer.
139 |
140 | Args:
141 | do_lower_case: Whether to lower case the input.
142 | """
143 | self.do_lower_case = do_lower_case
144 |
145 | def tokenize(self, text):
146 | """Tokenizes a piece of text."""
147 | text = convert_to_unicode(text)
148 | text = self._clean_text(text)
149 |
150 | # This was added on November 1st, 2018 for the multilingual and Chinese
151 | # models. This is also applied to the English models now, but it doesn't
152 | # matter since the English models were not trained on any Chinese data
153 | # and generally don't have any Chinese data in them (there are Chinese
154 | # characters in the vocabulary because Wikipedia does have some Chinese
155 | # words in the English Wikipedia.).
156 | text = self._tokenize_chinese_chars(text)
157 |
158 | orig_tokens = whitespace_tokenize(text)
159 | split_tokens = []
160 | for token in orig_tokens:
161 | if self.do_lower_case:
162 | token = token.lower()
163 | token = self._run_strip_accents(token)
164 | split_tokens.extend(self._run_split_on_punc(token))
165 |
166 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
167 | return output_tokens
168 |
169 | def _run_strip_accents(self, text):
170 | """Strips accents from a piece of text."""
171 | text = unicodedata.normalize("NFD", text)
172 | output = []
173 | for char in text:
174 | cat = unicodedata.category(char)
175 | if cat == "Mn":
176 | continue
177 | output.append(char)
178 | return "".join(output)
179 |
180 | def _run_split_on_punc(self, text):
181 | """Splits punctuation on a piece of text."""
182 | chars = list(text)
183 | i = 0
184 | start_new_word = True
185 | output = []
186 | while i < len(chars):
187 | char = chars[i]
188 | if _is_punctuation(char):
189 | output.append([char])
190 | start_new_word = True
191 | else:
192 | if start_new_word:
193 | output.append([])
194 | start_new_word = False
195 | output[-1].append(char)
196 | i += 1
197 |
198 | return ["".join(x) for x in output]
199 |
200 | def _tokenize_chinese_chars(self, text):
201 | """Adds whitespace around any CJK character."""
202 | output = []
203 | for char in text:
204 | cp = ord(char)
205 | if self._is_chinese_char(cp):
206 | output.append(" ")
207 | output.append(char)
208 | output.append(" ")
209 | else:
210 | output.append(char)
211 | return "".join(output)
212 |
213 | def _is_chinese_char(self, cp):
214 | """Checks whether CP is the codepoint of a CJK character."""
215 | # This defines a "chinese character" as anything in the CJK Unicode block:
216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
217 | #
218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
219 | # despite its name. The modern Korean Hangul alphabet is a different block,
220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
221 | # space-separated words, so they are not treated specially and handled
222 |     # like all of the other languages.
223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
224 | (cp >= 0x3400 and cp <= 0x4DBF) or #
225 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
226 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
227 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
229 | (cp >= 0xF900 and cp <= 0xFAFF) or #
230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
231 | return True
232 |
233 | return False
234 |
235 | def _clean_text(self, text):
236 | """Performs invalid character removal and whitespace cleanup on text."""
237 | output = []
238 | for char in text:
239 | cp = ord(char)
240 | if cp == 0 or cp == 0xfffd or _is_control(char):
241 | continue
242 | if _is_whitespace(char):
243 | output.append(" ")
244 | else:
245 | output.append(char)
246 | return "".join(output)
247 |
248 |
249 | class WordpieceTokenizer(object):
250 |   """Runs WordPiece tokenization."""
251 |
252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
253 | self.vocab = vocab
254 | self.unk_token = unk_token
255 | self.max_input_chars_per_word = max_input_chars_per_word
256 |
257 | def tokenize(self, text):
258 | """Tokenizes a piece of text into its word pieces.
259 |
260 | This uses a greedy longest-match-first algorithm to perform tokenization
261 | using the given vocabulary.
262 |
263 | For example:
264 | input = "unaffable"
265 | output = ["un", "##aff", "##able"]
266 |
267 | Args:
268 | text: A single token or whitespace separated tokens. This should have
269 |         already been passed through `BasicTokenizer`.
270 |
271 | Returns:
272 | A list of wordpiece tokens.
273 | """
274 |
275 | text = convert_to_unicode(text)
276 |
277 | output_tokens = []
278 | for token in whitespace_tokenize(text):
279 | chars = list(token)
280 | if len(chars) > self.max_input_chars_per_word:
281 | output_tokens.append(self.unk_token)
282 | continue
283 |
284 | is_bad = False
285 | start = 0
286 | sub_tokens = []
287 | while start < len(chars):
288 | end = len(chars)
289 | cur_substr = None
290 | while start < end:
291 | substr = "".join(chars[start:end])
292 | if start > 0:
293 | substr = "##" + substr
294 | if substr in self.vocab:
295 | cur_substr = substr
296 | break
297 | end -= 1
298 | if cur_substr is None:
299 | is_bad = True
300 | break
301 | sub_tokens.append(cur_substr)
302 | start = end
303 |
304 | if is_bad:
305 | output_tokens.append(self.unk_token)
306 | else:
307 | output_tokens.extend(sub_tokens)
308 | return output_tokens
309 |
310 |
311 | def _is_whitespace(char):
312 | """Checks whether `chars` is a whitespace character."""
313 |   # \t, \n, and \r are technically control characters but we treat them
314 | # as whitespace since they are generally considered as such.
315 | if char == " " or char == "\t" or char == "\n" or char == "\r":
316 | return True
317 | cat = unicodedata.category(char)
318 | if cat == "Zs":
319 | return True
320 | return False
321 |
322 |
323 | def _is_control(char):
324 | """Checks whether `chars` is a control character."""
325 | # These are technically control characters but we count them as whitespace
326 | # characters.
327 | if char == "\t" or char == "\n" or char == "\r":
328 | return False
329 | cat = unicodedata.category(char)
330 | if cat.startswith("C"):
331 | return True
332 | return False
333 |
334 |
335 | def _is_punctuation(char):
336 | """Checks whether `chars` is a punctuation character."""
337 | cp = ord(char)
338 | # We treat all non-letter/number ASCII as punctuation.
339 | # Characters such as "^", "$", and "`" are not in the Unicode
340 | # Punctuation class but we treat them as punctuation anyways, for
341 | # consistency.
342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
344 | return True
345 | cat = unicodedata.category(char)
346 | if cat.startswith("P"):
347 | return True
348 | return False
349 |
--------------------------------------------------------------------------------
/src/train_elmo.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import numpy as np
5 |
6 | from bilm.training import train, load_options_latest_checkpoint, load_vocab
7 | from bilm.data import BidirectionalLMDataset
8 |
9 |
10 | def main(args):
11 | # load the vocab
12 | vocab = load_vocab(args.vocab_file, None)
13 |
14 | # define the options
15 | batch_size = 512 # batch size for each GPU
16 | n_gpus = 3
17 | os.environ['CUDA_VISIBLE_DEVICES'] = '1, 2, 6'
18 |
19 |     # number of tokens in the training data (this value is for the 1B Word Benchmark)
20 | # word 8799
21 | # char 2355
22 | n_train_tokens = 768648884
23 | # n_train_tokens = 8799
24 |
25 | options = {
26 | 'bidirectional': True,
27 |
28 | # 'char_cnn': {'activation': 'relu',
29 | # 'embedding': {'dim': 16},
30 | # 'filters': [[1, 32],
31 | # [2, 32],
32 | # [3, 64],
33 | # [4, 128],
34 | # [5, 256],
35 | # [6, 512],
36 | # [7, 1024]],
37 | # 'max_characters_per_token': 50,
38 | # 'n_characters': 261,
39 | # 'n_highway': 2},
40 |
41 | 'dropout': 0.1,
42 |
43 | 'lstm': {
44 | 'cell_clip': 3,
45 | 'dim': 4096,
46 | 'n_layers': 2,
47 | 'proj_clip': 3,
48 | 'projection_dim': 512,
49 | 'use_skip_connections': True},
50 |
51 | 'all_clip_norm_val': 10.0,
52 |
53 | 'n_epochs': 10,
54 | 'n_train_tokens': n_train_tokens,
55 | 'batch_size': batch_size,
56 | 'n_tokens_vocab': vocab.size,
57 | 'unroll_steps': 20,
58 | 'n_negative_samples_batch': 1024,
59 | }
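# With the 'char_cnn' block commented out, the biLM is trained on token ids from
# the vocabulary instead of character inputs (assumption based on the bilm
# training options), which matches the use_character_inputs=False /
# embedding_weight_file setup used at inference time in the models above.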
60 |
61 | print('vocab_size:', vocab.size)
62 | prefix = args.train_prefix
63 | data = BidirectionalLMDataset(prefix, vocab, test=False,
64 | shuffle_on_load=True)
65 |
66 | tf_save_dir = args.save_dir
67 | tf_log_dir = args.save_dir
68 | train(options, data, n_gpus, tf_save_dir, tf_log_dir)
69 |
70 |
71 | if __name__ == '__main__':
72 | parser = argparse.ArgumentParser()
73 | parser.add_argument('--save_dir', help='Location of checkpoint files')
74 | parser.add_argument('--vocab_file', help='Vocabulary file')
75 | parser.add_argument('--train_prefix', help='Prefix for train files')
76 |
77 | args = parser.parse_args()
78 | main(args)
79 |
--------------------------------------------------------------------------------
/src/train_predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import pickle
4 | from config import Config
5 | import numpy as np
6 | from tqdm import tqdm
7 | from sklearn.feature_extraction.text import TfidfVectorizer
8 | from sklearn.feature_extraction.text import HashingVectorizer
9 | import logging
10 | from gensim.models.word2vec import Word2Vec
11 | from bilm import TokenBatcher
12 | from scipy.sparse import hstack
13 |
14 | import tokenization
15 | from keras.preprocessing import sequence
16 | from keras.utils import np_utils
17 | import tensorflow as tf
18 |
19 | # np.random.seed(201)
20 | # tf.set_random_seed(201)
21 |
22 | logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s')
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | def deep_data_prepare(config):
27 |     print('Preparing data for the deep models')
28 | train_df = pd.read_csv(config.TRAIN_X)
29 | train_jp = pd.read_csv(config.TRAIN_JP)
30 | train_en = pd.read_csv(config.TRAIN_EN)
31 | test_df = pd.read_csv(config.TEST_X)
32 |
33 | char_sw_list = pickle.load(open('../data/char_stopword.pkl', 'rb'))
34 | word_sw_list = pickle.load(open('../data/word_stopword.pkl', 'rb'))
35 |     # use word vectors
36 |     # use char vectors
37 | train_x_char = train_df['char']
38 | train_x_word = train_df['word']
39 | # train_x_sent_word = [w for w in open('../data/sentiment_word.txt')]
40 | # train_x_sent_char = [w for w in open('../data/sentiment_word.txt')]
41 | train_jp_char = train_jp['char']
42 | train_jp_word = train_jp['word']
43 | train_en_char = train_en['char']
44 | train_en_word = train_en['word']
45 |
46 | train_char = pd.concat((train_x_char, train_jp_char, train_en_char))
47 | train_word = pd.concat((train_x_word, train_jp_word, train_en_word))
48 | test_char = test_df['char']
49 | test_word = test_df['word']
50 |
51 | if config.data_type == 0:
52 | train_y = train_df['sub_numerical'].values
53 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes)
54 |
55 | elif config.data_type == 1:
56 | train_y = train_df['sentiment_value'].values
57 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes)
58 |
59 | elif config.data_type == 2:
60 | train_y = np.array(train_df.iloc[:, 6:].values)
61 | elif config.data_type == 3:
62 | train_y = train_df.iloc[:, 6:].values
63 | targets = train_y.reshape(-1)
64 | one_hot_targets = np.eye(config.n_classes)[targets]
65 | train_y = one_hot_targets.reshape(-1, 10, config.n_classes)
66 | elif config.data_type == 4:
67 | train_y = (train_df['sentiment_value']+1).values
68 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes)
69 | elif config.data_type == 5:
70 | train_y = train_df.iloc[:, 4:].values
71 | else:
72 |         exit('invalid data_type')
73 |
74 | UNK_CHAR = len(char_stoi)
75 | PAD_CHAR = len(char_stoi) + 1
76 |
77 | UNK_WORD = len(word_stoi)
78 | PAD_WORD = len(word_stoi) + 1
79 |
80 | def generate_hann_data(df):
81 | import re
82 | hann_train_word = np.full(shape=(len(df['word']), config.HANN_SENT, config.HANN_WORD_LEN), fill_value=PAD_WORD)
83 | hann_train_char = np.full(shape=(len(df['char']), config.HANN_SENT, config.HANN_CHAR_LEN), fill_value=PAD_CHAR)
84 |
85 | for i, sentences in enumerate(df['word']):
86 | sentences = re.split(r" 。 | , ", sentences)
87 | for j, sent in enumerate(sentences):
88 | if j < config.HANN_SENT:
89 | k = 0
90 | word_tokens = sent.split()
91 | for _, word in enumerate(word_tokens):
92 | if k < config.HANN_WORD_LEN and word not in word_sw_list and word in word_stoi:
93 | hann_train_word[i, j, k] = word_stoi[word]
94 | k += 1
95 |
96 | for i, sentences in enumerate(df['char']):
97 | sentences = re.split(r" 。 | , ", sentences)
98 | for j, sent in enumerate(sentences):
99 | if j < config.HANN_SENT:
100 | k = 0
101 | word_tokens = sent.split()
102 | for _, word in enumerate(word_tokens):
103 | if k < config.HANN_CHAR_LEN and word not in char_sw_list and word in char_stoi:
104 | hann_train_char[i, j, k] = char_stoi[word]
105 | k += 1
106 | return hann_train_word, hann_train_char
107 |
108 | hann_train_word, hann_train_char = generate_hann_data(train_df)
109 | hann_test_word, hann_test_char = generate_hann_data(test_df)
110 |
111 | def word2id(train_dialogs, type='char'):
112 | if type == 'char':
113 | stoi = char_stoi
114 | max_len = config.CHAR_MAXLEN
115 | UNK = UNK_CHAR
116 | sw_list = set(char_sw_list)
117 | elif type == 'word':
118 | stoi = word_stoi
119 | max_len = config.WORD_MAXLEN
120 | UNK = UNK_WORD
121 | sw_list = set(word_sw_list)
122 | else:
123 |             exit('invalid feature type')
124 |
125 | train_x = []
126 | for d in tqdm(train_dialogs):
127 | d = str(d).split()
128 | line = []
129 | for token in d:
130 | if token in sw_list\
131 | or token == ''\
132 | or token == ' ':
133 | continue
134 | if token in stoi:
135 | line.append(stoi[token])
136 | else:
137 | line.append(UNK)
138 |
139 | train_x.append(line[:max_len])
140 | return train_x
141 |
142 |     # data for the standard models
143 | train_x_word = word2id(train_word, type='word')
144 | train_x_char = word2id(train_char, type='char')
145 | test_x_char = word2id(test_char, type='char')
146 | test_x_word = word2id(test_word, type='word')
147 |
148 | # train_x_sent_word = word2id(train_x_sent_word, type='word')
149 | # train_x_sent_char = word2id(train_x_sent_char, type='char')
150 |     # data for the RCNN model: reuse the UNK names as PAD ids for the shifting/padding below
151 |     UNK_CHAR = PAD_CHAR
152 |     UNK_WORD = PAD_WORD
153 |
154 | train_word_left = [[UNK_WORD] + w[:-1] for w in train_x_word]
155 | train_word_right = [w[1:] + [UNK_WORD] for w in train_x_word]
156 | train_char_left = [[UNK_CHAR] + w[:-1] for w in train_x_char]
157 | train_char_right = [w[1:] + [UNK_CHAR] for w in train_x_char]
158 |
159 | test_word_left = [[UNK_WORD] + w[:-1] for w in test_x_word]
160 | test_word_right = [w[1:] + [UNK_WORD] for w in test_x_word]
161 | test_char_left = [[UNK_CHAR] + w[:-1] for w in test_x_char]
162 | test_char_right = [w[1:] + [UNK_CHAR] for w in test_x_char]
163 |
164 | train_x_char = sequence.pad_sequences(train_x_char, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR)
165 | train_x_word = sequence.pad_sequences(train_x_word, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD)
166 | train_x_char_left = sequence.pad_sequences(train_char_left, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR)
167 | train_x_word_left = sequence.pad_sequences(train_word_left, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD)
168 | train_x_char_right = sequence.pad_sequences(train_char_right, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR)
169 | train_x_word_right = sequence.pad_sequences(train_word_right, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD)
170 |
171 | test_x_char = sequence.pad_sequences(test_x_char, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR)
172 | test_x_word = sequence.pad_sequences(test_x_word, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD)
173 | test_x_char_left = sequence.pad_sequences(test_char_left, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR)
174 | test_x_word_left = sequence.pad_sequences(test_word_left, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD)
175 | test_x_char_right = sequence.pad_sequences(test_char_right, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR)
176 | test_x_word_right = sequence.pad_sequences(test_word_right, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD)
177 |
178 | print('train_x char shape is: ', train_x_char.shape)
179 | print('train_x word shape is: ', train_x_word.shape)
180 | print('test_x char shape is: ', test_x_char.shape)
181 | print('test_x word shape is: ', test_x_word.shape)
182 |
183 | train = {}
184 | test = {}
185 | # tokenizer = tokenization.FullTokenizer(
186 | # vocab_file=config.BERT_VOCAB_FILES, do_lower_case=False)
187 |
188 | # def get_bert_data(corpus):
189 | # input_ids = []
190 | # input_mask = []
191 | # input_segment_ids = []
192 |
193 |     #     for sent in corpus:  # iterate the passed-in corpus, not train_df
194 | # sent = ''.join(sent.strip().split())
195 | # tmp_token_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(sent)[:188] + ['[SEP]'])
196 | # tmp_mask = [1] * len(tmp_token_ids)
197 | # tmp_segment_ids = [0] * len(tmp_token_ids)
198 | # if len(tmp_token_ids) < 190:
199 | # tmp_segment_ids.extend([0] * (190-len(tmp_token_ids)))
200 | # tmp_mask.extend([0] * (190-len(tmp_token_ids)))
201 | # tmp_token_ids.extend([0] * (190-len(tmp_token_ids)))
202 | # input_ids.append(tmp_token_ids)
203 | # input_mask.append(tmp_mask)
204 | # input_segment_ids.append(tmp_segment_ids)
205 | # return np.array(input_ids, dtype='int32'), np.array(input_mask, dtype='int32'), np.array(input_segment_ids, dtype='int32')
206 |
207 | # train['token_id'], train['mask_id'], train['type_id'] = get_bert_data(train_df['word'].values)
208 | # test['token_id'], test['mask_id'], test['type_id'] = get_bert_data(test_df['word'].values)
209 |
210 | train['word'] = train_x_word
211 | train['char'] = train_x_char
212 | # train['word_sent'] = train_x_sent_word
213 | # train['char_sent'] = train_x_sent_char
214 | # rcnn
215 | train['word_left'] = train_x_word_left
216 | train['word_right'] = train_x_word_right
217 | train['char_left'] = train_x_char_left
218 | train['char_right'] = train_x_char_right
219 | # han
220 | train['hann_word'] = hann_train_word
221 | train['hann_char'] = hann_train_char
222 |
223 | test['word'] = test_x_word
224 | test['char'] = test_x_char
225 | test['word_left'] = test_x_word_left
226 | test['word_right'] = test_x_word_right
227 | test['char_left'] = test_x_char_left
228 | test['char_right'] = test_x_char_right
229 | test['hann_word'] = hann_test_word
230 | test['hann_char'] = hann_test_char
231 |
232 | assert train['word_left'].shape == train['word_right'].shape == train['word'].shape
233 | assert train['char_left'].shape == train['char_right'].shape == train['char'].shape
234 | assert test['word_left'].shape == test['word_right'].shape == test['word'].shape
235 | assert test['char_left'].shape == test['char_right'].shape == test['char'].shape
236 |
237 | # batcher = TokenBatcher(config.elmo_word_vocab_file)
238 | # train['elmo_word'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
239 | # test['elmo_word'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])
240 |
241 | # batcher = TokenBatcher(config.elmo_char_vocab_file)
242 | # train['elmo_char'] = batcher.batch_sentences([str(w).split()[:config.CHAR_MAXLEN] for w in train_df['char']])
243 | # test['elmo_char'] = batcher.batch_sentences([str(w).split()[:config.CHAR_MAXLEN] for w in test_df['char']])
244 |
245 | # batcher = TokenBatcher(config.elmo_qiuqiu_vocab_file)
246 | # train['elmo_qiuqiu'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
247 | # test['elmo_qiuqiu'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])
248 |
249 | return train, train_y, test
250 |
251 |
252 | def init_embedding(config, type='word'):
253 | model_file = config.word_w2v_file if type == 'word' else config.char_w2v_file
254 | item_to_id = word_stoi if type == 'word' else char_stoi
255 | vocab_len = len(item_to_id) + 2
256 |     print('Vocabulary size:', vocab_len)
257 | print('create embedding matrix')
258 |
259 | def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
260 | embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(model_file).readlines()[1:])
261 |
262 | all_embs = np.stack(embeddings_index.values())
263 | embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(vocab_len, config.EMBED_SIZE)).astype(dtype='float32')
264 | embed_matrix[-1] = 0 # padding
265 |
266 | for word, i in tqdm(item_to_id.items()):
267 | embedding_vector = embeddings_index.get(word)
268 | if embedding_vector is not None:
269 | embed_matrix[i] = embedding_vector
270 | return embed_matrix
271 |
272 |
273 | def deep_data_cache():
274 | char_w2v_embed = init_embedding(config, type='char')
275 | word_w2v_embed = init_embedding(config, type='word')
276 |
277 | train, train_y, test = deep_data_prepare(config)
278 | os.makedirs('../data/cache/', exist_ok=True)
279 | pickle.dump((train, train_y, test, char_w2v_embed, word_w2v_embed), open('../data/cache/deep_data_oe{}_es{}_dt{}_f{}.pkl'.format(config.outer_embed, config.EMBED_SIZE, config.data_type, config.main_feature), 'wb'))
280 |
281 |
282 | def deep_data_process():
283 | deep_data_cache()
284 | (train, train_y, test, char_w2v_embed, word_w2v_embed) = pickle.load(open('../data/cache/deep_data_oe{}_es{}_dt{}_f{}.pkl'.format(config.outer_embed, config.EMBED_SIZE, config.data_type, config.main_feature), 'rb'))
285 | config.char_embedding = char_w2v_embed
286 | config.word_embedding = word_w2v_embed
287 |
288 | model = config.model[args.model](config=config, n_folds=5)
289 | if config.data_type == 0:
290 | model.single_train_predict(train, train_y, test, option=config.option)
291 | elif config.data_type == 1:
292 | model.single_train_predict(train, train_y, test, option=config.option)
293 |
294 | elif config.data_type == 2:
295 | model.multi_train_predict(train, train_y, test, option=config.option)
296 | elif config.data_type == 3:
297 | model.four_classify_train_predict(train, train_y, test, option=config.option)
298 | # # model.multi_train_predict(train, train_y, test, option=config.option)
299 | # elif config.data_type == 4:
300 | # model.single_train_predict(train, train_y, test, option=config.option)
301 | # elif config.data_type == 5:
302 | # model.multi_train_predict(train, train_y, test, option=config.option)
303 |
304 | else:
305 |         exit('invalid data_type')
306 |
307 |
308 | def static_data_prepare():
309 | model_name = config.model_name
310 | if not model_name:
311 | model_name = "model_dict.pkl"
312 | logger.info('start load data')
313 | train_df = pd.read_csv(config.TRAIN_MULTI_X)
314 | test_df = pd.read_csv(config.TEST_X)
315 |     if model_name == 'svc':
316 | content_word = pd.concat((train_df['word'], test_df['word']))
317 | content_char = pd.concat((train_df['char'], test_df['char']))
318 | word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=5, norm='l2')
319 | char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), min_df=1, norm='l2')
320 |
321 | ha = HashingVectorizer(ngram_range=(1, 1), lowercase=False)
322 | discuss_ha = ha.fit_transform(content_word)
323 |
324 | logger.info('start word feature extraction')
325 | word_feature = word_vectorizer.fit_transform(content_word)
326 |         logger.info("word feature extraction complete")
327 | logger.info("vocab len: %d" % len(word_vectorizer.vocabulary_.keys()))
328 |
329 | logger.info('start char feature extraction')
330 | char_feature = char_vectorizer.fit_transform(content_char)
331 |         logger.info("char feature extraction complete")
332 | logger.info("vocab len: %d" % len(char_vectorizer.vocabulary_.keys()))
333 |
334 |         # train_feature = hstack([word_feature[:len(train_df)], char_feature[:len(train_df)]]).tocsr()  # word + char tf-idf (overwritten below, kept as an alternative)
335 |         # test_feature = hstack([word_feature[len(train_df):], char_feature[len(train_df):]]).tocsr()
336 |
337 |         # train_feature = hstack((word_feature[:len(train_df)], discuss_ha[:len(train_df)])).tocsr()  # word tf-idf + hashing (overwritten below, kept as an alternative)
338 |         # test_feature = hstack((word_feature[len(train_df):], discuss_ha[len(train_df):])).tocsr()
339 |
340 |         train_feature = word_feature[:len(train_df)]  # only the word tf-idf features are actually used
341 |         test_feature = word_feature[len(train_df):]
342 |
343 |         logger.info("feature matrices built")
344 | logger.info("train feature shape: {}".format(np.shape(train_feature)))
345 | logger.info("test feature shape: {}".format(np.shape(test_feature)))
346 |
347 | train_y = np.array(train_df.iloc[:, 6:].values)
348 | else:
349 | train_feature = np.asarray([train_df['word']]).T
350 | train_y = np.array(train_df.iloc[:, 6:].values)
351 | test_feature = np.asarray([test_df['word']]).T
352 | return train_feature, train_y, test_feature
353 |
354 |
355 | def static_data_process():
356 | # model train
357 | train_x, train_y, test = static_data_prepare()
358 | model = config.model[args.model](config=config, n_folds=5)
359 | model.train_predict(train_x, train_y, test, option=config.option)
360 |
361 |
362 | if __name__ == '__main__':
363 | import argparse
364 | parser = argparse.ArgumentParser()
365 | parser.add_argument('--gpu', type=str, default='6')
366 |     parser.add_argument('--model', type=str, help='model name (key into config.model)')
367 |     parser.add_argument('--option', type=int, default=1, help='training mode')
368 |     parser.add_argument('--epoch', type=int, default=10)
369 |     parser.add_argument('--data_type', type=int, default=3, help='problem mode: 0 single-subject, 1 single-sentiment, 2 ten four-way classifiers, 3 aspect-based (asp)')
370 |     parser.add_argument('--feature', default='word', type=str, help='use word or char as the main feature')
371 |     parser.add_argument('--es', default=300, type=int, help='embed size')
372 |     parser.add_argument('--debug', default=False, action='store_true', help='debug mode runs only one fold')
373 |     parser.add_argument('--oe', default=False, action='store_true', help='use pretrained Baidu Baike word vectors')
374 |     parser.add_argument('--ml', default=False, action='store_true', help='use traditional (non-deep) models')
375 |     parser.add_argument('--car', default=False, action='store_true', help='use word vectors trained on Autohome (汽车之家) data')
376 |     parser.add_argument('--balance', default=False, action='store_true', help='reweight the loss by class sample ratio')
377 | parser.add_argument('--bs', default=64, type=int, help='batch size')
378 | args = parser.parse_args()
379 |
380 |     # set the keras backend and GPU
381 | # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
382 |
383 | config = Config()
384 | config.option = args.option
385 | config.outer_embed = args.oe
386 | config.n_epochs = args.epoch
387 | config.main_feature = args.feature
388 | config.model_name = args.model
389 | config.is_debug = args.debug
390 | config.BATCH_SIZE = args.bs
391 | config.gpu = args.gpu
392 | config.EMBED_SIZE = args.es
393 | config.data_type = args.data_type
394 | config.car = args.car
395 | config.balance = args.balance
396 |
397 | if config.model_name in ['svc', 'fasttext']:
398 | args.ml = True
399 |
400 | if args.ml:
401 | static_data_process()
402 | else:
403 | char_stoi = pickle.load(open(config.char_stoi_file, 'rb'))
404 | word_stoi = pickle.load(open(config.word_stoi_file, 'rb'))
405 |
406 | deep_data_process()
407 |
408 |
--------------------------------------------------------------------------------
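For readers skimming `deep_data_prepare` in `train_predict.py`: the `*_left` / `*_right` inputs built for the RCNN model are just the token-id sequence shifted one position right or left and padded with the PAD id (the code reassigns `UNK_WORD` / `UNK_CHAR` to the PAD ids right before this step). A toy sketch with made-up ids:

```
# Toy illustration of the RCNN left/right context inputs; the ids are invented
# and PAD stands in for the reassigned UNK_WORD / UNK_CHAR value.
PAD = 0
seq = [5, 8, 3, 9]                 # one tokenised sentence

left = [PAD] + seq[:-1]            # id of the token to the left of each position
right = seq[1:] + [PAD]            # id of the token to the right of each position

print(left)    # [0, 5, 8, 3]
print(right)   # [8, 3, 9, 0]
```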