├── .gitignore
├── README.md
├── data
│   └── placeholder
├── model.jpeg
└── src
    ├── bilm
    │   ├── __init__.py
    │   ├── data.py
    │   ├── elmo.py
    │   ├── model.py
    │   └── training.py
    ├── config.py
    ├── model
    │   ├── __init__.py
    │   ├── attention.py
    │   ├── bilstm_model.py
    │   ├── capsule_model.py
    │   ├── convlstm_model.py
    │   ├── dpcnn_model.py
    │   ├── han_model.py
    │   ├── hybrid_nn_1.py
    │   ├── lightgbm_model.py
    │   ├── lstmconv_model.py
    │   ├── lstmgru_model.py
    │   ├── ml_models.py
    │   ├── model_basic.py
    │   ├── model_component.py
    │   ├── modeling.py
    │   ├── my_callbacks.py
    │   ├── rcnn_model.py
    │   ├── snapshot.py
    │   ├── textcnn_model.py
    │   └── xgboost_model.py
    ├── pack_sub_dt2.py
    ├── preprocess
    │   ├── .ipynb_checkpoints
    │   │   └── EDA-checkpoint.ipynb
    │   ├── EDA.ipynb
    │   ├── word_tests.txt
    │   └── words.txt
    ├── stacking.py
    ├── tokenization.py
    ├── train_elmo.py
    └── train_predict.py

/.gitignore:
--------------------------------------------------------------------------------
1 | ckpt*/
2 | ./src/bilm/dump/
3 | ./src/bilm/result/
4 | ./src/runs/
5 | data/
6 | backup/
7 | src/loss/
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | bin/
17 | build/
18 | develop-eggs/
19 | dist/
20 | eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # Installer logs
31 | pip-log.txt
32 | pip-delete-this-directory.txt
33 | 
34 | # Unit test / coverage reports
35 | .tox/
36 | .coverage
37 | .cache
38 | nosetests.xml
39 | coverage.xml
40 | 
41 | # Translations
42 | *.mo
43 | 
44 | # Mr Developer
45 | .mr.developer.cfg
46 | .project
47 | .pydevproject
48 | 
49 | # Rope
50 | .ropeproject
51 | 
52 | # Django stuff:
53 | *.log
54 | *.pot
55 | 
56 | # Sphinx documentation
57 | docs/_build/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CCF-BDCI 2018 Automotive-Domain ASC Challenge
2 | 
3 | We had never worked on ASC/TSC before, and at first we went back and forth on whether this is a single-label or a multi-label classification problem, which cost us some detours. In the end we returned to the ASC formulation and, guided by intuition, designed a memory-based LSTM-attention model. It scored around 0.69 on the online B leaderboard of the final round, and the final ensemble reached 0.70. The single-model architecture is shown below:
4 | 
5 | ![](./model.jpeg)
6 | 
7 | 
8 | Time was tight towards the end, and our reimplementations of this year's ASC papers all underperformed, so we finished 6th out of 1701. The approach is exactly what the code shows, and it is quite simple.
9 | 
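To make the diagram above easier to map to code, here is a minimal NumPy sketch of the memory/aspect-attention step: each of the 10 aspects owns a learned vector, scores the BiLSTM/BiGRU token states with a small tanh attention, and classifies its attention-weighted summary into the 4 sentiment classes. Shapes and variable names are illustrative only; the actual TensorFlow implementation is `BilstmV0` in `src/model/bilstm_model.py`, where the scoring weights (`att_w`, `att_b`, `att_v`) are shared across aspects and everything runs batched.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

T, H, N_ASPECT, N_CLASS = 128, 300, 10, 4       # tokens, hidden size, aspects, polarities
states = np.random.randn(T, H)                  # BiLSTM/BiGRU token states (projected to H dims)
aspects = np.random.randn(N_ASPECT, H)          # one learned "memory" vector per aspect
W = np.random.randn(2 * H, H)                   # attention projection over [state; aspect]
b = np.zeros(H)
v = np.random.randn(H)
W_out = np.random.randn(2 * H, N_CLASS)         # output layer over [context; aspect]

logits = []
for a in aspects:
    pair = np.concatenate([states, np.tile(a, (T, 1))], axis=1)  # [T, 2H]
    score = np.tanh(pair @ W + b) @ v                            # [T] attention scores
    alpha = softmax(score)                                       # attention over tokens
    ctx = alpha @ states                                         # aspect-specific summary, [H]
    logits.append(np.concatenate([ctx, a]) @ W_out)              # [N_CLASS]
probs = softmax(np.stack(logits), axis=-1)                       # [N_ASPECT, N_CLASS]
```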
10 | The raw data can be downloaded from the [competition data page](https://www.datafountain.cn/competitions/310/details/data-evaluation). Because we modeled the problem in several different ways, there is quite a lot of preprocessing code, so I have uploaded a preprocessed copy of the data (including extracted BERT features and Baidu Baike word vectors) to [Baidu Netdisk](https://pan.baidu.com/s/1ZrgQ6Wp_sFRPrZGjZiBPaA); after downloading, unzip it into the `data/` directory.
11 | 
12 | Neither HIT's PyTorch-based pretrained ELMo nor the ELMo I pretrained with TensorFlow on the training set worked well, but the TensorFlow pretraining code is kept in the repo.
13 | 
14 | We did not fine-tune BERT; we only extracted features from it, which performed on par with the Baidu Baike word vectors.
15 | 
16 | If you have any ideas, feel free to open an issue or a pull request, or discuss with me directly over WeChat. Let's learn and improve together.
17 | 
18 | 
19 | ### 1. Environment
20 | 
21 | |Environment / Library|Version|
22 | |:---------:|----------|
23 | |Ubuntu|16.04.5 LTS|
24 | |python|3.6|
25 | |jupyter notebook|4.2.3|
26 | |tensorflow-gpu|1.9.1|
27 | |numpy|1.14.1|
28 | |pandas|0.23.0|
29 | |matplotlib|2.2.2|
30 | |tqdm|4.24.0|
31 | 
32 | The most important point is that we use the CuDNN implementation of LSTM, so TensorFlow must be newer than 1.4.0; accordingly, CUDA 8.0 will not work and 9.0 or above is required.
33 | 
34 | 
35 | ### 2. Data preprocessing
36 | 
37 | It is all written in `jupyter`: run `src/preprocess/EDA.ipynb` to generate the various files. It is worth reading to follow our approach, but we recommend simply downloading the preprocessed results from the cloud drive.
38 | 
39 | 
40 | ### 3. Deep model training
41 | 
42 | Once the data is preprocessed you can train models directly; training runs on a single GPU. Pick a model from `src/config.py`, and see `src/train_predict.py` for what each argument means:
43 | 
44 | ```
45 | python train_predict.py --gpu 7 --model aspv0 --feature word --epoch 20 --bs 128 --oe
46 | ```
47 | 
48 | 
49 | ### 4. Model ensembling and output
50 | 
51 | ```
52 | python stacking.py --gpu 1 --data_type 3
53 | ```
54 | 
55 | `stacking` and pseudo-labeling are done together in this step; modify the code to choose whether to use pseudo labels.
56 | 
57 | This dataset suits it fairly well, and pseudo labels provide a modest score boost.
58 | 
59 | ### 5. Submission
60 | 
61 | Edit `pre_path` in `src/pack_sub_dt2.py` to point to the probability files produced by stacking, then run
62 | 
63 | ```
64 | python pack_sub_dt2.py
65 | ```
66 | 
67 | to generate the submission file.
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
--------------------------------------------------------------------------------
/data/placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/data/placeholder
--------------------------------------------------------------------------------
/model.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/model.jpeg
--------------------------------------------------------------------------------
/src/bilm/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .data import Batcher, TokenBatcher
3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \
4 |     dump_bilm_embeddings
5 | from .elmo import weight_layers
6 | 
--------------------------------------------------------------------------------
/src/bilm/data.py:
--------------------------------------------------------------------------------
1 | # originally based on https://github.com/tensorflow/models/tree/master/lm_1b
2 | import glob
3 | import random
4 | 
5 | import numpy as np
6 | 
7 | from typing import List
8 | 
9 | 
10 | class Vocabulary(object):
11 |     '''
12 |     A token vocabulary.  Holds a map from token to ids and provides
13 |     a method for encoding text to a sequence of ids.
14 |     '''
15 |     def __init__(self, filename, validate_file=False):
16 |         '''
17 |         filename = the vocabulary file.  It is a flat text file with one
18 |             (normalized) token per line.  In addition, the file should also
19 |             contain the special tokens <S>, </S>, <UNK> (case sensitive).
20 |         '''
21 |         self._id_to_word = []
22 |         self._word_to_id = {}
23 |         self._unk = -1
24 |         self._bos = -1
25 |         self._eos = -1
26 | 
27 |         with open(filename) as f:
28 |             idx = 0
29 |             for line in f:
30 |                 word_name = line.strip()
31 |                 if word_name == '<S>':
32 |                     self._bos = idx
33 |                 elif word_name == '</S>':
34 |                     self._eos = idx
35 |                 elif word_name == '<UNK>':
36 |                     self._unk = idx
37 |                 if word_name == '!!!MAXTERMID':
38 |                     continue
39 | 
40 |                 self._id_to_word.append(word_name)
41 |                 self._word_to_id[word_name] = idx
42 |                 idx += 1
43 | 
44 |         # check to ensure file has special tokens
45 |         if validate_file:
46 |             if self._bos == -1 or self._eos == -1 or self._unk == -1:
47 |                 raise ValueError("Ensure the vocabulary file has "
48 |                                  "<S>, </S>, <UNK> tokens")
49 | 
50 |     @property
51 |     def bos(self):
52 |         return self._bos
53 | 
54 |     @property
55 |     def eos(self):
56 |         return self._eos
57 | 
58 |     @property
59 |     def unk(self):
60 |         return self._unk
61 | 
62 |     @property
63 |     def size(self):
64 |         return len(self._id_to_word)
65 | 
66 |     def word_to_id(self, word):
67 |         if word in self._word_to_id:
68 |             return self._word_to_id[word]
69 |         return self.unk
70 | 
71 |     def id_to_word(self, cur_id):
72 |         return self._id_to_word[cur_id]
73 | 
74 |     def decode(self, cur_ids):
75 |         """Convert a list of ids to a sentence, with space inserted."""
76 |         return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])
77 | 
78 |     def encode(self, sentence, reverse=False, split=True):
79 |         """Convert a sentence to a list of ids, with special tokens added.
80 |         Sentence is a single string with tokens separated by whitespace.
81 | 82 | If reverse, then the sentence is assumed to be reversed, and 83 | this method will swap the BOS/EOS tokens appropriately.""" 84 | 85 | if split: 86 | word_ids = [ 87 | self.word_to_id(cur_word) for cur_word in sentence.split() 88 | ] 89 | else: 90 | word_ids = [self.word_to_id(cur_word) for cur_word in sentence] 91 | 92 | if reverse: 93 | return np.array([self.eos] + word_ids + [self.bos], dtype=np.int32) 94 | else: 95 | return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32) 96 | 97 | 98 | class UnicodeCharsVocabulary(Vocabulary): 99 | """Vocabulary containing character-level and word level information. 100 | 101 | Has a word vocabulary that is used to lookup word ids and 102 | a character id that is used to map words to arrays of character ids. 103 | 104 | The character ids are defined by ord(c) for c in word.encode('utf-8') 105 | This limits the total number of possible char ids to 256. 106 | To this we add 5 additional special ids: begin sentence, end sentence, 107 | begin word, end word and padding. 108 | 109 | WARNING: for prediction, we add +1 to the output ids from this 110 | class to create a special padding id (=0). As a result, we suggest 111 | you use the `Batcher`, `TokenBatcher`, and `LMDataset` classes instead 112 | of this lower level class. If you are using this lower level class, 113 | then be sure to add the +1 appropriately, otherwise embeddings computed 114 | from the pre-trained model will be useless. 115 | """ 116 | def __init__(self, filename, max_word_length, **kwargs): 117 | super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs) 118 | self._max_word_length = max_word_length 119 | 120 | # char ids 0-255 come from utf-8 encoding bytes 121 | # assign 256-300 to special chars 122 | self.bos_char = 256 # 123 | self.eos_char = 257 # 124 | self.bow_char = 258 # 125 | self.eow_char = 259 # 126 | self.pad_char = 260 # 127 | 128 | num_words = len(self._id_to_word) 129 | 130 | self._word_char_ids = np.zeros([num_words, max_word_length], 131 | dtype=np.int32) 132 | 133 | # the charcter representation of the begin/end of sentence characters 134 | def _make_bos_eos(c): 135 | r = np.zeros([self.max_word_length], dtype=np.int32) 136 | r[:] = self.pad_char 137 | r[0] = self.bow_char 138 | r[1] = c 139 | r[2] = self.eow_char 140 | return r 141 | self.bos_chars = _make_bos_eos(self.bos_char) 142 | self.eos_chars = _make_bos_eos(self.eos_char) 143 | 144 | for i, word in enumerate(self._id_to_word): 145 | self._word_char_ids[i] = self._convert_word_to_char_ids(word) 146 | 147 | self._word_char_ids[self.bos] = self.bos_chars 148 | self._word_char_ids[self.eos] = self.eos_chars 149 | # TODO: properly handle 150 | 151 | @property 152 | def word_char_ids(self): 153 | return self._word_char_ids 154 | 155 | @property 156 | def max_word_length(self): 157 | return self._max_word_length 158 | 159 | def _convert_word_to_char_ids(self, word): 160 | code = np.zeros([self.max_word_length], dtype=np.int32) 161 | code[:] = self.pad_char 162 | 163 | word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)] 164 | code[0] = self.bow_char 165 | for k, chr_id in enumerate(word_encoded, start=1): 166 | code[k] = chr_id 167 | code[k + 1] = self.eow_char 168 | 169 | return code 170 | 171 | def word_to_char_ids(self, word): 172 | if word in self._word_to_id: 173 | return self._word_char_ids[self._word_to_id[word]] 174 | else: 175 | return self._convert_word_to_char_ids(word) 176 | 177 | def encode_chars(self, sentence, reverse=False, split=True): 178 | ''' 179 | 
Encode the sentence as a white space delimited string of tokens. 180 | ''' 181 | if split: 182 | chars_ids = [self.word_to_char_ids(cur_word) 183 | for cur_word in sentence.split()] 184 | else: 185 | chars_ids = [self.word_to_char_ids(cur_word) 186 | for cur_word in sentence] 187 | if reverse: 188 | return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars]) 189 | else: 190 | return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars]) 191 | 192 | 193 | class Batcher(object): 194 | ''' 195 | Batch sentences of tokenized text into character id matrices. 196 | ''' 197 | def __init__(self, lm_vocab_file: str, max_token_length: int): 198 | ''' 199 | lm_vocab_file = the language model vocabulary file (one line per 200 | token) 201 | max_token_length = the maximum number of characters in each token 202 | ''' 203 | self._lm_vocab = UnicodeCharsVocabulary( 204 | lm_vocab_file, max_token_length 205 | ) 206 | self._max_token_length = max_token_length 207 | 208 | def batch_sentences(self, sentences: List[List[str]]): 209 | ''' 210 | Batch the sentences as character ids 211 | Each sentence is a list of tokens without or , e.g. 212 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 213 | ''' 214 | n_sentences = len(sentences) 215 | max_length = max(len(sentence) for sentence in sentences) + 2 216 | 217 | X_char_ids = np.zeros( 218 | (n_sentences, max_length, self._max_token_length), 219 | dtype=np.int64 220 | ) 221 | 222 | for k, sent in enumerate(sentences): 223 | length = len(sent) + 2 224 | char_ids_without_mask = self._lm_vocab.encode_chars( 225 | sent, split=False) 226 | # add one so that 0 is the mask value 227 | X_char_ids[k, :length, :] = char_ids_without_mask + 1 228 | 229 | return X_char_ids 230 | 231 | 232 | class TokenBatcher(object): 233 | ''' 234 | Batch sentences of tokenized text into token id matrices. 235 | ''' 236 | def __init__(self, lm_vocab_file: str): 237 | ''' 238 | lm_vocab_file = the language model vocabulary file (one line per 239 | token) 240 | ''' 241 | self._lm_vocab = Vocabulary(lm_vocab_file) 242 | 243 | def batch_sentences(self, sentences: List[List[str]]): 244 | ''' 245 | Batch the sentences as character ids 246 | Each sentence is a list of tokens without or , e.g. 
247 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 248 | ''' 249 | n_sentences = len(sentences) 250 | max_length = max(len(sentence) for sentence in sentences) + 2 251 | 252 | X_ids = np.zeros((n_sentences, max_length), dtype=np.int64) 253 | 254 | for k, sent in enumerate(sentences): 255 | length = len(sent) + 2 256 | ids_without_mask = self._lm_vocab.encode(sent, split=False) 257 | # add one so that 0 is the mask value 258 | X_ids[k, :length] = ids_without_mask + 1 259 | 260 | return X_ids 261 | 262 | 263 | ##### for training 264 | def _get_batch(generator, batch_size, num_steps, max_word_length): 265 | """Read batches of input.""" 266 | cur_stream = [None] * batch_size 267 | 268 | no_more_data = False 269 | while True: 270 | inputs = np.zeros([batch_size, num_steps], np.int32) 271 | if max_word_length is not None: 272 | char_inputs = np.zeros([batch_size, num_steps, max_word_length], 273 | np.int32) 274 | else: 275 | char_inputs = None 276 | targets = np.zeros([batch_size, num_steps], np.int32) 277 | 278 | for i in range(batch_size): 279 | cur_pos = 0 280 | 281 | while cur_pos < num_steps: 282 | if cur_stream[i] is None or len(cur_stream[i][0]) <= 1: 283 | try: 284 | cur_stream[i] = list(next(generator)) 285 | except StopIteration: 286 | # No more data, exhaust current streams and quit 287 | no_more_data = True 288 | break 289 | 290 | how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos) 291 | next_pos = cur_pos + how_many 292 | 293 | inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many] 294 | if max_word_length is not None: 295 | char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][ 296 | :how_many] 297 | targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1] 298 | 299 | cur_pos = next_pos 300 | 301 | cur_stream[i][0] = cur_stream[i][0][how_many:] 302 | if max_word_length is not None: 303 | cur_stream[i][1] = cur_stream[i][1][how_many:] 304 | 305 | if no_more_data: 306 | # There is no more data. Note: this will not return data 307 | # for the incomplete batch 308 | break 309 | 310 | X = {'token_ids': inputs, 'tokens_characters': char_inputs, 311 | 'next_token_id': targets} 312 | 313 | yield X 314 | 315 | class LMDataset(object): 316 | """ 317 | Hold a language model dataset. 318 | 319 | A dataset is a list of tokenized files. Each file contains one sentence 320 | per line. Each sentence is pre-tokenized and white space joined. 321 | """ 322 | def __init__(self, filepattern, vocab, reverse=False, test=False, 323 | shuffle_on_load=False): 324 | ''' 325 | filepattern = a glob string that specifies the list of files. 326 | vocab = an instance of Vocabulary or UnicodeCharsVocabulary 327 | reverse = if True, then iterate over tokens in each sentence in reverse 328 | test = if True, then iterate through all data once then stop. 329 | Otherwise, iterate forever. 330 | shuffle_on_load = if True, then shuffle the sentences after loading. 
331 | ''' 332 | self._vocab = vocab 333 | self._all_shards = glob.glob(filepattern) 334 | print('Found %d shards at %s' % (len(self._all_shards), filepattern)) 335 | self._shards_to_choose = [] 336 | 337 | self._reverse = reverse 338 | self._test = test 339 | self._shuffle_on_load = shuffle_on_load 340 | self._use_char_inputs = hasattr(vocab, 'encode_chars') 341 | 342 | self._ids = self._load_random_shard() 343 | 344 | def _choose_random_shard(self): 345 | if len(self._shards_to_choose) == 0: 346 | self._shards_to_choose = list(self._all_shards) 347 | random.shuffle(self._shards_to_choose) 348 | shard_name = self._shards_to_choose.pop() 349 | return shard_name 350 | 351 | def _load_random_shard(self): 352 | """Randomly select a file and read it.""" 353 | if self._test: 354 | if len(self._all_shards) == 0: 355 | # we've loaded all the data 356 | # this will propogate up to the generator in get_batch 357 | # and stop iterating 358 | raise StopIteration 359 | else: 360 | shard_name = self._all_shards.pop() 361 | else: 362 | # just pick a random shard 363 | shard_name = self._choose_random_shard() 364 | 365 | ids = self._load_shard(shard_name) 366 | self._i = 0 367 | self._nids = len(ids) 368 | return ids 369 | 370 | def _load_shard(self, shard_name): 371 | """Read one file and convert to ids. 372 | 373 | Args: 374 | shard_name: file path. 375 | 376 | Returns: 377 | list of (id, char_id) tuples. 378 | """ 379 | print('Loading data from: %s' % shard_name) 380 | with open(shard_name) as f: 381 | sentences_raw = f.readlines() 382 | 383 | if self._reverse: 384 | sentences = [] 385 | for sentence in sentences_raw: 386 | splitted = sentence.split() 387 | splitted.reverse() 388 | sentences.append(' '.join(splitted)) 389 | else: 390 | sentences = sentences_raw 391 | 392 | if self._shuffle_on_load: 393 | random.shuffle(sentences) 394 | 395 | ids = [self.vocab.encode(sentence, self._reverse) 396 | for sentence in sentences] 397 | if self._use_char_inputs: 398 | chars_ids = [self.vocab.encode_chars(sentence, self._reverse) 399 | for sentence in sentences] 400 | else: 401 | chars_ids = [None] * len(ids) 402 | 403 | print('Loaded %d sentences.' 
% len(ids)) 404 | print('Finished loading') 405 | return list(zip(ids, chars_ids)) 406 | 407 | def get_sentence(self): 408 | while True: 409 | if self._i == self._nids: 410 | self._ids = self._load_random_shard() 411 | ret = self._ids[self._i] 412 | self._i += 1 413 | yield ret 414 | 415 | @property 416 | def max_word_length(self): 417 | if self._use_char_inputs: 418 | return self._vocab.max_word_length 419 | else: 420 | return None 421 | 422 | def iter_batches(self, batch_size, num_steps): 423 | for X in _get_batch(self.get_sentence(), batch_size, num_steps, 424 | self.max_word_length): 425 | 426 | # token_ids = (batch_size, num_steps) 427 | # char_inputs = (batch_size, num_steps, 50) of character ids 428 | # targets = word ID of next word (batch_size, num_steps) 429 | yield X 430 | 431 | @property 432 | def vocab(self): 433 | return self._vocab 434 | 435 | class BidirectionalLMDataset(object): 436 | def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False): 437 | ''' 438 | bidirectional version of LMDataset 439 | ''' 440 | self._data_forward = LMDataset( 441 | filepattern, vocab, reverse=False, test=test, 442 | shuffle_on_load=shuffle_on_load) 443 | self._data_reverse = LMDataset( 444 | filepattern, vocab, reverse=True, test=test, 445 | shuffle_on_load=shuffle_on_load) 446 | 447 | def iter_batches(self, batch_size, num_steps): 448 | max_word_length = self._data_forward.max_word_length 449 | 450 | for X, Xr in zip( 451 | _get_batch(self._data_forward.get_sentence(), batch_size, 452 | num_steps, max_word_length), 453 | _get_batch(self._data_reverse.get_sentence(), batch_size, 454 | num_steps, max_word_length) 455 | ): 456 | 457 | for k, v in Xr.items(): 458 | X[k + '_reverse'] = v 459 | 460 | yield X 461 | 462 | 463 | class InvalidNumberOfCharacters(Exception): 464 | pass 465 | 466 | -------------------------------------------------------------------------------- /src/bilm/elmo.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 
23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | def _l2_regularizer(weights): 33 | if l2_coef is not None: 34 | return l2_coef * tf.reduce_sum(tf.square(weights)) 35 | else: 36 | return 0.0 37 | 38 | # Get ops for computing LM embeddings and mask 39 | lm_embeddings = bilm_ops['lm_embeddings'] 40 | mask = bilm_ops['mask'] 41 | 42 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 43 | lm_dim = int(lm_embeddings.get_shape()[3]) 44 | 45 | with tf.control_dependencies([lm_embeddings, mask]): 46 | # Cast the mask and broadcast for layer use. 47 | mask_float = tf.cast(mask, 'float32') 48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 49 | 50 | def _do_ln(x): 51 | # do layer normalization excluding the mask 52 | x_masked = x * broadcast_mask 53 | N = tf.reduce_sum(mask_float) * lm_dim 54 | mean = tf.reduce_sum(x_masked) / N 55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2 56 | ) / N 57 | return tf.nn.batch_normalization( 58 | x, mean, variance, None, None, 1E-12 59 | ) 60 | 61 | if use_top_only: 62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 63 | # just the top layer 64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 65 | # no regularization 66 | reg = 0.0 67 | else: 68 | W = tf.get_variable( 69 | '{}_ELMo_W'.format(name), 70 | shape=(n_lm_layers, ), 71 | initializer=tf.zeros_initializer, 72 | regularizer=_l2_regularizer, 73 | trainable=True, 74 | ) 75 | 76 | # normalize the weights 77 | normed_weights = tf.split( 78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 79 | ) 80 | # split LM layers 81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 82 | 83 | # compute the weighted, normalized LM activations 84 | pieces = [] 85 | for w, t in zip(normed_weights, layers): 86 | if do_layer_norm: 87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 88 | else: 89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 90 | sum_pieces = tf.add_n(pieces) 91 | 92 | # get the regularizer 93 | reg = [ 94 | r for r in tf.get_collection( 95 | tf.GraphKeys.REGULARIZATION_LOSSES) 96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 97 | ] 98 | if len(reg) != 1: 99 | raise ValueError 100 | 101 | # scale the weighted sum by gamma 102 | gamma = tf.get_variable( 103 | '{}_ELMo_gamma'.format(name), 104 | shape=(1, ), 105 | initializer=tf.ones_initializer, 106 | regularizer=None, 107 | trainable=True, 108 | ) 109 | weighted_lm_layers = sum_pieces * gamma 110 | 111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 112 | 113 | return ret 114 | 115 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # from model.lightgbm_model import LightGbmModel 2 | # from model.xgboost_model import XgboostModel 3 | from model.textcnn_model import TextCNNModel 4 | from model.dpcnn_model import DpcnnModel 5 | from model.capsule_model import CapsuleModel 6 | from model.rcnn_model import RCNNModel 7 | from model.attention import AttentionModel 8 | from model.convlstm_model import ConvlstmModel 9 | from model.lstmconv_model import LstmconvModel 10 | from model.lstmgru_model import LstmgruModel 11 | from model.han_model import HANModel 12 | from model.hybrid_nn_1 import HybridNN1Model 13 | from model.ml_models 
import SVCClassifier 14 | from model.ml_models import Fasttext 15 | from model.bilstm_model import * 16 | 17 | 18 | class Config(object): 19 | 20 | """Docstring for Config. """ 21 | 22 | def __init__(self): 23 | """TODO: to be defined1. """ 24 | self.model = { 25 | # 'xgboost': XgboostModel, 26 | # 'lightgbm': LightGbmModel, 27 | # 'svc': SVCClassifier, 28 | # 'fasttext': Fasttext, 29 | 30 | # dl model 31 | 'aspv0': BilstmV0, 32 | 'aspv1': BilstmV1, 33 | # 'aspv2': BilstmV2, 34 | 'textcnn': TextCNNModel, 35 | 'lstmgru': LstmgruModel, 36 | 'attention': AttentionModel, 37 | 'convlstm': ConvlstmModel, 38 | 'lstmconv': LstmconvModel, 39 | # 'dpcnn': DpcnnModel, 40 | # 'rcnn': RCNNModel, 41 | # 'capsule': CapsuleModel, 42 | # 'han': HANModel, 43 | # 'hybridnn1': HybridNN1Model, 44 | } 45 | self.CHAR_MAXLEN = 190 46 | self.WORD_MAXLEN = 128 47 | 48 | self.HANN_SENT = 20 49 | self.HANN_WORD_LEN = 40 50 | self.HANN_CHAR_LEN = 70 51 | self.EMBED_SIZE = 300 52 | self.main_feature = 'word' 53 | self.is_debug = True 54 | # self.elmo_word_options_file = './bilm/dump/options.word.json' 55 | # self.elmo_word_weight_file = './bilm/dump/weights.word.hdf5' 56 | # self.elmo_word_embed_file = './bilm/dump/vocab_embedding.word.hdf5' 57 | # self.elmo_word_vocab_file = '../data/word2vec_models/word2vec.word.300d.vocab.txt' 58 | 59 | # self.elmo_char_options_file = './bilm/dump/options.char.json' 60 | # self.elmo_char_weight_file = './bilm/dump/weights.char.hdf5' 61 | # self.elmo_char_embed_file = './bilm/dump/vocab_embedding.char.hdf5' 62 | # self.elmo_char_vocab_file = '../data/word2vec_models/word2vec.char.300d.vocab.txt' 63 | 64 | # self.elmo_qiuqiu_options_file = './bilm/dump/tmp/options.json' 65 | # self.elmo_qiuqiu_weight_file = './bilm/dump/tmp/weight-11-4.hdf5' 66 | # self.elmo_qiuqiu_embed_file = './bilm/dump/tmp/word_embedding.after.elmo-11-4.hdf5' 67 | # self.elmo_qiuqiu_vocab_file = './bilm/dump/tmp/sa_elmo_vocabs.txt' 68 | 69 | self.loss_path = '../data/loss' 70 | self.TEST_X = '../data/csvs/test_public.csv' 71 | self.TRAIN_MULTI_X = '../data/csvs/train_multi.csv' 72 | self.TRAIN_JP = '../data/csvs/round2zh2jp.csv' 73 | self.TRAIN_EN = '../data/csvs/round2zh2en.csv' 74 | # self.SENTIMENT_EMBED_PATH = '../data/sentiment_embedding.pkl' 75 | 76 | # self.BERT_VOCAB_FILES = '../data/chinese_L-12_H-768_A-12/vocab.txt' 77 | # self.BERT_CONFIG_FILES = '../data/chinese_L-12_H-768_A-12/bert_config.json' 78 | 79 | # self.Y_DISTILLATION = '../data/result/oof.pkl' 80 | 81 | # property 等待调用到它时才计算,先加载embed size再加载对应词向量 82 | @property 83 | def char_stoi_file(self): 84 | if self.car: 85 | return '../data/char_item_to_id.cars-home.pkl' 86 | else: 87 | return '../data/char_item_to_id.pkl' 88 | 89 | @property 90 | def word_stoi_file(self): 91 | if self.car: 92 | return '../data/word_item_to_id.cars-home.pkl' 93 | else: 94 | return '../data/word_item_to_id.pkl' 95 | 96 | @property 97 | def char_w2v_file(self): 98 | if self.outer_embed: 99 | return '../data/word2vec_models/sgns.baidubaike.bigram-char' 100 | else: 101 | if not self.car: 102 | return '../data/word2vec_models/word2vec.char.{}d.model.txt'.format(self.EMBED_SIZE) 103 | else: 104 | return '../data/word2vec_models/word2vec.char.{}d.model.cars-home.txt'.format(self.EMBED_SIZE) 105 | 106 | 107 | @property 108 | def word_w2v_file(self): 109 | 110 | if self.outer_embed: 111 | return '../data/word2vec_models/sgns.baidubaike.bigram-char' 112 | else: 113 | if not self.car: 114 | return '../data/word2vec_models/word2vec.word.{}d.model.txt'.format(self.EMBED_SIZE) 115 | 
else: 116 | return '../data/word2vec_models/word2vec.word.{}d.model.cars-home.txt'.format(self.EMBED_SIZE) 117 | 118 | @property 119 | def TRAIN_X(self): 120 | if self.data_type == 0: 121 | return '../data/csvs/train_single_label.csv' 122 | elif self.data_type == 1: 123 | return '../data/csvs/train_single_label.csv' 124 | elif self.data_type == 2: 125 | return '../data/csvs/train_multi.csv' 126 | elif self.data_type == 3: 127 | return '../data/csvs/train_multi.csv' 128 | elif self.data_type == 4: 129 | return '../data/csvs/train.csv' 130 | elif self.data_type == 5: 131 | return '../data/csvs/multi_train.csv' 132 | 133 | @property 134 | def n_classes(self): 135 | if self.data_type == 0: 136 | return 10 137 | elif self.data_type == 1: 138 | return 3 139 | elif self.data_type == 2: 140 | return 4 141 | elif self.data_type == 3: 142 | return 4 143 | elif self.data_type == 4: 144 | return 3 145 | elif self.data_type == 5: 146 | return 30 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/src/model/__init__.py -------------------------------------------------------------------------------- /src/model/attention.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn 6 | 7 | n_sub = 10 8 | 9 | 10 | class AttentionModel(BasicDeepModel): 11 | def __init__(self, name='basicModel', n_folds=5, config=None): 12 | name = 'attention' + config.main_feature 13 | self.hidden_dim = 150 14 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 15 | 16 | def create_model(self, share_dense=True, concat_sub=True): 17 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 18 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 19 | 20 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 21 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 22 | 23 | if self.main_feature.lower() in ['word', 'char']: 24 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 25 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 26 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 27 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 28 | 29 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 30 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 31 | if self.main_feature == 'elmo_word': 32 | options_file = self.config.elmo_word_options_file 33 | weight_file = self.config.elmo_word_weight_file 34 | embed_file = self.config.elmo_word_embed_file 35 | elif self.main_feature == 'elmo_char': 36 | options_file = self.config.elmo_char_options_file 37 | weight_file = self.config.elmo_char_weight_file 38 | embed_file = self.config.elmo_char_embed_file 39 | elif self.main_feature == 'elmo_qiuqiu': 40 | options_file = 
self.config.elmo_qiuqiu_options_file 41 | weight_file = self.config.elmo_qiuqiu_weight_file 42 | embed_file = self.config.elmo_qiuqiu_embed_file 43 | self.bilm = BidirectionalLanguageModel(options_file, 44 | weight_file, 45 | use_character_inputs=False, 46 | embedding_weight_file=embed_file, 47 | max_batch_size=self.batch_size) 48 | bilm_embedding_op = self.bilm(self.input_x) 49 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 50 | self.word_encoding = bilm_embedding['weighted_op'] 51 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 52 | 53 | else: 54 | exit('wrong feature') 55 | 56 | c_outputs = [] 57 | for c in range(n_sub): 58 | with tf.variable_scope('lstm-{}'.format(c)): 59 | # self.forward = self.LSTM() 60 | # self.backward = self.LSTM() 61 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 62 | # x = tf.concat(x, -1) 63 | #### cudnn lstm #### 64 | self.forward_lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 65 | self.forward_gru = cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 66 | x, _ = self.forward_lstm(tf.transpose(self.word_encoding, [1, 0, 2])) 67 | x, _ = self.forward_gru(x) 68 | x = tf.transpose(x, [1, 0, 2]) 69 | 70 | with tf.variable_scope('pooling-{}'.format(c)): 71 | max_pooled = tf.reshape(tf.reduce_max(x, 1), [-1, 2*self.hidden_dim]) 72 | avg_pooled = tf.reshape(tf.reduce_mean(x, 1), [-1, 2*self.hidden_dim]) 73 | 74 | att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim], name='att_w') 75 | att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 76 | att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 77 | 78 | x_reshape = tf.reshape(x, [-1, 2*self.hidden_dim]) 79 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(x_reshape, att_w)) + att_b, att_v), [-1, 1, self.max_len]) 80 | alpha = tf.nn.softmax(score, axis=-1) 81 | att_pooled = tf.reshape(tf.matmul(alpha, x), [-1, 2*self.hidden_dim]) 82 | 83 | concat_pooled = tf.concat((max_pooled, att_pooled, avg_pooled), -1) 84 | 85 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob) 86 | dense = tf.layers.dense(concat_pooled, 4, activation=None) 87 | c_outputs.append(dense) 88 | 89 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 90 | y_ = tf.nn.softmax(self.logits) 91 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 92 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 93 | 94 | if not self.config.balance: 95 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 96 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 97 | else: 98 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 99 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 100 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 101 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 102 | class0_weight = 1 # 第0类的权重系数 103 | class1_weight = 3 # 第1类的权重系数 104 | class2_weight = 3 # 第2类的权重系数 105 | class3_weight = 3 # 第3类的权重系数 106 | # coe = tf.constant([1., 1., 1., 1.]) 107 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 108 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 109 | 110 | y = tf.reshape(self.input_y, [-1, 4]) 111 | self.loss = 
tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 112 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 113 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 114 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 115 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 116 | 117 | return self 118 | 119 | 120 | -------------------------------------------------------------------------------- /src/model/bilstm_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | from model import modeling 3 | import tensorflow as tf 4 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 5 | from bilm.elmo import weight_layers 6 | 7 | n_sub = 10 8 | 9 | class BilstmV0(BasicDeepModel): 10 | def __init__(self, name='basicModel', n_folds=5, config=None): 11 | name = 'qiuqiuv0' + config.main_feature 12 | self.hidden_dim = 300 13 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 14 | 15 | def create_model(self): 16 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y') 17 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 18 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 19 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 20 | 21 | if self.main_feature.lower() in ['word', 'char']: 22 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 23 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 24 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 25 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 26 | 27 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 28 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 29 | if self.main_feature == 'elmo_word': 30 | options_file = self.config.elmo_word_options_file 31 | weight_file = self.config.elmo_word_weight_file 32 | embed_file = self.config.elmo_word_embed_file 33 | elif self.main_feature == 'elmo_char': 34 | options_file = self.config.elmo_char_options_file 35 | weight_file = self.config.elmo_char_weight_file 36 | embed_file = self.config.elmo_char_embed_file 37 | elif self.main_feature == 'elmo_qiuqiu': 38 | options_file = self.config.elmo_qiuqiu_options_file 39 | weight_file = self.config.elmo_qiuqiu_weight_file 40 | embed_file = self.config.elmo_qiuqiu_embed_file 41 | 42 | self.bilm = BidirectionalLanguageModel(options_file, 43 | weight_file, 44 | use_character_inputs=False, 45 | embedding_weight_file=embed_file, 46 | max_batch_size=self.batch_size) 47 | bilm_embedding_op = self.bilm(self.input_x) 48 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 49 | self.word_encoding = bilm_embedding['weighted_op'] 50 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 51 | 52 | else: 53 | exit('wrong feature') 54 | 55 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding') 56 | # self.layer_embedding = tf.get_variable(initializer=self.sentiment_embed, name='layer_embedding') 57 | 58 | self.forward = self.LSTM() 59 | self.backwad = self.LSTM() 60 | # self.forward2 = self.LSTM() 61 | # 
self.backwad2 = self.LSTM() 62 | 63 | # add point 64 | self.forward2 = self.GRU() 65 | self.backwad2 = self.GRU() 66 | 67 | with tf.variable_scope('sentence_encode'): 68 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32) 69 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 70 | output_sentence = tf.concat(axis=2, values=all_output_words) 71 | 72 | with tf.variable_scope('sentence_encode2'): 73 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,output_sentence,dtype=tf.float32) 74 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 75 | output_sentence = tf.concat(axis=2, values=all_output_words) 76 | output_sentence = tf.layers.dense(output_sentence, self.hidden_dim, activation=tf.nn.tanh) 77 | sentence_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len, self.hidden_dim]) 78 | sentence_reshape_tile = tf.tile(sentence_reshape, [1, 10, 1, 1]) # 句子复制10份 79 | 80 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim]) 81 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1]) 82 | 83 | embed_concat = tf.reshape(tf.concat(axis=3,values=[sentence_reshape_tile,layer_reshape_tile]),[-1,2*self.hidden_dim]) 84 | 85 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w') 86 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 87 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 88 | 89 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(embed_concat,self.att_w) + self.att_b),self.att_v),[-1,10,self.max_len]) 90 | alpah = tf.nn.softmax(score,axis=2) 91 | layer_sentence = tf.matmul(alpah,output_sentence) 92 | 93 | layer_reshape2 = tf.reshape(self.layer_embedding,[1,10,self.hidden_dim]) 94 | layer_reshape2_tile = tf.tile(layer_reshape2,[self.batch_size,1,1]) 95 | layer_sentence = tf.concat(axis=2,values=[layer_sentence,layer_reshape2_tile]) 96 | layer_sentence = tf.reshape(layer_sentence,[-1,2*self.hidden_dim]) 97 | 98 | layer_sentence = tf.layers.dense(layer_sentence,self.hidden_dim,activation=tf.nn.relu) 99 | 100 | # add point 101 | layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob) 102 | 103 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None) 104 | y_ = tf.nn.softmax(self.logits, axis=1) 105 | self.prob = tf.reshape(y_, [-1, 10, 4]) 106 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 107 | 108 | if not self.config.balance: 109 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 110 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 111 | else: 112 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 113 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 114 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 115 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 116 | class0_weight = 1 # 第0类的权重系数 117 | class1_weight = 3 # 第1类的权重系数 118 | class2_weight = 3 # 第2类的权重系数 119 | class3_weight = 3 # 第3类的权重系数 120 | # coe = tf.constant([1., 1., 1., 1.]) 121 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 122 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 123 | 124 | y = tf.reshape(self.input_y, [-1, 4]) 125 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 126 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 
127 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 128 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 129 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 130 | 131 | return self 132 | 133 | def LSTM(self, layers=1): 134 | lstms = [] 135 | for num in range(layers): 136 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 137 | print(lstm.name) 138 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 139 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 140 | lstms.append(lstm) 141 | 142 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 143 | return lstms 144 | 145 | def GRU(self, layers=1): 146 | lstms = [] 147 | for num in range(layers): 148 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 149 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 150 | print(lstm.name) 151 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 152 | lstms.append(lstm) 153 | 154 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 155 | return lstms 156 | 157 | 158 | class BilstmV1(BasicDeepModel): 159 | def __init__(self, name='basicModel', n_folds=5, config=None): 160 | name = 'qiuqiuv1' + config.main_feature 161 | self.hidden_dim = 300 162 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 163 | 164 | def create_model(self, concat_sub=True): 165 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y') 166 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 167 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 168 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 169 | 170 | if self.main_feature.lower() in ['word', 'char']: 171 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 172 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 173 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 174 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 175 | 176 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 177 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 178 | if self.main_feature == 'elmo_word': 179 | options_file = self.config.elmo_word_options_file 180 | weight_file = self.config.elmo_word_weight_file 181 | embed_file = self.config.elmo_word_embed_file 182 | elif self.main_feature == 'elmo_char': 183 | options_file = self.config.elmo_char_options_file 184 | weight_file = self.config.elmo_char_weight_file 185 | embed_file = self.config.elmo_char_embed_file 186 | elif self.main_feature == 'elmo_qiuqiu': 187 | options_file = self.config.elmo_qiuqiu_options_file 188 | weight_file = self.config.elmo_qiuqiu_weight_file 189 | embed_file = self.config.elmo_qiuqiu_embed_file 190 | 191 | self.bilm = BidirectionalLanguageModel(options_file, 192 | weight_file, 193 | use_character_inputs=False, 194 | embedding_weight_file=embed_file, 195 | max_batch_size=self.batch_size) 196 | bilm_embedding_op = self.bilm(self.input_x) 197 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 198 | self.word_encoding = bilm_embedding['weighted_op'] 199 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) 
# new 200 | 201 | else: 202 | exit('wrong feature') 203 | 204 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding') 205 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim]) 206 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1]) 207 | 208 | self.forward = self.LSTM() 209 | self.backwad = self.LSTM() 210 | self.forward2 = self.LSTM() 211 | self.backwad2 = self.LSTM() 212 | 213 | with tf.variable_scope('sentence_encode'): 214 | s1_out, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32) 215 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 216 | s1_out = tf.concat(axis=2, values=s1_out) 217 | s1_reshape = tf.reshape(s1_out, [-1, 1, self.max_len, 2*self.hidden_dim]) 218 | s1_tile = tf.tile(s1_reshape, [1, 10, 1, 1]) # 第一层lstm复制10份 219 | 220 | s2_input = tf.reshape(tf.concat((s1_tile, layer_reshape_tile), -1), [-1, self.max_len, 3*self.hidden_dim]) 221 | 222 | with tf.variable_scope('sentence_encode2'): 223 | s2_out, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,s2_input,dtype=tf.float32) 224 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 225 | s2_out = tf.reshape(tf.concat(axis=-1, values=s2_out), [-1, 10, self.max_len, 2*self.hidden_dim]) 226 | res_out = s2_out + s1_tile 227 | res_dense = tf.layers.dense(res_out, self.hidden_dim, activation=tf.nn.relu) 228 | 229 | res_layer_concat = tf.reshape(tf.concat((res_dense, layer_reshape_tile), -1), [-1, 2*self.hidden_dim]) 230 | 231 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w') 232 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 233 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 234 | 235 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(res_layer_concat, self.att_w) + self.att_b),self.att_v),[-1,1,self.max_len]) 236 | alpha = tf.nn.softmax(score) 237 | layer_sentence = tf.reshape(tf.matmul(alpha, tf.reshape(res_out, [-1, self.max_len, 2*self.hidden_dim])), [-1, n_sub, 2*self.hidden_dim]) 238 | 239 | if concat_sub: 240 | # 是否拼接layer_sub信息 241 | layer_sub = tf.reshape(self.layer_embedding, [1, n_sub, self.hidden_dim]) 242 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1]) 243 | 244 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1) 245 | outputs = tf.reshape(layer_total, [-1, 3*self.hidden_dim]) 246 | else: 247 | outputs = tf.reshape(layer_sentence, [-1, 2*self.hidden_dim]) 248 | 249 | self.logits = tf.layers.dense(outputs, 4, activation=None) 250 | y_ = tf.nn.softmax(self.logits) 251 | self.prob = tf.reshape(y_, [-1, 10, 4]) 252 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 253 | 254 | if not self.config.balance: 255 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 256 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 257 | else: 258 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 259 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 260 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 261 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 262 | class0_weight = 1 # 第0类的权重系数 263 | class1_weight = 3 # 第1类的权重系数 264 | class2_weight = 3 # 第2类的权重系数 265 | class3_weight = 3 # 第3类的权重系数 266 | # coe = tf.constant([1., 1., 1., 1.]) 267 | # y = 
tf.reshape(self.input_y, [-1, 4]) * coe 268 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 269 | 270 | y = tf.reshape(self.input_y, [-1, 4]) 271 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 272 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 273 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 274 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 275 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 276 | 277 | return self 278 | 279 | def LSTM(self, layers=1): 280 | lstms = [] 281 | for num in range(layers): 282 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 283 | print(lstm.name) 284 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 285 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 286 | lstms.append(lstm) 287 | 288 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 289 | return lstms 290 | 291 | def GRU(self, layers=1): 292 | lstms = [] 293 | for num in range(layers): 294 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 295 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 296 | print(lstm.name) 297 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 298 | lstms.append(lstm) 299 | 300 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 301 | return lstms 302 | 303 | 304 | class BilstmV2(BasicDeepModel): 305 | def __init__(self, name='basicModel', n_folds=5, config=None): 306 | name = 'qiuqiuv2' + config.main_feature 307 | self.hidden_dim = 300 308 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 309 | 310 | def create_model(self): 311 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y') 312 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 313 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 314 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 315 | 316 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='input_ids') 317 | self.mask_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='mask_ids') 318 | self.type_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='type_ids') 319 | self.is_training = tf.placeholder(dtype=tf.bool, name='is_training') 320 | 321 | # bert_hidden_size = bert_output_layer.shape[-1].value 322 | # hidden_size = output_layer.shape[-1].value 323 | 324 | if self.main_feature.lower() in ['word', 'char']: 325 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 326 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 327 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 328 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 329 | 330 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 331 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 332 | if self.main_feature == 'elmo_word': 333 | options_file = self.config.elmo_word_options_file 334 | weight_file = self.config.elmo_word_weight_file 335 | embed_file = self.config.elmo_word_embed_file 336 | elif self.main_feature == 'elmo_char': 337 | options_file = self.config.elmo_char_options_file 338 | weight_file = self.config.elmo_char_weight_file 339 | embed_file 
= self.config.elmo_char_embed_file 340 | elif self.main_feature == 'elmo_qiuqiu': 341 | options_file = self.config.elmo_qiuqiu_options_file 342 | weight_file = self.config.elmo_qiuqiu_weight_file 343 | embed_file = self.config.elmo_qiuqiu_embed_file 344 | 345 | self.bilm = BidirectionalLanguageModel(options_file, 346 | weight_file, 347 | use_character_inputs=False, 348 | embedding_weight_file=embed_file, 349 | max_batch_size=self.batch_size) 350 | bilm_embedding_op = self.bilm(self.input_x) 351 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 352 | self.word_encoding = bilm_embedding['weighted_op'] 353 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 354 | 355 | else: 356 | exit('wrong feature') 357 | 358 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding') 359 | 360 | self.forward = self.LSTM() 361 | self.backwad = self.LSTM() 362 | # self.forward2 = self.LSTM() 363 | # self.backwad2 = self.LSTM() 364 | 365 | # add point 366 | self.forward2 = self.GRU() 367 | self.backwad2 = self.GRU() 368 | 369 | # bert使用 370 | bert_config = modeling.BertConfig.from_json_file(self.config.BERT_CONFIG_FILES) 371 | 372 | bert_model = modeling.BertModel( 373 | config=bert_config, 374 | is_training=self.is_training, 375 | input_ids=self.input_ids, 376 | input_mask=self.mask_ids, 377 | token_type_ids=self.type_ids 378 | ) 379 | if self.is_training is not None: 380 | print('bert config hidden dropout -- ---', bert_config.hidden_dropout_prob) 381 | print('bert config hidden dropout -- ---', bert_config.attention_probs_dropout_prob) 382 | self.word_encoding = bert_model.get_sequence_output() 383 | all_layer_output = bert_model.get_all_encoder_layers() 384 | self.word_encoding = (all_layer_output[0] + all_layer_output[1] + all_layer_output[2] + all_layer_output[3]) / 4 385 | with tf.variable_scope('sentence_encode'): 386 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32) 387 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 388 | output_sentence = tf.concat(axis=2, values=all_output_words) 389 | 390 | with tf.variable_scope('sentence_encode2'): 391 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,output_sentence,dtype=tf.float32) 392 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 393 | output_sentence = tf.concat(axis=2, values=all_output_words) 394 | output_sentence = tf.layers.dense(output_sentence, self.hidden_dim, activation=tf.nn.tanh) 395 | sentence_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len, self.hidden_dim]) 396 | sentence_reshape_tile = tf.tile(sentence_reshape, [1, 10, 1, 1]) # 句子复制10份 397 | 398 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim]) 399 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1]) 400 | 401 | embed_concat = tf.reshape(tf.concat(axis=3,values=[sentence_reshape_tile,layer_reshape_tile]),[-1,2*self.hidden_dim]) 402 | 403 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w') 404 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 405 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 406 | 407 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(embed_concat,self.att_w) + self.att_b),self.att_v),[-1,10,self.max_len]) 408 | alpah = tf.nn.softmax(score,axis=2) 409 | layer_sentence = 
tf.matmul(alpah,output_sentence) 410 | 411 | layer_reshape2 = tf.reshape(self.layer_embedding,[1,10,self.hidden_dim]) 412 | layer_reshape2_tile = tf.tile(layer_reshape2,[self.batch_size,1,1]) 413 | layer_sentence = tf.concat(axis=2,values=[layer_sentence,layer_reshape2_tile]) 414 | layer_sentence = tf.reshape(layer_sentence,[-1,2*self.hidden_dim]) 415 | 416 | layer_sentence = tf.layers.dense(layer_sentence,self.hidden_dim,activation=tf.nn.relu) 417 | 418 | # add point 419 | layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob) 420 | 421 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None) 422 | y_ = tf.nn.softmax(self.logits, axis=1) 423 | self.prob = tf.reshape(y_, [-1, 10, 4]) 424 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 425 | 426 | if not self.config.balance: 427 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 428 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 429 | else: 430 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 431 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 432 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 433 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 434 | class0_weight = 1 # 第0类的权重系数 435 | class1_weight = 3 # 第1类的权重系数 436 | class2_weight = 3 # 第2类的权重系数 437 | class3_weight = 3 # 第3类的权重系数 438 | # coe = tf.constant([1., 1., 1., 1.]) 439 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 440 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 441 | 442 | y = tf.reshape(self.input_y, [-1, 4]) 443 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 444 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 445 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 446 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 447 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 448 | 449 | return self 450 | 451 | def LSTM(self, layers=1): 452 | lstms = [] 453 | for num in range(layers): 454 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 455 | print(lstm.name) 456 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 457 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 458 | lstms.append(lstm) 459 | 460 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 461 | return lstms 462 | 463 | def GRU(self, layers=1): 464 | lstms = [] 465 | for num in range(layers): 466 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 467 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 468 | print(lstm.name) 469 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 470 | lstms.append(lstm) 471 | 472 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 473 | return lstms 474 | 475 | -------------------------------------------------------------------------------- /src/model/capsule_model.py: -------------------------------------------------------------------------------- 1 | from keras.layers import * 2 | from keras.models import * 3 | from model.model_basic import BasicDeepModel 4 | from model.model_component import Capsule 5 | from keras import regularizers 6 | 7 | class CapsuleModel(BasicDeepModel): 8 | def __init__(self, name='basicModel', num_flods=5, config=None): 9 | name = 'capsule' + config.main_feature 10 | 
BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 11 | 12 | def create_model(self): 13 | Routings = 5 14 | Num_capsule = 10 15 | Dim_capsule = 16 16 | dropout_p = 0.25 17 | rate_drop_dense = 0.28 18 | gru_len = 128 19 | if self.main_feature == 'char': 20 | input = Input(shape=(self.max_len,), name='char') 21 | else: 22 | input = Input(shape=(self.max_len,), name='word') 23 | 24 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 25 | x = Masking(mask_value=self.mask_value)(input) 26 | x = embedding(x) 27 | 28 | x = SpatialDropout1D(rate_drop_dense)(x) 29 | 30 | x = Bidirectional(GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))(x) 31 | # x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x) 32 | 33 | capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, 34 | share_weights=True)(x) 35 | 36 | capsule = Flatten()(capsule) 37 | capsule = Dropout(dropout_p)(capsule) 38 | dense = Dense(self.n_class, activation="softmax")(capsule) 39 | res_model = Model(inputs=[input], outputs=dense) 40 | 41 | return res_model 42 | -------------------------------------------------------------------------------- /src/model/convlstm_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | 6 | n_sub = 10 7 | 8 | class ConvlstmModel(BasicDeepModel): 9 | def __init__(self, name='basicModel', n_folds=5, config=None): 10 | name = 'convlstm' + config.main_feature 11 | self.hidden_dim = 300 12 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 13 | 14 | def LSTM(self, layers=1): 15 | lstms = [] 16 | for num in range(layers): 17 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 18 | print(lstm.name) 19 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 20 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 21 | lstms.append(lstm) 22 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 23 | return lstms 24 | 25 | def GRU(self, layers=1): 26 | lstms = [] 27 | for num in range(layers): 28 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 29 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 30 | print(lstm.name) 31 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 32 | lstms.append(lstm) 33 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 34 | return lstms 35 | 36 | def create_model(self, share_dense=True, concat_sub=True): 37 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 38 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 39 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 40 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 41 | 42 | if self.main_feature.lower() in ['word', 'char']: 43 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 44 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 45 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 46 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 47 | 48 | 
elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 49 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 50 | if self.main_feature == 'elmo_word': 51 | options_file = self.config.elmo_word_options_file 52 | weight_file = self.config.elmo_word_weight_file 53 | embed_file = self.config.elmo_word_embed_file 54 | elif self.main_feature == 'elmo_char': 55 | options_file = self.config.elmo_char_options_file 56 | weight_file = self.config.elmo_char_weight_file 57 | embed_file = self.config.elmo_char_embed_file 58 | elif self.main_feature == 'elmo_qiuqiu': 59 | options_file = self.config.elmo_qiuqiu_options_file 60 | weight_file = self.config.elmo_qiuqiu_weight_file 61 | embed_file = self.config.elmo_qiuqiu_embed_file 62 | 63 | self.bilm = BidirectionalLanguageModel(options_file, 64 | weight_file, 65 | use_character_inputs=False, 66 | embedding_weight_file=embed_file, 67 | max_batch_size=self.batch_size) 68 | bilm_embedding_op = self.bilm(self.input_x) 69 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 70 | self.word_encoding = bilm_embedding['weighted_op'] 71 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 72 | 73 | else: 74 | exit('wrong feature') 75 | 76 | inputs_expanded = tf.expand_dims(self.word_encoding, -1) 77 | n_filters = 128 78 | filter_shape = [3, self.embed_size, 1, n_filters] 79 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W') 80 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 81 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d') 82 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 83 | h = tf.reshape(h, [-1, self.max_len-3+1, n_filters]) 84 | 85 | self.forward = self.LSTM() 86 | self.backward = self.LSTM() 87 | x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, h, dtype=tf.float32) 88 | x = tf.concat(x, -1) 89 | output_sentence = tf.layers.dense(x, self.hidden_dim, activation=tf.nn.relu) 90 | 91 | x_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len-3+1, self.hidden_dim]) 92 | x_tile = tf.tile(x_reshape, [1, n_sub, 1, 1]) # 句子复制n_sub份 93 | 94 | sub_embedding = tf.get_variable(shape=[n_sub, self.hidden_dim], name='sub_embedding') 95 | sub_reshape = tf.reshape(sub_embedding, [1, n_sub, 1, self.hidden_dim]) 96 | sub_tile = tf.tile(sub_reshape, [self.batch_size, 1, self.max_len-3+1, 1]) 97 | 98 | embed_concat = tf.reshape(tf.concat((x_tile, sub_tile), -1), [-1, 2*self.hidden_dim]) 99 | 100 | att_w = tf.get_variable(shape=[2*self.hidden_dim, self.hidden_dim], name='att_w') 101 | att_b = tf.get_variable(shape=[self.hidden_dim], name='att_b') 102 | att_v = tf.get_variable(shape=[self.hidden_dim, 1], name='att_v') 103 | 104 | score = tf.matmul(tf.nn.tanh(tf.matmul(embed_concat, att_w) + att_b), att_v) 105 | score_fit = tf.reshape(score, [-1, n_sub, self.max_len-3+1]) 106 | alpha = tf.nn.softmax(score_fit) 107 | 108 | layer_sentence = tf.matmul(alpha, output_sentence) 109 | 110 | if concat_sub: 111 | # 是否拼接layer_sub信息 112 | layer_sub = tf.reshape(sub_embedding, [1, n_sub, self.hidden_dim]) 113 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1]) 114 | 115 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1) 116 | outputs = tf.reshape(layer_total, [-1, 2*self.hidden_dim]) 117 | else: 118 | outputs = tf.reshape(layer_sentence, [-1, self.hidden_dim]) 119 | 120 | self.logits = tf.layers.dense(layer_sentence, 
4, activation=None) 121 | y_ = tf.nn.softmax(self.logits) 122 | self.prob = tf.reshape(y_, [-1, 10, 4]) 123 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 124 | 125 | if not self.config.balance: 126 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 127 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 128 | else: 129 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 130 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 131 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 132 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 133 | class0_weight = 1 # 第0类的权重系数 134 | class1_weight = 3 # 第1类的权重系数 135 | class2_weight = 3 # 第2类的权重系数 136 | class3_weight = 3 # 第3类的权重系数 137 | # coe = tf.constant([1., 1., 1., 1.]) 138 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 139 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 140 | 141 | y = tf.reshape(self.input_y, [-1, 4]) 142 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 143 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 144 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 145 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 146 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 147 | 148 | return self 149 | 150 | -------------------------------------------------------------------------------- /src/model/dpcnn_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import * 2 | from keras.layers import * 3 | from model.model_basic import BasicDeepModel 4 | from keras import regularizers 5 | 6 | 7 | dp = 4 8 | filter_nr = 64 9 | filter_size = 3 10 | max_pool_size = 3 11 | max_pool_strides = 2 12 | dense_nr = 128 13 | spatial_dropout = 0.5 14 | dense_dropout = 0.5 15 | 16 | 17 | class DpcnnModel(BasicDeepModel): 18 | def __init__(self, name='basicModel', num_flods=5, config=None): 19 | name = 'dpcnn' + config.main_feature 20 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 21 | 22 | def create_model(self): 23 | if self.main_feature == 'char': 24 | input = Input(shape=(self.max_len,), name='char') 25 | else: 26 | input = Input(shape=(self.max_len,), name='word') 27 | 28 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 29 | x = Masking(mask_value=self.mask_value)(input) 30 | x = embedding(x) 31 | x = SpatialDropout1D(0.5)(x) 32 | 33 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x) 34 | block1 = BatchNormalization()(block1) 35 | block1 = PReLU()(block1) 36 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1) 37 | block1 = BatchNormalization()(block1) 38 | block1 = PReLU()(block1) 39 | 40 | # we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output 41 | # if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output 42 | resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(x) 43 | resize_emb = PReLU()(resize_emb) 44 | 45 | block1_output = add([block1, resize_emb]) 46 | x = MaxPooling1D(pool_size=max_pool_size, 
strides=max_pool_strides)(block1_output) 47 | 48 | for i in range(dp): 49 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x) 50 | block1 = BatchNormalization()(block1) 51 | block1 = PReLU()(block1) 52 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1) 53 | block1 = BatchNormalization()(block1) 54 | block1 = PReLU()(block1) 55 | 56 | block_output = add([block1, x]) 57 | if i + 1 != dp: 58 | x = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block_output) 59 | 60 | x = GlobalMaxPooling1D()(block_output) 61 | output = Dense(dense_nr, activation='linear')(x) 62 | output = BatchNormalization()(output) 63 | x = PReLU()(output) 64 | 65 | # output = Dropout(dense_dropout)(output) 66 | if self.config.data_type == 3: 67 | dense = Dense(self.n_class, activation="sigmoid")(x) 68 | else: 69 | dense = Dense(self.n_class, activation="softmax")(x) 70 | res_model = Model(inputs=[input], outputs=dense) 71 | 72 | return res_model 73 | -------------------------------------------------------------------------------- /src/model/han_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import * 2 | from keras.layers import * 3 | from model.model_basic import BasicDeepModel 4 | from model.model_component import AttLayer 5 | from model.model_component import AttentionWithContext 6 | 7 | 8 | class HANModel(BasicDeepModel): 9 | def __init__(self, name='basicModel', num_flods=5, config=None): 10 | name = 'han' + config.main_feature 11 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 12 | 13 | def create_model(self): 14 | 15 | if self.config.main_feature == 'word': 16 | input = Input(shape=(self.config.HANN_WORD_LEN,), dtype='int32') 17 | else: 18 | input = Input(shape=(self.config.HANN_CHAR_LEN,), dtype='int32') 19 | 20 | mask = Masking(mask_value=self.mask_value)(input) 21 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 22 | x = embedding(mask) 23 | x = SpatialDropout1D(0.5)(x) 24 | x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x) 25 | l_att = AttLayer(100)(x) 26 | # l_att = AttentionWithContext()(x) 27 | sentEncoder = Model(input, l_att) 28 | 29 | if self.config.main_feature == 'word': 30 | word_input = Input(shape=(self.config.HANN_SENT, self.config.HANN_WORD_LEN), name='hann_word') 31 | word_encoder = TimeDistributed(sentEncoder)(word_input) 32 | word_sent_lstm = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_encoder) 33 | # x = AttLayer(100)(word_sent_lstm) 34 | x = AttentionWithContext()(word_sent_lstm) 35 | x = Dropout(0.2)(x) 36 | if self.config.data_type == 3: 37 | dense = Dense(self.n_class, activation="sigmoid")(x) 38 | else: 39 | dense = Dense(self.n_class, activation="softmax")(x) 40 | model = Model(word_input, dense) 41 | else: 42 | char_input = Input(shape=(self.config.HANN_SENT, self.config.HANN_CHAR_LEN), name='hann_char') 43 | char_encoder = TimeDistributed(sentEncoder)(char_input) 44 | char_sent_lstm = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(char_encoder) 45 | x = AttLayer(100)(char_sent_lstm) 46 | # x = AttentionWithContext()(char_sent_lstm) 47 | x = Dropout(0.2)(x) 48 | if self.config.data_type == 3: 49 | dense = Dense(self.n_class, activation="sigmoid")(x) 50 | else: 51 | dense = Dense(self.n_class, 
activation="softmax")(x) 52 | model = Model(char_input, dense) 53 | return model 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/model/hybrid_nn_1.py: -------------------------------------------------------------------------------- 1 | from keras.models import * 2 | from keras.layers import * 3 | from keras import backend as K 4 | from model.model_basic import BasicDeepModel 5 | from model.model_component import AttLayer 6 | from model.model_component import Capsule 7 | 8 | 9 | class HybridNN1Model(BasicDeepModel): 10 | def __init__(self, name='basicModel', num_flods=5, config=None): 11 | name = 'hybridnn1' + config.main_feature 12 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 13 | 14 | def create_model(self): 15 | if self.main_feature == 'char': 16 | input = Input(shape=(self.max_len,), name='char') 17 | else: 18 | input = Input(shape=(self.max_len,), name='word') 19 | 20 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 21 | x = Masking(mask_value=self.mask_value)(input) 22 | x = embedding(x) 23 | 24 | x = SpatialDropout1D(0.5)(x) 25 | x = GRU(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(x) # ?? 26 | capsule1 = Capsule(19, 17, 5)(x) 27 | capsule1 = Flatten()(capsule1) 28 | capsule2 = Capsule(19, 16, 5)(x) 29 | capsule2 = Flatten()(capsule2) 30 | output = concatenate([capsule1, capsule2]) 31 | 32 | output = Dense(256)(output) 33 | output = BatchNormalization()(output) 34 | output = Activation('relu')(output) 35 | output = Dropout(0.2)(output) 36 | 37 | output = Dense(256)(output) 38 | output = BatchNormalization()(output) 39 | output = Activation('relu')(output) 40 | x = Dropout(0.2)(output) 41 | 42 | if self.config.data_type == 3: 43 | dense = Dense(self.n_class, activation="sigmoid")(x) 44 | else: 45 | dense = Dense(self.n_class, activation="softmax")(x) 46 | model = Model(inputs=[input], output=dense) 47 | 48 | return model 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/model/lightgbm_model.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgbm 2 | from model.model_basic import BasicStaticModel 3 | 4 | class LightGbmModel(BasicStaticModel): 5 | def __init__(self, num_folds=5, config=None): 6 | lgbm_params = {'objective': 'multiclass', 7 | 'bagging_seed': 10, 8 | 'boosting_type': 'gbdt', 9 | 'feature_fraction': 0.9, 10 | 'feature_fraction_seed': 10, 11 | 'lambda_l1': 0.5, 12 | 'lambda_l2': 0.5, 13 | 'learning_rate': 0.01, 14 | 'metric': 'multi_logloss', 15 | 'min_child_weight': 1, 16 | # 'min_split_gain': 0, 17 | 'device': 'gpu', 18 | 'gpu_platform_id': 0, 19 | 'gpu_device_id': config.gpu, 20 | 'min_sum_hessian_in_leaf': 0.1, 21 | 'num_leaves': 64, 22 | 'num_thread': -1, 23 | 'num_class': config.n_class, 24 | 'verbose': 1} 25 | self.config = config 26 | BasicStaticModel.__init__(self, lgbm_params, num_folds, 'lightGBM', n_class=config.n_class) 27 | 28 | def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test): 29 | 30 | dtrain = lgbm.Dataset(kfold_X_train, label=y_train) 31 | dwatch = lgbm.Dataset(kfold_X_valid, label=y_test) 32 | 33 | best = lgbm.train(self.params, dtrain, num_boost_round=300, verbose_eval=10, valid_sets=dwatch, 34 | early_stopping_rounds=10) 35 | # 对验证集predict 36 | 37 | pred = best.predict(kfold_X_valid) 38 | results = best.predict(test) 39 | 40 | return pred, results, 
best 41 | 42 | -------------------------------------------------------------------------------- /src/model/lstmconv_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn 6 | 7 | n_sub = 10 8 | n_filters = 100 9 | 10 | 11 | class LstmconvModel(BasicDeepModel): 12 | def __init__(self, name='basicModel', n_folds=5, config=None): 13 | name = 'lstmconv' + config.main_feature 14 | self.hidden_dim = 300 15 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 16 | 17 | def LSTM(self, layers=1): 18 | lstms = [] 19 | for num in range(layers): 20 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 21 | print(lstm.name) 22 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 23 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 24 | lstms.append(lstm) 25 | 26 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 27 | return lstms 28 | 29 | def create_model(self, share_dense=True, concat_sub=True): 30 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 31 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 32 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 33 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 34 | 35 | if self.main_feature.lower() in ['word', 'char']: 36 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 37 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 38 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 39 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 40 | 41 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 42 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 43 | if self.main_feature == 'elmo_word': 44 | options_file = self.config.elmo_word_options_file 45 | weight_file = self.config.elmo_word_weight_file 46 | embed_file = self.config.elmo_word_embed_file 47 | elif self.main_feature == 'elmo_char': 48 | options_file = self.config.elmo_char_options_file 49 | weight_file = self.config.elmo_char_weight_file 50 | embed_file = self.config.elmo_char_embed_file 51 | elif self.main_feature == 'elmo_qiuqiu': 52 | options_file = self.config.elmo_qiuqiu_options_file 53 | weight_file = self.config.elmo_qiuqiu_weight_file 54 | embed_file = self.config.elmo_qiuqiu_embed_file 55 | self.bilm = BidirectionalLanguageModel(options_file, 56 | weight_file, 57 | use_character_inputs=False, 58 | embedding_weight_file=embed_file, 59 | max_batch_size=self.batch_size) 60 | bilm_embedding_op = self.bilm(self.input_x) 61 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 62 | self.word_encoding = bilm_embedding['weighted_op'] 63 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 64 | 65 | else: 66 | exit('wrong feature') 67 | 68 | c_outputs = [] 69 | for c in range(n_sub): 70 | with tf.variable_scope('lstm-{}'.format(c)): 71 | # self.forward = self.LSTM() 72 | # self.backward = self.LSTM() 73 | # x, _ = 
tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 74 | # x = tf.concat(x, -1) 75 | #### cudnn lstm #### 76 | self.forward = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 77 | x, _ = self.forward(tf.transpose(self.word_encoding, [1, 0, 2])) 78 | x = tf.transpose(x, [1, 0, 2]) 79 | 80 | with tf.variable_scope('conv-{}'.format(c)): 81 | inputs_expanded = tf.expand_dims(x, -1) 82 | filter_shape = [3, 2*self.hidden_dim, 1, n_filters] 83 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W') 84 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 85 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d') 86 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 87 | max_pooled = tf.nn.max_pool(h, 88 | ksize=[1, self.max_len-3+1, 1, 1], 89 | strides=[1, 1, 1, 1], 90 | padding='VALID', 91 | name='max_pool') 92 | avg_pooled = tf.nn.avg_pool(h, 93 | ksize=[1, self.max_len-3+1, 1, 1], 94 | strides=[1, 1, 1, 1], 95 | padding='VALID', 96 | name='avg_pool') 97 | concat_pooled = tf.reshape(tf.concat((max_pooled, avg_pooled), -1), [-1, 2*n_filters]) 98 | 99 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob) 100 | dense = tf.layers.dense(concat_pooled, 4, activation=None) 101 | c_outputs.append(dense) 102 | 103 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 104 | y_ = tf.nn.softmax(self.logits) 105 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 106 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 107 | 108 | if not self.config.balance: 109 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 110 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 111 | else: 112 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 113 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 114 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 115 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 116 | class0_weight = 1 # 第0类的权重系数 117 | class1_weight = 3 # 第1类的权重系数 118 | class2_weight = 3 # 第2类的权重系数 119 | class3_weight = 3 # 第3类的权重系数 120 | # coe = tf.constant([1., 1., 1., 1.]) 121 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 122 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 123 | 124 | y = tf.reshape(self.input_y, [-1, 4]) 125 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 126 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 127 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 128 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 129 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 130 | 131 | return self 132 | 133 | def create_model_v1(self, share_dense=True, concat_sub=True): 134 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 135 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 136 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 137 | 138 | if self.main_feature.lower() in ['word', 'char']: 139 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 140 | self.word_embedding = 
tf.get_variable(initializer=self.embedding, name='word_embedding') 141 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 142 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 143 | 144 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 145 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 146 | if self.main_feature == 'elmo_word': 147 | options_file = self.config.elmo_word_options_file 148 | weight_file = self.config.elmo_word_weight_file 149 | embed_file = self.config.elmo_word_embed_file 150 | elif self.main_feature == 'elmo_char': 151 | options_file = self.config.elmo_char_options_file 152 | weight_file = self.config.elmo_char_weight_file 153 | embed_file = self.config.elmo_char_embed_file 154 | elif self.main_feature == 'elmo_qiuqiu': 155 | options_file = self.config.elmo_qiuqiu_options_file 156 | weight_file = self.config.elmo_qiuqiu_weight_file 157 | embed_file = self.config.elmo_qiuqiu_embed_file 158 | 159 | self.bilm = BidirectionalLanguageModel(options_file, 160 | weight_file, 161 | use_character_inputs=False, 162 | embedding_weight_file=embed_file, 163 | max_batch_size=self.batch_size) 164 | bilm_embedding_op = self.bilm(self.input_x) 165 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 166 | self.word_encoding = bilm_embedding['weighted_op'] 167 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 168 | 169 | else: 170 | exit('wrong feature') 171 | 172 | self.forward = self.LSTM() 173 | self.backward = self.LSTM() 174 | x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 175 | x = tf.concat(x, -1) 176 | 177 | inputs_expanded = tf.expand_dims(x, -1) 178 | filter_shape = [3, 2*self.hidden_dim, 1, n_filters] 179 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W') 180 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 181 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d') 182 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 183 | output_sentence = tf.reshape(h, [-1, self.max_len-3+1, n_filters]) 184 | 185 | # output_sentence = tf.layers.dense(x, self.hidden_dim, activation=tf.nn.relu) 186 | 187 | x_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len-3+1, n_filters]) 188 | x_tile = tf.tile(x_reshape, [1, n_sub, 1, 1]) # 句子复制n_sub份 189 | 190 | sub_embedding = tf.get_variable(shape=[n_sub, n_filters], name='sub_embedding') 191 | sub_reshape = tf.reshape(sub_embedding, [1, n_sub, 1, n_filters]) 192 | sub_tile = tf.tile(sub_reshape, [self.batch_size, 1, self.max_len-3+1, 1]) 193 | 194 | embed_concat = tf.reshape(tf.concat((x_tile, sub_tile), -1), [-1, 2*n_filters]) 195 | 196 | att_w = tf.get_variable(shape=[2*n_filters, n_filters], name='att_w') 197 | att_b = tf.get_variable(shape=[n_filters], name='att_b') 198 | att_v = tf.get_variable(shape=[n_filters, 1], name='att_v') 199 | 200 | score = tf.matmul(tf.nn.tanh(tf.matmul(embed_concat, att_w) + att_b), att_v) 201 | score_fit = tf.reshape(score, [-1, n_sub, self.max_len-3+1]) 202 | alpha = tf.nn.softmax(score_fit) 203 | 204 | layer_sentence = tf.matmul(alpha, output_sentence) 205 | 206 | if concat_sub: 207 | # 是否拼接layer_sub信息 208 | layer_sub = tf.reshape(sub_embedding, [1, n_sub, n_filters]) 209 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1]) 210 | 211 | layer_total = 
tf.concat((layer_sentence, layer_sub_tile), -1) 212 | outputs = tf.reshape(layer_total, [-1, 2*n_filters]) 213 | else: 214 | outputs = tf.reshape(layer_sentence, [-1, n_filters]) 215 | 216 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None) 217 | y_ = tf.nn.softmax(self.logits) 218 | self.prob = tf.reshape(y_, [-1, 10, 4]) 219 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 220 | 221 | if not self.config.balance: 222 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 223 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 224 | else: 225 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 226 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 227 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 228 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 229 | class0_weight = 0.7 # 第0类的权重系数 230 | class1_weight = 1.3 # 第1类的权重系数 231 | class2_weight = 1 # 第2类的权重系数 232 | class3_weight = 1.3 # 第3类的权重系数 233 | # coe = tf.constant([1., 1., 1., 1.]) 234 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 235 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 236 | 237 | y = tf.reshape(self.input_y, [-1, 4]) 238 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 239 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 240 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 241 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 242 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 243 | 244 | return self 245 | 246 | 247 | -------------------------------------------------------------------------------- /src/model/lstmgru_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn 6 | 7 | n_sub = 10 8 | 9 | 10 | class LstmgruModel(BasicDeepModel): 11 | def __init__(self, name='basicModel', n_folds=5, config=None): 12 | name = 'lstmgru' + config.main_feature 13 | self.hidden_dim = 300 14 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 15 | 16 | def create_model(self, share_dense=True, concat_sub=True): 17 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 18 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 19 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 20 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 21 | 22 | if self.main_feature.lower() in ['word', 'char']: 23 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 24 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 25 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 26 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 27 | 28 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 29 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 30 | if self.main_feature 
== 'elmo_word': 31 | options_file = self.config.elmo_word_options_file 32 | weight_file = self.config.elmo_word_weight_file 33 | embed_file = self.config.elmo_word_embed_file 34 | elif self.main_feature == 'elmo_char': 35 | options_file = self.config.elmo_char_options_file 36 | weight_file = self.config.elmo_char_weight_file 37 | embed_file = self.config.elmo_char_embed_file 38 | elif self.main_feature == 'elmo_qiuqiu': 39 | options_file = self.config.elmo_qiuqiu_options_file 40 | weight_file = self.config.elmo_qiuqiu_weight_file 41 | embed_file = self.config.elmo_qiuqiu_embed_file 42 | 43 | self.bilm = BidirectionalLanguageModel(options_file, 44 | weight_file, 45 | use_character_inputs=False, 46 | embedding_weight_file=embed_file, 47 | max_batch_size=self.batch_size) 48 | bilm_embedding_op = self.bilm(self.input_x) 49 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 50 | self.word_encoding = bilm_embedding['weighted_op'] 51 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 52 | 53 | else: 54 | exit('wrong feature') 55 | 56 | c_outputs = [] 57 | for c in range(n_sub): 58 | with tf.variable_scope('lstm-{}'.format(c)): 59 | # self.forward = self.LSTM() 60 | # self.backward = self.LSTM() 61 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 62 | # x = tf.concat(x, -1) 63 | #### cudnn lstm #### 64 | self.forward_lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 65 | self.forward_gru = cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 66 | x, _ = self.forward_lstm(tf.transpose(self.word_encoding, [1, 0, 2])) 67 | x, _ = self.forward_gru(x) 68 | x = tf.transpose(x, [1, 0, 2]) 69 | 70 | with tf.variable_scope('pooling-{}'.format(c)): 71 | max_pooled = tf.reshape(tf.reduce_max(x, 1), [-1, 2*self.hidden_dim]) 72 | avg_pooled = tf.reshape(tf.reduce_mean(x, 1), [-1, 2*self.hidden_dim]) 73 | concat_pooled = tf.concat((max_pooled, avg_pooled), -1) 74 | 75 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob) 76 | dense = tf.layers.dense(concat_pooled, 4, activation=None) 77 | c_outputs.append(dense) 78 | 79 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 80 | y_ = tf.nn.softmax(self.logits) 81 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 82 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 83 | 84 | if not self.config.balance: 85 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 86 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 87 | else: 88 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 89 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 90 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 91 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 92 | class0_weight = 1 # 第0类的权重系数 93 | class1_weight = 3 # 第1类的权重系数 94 | class2_weight = 3 # 第2类的权重系数 95 | class3_weight = 3 # 第3类的权重系数 96 | # coe = tf.constant([1., 1., 1., 1.]) 97 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 98 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 99 | 100 | y = tf.reshape(self.input_y, [-1, 4]) 101 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 102 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 103 | 
-class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 104 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 105 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 106 | 107 | return self 108 | 109 | -------------------------------------------------------------------------------- /src/model/ml_models.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicStaticModel 2 | from sklearn import svm 3 | from sklearn.svm import SVC 4 | from sklearn.naive_bayes import MultinomialNB 5 | from sklearn.calibration import CalibratedClassifierCV 6 | from sklearn.metrics import f1_score 7 | from skift import FirstColFtClassifier 8 | 9 | import logging 10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s') 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class SVCClassifier(BasicStaticModel): 15 | 16 | def __init__(self, name='basicModel', n_folds=5, config=None): 17 | BasicStaticModel.__init__(self, name=name, n_folds=n_folds, config=config) 18 | 19 | def create_model(self): 20 | classifier = SVC(kernel="rbf") 21 | classifier = CalibratedClassifierCV(classifier) 22 | classifier = SVC(kernel="linear") 23 | self.classifier = classifier 24 | self.classifier = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6) 25 | return self.classifier 26 | 27 | 28 | class Fasttext(BasicStaticModel): 29 | def __init__(self, name='basicModel', n_folds=5, config=None): 30 | BasicStaticModel.__init__(self, name=name, n_folds=n_folds, config=config) 31 | 32 | def create_model(self): 33 | sk_clf = FirstColFtClassifier(lr=1.0, epoch=10, 34 | wordNgrams=1, 35 | minCount=5, verbose=2) 36 | return sk_clf 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/model/model_component.py: -------------------------------------------------------------------------------- 1 | from keras.layers import * 2 | from keras.models import * 3 | 4 | 5 | class AttLayer(Layer): 6 | def __init__(self, attention_dim): 7 | self.init = initializers.get('normal') 8 | self.supports_masking = True 9 | self.attention_dim = attention_dim 10 | super(AttLayer, self).__init__() 11 | 12 | def build(self, input_shape): 13 | assert len(input_shape) == 3 14 | self.W = K.variable(self.init((input_shape[-1], self.attention_dim))) 15 | self.b = K.variable(self.init((self.attention_dim, ))) 16 | self.u = K.variable(self.init((self.attention_dim, 1))) 17 | self.trainable_weights = [self.W, self.b, self.u] 18 | super(AttLayer, self).build(input_shape) 19 | 20 | def compute_mask(self, inputs, mask=None): 21 | return mask 22 | 23 | def call(self, x, mask=None): 24 | # size of x :[batch_size, sel_len, attention_dim] 25 | # size of u :[batch_size, attention_dim] 26 | # uit = tanh(xW+b) 27 | uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b)) 28 | ait = K.dot(uit, self.u) 29 | ait = K.squeeze(ait, -1) 30 | 31 | ait = K.exp(ait) 32 | 33 | if mask is not None: 34 | # Cast the mask to floatX to avoid float64 upcasting in theano 35 | ait *= K.cast(mask, K.floatx()) 36 | ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 37 | ait = K.expand_dims(ait) 38 | weighted_input = x * ait 39 | output = K.sum(weighted_input, axis=1) 40 | 41 | return output 42 | 43 | def compute_output_shape(self, input_shape): 44 | return (input_shape[0], input_shape[-1]) 45 | 46 
| 47 | class AttentionWeightedAverage(Layer): 48 | """ 49 | Computes a weighted average of the different channels across timesteps. 50 | Uses 1 parameter pr. channel to compute the attention value for a single timestep. 51 | """ 52 | 53 | def __init__(self, return_attention=False, **kwargs): 54 | self.init = initializers.get('uniform') 55 | self.supports_masking = True 56 | self.return_attention = return_attention 57 | super(AttentionWeightedAverage, self).__init__(**kwargs) 58 | 59 | def build(self, input_shape): 60 | self.input_spec = [InputSpec(ndim=3)] 61 | assert len(input_shape) == 3 62 | 63 | self.W = self.add_weight(shape=(input_shape[2], 1), 64 | name='{}_W'.format(self.name), 65 | initializer=self.init) 66 | self.trainable_weights = [self.W] 67 | super(AttentionWeightedAverage, self).build(input_shape) 68 | 69 | def call(self, x, mask=None): 70 | # computes a probability distribution over the timesteps 71 | # uses 'max trick' for numerical stability 72 | # reshape is done to avoid issue with Tensorflow 73 | # and 1-dimensional weights 74 | logits = K.dot(x, self.W) 75 | x_shape = K.shape(x) 76 | logits = K.reshape(logits, (x_shape[0], x_shape[1])) 77 | ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True)) 78 | 79 | # masked timesteps have zero weight 80 | if mask is not None: 81 | mask = K.cast(mask, K.floatx()) 82 | ai = ai * mask 83 | att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon()) 84 | weighted_input = x * K.expand_dims(att_weights) 85 | result = K.sum(weighted_input, axis=1) 86 | if self.return_attention: 87 | return [result, att_weights] 88 | return result 89 | 90 | def get_output_shape_for(self, input_shape): 91 | return self.compute_output_shape(input_shape) 92 | 93 | def compute_output_shape(self, input_shape): 94 | output_len = input_shape[2] 95 | if self.return_attention: 96 | return [(input_shape[0], output_len), (input_shape[0], input_shape[1])] 97 | return (input_shape[0], output_len) 98 | 99 | def compute_mask(self, input, input_mask=None): 100 | if isinstance(input_mask, list): 101 | return [None] * len(input_mask) 102 | else: 103 | return None 104 | 105 | def squash(x, axis=-1): 106 | # s_squared_norm is really small 107 | # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon() 108 | # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm) 109 | # return scale * x 110 | s_squared_norm = K.sum(K.square(x), axis, keepdims=True) 111 | scale = K.sqrt(s_squared_norm + K.epsilon()) 112 | return x / scale 113 | 114 | 115 | # A Capsule Implement with Pure Keras 116 | class Capsule(Layer): 117 | def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, 118 | activation='default', **kwargs): 119 | super(Capsule, self).__init__(**kwargs) 120 | self.num_capsule = num_capsule 121 | self.dim_capsule = dim_capsule 122 | self.routings = routings 123 | self.kernel_size = kernel_size 124 | self.share_weights = share_weights 125 | if activation == 'default': 126 | self.activation = squash 127 | else: 128 | self.activation = Activation(activation) 129 | 130 | def build(self, input_shape): 131 | super(Capsule, self).build(input_shape) 132 | input_dim_capsule = input_shape[-1] 133 | if self.share_weights: 134 | self.W = self.add_weight(name='capsule_kernel', 135 | shape=(1, input_dim_capsule, 136 | self.num_capsule * self.dim_capsule), 137 | # shape=self.kernel_size, 138 | initializer='glorot_uniform', 139 | trainable=True) 140 | else: 141 | input_num_capsule = input_shape[-2] 142 | self.W = 
self.add_weight(name='capsule_kernel', 143 | shape=(input_num_capsule, 144 | input_dim_capsule, 145 | self.num_capsule * self.dim_capsule), 146 | initializer='glorot_uniform', 147 | trainable=True) 148 | 149 | def call(self, u_vecs): 150 | if self.share_weights: 151 | u_hat_vecs = K.conv1d(u_vecs, self.W) 152 | else: 153 | u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1]) 154 | 155 | batch_size = K.shape(u_vecs)[0] 156 | input_num_capsule = K.shape(u_vecs)[1] 157 | u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule, 158 | self.num_capsule, self.dim_capsule)) 159 | u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3)) 160 | # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule] 161 | 162 | b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule] 163 | for i in range(self.routings): 164 | b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule] 165 | c = K.softmax(b) 166 | c = K.permute_dimensions(c, (0, 2, 1)) 167 | b = K.permute_dimensions(b, (0, 2, 1)) 168 | outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2])) 169 | if i < self.routings - 1: 170 | b = K.batch_dot(outputs, u_hat_vecs, [2, 3]) 171 | 172 | return outputs 173 | 174 | def compute_output_shape(self, input_shape): 175 | return (None, self.num_capsule, self.dim_capsule) 176 | 177 | def dot_product(x, kernel): 178 | """ 179 | Wrapper for dot product operation, in order to be compatible with both 180 | Theano and Tensorflow 181 | Args: 182 | x (): input 183 | kernel (): weights 184 | Returns: 185 | """ 186 | if K.backend() == 'tensorflow': 187 | return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1) 188 | else: 189 | return K.dot(x, kernel) 190 | 191 | 192 | class AttentionWithContext(Layer): 193 | """ 194 | Attention operation, with a context/query vector, for temporal data. 195 | Supports Masking. 196 | Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] 197 | "Hierarchical Attention Networks for Document Classification" 198 | by using a context vector to assist the attention 199 | # Input shape 200 | 3D tensor with shape: `(samples, steps, features)`. 201 | # Output shape 202 | 2D tensor with shape: `(samples, features)`. 203 | How to use: 204 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. 205 | The dimensions are inferred based on the output shape of the RNN. 206 | Note: The layer has been tested with Keras 2.0.6 207 | Example: 208 | model.add(LSTM(64, return_sequences=True)) 209 | model.add(AttentionWithContext()) 210 | # next add a Dense layer (for classification/regression) or whatever... 
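        For example, a hypothetical 4-way classification head (illustrative only, not a
        layer defined in this repo; pick the output size for your own task):
        model.add(Dense(4, activation='softmax'))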
211 | """ 212 | 213 | def __init__(self, 214 | W_regularizer=None, u_regularizer=None, b_regularizer=None, 215 | W_constraint=None, u_constraint=None, b_constraint=None, 216 | bias=True, **kwargs): 217 | 218 | self.supports_masking = True 219 | self.init = initializers.get('glorot_uniform') 220 | 221 | self.W_regularizer = regularizers.get(W_regularizer) 222 | self.u_regularizer = regularizers.get(u_regularizer) 223 | self.b_regularizer = regularizers.get(b_regularizer) 224 | 225 | self.W_constraint = constraints.get(W_constraint) 226 | self.u_constraint = constraints.get(u_constraint) 227 | self.b_constraint = constraints.get(b_constraint) 228 | 229 | self.bias = bias 230 | super(AttentionWithContext, self).__init__(**kwargs) 231 | 232 | def build(self, input_shape): 233 | assert len(input_shape) == 3 234 | 235 | self.W = self.add_weight((input_shape[-1], input_shape[-1],), 236 | initializer=self.init, 237 | name='{}_W'.format(self.name), 238 | regularizer=self.W_regularizer, 239 | constraint=self.W_constraint) 240 | if self.bias: 241 | self.b = self.add_weight((input_shape[-1],), 242 | initializer='zero', 243 | name='{}_b'.format(self.name), 244 | regularizer=self.b_regularizer, 245 | constraint=self.b_constraint) 246 | 247 | self.u = self.add_weight((input_shape[-1],), 248 | initializer=self.init, 249 | name='{}_u'.format(self.name), 250 | regularizer=self.u_regularizer, 251 | constraint=self.u_constraint) 252 | 253 | super(AttentionWithContext, self).build(input_shape) 254 | 255 | def compute_mask(self, input, input_mask=None): 256 | # do not pass the mask to the next layers 257 | return None 258 | 259 | def call(self, x, mask=None): 260 | uit = dot_product(x, self.W) 261 | 262 | if self.bias: 263 | uit += self.b 264 | 265 | uit = K.tanh(uit) 266 | ait = K.dot(uit, self.u) 267 | 268 | a = K.exp(ait) 269 | 270 | # apply mask after the exp. will be re-normalized next 271 | if mask is not None: 272 | # Cast the mask to floatX to avoid float64 upcasting in theano 273 | a *= K.cast(mask, K.floatx()) 274 | 275 | # in some cases especially in the early stages of training the sum may be almost zero 276 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. 
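        # Added note (not in the original code): K.epsilon() is a tiny constant (1e-7 by
        # default), so for a fully-masked row where a == [0, 0, 0] the denominator becomes
        # roughly 1e-7 and the attention weights stay an all-zero vector, whereas the
        # commented-out line below would divide 0 by 0 and propagate NaNs into the weighted sum.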
277 | # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) 278 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 279 | 280 | a = K.expand_dims(a) 281 | weighted_input = x * a 282 | return K.sum(weighted_input, axis=1) 283 | 284 | def compute_output_shape(self, input_shape): 285 | return input_shape[0], input_shape[-1] 286 | -------------------------------------------------------------------------------- /src/model/my_callbacks.py: -------------------------------------------------------------------------------- 1 | import keras as keras 2 | from keras import backend as K 3 | import numpy as np 4 | import warnings 5 | import glob 6 | import os 7 | from keras.models import load_model 8 | import pickle 9 | 10 | 11 | class JZTrainCategory(keras.callbacks.Callback): 12 | def __init__(self, filepath, nb_epochs=20, nb_snapshots=1, monitor='val_loss', factor=0.1, verbose=1, patience=1, 13 | save_weights_only=False, 14 | decay_factor_value=1.0, 15 | mode='auto', period=1): 16 | super(JZTrainCategory, self).__init__() 17 | self.nb_epochs = nb_epochs 18 | self.monitor = monitor 19 | self.verbose = verbose 20 | self.filepath = filepath 21 | self.init_factor = factor 22 | self.decay_factor_value = decay_factor_value 23 | self.factor = factor 24 | self.save_weights_only = save_weights_only 25 | self.patience = patience 26 | self.r_patience = 0 27 | self.check = nb_epochs // nb_snapshots 28 | self.monitor_val_list = [] 29 | if mode not in ['auto', 'min', 'max']: 30 | warnings.warn('ModelCheckpoint mode %s is unknown, ' 31 | 'fallback to auto mode.' % (mode), 32 | RuntimeWarning) 33 | mode = 'auto' 34 | if mode == 'min': 35 | self.monitor_op = np.less 36 | self.init_best = np.Inf 37 | elif mode == 'max': 38 | self.monitor_op = np.greater 39 | self.init_best = -np.Inf 40 | else: 41 | if 'acc' in self.monitor or self.monitor.startswith('fmeasure'): 42 | self.monitor_op = np.greater 43 | self.init_best = -np.Inf 44 | else: 45 | self.monitor_op = np.less 46 | self.init_best = np.Inf 47 | 48 | @staticmethod 49 | def compile_official_f1_score(y_true, y_pred): 50 | y_true = K.reshape(y_true, (-1, 10)) 51 | y_true = K.cast(y_true, 'float32') 52 | y_pred = K.round(y_pred) 53 | 54 | tp = K.sum(y_pred * y_true) 55 | fp = K.sum(K.cast(K.greater(y_pred - y_true, 0.), 'float32')) 56 | fn = K.sum(K.cast(K.greater(y_true - y_pred, 0.), 'float32')) 57 | p = tp / (tp + fp) 58 | r = tp / (tp + fn) 59 | f = 2*p*r/(p+r) 60 | return f 61 | 62 | def on_batch_begin(self, batch, logs={}): 63 | return 64 | 65 | def on_batch_end(self, batch, logs={}): 66 | return 67 | 68 | def on_train_end(self, logs={}): 69 | return 70 | 71 | def on_train_begin(self, logs={}): 72 | self.init_lr = K.get_value(self.model.optimizer.lr) 73 | self.best = self.init_best 74 | return 75 | 76 | def on_epoch_begin(self, epoch, logs=None): 77 | return 78 | 79 | def on_epoch_end(self, epoch, logs=None): 80 | logs = logs or {} 81 | logs['lr'] = K.get_value(self.model.optimizer.lr) 82 | 83 | n_recurrent = epoch // self.check 84 | self.save_path = '{}/{}.h5'.format(self.filepath, n_recurrent) 85 | os.makedirs(self.filepath, exist_ok=True) 86 | current = logs.get(self.monitor) 87 | if current is None: 88 | warnings.warn('Can save best model only with %s available, ' 89 | 'skipping.' 
% (self.monitor), RuntimeWarning) 90 | 91 | else: 92 | if self.monitor_op(current, self.best): 93 | # if better result: save model 94 | self.r_patience = 0 95 | if self.verbose > 0: 96 | print('\nEpoch %05d: %s improved from %0.5f to %0.5f,' 97 | ' saving model to %s' 98 | % (epoch + 1, self.monitor, self.best, 99 | current, self.save_path)) 100 | self.best = current 101 | if self.save_weights_only: 102 | self.model.save_weights(self.save_path) 103 | # pickle.dump(self.model.get_weights(), open('./debug_weight.pkl', 'wb')) 104 | symbolic_weights = getattr(self.model.optimizer, 'weights') 105 | weight_values = K.batch_get_value(symbolic_weights) 106 | with open('{}/optimizer.pkl'.format(self.filepath), 'wb') as f: 107 | pickle.dump(weight_values, f) 108 | else: 109 | self.model.save(self.save_path) 110 | 111 | else: 112 | # if worse resule: reload last best model saved 113 | self.r_patience += 1 114 | if self.verbose > 0: 115 | if self.r_patience == self.patience: 116 | print('\nEpoch %05d: %s did not improve from %0.5f' % 117 | (epoch + 1, self.monitor, self.best)) 118 | if self.save_weights_only: 119 | self.model.load_weights(self.save_path) 120 | self.model._make_train_function() 121 | with open('{}/optimizer.pkl'.format(self.filepath), 'rb') as f: 122 | weight_values = pickle.load(f) 123 | self.model.optimizer.set_weights(weight_values) 124 | else: 125 | self.model = load_model(self.save_path, custom_objects={'compile_official_f1_score': JZTrainCategory.compile_official_f1_score}) 126 | # set new learning rate 127 | old_lr = K.get_value(self.model.optimizer.lr) 128 | new_lr = old_lr * self.factor 129 | self.factor *= self.decay_factor_value # 衰减系数衰减 130 | K.set_value(self.model.optimizer.lr, new_lr) 131 | print('\nReload model and decay learningrate from {} to {}\n'.format(old_lr, new_lr)) 132 | self.r_patience = 0 133 | 134 | if (epoch+1) % self.check == 0: 135 | self.monitor_val_list.append(self.best) 136 | self.best = self.init_best 137 | self.factor = self.init_factor 138 | 139 | if (epoch+1) != self.nb_epochs: 140 | K.set_value(self.model.optimizer.lr, self.init_lr) 141 | print('At epoch-{} reset learning rate to mountain-top init lr {}'.format(epoch+1, self.init_lr)) 142 | 143 | -------------------------------------------------------------------------------- /src/model/rcnn_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 3 | from bilm.elmo import weight_layers 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from tensorflow.contrib import rnn 8 | import tensorflow.contrib.layers as layers 9 | 10 | filter_sizes = [1, 2, 3, 4] 11 | n_filter = 128 12 | hidden_size = 300 13 | n_sub = 10 14 | n_sent = 4 15 | 16 | 17 | class RCNNModel(BasicDeepModel): 18 | def __init__(self, name='basicModel', n_folds=10, config=None): 19 | name = 'RCNN' + config.main_feature 20 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 21 | 22 | def create_model(self, share_dense=True): 23 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y') 24 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 25 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 26 | 27 | if self.main_feature.lower() in ['word', 'char']: 28 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.max_len], name='input_x') 29 | 
self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 30 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 31 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 32 | 33 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 34 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 35 | if self.main_feature == 'elmo_word': 36 | options_file = self.config.elmo_word_options_file 37 | weight_file = self.config.elmo_word_weight_file 38 | embed_file = self.config.elmo_word_embed_file 39 | elif self.main_feature == 'elmo_char': 40 | options_file = self.config.elmo_char_options_file 41 | weight_file = self.config.elmo_char_weight_file 42 | embed_file = self.config.elmo_char_embed_file 43 | elif self.main_feature == 'elmo_qiuqiu': 44 | options_file = self.config.elmo_qiuqiu_options_file 45 | weight_file = self.config.elmo_qiuqiu_weight_file 46 | embed_file = self.config.elmo_qiuqiu_embed_file 47 | 48 | self.bilm = BidirectionalLanguageModel(options_file, 49 | weight_file, 50 | use_character_inputs=False, 51 | embedding_weight_file=embed_file, 52 | max_batch_size=self.batch_size) 53 | bilm_embedding_op = self.bilm(self.input_x) 54 | bilm_embedding = weight_layers('output', bilm_embedding_op, l2_coef=0.0) 55 | self.word_encoding = bilm_embedding['weighted_op'] 56 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new 57 | 58 | else: 59 | exit('wrong feature') 60 | 61 | rcnn_outputs = [] 62 | for i in range(n_sub): 63 | with tf.variable_scope('rcnn_output_%d' % i): 64 | output_bigru = self.bi_gru(self.word_encoding, hidden_size) 65 | output = self.textcnn(output_bigru, self.max_len) 66 | rcnn_outputs.append(output) 67 | 68 | n_filter_total = n_filter * len(filter_sizes) 69 | outputs = tf.reshape(tf.concat(rcnn_outputs, 1), (-1, n_sub, n_filter_total)) 70 | 71 | if share_dense: 72 | cnn_outputs = tf.reshape(outputs, (-1, n_filter_total)) 73 | W = tf.get_variable('W', shape=[n_filter_total, self.n_classes]) 74 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 75 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 76 | else: 77 | cnn_outputs = tf.reshape(tf.concat(outputs, 1), (-1, n_sub, n_filter_total)) 78 | W = tf.get_variable('W', shape=[self.batch_size, n_filter_total, self.n_classes]) 79 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 80 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 81 | 82 | y_ = tf.nn.softmax(self.logits) 83 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 84 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 85 | 86 | if not self.config.balance: 87 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 88 | else: 89 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 90 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 91 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 92 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 93 | class0_weight = 1 # 第0类的权重系数 94 | class1_weight = 3 # 第1类的权重系数 95 | class2_weight = 3 # 第2类的权重系数 96 | class3_weight = 3 # 第3类的权重系数 97 | # coe = tf.constant([1., 1., 1., 1.]) 98 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 99 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 100 | 101 | y = tf.reshape(self.input_y, [-1, 4]) 102 | self.loss = tf.reduce_mean(-class0_weight * 
(y[:, 0]*tf.log(y_[:, 0])) 103 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 104 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 105 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 106 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 107 | 108 | return self 109 | 110 | def textcnn(self, cnn_inputs, n_step): 111 | # cnn_inputs = [batch_size, n_step, -1] 112 | inputs = tf.expand_dims(cnn_inputs, -1) 113 | pooled_outputs = [] 114 | for i, filter_size in enumerate(filter_sizes): 115 | with tf.variable_scope('conv-maxpool-%s' % filter_size): 116 | filter_shape = [filter_size, hidden_size*2+self.embed_size, 1, n_filter] 117 | W_filter = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W_filter') 118 | beta = tf.get_variable(initializer=tf.constant(0.1, shape=[n_filter]), name='beta') 119 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1]*4, padding='VALID', name='conv') 120 | h = tf.nn.relu(tf.nn.bias_add(conv, beta), name='relu') 121 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 122 | strides=[1]*4, padding='VALID', name='pool') 123 | pooled_outputs.append(pooled) 124 | h_pool = tf.concat(pooled_outputs, 3) 125 | h_pool_flat = tf.reshape(h_pool, [-1, n_filter * len(filter_sizes)]) 126 | h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob) 127 | return h_drop 128 | 129 | def gru_cell(self, hidden_size): 130 | cell = rnn.GRUCell(hidden_size, reuse=tf.get_variable_scope().reuse) 131 | return rnn.DropoutWrapper(cell, output_keep_prob=self.output_keep_prob) 132 | 133 | def bi_gru(self, inputs, hidden_size, res_add=True): 134 | """build the bi-GRU network. Return the encoder represented vector. 135 | X_inputs: [batch_size, n_step] 136 | n_step: 句子的词数量;或者文档的句子数。 137 | outputs: [batch_size, n_step, hidden_size*2+embedding_size(if res_add)] 138 | """ 139 | cells_fw = [self.gru_cell(hidden_size) for _ in range(1)] 140 | cells_bw = [self.gru_cell(hidden_size) for _ in range(1)] 141 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 142 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 143 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 144 | initial_states_fw=initial_states_fw, 145 | initial_states_bw=initial_states_bw, 146 | dtype=tf.float32) 147 | if res_add: 148 | outputs = tf.concat([outputs, inputs], axis=2) 149 | return outputs 150 | 151 | # def batchnorm(self, Ylogits, offset, convolutional=False): 152 | # exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, ) 153 | 154 | -------------------------------------------------------------------------------- /src/model/snapshot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | import keras.callbacks as callbacks 5 | from keras.callbacks import Callback 6 | 7 | class SnapshotModelCheckpoint(Callback): 8 | """Callback that saves the snapshot weights of the model. 9 | Saves the model weights on certain epochs (which can be considered the 10 | snapshot of the model at that epoch). 11 | Should be used with the cosine annealing learning rate schedule to save 12 | the weight just before learning rate is sharply increased. 13 | # Arguments: 14 | nb_epochs: total number of epochs that the model will be trained for. 
15 | nb_snapshots: number of times the weights of the model will be saved. 16 | fn_prefix: prefix for the filename of the weights. 17 | """ 18 | 19 | def __init__(self, nb_epochs, nb_snapshots, fn_prefix='Model'): 20 | super(SnapshotModelCheckpoint, self).__init__() 21 | 22 | self.check = nb_epochs // nb_snapshots 23 | self.fn_prefix = fn_prefix 24 | 25 | def on_epoch_end(self, epoch, logs={}): 26 | if epoch != 0 and (epoch + 1) % self.check == 0: 27 | filepath = self.fn_prefix + "-%d.h5" % ((epoch + 1) // self.check) 28 | self.model.save_weights(filepath, overwrite=True) 29 | # if epoch == 1: 30 | # self.model.get_layer('embedding').trainable = True 31 | # self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 32 | # print('now we begin to train our embeding layers') 33 | # self.model.summary() 34 | 35 | 36 | class SnapshotCallbackBuilder: 37 | """Callback builder for snapshot ensemble training of a model. 38 | Creates a list of callbacks, which are provided when training a model 39 | so as to save the model weights at certain epochs, and then sharply 40 | increase the learning rate. 41 | """ 42 | 43 | def __init__(self, nb_epochs, nb_snapshots, init_lr=0.1): 44 | """ 45 | Initialize a snapshot callback builder. 46 | # Arguments: 47 | nb_epochs: total number of epochs that the model will be trained for. 48 | nb_snapshots: number of times the weights of the model will be saved. 49 | init_lr: initial learning rate 50 | """ 51 | self.T = nb_epochs 52 | self.M = nb_snapshots 53 | self.alpha_zero = init_lr 54 | 55 | def get_callbacks(self, model_save_place='./', model_prefix='Model'): 56 | """ 57 | Creates a list of callbacks that can be used during training to create a 58 | snapshot ensemble of the model. 59 | Args: 60 | model_prefix: prefix for the filename of the weights. 61 | Returns: list of 3 callbacks [ModelCheckpoint, LearningRateScheduler, 62 | SnapshotModelCheckpoint] which can be provided to the 'fit' function 63 | """ 64 | if not os.path.exists(model_save_place): 65 | os.makedirs(model_save_place) 66 | 67 | callback_list = [ 68 | callbacks.LearningRateScheduler(schedule=self._cosine_anneal_schedule), 69 | SnapshotModelCheckpoint(self.T, self.M, fn_prefix='%s/%s' % (model_save_place, model_prefix))] 70 | 71 | return callback_list 72 | 73 | def _cosine_anneal_schedule(self, t): 74 | cos_inner = np.pi * (t % (self.T // self.M)) # t - 1 is used when t has 1-based indexing. 
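# Cosine-annealing schedule used for snapshot ensembling:
#   lr(t) = (alpha_zero / 2) * (cos(pi * (t mod (T // M)) / (T // M)) + 1)
# Each cycle lasts T // M epochs: lr starts at alpha_zero, decays towards 0 at
# the end of the cycle (where SnapshotModelCheckpoint saves a snapshot), then
# jumps back up. With the values used in stacking.py (T=30, M=3,
# alpha_zero=1e-3) a cycle is 10 epochs and lr(0)=1e-3, lr(5)=5e-4,
# lr(9)~=2.4e-5.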
75 | cos_inner /= self.T // self.M 76 | cos_out = np.cos(cos_inner) + 1 77 | alpha = float(self.alpha_zero / 2 * cos_out) 78 | print('lr: {}'.format(alpha)) 79 | return alpha 80 | -------------------------------------------------------------------------------- /src/model/textcnn_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 3 | from bilm.elmo import weight_layers 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | filter_sizes = [1, 2, 3, 4] 9 | n_filters = 128 10 | n_sub = 10 11 | n_sent = 4 12 | 13 | 14 | class TextCNNModel(BasicDeepModel): 15 | 16 | def __init__(self, name='basicModel', n_folds=5, config=None): 17 | name = 'textCNN' + config.main_feature 18 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 19 | 20 | def create_model(self, share_dense=True): 21 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y') 22 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 23 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 24 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 25 | 26 | if self.main_feature.lower() in ['word', 'char']: 27 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 28 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 29 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 30 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 31 | 32 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 33 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 34 | if self.main_feature == 'elmo_word': 35 | options_file = self.config.elmo_word_options_file 36 | weight_file = self.config.elmo_word_weight_file 37 | embed_file = self.config.elmo_word_embed_file 38 | elif self.main_feature == 'elmo_char': 39 | options_file = self.config.elmo_char_options_file 40 | weight_file = self.config.elmo_char_weight_file 41 | embed_file = self.config.elmo_char_embed_file 42 | elif self.main_feature == 'elmo_qiuqiu': 43 | options_file = self.config.elmo_qiuqiu_options_file 44 | weight_file = self.config.elmo_qiuqiu_weight_file 45 | embed_file = self.config.elmo_qiuqiu_embed_file 46 | 47 | self.bilm = BidirectionalLanguageModel(options_file, 48 | weight_file, 49 | use_character_inputs=False, 50 | embedding_weight_file=embed_file, 51 | max_batch_size=self.batch_size) 52 | bilm_embedding_op = self.bilm(self.input_x) 53 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 54 | self.word_encoding = bilm_embedding['weighted_op'] 55 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new 56 | 57 | else: 58 | exit('wrong feature') 59 | 60 | all_input_expanded = tf.expand_dims(self.word_encoding, -1) 61 | 62 | c_outputs = [] 63 | for c in range(n_sub): 64 | pooled_outputs = [] 65 | for i, filter_size in enumerate(filter_sizes): 66 | with tf.variable_scope('conv-maxpool-{}-{}'.format(c, filter_size)): 67 | # 卷积层 68 | filter_shape = [filter_size, self.embed_size, 1, n_filters] 69 | W = tf.get_variable('W', initializer=tf.truncated_normal(filter_shape, stddev=0.1)) 70 | b = tf.get_variable('b', 
initializer=tf.constant(0.1, shape=[n_filters])) 71 | conv = tf.nn.conv2d(all_input_expanded, W, strides=[1]*4, padding='VALID', name='conv') 72 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 73 | pooled = tf.nn.max_pool(h, 74 | ksize=[1, self.max_len - filter_size + 1, 1, 1], 75 | strides=[1, 1, 1, 1], 76 | padding='VALID', 77 | name='pool') 78 | pooled_outputs.append(pooled) 79 | num_filters_total = n_filters * len(filter_sizes) 80 | h_pool = tf.concat(pooled_outputs, 3) 81 | h_pool_flatten = tf.reshape(h_pool, [-1, 1, num_filters_total]) 82 | h_drop = tf.nn.dropout(h_pool_flatten, self.dropout_keep_prob) 83 | dense = tf.layers.dense(h_drop, 4, activation=None) 84 | c_outputs.append(dense) 85 | 86 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 87 | y_ = tf.nn.softmax(self.logits) 88 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 89 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 90 | 91 | if not self.config.balance: 92 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 93 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 94 | else: 95 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 96 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 97 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 98 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 99 | class0_weight = 1 # 第0类的权重系数 100 | class1_weight = 3 # 第1类的权重系数 101 | class2_weight = 3 # 第2类的权重系数 102 | class3_weight = 3 # 第3类的权重系数 103 | # coe = tf.constant([1., 1., 1., 1.]) 104 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 105 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 106 | 107 | y = tf.reshape(self.input_y, [-1, 4]) 108 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 109 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 110 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 111 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 112 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 113 | 114 | return self 115 | 116 | def create_model_v1(self, share_dense=True): 117 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y') 118 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 119 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 120 | 121 | if self.main_feature.lower() in ['word', 'char']: 122 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 123 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 124 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 125 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 126 | 127 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 128 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 129 | if self.main_feature == 'elmo_word': 130 | options_file = self.config.elmo_word_options_file 131 | weight_file = self.config.elmo_word_weight_file 132 | embed_file = self.config.elmo_word_embed_file 133 | elif self.main_feature == 'elmo_char': 134 | options_file = self.config.elmo_char_options_file 135 | weight_file = 
self.config.elmo_char_weight_file 136 | embed_file = self.config.elmo_char_embed_file 137 | elif self.main_feature == 'elmo_qiuqiu': 138 | options_file = self.config.elmo_qiuqiu_options_file 139 | weight_file = self.config.elmo_qiuqiu_weight_file 140 | embed_file = self.config.elmo_qiuqiu_embed_file 141 | 142 | self.bilm = BidirectionalLanguageModel(options_file, 143 | weight_file, 144 | use_character_inputs=False, 145 | embedding_weight_file=embed_file, 146 | max_batch_size=self.batch_size) 147 | bilm_embedding_op = self.bilm(self.input_x) 148 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 149 | self.word_encoding = bilm_embedding['weighted_op'] 150 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new 151 | 152 | else: 153 | exit('wrong feature') 154 | 155 | all_input_expanded = tf.expand_dims(self.word_encoding, -1) 156 | # all_input_expanded = tf.tile(all_input_expanded, [1,1,1,10]) 157 | 158 | c_outputs = [] 159 | for c in range(n_sub): 160 | pooled_outputs = [] 161 | for i, filter_size in enumerate(filter_sizes): 162 | with tf.variable_scope('conv-maxpool-{}-{}'.format(c, filter_size)): 163 | # 卷积层 164 | filter_shape = [filter_size, self.embed_size, 1, n_filters] 165 | W = tf.get_variable('W', initializer=tf.truncated_normal(filter_shape, stddev=0.1)) 166 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 167 | conv = tf.nn.conv2d(all_input_expanded, W, strides=[1]*4, padding='VALID', name='conv') 168 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 169 | pooled = tf.nn.max_pool(h, 170 | ksize=[1, self.max_len - filter_size + 1, 1, 1], 171 | strides=[1, 1, 1, 1], 172 | padding='VALID', 173 | name='pool') 174 | pooled_outputs.append(pooled) 175 | num_filters_total = n_filters * len(filter_sizes) 176 | h_pool = tf.concat(pooled_outputs, 3) 177 | h_pool_flatten = tf.reshape(h_pool, [-1, num_filters_total]) 178 | h_drop = tf.nn.dropout(h_pool_flatten, self.dropout_keep_prob) 179 | c_outputs.append(h_drop) 180 | cnn_outputs = tf.reshape(tf.concat(c_outputs, 1), (-1, n_sub, num_filters_total)) 181 | 182 | if share_dense: 183 | cnn_outputs = tf.reshape(cnn_outputs, (-1, num_filters_total)) 184 | W = tf.get_variable('W', shape=[num_filters_total, self.n_classes]) 185 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 186 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 187 | else: 188 | cnn_outputs = tf.reshape(tf.concat(c_outputs, 1), (-1, n_sub, num_filters_total)) 189 | W = tf.get_variable('W', shape=[self.batch_size, num_filters_total, self.n_classes]) 190 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 191 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 192 | 193 | y_ = tf.nn.softmax(self.logits) 194 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 195 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 196 | 197 | if not self.config.balance: 198 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 199 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 200 | else: 201 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 202 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 203 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 204 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 205 | class0_weight = 1 # 第0类的权重系数 206 | 
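# Hand-tuned weights for the weighted cross-entropy below: class 0 (no label
# for the subject) covers roughly 88% of targets (see the frequencies in the
# commented-out lines above), so the three sentiment classes are up-weighted 3x.
# The log terms are unclipped; tf.log(y_ + 1e-8) (not used here) would guard
# against NaNs if a softmax output underflows to 0.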
class1_weight = 3 # 第1类的权重系数 207 | class2_weight = 3 # 第2类的权重系数 208 | class3_weight = 3 # 第3类的权重系数 209 | # coe = tf.constant([1., 1., 1., 1.]) 210 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 211 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 212 | 213 | y = tf.reshape(self.input_y, [-1, 4]) 214 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 215 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 216 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 217 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 218 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 219 | 220 | return self 221 | 222 | -------------------------------------------------------------------------------- /src/model/xgboost_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/src/model/xgboost_model.py -------------------------------------------------------------------------------- /src/pack_sub_dt2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import pandas as pd 4 | import glob 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.metrics import f1_score 8 | 9 | test_df = pd.read_csv('../data/csvs/test_public.csv') 10 | train_df = pd.read_csv('../data/csvs/train_multi.csv') 11 | true_labels = train_df.iloc[:, 6:].values 12 | 13 | submit_df = pd.DataFrame(columns=['content_id', 'subject', 'sentiment_value', 'sentiment_word']) 14 | train_oof_df = pd.DataFrame(columns=['content_id', 'subject', 'sentiment_value', 'sentiment_word']) 15 | submit_df['content_id'] = test_df['content_id'] 16 | train_oof_df['content_id'] = train_df['content_id'] 17 | 18 | 19 | 20 | pre_path = '../data/result/0.807*' 21 | pre_filenames = glob.glob(pre_path) 22 | train_oof_filenames = glob.glob(pre_path.replace('pre', 'oof')) 23 | 24 | pre = np.argmax(pickle.load(open(pre_filenames[0], 'rb')), 2) 25 | train_oof_pred = np.argmax(pickle.load(open(train_oof_filenames[0], 'rb')), 2) 26 | 27 | print(pre_filenames) 28 | label_itos = [s.split('_')[1] for s in pickle.load(open('../data/sub_list.pkl', 'rb'))] 29 | n_none = 0 30 | n_mul_label = {} 31 | 32 | f1s = [] 33 | 34 | content_ids = [] 35 | subjects = [] 36 | sentiment_values = [] 37 | lost_ids = [] 38 | 39 | for idx, c_id in enumerate(test_df['content_id']): 40 | n_label = np.sum(pre[idx] > 0) 41 | if not n_label: 42 | n_none += 1 43 | lost_ids.append(c_id) 44 | else: 45 | n_mul_label[n_label] = n_mul_label.get(n_label, 0) + 1 46 | labels = list(np.where(pre[idx]>0)[0]) 47 | for l in labels: 48 | content_ids.append(c_id) 49 | subjects.append(label_itos[l]) 50 | sentiment_values.append(pre[idx][l]-2) 51 | 52 | soft_df = pd.read_csv('../data/submit/676.csv') 53 | lost_df = soft_df[soft_df['content_id'].isin(lost_ids)] 54 | submit_df = pd.DataFrame({'content_id': content_ids + list(lost_df['content_id']), 55 | 'subject': subjects + list(lost_df['subject']), 56 | 'sentiment_value': sentiment_values + list(lost_df['sentiment_value']), 57 | # 'subject': subjects + ['']*len(lost_ids), 58 | # 'sentiment_value': sentiment_values + ['']*len(lost_ids), 59 | 'sentiment_word': ['']*(len(lost_df)+len(subjects))}) 60 | 61 | print('n_none:', n_none) 62 | print('n_pad:', len(lost_df)) 63 | os.makedirs('../data/submit', exist_ok=True) 64 | 
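# Final submission: one row per predicted (content_id, subject), with
# sentiment_value recovered as pre[idx][l] - 2 (classes 1/2/3 -> -1/0/1).
# content_ids where every subject scored class 0 are backfilled from the
# earlier soft submission ../data/submit/676.csv instead of being dropped.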
submit_df.to_csv('../data/submit/dt3_stacking_submission.csv', index=None) 65 | 66 | # for i in range(train_oof_pred.shape[1]): 67 | # pre_label = train_oof_pred[:, i] 68 | # true_label = true_labels[:, i] 69 | # f1 = f1_score(true_label, pre_label, average='macro') 70 | # f1s.append(f1) 71 | 72 | # f1 = np.mean(f1s) 73 | # print('f1s->', f1s) 74 | # print('mean f1', f1) 75 | # print('n_none:', n_none) 76 | # os.makedirs('../data/submit', exist_ok=True) 77 | 78 | # submit_df.to_csv('../data/submit/dt2_{}_submission.csv'.format(f1), index=None) 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/stacking.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import pandas as pd 4 | from config import Config 5 | from keras.utils import np_utils 6 | from keras.layers import * 7 | from model.snapshot import SnapshotCallbackBuilder 8 | from model.my_callbacks import JZTrainCategory 9 | from keras.models import * 10 | from sklearn.preprocessing import MinMaxScaler 11 | from sklearn.model_selection import KFold 12 | from sklearn.metrics import accuracy_score, f1_score 13 | 14 | from model.model_basic import BasicModel 15 | import numpy as np 16 | import os 17 | 18 | 19 | def get_f1_score(x, y, verbose=False): 20 | tp = np.sum(np.logical_and(y > 0, x == y)) 21 | fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x)) # 多判或者错判 22 | fn = np.sum(np.logical_and(y > 0, x == 0)) # 漏判 23 | 24 | P = float(tp) / (float(tp + fp) + 1e-8) 25 | R = float(tp) / (float(tp + fn) + 1e-8) 26 | F = 2 * P * R / (P + R + 1e-8) 27 | 28 | if verbose: 29 | print('P->', P) 30 | print('R->', R) 31 | print('F->', F) 32 | return F 33 | 34 | 35 | def data_prepare(): 36 | train_df = pd.read_csv(config.TRAIN_X) 37 | 38 | if config.data_type == 0: 39 | train_y = {} 40 | sub_list = pickle.load(open('../data/sub_list.pkl', 'rb')) 41 | for sub in sub_list: 42 | train_y_val = train_df[sub].values 43 | train_y[sub] = np_utils.to_categorical(train_y_val, num_classes=config.n_class) 44 | elif config.data_type == 1: 45 | train_y = train_df['c_numerical'].values 46 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_class) 47 | elif config.data_type == 2: 48 | train_y = {} 49 | train_y['subject'] = train_df['sub_numerical'].values 50 | train_y['subject'] = np_utils.to_categorical(train_y['subject'], num_classes=10) 51 | train_y['sentiment_value'] = train_df['sentiment_value'].values 52 | train_y['sentiment_value'] = np_utils.to_categorical(train_y['sentiment_value'], num_classes=3) 53 | 54 | elif config.data_type == 3: 55 | # 主要融合这个 56 | train_y = train_df.iloc[:, 6:].values 57 | targets = train_y.reshape(-1) 58 | one_hot_targets = np.eye(config.n_classes)[targets] 59 | train_y = one_hot_targets.reshape(-1, 10, config.n_classes) 60 | elif config.data_type == 4: 61 | train_y = (train_df['sentiment_value']+1).values 62 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_class) 63 | elif config.data_type == 5: 64 | train_y = train_df.iloc[:, 4:].values 65 | 66 | else: 67 | exit('错误数据类别') 68 | 69 | # oof features 70 | filenames = glob.glob('../data/result-qiuqiu/*oof*') 71 | filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse-distillation/*oof*')) 72 | filenames.extend(glob.glob('../data/11_11_result/*oof*')) 73 | # filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse/*oof*')) 74 | # 
filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse-enhance/*oof*')) 75 | 76 | # filenames = glob.glob('../data/result-stacking/*oof*'.format(args.data_type)) 77 | # def filter(filename, f_value): 78 | # return float(filename.split('_')[-3][1:-4]) > f_value 79 | 80 | # filenames = [e for e in filenames if filter(e, args.f_value)] 81 | # filenames = glob.glob('../data/result-dt{}-op1-embed300-debugFalse-enhance/*oof*'.format(args.data_type)) 82 | from pprint import pprint 83 | pprint(filenames) 84 | 85 | oof_filename = [] 86 | test_filename = [] 87 | for j, filename in enumerate(filenames): 88 | p_filename = filename.replace('_oof_', '_pre_') 89 | oof_filename.append(filename) 90 | test_filename.append(p_filename) 91 | 92 | oof_data = [] 93 | test_data = [] 94 | for i, (tra, tes) in enumerate(zip(oof_filename, test_filename)): 95 | 96 | oof_feature = pickle.load(open(tra, 'rb')) 97 | print(tra, oof_feature.shape) 98 | oof_data.append(oof_feature) 99 | 100 | oof_feature = pickle.load(open(tes, 'rb')) 101 | print(tes, oof_feature.shape) 102 | test_data.append(oof_feature) 103 | 104 | train_x = np.concatenate(oof_data, axis=-1) 105 | test_x = np.concatenate(test_data, axis=-1) 106 | # train_x = np.reshape(train_x, [-1, train_x.shape[-1]]) 107 | # test_x = np.reshape(test_x, [-1, test_x.shape[-1]]) 108 | print('train_x shape: ', train_x.shape) 109 | print('train_y shape: ', train_y.shape) 110 | print('test_x shape: ', test_x.shape) 111 | 112 | return train_x, train_y, test_x 113 | 114 | 115 | def get_model(train_x): 116 | input_x = Input(shape=(train_x.shape[-2], train_x.shape[-1]), name='input') 117 | x = Dense(256, activation='relu')(input_x) 118 | x = Dropout(0.5)(x) 119 | x = Dense(128, activation='relu')(x) 120 | x = Dropout(0.5)(x) 121 | x = Dense(4, activation="softmax")(x) 122 | res_model = Model(inputs=[input_x], outputs=x) 123 | return res_model 124 | 125 | 126 | # 第一次stacking 127 | def stacking_first(train, train_y, test): 128 | savepath = './stack_op{}_dt{}_f_value{}/'.format(args.option, args.data_type, args.f_value) 129 | os.makedirs(savepath, exist_ok=True) 130 | 131 | count_kflod = 0 132 | num_folds = 5 133 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 134 | predict = np.zeros((test.shape[0], 10, 4)) 135 | oof_predict = np.zeros((train.shape[0], 10, 4)) 136 | scores = [] 137 | 138 | for i, (train_index, test_index) in enumerate(kf.split(train)): 139 | print('第{}折'.format(i)) 140 | 141 | kfold_X_train = {} 142 | kfold_X_valid = {} 143 | 144 | y_train, y_test = train_y[train_index], train_y[test_index] 145 | 146 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 147 | 148 | model_prefix = savepath + 'DNN' + str(count_kflod) 149 | if not os.path.exists(model_prefix): 150 | os.mkdir(model_prefix) 151 | 152 | M = 3 # number of snapshots 153 | alpha_zero = 1e-3 # initial learning rate 154 | snap_epoch = 30 155 | 156 | snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero) 157 | # M = 1 # number of snapshots 158 | # snap_epoch = 16 159 | # jz_schedule = JZTrainCategory(model_prefix, snap_epoch, M, save_weights_only=True, monitor='val_loss', factor=0.7, patience=1) 160 | 161 | res_model = get_model(train) 162 | res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 163 | res_model.summary() 164 | 165 | # res_model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCH, verbose=1, class_weight=class_weight) 166 | res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, 
epochs=snap_epoch, verbose=1, 167 | validation_data=(kfold_X_valid, y_test), 168 | callbacks=snapshot.get_callbacks(model_save_place=model_prefix)) 169 | 170 | evaluations = [] 171 | for i in os.listdir(model_prefix): 172 | if '.h5' in i: 173 | evaluations.append(i) 174 | 175 | test_pred_ = np.zeros((test.shape[0], 10, 4)) 176 | oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4)) 177 | for run, i in enumerate(evaluations): 178 | print('loading from {}'.format(os.path.join(model_prefix, i))) 179 | res_model.load_weights(os.path.join(model_prefix, i)) 180 | test_pred_ += res_model.predict(test, verbose=1, batch_size=256) / len(evaluations) 181 | oof_pred_ += res_model.predict(kfold_X_valid, batch_size=256) / len(evaluations) 182 | 183 | predict += test_pred_ / num_folds 184 | oof_predict[test_index] = oof_pred_ 185 | 186 | f1 = get_f1_score(np.argmax(oof_pred_, -1), np.argmax(y_test, -1), verbose=True) 187 | print(i, ' kflod cv f1 : ', str(f1)) 188 | count_kflod += 1 189 | scores.append(f1) 190 | print('f1 {} -> {}'.format(scores, np.mean(scores))) 191 | return predict, oof_predict, np.mean(scores) 192 | 193 | import lightgbm as lgb 194 | def stacking_lightgbm(train, train_y, test): 195 | train_y = np.argmax(train_y, 1) 196 | count_kflod = 0 197 | num_folds = 5 198 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 199 | predict = np.zeros((test.shape[0], config.n_class)) 200 | oof_predict = np.zeros((train.shape[0], config.n_class)) 201 | scores = [] 202 | f1s = [] 203 | 204 | params = {'objective': 'multiclass', 205 | 'bagging_seed': 10, 206 | 'boosting_type': 'gbdt', 207 | 'feature_fraction': 0.9, 208 | 'feature_fraction_seed': 10, 209 | 'lambda_l1': 0.5, 210 | 'lambda_l2': 0.5, 211 | 'learning_rate': 0.01, 212 | 'metric': 'multi_logloss', 213 | 'min_child_weight': 1, 214 | # 'min_split_gain': 0, 215 | 'device': 'gpu', 216 | 'gpu_platform_id': 0, 217 | 'gpu_device_id': config.gpu, 218 | 'min_sum_hessian_in_leaf': 0.1, 219 | 'num_leaves': 64, 220 | 'num_thread': -1, 221 | 'num_class': config.n_class, 222 | 'verbose': 1} 223 | 224 | for train_index, test_index in kf.split(train): 225 | 226 | y_train, y_test = train_y[train_index], train_y[test_index] 227 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 228 | 229 | d_train = lgb.Dataset(kfold_X_train, label=y_train) 230 | d_watch = lgb.Dataset(kfold_X_valid, label=y_test) 231 | 232 | best = lgb.train(params, d_train, num_boost_round=100, verbose_eval=5, 233 | valid_sets=d_watch, 234 | early_stopping_rounds=6) 235 | 236 | preds1 = best.predict(test) 237 | preds2 = best.predict(kfold_X_valid) 238 | 239 | predict += preds1 / num_folds 240 | # oof_predict[test_index] = preds2 241 | 242 | accuracy = mb.cal_acc(preds2, y_test) 243 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class) 244 | 245 | print('the kflod cv is : ', str(accuracy)) 246 | print('the kflod f1 is : ', str(f1)) 247 | count_kflod += 1 248 | scores.append(accuracy) 249 | f1s.append(f1) 250 | print('total scores is ', np.mean(scores)) 251 | print('total f1 is ', np.mean(f1s)) 252 | # return predict, np.mean(scores) 253 | return predict 254 | 255 | 256 | from sklearn.linear_model import LogisticRegression 257 | def stacking_lr(train, train_y, test): 258 | train_y = np.argmax(train_y, 1) 259 | count_kflod = 0 260 | num_folds = 6 261 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 262 | predict = np.zeros((test.shape[0], config.n_class)) 263 | scores = [] 264 | f1s = [] 265 | for train_index, test_index in kf.split(train): 266 | 
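# One fold of the logistic-regression stacker: fit on the concatenated
# out-of-fold predictions from data_prepare() (labels argmaxed back to class
# ids above) and average predict_proba over the num_folds models for the test
# prediction.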
267 | y_train, y_test = train_y[train_index], train_y[test_index] 268 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 269 | 270 | print('拟合数据') 271 | best = LogisticRegression(C=4, dual=True) 272 | best.fit(kfold_X_train, y_train) 273 | 274 | print('预测结果') 275 | preds1 = best.predict_proba(test) 276 | preds2 = best.predict_proba(kfold_X_valid) 277 | 278 | predict += preds1 / num_folds 279 | accuracy = mb.cal_acc(preds2, y_test) 280 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class) 281 | 282 | print('the kflod cv is : ', str(accuracy)) 283 | print('the kflod f1 is : ', str(f1)) 284 | count_kflod += 1 285 | scores.append(accuracy) 286 | f1s.append(f1) 287 | print('total scores is ', np.mean(scores)) 288 | print('total f1 is ', np.mean(f1s)) 289 | # return predict, np.mean(scores) 290 | return predict 291 | 292 | from sklearn import svm 293 | from sklearn.calibration import CalibratedClassifierCV 294 | 295 | def stacking_svm(train, train_y, test): 296 | train_y = np.argmax(train_y, 1) 297 | count_kflod = 0 298 | num_folds = 6 299 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 300 | predict = np.zeros((test.shape[0], config.n_class)) 301 | scores = [] 302 | f1s = [] 303 | for train_index, test_index in kf.split(train): 304 | 305 | y_train, y_test = train_y[train_index], train_y[test_index] 306 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 307 | 308 | print('拟合数据') 309 | best = svm.LinearSVC() 310 | best = CalibratedClassifierCV(best) 311 | best.fit(kfold_X_train, y_train) 312 | 313 | print('预测结果') 314 | preds1 = best.predict_proba(test) 315 | preds2 = best.predict_proba(kfold_X_valid) 316 | 317 | predict += preds1 / num_folds 318 | accuracy = mb.cal_acc(preds2, y_test) 319 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class) 320 | 321 | print('the kflod cv is : ', str(accuracy)) 322 | print('the kflod f1 is : ', str(f1)) 323 | count_kflod += 1 324 | scores.append(accuracy) 325 | f1s.append(f1) 326 | print('total scores is ', np.mean(scores)) 327 | print('total f1 is ', np.mean(f1s)) 328 | # return predict, np.mean(scores) 329 | return predict 330 | 331 | 332 | # 使用pseudo-labeling做第二次stacking 333 | def stacking_pseudo(train, train_y, test, results): 334 | answer = np.reshape(np.argmax(results, axis=-1), [-1]) 335 | answer = np.reshape(np.eye(4)[answer], [-1, 10, 4]) 336 | 337 | train_y = np.concatenate([train_y, answer], axis=0) 338 | train = np.concatenate([train, test], axis=0) 339 | 340 | savepath = './pesudo_{}_dt{}/'.format(args.option, args.data_type) 341 | if not os.path.exists(savepath): 342 | os.mkdir(savepath) 343 | count_kflod = 0 344 | num_folds = 5 345 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 346 | predict = np.zeros((test.shape[0], 10, 4)) 347 | oof_predict = np.zeros((train.shape[0], 10, 4)) 348 | scores = [] 349 | 350 | for i, (train_index, test_index) in enumerate(kf.split(train)): 351 | print('第{}折'.format(i)) 352 | 353 | kfold_X_train = {} 354 | kfold_X_valid = {} 355 | 356 | y_train, y_test = train_y[train_index], train_y[test_index] 357 | 358 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 359 | 360 | model_prefix = savepath + 'DNN' + str(count_kflod) 361 | if not os.path.exists(model_prefix): 362 | os.mkdir(model_prefix) 363 | 364 | M = 3 # number of snapshots 365 | alpha_zero = 1e-3 # initial learning rate 366 | snap_epoch = 30 367 | 368 | snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero) 369 | # M = 1 # number of snapshots 370 | # 
snap_epoch = 16 371 | # jz_schedule = JZTrainCategory(model_prefix, snap_epoch, M, save_weights_only=True, monitor='val_loss', factor=0.7, patience=1) 372 | 373 | res_model = get_model(train) 374 | res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 375 | res_model.summary() 376 | 377 | # res_model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCH, verbose=1, class_weight=class_weight) 378 | res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, epochs=snap_epoch, verbose=1, 379 | validation_data=(kfold_X_valid, y_test), 380 | callbacks=snapshot.get_callbacks(model_save_place=model_prefix)) 381 | 382 | evaluations = [] 383 | for i in os.listdir(model_prefix): 384 | if '.h5' in i: 385 | evaluations.append(i) 386 | 387 | test_pred_ = np.zeros((test.shape[0], 10, 4)) 388 | oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4)) 389 | for run, i in enumerate(evaluations): 390 | print('loading from {}'.format(os.path.join(model_prefix, i))) 391 | res_model.load_weights(os.path.join(model_prefix, i)) 392 | test_pred_ += res_model.predict(test, verbose=1, batch_size=256) / len(evaluations) 393 | oof_pred_ += res_model.predict(kfold_X_valid, batch_size=256) / len(evaluations) 394 | 395 | predict += test_pred_ / num_folds 396 | oof_predict[test_index] = oof_pred_ 397 | 398 | f1 = get_f1_score(np.argmax(oof_pred_, -1), np.argmax(y_test, -1), verbose=True) 399 | print(i, ' kflod cv f1 : ', str(f1)) 400 | count_kflod += 1 401 | scores.append(f1) 402 | print('f1 {} -> {}'.format(scores, np.mean(scores))) 403 | return predict, np.mean(scores) 404 | 405 | def save_result(predict, prefix): 406 | os.makedirs('../data/result', exist_ok=True) 407 | with open('../data/result/{}.pkl'.format(prefix), 'wb') as f: 408 | pickle.dump(predict, f) 409 | 410 | if __name__ == '__main__': 411 | import argparse 412 | parser = argparse.ArgumentParser() 413 | parser.add_argument('--gpu', type=str, default='6') 414 | parser.add_argument('--model', type=str, help='模型') 415 | parser.add_argument('--option', type=int, default=1, help='训练方式') 416 | parser.add_argument('--data_type', type=int, default=1, help='问题模式, 0为4分类, 1为单分类, 2为先分主题再分情感') 417 | parser.add_argument('--feature', default='word', type=str, help='选择word或者char作为特征') 418 | parser.add_argument('--es', default=200, type=int, help='embed size') 419 | parser.add_argument('--debug', default=False, action='store_true') 420 | parser.add_argument('--bs', default=256, type=int, help='batch size') 421 | parser.add_argument('--f_value', default=0.0, type=float) 422 | args = parser.parse_args() 423 | 424 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 425 | 426 | import tensorflow as tf 427 | from keras.backend.tensorflow_backend import set_session 428 | tf_config = tf.ConfigProto() 429 | tf_config.gpu_options.allow_growth=True 430 | set_session(tf.Session(config=tf_config)) 431 | 432 | mb = BasicModel() 433 | config = Config() 434 | config.gpu = args.gpu 435 | config.data_type = args.data_type 436 | BATCH_SIZE = args.bs 437 | 438 | # cv_stacking() 439 | 440 | # normal stacking 441 | train, train_y, test = data_prepare() 442 | 443 | predicts, oof_predicts, score = stacking_first(train, train_y, test) 444 | save_result(predicts, prefix=str(score)) 445 | # save_result(oof_predicts, prefix='oof') 446 | 447 | # predicts = stacking_lightgbm(train, train_y, test) 448 | # save_result(predicts[:10000], prefix='stacking_lgb_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value)) 449 | 450 | # predicts = stacking_lr(train, 
train_y, test) 451 | # save_result(predicts[:10000], prefix='stacking_lr_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value)) 452 | 453 | # predicts = stacking_svm(train, train_y, test) 454 | # save_result(predicts[:10000], prefix='stacking_svm_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value)) 455 | 456 | # 假标签 457 | predicts, score = stacking_pseudo(train, train_y, test, predicts) 458 | save_result(predicts, prefix=str(score)) 459 | -------------------------------------------------------------------------------- /src/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 
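# In Python 3 this reduces to: return str unchanged and decode bytes as UTF-8
# (ignoring errors); in Python 2, unicode is re-encoded to UTF-8 so the result
# is always a printable native str.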
52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenziation.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return convert_by_vocab(self.inv_vocab, ids) 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because Wikipedia does have some Chinese 155 | # words in the English Wikipedia.). 
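# _tokenize_chinese_chars pads every CJK codepoint with spaces, e.g. "空间大"
# becomes " 空  间  大 ", so the whitespace/punctuation tokenization below ends
# up emitting one token per Chinese character.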
156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and handled 222 | # like the all of the other languages. 223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenziation.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 
262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `chars` is a whitespace character.""" 313 | # \t, \n, and \r are technically contorl characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `chars` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `chars` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation. 339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 
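# The ASCII ranges below cover 33-47 -> !"#$%&'()*+,-./ , 58-64 -> :;<=>?@ ,
# 91-96 -> [\]^_` , 123-126 -> {|}~ ; anything else falls through to the
# Unicode "P*" category check.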
342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /src/train_elmo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | 6 | from bilm.training import train, load_options_latest_checkpoint, load_vocab 7 | from bilm.data import BidirectionalLMDataset 8 | 9 | 10 | def main(args): 11 | # load the vocab 12 | vocab = load_vocab(args.vocab_file, None) 13 | 14 | # define the options 15 | batch_size = 512 # batch size for each GPU 16 | n_gpus = 3 17 | os.environ['CUDA_VISIBLE_DEVICES'] = '1, 2, 6' 18 | 19 | # number of tokens in training data (this for 1B Word Benchmark) 20 | # word 8799 21 | # char 2355 22 | n_train_tokens = 768648884 23 | # n_train_tokens = 8799 24 | 25 | options = { 26 | 'bidirectional': True, 27 | 28 | # 'char_cnn': {'activation': 'relu', 29 | # 'embedding': {'dim': 16}, 30 | # 'filters': [[1, 32], 31 | # [2, 32], 32 | # [3, 64], 33 | # [4, 128], 34 | # [5, 256], 35 | # [6, 512], 36 | # [7, 1024]], 37 | # 'max_characters_per_token': 50, 38 | # 'n_characters': 261, 39 | # 'n_highway': 2}, 40 | 41 | 'dropout': 0.1, 42 | 43 | 'lstm': { 44 | 'cell_clip': 3, 45 | 'dim': 4096, 46 | 'n_layers': 2, 47 | 'proj_clip': 3, 48 | 'projection_dim': 512, 49 | 'use_skip_connections': True}, 50 | 51 | 'all_clip_norm_val': 10.0, 52 | 53 | 'n_epochs': 10, 54 | 'n_train_tokens': n_train_tokens, 55 | 'batch_size': batch_size, 56 | 'n_tokens_vocab': vocab.size, 57 | 'unroll_steps': 20, 58 | 'n_negative_samples_batch': 1024, 59 | } 60 | 61 | print('vocab_size:', vocab.size) 62 | prefix = args.train_prefix 63 | data = BidirectionalLMDataset(prefix, vocab, test=False, 64 | shuffle_on_load=True) 65 | 66 | tf_save_dir = args.save_dir 67 | tf_log_dir = args.save_dir 68 | train(options, data, n_gpus, tf_save_dir, tf_log_dir) 69 | 70 | 71 | if __name__ == '__main__': 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--save_dir', help='Location of checkpoint files') 74 | parser.add_argument('--vocab_file', help='Vocabulary file') 75 | parser.add_argument('--train_prefix', help='Prefix for train files') 76 | 77 | args = parser.parse_args() 78 | main(args) 79 | -------------------------------------------------------------------------------- /src/train_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pickle 4 | from config import Config 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | import logging 10 | from gensim.models.word2vec import Word2Vec 11 | from bilm import TokenBatcher 12 | from scipy.sparse import hstack 13 | 14 | import tokenization 15 | from keras.preprocessing import sequence 16 | from keras.utils import np_utils 17 | import tensorflow as tf 18 | 19 | # np.random.seed(201) 20 | # tf.set_random_seed(201) 21 | 22 | logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s') 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | def deep_data_prepare(config): 27 | print('深度学习模型数据准备') 28 | train_df = 
pd.read_csv(config.TRAIN_X) 29 | train_jp = pd.read_csv(config.TRAIN_JP) 30 | train_en = pd.read_csv(config.TRAIN_EN) 31 | test_df = pd.read_csv(config.TEST_X) 32 | 33 | char_sw_list = pickle.load(open('../data/char_stopword.pkl', 'rb')) 34 | word_sw_list = pickle.load(open('../data/word_stopword.pkl', 'rb')) 35 | # 用词向量 36 | # 用字向量 37 | train_x_char = train_df['char'] 38 | train_x_word = train_df['word'] 39 | # train_x_sent_word = [w for w in open('../data/sentiment_word.txt')] 40 | # train_x_sent_char = [w for w in open('../data/sentiment_word.txt')] 41 | train_jp_char = train_jp['char'] 42 | train_jp_word = train_jp['word'] 43 | train_en_char = train_en['char'] 44 | train_en_word = train_en['word'] 45 | 46 | train_char = pd.concat((train_x_char, train_jp_char, train_en_char)) 47 | train_word = pd.concat((train_x_word, train_jp_word, train_en_word)) 48 | test_char = test_df['char'] 49 | test_word = test_df['word'] 50 | 51 | if config.data_type == 0: 52 | train_y = train_df['sub_numerical'].values 53 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes) 54 | 55 | elif config.data_type == 1: 56 | train_y = train_df['sentiment_value'].values 57 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes) 58 | 59 | elif config.data_type == 2: 60 | train_y = np.array(train_df.iloc[:, 6:].values) 61 | elif config.data_type == 3: 62 | train_y = train_df.iloc[:, 6:].values 63 | targets = train_y.reshape(-1) 64 | one_hot_targets = np.eye(config.n_classes)[targets] 65 | train_y = one_hot_targets.reshape(-1, 10, config.n_classes) 66 | elif config.data_type == 4: 67 | train_y = (train_df['sentiment_value']+1).values 68 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes) 69 | elif config.data_type == 5: 70 | train_y = train_df.iloc[:, 4:].values 71 | else: 72 | exit('错误数据类别') 73 | 74 | UNK_CHAR = len(char_stoi) 75 | PAD_CHAR = len(char_stoi) + 1 76 | 77 | UNK_WORD = len(word_stoi) 78 | PAD_WORD = len(word_stoi) + 1 79 | 80 | def generate_hann_data(df): 81 | import re 82 | hann_train_word = np.full(shape=(len(df['word']), config.HANN_SENT, config.HANN_WORD_LEN), fill_value=PAD_WORD) 83 | hann_train_char = np.full(shape=(len(df['char']), config.HANN_SENT, config.HANN_CHAR_LEN), fill_value=PAD_CHAR) 84 | 85 | for i, sentences in enumerate(df['word']): 86 | sentences = re.split(r" 。 | , ", sentences) 87 | for j, sent in enumerate(sentences): 88 | if j < config.HANN_SENT: 89 | k = 0 90 | word_tokens = sent.split() 91 | for _, word in enumerate(word_tokens): 92 | if k < config.HANN_WORD_LEN and word not in word_sw_list and word in word_stoi: 93 | hann_train_word[i, j, k] = word_stoi[word] 94 | k += 1 95 | 96 | for i, sentences in enumerate(df['char']): 97 | sentences = re.split(r" 。 | , ", sentences) 98 | for j, sent in enumerate(sentences): 99 | if j < config.HANN_SENT: 100 | k = 0 101 | word_tokens = sent.split() 102 | for _, word in enumerate(word_tokens): 103 | if k < config.HANN_CHAR_LEN and word not in char_sw_list and word in char_stoi: 104 | hann_train_char[i, j, k] = char_stoi[word] 105 | k += 1 106 | return hann_train_word, hann_train_char 107 | 108 | hann_train_word, hann_train_char = generate_hann_data(train_df) 109 | hann_test_word, hann_test_char = generate_hann_data(test_df) 110 | 111 | def word2id(train_dialogs, type='char'): 112 | if type == 'char': 113 | stoi = char_stoi 114 | max_len = config.CHAR_MAXLEN 115 | UNK = UNK_CHAR 116 | sw_list = set(char_sw_list) 117 | elif type == 'word': 118 | stoi = word_stoi 119 | max_len = 
config.WORD_MAXLEN 120 | UNK = UNK_WORD 121 | sw_list = set(word_sw_list) 122 | else: 123 | exit('类型错误') 124 | 125 | train_x = [] 126 | for d in tqdm(train_dialogs): 127 | d = str(d).split() 128 | line = [] 129 | for token in d: 130 | if token in sw_list\ 131 | or token == ''\ 132 | or token == ' ': 133 | continue 134 | if token in stoi: 135 | line.append(stoi[token]) 136 | else: 137 | line.append(UNK) 138 | 139 | train_x.append(line[:max_len]) 140 | return train_x 141 | 142 | # 普通模型数据 143 | train_x_word = word2id(train_word, type='word') 144 | train_x_char = word2id(train_char, type='char') 145 | test_x_char = word2id(test_char, type='char') 146 | test_x_word = word2id(test_word, type='word') 147 | 148 | # train_x_sent_word = word2id(train_x_sent_word, type='word') 149 | # train_x_sent_char = word2id(train_x_sent_char, type='char') 150 | # rcnn模型数据准备 151 | UNK_CHAR = PAD_CHAR 152 | UNK_WORD = PAD_WORD 153 | 154 | train_word_left = [[UNK_WORD] + w[:-1] for w in train_x_word] 155 | train_word_right = [w[1:] + [UNK_WORD] for w in train_x_word] 156 | train_char_left = [[UNK_CHAR] + w[:-1] for w in train_x_char] 157 | train_char_right = [w[1:] + [UNK_CHAR] for w in train_x_char] 158 | 159 | test_word_left = [[UNK_WORD] + w[:-1] for w in test_x_word] 160 | test_word_right = [w[1:] + [UNK_WORD] for w in test_x_word] 161 | test_char_left = [[UNK_CHAR] + w[:-1] for w in test_x_char] 162 | test_char_right = [w[1:] + [UNK_CHAR] for w in test_x_char] 163 | 164 | train_x_char = sequence.pad_sequences(train_x_char, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 165 | train_x_word = sequence.pad_sequences(train_x_word, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 166 | train_x_char_left = sequence.pad_sequences(train_char_left, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 167 | train_x_word_left = sequence.pad_sequences(train_word_left, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 168 | train_x_char_right = sequence.pad_sequences(train_char_right, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 169 | train_x_word_right = sequence.pad_sequences(train_word_right, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 170 | 171 | test_x_char = sequence.pad_sequences(test_x_char, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 172 | test_x_word = sequence.pad_sequences(test_x_word, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 173 | test_x_char_left = sequence.pad_sequences(test_char_left, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 174 | test_x_word_left = sequence.pad_sequences(test_word_left, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 175 | test_x_char_right = sequence.pad_sequences(test_char_right, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 176 | test_x_word_right = sequence.pad_sequences(test_word_right, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 177 | 178 | print('train_x char shape is: ', train_x_char.shape) 179 | print('train_x word shape is: ', train_x_word.shape) 180 | print('test_x char shape is: ', test_x_char.shape) 181 | print('test_x word 

    train = {}
    test = {}
    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=config.BERT_VOCAB_FILES, do_lower_case=False)

    # def get_bert_data(corpus):
    #     input_ids = []
    #     input_mask = []
    #     input_segment_ids = []

    #     for sent in train_df['word'].values:
    #         sent = ''.join(sent.strip().split())
    #         tmp_token_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(sent)[:188] + ['[SEP]'])
    #         tmp_mask = [1] * len(tmp_token_ids)
    #         tmp_segment_ids = [0] * len(tmp_token_ids)
    #         if len(tmp_token_ids) < 190:
    #             tmp_segment_ids.extend([0] * (190-len(tmp_token_ids)))
    #             tmp_mask.extend([0] * (190-len(tmp_token_ids)))
    #             tmp_token_ids.extend([0] * (190-len(tmp_token_ids)))
    #         input_ids.append(tmp_token_ids)
    #         input_mask.append(tmp_mask)
    #         input_segment_ids.append(tmp_segment_ids)
    #     return np.array(input_ids, dtype='int32'), np.array(input_mask, dtype='int32'), np.array(input_segment_ids, dtype='int32')

    # train['token_id'], train['mask_id'], train['type_id'] = get_bert_data(train_df['word'].values)
    # test['token_id'], test['mask_id'], test['type_id'] = get_bert_data(test_df['word'].values)

    train['word'] = train_x_word
    train['char'] = train_x_char
    # train['word_sent'] = train_x_sent_word
    # train['char_sent'] = train_x_sent_char
    # rcnn
    train['word_left'] = train_x_word_left
    train['word_right'] = train_x_word_right
    train['char_left'] = train_x_char_left
    train['char_right'] = train_x_char_right
    # han
    train['hann_word'] = hann_train_word
    train['hann_char'] = hann_train_char

    test['word'] = test_x_word
    test['char'] = test_x_char
    test['word_left'] = test_x_word_left
    test['word_right'] = test_x_word_right
    test['char_left'] = test_x_char_left
    test['char_right'] = test_x_char_right
    test['hann_word'] = hann_test_word
    test['hann_char'] = hann_test_char

    assert train['word_left'].shape == train['word_right'].shape == train['word'].shape
    assert train['char_left'].shape == train['char_right'].shape == train['char'].shape
    assert test['word_left'].shape == test['word_right'].shape == test['word'].shape
    assert test['char_left'].shape == test['char_right'].shape == test['char'].shape

    # batcher = TokenBatcher(config.elmo_word_vocab_file)
    # train['elmo_word'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
    # test['elmo_word'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])

    # batcher = TokenBatcher(config.elmo_char_vocab_file)
    # train['elmo_char'] = batcher.batch_sentences([str(w).split()[:config.CHAR_MAXLEN] for w in train_df['char']])
    # test['elmo_char'] = batcher.batch_sentences([str(w).split()[:config.CHAR_MAXLEN] for w in test_df['char']])

    # batcher = TokenBatcher(config.elmo_qiuqiu_vocab_file)
    # train['elmo_qiuqiu'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
    # test['elmo_qiuqiu'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])

    return train, train_y, test
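
# For reference, a sketch of what deep_data_prepare() hands back (shapes follow from the
# padding calls above; the 'hann_*' and label arrays are built earlier in the function,
# outside this excerpt):
#   train / test : dicts of int32 arrays
#       'word', 'word_left', 'word_right' -> (n_samples, config.WORD_MAXLEN)
#       'char', 'char_left', 'char_right' -> (n_samples, config.CHAR_MAXLEN)
#       'hann_word', 'hann_char'          -> sentence-level inputs for the HAN model
#   train_y : label array for the training set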

def init_embedding(config, type='word'):
    model_file = config.word_w2v_file if type == 'word' else config.char_w2v_file
    item_to_id = word_stoi if type == 'word' else char_stoi
    vocab_len = len(item_to_id) + 2
    print('Vocabulary size: ', vocab_len)
    print('create embedding matrix')

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(model_file).readlines()[1:])

    all_embs = np.stack(embeddings_index.values())
    embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(vocab_len, config.EMBED_SIZE)).astype(dtype='float32')
    embed_matrix[-1] = 0  # padding

    for word, i in tqdm(item_to_id.items()):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embed_matrix[i] = embedding_vector
    return embed_matrix


def deep_data_cache():
    char_w2v_embed = init_embedding(config, type='char')
    word_w2v_embed = init_embedding(config, type='word')

    train, train_y, test = deep_data_prepare(config)
    os.makedirs('../data/cache/', exist_ok=True)
    pickle.dump((train, train_y, test, char_w2v_embed, word_w2v_embed), open('../data/cache/deep_data_oe{}_es{}_dt{}_f{}.pkl'.format(config.outer_embed, config.EMBED_SIZE, config.data_type, config.main_feature), 'wb'))


def deep_data_process():
    deep_data_cache()
    (train, train_y, test, char_w2v_embed, word_w2v_embed) = pickle.load(open('../data/cache/deep_data_oe{}_es{}_dt{}_f{}.pkl'.format(config.outer_embed, config.EMBED_SIZE, config.data_type, config.main_feature), 'rb'))
    config.char_embedding = char_w2v_embed
    config.word_embedding = word_w2v_embed

    model = config.model[args.model](config=config, n_folds=5)
    if config.data_type == 0:
        model.single_train_predict(train, train_y, test, option=config.option)
    elif config.data_type == 1:
        model.single_train_predict(train, train_y, test, option=config.option)

    elif config.data_type == 2:
        model.multi_train_predict(train, train_y, test, option=config.option)
    elif config.data_type == 3:
        model.four_classify_train_predict(train, train_y, test, option=config.option)
    # # model.multi_train_predict(train, train_y, test, option=config.option)
    # elif config.data_type == 4:
    #     model.single_train_predict(train, train_y, test, option=config.option)
    # elif config.data_type == 5:
    #     model.multi_train_predict(train, train_y, test, option=config.option)

    else:
        exit('invalid data_type')
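
# Example of the cache file written by deep_data_cache(), assuming the default CLI
# arguments defined below (--es 300, --data_type 3, --feature word, no --oe):
#   ../data/cache/deep_data_oeFalse_es300_dt3_fword.pkl
# Passing --oe changes the oe field to True; note that deep_data_process() always
# rebuilds the cache before loading it.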

def static_data_prepare():
    model_name = config.model_name
    if not model_name:
        model_name = "model_dict.pkl"
    logger.info('start load data')
    train_df = pd.read_csv(config.TRAIN_MULTI_X)
    test_df = pd.read_csv(config.TEST_X)
    if model_name == 'svc':
        content_word = pd.concat((train_df['word'], test_df['word']))
        content_char = pd.concat((train_df['char'], test_df['char']))
        word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=5, norm='l2')
        char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), min_df=1, norm='l2')

        ha = HashingVectorizer(ngram_range=(1, 1), lowercase=False)
        discuss_ha = ha.fit_transform(content_word)

        logger.info('start word feature extraction')
        word_feature = word_vectorizer.fit_transform(content_word)
        logger.info("complete word feature extraction models")
        logger.info("vocab len: %d" % len(word_vectorizer.vocabulary_.keys()))

        logger.info('start char feature extraction')
        char_feature = char_vectorizer.fit_transform(content_char)
        logger.info("complete char feature extraction models")
        logger.info("vocab len: %d" % len(char_vectorizer.vocabulary_.keys()))

        # NOTE: the feature assignments below overwrite one another; only the last
        # pair (word TF-IDF features alone) is actually used.
        train_feature = hstack([word_feature[:len(train_df)], char_feature[:len(train_df)]]).tocsr()
        test_feature = hstack([word_feature[len(train_df):], char_feature[len(train_df):]]).tocsr()

        train_feature = hstack((word_feature[:len(train_df)], discuss_ha[:len(train_df)])).tocsr()
        test_feature = hstack((word_feature[len(train_df):], discuss_ha[len(train_df):])).tocsr()

        train_feature = word_feature[:len(train_df)]
        test_feature = word_feature[len(train_df):]

        logger.info("complete feature extraction")
        logger.info("train feature shape: {}".format(np.shape(train_feature)))
        logger.info("test feature shape: {}".format(np.shape(test_feature)))

        train_y = np.array(train_df.iloc[:, 6:].values)
    else:
        train_feature = np.asarray([train_df['word']]).T
        train_y = np.array(train_df.iloc[:, 6:].values)
        test_feature = np.asarray([test_df['word']]).T
    return train_feature, train_y, test_feature


def static_data_process():
    # model train
    train_x, train_y, test = static_data_prepare()
    model = config.model[args.model](config=config, n_folds=5)
    model.train_predict(train_x, train_y, test, option=config.option)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, default='6')
    parser.add_argument('--model', type=str, help='model name (a key of config.model)')
    parser.add_argument('--option', type=int, default=1, help='training scheme')
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--data_type', type=int, default=3, help='problem formulation: 0 = subject only, 1 = sentiment only, 2 = ten 4-way classifiers, 3 = aspect (asp) mode')
    parser.add_argument('--feature', default='word', type=str, help="use 'word' or 'char' as the main feature")
    parser.add_argument('--es', default=300, type=int, help='embed size')
    parser.add_argument('--debug', default=False, action='store_true', help='debug mode runs a single fold only')
    parser.add_argument('--oe', default=False, action='store_true', help='use Baidu Baike pretrained word embeddings')
    parser.add_argument('--ml', default=False, action='store_true', help='use a traditional (non-deep) model')
    parser.add_argument('--car', default=False, action='store_true', help='use word embeddings trained on Autohome data')
    parser.add_argument('--balance', default=False, action='store_true', help='reweight the loss according to the class ratio')
    parser.add_argument('--bs', default=64, type=int, help='batch size')
    args = parser.parse_args()

    # set the Keras backend and visible GPU
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    config = Config()
    config.option = args.option
    config.outer_embed = args.oe
    config.n_epochs = args.epoch
    config.main_feature = args.feature
    config.model_name = args.model
    config.is_debug = args.debug
    config.BATCH_SIZE = args.bs
    config.gpu = args.gpu
    config.EMBED_SIZE = args.es
    config.data_type = args.data_type
    config.car = args.car
    config.balance = args.balance

    if config.model_name in ['svc', 'fasttext']:
        args.ml = True

    if args.ml:
        static_data_process()
    else:
        char_stoi = pickle.load(open(config.char_stoi_file, 'rb'))
        word_stoi = pickle.load(open(config.word_stoi_file, 'rb'))

        deep_data_process()

--------------------------------------------------------------------------------