├── .gitignore
├── README.md
├── data
│   └── placeholder
├── model.jpeg
└── src
    ├── bilm
    │   ├── __init__.py
    │   ├── data.py
    │   ├── elmo.py
    │   ├── model.py
    │   └── training.py
    ├── config.py
    ├── model
    │   ├── __init__.py
    │   ├── attention.py
    │   ├── bilstm_model.py
    │   ├── capsule_model.py
    │   ├── convlstm_model.py
    │   ├── dpcnn_model.py
    │   ├── han_model.py
    │   ├── hybrid_nn_1.py
    │   ├── lightgbm_model.py
    │   ├── lstmconv_model.py
    │   ├── lstmgru_model.py
    │   ├── ml_models.py
    │   ├── model_basic.py
    │   ├── model_component.py
    │   ├── modeling.py
    │   ├── my_callbacks.py
    │   ├── rcnn_model.py
    │   ├── snapshot.py
    │   ├── textcnn_model.py
    │   └── xgboost_model.py
    ├── pack_sub_dt2.py
    ├── preprocess
    │   ├── .ipynb_checkpoints
    │   │   └── EDA-checkpoint.ipynb
    │   ├── EDA.ipynb
    │   ├── word_tests.txt
    │   └── words.txt
    ├── stacking.py
    ├── tokenization.py
    ├── train_elmo.py
    └── train_predict.py

/.gitignore:
--------------------------------------------------------------------------------
1 | ckpt*/
2 | ./src/bilm/dump/
3 | ./src/bilm/result/
4 | ./src/runs/
5 | data/
6 | backup/
7 | src/loss/
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | bin/
17 | build/
18 | develop-eggs/
19 | dist/
20 | eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # Installer logs
31 | pip-log.txt
32 | pip-delete-this-directory.txt
33 | 
34 | # Unit test / coverage reports
35 | .tox/
36 | .coverage
37 | .cache
38 | nosetests.xml
39 | coverage.xml
40 | 
41 | # Translations
42 | *.mo
43 | 
44 | # Mr Developer
45 | .mr.developer.cfg
46 | .project
47 | .pydevproject
48 | 
49 | # Rope
50 | .ropeproject
51 | 
52 | # Django stuff:
53 | *.log
54 | *.pot
55 | 
56 | # Sphinx documentation
57 | docs/_build/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CCF-BDCI 2018 Automotive-Domain ASC Challenge
2 | 
3 | We had never worked on ASC/TSC before, and at first we went back and forth on whether this is a single-label or a multi-label classification problem, which cost us some detours. In the end we returned to the ASC formulation and, guided by intuition, designed a memory-based LSTM-attention model. It scored around 0.69 on the online B leaderboard of the final round, and the final ensemble reached 0.70. The single-model architecture is shown below:
4 | 
5 | ![](./model.jpeg)
6 | 
7 | 
8 | Time was tight towards the end, and our reimplementations of this year's ASC papers all underperformed, so we finished 6th out of 1701. The approach is exactly what the code shows, and it is quite simple.
9 | 
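To make the diagram above easier to map to code, here is a minimal NumPy sketch of the memory/aspect-attention step: each of the 10 aspects owns a learned vector, scores the BiLSTM/BiGRU token states with a small tanh attention, and classifies its attention-weighted summary into the 4 sentiment classes. Shapes and variable names are illustrative only; the actual TensorFlow implementation is `BilstmV0` in `src/model/bilstm_model.py`, where the scoring weights (`att_w`, `att_b`, `att_v`) are shared across aspects and everything runs batched.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

T, H, N_ASPECT, N_CLASS = 128, 300, 10, 4       # tokens, hidden size, aspects, polarities
states = np.random.randn(T, H)                  # BiLSTM/BiGRU token states (projected to H dims)
aspects = np.random.randn(N_ASPECT, H)          # one learned "memory" vector per aspect
W = np.random.randn(2 * H, H)                   # attention projection over [state; aspect]
b = np.zeros(H)
v = np.random.randn(H)
W_out = np.random.randn(2 * H, N_CLASS)         # output layer over [context; aspect]

logits = []
for a in aspects:
    pair = np.concatenate([states, np.tile(a, (T, 1))], axis=1)  # [T, 2H]
    score = np.tanh(pair @ W + b) @ v                            # [T] attention scores
    alpha = softmax(score)                                       # attention over tokens
    ctx = alpha @ states                                         # aspect-specific summary, [H]
    logits.append(np.concatenate([ctx, a]) @ W_out)              # [N_CLASS]
probs = softmax(np.stack(logits), axis=-1)                       # [N_ASPECT, N_CLASS]
```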
10 | The raw data can be downloaded from the [competition data page](https://www.datafountain.cn/competitions/310/details/data-evaluation). Because we modeled the problem in several different ways, there is quite a lot of preprocessing code, so I have uploaded a preprocessed copy of the data (including extracted BERT features and Baidu Baike word vectors) to [Baidu Netdisk](https://pan.baidu.com/s/1ZrgQ6Wp_sFRPrZGjZiBPaA); after downloading, unzip it into the `data/` directory.
11 | 
12 | Neither HIT's PyTorch-based pretrained ELMo nor the ELMo I pretrained with TensorFlow on the training set worked well, but the TensorFlow pretraining code is kept in the repo.
13 | 
14 | We did not fine-tune BERT; we only extracted features from it, which performed on par with the Baidu Baike word vectors.
15 | 
16 | If you have any ideas, feel free to open an issue or a pull request, or discuss with me directly over WeChat. Let's learn and improve together.
17 | 
18 | 
19 | ### 1. Environment
20 | 
21 | |Environment / Library|Version|
22 | |:---------:|----------|
23 | |Ubuntu|16.04.5 LTS|
24 | |python|3.6|
25 | |jupyter notebook|4.2.3|
26 | |tensorflow-gpu|1.9.1|
27 | |numpy|1.14.1|
28 | |pandas|0.23.0|
29 | |matplotlib|2.2.2|
30 | |tqdm|4.24.0|
31 | 
32 | The most important point is that we use the CuDNN implementation of LSTM, so TensorFlow must be newer than 1.4.0; accordingly, CUDA 8.0 will not work and 9.0 or above is required.
33 | 
34 | 
35 | ### 2. Data preprocessing
36 | 
37 | It is all written in `jupyter`: run `src/preprocess/EDA.ipynb` to generate the various files. It is worth reading to follow our approach, but we recommend simply downloading the preprocessed results from the cloud drive.
38 | 
39 | 
40 | ### 3. Deep model training
41 | 
42 | Once the data is preprocessed you can train models directly; training runs on a single GPU. Pick a model from `src/config.py`, and see `src/train_predict.py` for what each argument means:
43 | 
44 | ```
45 | python train_predict.py --gpu 7 --model aspv0 --feature word --epoch 20 --bs 128 --oe
46 | ```
47 | 
48 | 
49 | ### 4. Model ensembling and output
50 | 
51 | ```
52 | python stacking.py --gpu 1 --data_type 3
53 | ```
54 | 
55 | `stacking` and pseudo-labeling are done together in this step; modify the code to choose whether to use pseudo labels.
56 | 
57 | This dataset suits it fairly well, and pseudo labels provide a modest score boost.
58 | 
59 | ### 5. Submission
60 | 
61 | Edit `pre_path` in `src/pack_sub_dt2.py` to point to the probability files produced by stacking, then run
62 | 
63 | ```
64 | python pack_sub_dt2.py
65 | ```
66 | 
67 | to generate the submission file.
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
--------------------------------------------------------------------------------
/data/placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/data/placeholder
--------------------------------------------------------------------------------
/model.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/model.jpeg
--------------------------------------------------------------------------------
/src/bilm/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .data import Batcher, TokenBatcher
3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \
4 |     dump_bilm_embeddings
5 | from .elmo import weight_layers
6 | 
--------------------------------------------------------------------------------
/src/bilm/data.py:
--------------------------------------------------------------------------------
1 | # originally based on https://github.com/tensorflow/models/tree/master/lm_1b
2 | import glob
3 | import random
4 | 
5 | import numpy as np
6 | 
7 | from typing import List
8 | 
9 | 
10 | class Vocabulary(object):
11 |     '''
12 |     A token vocabulary.  Holds a map from token to ids and provides
13 |     a method for encoding text to a sequence of ids.
14 |     '''
15 |     def __init__(self, filename, validate_file=False):
16 |         '''
17 |         filename = the vocabulary file.  It is a flat text file with one
18 |             (normalized) token per line.  In addition, the file should also
19 |             contain the special tokens <S>, </S>, <UNK> (case sensitive).
20 |         '''
21 |         self._id_to_word = []
22 |         self._word_to_id = {}
23 |         self._unk = -1
24 |         self._bos = -1
25 |         self._eos = -1
26 | 
27 |         with open(filename) as f:
28 |             idx = 0
29 |             for line in f:
30 |                 word_name = line.strip()
31 |                 if word_name == '<S>':
32 |                     self._bos = idx
33 |                 elif word_name == '</S>':
34 |                     self._eos = idx
35 |                 elif word_name == '<UNK>':
36 |                     self._unk = idx
37 |                 if word_name == '!!!MAXTERMID':
38 |                     continue
39 | 
40 |                 self._id_to_word.append(word_name)
41 |                 self._word_to_id[word_name] = idx
42 |                 idx += 1
43 | 
44 |         # check to ensure file has special tokens
45 |         if validate_file:
46 |             if self._bos == -1 or self._eos == -1 or self._unk == -1:
47 |                 raise ValueError("Ensure the vocabulary file has "
48 |                                  "<S>, </S>, <UNK> tokens")
49 | 
50 |     @property
51 |     def bos(self):
52 |         return self._bos
53 | 
54 |     @property
55 |     def eos(self):
56 |         return self._eos
57 | 
58 |     @property
59 |     def unk(self):
60 |         return self._unk
61 | 
62 |     @property
63 |     def size(self):
64 |         return len(self._id_to_word)
65 | 
66 |     def word_to_id(self, word):
67 |         if word in self._word_to_id:
68 |             return self._word_to_id[word]
69 |         return self.unk
70 | 
71 |     def id_to_word(self, cur_id):
72 |         return self._id_to_word[cur_id]
73 | 
74 |     def decode(self, cur_ids):
75 |         """Convert a list of ids to a sentence, with space inserted."""
76 |         return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])
77 | 
78 |     def encode(self, sentence, reverse=False, split=True):
79 |         """Convert a sentence to a list of ids, with special tokens added.
80 |         Sentence is a single string with tokens separated by whitespace.
81 | 82 | If reverse, then the sentence is assumed to be reversed, and 83 | this method will swap the BOS/EOS tokens appropriately.""" 84 | 85 | if split: 86 | word_ids = [ 87 | self.word_to_id(cur_word) for cur_word in sentence.split() 88 | ] 89 | else: 90 | word_ids = [self.word_to_id(cur_word) for cur_word in sentence] 91 | 92 | if reverse: 93 | return np.array([self.eos] + word_ids + [self.bos], dtype=np.int32) 94 | else: 95 | return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32) 96 | 97 | 98 | class UnicodeCharsVocabulary(Vocabulary): 99 | """Vocabulary containing character-level and word level information. 100 | 101 | Has a word vocabulary that is used to lookup word ids and 102 | a character id that is used to map words to arrays of character ids. 103 | 104 | The character ids are defined by ord(c) for c in word.encode('utf-8') 105 | This limits the total number of possible char ids to 256. 106 | To this we add 5 additional special ids: begin sentence, end sentence, 107 | begin word, end word and padding. 108 | 109 | WARNING: for prediction, we add +1 to the output ids from this 110 | class to create a special padding id (=0). As a result, we suggest 111 | you use the `Batcher`, `TokenBatcher`, and `LMDataset` classes instead 112 | of this lower level class. If you are using this lower level class, 113 | then be sure to add the +1 appropriately, otherwise embeddings computed 114 | from the pre-trained model will be useless. 115 | """ 116 | def __init__(self, filename, max_word_length, **kwargs): 117 | super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs) 118 | self._max_word_length = max_word_length 119 | 120 | # char ids 0-255 come from utf-8 encoding bytes 121 | # assign 256-300 to special chars 122 | self.bos_char = 256 # 123 | self.eos_char = 257 # 124 | self.bow_char = 258 # 125 | self.eow_char = 259 # 126 | self.pad_char = 260 # 127 | 128 | num_words = len(self._id_to_word) 129 | 130 | self._word_char_ids = np.zeros([num_words, max_word_length], 131 | dtype=np.int32) 132 | 133 | # the charcter representation of the begin/end of sentence characters 134 | def _make_bos_eos(c): 135 | r = np.zeros([self.max_word_length], dtype=np.int32) 136 | r[:] = self.pad_char 137 | r[0] = self.bow_char 138 | r[1] = c 139 | r[2] = self.eow_char 140 | return r 141 | self.bos_chars = _make_bos_eos(self.bos_char) 142 | self.eos_chars = _make_bos_eos(self.eos_char) 143 | 144 | for i, word in enumerate(self._id_to_word): 145 | self._word_char_ids[i] = self._convert_word_to_char_ids(word) 146 | 147 | self._word_char_ids[self.bos] = self.bos_chars 148 | self._word_char_ids[self.eos] = self.eos_chars 149 | # TODO: properly handle 150 | 151 | @property 152 | def word_char_ids(self): 153 | return self._word_char_ids 154 | 155 | @property 156 | def max_word_length(self): 157 | return self._max_word_length 158 | 159 | def _convert_word_to_char_ids(self, word): 160 | code = np.zeros([self.max_word_length], dtype=np.int32) 161 | code[:] = self.pad_char 162 | 163 | word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)] 164 | code[0] = self.bow_char 165 | for k, chr_id in enumerate(word_encoded, start=1): 166 | code[k] = chr_id 167 | code[k + 1] = self.eow_char 168 | 169 | return code 170 | 171 | def word_to_char_ids(self, word): 172 | if word in self._word_to_id: 173 | return self._word_char_ids[self._word_to_id[word]] 174 | else: 175 | return self._convert_word_to_char_ids(word) 176 | 177 | def encode_chars(self, sentence, reverse=False, split=True): 178 | ''' 179 | 
Encode the sentence as a white space delimited string of tokens. 180 | ''' 181 | if split: 182 | chars_ids = [self.word_to_char_ids(cur_word) 183 | for cur_word in sentence.split()] 184 | else: 185 | chars_ids = [self.word_to_char_ids(cur_word) 186 | for cur_word in sentence] 187 | if reverse: 188 | return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars]) 189 | else: 190 | return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars]) 191 | 192 | 193 | class Batcher(object): 194 | ''' 195 | Batch sentences of tokenized text into character id matrices. 196 | ''' 197 | def __init__(self, lm_vocab_file: str, max_token_length: int): 198 | ''' 199 | lm_vocab_file = the language model vocabulary file (one line per 200 | token) 201 | max_token_length = the maximum number of characters in each token 202 | ''' 203 | self._lm_vocab = UnicodeCharsVocabulary( 204 | lm_vocab_file, max_token_length 205 | ) 206 | self._max_token_length = max_token_length 207 | 208 | def batch_sentences(self, sentences: List[List[str]]): 209 | ''' 210 | Batch the sentences as character ids 211 | Each sentence is a list of tokens without or , e.g. 212 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 213 | ''' 214 | n_sentences = len(sentences) 215 | max_length = max(len(sentence) for sentence in sentences) + 2 216 | 217 | X_char_ids = np.zeros( 218 | (n_sentences, max_length, self._max_token_length), 219 | dtype=np.int64 220 | ) 221 | 222 | for k, sent in enumerate(sentences): 223 | length = len(sent) + 2 224 | char_ids_without_mask = self._lm_vocab.encode_chars( 225 | sent, split=False) 226 | # add one so that 0 is the mask value 227 | X_char_ids[k, :length, :] = char_ids_without_mask + 1 228 | 229 | return X_char_ids 230 | 231 | 232 | class TokenBatcher(object): 233 | ''' 234 | Batch sentences of tokenized text into token id matrices. 235 | ''' 236 | def __init__(self, lm_vocab_file: str): 237 | ''' 238 | lm_vocab_file = the language model vocabulary file (one line per 239 | token) 240 | ''' 241 | self._lm_vocab = Vocabulary(lm_vocab_file) 242 | 243 | def batch_sentences(self, sentences: List[List[str]]): 244 | ''' 245 | Batch the sentences as character ids 246 | Each sentence is a list of tokens without or , e.g. 
247 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 248 | ''' 249 | n_sentences = len(sentences) 250 | max_length = max(len(sentence) for sentence in sentences) + 2 251 | 252 | X_ids = np.zeros((n_sentences, max_length), dtype=np.int64) 253 | 254 | for k, sent in enumerate(sentences): 255 | length = len(sent) + 2 256 | ids_without_mask = self._lm_vocab.encode(sent, split=False) 257 | # add one so that 0 is the mask value 258 | X_ids[k, :length] = ids_without_mask + 1 259 | 260 | return X_ids 261 | 262 | 263 | ##### for training 264 | def _get_batch(generator, batch_size, num_steps, max_word_length): 265 | """Read batches of input.""" 266 | cur_stream = [None] * batch_size 267 | 268 | no_more_data = False 269 | while True: 270 | inputs = np.zeros([batch_size, num_steps], np.int32) 271 | if max_word_length is not None: 272 | char_inputs = np.zeros([batch_size, num_steps, max_word_length], 273 | np.int32) 274 | else: 275 | char_inputs = None 276 | targets = np.zeros([batch_size, num_steps], np.int32) 277 | 278 | for i in range(batch_size): 279 | cur_pos = 0 280 | 281 | while cur_pos < num_steps: 282 | if cur_stream[i] is None or len(cur_stream[i][0]) <= 1: 283 | try: 284 | cur_stream[i] = list(next(generator)) 285 | except StopIteration: 286 | # No more data, exhaust current streams and quit 287 | no_more_data = True 288 | break 289 | 290 | how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos) 291 | next_pos = cur_pos + how_many 292 | 293 | inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many] 294 | if max_word_length is not None: 295 | char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][ 296 | :how_many] 297 | targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1] 298 | 299 | cur_pos = next_pos 300 | 301 | cur_stream[i][0] = cur_stream[i][0][how_many:] 302 | if max_word_length is not None: 303 | cur_stream[i][1] = cur_stream[i][1][how_many:] 304 | 305 | if no_more_data: 306 | # There is no more data. Note: this will not return data 307 | # for the incomplete batch 308 | break 309 | 310 | X = {'token_ids': inputs, 'tokens_characters': char_inputs, 311 | 'next_token_id': targets} 312 | 313 | yield X 314 | 315 | class LMDataset(object): 316 | """ 317 | Hold a language model dataset. 318 | 319 | A dataset is a list of tokenized files. Each file contains one sentence 320 | per line. Each sentence is pre-tokenized and white space joined. 321 | """ 322 | def __init__(self, filepattern, vocab, reverse=False, test=False, 323 | shuffle_on_load=False): 324 | ''' 325 | filepattern = a glob string that specifies the list of files. 326 | vocab = an instance of Vocabulary or UnicodeCharsVocabulary 327 | reverse = if True, then iterate over tokens in each sentence in reverse 328 | test = if True, then iterate through all data once then stop. 329 | Otherwise, iterate forever. 330 | shuffle_on_load = if True, then shuffle the sentences after loading. 
331 | ''' 332 | self._vocab = vocab 333 | self._all_shards = glob.glob(filepattern) 334 | print('Found %d shards at %s' % (len(self._all_shards), filepattern)) 335 | self._shards_to_choose = [] 336 | 337 | self._reverse = reverse 338 | self._test = test 339 | self._shuffle_on_load = shuffle_on_load 340 | self._use_char_inputs = hasattr(vocab, 'encode_chars') 341 | 342 | self._ids = self._load_random_shard() 343 | 344 | def _choose_random_shard(self): 345 | if len(self._shards_to_choose) == 0: 346 | self._shards_to_choose = list(self._all_shards) 347 | random.shuffle(self._shards_to_choose) 348 | shard_name = self._shards_to_choose.pop() 349 | return shard_name 350 | 351 | def _load_random_shard(self): 352 | """Randomly select a file and read it.""" 353 | if self._test: 354 | if len(self._all_shards) == 0: 355 | # we've loaded all the data 356 | # this will propogate up to the generator in get_batch 357 | # and stop iterating 358 | raise StopIteration 359 | else: 360 | shard_name = self._all_shards.pop() 361 | else: 362 | # just pick a random shard 363 | shard_name = self._choose_random_shard() 364 | 365 | ids = self._load_shard(shard_name) 366 | self._i = 0 367 | self._nids = len(ids) 368 | return ids 369 | 370 | def _load_shard(self, shard_name): 371 | """Read one file and convert to ids. 372 | 373 | Args: 374 | shard_name: file path. 375 | 376 | Returns: 377 | list of (id, char_id) tuples. 378 | """ 379 | print('Loading data from: %s' % shard_name) 380 | with open(shard_name) as f: 381 | sentences_raw = f.readlines() 382 | 383 | if self._reverse: 384 | sentences = [] 385 | for sentence in sentences_raw: 386 | splitted = sentence.split() 387 | splitted.reverse() 388 | sentences.append(' '.join(splitted)) 389 | else: 390 | sentences = sentences_raw 391 | 392 | if self._shuffle_on_load: 393 | random.shuffle(sentences) 394 | 395 | ids = [self.vocab.encode(sentence, self._reverse) 396 | for sentence in sentences] 397 | if self._use_char_inputs: 398 | chars_ids = [self.vocab.encode_chars(sentence, self._reverse) 399 | for sentence in sentences] 400 | else: 401 | chars_ids = [None] * len(ids) 402 | 403 | print('Loaded %d sentences.' 
% len(ids)) 404 | print('Finished loading') 405 | return list(zip(ids, chars_ids)) 406 | 407 | def get_sentence(self): 408 | while True: 409 | if self._i == self._nids: 410 | self._ids = self._load_random_shard() 411 | ret = self._ids[self._i] 412 | self._i += 1 413 | yield ret 414 | 415 | @property 416 | def max_word_length(self): 417 | if self._use_char_inputs: 418 | return self._vocab.max_word_length 419 | else: 420 | return None 421 | 422 | def iter_batches(self, batch_size, num_steps): 423 | for X in _get_batch(self.get_sentence(), batch_size, num_steps, 424 | self.max_word_length): 425 | 426 | # token_ids = (batch_size, num_steps) 427 | # char_inputs = (batch_size, num_steps, 50) of character ids 428 | # targets = word ID of next word (batch_size, num_steps) 429 | yield X 430 | 431 | @property 432 | def vocab(self): 433 | return self._vocab 434 | 435 | class BidirectionalLMDataset(object): 436 | def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False): 437 | ''' 438 | bidirectional version of LMDataset 439 | ''' 440 | self._data_forward = LMDataset( 441 | filepattern, vocab, reverse=False, test=test, 442 | shuffle_on_load=shuffle_on_load) 443 | self._data_reverse = LMDataset( 444 | filepattern, vocab, reverse=True, test=test, 445 | shuffle_on_load=shuffle_on_load) 446 | 447 | def iter_batches(self, batch_size, num_steps): 448 | max_word_length = self._data_forward.max_word_length 449 | 450 | for X, Xr in zip( 451 | _get_batch(self._data_forward.get_sentence(), batch_size, 452 | num_steps, max_word_length), 453 | _get_batch(self._data_reverse.get_sentence(), batch_size, 454 | num_steps, max_word_length) 455 | ): 456 | 457 | for k, v in Xr.items(): 458 | X[k + '_reverse'] = v 459 | 460 | yield X 461 | 462 | 463 | class InvalidNumberOfCharacters(Exception): 464 | pass 465 | 466 | -------------------------------------------------------------------------------- /src/bilm/elmo.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 
23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | def _l2_regularizer(weights): 33 | if l2_coef is not None: 34 | return l2_coef * tf.reduce_sum(tf.square(weights)) 35 | else: 36 | return 0.0 37 | 38 | # Get ops for computing LM embeddings and mask 39 | lm_embeddings = bilm_ops['lm_embeddings'] 40 | mask = bilm_ops['mask'] 41 | 42 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 43 | lm_dim = int(lm_embeddings.get_shape()[3]) 44 | 45 | with tf.control_dependencies([lm_embeddings, mask]): 46 | # Cast the mask and broadcast for layer use. 47 | mask_float = tf.cast(mask, 'float32') 48 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 49 | 50 | def _do_ln(x): 51 | # do layer normalization excluding the mask 52 | x_masked = x * broadcast_mask 53 | N = tf.reduce_sum(mask_float) * lm_dim 54 | mean = tf.reduce_sum(x_masked) / N 55 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask)**2 56 | ) / N 57 | return tf.nn.batch_normalization( 58 | x, mean, variance, None, None, 1E-12 59 | ) 60 | 61 | if use_top_only: 62 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 63 | # just the top layer 64 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 65 | # no regularization 66 | reg = 0.0 67 | else: 68 | W = tf.get_variable( 69 | '{}_ELMo_W'.format(name), 70 | shape=(n_lm_layers, ), 71 | initializer=tf.zeros_initializer, 72 | regularizer=_l2_regularizer, 73 | trainable=True, 74 | ) 75 | 76 | # normalize the weights 77 | normed_weights = tf.split( 78 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 79 | ) 80 | # split LM layers 81 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 82 | 83 | # compute the weighted, normalized LM activations 84 | pieces = [] 85 | for w, t in zip(normed_weights, layers): 86 | if do_layer_norm: 87 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 88 | else: 89 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 90 | sum_pieces = tf.add_n(pieces) 91 | 92 | # get the regularizer 93 | reg = [ 94 | r for r in tf.get_collection( 95 | tf.GraphKeys.REGULARIZATION_LOSSES) 96 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 97 | ] 98 | if len(reg) != 1: 99 | raise ValueError 100 | 101 | # scale the weighted sum by gamma 102 | gamma = tf.get_variable( 103 | '{}_ELMo_gamma'.format(name), 104 | shape=(1, ), 105 | initializer=tf.ones_initializer, 106 | regularizer=None, 107 | trainable=True, 108 | ) 109 | weighted_lm_layers = sum_pieces * gamma 110 | 111 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 112 | 113 | return ret 114 | 115 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # from model.lightgbm_model import LightGbmModel 2 | # from model.xgboost_model import XgboostModel 3 | from model.textcnn_model import TextCNNModel 4 | from model.dpcnn_model import DpcnnModel 5 | from model.capsule_model import CapsuleModel 6 | from model.rcnn_model import RCNNModel 7 | from model.attention import AttentionModel 8 | from model.convlstm_model import ConvlstmModel 9 | from model.lstmconv_model import LstmconvModel 10 | from model.lstmgru_model import LstmgruModel 11 | from model.han_model import HANModel 12 | from model.hybrid_nn_1 import HybridNN1Model 13 | from model.ml_models 
import SVCClassifier 14 | from model.ml_models import Fasttext 15 | from model.bilstm_model import * 16 | 17 | 18 | class Config(object): 19 | 20 | """Docstring for Config. """ 21 | 22 | def __init__(self): 23 | """TODO: to be defined1. """ 24 | self.model = { 25 | # 'xgboost': XgboostModel, 26 | # 'lightgbm': LightGbmModel, 27 | # 'svc': SVCClassifier, 28 | # 'fasttext': Fasttext, 29 | 30 | # dl model 31 | 'aspv0': BilstmV0, 32 | 'aspv1': BilstmV1, 33 | # 'aspv2': BilstmV2, 34 | 'textcnn': TextCNNModel, 35 | 'lstmgru': LstmgruModel, 36 | 'attention': AttentionModel, 37 | 'convlstm': ConvlstmModel, 38 | 'lstmconv': LstmconvModel, 39 | # 'dpcnn': DpcnnModel, 40 | # 'rcnn': RCNNModel, 41 | # 'capsule': CapsuleModel, 42 | # 'han': HANModel, 43 | # 'hybridnn1': HybridNN1Model, 44 | } 45 | self.CHAR_MAXLEN = 190 46 | self.WORD_MAXLEN = 128 47 | 48 | self.HANN_SENT = 20 49 | self.HANN_WORD_LEN = 40 50 | self.HANN_CHAR_LEN = 70 51 | self.EMBED_SIZE = 300 52 | self.main_feature = 'word' 53 | self.is_debug = True 54 | # self.elmo_word_options_file = './bilm/dump/options.word.json' 55 | # self.elmo_word_weight_file = './bilm/dump/weights.word.hdf5' 56 | # self.elmo_word_embed_file = './bilm/dump/vocab_embedding.word.hdf5' 57 | # self.elmo_word_vocab_file = '../data/word2vec_models/word2vec.word.300d.vocab.txt' 58 | 59 | # self.elmo_char_options_file = './bilm/dump/options.char.json' 60 | # self.elmo_char_weight_file = './bilm/dump/weights.char.hdf5' 61 | # self.elmo_char_embed_file = './bilm/dump/vocab_embedding.char.hdf5' 62 | # self.elmo_char_vocab_file = '../data/word2vec_models/word2vec.char.300d.vocab.txt' 63 | 64 | # self.elmo_qiuqiu_options_file = './bilm/dump/tmp/options.json' 65 | # self.elmo_qiuqiu_weight_file = './bilm/dump/tmp/weight-11-4.hdf5' 66 | # self.elmo_qiuqiu_embed_file = './bilm/dump/tmp/word_embedding.after.elmo-11-4.hdf5' 67 | # self.elmo_qiuqiu_vocab_file = './bilm/dump/tmp/sa_elmo_vocabs.txt' 68 | 69 | self.loss_path = '../data/loss' 70 | self.TEST_X = '../data/csvs/test_public.csv' 71 | self.TRAIN_MULTI_X = '../data/csvs/train_multi.csv' 72 | self.TRAIN_JP = '../data/csvs/round2zh2jp.csv' 73 | self.TRAIN_EN = '../data/csvs/round2zh2en.csv' 74 | # self.SENTIMENT_EMBED_PATH = '../data/sentiment_embedding.pkl' 75 | 76 | # self.BERT_VOCAB_FILES = '../data/chinese_L-12_H-768_A-12/vocab.txt' 77 | # self.BERT_CONFIG_FILES = '../data/chinese_L-12_H-768_A-12/bert_config.json' 78 | 79 | # self.Y_DISTILLATION = '../data/result/oof.pkl' 80 | 81 | # property 等待调用到它时才计算,先加载embed size再加载对应词向量 82 | @property 83 | def char_stoi_file(self): 84 | if self.car: 85 | return '../data/char_item_to_id.cars-home.pkl' 86 | else: 87 | return '../data/char_item_to_id.pkl' 88 | 89 | @property 90 | def word_stoi_file(self): 91 | if self.car: 92 | return '../data/word_item_to_id.cars-home.pkl' 93 | else: 94 | return '../data/word_item_to_id.pkl' 95 | 96 | @property 97 | def char_w2v_file(self): 98 | if self.outer_embed: 99 | return '../data/word2vec_models/sgns.baidubaike.bigram-char' 100 | else: 101 | if not self.car: 102 | return '../data/word2vec_models/word2vec.char.{}d.model.txt'.format(self.EMBED_SIZE) 103 | else: 104 | return '../data/word2vec_models/word2vec.char.{}d.model.cars-home.txt'.format(self.EMBED_SIZE) 105 | 106 | 107 | @property 108 | def word_w2v_file(self): 109 | 110 | if self.outer_embed: 111 | return '../data/word2vec_models/sgns.baidubaike.bigram-char' 112 | else: 113 | if not self.car: 114 | return '../data/word2vec_models/word2vec.word.{}d.model.txt'.format(self.EMBED_SIZE) 115 | 
else: 116 | return '../data/word2vec_models/word2vec.word.{}d.model.cars-home.txt'.format(self.EMBED_SIZE) 117 | 118 | @property 119 | def TRAIN_X(self): 120 | if self.data_type == 0: 121 | return '../data/csvs/train_single_label.csv' 122 | elif self.data_type == 1: 123 | return '../data/csvs/train_single_label.csv' 124 | elif self.data_type == 2: 125 | return '../data/csvs/train_multi.csv' 126 | elif self.data_type == 3: 127 | return '../data/csvs/train_multi.csv' 128 | elif self.data_type == 4: 129 | return '../data/csvs/train.csv' 130 | elif self.data_type == 5: 131 | return '../data/csvs/multi_train.csv' 132 | 133 | @property 134 | def n_classes(self): 135 | if self.data_type == 0: 136 | return 10 137 | elif self.data_type == 1: 138 | return 3 139 | elif self.data_type == 2: 140 | return 4 141 | elif self.data_type == 3: 142 | return 4 143 | elif self.data_type == 4: 144 | return 3 145 | elif self.data_type == 5: 146 | return 30 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /src/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/src/model/__init__.py -------------------------------------------------------------------------------- /src/model/attention.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn 6 | 7 | n_sub = 10 8 | 9 | 10 | class AttentionModel(BasicDeepModel): 11 | def __init__(self, name='basicModel', n_folds=5, config=None): 12 | name = 'attention' + config.main_feature 13 | self.hidden_dim = 150 14 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 15 | 16 | def create_model(self, share_dense=True, concat_sub=True): 17 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 18 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 19 | 20 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 21 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 22 | 23 | if self.main_feature.lower() in ['word', 'char']: 24 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 25 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 26 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 27 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 28 | 29 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 30 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 31 | if self.main_feature == 'elmo_word': 32 | options_file = self.config.elmo_word_options_file 33 | weight_file = self.config.elmo_word_weight_file 34 | embed_file = self.config.elmo_word_embed_file 35 | elif self.main_feature == 'elmo_char': 36 | options_file = self.config.elmo_char_options_file 37 | weight_file = self.config.elmo_char_weight_file 38 | embed_file = self.config.elmo_char_embed_file 39 | elif self.main_feature == 'elmo_qiuqiu': 40 | options_file = 
self.config.elmo_qiuqiu_options_file 41 | weight_file = self.config.elmo_qiuqiu_weight_file 42 | embed_file = self.config.elmo_qiuqiu_embed_file 43 | self.bilm = BidirectionalLanguageModel(options_file, 44 | weight_file, 45 | use_character_inputs=False, 46 | embedding_weight_file=embed_file, 47 | max_batch_size=self.batch_size) 48 | bilm_embedding_op = self.bilm(self.input_x) 49 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 50 | self.word_encoding = bilm_embedding['weighted_op'] 51 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 52 | 53 | else: 54 | exit('wrong feature') 55 | 56 | c_outputs = [] 57 | for c in range(n_sub): 58 | with tf.variable_scope('lstm-{}'.format(c)): 59 | # self.forward = self.LSTM() 60 | # self.backward = self.LSTM() 61 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 62 | # x = tf.concat(x, -1) 63 | #### cudnn lstm #### 64 | self.forward_lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 65 | self.forward_gru = cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 66 | x, _ = self.forward_lstm(tf.transpose(self.word_encoding, [1, 0, 2])) 67 | x, _ = self.forward_gru(x) 68 | x = tf.transpose(x, [1, 0, 2]) 69 | 70 | with tf.variable_scope('pooling-{}'.format(c)): 71 | max_pooled = tf.reshape(tf.reduce_max(x, 1), [-1, 2*self.hidden_dim]) 72 | avg_pooled = tf.reshape(tf.reduce_mean(x, 1), [-1, 2*self.hidden_dim]) 73 | 74 | att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim], name='att_w') 75 | att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 76 | att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 77 | 78 | x_reshape = tf.reshape(x, [-1, 2*self.hidden_dim]) 79 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(x_reshape, att_w)) + att_b, att_v), [-1, 1, self.max_len]) 80 | alpha = tf.nn.softmax(score, axis=-1) 81 | att_pooled = tf.reshape(tf.matmul(alpha, x), [-1, 2*self.hidden_dim]) 82 | 83 | concat_pooled = tf.concat((max_pooled, att_pooled, avg_pooled), -1) 84 | 85 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob) 86 | dense = tf.layers.dense(concat_pooled, 4, activation=None) 87 | c_outputs.append(dense) 88 | 89 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 90 | y_ = tf.nn.softmax(self.logits) 91 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 92 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 93 | 94 | if not self.config.balance: 95 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 96 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 97 | else: 98 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 99 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 100 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 101 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 102 | class0_weight = 1 # 第0类的权重系数 103 | class1_weight = 3 # 第1类的权重系数 104 | class2_weight = 3 # 第2类的权重系数 105 | class3_weight = 3 # 第3类的权重系数 106 | # coe = tf.constant([1., 1., 1., 1.]) 107 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 108 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 109 | 110 | y = tf.reshape(self.input_y, [-1, 4]) 111 | self.loss = 
tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 112 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 113 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 114 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 115 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 116 | 117 | return self 118 | 119 | 120 | -------------------------------------------------------------------------------- /src/model/bilstm_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | from model import modeling 3 | import tensorflow as tf 4 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 5 | from bilm.elmo import weight_layers 6 | 7 | n_sub = 10 8 | 9 | class BilstmV0(BasicDeepModel): 10 | def __init__(self, name='basicModel', n_folds=5, config=None): 11 | name = 'qiuqiuv0' + config.main_feature 12 | self.hidden_dim = 300 13 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 14 | 15 | def create_model(self): 16 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y') 17 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 18 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 19 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 20 | 21 | if self.main_feature.lower() in ['word', 'char']: 22 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 23 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 24 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 25 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 26 | 27 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 28 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 29 | if self.main_feature == 'elmo_word': 30 | options_file = self.config.elmo_word_options_file 31 | weight_file = self.config.elmo_word_weight_file 32 | embed_file = self.config.elmo_word_embed_file 33 | elif self.main_feature == 'elmo_char': 34 | options_file = self.config.elmo_char_options_file 35 | weight_file = self.config.elmo_char_weight_file 36 | embed_file = self.config.elmo_char_embed_file 37 | elif self.main_feature == 'elmo_qiuqiu': 38 | options_file = self.config.elmo_qiuqiu_options_file 39 | weight_file = self.config.elmo_qiuqiu_weight_file 40 | embed_file = self.config.elmo_qiuqiu_embed_file 41 | 42 | self.bilm = BidirectionalLanguageModel(options_file, 43 | weight_file, 44 | use_character_inputs=False, 45 | embedding_weight_file=embed_file, 46 | max_batch_size=self.batch_size) 47 | bilm_embedding_op = self.bilm(self.input_x) 48 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 49 | self.word_encoding = bilm_embedding['weighted_op'] 50 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 51 | 52 | else: 53 | exit('wrong feature') 54 | 55 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding') 56 | # self.layer_embedding = tf.get_variable(initializer=self.sentiment_embed, name='layer_embedding') 57 | 58 | self.forward = self.LSTM() 59 | self.backwad = self.LSTM() 60 | # self.forward2 = self.LSTM() 61 | # 
self.backwad2 = self.LSTM() 62 | 63 | # add point 64 | self.forward2 = self.GRU() 65 | self.backwad2 = self.GRU() 66 | 67 | with tf.variable_scope('sentence_encode'): 68 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32) 69 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 70 | output_sentence = tf.concat(axis=2, values=all_output_words) 71 | 72 | with tf.variable_scope('sentence_encode2'): 73 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,output_sentence,dtype=tf.float32) 74 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 75 | output_sentence = tf.concat(axis=2, values=all_output_words) 76 | output_sentence = tf.layers.dense(output_sentence, self.hidden_dim, activation=tf.nn.tanh) 77 | sentence_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len, self.hidden_dim]) 78 | sentence_reshape_tile = tf.tile(sentence_reshape, [1, 10, 1, 1]) # 句子复制10份 79 | 80 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim]) 81 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1]) 82 | 83 | embed_concat = tf.reshape(tf.concat(axis=3,values=[sentence_reshape_tile,layer_reshape_tile]),[-1,2*self.hidden_dim]) 84 | 85 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w') 86 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 87 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 88 | 89 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(embed_concat,self.att_w) + self.att_b),self.att_v),[-1,10,self.max_len]) 90 | alpah = tf.nn.softmax(score,axis=2) 91 | layer_sentence = tf.matmul(alpah,output_sentence) 92 | 93 | layer_reshape2 = tf.reshape(self.layer_embedding,[1,10,self.hidden_dim]) 94 | layer_reshape2_tile = tf.tile(layer_reshape2,[self.batch_size,1,1]) 95 | layer_sentence = tf.concat(axis=2,values=[layer_sentence,layer_reshape2_tile]) 96 | layer_sentence = tf.reshape(layer_sentence,[-1,2*self.hidden_dim]) 97 | 98 | layer_sentence = tf.layers.dense(layer_sentence,self.hidden_dim,activation=tf.nn.relu) 99 | 100 | # add point 101 | layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob) 102 | 103 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None) 104 | y_ = tf.nn.softmax(self.logits, axis=1) 105 | self.prob = tf.reshape(y_, [-1, 10, 4]) 106 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 107 | 108 | if not self.config.balance: 109 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 110 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 111 | else: 112 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 113 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 114 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 115 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 116 | class0_weight = 1 # 第0类的权重系数 117 | class1_weight = 3 # 第1类的权重系数 118 | class2_weight = 3 # 第2类的权重系数 119 | class3_weight = 3 # 第3类的权重系数 120 | # coe = tf.constant([1., 1., 1., 1.]) 121 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 122 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 123 | 124 | y = tf.reshape(self.input_y, [-1, 4]) 125 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 126 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 
127 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 128 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 129 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 130 | 131 | return self 132 | 133 | def LSTM(self, layers=1): 134 | lstms = [] 135 | for num in range(layers): 136 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 137 | print(lstm.name) 138 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 139 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 140 | lstms.append(lstm) 141 | 142 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 143 | return lstms 144 | 145 | def GRU(self, layers=1): 146 | lstms = [] 147 | for num in range(layers): 148 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 149 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 150 | print(lstm.name) 151 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 152 | lstms.append(lstm) 153 | 154 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 155 | return lstms 156 | 157 | 158 | class BilstmV1(BasicDeepModel): 159 | def __init__(self, name='basicModel', n_folds=5, config=None): 160 | name = 'qiuqiuv1' + config.main_feature 161 | self.hidden_dim = 300 162 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 163 | 164 | def create_model(self, concat_sub=True): 165 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y') 166 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 167 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 168 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 169 | 170 | if self.main_feature.lower() in ['word', 'char']: 171 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 172 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 173 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 174 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 175 | 176 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 177 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 178 | if self.main_feature == 'elmo_word': 179 | options_file = self.config.elmo_word_options_file 180 | weight_file = self.config.elmo_word_weight_file 181 | embed_file = self.config.elmo_word_embed_file 182 | elif self.main_feature == 'elmo_char': 183 | options_file = self.config.elmo_char_options_file 184 | weight_file = self.config.elmo_char_weight_file 185 | embed_file = self.config.elmo_char_embed_file 186 | elif self.main_feature == 'elmo_qiuqiu': 187 | options_file = self.config.elmo_qiuqiu_options_file 188 | weight_file = self.config.elmo_qiuqiu_weight_file 189 | embed_file = self.config.elmo_qiuqiu_embed_file 190 | 191 | self.bilm = BidirectionalLanguageModel(options_file, 192 | weight_file, 193 | use_character_inputs=False, 194 | embedding_weight_file=embed_file, 195 | max_batch_size=self.batch_size) 196 | bilm_embedding_op = self.bilm(self.input_x) 197 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 198 | self.word_encoding = bilm_embedding['weighted_op'] 199 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) 
# new 200 | 201 | else: 202 | exit('wrong feature') 203 | 204 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding') 205 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim]) 206 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1]) 207 | 208 | self.forward = self.LSTM() 209 | self.backwad = self.LSTM() 210 | self.forward2 = self.LSTM() 211 | self.backwad2 = self.LSTM() 212 | 213 | with tf.variable_scope('sentence_encode'): 214 | s1_out, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32) 215 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 216 | s1_out = tf.concat(axis=2, values=s1_out) 217 | s1_reshape = tf.reshape(s1_out, [-1, 1, self.max_len, 2*self.hidden_dim]) 218 | s1_tile = tf.tile(s1_reshape, [1, 10, 1, 1]) # 第一层lstm复制10份 219 | 220 | s2_input = tf.reshape(tf.concat((s1_tile, layer_reshape_tile), -1), [-1, self.max_len, 3*self.hidden_dim]) 221 | 222 | with tf.variable_scope('sentence_encode2'): 223 | s2_out, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,s2_input,dtype=tf.float32) 224 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 225 | s2_out = tf.reshape(tf.concat(axis=-1, values=s2_out), [-1, 10, self.max_len, 2*self.hidden_dim]) 226 | res_out = s2_out + s1_tile 227 | res_dense = tf.layers.dense(res_out, self.hidden_dim, activation=tf.nn.relu) 228 | 229 | res_layer_concat = tf.reshape(tf.concat((res_dense, layer_reshape_tile), -1), [-1, 2*self.hidden_dim]) 230 | 231 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w') 232 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 233 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 234 | 235 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(res_layer_concat, self.att_w) + self.att_b),self.att_v),[-1,1,self.max_len]) 236 | alpha = tf.nn.softmax(score) 237 | layer_sentence = tf.reshape(tf.matmul(alpha, tf.reshape(res_out, [-1, self.max_len, 2*self.hidden_dim])), [-1, n_sub, 2*self.hidden_dim]) 238 | 239 | if concat_sub: 240 | # 是否拼接layer_sub信息 241 | layer_sub = tf.reshape(self.layer_embedding, [1, n_sub, self.hidden_dim]) 242 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1]) 243 | 244 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1) 245 | outputs = tf.reshape(layer_total, [-1, 3*self.hidden_dim]) 246 | else: 247 | outputs = tf.reshape(layer_sentence, [-1, 2*self.hidden_dim]) 248 | 249 | self.logits = tf.layers.dense(outputs, 4, activation=None) 250 | y_ = tf.nn.softmax(self.logits) 251 | self.prob = tf.reshape(y_, [-1, 10, 4]) 252 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 253 | 254 | if not self.config.balance: 255 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 256 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 257 | else: 258 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 259 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 260 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 261 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 262 | class0_weight = 1 # 第0类的权重系数 263 | class1_weight = 3 # 第1类的权重系数 264 | class2_weight = 3 # 第2类的权重系数 265 | class3_weight = 3 # 第3类的权重系数 266 | # coe = tf.constant([1., 1., 1., 1.]) 267 | # y = 
tf.reshape(self.input_y, [-1, 4]) * coe 268 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 269 | 270 | y = tf.reshape(self.input_y, [-1, 4]) 271 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 272 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 273 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 274 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 275 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 276 | 277 | return self 278 | 279 | def LSTM(self, layers=1): 280 | lstms = [] 281 | for num in range(layers): 282 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 283 | print(lstm.name) 284 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 285 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 286 | lstms.append(lstm) 287 | 288 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 289 | return lstms 290 | 291 | def GRU(self, layers=1): 292 | lstms = [] 293 | for num in range(layers): 294 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 295 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 296 | print(lstm.name) 297 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 298 | lstms.append(lstm) 299 | 300 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 301 | return lstms 302 | 303 | 304 | class BilstmV2(BasicDeepModel): 305 | def __init__(self, name='basicModel', n_folds=5, config=None): 306 | name = 'qiuqiuv2' + config.main_feature 307 | self.hidden_dim = 300 308 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 309 | 310 | def create_model(self): 311 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,10,4], name='input_y') 312 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 313 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 314 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 315 | 316 | self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='input_ids') 317 | self.mask_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='mask_ids') 318 | self.type_ids = tf.placeholder(dtype=tf.int32, shape=[None,190], name='type_ids') 319 | self.is_training = tf.placeholder(dtype=tf.bool, name='is_training') 320 | 321 | # bert_hidden_size = bert_output_layer.shape[-1].value 322 | # hidden_size = output_layer.shape[-1].value 323 | 324 | if self.main_feature.lower() in ['word', 'char']: 325 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 326 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 327 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 328 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 329 | 330 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 331 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 332 | if self.main_feature == 'elmo_word': 333 | options_file = self.config.elmo_word_options_file 334 | weight_file = self.config.elmo_word_weight_file 335 | embed_file = self.config.elmo_word_embed_file 336 | elif self.main_feature == 'elmo_char': 337 | options_file = self.config.elmo_char_options_file 338 | weight_file = self.config.elmo_char_weight_file 339 | embed_file 
= self.config.elmo_char_embed_file 340 | elif self.main_feature == 'elmo_qiuqiu': 341 | options_file = self.config.elmo_qiuqiu_options_file 342 | weight_file = self.config.elmo_qiuqiu_weight_file 343 | embed_file = self.config.elmo_qiuqiu_embed_file 344 | 345 | self.bilm = BidirectionalLanguageModel(options_file, 346 | weight_file, 347 | use_character_inputs=False, 348 | embedding_weight_file=embed_file, 349 | max_batch_size=self.batch_size) 350 | bilm_embedding_op = self.bilm(self.input_x) 351 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 352 | self.word_encoding = bilm_embedding['weighted_op'] 353 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 354 | 355 | else: 356 | exit('wrong feature') 357 | 358 | self.layer_embedding = tf.get_variable(shape=[10, self.hidden_dim], name='layer_embedding') 359 | 360 | self.forward = self.LSTM() 361 | self.backwad = self.LSTM() 362 | # self.forward2 = self.LSTM() 363 | # self.backwad2 = self.LSTM() 364 | 365 | # add point 366 | self.forward2 = self.GRU() 367 | self.backwad2 = self.GRU() 368 | 369 | # bert使用 370 | bert_config = modeling.BertConfig.from_json_file(self.config.BERT_CONFIG_FILES) 371 | 372 | bert_model = modeling.BertModel( 373 | config=bert_config, 374 | is_training=self.is_training, 375 | input_ids=self.input_ids, 376 | input_mask=self.mask_ids, 377 | token_type_ids=self.type_ids 378 | ) 379 | if self.is_training is not None: 380 | print('bert config hidden dropout -- ---', bert_config.hidden_dropout_prob) 381 | print('bert config hidden dropout -- ---', bert_config.attention_probs_dropout_prob) 382 | self.word_encoding = bert_model.get_sequence_output() 383 | all_layer_output = bert_model.get_all_encoder_layers() 384 | self.word_encoding = (all_layer_output[0] + all_layer_output[1] + all_layer_output[2] + all_layer_output[3]) / 4 385 | with tf.variable_scope('sentence_encode'): 386 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backwad,self.word_encoding,dtype=tf.float32) 387 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 388 | output_sentence = tf.concat(axis=2, values=all_output_words) 389 | 390 | with tf.variable_scope('sentence_encode2'): 391 | all_output_words, _ = tf.nn.bidirectional_dynamic_rnn(self.forward2,self.backwad2,output_sentence,dtype=tf.float32) 392 | # output_sentence = 0.5*(all_output_words[0] + all_output_words[1]) 393 | output_sentence = tf.concat(axis=2, values=all_output_words) 394 | output_sentence = tf.layers.dense(output_sentence, self.hidden_dim, activation=tf.nn.tanh) 395 | sentence_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len, self.hidden_dim]) 396 | sentence_reshape_tile = tf.tile(sentence_reshape, [1, 10, 1, 1]) # 句子复制10份 397 | 398 | layer_reshape = tf.reshape(self.layer_embedding, [1, 10, 1, self.hidden_dim]) 399 | layer_reshape_tile = tf.tile(layer_reshape, [self.batch_size, 1, self.max_len, 1]) 400 | 401 | embed_concat = tf.reshape(tf.concat(axis=3,values=[sentence_reshape_tile,layer_reshape_tile]),[-1,2*self.hidden_dim]) 402 | 403 | self.att_w = tf.get_variable(shape=[2*self.hidden_dim,self.hidden_dim],name='att_w') 404 | self.att_b = tf.get_variable(shape=[self.hidden_dim],name='att_b') 405 | self.att_v = tf.get_variable(shape=[self.hidden_dim,1],name='att_v') 406 | 407 | score = tf.reshape(tf.matmul(tf.nn.tanh(tf.matmul(embed_concat,self.att_w) + self.att_b),self.att_v),[-1,10,self.max_len]) 408 | alpah = tf.nn.softmax(score,axis=2) 409 | layer_sentence = 
tf.matmul(alpah,output_sentence) 410 | 411 | layer_reshape2 = tf.reshape(self.layer_embedding,[1,10,self.hidden_dim]) 412 | layer_reshape2_tile = tf.tile(layer_reshape2,[self.batch_size,1,1]) 413 | layer_sentence = tf.concat(axis=2,values=[layer_sentence,layer_reshape2_tile]) 414 | layer_sentence = tf.reshape(layer_sentence,[-1,2*self.hidden_dim]) 415 | 416 | layer_sentence = tf.layers.dense(layer_sentence,self.hidden_dim,activation=tf.nn.relu) 417 | 418 | # add point 419 | layer_sentence = tf.nn.dropout(layer_sentence, self.dropout_keep_prob) 420 | 421 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None) 422 | y_ = tf.nn.softmax(self.logits, axis=1) 423 | self.prob = tf.reshape(y_, [-1, 10, 4]) 424 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 425 | 426 | if not self.config.balance: 427 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 428 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 429 | else: 430 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 431 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 432 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 433 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 434 | class0_weight = 1 # 第0类的权重系数 435 | class1_weight = 3 # 第1类的权重系数 436 | class2_weight = 3 # 第2类的权重系数 437 | class3_weight = 3 # 第3类的权重系数 438 | # coe = tf.constant([1., 1., 1., 1.]) 439 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 440 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 441 | 442 | y = tf.reshape(self.input_y, [-1, 4]) 443 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 444 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 445 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 446 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 447 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 448 | 449 | return self 450 | 451 | def LSTM(self, layers=1): 452 | lstms = [] 453 | for num in range(layers): 454 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 455 | print(lstm.name) 456 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 457 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 458 | lstms.append(lstm) 459 | 460 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 461 | return lstms 462 | 463 | def GRU(self, layers=1): 464 | lstms = [] 465 | for num in range(layers): 466 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 467 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 468 | print(lstm.name) 469 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 470 | lstms.append(lstm) 471 | 472 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 473 | return lstms 474 | 475 | -------------------------------------------------------------------------------- /src/model/capsule_model.py: -------------------------------------------------------------------------------- 1 | from keras.layers import * 2 | from keras.models import * 3 | from model.model_basic import BasicDeepModel 4 | from model.model_component import Capsule 5 | from keras import regularizers 6 | 7 | class CapsuleModel(BasicDeepModel): 8 | def __init__(self, name='basicModel', num_flods=5, config=None): 9 | name = 'capsule' + config.main_feature 10 | 
BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 11 | 12 | def create_model(self): 13 | Routings = 5 14 | Num_capsule = 10 15 | Dim_capsule = 16 16 | dropout_p = 0.25 17 | rate_drop_dense = 0.28 18 | gru_len = 128 19 | if self.main_feature == 'char': 20 | input = Input(shape=(self.max_len,), name='char') 21 | else: 22 | input = Input(shape=(self.max_len,), name='word') 23 | 24 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 25 | x = Masking(mask_value=self.mask_value)(input) 26 | x = embedding(x) 27 | 28 | x = SpatialDropout1D(rate_drop_dense)(x) 29 | 30 | x = Bidirectional(GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))(x) 31 | # x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x) 32 | 33 | capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, 34 | share_weights=True)(x) 35 | 36 | capsule = Flatten()(capsule) 37 | capsule = Dropout(dropout_p)(capsule) 38 | dense = Dense(self.n_class, activation="softmax")(capsule) 39 | res_model = Model(inputs=[input], outputs=dense) 40 | 41 | return res_model 42 | -------------------------------------------------------------------------------- /src/model/convlstm_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | 6 | n_sub = 10 7 | 8 | class ConvlstmModel(BasicDeepModel): 9 | def __init__(self, name='basicModel', n_folds=5, config=None): 10 | name = 'convlstm' + config.main_feature 11 | self.hidden_dim = 300 12 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 13 | 14 | def LSTM(self, layers=1): 15 | lstms = [] 16 | for num in range(layers): 17 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 18 | print(lstm.name) 19 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 20 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 21 | lstms.append(lstm) 22 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 23 | return lstms 24 | 25 | def GRU(self, layers=1): 26 | lstms = [] 27 | for num in range(layers): 28 | # lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 29 | lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 30 | print(lstm.name) 31 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 32 | lstms.append(lstm) 33 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 34 | return lstms 35 | 36 | def create_model(self, share_dense=True, concat_sub=True): 37 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 38 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 39 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 40 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 41 | 42 | if self.main_feature.lower() in ['word', 'char']: 43 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 44 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 45 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 46 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 47 | 48 | 
elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 49 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 50 | if self.main_feature == 'elmo_word': 51 | options_file = self.config.elmo_word_options_file 52 | weight_file = self.config.elmo_word_weight_file 53 | embed_file = self.config.elmo_word_embed_file 54 | elif self.main_feature == 'elmo_char': 55 | options_file = self.config.elmo_char_options_file 56 | weight_file = self.config.elmo_char_weight_file 57 | embed_file = self.config.elmo_char_embed_file 58 | elif self.main_feature == 'elmo_qiuqiu': 59 | options_file = self.config.elmo_qiuqiu_options_file 60 | weight_file = self.config.elmo_qiuqiu_weight_file 61 | embed_file = self.config.elmo_qiuqiu_embed_file 62 | 63 | self.bilm = BidirectionalLanguageModel(options_file, 64 | weight_file, 65 | use_character_inputs=False, 66 | embedding_weight_file=embed_file, 67 | max_batch_size=self.batch_size) 68 | bilm_embedding_op = self.bilm(self.input_x) 69 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 70 | self.word_encoding = bilm_embedding['weighted_op'] 71 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 72 | 73 | else: 74 | exit('wrong feature') 75 | 76 | inputs_expanded = tf.expand_dims(self.word_encoding, -1) 77 | n_filters = 128 78 | filter_shape = [3, self.embed_size, 1, n_filters] 79 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W') 80 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 81 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d') 82 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 83 | h = tf.reshape(h, [-1, self.max_len-3+1, n_filters]) 84 | 85 | self.forward = self.LSTM() 86 | self.backward = self.LSTM() 87 | x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, h, dtype=tf.float32) 88 | x = tf.concat(x, -1) 89 | output_sentence = tf.layers.dense(x, self.hidden_dim, activation=tf.nn.relu) 90 | 91 | x_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len-3+1, self.hidden_dim]) 92 | x_tile = tf.tile(x_reshape, [1, n_sub, 1, 1]) # 句子复制n_sub份 93 | 94 | sub_embedding = tf.get_variable(shape=[n_sub, self.hidden_dim], name='sub_embedding') 95 | sub_reshape = tf.reshape(sub_embedding, [1, n_sub, 1, self.hidden_dim]) 96 | sub_tile = tf.tile(sub_reshape, [self.batch_size, 1, self.max_len-3+1, 1]) 97 | 98 | embed_concat = tf.reshape(tf.concat((x_tile, sub_tile), -1), [-1, 2*self.hidden_dim]) 99 | 100 | att_w = tf.get_variable(shape=[2*self.hidden_dim, self.hidden_dim], name='att_w') 101 | att_b = tf.get_variable(shape=[self.hidden_dim], name='att_b') 102 | att_v = tf.get_variable(shape=[self.hidden_dim, 1], name='att_v') 103 | 104 | score = tf.matmul(tf.nn.tanh(tf.matmul(embed_concat, att_w) + att_b), att_v) 105 | score_fit = tf.reshape(score, [-1, n_sub, self.max_len-3+1]) 106 | alpha = tf.nn.softmax(score_fit) 107 | 108 | layer_sentence = tf.matmul(alpha, output_sentence) 109 | 110 | if concat_sub: 111 | # 是否拼接layer_sub信息 112 | layer_sub = tf.reshape(sub_embedding, [1, n_sub, self.hidden_dim]) 113 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1]) 114 | 115 | layer_total = tf.concat((layer_sentence, layer_sub_tile), -1) 116 | outputs = tf.reshape(layer_total, [-1, 2*self.hidden_dim]) 117 | else: 118 | outputs = tf.reshape(layer_sentence, [-1, self.hidden_dim]) 119 | 120 | self.logits = tf.layers.dense(layer_sentence, 
4, activation=None) 121 | y_ = tf.nn.softmax(self.logits) 122 | self.prob = tf.reshape(y_, [-1, 10, 4]) 123 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 124 | 125 | if not self.config.balance: 126 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 127 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 128 | else: 129 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 130 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 131 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 132 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 133 | class0_weight = 1 # 第0类的权重系数 134 | class1_weight = 3 # 第1类的权重系数 135 | class2_weight = 3 # 第2类的权重系数 136 | class3_weight = 3 # 第3类的权重系数 137 | # coe = tf.constant([1., 1., 1., 1.]) 138 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 139 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 140 | 141 | y = tf.reshape(self.input_y, [-1, 4]) 142 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 143 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 144 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 145 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 146 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 147 | 148 | return self 149 | 150 | -------------------------------------------------------------------------------- /src/model/dpcnn_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import * 2 | from keras.layers import * 3 | from model.model_basic import BasicDeepModel 4 | from keras import regularizers 5 | 6 | 7 | dp = 4 8 | filter_nr = 64 9 | filter_size = 3 10 | max_pool_size = 3 11 | max_pool_strides = 2 12 | dense_nr = 128 13 | spatial_dropout = 0.5 14 | dense_dropout = 0.5 15 | 16 | 17 | class DpcnnModel(BasicDeepModel): 18 | def __init__(self, name='basicModel', num_flods=5, config=None): 19 | name = 'dpcnn' + config.main_feature 20 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 21 | 22 | def create_model(self): 23 | if self.main_feature == 'char': 24 | input = Input(shape=(self.max_len,), name='char') 25 | else: 26 | input = Input(shape=(self.max_len,), name='word') 27 | 28 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 29 | x = Masking(mask_value=self.mask_value)(input) 30 | x = embedding(x) 31 | x = SpatialDropout1D(0.5)(x) 32 | 33 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x) 34 | block1 = BatchNormalization()(block1) 35 | block1 = PReLU()(block1) 36 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1) 37 | block1 = BatchNormalization()(block1) 38 | block1 = PReLU()(block1) 39 | 40 | # we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output 41 | # if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output 42 | resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(x) 43 | resize_emb = PReLU()(resize_emb) 44 | 45 | block1_output = add([block1, resize_emb]) 46 | x = MaxPooling1D(pool_size=max_pool_size, 
strides=max_pool_strides)(block1_output) 47 | 48 | for i in range(dp): 49 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x) 50 | block1 = BatchNormalization()(block1) 51 | block1 = PReLU()(block1) 52 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1) 53 | block1 = BatchNormalization()(block1) 54 | block1 = PReLU()(block1) 55 | 56 | block_output = add([block1, x]) 57 | if i + 1 != dp: 58 | x = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block_output) 59 | 60 | x = GlobalMaxPooling1D()(block_output) 61 | output = Dense(dense_nr, activation='linear')(x) 62 | output = BatchNormalization()(output) 63 | x = PReLU()(output) 64 | 65 | # output = Dropout(dense_dropout)(output) 66 | if self.config.data_type == 3: 67 | dense = Dense(self.n_class, activation="sigmoid")(x) 68 | else: 69 | dense = Dense(self.n_class, activation="softmax")(x) 70 | res_model = Model(inputs=[input], outputs=dense) 71 | 72 | return res_model 73 | -------------------------------------------------------------------------------- /src/model/han_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import * 2 | from keras.layers import * 3 | from model.model_basic import BasicDeepModel 4 | from model.model_component import AttLayer 5 | from model.model_component import AttentionWithContext 6 | 7 | 8 | class HANModel(BasicDeepModel): 9 | def __init__(self, name='basicModel', num_flods=5, config=None): 10 | name = 'han' + config.main_feature 11 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 12 | 13 | def create_model(self): 14 | 15 | if self.config.main_feature == 'word': 16 | input = Input(shape=(self.config.HANN_WORD_LEN,), dtype='int32') 17 | else: 18 | input = Input(shape=(self.config.HANN_CHAR_LEN,), dtype='int32') 19 | 20 | mask = Masking(mask_value=self.mask_value)(input) 21 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 22 | x = embedding(mask) 23 | x = SpatialDropout1D(0.5)(x) 24 | x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x) 25 | l_att = AttLayer(100)(x) 26 | # l_att = AttentionWithContext()(x) 27 | sentEncoder = Model(input, l_att) 28 | 29 | if self.config.main_feature == 'word': 30 | word_input = Input(shape=(self.config.HANN_SENT, self.config.HANN_WORD_LEN), name='hann_word') 31 | word_encoder = TimeDistributed(sentEncoder)(word_input) 32 | word_sent_lstm = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(word_encoder) 33 | # x = AttLayer(100)(word_sent_lstm) 34 | x = AttentionWithContext()(word_sent_lstm) 35 | x = Dropout(0.2)(x) 36 | if self.config.data_type == 3: 37 | dense = Dense(self.n_class, activation="sigmoid")(x) 38 | else: 39 | dense = Dense(self.n_class, activation="softmax")(x) 40 | model = Model(word_input, dense) 41 | else: 42 | char_input = Input(shape=(self.config.HANN_SENT, self.config.HANN_CHAR_LEN), name='hann_char') 43 | char_encoder = TimeDistributed(sentEncoder)(char_input) 44 | char_sent_lstm = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(char_encoder) 45 | x = AttLayer(100)(char_sent_lstm) 46 | # x = AttentionWithContext()(char_sent_lstm) 47 | x = Dropout(0.2)(x) 48 | if self.config.data_type == 3: 49 | dense = Dense(self.n_class, activation="sigmoid")(x) 50 | else: 51 | dense = Dense(self.n_class, 
activation="softmax")(x) 52 | model = Model(char_input, dense) 53 | return model 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/model/hybrid_nn_1.py: -------------------------------------------------------------------------------- 1 | from keras.models import * 2 | from keras.layers import * 3 | from keras import backend as K 4 | from model.model_basic import BasicDeepModel 5 | from model.model_component import AttLayer 6 | from model.model_component import Capsule 7 | 8 | 9 | class HybridNN1Model(BasicDeepModel): 10 | def __init__(self, name='basicModel', num_flods=5, config=None): 11 | name = 'hybridnn1' + config.main_feature 12 | BasicDeepModel.__init__(self, name=name, n_folds=num_flods, config=config) 13 | 14 | def create_model(self): 15 | if self.main_feature == 'char': 16 | input = Input(shape=(self.max_len,), name='char') 17 | else: 18 | input = Input(shape=(self.max_len,), name='word') 19 | 20 | embedding = Embedding(self.max_features, self.embed_size, weights=[self.embedding], trainable=True, name='embedding') 21 | x = Masking(mask_value=self.mask_value)(input) 22 | x = embedding(x) 23 | 24 | x = SpatialDropout1D(0.5)(x) 25 | x = GRU(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(x) # ?? 26 | capsule1 = Capsule(19, 17, 5)(x) 27 | capsule1 = Flatten()(capsule1) 28 | capsule2 = Capsule(19, 16, 5)(x) 29 | capsule2 = Flatten()(capsule2) 30 | output = concatenate([capsule1, capsule2]) 31 | 32 | output = Dense(256)(output) 33 | output = BatchNormalization()(output) 34 | output = Activation('relu')(output) 35 | output = Dropout(0.2)(output) 36 | 37 | output = Dense(256)(output) 38 | output = BatchNormalization()(output) 39 | output = Activation('relu')(output) 40 | x = Dropout(0.2)(output) 41 | 42 | if self.config.data_type == 3: 43 | dense = Dense(self.n_class, activation="sigmoid")(x) 44 | else: 45 | dense = Dense(self.n_class, activation="softmax")(x) 46 | model = Model(inputs=[input], output=dense) 47 | 48 | return model 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/model/lightgbm_model.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgbm 2 | from model.model_basic import BasicStaticModel 3 | 4 | class LightGbmModel(BasicStaticModel): 5 | def __init__(self, num_folds=5, config=None): 6 | lgbm_params = {'objective': 'multiclass', 7 | 'bagging_seed': 10, 8 | 'boosting_type': 'gbdt', 9 | 'feature_fraction': 0.9, 10 | 'feature_fraction_seed': 10, 11 | 'lambda_l1': 0.5, 12 | 'lambda_l2': 0.5, 13 | 'learning_rate': 0.01, 14 | 'metric': 'multi_logloss', 15 | 'min_child_weight': 1, 16 | # 'min_split_gain': 0, 17 | 'device': 'gpu', 18 | 'gpu_platform_id': 0, 19 | 'gpu_device_id': config.gpu, 20 | 'min_sum_hessian_in_leaf': 0.1, 21 | 'num_leaves': 64, 22 | 'num_thread': -1, 23 | 'num_class': config.n_class, 24 | 'verbose': 1} 25 | self.config = config 26 | BasicStaticModel.__init__(self, lgbm_params, num_folds, 'lightGBM', n_class=config.n_class) 27 | 28 | def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test): 29 | 30 | dtrain = lgbm.Dataset(kfold_X_train, label=y_train) 31 | dwatch = lgbm.Dataset(kfold_X_valid, label=y_test) 32 | 33 | best = lgbm.train(self.params, dtrain, num_boost_round=300, verbose_eval=10, valid_sets=dwatch, 34 | early_stopping_rounds=10) 35 | # 对验证集predict 36 | 37 | pred = best.predict(kfold_X_valid) 38 | results = best.predict(test) 39 | 40 | return pred, results, 
best 41 | 42 | -------------------------------------------------------------------------------- /src/model/lstmconv_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn 6 | 7 | n_sub = 10 8 | n_filters = 100 9 | 10 | 11 | class LstmconvModel(BasicDeepModel): 12 | def __init__(self, name='basicModel', n_folds=5, config=None): 13 | name = 'lstmconv' + config.main_feature 14 | self.hidden_dim = 300 15 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 16 | 17 | def LSTM(self, layers=1): 18 | lstms = [] 19 | for num in range(layers): 20 | lstm = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, forget_bias=1.0) 21 | print(lstm.name) 22 | # lstm = tf.contrib.rnn.GRUCell(self.hidden_dim) 23 | lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.output_keep_prob) 24 | lstms.append(lstm) 25 | 26 | lstms = tf.contrib.rnn.MultiRNNCell(lstms) 27 | return lstms 28 | 29 | def create_model(self, share_dense=True, concat_sub=True): 30 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 31 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 32 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 33 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 34 | 35 | if self.main_feature.lower() in ['word', 'char']: 36 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 37 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 38 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 39 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 40 | 41 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 42 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 43 | if self.main_feature == 'elmo_word': 44 | options_file = self.config.elmo_word_options_file 45 | weight_file = self.config.elmo_word_weight_file 46 | embed_file = self.config.elmo_word_embed_file 47 | elif self.main_feature == 'elmo_char': 48 | options_file = self.config.elmo_char_options_file 49 | weight_file = self.config.elmo_char_weight_file 50 | embed_file = self.config.elmo_char_embed_file 51 | elif self.main_feature == 'elmo_qiuqiu': 52 | options_file = self.config.elmo_qiuqiu_options_file 53 | weight_file = self.config.elmo_qiuqiu_weight_file 54 | embed_file = self.config.elmo_qiuqiu_embed_file 55 | self.bilm = BidirectionalLanguageModel(options_file, 56 | weight_file, 57 | use_character_inputs=False, 58 | embedding_weight_file=embed_file, 59 | max_batch_size=self.batch_size) 60 | bilm_embedding_op = self.bilm(self.input_x) 61 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 62 | self.word_encoding = bilm_embedding['weighted_op'] 63 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 64 | 65 | else: 66 | exit('wrong feature') 67 | 68 | c_outputs = [] 69 | for c in range(n_sub): 70 | with tf.variable_scope('lstm-{}'.format(c)): 71 | # self.forward = self.LSTM() 72 | # self.backward = self.LSTM() 73 | # x, _ = 
tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 74 | # x = tf.concat(x, -1) 75 | #### cudnn lstm #### 76 | self.forward = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 77 | x, _ = self.forward(tf.transpose(self.word_encoding, [1, 0, 2])) 78 | x = tf.transpose(x, [1, 0, 2]) 79 | 80 | with tf.variable_scope('conv-{}'.format(c)): 81 | inputs_expanded = tf.expand_dims(x, -1) 82 | filter_shape = [3, 2*self.hidden_dim, 1, n_filters] 83 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W') 84 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 85 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d') 86 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 87 | max_pooled = tf.nn.max_pool(h, 88 | ksize=[1, self.max_len-3+1, 1, 1], 89 | strides=[1, 1, 1, 1], 90 | padding='VALID', 91 | name='max_pool') 92 | avg_pooled = tf.nn.avg_pool(h, 93 | ksize=[1, self.max_len-3+1, 1, 1], 94 | strides=[1, 1, 1, 1], 95 | padding='VALID', 96 | name='avg_pool') 97 | concat_pooled = tf.reshape(tf.concat((max_pooled, avg_pooled), -1), [-1, 2*n_filters]) 98 | 99 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob) 100 | dense = tf.layers.dense(concat_pooled, 4, activation=None) 101 | c_outputs.append(dense) 102 | 103 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 104 | y_ = tf.nn.softmax(self.logits) 105 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 106 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 107 | 108 | if not self.config.balance: 109 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 110 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 111 | else: 112 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 113 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 114 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 115 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 116 | class0_weight = 1 # 第0类的权重系数 117 | class1_weight = 3 # 第1类的权重系数 118 | class2_weight = 3 # 第2类的权重系数 119 | class3_weight = 3 # 第3类的权重系数 120 | # coe = tf.constant([1., 1., 1., 1.]) 121 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 122 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 123 | 124 | y = tf.reshape(self.input_y, [-1, 4]) 125 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 126 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 127 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 128 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 129 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 130 | 131 | return self 132 | 133 | def create_model_v1(self, share_dense=True, concat_sub=True): 134 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 135 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 136 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 137 | 138 | if self.main_feature.lower() in ['word', 'char']: 139 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 140 | self.word_embedding = 
tf.get_variable(initializer=self.embedding, name='word_embedding') 141 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 142 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 143 | 144 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 145 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 146 | if self.main_feature == 'elmo_word': 147 | options_file = self.config.elmo_word_options_file 148 | weight_file = self.config.elmo_word_weight_file 149 | embed_file = self.config.elmo_word_embed_file 150 | elif self.main_feature == 'elmo_char': 151 | options_file = self.config.elmo_char_options_file 152 | weight_file = self.config.elmo_char_weight_file 153 | embed_file = self.config.elmo_char_embed_file 154 | elif self.main_feature == 'elmo_qiuqiu': 155 | options_file = self.config.elmo_qiuqiu_options_file 156 | weight_file = self.config.elmo_qiuqiu_weight_file 157 | embed_file = self.config.elmo_qiuqiu_embed_file 158 | 159 | self.bilm = BidirectionalLanguageModel(options_file, 160 | weight_file, 161 | use_character_inputs=False, 162 | embedding_weight_file=embed_file, 163 | max_batch_size=self.batch_size) 164 | bilm_embedding_op = self.bilm(self.input_x) 165 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 166 | self.word_encoding = bilm_embedding['weighted_op'] 167 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 168 | 169 | else: 170 | exit('wrong feature') 171 | 172 | self.forward = self.LSTM() 173 | self.backward = self.LSTM() 174 | x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 175 | x = tf.concat(x, -1) 176 | 177 | inputs_expanded = tf.expand_dims(x, -1) 178 | filter_shape = [3, 2*self.hidden_dim, 1, n_filters] 179 | W = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W') 180 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 181 | conv = tf.nn.conv2d(inputs_expanded, W, strides=[1]*4, padding='VALID', name='conv2d') 182 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 183 | output_sentence = tf.reshape(h, [-1, self.max_len-3+1, n_filters]) 184 | 185 | # output_sentence = tf.layers.dense(x, self.hidden_dim, activation=tf.nn.relu) 186 | 187 | x_reshape = tf.reshape(output_sentence, [-1, 1, self.max_len-3+1, n_filters]) 188 | x_tile = tf.tile(x_reshape, [1, n_sub, 1, 1]) # 句子复制n_sub份 189 | 190 | sub_embedding = tf.get_variable(shape=[n_sub, n_filters], name='sub_embedding') 191 | sub_reshape = tf.reshape(sub_embedding, [1, n_sub, 1, n_filters]) 192 | sub_tile = tf.tile(sub_reshape, [self.batch_size, 1, self.max_len-3+1, 1]) 193 | 194 | embed_concat = tf.reshape(tf.concat((x_tile, sub_tile), -1), [-1, 2*n_filters]) 195 | 196 | att_w = tf.get_variable(shape=[2*n_filters, n_filters], name='att_w') 197 | att_b = tf.get_variable(shape=[n_filters], name='att_b') 198 | att_v = tf.get_variable(shape=[n_filters, 1], name='att_v') 199 | 200 | score = tf.matmul(tf.nn.tanh(tf.matmul(embed_concat, att_w) + att_b), att_v) 201 | score_fit = tf.reshape(score, [-1, n_sub, self.max_len-3+1]) 202 | alpha = tf.nn.softmax(score_fit) 203 | 204 | layer_sentence = tf.matmul(alpha, output_sentence) 205 | 206 | if concat_sub: 207 | # 是否拼接layer_sub信息 208 | layer_sub = tf.reshape(sub_embedding, [1, n_sub, n_filters]) 209 | layer_sub_tile = tf.tile(layer_sub, [self.batch_size, 1, 1]) 210 | 211 | layer_total = 
tf.concat((layer_sentence, layer_sub_tile), -1) 212 | outputs = tf.reshape(layer_total, [-1, 2*n_filters]) 213 | else: 214 | outputs = tf.reshape(layer_sentence, [-1, n_filters]) 215 | 216 | self.logits = tf.layers.dense(layer_sentence, 4, activation=None) 217 | y_ = tf.nn.softmax(self.logits) 218 | self.prob = tf.reshape(y_, [-1, 10, 4]) 219 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 220 | 221 | if not self.config.balance: 222 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 223 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 224 | else: 225 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 226 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 227 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 228 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 229 | class0_weight = 0.7 # 第0类的权重系数 230 | class1_weight = 1.3 # 第1类的权重系数 231 | class2_weight = 1 # 第2类的权重系数 232 | class3_weight = 1.3 # 第3类的权重系数 233 | # coe = tf.constant([1., 1., 1., 1.]) 234 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 235 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 236 | 237 | y = tf.reshape(self.input_y, [-1, 4]) 238 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 239 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 240 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 241 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 242 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 243 | 244 | return self 245 | 246 | 247 | -------------------------------------------------------------------------------- /src/model/lstmgru_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | import tensorflow as tf 3 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 4 | from bilm.elmo import weight_layers 5 | from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn 6 | 7 | n_sub = 10 8 | 9 | 10 | class LstmgruModel(BasicDeepModel): 11 | def __init__(self, name='basicModel', n_folds=5, config=None): 12 | name = 'lstmgru' + config.main_feature 13 | self.hidden_dim = 300 14 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 15 | 16 | def create_model(self, share_dense=True, concat_sub=True): 17 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y') 18 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 19 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 20 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 21 | 22 | if self.main_feature.lower() in ['word', 'char']: 23 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 24 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 25 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 26 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 27 | 28 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 29 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 30 | if self.main_feature 
== 'elmo_word': 31 | options_file = self.config.elmo_word_options_file 32 | weight_file = self.config.elmo_word_weight_file 33 | embed_file = self.config.elmo_word_embed_file 34 | elif self.main_feature == 'elmo_char': 35 | options_file = self.config.elmo_char_options_file 36 | weight_file = self.config.elmo_char_weight_file 37 | embed_file = self.config.elmo_char_embed_file 38 | elif self.main_feature == 'elmo_qiuqiu': 39 | options_file = self.config.elmo_qiuqiu_options_file 40 | weight_file = self.config.elmo_qiuqiu_weight_file 41 | embed_file = self.config.elmo_qiuqiu_embed_file 42 | 43 | self.bilm = BidirectionalLanguageModel(options_file, 44 | weight_file, 45 | use_character_inputs=False, 46 | embedding_weight_file=embed_file, 47 | max_batch_size=self.batch_size) 48 | bilm_embedding_op = self.bilm(self.input_x) 49 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 50 | self.word_encoding = bilm_embedding['weighted_op'] 51 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 52 | 53 | else: 54 | exit('wrong feature') 55 | 56 | c_outputs = [] 57 | for c in range(n_sub): 58 | with tf.variable_scope('lstm-{}'.format(c)): 59 | # self.forward = self.LSTM() 60 | # self.backward = self.LSTM() 61 | # x, _ = tf.nn.bidirectional_dynamic_rnn(self.forward,self.backward, self.word_encoding, dtype=tf.float32) 62 | # x = tf.concat(x, -1) 63 | #### cudnn lstm #### 64 | self.forward_lstm = cudnn_rnn.CudnnLSTM(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 65 | self.forward_gru = cudnn_rnn.CudnnGRU(num_layers=1, num_units=self.hidden_dim, direction=cudnn_rnn.CUDNN_RNN_BIDIRECTION, dtype=tf.float32) 66 | x, _ = self.forward_lstm(tf.transpose(self.word_encoding, [1, 0, 2])) 67 | x, _ = self.forward_gru(x) 68 | x = tf.transpose(x, [1, 0, 2]) 69 | 70 | with tf.variable_scope('pooling-{}'.format(c)): 71 | max_pooled = tf.reshape(tf.reduce_max(x, 1), [-1, 2*self.hidden_dim]) 72 | avg_pooled = tf.reshape(tf.reduce_mean(x, 1), [-1, 2*self.hidden_dim]) 73 | concat_pooled = tf.concat((max_pooled, avg_pooled), -1) 74 | 75 | concat_pooled = tf.nn.dropout(concat_pooled, self.dropout_keep_prob) 76 | dense = tf.layers.dense(concat_pooled, 4, activation=None) 77 | c_outputs.append(dense) 78 | 79 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 80 | y_ = tf.nn.softmax(self.logits) 81 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 82 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 83 | 84 | if not self.config.balance: 85 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 86 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 87 | else: 88 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 89 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 90 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 91 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 92 | class0_weight = 1 # 第0类的权重系数 93 | class1_weight = 3 # 第1类的权重系数 94 | class2_weight = 3 # 第2类的权重系数 95 | class3_weight = 3 # 第3类的权重系数 96 | # coe = tf.constant([1., 1., 1., 1.]) 97 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 98 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 99 | 100 | y = tf.reshape(self.input_y, [-1, 4]) 101 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 102 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 103 | 
-class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 104 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 105 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 106 | 107 | return self 108 | 109 | -------------------------------------------------------------------------------- /src/model/ml_models.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicStaticModel 2 | from sklearn import svm 3 | from sklearn.svm import SVC 4 | from sklearn.naive_bayes import MultinomialNB 5 | from sklearn.calibration import CalibratedClassifierCV 6 | from sklearn.metrics import f1_score 7 | from skift import FirstColFtClassifier 8 | 9 | import logging 10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s') 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class SVCClassifier(BasicStaticModel): 15 | 16 | def __init__(self, name='basicModel', n_folds=5, config=None): 17 | BasicStaticModel.__init__(self, name=name, n_folds=n_folds, config=config) 18 | 19 | def create_model(self): 20 | classifier = SVC(kernel="rbf") 21 | classifier = CalibratedClassifierCV(classifier) 22 | classifier = SVC(kernel="linear") 23 | self.classifier = classifier 24 | self.classifier = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6) 25 | return self.classifier 26 | 27 | 28 | class Fasttext(BasicStaticModel): 29 | def __init__(self, name='basicModel', n_folds=5, config=None): 30 | BasicStaticModel.__init__(self, name=name, n_folds=n_folds, config=config) 31 | 32 | def create_model(self): 33 | sk_clf = FirstColFtClassifier(lr=1.0, epoch=10, 34 | wordNgrams=1, 35 | minCount=5, verbose=2) 36 | return sk_clf 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/model/model_component.py: -------------------------------------------------------------------------------- 1 | from keras.layers import * 2 | from keras.models import * 3 | 4 | 5 | class AttLayer(Layer): 6 | def __init__(self, attention_dim): 7 | self.init = initializers.get('normal') 8 | self.supports_masking = True 9 | self.attention_dim = attention_dim 10 | super(AttLayer, self).__init__() 11 | 12 | def build(self, input_shape): 13 | assert len(input_shape) == 3 14 | self.W = K.variable(self.init((input_shape[-1], self.attention_dim))) 15 | self.b = K.variable(self.init((self.attention_dim, ))) 16 | self.u = K.variable(self.init((self.attention_dim, 1))) 17 | self.trainable_weights = [self.W, self.b, self.u] 18 | super(AttLayer, self).build(input_shape) 19 | 20 | def compute_mask(self, inputs, mask=None): 21 | return mask 22 | 23 | def call(self, x, mask=None): 24 | # size of x :[batch_size, sel_len, attention_dim] 25 | # size of u :[batch_size, attention_dim] 26 | # uit = tanh(xW+b) 27 | uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b)) 28 | ait = K.dot(uit, self.u) 29 | ait = K.squeeze(ait, -1) 30 | 31 | ait = K.exp(ait) 32 | 33 | if mask is not None: 34 | # Cast the mask to floatX to avoid float64 upcasting in theano 35 | ait *= K.cast(mask, K.floatx()) 36 | ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 37 | ait = K.expand_dims(ait) 38 | weighted_input = x * ait 39 | output = K.sum(weighted_input, axis=1) 40 | 41 | return output 42 | 43 | def compute_output_shape(self, input_shape): 44 | return (input_shape[0], input_shape[-1]) 45 | 46 
| 47 | class AttentionWeightedAverage(Layer): 48 | """ 49 | Computes a weighted average of the different channels across timesteps. 50 | Uses 1 parameter pr. channel to compute the attention value for a single timestep. 51 | """ 52 | 53 | def __init__(self, return_attention=False, **kwargs): 54 | self.init = initializers.get('uniform') 55 | self.supports_masking = True 56 | self.return_attention = return_attention 57 | super(AttentionWeightedAverage, self).__init__(**kwargs) 58 | 59 | def build(self, input_shape): 60 | self.input_spec = [InputSpec(ndim=3)] 61 | assert len(input_shape) == 3 62 | 63 | self.W = self.add_weight(shape=(input_shape[2], 1), 64 | name='{}_W'.format(self.name), 65 | initializer=self.init) 66 | self.trainable_weights = [self.W] 67 | super(AttentionWeightedAverage, self).build(input_shape) 68 | 69 | def call(self, x, mask=None): 70 | # computes a probability distribution over the timesteps 71 | # uses 'max trick' for numerical stability 72 | # reshape is done to avoid issue with Tensorflow 73 | # and 1-dimensional weights 74 | logits = K.dot(x, self.W) 75 | x_shape = K.shape(x) 76 | logits = K.reshape(logits, (x_shape[0], x_shape[1])) 77 | ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True)) 78 | 79 | # masked timesteps have zero weight 80 | if mask is not None: 81 | mask = K.cast(mask, K.floatx()) 82 | ai = ai * mask 83 | att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon()) 84 | weighted_input = x * K.expand_dims(att_weights) 85 | result = K.sum(weighted_input, axis=1) 86 | if self.return_attention: 87 | return [result, att_weights] 88 | return result 89 | 90 | def get_output_shape_for(self, input_shape): 91 | return self.compute_output_shape(input_shape) 92 | 93 | def compute_output_shape(self, input_shape): 94 | output_len = input_shape[2] 95 | if self.return_attention: 96 | return [(input_shape[0], output_len), (input_shape[0], input_shape[1])] 97 | return (input_shape[0], output_len) 98 | 99 | def compute_mask(self, input, input_mask=None): 100 | if isinstance(input_mask, list): 101 | return [None] * len(input_mask) 102 | else: 103 | return None 104 | 105 | def squash(x, axis=-1): 106 | # s_squared_norm is really small 107 | # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon() 108 | # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm) 109 | # return scale * x 110 | s_squared_norm = K.sum(K.square(x), axis, keepdims=True) 111 | scale = K.sqrt(s_squared_norm + K.epsilon()) 112 | return x / scale 113 | 114 | 115 | # A Capsule Implement with Pure Keras 116 | class Capsule(Layer): 117 | def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, 118 | activation='default', **kwargs): 119 | super(Capsule, self).__init__(**kwargs) 120 | self.num_capsule = num_capsule 121 | self.dim_capsule = dim_capsule 122 | self.routings = routings 123 | self.kernel_size = kernel_size 124 | self.share_weights = share_weights 125 | if activation == 'default': 126 | self.activation = squash 127 | else: 128 | self.activation = Activation(activation) 129 | 130 | def build(self, input_shape): 131 | super(Capsule, self).build(input_shape) 132 | input_dim_capsule = input_shape[-1] 133 | if self.share_weights: 134 | self.W = self.add_weight(name='capsule_kernel', 135 | shape=(1, input_dim_capsule, 136 | self.num_capsule * self.dim_capsule), 137 | # shape=self.kernel_size, 138 | initializer='glorot_uniform', 139 | trainable=True) 140 | else: 141 | input_num_capsule = input_shape[-2] 142 | self.W = 
self.add_weight(name='capsule_kernel', 143 | shape=(input_num_capsule, 144 | input_dim_capsule, 145 | self.num_capsule * self.dim_capsule), 146 | initializer='glorot_uniform', 147 | trainable=True) 148 | 149 | def call(self, u_vecs): 150 | if self.share_weights: 151 | u_hat_vecs = K.conv1d(u_vecs, self.W) 152 | else: 153 | u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1]) 154 | 155 | batch_size = K.shape(u_vecs)[0] 156 | input_num_capsule = K.shape(u_vecs)[1] 157 | u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule, 158 | self.num_capsule, self.dim_capsule)) 159 | u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3)) 160 | # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule] 161 | 162 | b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule] 163 | for i in range(self.routings): 164 | b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule] 165 | c = K.softmax(b) 166 | c = K.permute_dimensions(c, (0, 2, 1)) 167 | b = K.permute_dimensions(b, (0, 2, 1)) 168 | outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2])) 169 | if i < self.routings - 1: 170 | b = K.batch_dot(outputs, u_hat_vecs, [2, 3]) 171 | 172 | return outputs 173 | 174 | def compute_output_shape(self, input_shape): 175 | return (None, self.num_capsule, self.dim_capsule) 176 | 177 | def dot_product(x, kernel): 178 | """ 179 | Wrapper for dot product operation, in order to be compatible with both 180 | Theano and Tensorflow 181 | Args: 182 | x (): input 183 | kernel (): weights 184 | Returns: 185 | """ 186 | if K.backend() == 'tensorflow': 187 | return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1) 188 | else: 189 | return K.dot(x, kernel) 190 | 191 | 192 | class AttentionWithContext(Layer): 193 | """ 194 | Attention operation, with a context/query vector, for temporal data. 195 | Supports Masking. 196 | Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] 197 | "Hierarchical Attention Networks for Document Classification" 198 | by using a context vector to assist the attention 199 | # Input shape 200 | 3D tensor with shape: `(samples, steps, features)`. 201 | # Output shape 202 | 2D tensor with shape: `(samples, features)`. 203 | How to use: 204 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. 205 | The dimensions are inferred based on the output shape of the RNN. 206 | Note: The layer has been tested with Keras 2.0.6 207 | Example: 208 | model.add(LSTM(64, return_sequences=True)) 209 | model.add(AttentionWithContext()) 210 | # next add a Dense layer (for classification/regression) or whatever... 
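        For example, a hypothetical 4-way classification head (illustrative only, not a
        layer defined in this repo; pick the output size for your own task):
        model.add(Dense(4, activation='softmax'))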
211 | """ 212 | 213 | def __init__(self, 214 | W_regularizer=None, u_regularizer=None, b_regularizer=None, 215 | W_constraint=None, u_constraint=None, b_constraint=None, 216 | bias=True, **kwargs): 217 | 218 | self.supports_masking = True 219 | self.init = initializers.get('glorot_uniform') 220 | 221 | self.W_regularizer = regularizers.get(W_regularizer) 222 | self.u_regularizer = regularizers.get(u_regularizer) 223 | self.b_regularizer = regularizers.get(b_regularizer) 224 | 225 | self.W_constraint = constraints.get(W_constraint) 226 | self.u_constraint = constraints.get(u_constraint) 227 | self.b_constraint = constraints.get(b_constraint) 228 | 229 | self.bias = bias 230 | super(AttentionWithContext, self).__init__(**kwargs) 231 | 232 | def build(self, input_shape): 233 | assert len(input_shape) == 3 234 | 235 | self.W = self.add_weight((input_shape[-1], input_shape[-1],), 236 | initializer=self.init, 237 | name='{}_W'.format(self.name), 238 | regularizer=self.W_regularizer, 239 | constraint=self.W_constraint) 240 | if self.bias: 241 | self.b = self.add_weight((input_shape[-1],), 242 | initializer='zero', 243 | name='{}_b'.format(self.name), 244 | regularizer=self.b_regularizer, 245 | constraint=self.b_constraint) 246 | 247 | self.u = self.add_weight((input_shape[-1],), 248 | initializer=self.init, 249 | name='{}_u'.format(self.name), 250 | regularizer=self.u_regularizer, 251 | constraint=self.u_constraint) 252 | 253 | super(AttentionWithContext, self).build(input_shape) 254 | 255 | def compute_mask(self, input, input_mask=None): 256 | # do not pass the mask to the next layers 257 | return None 258 | 259 | def call(self, x, mask=None): 260 | uit = dot_product(x, self.W) 261 | 262 | if self.bias: 263 | uit += self.b 264 | 265 | uit = K.tanh(uit) 266 | ait = K.dot(uit, self.u) 267 | 268 | a = K.exp(ait) 269 | 270 | # apply mask after the exp. will be re-normalized next 271 | if mask is not None: 272 | # Cast the mask to floatX to avoid float64 upcasting in theano 273 | a *= K.cast(mask, K.floatx()) 274 | 275 | # in some cases especially in the early stages of training the sum may be almost zero 276 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. 
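        # Added note (not in the original code): K.epsilon() is a tiny constant (1e-7 by
        # default), so for a fully-masked row where a == [0, 0, 0] the denominator becomes
        # roughly 1e-7 and the attention weights stay an all-zero vector, whereas the
        # commented-out line below would divide 0 by 0 and propagate NaNs into the weighted sum.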
277 | # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) 278 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 279 | 280 | a = K.expand_dims(a) 281 | weighted_input = x * a 282 | return K.sum(weighted_input, axis=1) 283 | 284 | def compute_output_shape(self, input_shape): 285 | return input_shape[0], input_shape[-1] 286 | -------------------------------------------------------------------------------- /src/model/my_callbacks.py: -------------------------------------------------------------------------------- 1 | import keras as keras 2 | from keras import backend as K 3 | import numpy as np 4 | import warnings 5 | import glob 6 | import os 7 | from keras.models import load_model 8 | import pickle 9 | 10 | 11 | class JZTrainCategory(keras.callbacks.Callback): 12 | def __init__(self, filepath, nb_epochs=20, nb_snapshots=1, monitor='val_loss', factor=0.1, verbose=1, patience=1, 13 | save_weights_only=False, 14 | decay_factor_value=1.0, 15 | mode='auto', period=1): 16 | super(JZTrainCategory, self).__init__() 17 | self.nb_epochs = nb_epochs 18 | self.monitor = monitor 19 | self.verbose = verbose 20 | self.filepath = filepath 21 | self.init_factor = factor 22 | self.decay_factor_value = decay_factor_value 23 | self.factor = factor 24 | self.save_weights_only = save_weights_only 25 | self.patience = patience 26 | self.r_patience = 0 27 | self.check = nb_epochs // nb_snapshots 28 | self.monitor_val_list = [] 29 | if mode not in ['auto', 'min', 'max']: 30 | warnings.warn('ModelCheckpoint mode %s is unknown, ' 31 | 'fallback to auto mode.' % (mode), 32 | RuntimeWarning) 33 | mode = 'auto' 34 | if mode == 'min': 35 | self.monitor_op = np.less 36 | self.init_best = np.Inf 37 | elif mode == 'max': 38 | self.monitor_op = np.greater 39 | self.init_best = -np.Inf 40 | else: 41 | if 'acc' in self.monitor or self.monitor.startswith('fmeasure'): 42 | self.monitor_op = np.greater 43 | self.init_best = -np.Inf 44 | else: 45 | self.monitor_op = np.less 46 | self.init_best = np.Inf 47 | 48 | @staticmethod 49 | def compile_official_f1_score(y_true, y_pred): 50 | y_true = K.reshape(y_true, (-1, 10)) 51 | y_true = K.cast(y_true, 'float32') 52 | y_pred = K.round(y_pred) 53 | 54 | tp = K.sum(y_pred * y_true) 55 | fp = K.sum(K.cast(K.greater(y_pred - y_true, 0.), 'float32')) 56 | fn = K.sum(K.cast(K.greater(y_true - y_pred, 0.), 'float32')) 57 | p = tp / (tp + fp) 58 | r = tp / (tp + fn) 59 | f = 2*p*r/(p+r) 60 | return f 61 | 62 | def on_batch_begin(self, batch, logs={}): 63 | return 64 | 65 | def on_batch_end(self, batch, logs={}): 66 | return 67 | 68 | def on_train_end(self, logs={}): 69 | return 70 | 71 | def on_train_begin(self, logs={}): 72 | self.init_lr = K.get_value(self.model.optimizer.lr) 73 | self.best = self.init_best 74 | return 75 | 76 | def on_epoch_begin(self, epoch, logs=None): 77 | return 78 | 79 | def on_epoch_end(self, epoch, logs=None): 80 | logs = logs or {} 81 | logs['lr'] = K.get_value(self.model.optimizer.lr) 82 | 83 | n_recurrent = epoch // self.check 84 | self.save_path = '{}/{}.h5'.format(self.filepath, n_recurrent) 85 | os.makedirs(self.filepath, exist_ok=True) 86 | current = logs.get(self.monitor) 87 | if current is None: 88 | warnings.warn('Can save best model only with %s available, ' 89 | 'skipping.' 
% (self.monitor), RuntimeWarning) 90 | 91 | else: 92 | if self.monitor_op(current, self.best): 93 | # if better result: save model 94 | self.r_patience = 0 95 | if self.verbose > 0: 96 | print('\nEpoch %05d: %s improved from %0.5f to %0.5f,' 97 | ' saving model to %s' 98 | % (epoch + 1, self.monitor, self.best, 99 | current, self.save_path)) 100 | self.best = current 101 | if self.save_weights_only: 102 | self.model.save_weights(self.save_path) 103 | # pickle.dump(self.model.get_weights(), open('./debug_weight.pkl', 'wb')) 104 | symbolic_weights = getattr(self.model.optimizer, 'weights') 105 | weight_values = K.batch_get_value(symbolic_weights) 106 | with open('{}/optimizer.pkl'.format(self.filepath), 'wb') as f: 107 | pickle.dump(weight_values, f) 108 | else: 109 | self.model.save(self.save_path) 110 | 111 | else: 112 | # if worse resule: reload last best model saved 113 | self.r_patience += 1 114 | if self.verbose > 0: 115 | if self.r_patience == self.patience: 116 | print('\nEpoch %05d: %s did not improve from %0.5f' % 117 | (epoch + 1, self.monitor, self.best)) 118 | if self.save_weights_only: 119 | self.model.load_weights(self.save_path) 120 | self.model._make_train_function() 121 | with open('{}/optimizer.pkl'.format(self.filepath), 'rb') as f: 122 | weight_values = pickle.load(f) 123 | self.model.optimizer.set_weights(weight_values) 124 | else: 125 | self.model = load_model(self.save_path, custom_objects={'compile_official_f1_score': JZTrainCategory.compile_official_f1_score}) 126 | # set new learning rate 127 | old_lr = K.get_value(self.model.optimizer.lr) 128 | new_lr = old_lr * self.factor 129 | self.factor *= self.decay_factor_value # 衰减系数衰减 130 | K.set_value(self.model.optimizer.lr, new_lr) 131 | print('\nReload model and decay learningrate from {} to {}\n'.format(old_lr, new_lr)) 132 | self.r_patience = 0 133 | 134 | if (epoch+1) % self.check == 0: 135 | self.monitor_val_list.append(self.best) 136 | self.best = self.init_best 137 | self.factor = self.init_factor 138 | 139 | if (epoch+1) != self.nb_epochs: 140 | K.set_value(self.model.optimizer.lr, self.init_lr) 141 | print('At epoch-{} reset learning rate to mountain-top init lr {}'.format(epoch+1, self.init_lr)) 142 | 143 | -------------------------------------------------------------------------------- /src/model/rcnn_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 3 | from bilm.elmo import weight_layers 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from tensorflow.contrib import rnn 8 | import tensorflow.contrib.layers as layers 9 | 10 | filter_sizes = [1, 2, 3, 4] 11 | n_filter = 128 12 | hidden_size = 300 13 | n_sub = 10 14 | n_sent = 4 15 | 16 | 17 | class RCNNModel(BasicDeepModel): 18 | def __init__(self, name='basicModel', n_folds=10, config=None): 19 | name = 'RCNN' + config.main_feature 20 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 21 | 22 | def create_model(self, share_dense=True): 23 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y') 24 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 25 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 26 | 27 | if self.main_feature.lower() in ['word', 'char']: 28 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.max_len], name='input_x') 29 | 
self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 30 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 31 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 32 | 33 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 34 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 35 | if self.main_feature == 'elmo_word': 36 | options_file = self.config.elmo_word_options_file 37 | weight_file = self.config.elmo_word_weight_file 38 | embed_file = self.config.elmo_word_embed_file 39 | elif self.main_feature == 'elmo_char': 40 | options_file = self.config.elmo_char_options_file 41 | weight_file = self.config.elmo_char_weight_file 42 | embed_file = self.config.elmo_char_embed_file 43 | elif self.main_feature == 'elmo_qiuqiu': 44 | options_file = self.config.elmo_qiuqiu_options_file 45 | weight_file = self.config.elmo_qiuqiu_weight_file 46 | embed_file = self.config.elmo_qiuqiu_embed_file 47 | 48 | self.bilm = BidirectionalLanguageModel(options_file, 49 | weight_file, 50 | use_character_inputs=False, 51 | embedding_weight_file=embed_file, 52 | max_batch_size=self.batch_size) 53 | bilm_embedding_op = self.bilm(self.input_x) 54 | bilm_embedding = weight_layers('output', bilm_embedding_op, l2_coef=0.0) 55 | self.word_encoding = bilm_embedding['weighted_op'] 56 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new 57 | 58 | else: 59 | exit('wrong feature') 60 | 61 | rcnn_outputs = [] 62 | for i in range(n_sub): 63 | with tf.variable_scope('rcnn_output_%d' % i): 64 | output_bigru = self.bi_gru(self.word_encoding, hidden_size) 65 | output = self.textcnn(output_bigru, self.max_len) 66 | rcnn_outputs.append(output) 67 | 68 | n_filter_total = n_filter * len(filter_sizes) 69 | outputs = tf.reshape(tf.concat(rcnn_outputs, 1), (-1, n_sub, n_filter_total)) 70 | 71 | if share_dense: 72 | cnn_outputs = tf.reshape(outputs, (-1, n_filter_total)) 73 | W = tf.get_variable('W', shape=[n_filter_total, self.n_classes]) 74 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 75 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 76 | else: 77 | cnn_outputs = tf.reshape(tf.concat(outputs, 1), (-1, n_sub, n_filter_total)) 78 | W = tf.get_variable('W', shape=[self.batch_size, n_filter_total, self.n_classes]) 79 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 80 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 81 | 82 | y_ = tf.nn.softmax(self.logits) 83 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 84 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 85 | 86 | if not self.config.balance: 87 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 88 | else: 89 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 90 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 91 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 92 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 93 | class0_weight = 1 # 第0类的权重系数 94 | class1_weight = 3 # 第1类的权重系数 95 | class2_weight = 3 # 第2类的权重系数 96 | class3_weight = 3 # 第3类的权重系数 97 | # coe = tf.constant([1., 1., 1., 1.]) 98 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 99 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 100 | 101 | y = tf.reshape(self.input_y, [-1, 4]) 102 | self.loss = tf.reduce_mean(-class0_weight * 
(y[:, 0]*tf.log(y_[:, 0])) 103 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 104 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 105 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 106 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 107 | 108 | return self 109 | 110 | def textcnn(self, cnn_inputs, n_step): 111 | # cnn_inputs = [batch_size, n_step, -1] 112 | inputs = tf.expand_dims(cnn_inputs, -1) 113 | pooled_outputs = [] 114 | for i, filter_size in enumerate(filter_sizes): 115 | with tf.variable_scope('conv-maxpool-%s' % filter_size): 116 | filter_shape = [filter_size, hidden_size*2+self.embed_size, 1, n_filter] 117 | W_filter = tf.get_variable(initializer=tf.truncated_normal(filter_shape, stddev=0.1), name='W_filter') 118 | beta = tf.get_variable(initializer=tf.constant(0.1, shape=[n_filter]), name='beta') 119 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1]*4, padding='VALID', name='conv') 120 | h = tf.nn.relu(tf.nn.bias_add(conv, beta), name='relu') 121 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 122 | strides=[1]*4, padding='VALID', name='pool') 123 | pooled_outputs.append(pooled) 124 | h_pool = tf.concat(pooled_outputs, 3) 125 | h_pool_flat = tf.reshape(h_pool, [-1, n_filter * len(filter_sizes)]) 126 | h_drop = tf.nn.dropout(h_pool_flat, self.dropout_keep_prob) 127 | return h_drop 128 | 129 | def gru_cell(self, hidden_size): 130 | cell = rnn.GRUCell(hidden_size, reuse=tf.get_variable_scope().reuse) 131 | return rnn.DropoutWrapper(cell, output_keep_prob=self.output_keep_prob) 132 | 133 | def bi_gru(self, inputs, hidden_size, res_add=True): 134 | """build the bi-GRU network. Return the encoder represented vector. 135 | X_inputs: [batch_size, n_step] 136 | n_step: 句子的词数量;或者文档的句子数。 137 | outputs: [batch_size, n_step, hidden_size*2+embedding_size(if res_add)] 138 | """ 139 | cells_fw = [self.gru_cell(hidden_size) for _ in range(1)] 140 | cells_bw = [self.gru_cell(hidden_size) for _ in range(1)] 141 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 142 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 143 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 144 | initial_states_fw=initial_states_fw, 145 | initial_states_bw=initial_states_bw, 146 | dtype=tf.float32) 147 | if res_add: 148 | outputs = tf.concat([outputs, inputs], axis=2) 149 | return outputs 150 | 151 | # def batchnorm(self, Ylogits, offset, convolutional=False): 152 | # exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, ) 153 | 154 | -------------------------------------------------------------------------------- /src/model/snapshot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | import keras.callbacks as callbacks 5 | from keras.callbacks import Callback 6 | 7 | class SnapshotModelCheckpoint(Callback): 8 | """Callback that saves the snapshot weights of the model. 9 | Saves the model weights on certain epochs (which can be considered the 10 | snapshot of the model at that epoch). 11 | Should be used with the cosine annealing learning rate schedule to save 12 | the weight just before learning rate is sharply increased. 13 | # Arguments: 14 | nb_epochs: total number of epochs that the model will be trained for. 
15 | nb_snapshots: number of times the weights of the model will be saved. 16 | fn_prefix: prefix for the filename of the weights. 17 | """ 18 | 19 | def __init__(self, nb_epochs, nb_snapshots, fn_prefix='Model'): 20 | super(SnapshotModelCheckpoint, self).__init__() 21 | 22 | self.check = nb_epochs // nb_snapshots 23 | self.fn_prefix = fn_prefix 24 | 25 | def on_epoch_end(self, epoch, logs={}): 26 | if epoch != 0 and (epoch + 1) % self.check == 0: 27 | filepath = self.fn_prefix + "-%d.h5" % ((epoch + 1) // self.check) 28 | self.model.save_weights(filepath, overwrite=True) 29 | # if epoch == 1: 30 | # self.model.get_layer('embedding').trainable = True 31 | # self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 32 | # print('now we begin to train our embeding layers') 33 | # self.model.summary() 34 | 35 | 36 | class SnapshotCallbackBuilder: 37 | """Callback builder for snapshot ensemble training of a model. 38 | Creates a list of callbacks, which are provided when training a model 39 | so as to save the model weights at certain epochs, and then sharply 40 | increase the learning rate. 41 | """ 42 | 43 | def __init__(self, nb_epochs, nb_snapshots, init_lr=0.1): 44 | """ 45 | Initialize a snapshot callback builder. 46 | # Arguments: 47 | nb_epochs: total number of epochs that the model will be trained for. 48 | nb_snapshots: number of times the weights of the model will be saved. 49 | init_lr: initial learning rate 50 | """ 51 | self.T = nb_epochs 52 | self.M = nb_snapshots 53 | self.alpha_zero = init_lr 54 | 55 | def get_callbacks(self, model_save_place='./', model_prefix='Model'): 56 | """ 57 | Creates a list of callbacks that can be used during training to create a 58 | snapshot ensemble of the model. 59 | Args: 60 | model_prefix: prefix for the filename of the weights. 61 | Returns: list of 3 callbacks [ModelCheckpoint, LearningRateScheduler, 62 | SnapshotModelCheckpoint] which can be provided to the 'fit' function 63 | """ 64 | if not os.path.exists(model_save_place): 65 | os.makedirs(model_save_place) 66 | 67 | callback_list = [ 68 | callbacks.LearningRateScheduler(schedule=self._cosine_anneal_schedule), 69 | SnapshotModelCheckpoint(self.T, self.M, fn_prefix='%s/%s' % (model_save_place, model_prefix))] 70 | 71 | return callback_list 72 | 73 | def _cosine_anneal_schedule(self, t): 74 | cos_inner = np.pi * (t % (self.T // self.M)) # t - 1 is used when t has 1-based indexing. 
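# Cosine-annealing schedule used for snapshot ensembling:
#   lr(t) = (alpha_zero / 2) * (cos(pi * (t mod (T // M)) / (T // M)) + 1)
# Each cycle lasts T // M epochs: lr starts at alpha_zero, decays towards 0 at
# the end of the cycle (where SnapshotModelCheckpoint saves a snapshot), then
# jumps back up. With the values used in stacking.py (T=30, M=3,
# alpha_zero=1e-3) a cycle is 10 epochs and lr(0)=1e-3, lr(5)=5e-4,
# lr(9)~=2.4e-5.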
75 | cos_inner /= self.T // self.M 76 | cos_out = np.cos(cos_inner) + 1 77 | alpha = float(self.alpha_zero / 2 * cos_out) 78 | print('lr: {}'.format(alpha)) 79 | return alpha 80 | -------------------------------------------------------------------------------- /src/model/textcnn_model.py: -------------------------------------------------------------------------------- 1 | from model.model_basic import BasicDeepModel 2 | from bilm.model import BidirectionalLanguageModel,dump_token_embeddings 3 | from bilm.elmo import weight_layers 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | filter_sizes = [1, 2, 3, 4] 9 | n_filters = 128 10 | n_sub = 10 11 | n_sent = 4 12 | 13 | 14 | class TextCNNModel(BasicDeepModel): 15 | 16 | def __init__(self, name='basicModel', n_folds=5, config=None): 17 | name = 'textCNN' + config.main_feature 18 | BasicDeepModel.__init__(self, name=name, n_folds=n_folds, config=config) 19 | 20 | def create_model(self, share_dense=True): 21 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y') 22 | self.input_y2 = tf.placeholder(dtype=tf.float32, shape=[None,n_sub,4], name='input_y2') 23 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 24 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 25 | 26 | if self.main_feature.lower() in ['word', 'char']: 27 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 28 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 29 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 30 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 31 | 32 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 33 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 34 | if self.main_feature == 'elmo_word': 35 | options_file = self.config.elmo_word_options_file 36 | weight_file = self.config.elmo_word_weight_file 37 | embed_file = self.config.elmo_word_embed_file 38 | elif self.main_feature == 'elmo_char': 39 | options_file = self.config.elmo_char_options_file 40 | weight_file = self.config.elmo_char_weight_file 41 | embed_file = self.config.elmo_char_embed_file 42 | elif self.main_feature == 'elmo_qiuqiu': 43 | options_file = self.config.elmo_qiuqiu_options_file 44 | weight_file = self.config.elmo_qiuqiu_weight_file 45 | embed_file = self.config.elmo_qiuqiu_embed_file 46 | 47 | self.bilm = BidirectionalLanguageModel(options_file, 48 | weight_file, 49 | use_character_inputs=False, 50 | embedding_weight_file=embed_file, 51 | max_batch_size=self.batch_size) 52 | bilm_embedding_op = self.bilm(self.input_x) 53 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 54 | self.word_encoding = bilm_embedding['weighted_op'] 55 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new 56 | 57 | else: 58 | exit('wrong feature') 59 | 60 | all_input_expanded = tf.expand_dims(self.word_encoding, -1) 61 | 62 | c_outputs = [] 63 | for c in range(n_sub): 64 | pooled_outputs = [] 65 | for i, filter_size in enumerate(filter_sizes): 66 | with tf.variable_scope('conv-maxpool-{}-{}'.format(c, filter_size)): 67 | # 卷积层 68 | filter_shape = [filter_size, self.embed_size, 1, n_filters] 69 | W = tf.get_variable('W', initializer=tf.truncated_normal(filter_shape, stddev=0.1)) 70 | b = tf.get_variable('b', 
initializer=tf.constant(0.1, shape=[n_filters])) 71 | conv = tf.nn.conv2d(all_input_expanded, W, strides=[1]*4, padding='VALID', name='conv') 72 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 73 | pooled = tf.nn.max_pool(h, 74 | ksize=[1, self.max_len - filter_size + 1, 1, 1], 75 | strides=[1, 1, 1, 1], 76 | padding='VALID', 77 | name='pool') 78 | pooled_outputs.append(pooled) 79 | num_filters_total = n_filters * len(filter_sizes) 80 | h_pool = tf.concat(pooled_outputs, 3) 81 | h_pool_flatten = tf.reshape(h_pool, [-1, 1, num_filters_total]) 82 | h_drop = tf.nn.dropout(h_pool_flatten, self.dropout_keep_prob) 83 | dense = tf.layers.dense(h_drop, 4, activation=None) 84 | c_outputs.append(dense) 85 | 86 | self.logits = tf.reshape(tf.concat(c_outputs, axis=1), [-1, 10, 4]) 87 | y_ = tf.nn.softmax(self.logits) 88 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 89 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 90 | 91 | if not self.config.balance: 92 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 93 | # self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 94 | else: 95 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 96 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 97 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 98 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 99 | class0_weight = 1 # 第0类的权重系数 100 | class1_weight = 3 # 第1类的权重系数 101 | class2_weight = 3 # 第2类的权重系数 102 | class3_weight = 3 # 第3类的权重系数 103 | # coe = tf.constant([1., 1., 1., 1.]) 104 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 105 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 106 | 107 | y = tf.reshape(self.input_y, [-1, 4]) 108 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 109 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 110 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 111 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 112 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 113 | 114 | return self 115 | 116 | def create_model_v1(self, share_dense=True): 117 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, n_sub, n_sent], name='input_y') 118 | self.dropout_keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob') 119 | self.output_keep_prob = tf.placeholder(dtype=tf.float32, name='output_keep_prob') 120 | 121 | if self.main_feature.lower() in ['word', 'char']: 122 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len], name='input_x') 123 | self.word_embedding = tf.get_variable(initializer=self.embedding, name='word_embedding') 124 | self.word_encoding = tf.nn.embedding_lookup(self.embedding, self.input_x) 125 | self.word_encoding = tf.nn.dropout(self.word_encoding, self.dropout_keep_prob) # new 126 | 127 | elif self.main_feature.lower() in ['elmo_word', 'elmo_char', 'elmo_qiuqiu']: 128 | self.input_x = tf.placeholder(dtype=tf.int32, shape=[None,self.max_len+2], name='input_x') 129 | if self.main_feature == 'elmo_word': 130 | options_file = self.config.elmo_word_options_file 131 | weight_file = self.config.elmo_word_weight_file 132 | embed_file = self.config.elmo_word_embed_file 133 | elif self.main_feature == 'elmo_char': 134 | options_file = self.config.elmo_char_options_file 135 | weight_file = 
self.config.elmo_char_weight_file 136 | embed_file = self.config.elmo_char_embed_file 137 | elif self.main_feature == 'elmo_qiuqiu': 138 | options_file = self.config.elmo_qiuqiu_options_file 139 | weight_file = self.config.elmo_qiuqiu_weight_file 140 | embed_file = self.config.elmo_qiuqiu_embed_file 141 | 142 | self.bilm = BidirectionalLanguageModel(options_file, 143 | weight_file, 144 | use_character_inputs=False, 145 | embedding_weight_file=embed_file, 146 | max_batch_size=self.batch_size) 147 | bilm_embedding_op = self.bilm(self.input_x) 148 | bilm_embedding = weight_layers('output', bilm_embedding_op,l2_coef=0.0) 149 | self.word_encoding = bilm_embedding['weighted_op'] 150 | self.word_encoding = tf.nn.dropout(self.word_encoding,self.dropout_keep_prob) # new 151 | 152 | else: 153 | exit('wrong feature') 154 | 155 | all_input_expanded = tf.expand_dims(self.word_encoding, -1) 156 | # all_input_expanded = tf.tile(all_input_expanded, [1,1,1,10]) 157 | 158 | c_outputs = [] 159 | for c in range(n_sub): 160 | pooled_outputs = [] 161 | for i, filter_size in enumerate(filter_sizes): 162 | with tf.variable_scope('conv-maxpool-{}-{}'.format(c, filter_size)): 163 | # 卷积层 164 | filter_shape = [filter_size, self.embed_size, 1, n_filters] 165 | W = tf.get_variable('W', initializer=tf.truncated_normal(filter_shape, stddev=0.1)) 166 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[n_filters])) 167 | conv = tf.nn.conv2d(all_input_expanded, W, strides=[1]*4, padding='VALID', name='conv') 168 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 169 | pooled = tf.nn.max_pool(h, 170 | ksize=[1, self.max_len - filter_size + 1, 1, 1], 171 | strides=[1, 1, 1, 1], 172 | padding='VALID', 173 | name='pool') 174 | pooled_outputs.append(pooled) 175 | num_filters_total = n_filters * len(filter_sizes) 176 | h_pool = tf.concat(pooled_outputs, 3) 177 | h_pool_flatten = tf.reshape(h_pool, [-1, num_filters_total]) 178 | h_drop = tf.nn.dropout(h_pool_flatten, self.dropout_keep_prob) 179 | c_outputs.append(h_drop) 180 | cnn_outputs = tf.reshape(tf.concat(c_outputs, 1), (-1, n_sub, num_filters_total)) 181 | 182 | if share_dense: 183 | cnn_outputs = tf.reshape(cnn_outputs, (-1, num_filters_total)) 184 | W = tf.get_variable('W', shape=[num_filters_total, self.n_classes]) 185 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 186 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 187 | else: 188 | cnn_outputs = tf.reshape(tf.concat(c_outputs, 1), (-1, n_sub, num_filters_total)) 189 | W = tf.get_variable('W', shape=[self.batch_size, num_filters_total, self.n_classes]) 190 | b = tf.get_variable('b', initializer=tf.constant(0.1, shape=[self.n_classes])) 191 | self.logits = tf.nn.xw_plus_b(cnn_outputs, W, b, name='scores') 192 | 193 | y_ = tf.nn.softmax(self.logits) 194 | self.prob = tf.reshape(y_, [-1, n_sub, 4]) 195 | self.prediction = tf.argmax(self.prob, 2, name="prediction") 196 | 197 | if not self.config.balance: 198 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y, [-1,4]))) 199 | self.loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=tf.reshape(self.input_y2, [-1,4]))) 200 | else: 201 | # class0_weight = 0.882 * self.n_classes # 第0类的权重系数 202 | # class1_weight = 0.019 * self.n_classes # 第1类的权重系数 203 | # class2_weight = 0.080 * self.n_classes # 第2类的权重系数 204 | # class3_weight = 0.019 * self.n_classes # 第3类的权重系数 205 | class0_weight = 1 # 第0类的权重系数 206 | 
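# Hand-tuned weights for the weighted cross-entropy below: class 0 (no label
# for the subject) covers roughly 88% of targets (see the frequencies in the
# commented-out lines above), so the three sentiment classes are up-weighted 3x.
# The log terms are unclipped; tf.log(y_ + 1e-8) (not used here) would guard
# against NaNs if a softmax output underflows to 0.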
class1_weight = 3 # 第1类的权重系数 207 | class2_weight = 3 # 第2类的权重系数 208 | class3_weight = 3 # 第3类的权重系数 209 | # coe = tf.constant([1., 1., 1., 1.]) 210 | # y = tf.reshape(self.input_y, [-1, 4]) * coe 211 | # self.loss = -tf.reduce_mean(y * tf.log(y_)) 212 | 213 | y = tf.reshape(self.input_y, [-1, 4]) 214 | self.loss = tf.reduce_mean(-class0_weight * (y[:, 0]*tf.log(y_[:, 0])) 215 | -class1_weight * (y[:, 1]*tf.log(y_[:, 1])) 216 | -class2_weight * (y[:, 2]*tf.log(y_[:, 2])) 217 | -class3_weight * (y[:, 3]*tf.log(y_[:, 3]))) 218 | # tf.reduce_mean(-class1_weight*tf.reduce_sum(y_[:,0] * tf.log(y[:,0])-class2_weight*tf.reduce_sum(y_[:,1] * tf.log(y[:,1])-class3_weight*tf.reduce_sum(y_[:,2] * tf.log(y[:,2])) 219 | 220 | return self 221 | 222 | -------------------------------------------------------------------------------- /src/model/xgboost_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpjoe/CCF-BDCI-Automotive-Field-ASC-2018/8b35560da3520aa9ada15e5f5abee3f0b99b7180/src/model/xgboost_model.py -------------------------------------------------------------------------------- /src/pack_sub_dt2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import pandas as pd 4 | import glob 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.metrics import f1_score 8 | 9 | test_df = pd.read_csv('../data/csvs/test_public.csv') 10 | train_df = pd.read_csv('../data/csvs/train_multi.csv') 11 | true_labels = train_df.iloc[:, 6:].values 12 | 13 | submit_df = pd.DataFrame(columns=['content_id', 'subject', 'sentiment_value', 'sentiment_word']) 14 | train_oof_df = pd.DataFrame(columns=['content_id', 'subject', 'sentiment_value', 'sentiment_word']) 15 | submit_df['content_id'] = test_df['content_id'] 16 | train_oof_df['content_id'] = train_df['content_id'] 17 | 18 | 19 | 20 | pre_path = '../data/result/0.807*' 21 | pre_filenames = glob.glob(pre_path) 22 | train_oof_filenames = glob.glob(pre_path.replace('pre', 'oof')) 23 | 24 | pre = np.argmax(pickle.load(open(pre_filenames[0], 'rb')), 2) 25 | train_oof_pred = np.argmax(pickle.load(open(train_oof_filenames[0], 'rb')), 2) 26 | 27 | print(pre_filenames) 28 | label_itos = [s.split('_')[1] for s in pickle.load(open('../data/sub_list.pkl', 'rb'))] 29 | n_none = 0 30 | n_mul_label = {} 31 | 32 | f1s = [] 33 | 34 | content_ids = [] 35 | subjects = [] 36 | sentiment_values = [] 37 | lost_ids = [] 38 | 39 | for idx, c_id in enumerate(test_df['content_id']): 40 | n_label = np.sum(pre[idx] > 0) 41 | if not n_label: 42 | n_none += 1 43 | lost_ids.append(c_id) 44 | else: 45 | n_mul_label[n_label] = n_mul_label.get(n_label, 0) + 1 46 | labels = list(np.where(pre[idx]>0)[0]) 47 | for l in labels: 48 | content_ids.append(c_id) 49 | subjects.append(label_itos[l]) 50 | sentiment_values.append(pre[idx][l]-2) 51 | 52 | soft_df = pd.read_csv('../data/submit/676.csv') 53 | lost_df = soft_df[soft_df['content_id'].isin(lost_ids)] 54 | submit_df = pd.DataFrame({'content_id': content_ids + list(lost_df['content_id']), 55 | 'subject': subjects + list(lost_df['subject']), 56 | 'sentiment_value': sentiment_values + list(lost_df['sentiment_value']), 57 | # 'subject': subjects + ['']*len(lost_ids), 58 | # 'sentiment_value': sentiment_values + ['']*len(lost_ids), 59 | 'sentiment_word': ['']*(len(lost_df)+len(subjects))}) 60 | 61 | print('n_none:', n_none) 62 | print('n_pad:', len(lost_df)) 63 | os.makedirs('../data/submit', exist_ok=True) 64 | 
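# Final submission: one row per predicted (content_id, subject), with
# sentiment_value recovered as pre[idx][l] - 2 (classes 1/2/3 -> -1/0/1).
# content_ids where every subject scored class 0 are backfilled from the
# earlier soft submission ../data/submit/676.csv instead of being dropped.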
submit_df.to_csv('../data/submit/dt3_stacking_submission.csv', index=None) 65 | 66 | # for i in range(train_oof_pred.shape[1]): 67 | # pre_label = train_oof_pred[:, i] 68 | # true_label = true_labels[:, i] 69 | # f1 = f1_score(true_label, pre_label, average='macro') 70 | # f1s.append(f1) 71 | 72 | # f1 = np.mean(f1s) 73 | # print('f1s->', f1s) 74 | # print('mean f1', f1) 75 | # print('n_none:', n_none) 76 | # os.makedirs('../data/submit', exist_ok=True) 77 | 78 | # submit_df.to_csv('../data/submit/dt2_{}_submission.csv'.format(f1), index=None) 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/stacking.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import pandas as pd 4 | from config import Config 5 | from keras.utils import np_utils 6 | from keras.layers import * 7 | from model.snapshot import SnapshotCallbackBuilder 8 | from model.my_callbacks import JZTrainCategory 9 | from keras.models import * 10 | from sklearn.preprocessing import MinMaxScaler 11 | from sklearn.model_selection import KFold 12 | from sklearn.metrics import accuracy_score, f1_score 13 | 14 | from model.model_basic import BasicModel 15 | import numpy as np 16 | import os 17 | 18 | 19 | def get_f1_score(x, y, verbose=False): 20 | tp = np.sum(np.logical_and(y > 0, x == y)) 21 | fp = np.sum(np.logical_and(x > 0, y == 0)) + np.sum(np.logical_and(x * y > 0, y != x)) # 多判或者错判 22 | fn = np.sum(np.logical_and(y > 0, x == 0)) # 漏判 23 | 24 | P = float(tp) / (float(tp + fp) + 1e-8) 25 | R = float(tp) / (float(tp + fn) + 1e-8) 26 | F = 2 * P * R / (P + R + 1e-8) 27 | 28 | if verbose: 29 | print('P->', P) 30 | print('R->', R) 31 | print('F->', F) 32 | return F 33 | 34 | 35 | def data_prepare(): 36 | train_df = pd.read_csv(config.TRAIN_X) 37 | 38 | if config.data_type == 0: 39 | train_y = {} 40 | sub_list = pickle.load(open('../data/sub_list.pkl', 'rb')) 41 | for sub in sub_list: 42 | train_y_val = train_df[sub].values 43 | train_y[sub] = np_utils.to_categorical(train_y_val, num_classes=config.n_class) 44 | elif config.data_type == 1: 45 | train_y = train_df['c_numerical'].values 46 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_class) 47 | elif config.data_type == 2: 48 | train_y = {} 49 | train_y['subject'] = train_df['sub_numerical'].values 50 | train_y['subject'] = np_utils.to_categorical(train_y['subject'], num_classes=10) 51 | train_y['sentiment_value'] = train_df['sentiment_value'].values 52 | train_y['sentiment_value'] = np_utils.to_categorical(train_y['sentiment_value'], num_classes=3) 53 | 54 | elif config.data_type == 3: 55 | # 主要融合这个 56 | train_y = train_df.iloc[:, 6:].values 57 | targets = train_y.reshape(-1) 58 | one_hot_targets = np.eye(config.n_classes)[targets] 59 | train_y = one_hot_targets.reshape(-1, 10, config.n_classes) 60 | elif config.data_type == 4: 61 | train_y = (train_df['sentiment_value']+1).values 62 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_class) 63 | elif config.data_type == 5: 64 | train_y = train_df.iloc[:, 4:].values 65 | 66 | else: 67 | exit('错误数据类别') 68 | 69 | # oof features 70 | filenames = glob.glob('../data/result-qiuqiu/*oof*') 71 | filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse-distillation/*oof*')) 72 | filenames.extend(glob.glob('../data/11_11_result/*oof*')) 73 | # filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse/*oof*')) 74 | # 
filenames.extend(glob.glob('../data/result-dt3-op1-embed300-debugFalse-enhance/*oof*')) 75 | 76 | # filenames = glob.glob('../data/result-stacking/*oof*'.format(args.data_type)) 77 | # def filter(filename, f_value): 78 | # return float(filename.split('_')[-3][1:-4]) > f_value 79 | 80 | # filenames = [e for e in filenames if filter(e, args.f_value)] 81 | # filenames = glob.glob('../data/result-dt{}-op1-embed300-debugFalse-enhance/*oof*'.format(args.data_type)) 82 | from pprint import pprint 83 | pprint(filenames) 84 | 85 | oof_filename = [] 86 | test_filename = [] 87 | for j, filename in enumerate(filenames): 88 | p_filename = filename.replace('_oof_', '_pre_') 89 | oof_filename.append(filename) 90 | test_filename.append(p_filename) 91 | 92 | oof_data = [] 93 | test_data = [] 94 | for i, (tra, tes) in enumerate(zip(oof_filename, test_filename)): 95 | 96 | oof_feature = pickle.load(open(tra, 'rb')) 97 | print(tra, oof_feature.shape) 98 | oof_data.append(oof_feature) 99 | 100 | oof_feature = pickle.load(open(tes, 'rb')) 101 | print(tes, oof_feature.shape) 102 | test_data.append(oof_feature) 103 | 104 | train_x = np.concatenate(oof_data, axis=-1) 105 | test_x = np.concatenate(test_data, axis=-1) 106 | # train_x = np.reshape(train_x, [-1, train_x.shape[-1]]) 107 | # test_x = np.reshape(test_x, [-1, test_x.shape[-1]]) 108 | print('train_x shape: ', train_x.shape) 109 | print('train_y shape: ', train_y.shape) 110 | print('test_x shape: ', test_x.shape) 111 | 112 | return train_x, train_y, test_x 113 | 114 | 115 | def get_model(train_x): 116 | input_x = Input(shape=(train_x.shape[-2], train_x.shape[-1]), name='input') 117 | x = Dense(256, activation='relu')(input_x) 118 | x = Dropout(0.5)(x) 119 | x = Dense(128, activation='relu')(x) 120 | x = Dropout(0.5)(x) 121 | x = Dense(4, activation="softmax")(x) 122 | res_model = Model(inputs=[input_x], outputs=x) 123 | return res_model 124 | 125 | 126 | # 第一次stacking 127 | def stacking_first(train, train_y, test): 128 | savepath = './stack_op{}_dt{}_f_value{}/'.format(args.option, args.data_type, args.f_value) 129 | os.makedirs(savepath, exist_ok=True) 130 | 131 | count_kflod = 0 132 | num_folds = 5 133 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 134 | predict = np.zeros((test.shape[0], 10, 4)) 135 | oof_predict = np.zeros((train.shape[0], 10, 4)) 136 | scores = [] 137 | 138 | for i, (train_index, test_index) in enumerate(kf.split(train)): 139 | print('第{}折'.format(i)) 140 | 141 | kfold_X_train = {} 142 | kfold_X_valid = {} 143 | 144 | y_train, y_test = train_y[train_index], train_y[test_index] 145 | 146 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 147 | 148 | model_prefix = savepath + 'DNN' + str(count_kflod) 149 | if not os.path.exists(model_prefix): 150 | os.mkdir(model_prefix) 151 | 152 | M = 3 # number of snapshots 153 | alpha_zero = 1e-3 # initial learning rate 154 | snap_epoch = 30 155 | 156 | snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero) 157 | # M = 1 # number of snapshots 158 | # snap_epoch = 16 159 | # jz_schedule = JZTrainCategory(model_prefix, snap_epoch, M, save_weights_only=True, monitor='val_loss', factor=0.7, patience=1) 160 | 161 | res_model = get_model(train) 162 | res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 163 | res_model.summary() 164 | 165 | # res_model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCH, verbose=1, class_weight=class_weight) 166 | res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, 
epochs=snap_epoch, verbose=1, 167 | validation_data=(kfold_X_valid, y_test), 168 | callbacks=snapshot.get_callbacks(model_save_place=model_prefix)) 169 | 170 | evaluations = [] 171 | for i in os.listdir(model_prefix): 172 | if '.h5' in i: 173 | evaluations.append(i) 174 | 175 | test_pred_ = np.zeros((test.shape[0], 10, 4)) 176 | oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4)) 177 | for run, i in enumerate(evaluations): 178 | print('loading from {}'.format(os.path.join(model_prefix, i))) 179 | res_model.load_weights(os.path.join(model_prefix, i)) 180 | test_pred_ += res_model.predict(test, verbose=1, batch_size=256) / len(evaluations) 181 | oof_pred_ += res_model.predict(kfold_X_valid, batch_size=256) / len(evaluations) 182 | 183 | predict += test_pred_ / num_folds 184 | oof_predict[test_index] = oof_pred_ 185 | 186 | f1 = get_f1_score(np.argmax(oof_pred_, -1), np.argmax(y_test, -1), verbose=True) 187 | print(i, ' kflod cv f1 : ', str(f1)) 188 | count_kflod += 1 189 | scores.append(f1) 190 | print('f1 {} -> {}'.format(scores, np.mean(scores))) 191 | return predict, oof_predict, np.mean(scores) 192 | 193 | import lightgbm as lgb 194 | def stacking_lightgbm(train, train_y, test): 195 | train_y = np.argmax(train_y, 1) 196 | count_kflod = 0 197 | num_folds = 5 198 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 199 | predict = np.zeros((test.shape[0], config.n_class)) 200 | oof_predict = np.zeros((train.shape[0], config.n_class)) 201 | scores = [] 202 | f1s = [] 203 | 204 | params = {'objective': 'multiclass', 205 | 'bagging_seed': 10, 206 | 'boosting_type': 'gbdt', 207 | 'feature_fraction': 0.9, 208 | 'feature_fraction_seed': 10, 209 | 'lambda_l1': 0.5, 210 | 'lambda_l2': 0.5, 211 | 'learning_rate': 0.01, 212 | 'metric': 'multi_logloss', 213 | 'min_child_weight': 1, 214 | # 'min_split_gain': 0, 215 | 'device': 'gpu', 216 | 'gpu_platform_id': 0, 217 | 'gpu_device_id': config.gpu, 218 | 'min_sum_hessian_in_leaf': 0.1, 219 | 'num_leaves': 64, 220 | 'num_thread': -1, 221 | 'num_class': config.n_class, 222 | 'verbose': 1} 223 | 224 | for train_index, test_index in kf.split(train): 225 | 226 | y_train, y_test = train_y[train_index], train_y[test_index] 227 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 228 | 229 | d_train = lgb.Dataset(kfold_X_train, label=y_train) 230 | d_watch = lgb.Dataset(kfold_X_valid, label=y_test) 231 | 232 | best = lgb.train(params, d_train, num_boost_round=100, verbose_eval=5, 233 | valid_sets=d_watch, 234 | early_stopping_rounds=6) 235 | 236 | preds1 = best.predict(test) 237 | preds2 = best.predict(kfold_X_valid) 238 | 239 | predict += preds1 / num_folds 240 | # oof_predict[test_index] = preds2 241 | 242 | accuracy = mb.cal_acc(preds2, y_test) 243 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class) 244 | 245 | print('the kflod cv is : ', str(accuracy)) 246 | print('the kflod f1 is : ', str(f1)) 247 | count_kflod += 1 248 | scores.append(accuracy) 249 | f1s.append(f1) 250 | print('total scores is ', np.mean(scores)) 251 | print('total f1 is ', np.mean(f1s)) 252 | # return predict, np.mean(scores) 253 | return predict 254 | 255 | 256 | from sklearn.linear_model import LogisticRegression 257 | def stacking_lr(train, train_y, test): 258 | train_y = np.argmax(train_y, 1) 259 | count_kflod = 0 260 | num_folds = 6 261 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 262 | predict = np.zeros((test.shape[0], config.n_class)) 263 | scores = [] 264 | f1s = [] 265 | for train_index, test_index in kf.split(train): 266 | 
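# One fold of the logistic-regression stacker: fit on the concatenated
# out-of-fold predictions from data_prepare() (labels argmaxed back to class
# ids above) and average predict_proba over the num_folds models for the test
# prediction.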
267 | y_train, y_test = train_y[train_index], train_y[test_index] 268 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 269 | 270 | print('拟合数据') 271 | best = LogisticRegression(C=4, dual=True) 272 | best.fit(kfold_X_train, y_train) 273 | 274 | print('预测结果') 275 | preds1 = best.predict_proba(test) 276 | preds2 = best.predict_proba(kfold_X_valid) 277 | 278 | predict += preds1 / num_folds 279 | accuracy = mb.cal_acc(preds2, y_test) 280 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class) 281 | 282 | print('the kflod cv is : ', str(accuracy)) 283 | print('the kflod f1 is : ', str(f1)) 284 | count_kflod += 1 285 | scores.append(accuracy) 286 | f1s.append(f1) 287 | print('total scores is ', np.mean(scores)) 288 | print('total f1 is ', np.mean(f1s)) 289 | # return predict, np.mean(scores) 290 | return predict 291 | 292 | from sklearn import svm 293 | from sklearn.calibration import CalibratedClassifierCV 294 | 295 | def stacking_svm(train, train_y, test): 296 | train_y = np.argmax(train_y, 1) 297 | count_kflod = 0 298 | num_folds = 6 299 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 300 | predict = np.zeros((test.shape[0], config.n_class)) 301 | scores = [] 302 | f1s = [] 303 | for train_index, test_index in kf.split(train): 304 | 305 | y_train, y_test = train_y[train_index], train_y[test_index] 306 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 307 | 308 | print('拟合数据') 309 | best = svm.LinearSVC() 310 | best = CalibratedClassifierCV(best) 311 | best.fit(kfold_X_train, y_train) 312 | 313 | print('预测结果') 314 | preds1 = best.predict_proba(test) 315 | preds2 = best.predict_proba(kfold_X_valid) 316 | 317 | predict += preds1 / num_folds 318 | accuracy = mb.cal_acc(preds2, y_test) 319 | f1 = mb.cal_f_alpha(preds2, y_test, n_out=config.n_class) 320 | 321 | print('the kflod cv is : ', str(accuracy)) 322 | print('the kflod f1 is : ', str(f1)) 323 | count_kflod += 1 324 | scores.append(accuracy) 325 | f1s.append(f1) 326 | print('total scores is ', np.mean(scores)) 327 | print('total f1 is ', np.mean(f1s)) 328 | # return predict, np.mean(scores) 329 | return predict 330 | 331 | 332 | # 使用pseudo-labeling做第二次stacking 333 | def stacking_pseudo(train, train_y, test, results): 334 | answer = np.reshape(np.argmax(results, axis=-1), [-1]) 335 | answer = np.reshape(np.eye(4)[answer], [-1, 10, 4]) 336 | 337 | train_y = np.concatenate([train_y, answer], axis=0) 338 | train = np.concatenate([train, test], axis=0) 339 | 340 | savepath = './pesudo_{}_dt{}/'.format(args.option, args.data_type) 341 | if not os.path.exists(savepath): 342 | os.mkdir(savepath) 343 | count_kflod = 0 344 | num_folds = 5 345 | kf = KFold(n_splits=num_folds, shuffle=True, random_state=10) 346 | predict = np.zeros((test.shape[0], 10, 4)) 347 | oof_predict = np.zeros((train.shape[0], 10, 4)) 348 | scores = [] 349 | 350 | for i, (train_index, test_index) in enumerate(kf.split(train)): 351 | print('第{}折'.format(i)) 352 | 353 | kfold_X_train = {} 354 | kfold_X_valid = {} 355 | 356 | y_train, y_test = train_y[train_index], train_y[test_index] 357 | 358 | kfold_X_train, kfold_X_valid = train[train_index], train[test_index] 359 | 360 | model_prefix = savepath + 'DNN' + str(count_kflod) 361 | if not os.path.exists(model_prefix): 362 | os.mkdir(model_prefix) 363 | 364 | M = 3 # number of snapshots 365 | alpha_zero = 1e-3 # initial learning rate 366 | snap_epoch = 30 367 | 368 | snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero) 369 | # M = 1 # number of snapshots 370 | # 
snap_epoch = 16 371 | # jz_schedule = JZTrainCategory(model_prefix, snap_epoch, M, save_weights_only=True, monitor='val_loss', factor=0.7, patience=1) 372 | 373 | res_model = get_model(train) 374 | res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 375 | res_model.summary() 376 | 377 | # res_model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCH, verbose=1, class_weight=class_weight) 378 | res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, epochs=snap_epoch, verbose=1, 379 | validation_data=(kfold_X_valid, y_test), 380 | callbacks=snapshot.get_callbacks(model_save_place=model_prefix)) 381 | 382 | evaluations = [] 383 | for i in os.listdir(model_prefix): 384 | if '.h5' in i: 385 | evaluations.append(i) 386 | 387 | test_pred_ = np.zeros((test.shape[0], 10, 4)) 388 | oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4)) 389 | for run, i in enumerate(evaluations): 390 | print('loading from {}'.format(os.path.join(model_prefix, i))) 391 | res_model.load_weights(os.path.join(model_prefix, i)) 392 | test_pred_ += res_model.predict(test, verbose=1, batch_size=256) / len(evaluations) 393 | oof_pred_ += res_model.predict(kfold_X_valid, batch_size=256) / len(evaluations) 394 | 395 | predict += test_pred_ / num_folds 396 | oof_predict[test_index] = oof_pred_ 397 | 398 | f1 = get_f1_score(np.argmax(oof_pred_, -1), np.argmax(y_test, -1), verbose=True) 399 | print(i, ' kflod cv f1 : ', str(f1)) 400 | count_kflod += 1 401 | scores.append(f1) 402 | print('f1 {} -> {}'.format(scores, np.mean(scores))) 403 | return predict, np.mean(scores) 404 | 405 | def save_result(predict, prefix): 406 | os.makedirs('../data/result', exist_ok=True) 407 | with open('../data/result/{}.pkl'.format(prefix), 'wb') as f: 408 | pickle.dump(predict, f) 409 | 410 | if __name__ == '__main__': 411 | import argparse 412 | parser = argparse.ArgumentParser() 413 | parser.add_argument('--gpu', type=str, default='6') 414 | parser.add_argument('--model', type=str, help='模型') 415 | parser.add_argument('--option', type=int, default=1, help='训练方式') 416 | parser.add_argument('--data_type', type=int, default=1, help='问题模式, 0为4分类, 1为单分类, 2为先分主题再分情感') 417 | parser.add_argument('--feature', default='word', type=str, help='选择word或者char作为特征') 418 | parser.add_argument('--es', default=200, type=int, help='embed size') 419 | parser.add_argument('--debug', default=False, action='store_true') 420 | parser.add_argument('--bs', default=256, type=int, help='batch size') 421 | parser.add_argument('--f_value', default=0.0, type=float) 422 | args = parser.parse_args() 423 | 424 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 425 | 426 | import tensorflow as tf 427 | from keras.backend.tensorflow_backend import set_session 428 | tf_config = tf.ConfigProto() 429 | tf_config.gpu_options.allow_growth=True 430 | set_session(tf.Session(config=tf_config)) 431 | 432 | mb = BasicModel() 433 | config = Config() 434 | config.gpu = args.gpu 435 | config.data_type = args.data_type 436 | BATCH_SIZE = args.bs 437 | 438 | # cv_stacking() 439 | 440 | # normal stacking 441 | train, train_y, test = data_prepare() 442 | 443 | predicts, oof_predicts, score = stacking_first(train, train_y, test) 444 | save_result(predicts, prefix=str(score)) 445 | # save_result(oof_predicts, prefix='oof') 446 | 447 | # predicts = stacking_lightgbm(train, train_y, test) 448 | # save_result(predicts[:10000], prefix='stacking_lgb_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value)) 449 | 450 | # predicts = stacking_lr(train, 
train_y, test) 451 | # save_result(predicts[:10000], prefix='stacking_lr_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value)) 452 | 453 | # predicts = stacking_svm(train, train_y, test) 454 | # save_result(predicts[:10000], prefix='stacking_svm_first_op{}_{}_{}'.format(args.option, args.data_type, args.f_value)) 455 | 456 | # 假标签 457 | predicts, score = stacking_pseudo(train, train_y, test, predicts) 458 | save_result(predicts, prefix=str(score)) 459 | -------------------------------------------------------------------------------- /src/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 
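# In Python 3 this reduces to: return str unchanged and decode bytes as UTF-8
# (ignoring errors); in Python 2, unicode is re-encoded to UTF-8 so the result
# is always a printable native str.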
52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenziation.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return convert_by_vocab(self.inv_vocab, ids) 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because Wikipedia does have some Chinese 155 | # words in the English Wikipedia.). 
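# _tokenize_chinese_chars pads every CJK codepoint with spaces, e.g. "空间大"
# becomes " 空  间  大 ", so the whitespace/punctuation tokenization below ends
# up emitting one token per Chinese character.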
156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and handled 222 | # like the all of the other languages. 223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenziation.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 
262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `chars` is a whitespace character.""" 313 | # \t, \n, and \r are technically contorl characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `chars` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `chars` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation. 339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 
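# The ASCII ranges below cover 33-47 -> !"#$%&'()*+,-./ , 58-64 -> :;<=>?@ ,
# 91-96 -> [\]^_` , 123-126 -> {|}~ ; anything else falls through to the
# Unicode "P*" category check.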
342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /src/train_elmo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | 6 | from bilm.training import train, load_options_latest_checkpoint, load_vocab 7 | from bilm.data import BidirectionalLMDataset 8 | 9 | 10 | def main(args): 11 | # load the vocab 12 | vocab = load_vocab(args.vocab_file, None) 13 | 14 | # define the options 15 | batch_size = 512 # batch size for each GPU 16 | n_gpus = 3 17 | os.environ['CUDA_VISIBLE_DEVICES'] = '1, 2, 6' 18 | 19 | # number of tokens in training data (this for 1B Word Benchmark) 20 | # word 8799 21 | # char 2355 22 | n_train_tokens = 768648884 23 | # n_train_tokens = 8799 24 | 25 | options = { 26 | 'bidirectional': True, 27 | 28 | # 'char_cnn': {'activation': 'relu', 29 | # 'embedding': {'dim': 16}, 30 | # 'filters': [[1, 32], 31 | # [2, 32], 32 | # [3, 64], 33 | # [4, 128], 34 | # [5, 256], 35 | # [6, 512], 36 | # [7, 1024]], 37 | # 'max_characters_per_token': 50, 38 | # 'n_characters': 261, 39 | # 'n_highway': 2}, 40 | 41 | 'dropout': 0.1, 42 | 43 | 'lstm': { 44 | 'cell_clip': 3, 45 | 'dim': 4096, 46 | 'n_layers': 2, 47 | 'proj_clip': 3, 48 | 'projection_dim': 512, 49 | 'use_skip_connections': True}, 50 | 51 | 'all_clip_norm_val': 10.0, 52 | 53 | 'n_epochs': 10, 54 | 'n_train_tokens': n_train_tokens, 55 | 'batch_size': batch_size, 56 | 'n_tokens_vocab': vocab.size, 57 | 'unroll_steps': 20, 58 | 'n_negative_samples_batch': 1024, 59 | } 60 | 61 | print('vocab_size:', vocab.size) 62 | prefix = args.train_prefix 63 | data = BidirectionalLMDataset(prefix, vocab, test=False, 64 | shuffle_on_load=True) 65 | 66 | tf_save_dir = args.save_dir 67 | tf_log_dir = args.save_dir 68 | train(options, data, n_gpus, tf_save_dir, tf_log_dir) 69 | 70 | 71 | if __name__ == '__main__': 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--save_dir', help='Location of checkpoint files') 74 | parser.add_argument('--vocab_file', help='Vocabulary file') 75 | parser.add_argument('--train_prefix', help='Prefix for train files') 76 | 77 | args = parser.parse_args() 78 | main(args) 79 | -------------------------------------------------------------------------------- /src/train_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pickle 4 | from config import Config 5 | import numpy as np 6 | from tqdm import tqdm 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | import logging 10 | from gensim.models.word2vec import Word2Vec 11 | from bilm import TokenBatcher 12 | from scipy.sparse import hstack 13 | 14 | import tokenization 15 | from keras.preprocessing import sequence 16 | from keras.utils import np_utils 17 | import tensorflow as tf 18 | 19 | # np.random.seed(201) 20 | # tf.set_random_seed(201) 21 | 22 | logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s') 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | def deep_data_prepare(config): 27 | print('深度学习模型数据准备') 28 | train_df = 
pd.read_csv(config.TRAIN_X) 29 | train_jp = pd.read_csv(config.TRAIN_JP) 30 | train_en = pd.read_csv(config.TRAIN_EN) 31 | test_df = pd.read_csv(config.TEST_X) 32 | 33 | char_sw_list = pickle.load(open('../data/char_stopword.pkl', 'rb')) 34 | word_sw_list = pickle.load(open('../data/word_stopword.pkl', 'rb')) 35 | # 用词向量 36 | # 用字向量 37 | train_x_char = train_df['char'] 38 | train_x_word = train_df['word'] 39 | # train_x_sent_word = [w for w in open('../data/sentiment_word.txt')] 40 | # train_x_sent_char = [w for w in open('../data/sentiment_word.txt')] 41 | train_jp_char = train_jp['char'] 42 | train_jp_word = train_jp['word'] 43 | train_en_char = train_en['char'] 44 | train_en_word = train_en['word'] 45 | 46 | train_char = pd.concat((train_x_char, train_jp_char, train_en_char)) 47 | train_word = pd.concat((train_x_word, train_jp_word, train_en_word)) 48 | test_char = test_df['char'] 49 | test_word = test_df['word'] 50 | 51 | if config.data_type == 0: 52 | train_y = train_df['sub_numerical'].values 53 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes) 54 | 55 | elif config.data_type == 1: 56 | train_y = train_df['sentiment_value'].values 57 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes) 58 | 59 | elif config.data_type == 2: 60 | train_y = np.array(train_df.iloc[:, 6:].values) 61 | elif config.data_type == 3: 62 | train_y = train_df.iloc[:, 6:].values 63 | targets = train_y.reshape(-1) 64 | one_hot_targets = np.eye(config.n_classes)[targets] 65 | train_y = one_hot_targets.reshape(-1, 10, config.n_classes) 66 | elif config.data_type == 4: 67 | train_y = (train_df['sentiment_value']+1).values 68 | train_y = np_utils.to_categorical(train_y, num_classes=config.n_classes) 69 | elif config.data_type == 5: 70 | train_y = train_df.iloc[:, 4:].values 71 | else: 72 | exit('错误数据类别') 73 | 74 | UNK_CHAR = len(char_stoi) 75 | PAD_CHAR = len(char_stoi) + 1 76 | 77 | UNK_WORD = len(word_stoi) 78 | PAD_WORD = len(word_stoi) + 1 79 | 80 | def generate_hann_data(df): 81 | import re 82 | hann_train_word = np.full(shape=(len(df['word']), config.HANN_SENT, config.HANN_WORD_LEN), fill_value=PAD_WORD) 83 | hann_train_char = np.full(shape=(len(df['char']), config.HANN_SENT, config.HANN_CHAR_LEN), fill_value=PAD_CHAR) 84 | 85 | for i, sentences in enumerate(df['word']): 86 | sentences = re.split(r" 。 | , ", sentences) 87 | for j, sent in enumerate(sentences): 88 | if j < config.HANN_SENT: 89 | k = 0 90 | word_tokens = sent.split() 91 | for _, word in enumerate(word_tokens): 92 | if k < config.HANN_WORD_LEN and word not in word_sw_list and word in word_stoi: 93 | hann_train_word[i, j, k] = word_stoi[word] 94 | k += 1 95 | 96 | for i, sentences in enumerate(df['char']): 97 | sentences = re.split(r" 。 | , ", sentences) 98 | for j, sent in enumerate(sentences): 99 | if j < config.HANN_SENT: 100 | k = 0 101 | word_tokens = sent.split() 102 | for _, word in enumerate(word_tokens): 103 | if k < config.HANN_CHAR_LEN and word not in char_sw_list and word in char_stoi: 104 | hann_train_char[i, j, k] = char_stoi[word] 105 | k += 1 106 | return hann_train_word, hann_train_char 107 | 108 | hann_train_word, hann_train_char = generate_hann_data(train_df) 109 | hann_test_word, hann_test_char = generate_hann_data(test_df) 110 | 111 | def word2id(train_dialogs, type='char'): 112 | if type == 'char': 113 | stoi = char_stoi 114 | max_len = config.CHAR_MAXLEN 115 | UNK = UNK_CHAR 116 | sw_list = set(char_sw_list) 117 | elif type == 'word': 118 | stoi = word_stoi 119 | max_len = 
config.WORD_MAXLEN 120 | UNK = UNK_WORD 121 | sw_list = set(word_sw_list) 122 | else: 123 | exit('类型错误') 124 | 125 | train_x = [] 126 | for d in tqdm(train_dialogs): 127 | d = str(d).split() 128 | line = [] 129 | for token in d: 130 | if token in sw_list\ 131 | or token == ''\ 132 | or token == ' ': 133 | continue 134 | if token in stoi: 135 | line.append(stoi[token]) 136 | else: 137 | line.append(UNK) 138 | 139 | train_x.append(line[:max_len]) 140 | return train_x 141 | 142 | # 普通模型数据 143 | train_x_word = word2id(train_word, type='word') 144 | train_x_char = word2id(train_char, type='char') 145 | test_x_char = word2id(test_char, type='char') 146 | test_x_word = word2id(test_word, type='word') 147 | 148 | # train_x_sent_word = word2id(train_x_sent_word, type='word') 149 | # train_x_sent_char = word2id(train_x_sent_char, type='char') 150 | # rcnn模型数据准备 151 | UNK_CHAR = PAD_CHAR 152 | UNK_WORD = PAD_WORD 153 | 154 | train_word_left = [[UNK_WORD] + w[:-1] for w in train_x_word] 155 | train_word_right = [w[1:] + [UNK_WORD] for w in train_x_word] 156 | train_char_left = [[UNK_CHAR] + w[:-1] for w in train_x_char] 157 | train_char_right = [w[1:] + [UNK_CHAR] for w in train_x_char] 158 | 159 | test_word_left = [[UNK_WORD] + w[:-1] for w in test_x_word] 160 | test_word_right = [w[1:] + [UNK_WORD] for w in test_x_word] 161 | test_char_left = [[UNK_CHAR] + w[:-1] for w in test_x_char] 162 | test_char_right = [w[1:] + [UNK_CHAR] for w in test_x_char] 163 | 164 | train_x_char = sequence.pad_sequences(train_x_char, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 165 | train_x_word = sequence.pad_sequences(train_x_word, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 166 | train_x_char_left = sequence.pad_sequences(train_char_left, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 167 | train_x_word_left = sequence.pad_sequences(train_word_left, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 168 | train_x_char_right = sequence.pad_sequences(train_char_right, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 169 | train_x_word_right = sequence.pad_sequences(train_word_right, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 170 | 171 | test_x_char = sequence.pad_sequences(test_x_char, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 172 | test_x_word = sequence.pad_sequences(test_x_word, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 173 | test_x_char_left = sequence.pad_sequences(test_char_left, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 174 | test_x_word_left = sequence.pad_sequences(test_word_left, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 175 | test_x_char_right = sequence.pad_sequences(test_char_right, maxlen=config.CHAR_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_CHAR) 176 | test_x_word_right = sequence.pad_sequences(test_word_right, maxlen=config.WORD_MAXLEN, dtype='int32', padding='post', truncating='post', value=UNK_WORD) 177 | 178 | print('train_x char shape is: ', train_x_char.shape) 179 | print('train_x word shape is: ', train_x_word.shape) 180 | print('test_x char shape is: ', test_x_char.shape) 181 | print('test_x word 

    train = {}
    test = {}
    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=config.BERT_VOCAB_FILES, do_lower_case=False)

    # def get_bert_data(corpus):
    #     input_ids = []
    #     input_mask = []
    #     input_segment_ids = []

    #     for sent in train_df['word'].values:
    #         sent = ''.join(sent.strip().split())
    #         tmp_token_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(sent)[:188] + ['[SEP]'])
    #         tmp_mask = [1] * len(tmp_token_ids)
    #         tmp_segment_ids = [0] * len(tmp_token_ids)
    #         if len(tmp_token_ids) < 190:
    #             tmp_segment_ids.extend([0] * (190-len(tmp_token_ids)))
    #             tmp_mask.extend([0] * (190-len(tmp_token_ids)))
    #             tmp_token_ids.extend([0] * (190-len(tmp_token_ids)))
    #         input_ids.append(tmp_token_ids)
    #         input_mask.append(tmp_mask)
    #         input_segment_ids.append(tmp_segment_ids)
    #     return np.array(input_ids, dtype='int32'), np.array(input_mask, dtype='int32'), np.array(input_segment_ids, dtype='int32')

    # train['token_id'], train['mask_id'], train['type_id'] = get_bert_data(train_df['word'].values)
    # test['token_id'], test['mask_id'], test['type_id'] = get_bert_data(test_df['word'].values)

    train['word'] = train_x_word
    train['char'] = train_x_char
    # train['word_sent'] = train_x_sent_word
    # train['char_sent'] = train_x_sent_char
    # rcnn
    train['word_left'] = train_x_word_left
    train['word_right'] = train_x_word_right
    train['char_left'] = train_x_char_left
    train['char_right'] = train_x_char_right
    # han
    train['hann_word'] = hann_train_word
    train['hann_char'] = hann_train_char

    test['word'] = test_x_word
    test['char'] = test_x_char
    test['word_left'] = test_x_word_left
    test['word_right'] = test_x_word_right
    test['char_left'] = test_x_char_left
    test['char_right'] = test_x_char_right
    test['hann_word'] = hann_test_word
    test['hann_char'] = hann_test_char

    assert train['word_left'].shape == train['word_right'].shape == train['word'].shape
    assert train['char_left'].shape == train['char_right'].shape == train['char'].shape
    assert test['word_left'].shape == test['word_right'].shape == test['word'].shape
    assert test['char_left'].shape == test['char_right'].shape == test['char'].shape

    # batcher = TokenBatcher(config.elmo_word_vocab_file)
    # train['elmo_word'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
    # test['elmo_word'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])

    # batcher = TokenBatcher(config.elmo_char_vocab_file)
    # train['elmo_char'] = batcher.batch_sentences([str(w).split()[:config.CHAR_MAXLEN] for w in train_df['char']])
    # test['elmo_char'] = batcher.batch_sentences([str(w).split()[:config.CHAR_MAXLEN] for w in test_df['char']])

    # batcher = TokenBatcher(config.elmo_qiuqiu_vocab_file)
    # train['elmo_qiuqiu'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
    # test['elmo_qiuqiu'] = batcher.batch_sentences([str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])

    return train, train_y, test
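
# For reference, a sketch of what deep_data_prepare() hands back (shapes follow from the
# padding calls above; the 'hann_*' and label arrays are built earlier in the function,
# outside this excerpt):
#   train / test : dicts of int32 arrays
#       'word', 'word_left', 'word_right' -> (n_samples, config.WORD_MAXLEN)
#       'char', 'char_left', 'char_right' -> (n_samples, config.CHAR_MAXLEN)
#       'hann_word', 'hann_char'          -> sentence-level inputs for the HAN model
#   train_y : label array for the training set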

def init_embedding(config, type='word'):
    model_file = config.word_w2v_file if type == 'word' else config.char_w2v_file
    item_to_id = word_stoi if type == 'word' else char_stoi
    vocab_len = len(item_to_id) + 2
    print('Vocabulary size: ', vocab_len)
    print('create embedding matrix')

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(model_file).readlines()[1:])

    all_embs = np.stack(embeddings_index.values())
    embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(vocab_len, config.EMBED_SIZE)).astype(dtype='float32')
    embed_matrix[-1] = 0  # padding

    for word, i in tqdm(item_to_id.items()):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embed_matrix[i] = embedding_vector
    return embed_matrix


def deep_data_cache():
    char_w2v_embed = init_embedding(config, type='char')
    word_w2v_embed = init_embedding(config, type='word')

    train, train_y, test = deep_data_prepare(config)
    os.makedirs('../data/cache/', exist_ok=True)
    pickle.dump((train, train_y, test, char_w2v_embed, word_w2v_embed), open('../data/cache/deep_data_oe{}_es{}_dt{}_f{}.pkl'.format(config.outer_embed, config.EMBED_SIZE, config.data_type, config.main_feature), 'wb'))


def deep_data_process():
    deep_data_cache()
    (train, train_y, test, char_w2v_embed, word_w2v_embed) = pickle.load(open('../data/cache/deep_data_oe{}_es{}_dt{}_f{}.pkl'.format(config.outer_embed, config.EMBED_SIZE, config.data_type, config.main_feature), 'rb'))
    config.char_embedding = char_w2v_embed
    config.word_embedding = word_w2v_embed

    model = config.model[args.model](config=config, n_folds=5)
    if config.data_type == 0:
        model.single_train_predict(train, train_y, test, option=config.option)
    elif config.data_type == 1:
        model.single_train_predict(train, train_y, test, option=config.option)

    elif config.data_type == 2:
        model.multi_train_predict(train, train_y, test, option=config.option)
    elif config.data_type == 3:
        model.four_classify_train_predict(train, train_y, test, option=config.option)
    # # model.multi_train_predict(train, train_y, test, option=config.option)
    # elif config.data_type == 4:
    #     model.single_train_predict(train, train_y, test, option=config.option)
    # elif config.data_type == 5:
    #     model.multi_train_predict(train, train_y, test, option=config.option)

    else:
        exit('invalid data_type')
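
# Example of the cache file written by deep_data_cache(), assuming the default CLI
# arguments defined below (--es 300, --data_type 3, --feature word, no --oe):
#   ../data/cache/deep_data_oeFalse_es300_dt3_fword.pkl
# Passing --oe changes the oe field to True; note that deep_data_process() always
# rebuilds the cache before loading it.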

def static_data_prepare():
    model_name = config.model_name
    if not model_name:
        model_name = "model_dict.pkl"
    logger.info('start load data')
    train_df = pd.read_csv(config.TRAIN_MULTI_X)
    test_df = pd.read_csv(config.TEST_X)
    if model_name == 'svc':
        content_word = pd.concat((train_df['word'], test_df['word']))
        content_char = pd.concat((train_df['char'], test_df['char']))
        word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=5, norm='l2')
        char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), min_df=1, norm='l2')

        ha = HashingVectorizer(ngram_range=(1, 1), lowercase=False)
        discuss_ha = ha.fit_transform(content_word)

        logger.info('start word feature extraction')
        word_feature = word_vectorizer.fit_transform(content_word)
        logger.info("complete word feature extraction models")
        logger.info("vocab len: %d" % len(word_vectorizer.vocabulary_.keys()))

        logger.info('start char feature extraction')
        char_feature = char_vectorizer.fit_transform(content_char)
        logger.info("complete char feature extraction models")
        logger.info("vocab len: %d" % len(char_vectorizer.vocabulary_.keys()))

        # NOTE: the feature assignments below overwrite one another; only the last
        # pair (word TF-IDF features alone) is actually used.
        train_feature = hstack([word_feature[:len(train_df)], char_feature[:len(train_df)]]).tocsr()
        test_feature = hstack([word_feature[len(train_df):], char_feature[len(train_df):]]).tocsr()

        train_feature = hstack((word_feature[:len(train_df)], discuss_ha[:len(train_df)])).tocsr()
        test_feature = hstack((word_feature[len(train_df):], discuss_ha[len(train_df):])).tocsr()

        train_feature = word_feature[:len(train_df)]
        test_feature = word_feature[len(train_df):]

        logger.info("complete feature extraction")
        logger.info("train feature shape: {}".format(np.shape(train_feature)))
        logger.info("test feature shape: {}".format(np.shape(test_feature)))

        train_y = np.array(train_df.iloc[:, 6:].values)
    else:
        train_feature = np.asarray([train_df['word']]).T
        train_y = np.array(train_df.iloc[:, 6:].values)
        test_feature = np.asarray([test_df['word']]).T
    return train_feature, train_y, test_feature


def static_data_process():
    # model train
    train_x, train_y, test = static_data_prepare()
    model = config.model[args.model](config=config, n_folds=5)
    model.train_predict(train_x, train_y, test, option=config.option)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, default='6')
    parser.add_argument('--model', type=str, help='model name (a key of config.model)')
    parser.add_argument('--option', type=int, default=1, help='training scheme')
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--data_type', type=int, default=3, help='problem formulation: 0 = subject only, 1 = sentiment only, 2 = ten 4-way classifiers, 3 = aspect (asp) mode')
    parser.add_argument('--feature', default='word', type=str, help="use 'word' or 'char' as the main feature")
    parser.add_argument('--es', default=300, type=int, help='embed size')
    parser.add_argument('--debug', default=False, action='store_true', help='debug mode runs a single fold only')
    parser.add_argument('--oe', default=False, action='store_true', help='use Baidu Baike pretrained word embeddings')
    parser.add_argument('--ml', default=False, action='store_true', help='use a traditional (non-deep) model')
    parser.add_argument('--car', default=False, action='store_true', help='use word embeddings trained on Autohome data')
    parser.add_argument('--balance', default=False, action='store_true', help='reweight the loss according to the class ratio')
    parser.add_argument('--bs', default=64, type=int, help='batch size')
    args = parser.parse_args()

    # set the Keras backend and visible GPU
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    config = Config()
    config.option = args.option
    config.outer_embed = args.oe
    config.n_epochs = args.epoch
    config.main_feature = args.feature
    config.model_name = args.model
    config.is_debug = args.debug
    config.BATCH_SIZE = args.bs
    config.gpu = args.gpu
    config.EMBED_SIZE = args.es
    config.data_type = args.data_type
    config.car = args.car
    config.balance = args.balance

    if config.model_name in ['svc', 'fasttext']:
        args.ml = True

    if args.ml:
        static_data_process()
    else:
        char_stoi = pickle.load(open(config.char_stoi_file, 'rb'))
        word_stoi = pickle.load(open(config.word_stoi_file, 'rb'))

        deep_data_process()

--------------------------------------------------------------------------------