├── test_lib.py ├── scripts ├── __init__.py ├── constant.py ├── net_components.py ├── augment.py ├── cnn.py ├── util.py ├── stack.py └── rnn.py ├── HARNN.png ├── VDCNN.png ├── HARNN1.png ├── TextCNN.png ├── SARNNKeras.png ├── 2019-03-16-12:32:21.png ├── external_lib └── install_lib.sh ├── requirements.txt ├── README.md ├── main_stack.py ├── test.py ├── 1st place solution.md ├── test_elmo.py ├── main_stack_hier.py ├── main_elmo.py ├── main.py └── main_hierarchical.py /test_lib.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /HARNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/HARNN.png -------------------------------------------------------------------------------- /VDCNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/VDCNN.png -------------------------------------------------------------------------------- /HARNN1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/HARNN1.png -------------------------------------------------------------------------------- /TextCNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/TextCNN.png -------------------------------------------------------------------------------- /SARNNKeras.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/SARNNKeras.png -------------------------------------------------------------------------------- /2019-03-16-12:32:21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/2019-03-16-12:32:21.png -------------------------------------------------------------------------------- /external_lib/install_lib.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Install deepai_nlp 3 | cd deepai_nlp 4 | pip install -e . 5 | cd .. 6 | # Install elmo 7 | cd ELMoForManyLangs 8 | python setup.py install 9 | cd .. 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyvi 2 | pandas>=0.24.1 3 | spacy>=2.0.18 4 | gensim>=3.7.1 5 | scikit-learn>=0.20.2 6 | keras 7 | tensorflow 8 | keras-self-attention==0.35.0 9 | keras-multi-head==0.16.0 10 | keras-layer-normalization==0.10.0 11 | annoy==1.15.1 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aivivn_1 2 | 3 | Our submission for Aivivn Contest 1. 4 | 5 | By Nhat Pham and Hoang Phan. 
6 | 7 | Install environment: 8 | ```bash 9 | conda install python=3.6 10 | ``` 11 | 12 | Dependencies guide: 13 | 14 | ```bash 15 | pip install -r requirements.txt 16 | cd external_lib 17 | chmod a+x install_lib.sh 18 | ./install_lib.sh 19 | cd .. 20 | ``` 21 | Notebook test link: 22 | https://colab.research.google.com/drive/1fgtIYXkXKKmZVI2w62nCI22wiVSNEQxw 23 | 24 | Sample run command: 25 | 26 | ```bash 27 | python -m main -m VDCNN -e ./embeddings/baomoi.model.bin --max 40000 --mix --prob 28 | ``` 29 | -------------------------------------------------------------------------------- /scripts/constant.py: -------------------------------------------------------------------------------- 1 | # From spacy english model 2 | EMOTICONS = set(""" 3 | :) 4 | :-) 5 | :)) 6 | :-)) 7 | :))) 8 | :-))) 9 | (: 10 | (-: 11 | =) 12 | (= 13 | ") 14 | :] 15 | :-] 16 | [: 17 | [-: 18 | :o) 19 | (o: 20 | :} 21 | :-} 22 | 8) 23 | 8-) 24 | (-8 25 | ;) 26 | ;-) 27 | (; 28 | (-; 29 | :( 30 | :-( 31 | :(( 32 | :-(( 33 | :((( 34 | :-((( 35 | ): 36 | )-: 37 | =( 38 | >:( 39 | :') 40 | :'-) 41 | :'( 42 | :'-( 43 | :/ 44 | :-/ 45 | =/ 46 | =| 47 | :| 48 | :-| 49 | :1 50 | :P 51 | :-P 52 | :p 53 | :-p 54 | :O 55 | :-O 56 | :o 57 | :-o 58 | :0 59 | :-0 60 | :() 61 | >:o 62 | :* 63 | :-* 64 | :3 65 | :-3 66 | =3 67 | :> 68 | :-> 69 | :X 70 | :-X 71 | :x 72 | :-x 73 | :D 74 | :-D 75 | ;D 76 | ;-D 77 | =D 78 | xD 79 | XD 80 | xDD 81 | XDD 82 | 8D 83 | 8-D 84 | ^_^ 85 | ^__^ 86 | ^___^ 87 | >.< 88 | >.> 89 | <.< 90 | ._. 91 | ;_; 92 | -_- 93 | -__- 94 | v.v 95 | V.V 96 | v_v 97 | V_V 98 | o_o 99 | o_O 100 | O_o 101 | O_O 102 | 0_o 103 | o_0 104 | 0_0 105 | o.O 106 | O.o 107 | O.O 108 | o.o 109 | 0.0 110 | o.0 111 | 0.o 112 | @_@ 113 | <3 114 | <33 115 | <333 116 | _<) 121 | (*_*) 122 | (¬_¬) 123 | ಠ_ಠ 124 | ಠ︵ಠ 125 | (ಠ_ಠ) 126 | ¯\(ツ)/¯ 127 | (╯°□°)╯︵┻━┻ 128 | ><(((*> 129 | """.split()) 130 | 131 | DEFAULT_MAX_FEATURES = 12000 132 | DEFAULT_MAX_LENGTH = 100 133 | -------------------------------------------------------------------------------- /scripts/net_components.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Layer 2 | import keras.backend as K 3 | 4 | class AttLayer(Layer): 5 | def __init__(self, context_size): 6 | self._context_size = context_size 7 | self.supports_masking = True 8 | # self._linear = Dense(context_size, activation = "tanh") 9 | super(AttLayer, self).__init__() 10 | 11 | def build(self, input_shape): 12 | self._W = self.add_weight( 13 | name = "W", 14 | shape = (input_shape[-1], self._context_size), 15 | initializer="he_normal", 16 | trainable=True 17 | ) 18 | self._b = self.add_weight( 19 | name = "b", 20 | shape = (1, self._context_size), 21 | initializer="constant", 22 | trainable=True 23 | ) 24 | self._context = self.add_weight( 25 | name = "context", 26 | shape = (self._context_size, 1), 27 | initializer = "he_normal", 28 | trainable = True 29 | ) 30 | super(AttLayer, self).build(input_shape) 31 | 32 | 33 | def compute_mask(self, input, input_mask=None): 34 | return input_mask 35 | 36 | 37 | def call(self, input, mask = None): 38 | # input: (N, T, M) 39 | rep = K.tanh(K.dot(input, self._W) + self._b) # (N, T, C) 40 | score = K.squeeze(K.dot(rep, self._context), axis = -1) # (N, T) 41 | 42 | weight = K.exp(score) 43 | if mask is not None: 44 | weight *= K.cast(mask, K.floatx()) 45 | 46 | weight /= K.cast(K.sum(weight, axis = 1, keepdims = True) + K.epsilon(), K.floatx()) 47 | 48 | 49 | # weight = softmax(score, axis = -1) # (N, T) 50 
| op = K.batch_dot(input, weight, axes = (1, 1)) # (N, M) 51 | 52 | return op 53 | 54 | def compute_output_shape(self, input_shape): 55 | return (input_shape[0], input_shape[-1]) 56 | 57 | 58 | 59 | class AdditiveLayer(Layer): 60 | def __init__(self): 61 | super(AdditiveLayer, self).__init__() 62 | 63 | def build(self, input_shape): 64 | self._w = self.add_weight( 65 | name = "w", 66 | shape = (1, input_shape[-1]), 67 | initializer="constant", 68 | trainable=True 69 | ) 70 | super(AdditiveLayer, self).build(input_shape) 71 | 72 | 73 | 74 | def call(self, input): 75 | return input + self._w 76 | 77 | def compute_output_shape(self, input_shape): 78 | return input_shape 79 | -------------------------------------------------------------------------------- /scripts/augment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gensim.models import KeyedVectors 3 | import copy 4 | import random 5 | from gensim.similarities.index import AnnoyIndexer 6 | 7 | 8 | def shuffle_augment(texts, labels, n_increase, min_length = 1): 9 | texts_long = [] 10 | labels_long = [] 11 | 12 | if min_length > 1: 13 | for ind in range(len(texts)): 14 | if len(texts[ind]) >= min_length: 15 | texts_long.append(texts[ind]) 16 | labels_long.append(labels[ind]) 17 | else: 18 | texts_long = texts 19 | labels_long = labels 20 | 21 | 22 | shuffle_ind = np.random.choice(len(texts_long), size = n_increase) 23 | for ind in shuffle_ind: 24 | text_copy = np.random.permutation(texts_long[ind]) 25 | texts.append(text_copy) 26 | labels = np.append(labels, [labels_long[ind]]) 27 | 28 | 29 | return texts, labels 30 | 31 | 32 | def similar_augment(texts, labels, n_increase, n_word_replace, model_path, similar_threshold = 0.5, use_annoy = True, annoy_path = None): 33 | w2v = KeyedVectors.load_word2vec_format(model_path, binary=True) 34 | texts_long = [] 35 | labels_long = [] 36 | if use_annoy: 37 | if annoy_path is None: 38 | indexer = AnnoyIndexer(w2v, 100) 39 | else: 40 | indexer = AnnoyIndexer() 41 | indexer.load(annoy_path) 42 | 43 | for ind in range(len(texts)): 44 | if len(texts[ind]) >= n_word_replace: 45 | texts_long.append(texts[ind]) 46 | labels_long.append(labels[ind]) 47 | 48 | shuffle_ind = np.random.choice(len(texts_long), size = n_increase) 49 | for ind in shuffle_ind: 50 | text_copy = copy.deepcopy(texts_long[ind]) 51 | # if is_hier: 52 | 53 | replace_inds = np.random.choice(text_copy.shape[-1], size = n_word_replace, replace = False) 54 | for word_ind in replace_inds: 55 | word = text_copy[word_ind] 56 | try: 57 | 58 | closest, score = w2v.wv.most_similar( 59 | word, topn = 2, 60 | indexer = indexer if use_annoy else None 61 | )[1] 62 | if score > similar_threshold: 63 | text_copy[word_ind] = closest 64 | except: 65 | continue 66 | 67 | texts.append(text_copy) 68 | labels = np.append(labels, [labels_long[ind]]) 69 | 70 | return texts, labels 71 | 72 | 73 | 74 | 75 | def create_sim_dict(word_map, model_path, similar_threshold = 0.5, use_annoy = True, annoy_path = None): 76 | w2v = KeyedVectors.load_word2vec_format(model_path, binary=True) 77 | if use_annoy: 78 | if annoy_path is None: 79 | indexer = AnnoyIndexer(w2v, 100) 80 | else: 81 | indexer = AnnoyIndexer() 82 | indexer.load(annoy_path) 83 | 84 | sim_dict = dict() 85 | for word in word_map: 86 | try: 87 | closest, score = w2v.wv.most_similar( 88 | word, topn=2, 89 | indexer=indexer if use_annoy else None 90 | )[1] 91 | if score > similar_threshold and closest in word_map: 92 | sim_dict[word_map[word]] = 
word_map[closest] 93 | except: 94 | continue 95 | 96 | return sim_dict 97 | 98 | def similar_augment_from_sim_dict(texts, labels, sim_dict, n_increase, keep_prob = 0.5): 99 | aug_ind = np.random.choice(len(texts), size = n_increase) 100 | i = -1 101 | for ind in aug_ind: 102 | i += 1 103 | text_aug = copy.deepcopy(texts[ind]) 104 | for word_ind in range(len(text_aug)): 105 | word = text_aug[word_ind] 106 | if word in sim_dict: 107 | p = random.uniform(0, 1) 108 | if p > keep_prob: 109 | text_aug[word_ind] = sim_dict[word] 110 | 111 | texts = np.append(texts, [text_aug], axis = 0) 112 | labels = np.append(labels, [labels[ind]], axis = 0) 113 | 114 | return texts, labels 115 | 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /main_stack.py: -------------------------------------------------------------------------------- 1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, f1 2 | from scripts.constant import DEFAULT_MAX_FEATURES 3 | from sklearn.model_selection import train_test_split 4 | from scripts.rnn import SARNNKeras 5 | from scripts.cnn import LSTMCNN, VDCNN 6 | from scripts.stack import StackedGeneralizer 7 | import argparse 8 | import os 9 | import numpy as np 10 | import datetime 11 | import pandas as pd 12 | from sklearn.metrics import f1_score 13 | 14 | from sklearn.linear_model import LogisticRegression 15 | from keras.utils import CustomObjectScope 16 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention 17 | 18 | 19 | def stack(models_list, embedding_path, max_features, should_mix): 20 | model_name = '-'.join( 21 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' ')) 22 | 23 | train_data = read_file('./data/train.crash') 24 | test_data = read_file('./data/test.crash', is_train=False) 25 | train_tokenized_texts = tokenize(train_data['text']) 26 | test_tokenizes_texts = tokenize(test_data['text']) 27 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1) 28 | 29 | embed_size, word_map, embedding_mat = make_embedding( 30 | list(train_tokenized_texts) + 31 | list(test_tokenizes_texts) if should_mix else train_tokenized_texts, 32 | embedding_path, 33 | max_features 34 | ) 35 | 36 | texts_id = text_to_sequences(train_tokenized_texts, word_map) 37 | print('Number of train data: {}'.format(labels.shape)) 38 | 39 | texts_id_train, texts_id_val, labels_train, labels_val = train_test_split( 40 | texts_id, labels, test_size=0.05) 41 | 42 | model_path = './models/{}-version'.format(model_name) 43 | 44 | try: 45 | os.mkdir('./models') 46 | except: 47 | print('Folder already created') 48 | try: 49 | os.mkdir(model_path) 50 | except: 51 | print('Folder already created') 52 | 53 | batch_size = 16 54 | epochs = 100 55 | patience = 3 56 | 57 | meta_model = LogisticRegression() 58 | models = [ 59 | model( 60 | embeddingMatrix=embedding_mat, 61 | embed_size=400, 62 | max_features=embedding_mat.shape[0] 63 | ) 64 | for model in models_list 65 | ] 66 | 67 | 68 | stack = StackedGeneralizer(models, meta_model) 69 | stack.train_meta_model( 70 | texts_id_train, labels_train, 71 | texts_id_val, labels_val, 72 | model_path = model_path, 73 | epochs = epochs, 74 | batch_size = batch_size, 75 | patience = patience 76 | ) 77 | 78 | stack.train_models( 79 | X = texts_id_train, y = labels_train, 80 | X_val = texts_id_val, y_val = labels_val, 81 | batch_size = batch_size, 82 | epochs = epochs, 83 | patience = patience, 84 | model_path = model_path 85 | ) 86 | 87 | 
prediction = stack.predict(texts_id_val) 88 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val))) 89 | with open('{}/f1'.format(model_path), 'w') as fp: 90 | fp.write(str(f1_score(prediction, labels_val))) 91 | 92 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map) 93 | test_prediction = stack.predict(test_id_texts) 94 | 95 | df_predicton = pd.read_csv("./data/sample_submission.csv") 96 | df_predicton["label"] = test_prediction 97 | 98 | print('Number of test data: {}'.format(df_predicton.shape[0])) 99 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False) 100 | 101 | 102 | 103 | if __name__ == '__main__': 104 | models_list = [ 105 | SARNNKeras, LSTMCNN, VDCNN 106 | ] 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument( 109 | '-e', 110 | '--embedding', 111 | help='Model use', 112 | default='./embeddings/smallFasttext.vi.vec' 113 | ) 114 | parser.add_argument( 115 | '--max', 116 | help='Model use', 117 | default=DEFAULT_MAX_FEATURES 118 | ) 119 | parser.add_argument( 120 | '--mix', 121 | action='store_true', 122 | help='Model use' 123 | ) 124 | args = parser.parse_args() 125 | 126 | with CustomObjectScope({ 127 | 'SeqSelfAttention': SeqSelfAttention, 128 | 'SeqWeightedAttention': SeqWeightedAttention, 129 | 'f1': f1} 130 | ): 131 | stack( 132 | models_list, args.embedding, int(args.max), args.mix 133 | ) 134 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, find_threshold 2 | import numpy as np 3 | from scripts.constant import DEFAULT_MAX_FEATURES 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import f1_score 6 | import tensorflow as tf 7 | import random as rn 8 | import pandas as pd 9 | 10 | 11 | 12 | from keras.models import Model 13 | from keras.layers import Dense, Embedding, Input, GRU, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, Lambda 14 | from keras.callbacks import EarlyStopping, ModelCheckpoint 15 | import keras.backend as K 16 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention 17 | 18 | # np random seed: 19 | np.random.seed(22) 20 | 21 | # # Setting the seed for python random numbers 22 | rn.seed(1254) 23 | # 24 | # # Setting the graph-level random seed. 
25 | tf.set_random_seed(89) 26 | 27 | def SARNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100): 28 | inp = Input(shape = (maxlen, )) 29 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp) 30 | x = Bidirectional(LSTM(128, return_sequences = True))(x) 31 | x = SeqSelfAttention( 32 | attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL, 33 | attention_regularizer_weight=1e-4, 34 | )(x) 35 | x = Dropout(0.5)(x) 36 | x = Bidirectional(LSTM(128, return_sequences = True))(x) 37 | x = SeqWeightedAttention()(x) 38 | x = Dropout(0.5)(x) 39 | x = Dense(64, activation = "relu")(x) 40 | x = Dropout(0.5)(x) 41 | x = Dense(1, activation = "sigmoid")(x) 42 | model = Model(inputs = inp, outputs = x) 43 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 44 | return model 45 | 46 | 47 | def RNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100): 48 | inp = Input(shape = (maxlen, )) 49 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp) 50 | x = Bidirectional(LSTM(128, return_sequences = True))(x) 51 | x = Dropout(0.5)(x) 52 | x = Bidirectional(LSTM(128, return_sequences = True))(x) 53 | x = Dropout(0.5)(x) 54 | x = GlobalMaxPool1D()(x) 55 | x = Dense(64, activation = "relu")(x) 56 | x = Dropout(0.5)(x) 57 | x = Dense(1, activation = "sigmoid")(x) 58 | model = Model(inputs = inp, outputs = x) 59 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 60 | return model 61 | 62 | 63 | 64 | 65 | 66 | def f1(y_true, y_pred): 67 | def recall(y_true, y_pred): 68 | """Recall metric. 69 | 70 | Only computes a batch-wise average of recall. 71 | 72 | Computes the recall, a metric for multi-label classification of 73 | how many relevant items are selected. 74 | """ 75 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 76 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 77 | recall = true_positives / (possible_positives + K.epsilon()) 78 | return recall 79 | 80 | def precision(y_true, y_pred): 81 | """Precision metric. 82 | 83 | Only computes a batch-wise average of precision. 84 | 85 | Computes the precision, a metric for multi-label classification of 86 | how many selected items are relevant. 
87 | """ 88 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 89 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 90 | precision = true_positives / (predicted_positives + K.epsilon()) 91 | return precision 92 | precision = precision(y_true, y_pred) 93 | recall = recall(y_true, y_pred) 94 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 95 | 96 | 97 | 98 | 99 | 100 | data = read_file("./data/train.crash") 101 | tokenized_texts = tokenize(data["text"]) 102 | labels = data["label"].values.astype(np.float16).reshape(-1, 1) 103 | 104 | embed_size, word_map, embedding_mat = make_embedding( 105 | tokenized_texts, 106 | embedding_path = "./data/baomoi.model.bin", 107 | max_features = 40000 108 | ) 109 | 110 | 111 | 112 | texts_id = text_to_sequences(tokenized_texts, word_map) 113 | print(labels.shape) 114 | print(texts_id.shape) 115 | 116 | texts_id_train, texts_id_val, labels_train, labels_val = train_test_split( 117 | texts_id, labels, 118 | test_size = 0.05 119 | ) 120 | 121 | checkpoint = ModelCheckpoint( 122 | filepath = "./Weights/model_sa_2.hdf5", 123 | monitor = 'val_f1', verbose = 1, 124 | mode = 'max', 125 | save_best_only = True 126 | ) 127 | early = EarlyStopping(monitor = "val_f1", mode = "max", patience = 3) 128 | callbacks_list = [checkpoint, early] 129 | batch_size = 16 130 | epochs = 100 131 | 132 | 133 | model = SARNNKerasCPU( 134 | embeddingMatrix = embedding_mat, 135 | embed_size = 400, 136 | max_features = embedding_mat.shape[0] 137 | ) 138 | model.fit( 139 | texts_id_train, labels_train, 140 | validation_data = (texts_id_val, labels_val), 141 | callbacks = callbacks_list, 142 | epochs = epochs, 143 | batch_size = 16 144 | ) 145 | 146 | 147 | 148 | 149 | model.load_weights("./Weights/model_sa_2.hdf5") 150 | prediction_prob = model.predict(texts_id_val) 151 | 152 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val) 153 | print(OPTIMAL_THRESHOLD) 154 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8) 155 | print(f1_score( 156 | y_true = labels_val.reshape(-1), 157 | y_pred = prediction.reshape(-1) 158 | )) 159 | 160 | 161 | 162 | data_test = read_file("./data/test.crash", is_train = False) 163 | tokenized_texts_test = tokenize(data_test["text"]) 164 | texts_id_test = text_to_sequences(tokenized_texts_test, word_map) 165 | prediction_test = model.predict(texts_id_test) 166 | df_predicton = pd.read_csv("./data/sample_submission.csv") 167 | df_predicton["label"] = (prediction_test > OPTIMAL_THRESHOLD).astype(np.int8) 168 | print(df_predicton.shape[0]) 169 | df_predicton.to_csv("./prediction/prediction_sa_2.csv", index = False) -------------------------------------------------------------------------------- /1st place solution.md: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | 4 | --- 5 | 6 |

1st place solution analysis

7 |

Hi everyone, we are Hoang and Nhat from team HoangNhat2 on the leaderboard. First of all, we would like to thank Tiệp and everyone on the Aivivn team for organizing such an interesting machine learning contest on Vietnamese language processing. We learned a lot of new things through this competition.

8 |

Summary of our approach:

9 |

We do not have much NLP background, so we focused on trying out deep learning models and seeing which ones worked well. After trying a large number of models, we realized that no single model could beat 0.89x on the Public LB, even though some performed very well on local validation. We then stopped trying new models and instead experimented with ways to combine models and with augmenting the training data.

10 |

After experimenting to get a suitable level of diversity, our 1st place solution is a Weighted Ensemble of the following models (a small sketch of the weighted averaging is given after the list):

11 |
    12 |
  1. TextCNN (Weight: 0.1) source
  2. Inspired VDCNN (Weight: 0.1) source
  3. HARNN (Weight: 0.3) source
  4. SARNN (Weight: 0.5) source
17 |
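Concretely, the final submission is just a weighted average of each model's predicted probability, followed by the fixed 0.5 threshold described in section 2. The snippet below is a minimal sketch of that averaging, not code from this repository; the probability arrays are hypothetical stand-ins for each trained model's `predict()` output.

```python
# Minimal sketch of the weighted ensemble (illustration only, not repository code).
# The four probability arrays stand in for the predict() output of each trained model.
import numpy as np

def weighted_ensemble(probs, weights, threshold=0.5):
    probs = np.asarray(probs, dtype=np.float32)               # (n_models, n_samples, 1)
    weights = np.asarray(weights, dtype=np.float32).reshape(-1, 1, 1)
    avg_prob = (weights * probs).sum(axis=0) / weights.sum()  # weighted average per sample
    return (avg_prob > threshold).astype(np.int8), avg_prob

rng = np.random.RandomState(0)
fake_probs = [rng.rand(10, 1) for _ in range(4)]              # TextCNN, VDCNN, HARNN, SARNN
labels, avg = weighted_ensemble(fake_probs, [0.1, 0.1, 0.3, 0.5])
print(labels.ravel())
```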

The pretrained embeddings we tested and used were:

18 | 21 |

We trained our models mostly on Google Colab using its GPU. Training each model took about 10-20 minutes (the models converge after roughly 5-10 epochs). The CNN models trained much faster than the RNN models, probably because they are not sequential models and can therefore make better use of the GPU during training.

22 |

Details of our approach:

23 |

1. Models:

24 |

1.1 TextCNN:

25 |

This is our CNN model for text classification.

26 |

Architecture:

27 |

TextCNN

28 |

1.2 VDCNN:

29 |

Similar to TextCNN, but with Residual layers between the Convolution layers to avoid vanishing gradients.
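The full VDCNN variant is defined in scripts/cnn.py; the fragment below only sketches the residual idea under assumed sizes (128 filters, kernel size 5, an input of 100 timesteps): two same-padded Conv1D layers whose output is added back onto the block input before pooling.

```python
# Sketch of one residual convolution block in the spirit of VDCNN in scripts/cnn.py.
# Filter count, kernel size and input shape here are illustrative assumptions.
from keras.layers import Input, Conv1D, BatchNormalization, Add, MaxPool1D
from keras.models import Model

inp = Input(shape=(100, 128))                        # (timesteps, channels)
x = Conv1D(128, 5, activation='relu', padding='same')(inp)
x = BatchNormalization()(x)
x = Conv1D(128, 5, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = Add()([inp, x])                                  # skip connection keeps a short gradient path
x = MaxPool1D(pool_size=2, strides=2)(x)
residual_block = Model(inputs=inp, outputs=x)
residual_block.summary()
```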

30 |

Architecture:

31 |

VDCNN

32 |

1.3 HARNN:

33 |

HARNN processes the text at two levels:

34 |
    35 |
  1. Compute an encoding for each sentence in the paragraph from its word embeddings, using a BiLSTM.
  2. Use another BiLSTM to compute the document encoding from the sentence encodings.
38 |

Each of these BiLSTM layers is followed by an Attention layer (a minimal sketch of this two-level structure is shown below).
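The actual HARNN / OriginalHARNN models live in scripts/rnn.py (not shown in this excerpt). The sketch below is only an illustration of the two-level structure, with assumed sizes (3 sentences of 50 tokens, 400-dimensional embeddings) and SeqWeightedAttention standing in for the attention layers.

```python
# Illustrative two-level sketch (word -> sentence -> document); sizes are assumptions,
# the real models are HARNN / OriginalHARNN in scripts/rnn.py.
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout
from keras_self_attention import SeqWeightedAttention

max_nb_sent, max_sent_len, max_features, embed_size = 3, 50, 20000, 400

# Word level: encode one sentence into a single vector.
sent_inp = Input(shape=(max_sent_len,))
w = Embedding(max_features, embed_size)(sent_inp)
w = Bidirectional(LSTM(128, return_sequences=True))(w)
sent_vec = SeqWeightedAttention()(w)                 # attention over the words of a sentence
sent_encoder = Model(sent_inp, sent_vec)

# Sentence level: encode the sequence of sentence vectors into a document vector.
doc_inp = Input(shape=(max_nb_sent, max_sent_len))
s = TimeDistributed(sent_encoder)(doc_inp)           # run the sentence encoder on every sentence
s = Bidirectional(LSTM(64, return_sequences=True))(s)
doc_vec = SeqWeightedAttention()(s)                  # attention over the sentences of a document
out = Dense(1, activation='sigmoid')(Dropout(0.5)(doc_vec))
harnn_sketch = Model(doc_inp, out)
harnn_sketch.compile(loss='binary_crossentropy', optimizer='adam')
```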

39 |

Architecture Word2Sent:

40 |

VDCNN

41 |

Architecture Sent2Doc:

42 |

VDCNN

43 |

1.4 SARNN:

44 |

This is a BiLSTM model with an Attention layer between the two BiLSTM layers (a condensed sketch is shown below).
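A CPU version of this model is already defined as SARNNKerasCPU in test.py; the condensed restatement below keeps the same layer order and sizes. The zero embedding matrix is a placeholder for the pretrained one, and the f1 metric is omitted to keep the snippet self-contained.

```python
# Condensed restatement of SARNNKerasCPU from test.py (same layer order and sizes).
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

maxlen, max_features, embed_size = 100, 20000, 400
embedding_matrix = np.zeros((max_features, embed_size))      # placeholder for the pretrained matrix

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = SeqSelfAttention(attention_type=SeqSelfAttention.ATTENTION_TYPE_MUL,
                     attention_regularizer_weight=1e-4)(x)   # attention between the two BiLSTMs
x = Dropout(0.5)(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = SeqWeightedAttention()(x)                                # weighted-attention pooling
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
out = Dense(1, activation='sigmoid')(x)
sarnn = Model(inputs=inp, outputs=out)
sarnn.compile(loss='binary_crossentropy', optimizer='adam')
```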

45 |

Architecture:

46 |

VDCNN

47 |

2. Combine models:

48 |

We tried several ways of combining the models, such as Stacking and Ensembling, and found that Ensembling gave the best results. To choose the weights, we looked at which model scored best on the Public LB and gave that model the highest weight. We kept the raw probabilities and used a fixed threshold of 0.5 instead of searching for an optimal threshold, since threshold tuning did not improve the results much.

49 |

Side notes:

50 |

Difficulties:

51 | 56 |

Approaches we tried:

57 | 61 |
    62 |
  1. Randomly replacing words in a sentence with synonyms. We did this by replacing each word with the word whose embedding is nearest to it in our vocabulary (nearest neighbour). This change did not bring a noticeable improvement, but we think that with a good thesaurus or a better vector-similarity metric it could give good results (see the sketch after this list).
  2. Shuffling the sentences fed to the HARNN model in order to generate many different documents.
  3. Translating from Vietnamese to other languages and back again. Unfortunately Google Translate has now blocked this ;__;
66 |
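The repository implements the synonym-replacement idea in scripts/augment.py (similar_augment and create_sim_dict). The fragment below is a pared-down illustration of the nearest-neighbour lookup with gensim; the embedding path and the 0.5 similarity threshold are illustrative assumptions.

```python
# Pared-down illustration of the synonym replacement in scripts/augment.py.
# The embedding path and the similarity threshold are illustrative assumptions.
import numpy as np
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('./embeddings/baomoi.model.bin', binary=True)

def replace_with_neighbours(tokens, n_replace=2, threshold=0.5):
    tokens = list(tokens)
    picks = np.random.choice(len(tokens), size=min(n_replace, len(tokens)), replace=False)
    for idx in picks:
        try:
            # augment.py takes the second hit of topn=2; with an Annoy indexer the first
            # hit is usually the query word itself.
            neighbour, score = w2v.most_similar(tokens[idx], topn=2)[1]
        except KeyError:                     # out-of-vocabulary word: leave it unchanged
            continue
        if score > threshold:
            tokens[idx] = neighbour
    return tokens
```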

Closing remarks:

67 |

Once again, thank you to the organizers, and congratulations on completing Aivivn's very first Datathon. We hope future contests will attract even more supporters, and that there will be more discussion both during and outside the contest, since we felt there was a lack of discussion about baseline models during the competition.

68 | 69 | -------------------------------------------------------------------------------- /scripts/cnn.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers import \ 3 | Dense, Embedding, Input, \ 4 | Conv1D, MaxPool1D, \ 5 | Dropout, BatchNormalization, \ 6 | Bidirectional, CuDNNLSTM, \ 7 | Concatenate, Flatten, Add 8 | from .util import f1 9 | from .net_components import AdditiveLayer 10 | 11 | 12 | 13 | # Based on https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/ 14 | # https://www.aclweb.org/anthology/D14-1181 15 | def TextCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False): 16 | if use_fasttext: 17 | inp = Input(shape=(maxlen, embed_size)) 18 | x = inp 19 | else: 20 | inp = Input(shape = (maxlen, )) 21 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 22 | 23 | if use_additive_emb: 24 | x = AdditiveLayer()(x) 25 | x = Dropout(0.5)(x) 26 | 27 | 28 | conv_ops = [] 29 | for filter_size in filter_sizes: 30 | conv = Conv1D(128, filter_size, activation = 'relu')(x) 31 | pool = MaxPool1D(5)(conv) 32 | conv_ops.append(pool) 33 | 34 | concat = Concatenate(axis = 1)(conv_ops) 35 | # concat = Dropout(0.1)(concat) 36 | concat = BatchNormalization()(concat) 37 | 38 | 39 | conv_2 = Conv1D(128, 5, activation = 'relu')(concat) 40 | conv_2 = MaxPool1D(5)(conv_2) 41 | conv_2 = BatchNormalization()(conv_2) 42 | # conv_2 = Dropout(0.1)(conv_2) 43 | 44 | conv_3 = Conv1D(128, 5, activation = 'relu')(conv_2) 45 | conv_3 = MaxPool1D(5)(conv_3) 46 | conv_3 = BatchNormalization()(conv_3) 47 | # conv_3 = Dropout(0.1)(conv_3) 48 | 49 | 50 | flat = Flatten()(conv_3) 51 | 52 | op = Dense(64, activation = "relu")(flat) 53 | # op = Dropout(0.5)(op) 54 | op = BatchNormalization()(op) 55 | op = Dense(1, activation = "sigmoid")(op) 56 | 57 | model = Model(inputs = inp, outputs = op) 58 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 59 | return model 60 | 61 | 62 | def VDCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False): 63 | if use_fasttext: 64 | inp = Input(shape=(maxlen, embed_size)) 65 | x = inp 66 | else: 67 | inp = Input(shape = (maxlen, )) 68 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 69 | 70 | if use_additive_emb: 71 | x = AdditiveLayer()(x) 72 | x = Dropout(0.5)(x) 73 | 74 | conv_ops = [] 75 | for filter_size in filter_sizes: 76 | conv = Conv1D(128, filter_size, activation = 'relu')(x) 77 | pool = MaxPool1D(5)(conv) 78 | conv_ops.append(pool) 79 | 80 | concat = Concatenate(axis = 1)(conv_ops) 81 | # concat = Dropout(0.1)(concat) 82 | concat = BatchNormalization()(concat) 83 | 84 | 85 | conv_2_main = Conv1D(128, 5, activation = 'relu', padding='same')(concat) 86 | conv_2_main = BatchNormalization()(conv_2_main) 87 | conv_2_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_2_main) 88 | conv_2_main = BatchNormalization()(conv_2_main) 89 | conv_2 = Add()([concat, conv_2_main]) 90 | conv_2 = MaxPool1D(pool_size = 2, strides = 2)(conv_2) 91 | # conv_2 = BatchNormalization()(conv_2) 92 | # conv_2 = Dropout(0.1)(conv_2) 93 | 94 | 
conv_3_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_2) 95 | conv_3_main = BatchNormalization()(conv_3_main) 96 | conv_3_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_3_main) 97 | conv_3_main = BatchNormalization()(conv_3_main) 98 | conv_3 = Add()([conv_2, conv_3_main]) 99 | conv_3 = MaxPool1D(pool_size = 2, strides = 2)(conv_3) 100 | # conv_3 = BatchNormalization()(conv_3) 101 | # conv_3 = Dropout(0.1)(conv_3) 102 | 103 | 104 | flat = Flatten()(conv_3) 105 | 106 | op = Dense(64, activation = "relu")(flat) 107 | # op = Dropout(0.5)(op) 108 | op = BatchNormalization()(op) 109 | op = Dense(1, activation = "sigmoid")(op) 110 | 111 | model = Model(inputs = inp, outputs = op) 112 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 113 | return model 114 | 115 | 116 | 117 | # Based on http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/ 118 | def LSTMCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False): 119 | if use_fasttext: 120 | inp = Input(shape=(maxlen, embed_size)) 121 | x = inp 122 | else: 123 | inp = Input(shape = (maxlen, )) 124 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 125 | 126 | if use_additive_emb: 127 | x = AdditiveLayer()(x) 128 | x = Dropout(0.5)(x) 129 | 130 | x = Bidirectional(CuDNNLSTM(128, return_sequences = True))(x) 131 | 132 | 133 | conv_ops = [] 134 | for filter_size in filter_sizes: 135 | conv = Conv1D(128, filter_size, activation = 'relu')(x) 136 | pool = MaxPool1D(5)(conv) 137 | conv_ops.append(pool) 138 | 139 | concat = Concatenate(axis = 1)(conv_ops) 140 | concat = Dropout(0.5)(concat) 141 | # concat = BatchNormalization()(concat) 142 | 143 | 144 | conv_2 = Conv1D(128, 5, activation = 'relu')(concat) 145 | conv_2 = MaxPool1D(5)(conv_2) 146 | # conv_2 = BatchNormalization()(conv_2) 147 | conv_2 = Dropout(0.5)(conv_2) 148 | 149 | # conv_3 = Conv1D(128, 5, activation = 'relu')(conv_2) 150 | # conv_3 = MaxPool1D(5)(conv_3) 151 | # conv_3 = BatchNormalization()(conv_3) 152 | # conv_3 = Dropout(0.1)(conv_3) 153 | 154 | 155 | flat = Flatten()(conv_2) 156 | 157 | op = Dense(64, activation = "relu")(flat) 158 | op = Dropout(0.5)(op) 159 | # op = BatchNormalization()(op) 160 | op = Dense(1, activation = "sigmoid")(op) 161 | 162 | model = Model(inputs = inp, outputs = op) 163 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 164 | return model 165 | -------------------------------------------------------------------------------- /test_elmo.py: -------------------------------------------------------------------------------- 1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, find_threshold 2 | import numpy as np 3 | from scripts.constant import DEFAULT_MAX_FEATURES 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import f1_score 6 | from elmoformanylangs import Embedder 7 | import tensorflow as tf 8 | import random as rn 9 | import pandas as pd 10 | import timeit 11 | 12 | 13 | 14 | from keras.models import Model, load_model, model_from_json 15 | from keras.utils import Sequence 16 | from keras.layers import Dense, Embedding, Input, GRU, Bidirectional, GlobalMaxPool1D, Dropout, Lambda 17 | from keras.callbacks import EarlyStopping, ModelCheckpoint 18 
| import keras.backend as K 19 | 20 | # np random seed: 21 | np.random.seed(22) 22 | 23 | # # Setting the seed for python random numbers 24 | rn.seed(1254) 25 | # 26 | # # Setting the graph-level random seed. 27 | tf.set_random_seed(89) 28 | 29 | elmo_path = "./data/elmo/" 30 | 31 | 32 | batch_size = 16 33 | epochs = 100 34 | 35 | 36 | 37 | elmo = Embedder(elmo_path, batch_size = batch_size) 38 | 39 | 40 | def to_length(texts, length): 41 | def pad_func(vector, pad_width, iaxis, kwargs): 42 | str = kwargs.get('padder', '') 43 | vector[:pad_width[0]] = str 44 | vector[-pad_width[1]:] = str 45 | return vector 46 | 47 | ret = [] 48 | for sentence in texts: 49 | sentence = np.array(sentence, dtype = np.unicode) 50 | sentence = sentence[:min(length, len(sentence))] 51 | if length > len(sentence): 52 | sentence = np.pad( 53 | sentence, mode = pad_func, 54 | pad_width = (0, length - len(sentence)) 55 | ) 56 | ret.append(sentence) 57 | 58 | return np.array(ret) 59 | 60 | 61 | class TrainSeq(Sequence): 62 | def __init__(self, X, y, batch_size): 63 | self._X, self._y = X, y 64 | self._batch_size = batch_size 65 | self._indices = np.arange(len(self._X)) 66 | 67 | def __len__(self): 68 | return len(self._X) // self._batch_size 69 | 70 | def __getitem__(self, idx): 71 | id = self._indices[idx * self._batch_size:(idx + 1) * self._batch_size] 72 | return np.array(elmo.sents2elmo(self._X[id])), self._y[id] 73 | 74 | def on_epoch_end(self): 75 | np.random.shuffle(self._indices) 76 | 77 | 78 | class TestSeq(Sequence): 79 | def __init__(self, x, batch_size): 80 | self._X = x 81 | self._batch_size = batch_size 82 | 83 | def __len__(self): 84 | return len(self._X) // batch_size 85 | 86 | def __getitem__(self, idx): 87 | return np.array(elmo.sents2elmo(self._X[idx * self._batch_size:(idx + 1) * self._batch_size])) 88 | 89 | 90 | 91 | 92 | def RNNKerasCPUNoEmbedding(embed_size = 1024, maxlen = 100): 93 | inp = Input(shape = (maxlen, embed_size)) 94 | x = Bidirectional(GRU(256, return_sequences = True))(inp) 95 | x = Dropout(0.5)(x) 96 | x = Bidirectional(GRU(256, return_sequences = True))(x) 97 | x = Dropout(0.5)(x) 98 | x = GlobalMaxPool1D()(x) 99 | x = Dropout(0.5)(x) 100 | x = Dense(64, activation = "relu")(x) 101 | x = Dropout(0.5)(x) 102 | x = Dense(1, activation = "sigmoid")(x) 103 | model = Model(inputs = inp, outputs = x) 104 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 105 | return model 106 | 107 | 108 | 109 | 110 | 111 | def f1(y_true, y_pred): 112 | def recall(y_true, y_pred): 113 | """Recall metric. 114 | 115 | Only computes a batch-wise average of recall. 116 | 117 | Computes the recall, a metric for multi-label classification of 118 | how many relevant items are selected. 119 | """ 120 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 121 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 122 | recall = true_positives / (possible_positives + K.epsilon()) 123 | return recall 124 | 125 | def precision(y_true, y_pred): 126 | """Precision metric. 127 | 128 | Only computes a batch-wise average of precision. 129 | 130 | Computes the precision, a metric for multi-label classification of 131 | how many selected items are relevant. 
132 | """ 133 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 134 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 135 | precision = true_positives / (predicted_positives + K.epsilon()) 136 | return precision 137 | precision = precision(y_true, y_pred) 138 | recall = recall(y_true, y_pred) 139 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 140 | 141 | 142 | 143 | 144 | 145 | data = read_file("./data/train.crash") 146 | data_test = read_file("./data/test.crash", is_train = False) 147 | 148 | labels = data["label"].values.astype(np.float16).reshape(-1, 1) 149 | texts = tokenize(data["text"]) 150 | texts_test = tokenize(data_test["text"]) 151 | 152 | 153 | texts = to_length(texts, 100) 154 | texts_test = to_length(texts_test, 100) 155 | 156 | texts_train, texts_val, labels_train, labels_val = train_test_split( 157 | texts, labels, 158 | test_size = 0.05 159 | ) 160 | 161 | 162 | checkpoint = ModelCheckpoint( 163 | filepath = "./Weights/model_elmo.hdf5", 164 | monitor = 'val_f1', verbose = 1, 165 | mode = 'max', 166 | save_best_only = True 167 | ) 168 | early = EarlyStopping(monitor = "val_f1", mode = "max", patience = 3) 169 | callbacks_list = [checkpoint, early] 170 | 171 | train_seq = TrainSeq(texts_train, labels_train, batch_size = batch_size) 172 | val_seq = TrainSeq(texts_val, labels_val, batch_size = 1) 173 | test_seq = TestSeq(texts_test, batch_size = 1) 174 | 175 | 176 | model = RNNKerasCPUNoEmbedding() 177 | model.fit_generator( 178 | train_seq, 179 | validation_data = val_seq, 180 | callbacks = callbacks_list, 181 | epochs = epochs, 182 | workers = False 183 | ) 184 | 185 | 186 | 187 | 188 | model.load_weights("./Weights/model_elmo.hdf5") 189 | prediction_prob = model.predict_generator(val_seq, workers = False) 190 | 191 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val) 192 | print(OPTIMAL_THRESHOLD) 193 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8) 194 | print(f1_score( 195 | y_true = labels_val.reshape(-1), 196 | y_pred = prediction.reshape(-1) 197 | )) 198 | 199 | 200 | 201 | prediction_test = model.predict_generator(test_seq, workers = False) 202 | df_predicton = pd.read_csv("./data/sample_submission.csv") 203 | df_predicton["label"] = (prediction_test > OPTIMAL_THRESHOLD).astype(np.int8) 204 | df_predicton.to_csv("./prediction/prediction_elmo.csv", index = False) -------------------------------------------------------------------------------- /main_stack_hier.py: -------------------------------------------------------------------------------- 1 | from scripts.util import \ 2 | read_file, \ 3 | tokenize, make_embedding, text_to_sequences, \ 4 | sent_embedding, sent_tokenize, text_sents_to_sequences, f1 5 | from scripts.constant import DEFAULT_MAX_FEATURES 6 | from sklearn.model_selection import train_test_split 7 | from scripts.rnn import SARNNKeras, HARNN, AttLayer, RNNKeras, OriginalHARNN, AdditiveLayer 8 | from scripts.cnn import VDCNN, TextCNN, LSTMCNN 9 | from scripts.stack import StackedGeneralizerWithHier 10 | import argparse 11 | import os 12 | import numpy as np 13 | import datetime 14 | import pandas as pd 15 | from sklearn.metrics import f1_score 16 | 17 | from sklearn.linear_model import LogisticRegression 18 | from sklearn.ensemble import RandomForestClassifier 19 | from sklearn.neural_network import MLPClassifier 20 | 21 | from keras.utils import CustomObjectScope 22 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention 23 | 24 | 25 | 26 | def stack(models_list, 
hier_models_list, embedding_path, max_features, should_mix): 27 | model_name = '-'.join( 28 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' ')) 29 | 30 | train_data = read_file('./data/train.crash') 31 | test_data = read_file('./data/test.crash', is_train=False) 32 | 33 | train_tokenized_texts = tokenize(train_data['text']) 34 | test_tokenizes_texts = tokenize(test_data['text']) 35 | 36 | train_tokenized_texts_sent = sent_tokenize(train_data['text']) 37 | test_tokenizes_texts_sent = sent_tokenize(test_data['text']) 38 | 39 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1) 40 | 41 | embed_size, word_map, embedding_mat = make_embedding( 42 | list(train_tokenized_texts) + 43 | list(test_tokenizes_texts) if should_mix else train_tokenized_texts, 44 | embedding_path, 45 | max_features 46 | ) 47 | 48 | embed_size_sent, word_map_sent, embedding_mat_sent = sent_embedding( 49 | list(train_tokenized_texts_sent) + 50 | list(test_tokenizes_texts_sent) if should_mix else train_tokenized_texts_sent, 51 | embedding_path, 52 | max_features 53 | ) 54 | 55 | 56 | texts_id = text_to_sequences(train_tokenized_texts, word_map) 57 | texts_id_sent = text_sents_to_sequences( 58 | train_tokenized_texts_sent, 59 | word_map_sent, 60 | max_nb_sent = 3, 61 | max_sent_len = 50 62 | ) 63 | print('Number of train data: {}'.format(labels.shape)) 64 | 65 | texts_id_train, texts_id_val, texts_id_sent_train, texts_id_sent_val, labels_train, labels_val = train_test_split( 66 | texts_id, texts_id_sent, labels, test_size=0.05) 67 | 68 | model_path = './models/{}-version'.format(model_name) 69 | 70 | try: 71 | os.mkdir('./models') 72 | except: 73 | print('Folder already created') 74 | try: 75 | os.mkdir(model_path) 76 | except: 77 | print('Folder already created') 78 | 79 | batch_size = 16 80 | epochs = 100 81 | patience = 3 82 | 83 | # meta_model = RandomForestClassifier ( 84 | # n_estimators=200, 85 | # criterion="entropy", 86 | # max_depth=5, 87 | # max_features=0.5 88 | # ) 89 | # meta_model = MLPClassifier( 90 | # hidden_layer_sizes = (10), 91 | # early_stopping = True, 92 | # validation_fraction = 0.05, 93 | # batch_size = batch_size, 94 | # n_iter_no_change = patience 95 | # ) 96 | meta_model = LogisticRegression() 97 | 98 | models = [ 99 | model( 100 | embeddingMatrix=embedding_mat, 101 | embed_size=embed_size, 102 | max_features=embedding_mat.shape[0] 103 | ) 104 | for model in models_list 105 | ] 106 | 107 | hier_models = [ 108 | model( 109 | embeddingMatrix=embedding_mat_sent, 110 | embed_size=embed_size_sent, 111 | max_features=embedding_mat_sent.shape[0], 112 | max_nb_sent = 3, 113 | max_sent_len = 50 114 | ) 115 | for model in hier_models_list 116 | ] 117 | 118 | 119 | 120 | stack = StackedGeneralizerWithHier(models, hier_models, meta_model) 121 | stack.train_meta_model( 122 | X = texts_id_train, y = labels_train, 123 | X_val = texts_id_val, y_val = labels_val, 124 | X_hier = texts_id_sent_train, X_hier_val = texts_id_sent_val, 125 | model_path = model_path, 126 | epochs = epochs, 127 | batch_size = batch_size, 128 | patience = patience 129 | ) 130 | 131 | stack.train_models( 132 | X = texts_id_train, y = labels_train, 133 | X_val = texts_id_val, y_val = labels_val, 134 | X_hier = texts_id_sent_train, X_hier_val = texts_id_sent_val, 135 | batch_size = batch_size, 136 | epochs = epochs, 137 | patience = patience, 138 | model_path = model_path 139 | ) 140 | 141 | prediction = stack.predict(texts_id_val, texts_id_sent_val) 142 | print('F1 validation score: 
{}'.format(f1_score(prediction, labels_val))) 143 | with open('{}/f1'.format(model_path), 'w') as fp: 144 | fp.write(str(f1_score(prediction, labels_val))) 145 | 146 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map) 147 | test_id_texts_sent = text_sents_to_sequences(test_tokenizes_texts_sent, word_map_sent, 3, 50) 148 | test_prediction = stack.predict(test_id_texts, test_id_texts_sent) 149 | 150 | df_predicton = pd.read_csv("./data/sample_submission.csv") 151 | df_predicton["label"] = test_prediction 152 | 153 | print('Number of test data: {}'.format(df_predicton.shape[0])) 154 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False) 155 | 156 | 157 | 158 | if __name__ == '__main__': 159 | models_list = [ 160 | VDCNN, TextCNN, SARNNKeras, RNNKeras 161 | ] 162 | hier_models_list = [ 163 | OriginalHARNN, HARNN 164 | ] 165 | parser = argparse.ArgumentParser() 166 | parser.add_argument( 167 | '-e', 168 | '--embedding', 169 | help='Model use', 170 | default='./embeddings/smallFasttext.vi.vec' 171 | ) 172 | parser.add_argument( 173 | '--max', 174 | help='Model use', 175 | default=DEFAULT_MAX_FEATURES 176 | ) 177 | parser.add_argument( 178 | '--mix', 179 | action='store_true', 180 | help='Model use' 181 | ) 182 | args = parser.parse_args() 183 | with CustomObjectScope({ 184 | 'SeqSelfAttention': SeqSelfAttention, 185 | 'SeqWeightedAttention': SeqWeightedAttention, 186 | 'AttLayer': AttLayer, 187 | 'AdditiveLayer': AdditiveLayer, 188 | 'f1': f1 189 | }): 190 | stack(models_list, hier_models_list, args.embedding, 191 | int(args.max), args.mix) 192 | -------------------------------------------------------------------------------- /main_elmo.py: -------------------------------------------------------------------------------- 1 | from scripts.util import read_file, tokenize 2 | from sklearn.model_selection import train_test_split 3 | from keras.callbacks import EarlyStopping, ModelCheckpoint 4 | from scripts.rnn import RNNKeras, RNNKerasCPU, LSTMKeras, SARNNKerasCPU, SARNNKeras 5 | from scripts.cnn import TextCNN, LSTMCNN, VDCNN 6 | import argparse 7 | import os 8 | import numpy as np 9 | import datetime 10 | import pandas as pd 11 | from scripts.util import find_threshold 12 | from sklearn.metrics import f1_score 13 | from keras.utils import Sequence 14 | from elmoformanylangs import Embedder 15 | 16 | 17 | 18 | 19 | def train_model(model, embedding_path, should_find_threshold, return_prob, use_additive_emb): 20 | batch_size = 16 21 | epochs = 100 22 | max_len = 100 23 | 24 | def to_length(texts, length): 25 | def pad_func(vector, pad_width, iaxis, kwargs): 26 | str = kwargs.get('padder', '') 27 | vector[:pad_width[0]] = str 28 | vector[-pad_width[1]:] = str 29 | return vector 30 | 31 | ret = [] 32 | for sentence in texts: 33 | sentence = np.array([token.replace("_", " ") for token in sentence], dtype=np.unicode) 34 | sentence = sentence[:min(length, len(sentence))] 35 | if length > len(sentence): 36 | sentence = np.pad( 37 | sentence, mode=pad_func, 38 | pad_width=(0, length - len(sentence)) 39 | ) 40 | ret.append(sentence) 41 | 42 | return np.array(ret) 43 | 44 | class TrainSeq(Sequence): 45 | def __init__(self, X, y, batch_size, elmo): 46 | self._X, self._y = X, y 47 | self._batch_size = batch_size 48 | self._indices = np.arange(len(self._X)) 49 | self._elmo = elmo 50 | 51 | def __len__(self): 52 | return int(np.ceil(len(self._X) / float(self._batch_size))) 53 | 54 | def __getitem__(self, idx): 55 | id = self._indices[idx * self._batch_size:(idx + 1) * 
self._batch_size] 56 | return np.array(self._elmo.sents2elmo(self._X[id])), self._y[id] 57 | 58 | def on_epoch_end(self): 59 | np.random.shuffle(self._indices) 60 | 61 | class TestSeq(Sequence): 62 | def __init__(self, x, batch_size, elmo): 63 | self._X = x 64 | self._batch_size = batch_size 65 | self._elmo = elmo 66 | 67 | def __len__(self): 68 | return int(np.ceil(len(self._X) / float(self._batch_size))) 69 | 70 | def __getitem__(self, idx): 71 | return np.array(self._elmo.sents2elmo(self._X[idx * self._batch_size:(idx + 1) * self._batch_size])) 72 | 73 | model_name = '-'.join( 74 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' ')) 75 | 76 | elmo = Embedder(embedding_path, batch_size=batch_size) 77 | 78 | train_data = read_file('./data/train.crash') 79 | test_data = read_file('./data/test.crash', is_train=False) 80 | train_tokenized_texts = tokenize(train_data['text']) 81 | test_tokenizes_texts = tokenize(test_data['text']) 82 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1) 83 | 84 | texts = to_length(train_tokenized_texts, max_len) 85 | texts_test = to_length(test_tokenizes_texts, max_len) 86 | 87 | print('Number of train data: {}'.format(labels.shape)) 88 | 89 | texts_train, texts_val, labels_train, labels_val = train_test_split( 90 | texts, labels, 91 | test_size=0.05 92 | ) 93 | 94 | model_path = './models/{}-version'.format(model_name) 95 | 96 | try: 97 | os.mkdir('./models') 98 | except: 99 | print('Folder already created') 100 | try: 101 | os.mkdir(model_path) 102 | except: 103 | print('Folder already created') 104 | 105 | checkpoint = ModelCheckpoint( 106 | filepath='{}/models.hdf5'.format(model_path), 107 | monitor='val_f1', verbose=1, 108 | mode='max', 109 | save_best_only=True 110 | ) 111 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5) 112 | callbacks_list = [checkpoint, early] 113 | 114 | train_seq = TrainSeq(texts_train, labels_train, batch_size=batch_size, elmo = elmo) 115 | val_seq = TrainSeq(texts_val, labels_val, batch_size=min(batch_size, len(texts_val)), elmo = elmo) 116 | test_seq = TestSeq(texts_test, batch_size=min(batch_size, len(texts_test)), elmo = elmo) 117 | 118 | model = model( 119 | maxlen = max_len, 120 | embed_size=1024, 121 | use_fasttext = True, 122 | use_additive_emb = use_additive_emb 123 | ) 124 | model.fit_generator( 125 | train_seq, 126 | validation_data=val_seq, 127 | callbacks=callbacks_list, 128 | epochs=epochs, 129 | workers=False 130 | ) 131 | 132 | model.load_weights('{}/models.hdf5'.format(model_path)) 133 | prediction_prob = model.predict_generator(val_seq, workers=False) 134 | if should_find_threshold: 135 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val) 136 | else: 137 | OPTIMAL_THRESHOLD = 0.5 138 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD)) 139 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8) 140 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val))) 141 | with open('{}/f1'.format(model_path), 'w') as fp: 142 | fp.write(str(f1_score(prediction, labels_val))) 143 | 144 | test_prediction = model.predict_generator(test_seq, workers=False) 145 | 146 | df_predicton = pd.read_csv("./data/sample_submission.csv") 147 | if return_prob: 148 | df_predicton["label"] = test_prediction 149 | else: 150 | df_predicton["label"] = ( 151 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8) 152 | 153 | print('Number of test data: {}'.format(df_predicton.shape[0])) 154 | 
df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False) 155 | 156 | 157 | model_dict = { 158 | 'RNNKeras': RNNKeras, 159 | 'RNNKerasCPU': RNNKerasCPU, 160 | 'LSTMKeras': LSTMKeras, 161 | 'SARNNKerasCPU': SARNNKerasCPU, 162 | 'SARNNKeras': SARNNKeras, 163 | 'TextCNN': TextCNN, 164 | 'LSTMCNN': LSTMCNN, 165 | 'VDCNN': VDCNN 166 | } 167 | 168 | if __name__ == '__main__': 169 | parser = argparse.ArgumentParser() 170 | parser.add_argument( 171 | '-m', 172 | '--model', 173 | help='Model use', 174 | default='RNNKerasCPU' 175 | ) 176 | parser.add_argument( 177 | '-e', 178 | '--embedding', 179 | help='Model use', 180 | default='./embeddings/smallFasttext.vi.vec' 181 | ) 182 | parser.add_argument( 183 | '--find_threshold', 184 | action='store_true', 185 | help='Model use' 186 | ) 187 | parser.add_argument( 188 | '--prob', 189 | action='store_true', 190 | help='Model use' 191 | ) 192 | parser.add_argument( 193 | '--add_embed', 194 | action='store_true', 195 | help='Model use' 196 | ) 197 | args = parser.parse_args() 198 | if not args.model in model_dict: 199 | raise RuntimeError('Model not found') 200 | train_model(model_dict[args.model], args.embedding, args.find_threshold, args.prob, args.add_embed) 201 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences 2 | from scripts.rnn import RNNKeras 3 | from scripts.constant import DEFAULT_MAX_FEATURES 4 | from sklearn.model_selection import train_test_split 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint 6 | from scripts.rnn import RNNKeras, RNNKerasCPU, LSTMKeras, SARNNKerasCPU, SARNNKeras 7 | from scripts.cnn import TextCNN, LSTMCNN, VDCNN 8 | import argparse 9 | import os 10 | import numpy as np 11 | import datetime 12 | import pandas as pd 13 | from scripts.util import find_threshold 14 | from scripts.augment import similar_augment, create_sim_dict, similar_augment_from_sim_dict 15 | from sklearn.metrics import f1_score 16 | from keras.utils.vis_utils import plot_model 17 | 18 | 19 | def train_model( 20 | model, embedding_path, annoy_path, 21 | max_features, should_find_threshold, should_mix, 22 | return_prob, trainable, use_additive_emb, augment_size, use_sim_dict, 23 | print_model, model_high 24 | ): 25 | model_name = '-'.join( 26 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' ')) 27 | 28 | augment_size = int(augment_size) 29 | 30 | train_data = read_file('./data/train.crash') 31 | test_data = read_file('./data/test.crash', is_train=False) 32 | train_tokenized_texts = tokenize(train_data['text']) 33 | test_tokenizes_texts = tokenize(test_data['text']) 34 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1) 35 | 36 | train_tokenized_texts, val_tokenized_texts, labels_train, labels_val = train_test_split( 37 | train_tokenized_texts, labels, test_size = 0.05 38 | ) 39 | 40 | 41 | if augment_size != 0 and not use_sim_dict: 42 | if augment_size < 0: 43 | augment_size = len(train_tokenized_texts) * (-augment_size) 44 | 45 | print(augment_size) 46 | 47 | train_tokenized_texts, labels_train = similar_augment( 48 | train_tokenized_texts, 49 | labels_train, 50 | n_increase = augment_size, 51 | model_path = embedding_path, 52 | n_word_replace = 10, 53 | use_annoy=True, 54 | annoy_path=annoy_path 55 | ) 56 | 57 | 58 | embed_size, word_map, embedding_mat = make_embedding( 59 | 
list(train_tokenized_texts) + list(val_tokenized_texts) + 60 | list(test_tokenizes_texts) if should_mix else list(train_tokenized_texts) + list(val_tokenized_texts), 61 | embedding_path, 62 | max_features 63 | ) 64 | 65 | texts_id_train = text_to_sequences(train_tokenized_texts, word_map) 66 | 67 | if augment_size != 0 and use_sim_dict: 68 | if augment_size < 0: 69 | augment_size = len(train_tokenized_texts) * (-augment_size) 70 | sim_dict = create_sim_dict(word_map, model_path = embedding_path, annoy_path = annoy_path) 71 | print("Finish Creating sim dict") 72 | texts_id_train, labels_train = similar_augment_from_sim_dict( 73 | texts_id_train, labels_train, sim_dict, 74 | n_increase = augment_size 75 | ) 76 | 77 | texts_id_val = text_to_sequences(val_tokenized_texts, word_map) 78 | print('Number of train data: {}'.format(labels.shape)) 79 | 80 | # texts_id_train, texts_id_val, labels_train, labels_val = train_test_split( 81 | # texts_id, labels, test_size=0.05) 82 | 83 | model_path = './models/{}-version'.format(model_name) 84 | 85 | try: 86 | os.mkdir('./models') 87 | except: 88 | print('Folder already created') 89 | try: 90 | os.mkdir(model_path) 91 | except: 92 | print('Folder already created') 93 | 94 | checkpoint = ModelCheckpoint( 95 | filepath='{}/models.hdf5'.format(model_path), 96 | monitor='val_f1', verbose=1, 97 | mode='max', 98 | save_best_only=True 99 | ) 100 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5) 101 | callbacks_list = [checkpoint, early] 102 | batch_size = 16 103 | epochs = 100 104 | 105 | model = model( 106 | embeddingMatrix=embedding_mat, 107 | embed_size=embed_size, 108 | max_features=embedding_mat.shape[0], 109 | trainable = trainable, 110 | use_additive_emb = use_additive_emb 111 | ) 112 | if print_model: 113 | plot_model(model, to_file='{}.png'.format(model_high), show_shapes=True, show_layer_names=True) 114 | return 115 | model.fit( 116 | texts_id_train, labels_train, 117 | validation_data=(texts_id_val, labels_val), 118 | callbacks=callbacks_list, 119 | epochs=epochs, 120 | batch_size=batch_size 121 | ) 122 | 123 | model.load_weights('{}/models.hdf5'.format(model_path)) 124 | prediction_prob = model.predict(texts_id_val) 125 | if should_find_threshold: 126 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val) 127 | else: 128 | OPTIMAL_THRESHOLD = 0.5 129 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD)) 130 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8) 131 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val))) 132 | with open('{}/f1'.format(model_path), 'w') as fp: 133 | fp.write(str(f1_score(prediction, labels_val))) 134 | 135 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map) 136 | test_prediction = model.predict(test_id_texts) 137 | 138 | df_predicton = pd.read_csv("./data/sample_submission.csv") 139 | if return_prob: 140 | df_predicton["label"] = test_prediction 141 | else: 142 | df_predicton["label"] = ( 143 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8) 144 | 145 | print('Number of test data: {}'.format(df_predicton.shape[0])) 146 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False) 147 | 148 | 149 | model_dict = { 150 | 'RNNKeras': RNNKeras, 151 | 'RNNKerasCPU': RNNKerasCPU, 152 | 'LSTMKeras': LSTMKeras, 153 | 'SARNNKerasCPU': SARNNKerasCPU, 154 | 'SARNNKeras': SARNNKeras, 155 | 'TextCNN': TextCNN, 156 | 'LSTMCNN': LSTMCNN, 157 | 'VDCNN': VDCNN 158 | } 159 | 160 | if __name__ == '__main__': 161 | parser = 
argparse.ArgumentParser() 162 | parser.add_argument( 163 | '-m', 164 | '--model', 165 | help='Model use', 166 | default='RNNKerasCPU' 167 | ) 168 | parser.add_argument( 169 | '-e', 170 | '--embedding', 171 | help='Model use', 172 | default='./embeddings/smallFasttext.vi.vec' 173 | ) 174 | parser.add_argument( 175 | '-annoy', 176 | '--annoy', 177 | help='Model use', 178 | default='./embeddings/annoy.pkl' 179 | ) 180 | parser.add_argument( 181 | '--max', 182 | help='Model use', 183 | default=DEFAULT_MAX_FEATURES 184 | ) 185 | parser.add_argument( 186 | '--aug', 187 | help='Model use', 188 | default=0 189 | ) 190 | parser.add_argument( 191 | '--use_sim_dict', 192 | action='store_true', 193 | help='Model use' 194 | ) 195 | parser.add_argument( 196 | '--find_threshold', 197 | action='store_true', 198 | help='Model use' 199 | ) 200 | parser.add_argument( 201 | '--mix', 202 | action='store_true', 203 | help='Model use' 204 | ) 205 | parser.add_argument( 206 | '--prob', 207 | action='store_true', 208 | help='Model use' 209 | ) 210 | parser.add_argument( 211 | '--fix_embed', 212 | action='store_false', 213 | help='Model use' 214 | ) 215 | parser.add_argument( 216 | '--add_embed', 217 | action='store_true', 218 | help='Model use' 219 | ) 220 | parser.add_argument( 221 | '--print_model', 222 | action='store_true', 223 | help='Model use' 224 | ) 225 | args = parser.parse_args() 226 | if not args.model in model_dict: 227 | raise RuntimeError('Model not found') 228 | train_model(model_dict[args.model], args.embedding, args.annoy, 229 | int(args.max), args.find_threshold, args.mix, args.prob, args.fix_embed, args.add_embed, args.aug, 230 | args.use_sim_dict, args.print_model, args.model) 231 | -------------------------------------------------------------------------------- /main_hierarchical.py: -------------------------------------------------------------------------------- 1 | from scripts.util import read_file, sent_tokenize, sent_embedding, text_sents_to_sequences 2 | from scripts.constant import DEFAULT_MAX_FEATURES 3 | from sklearn.model_selection import train_test_split 4 | from keras.callbacks import EarlyStopping, ModelCheckpoint 5 | from scripts.rnn import HRNN, HRNNCPU, OriginalHARNN, OriginalHARNNCPU, HARNN, HARNNCPU 6 | import argparse 7 | import os 8 | import numpy as np 9 | import datetime 10 | import pandas as pd 11 | from scripts.util import find_threshold 12 | from scripts.augment import shuffle_augment 13 | from sklearn.metrics import f1_score 14 | from keras.utils.vis_utils import plot_model 15 | 16 | 17 | def train_model( 18 | model, embedding_path, 19 | max_features, max_nb_sent, max_sent_len, 20 | should_find_threshold, should_mix, 21 | return_prob, trainable, use_additive_emb, augment_size, aug_min_len, print_model, model_high 22 | ): 23 | model_name = '-'.join( 24 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' ')) 25 | 26 | train_data = read_file('./data/train.crash') 27 | test_data = read_file('./data/test.crash', is_train=False) 28 | train_tokenized_texts = sent_tokenize(train_data['text']) 29 | test_tokenizes_texts = sent_tokenize(test_data['text']) 30 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1) 31 | 32 | train_tokenized_texts, val_tokenized_texts, labels_train, labels_val = train_test_split( 33 | train_tokenized_texts, labels, test_size=0.05 34 | ) 35 | 36 | augment_size = int(augment_size) 37 | aug_min_len = int(aug_min_len) 38 | max_nb_sent = int(max_nb_sent) 39 | max_sent_len = int(max_sent_len) 40 | 41 | if augment_size != 0: 
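The `--aug` flag (forwarded into `train_model` as `augment_size`) is interpreted the same way in `main.py` and `main_hierarchical.py`: a non-negative value asks for that many extra training samples, while a negative value is treated as a multiplier of the current training-set size, as the branch just below shows. A minimal sketch of that interpretation; the sample numbers are hypothetical:

```python
# Minimal sketch of how the --aug value is resolved (mirrors the branch below);
# the example numbers are hypothetical.
def resolve_augment_size(aug, n_train):
    # negative --aug means "add aug times the training-set size" extra samples
    return n_train * (-aug) if aug < 0 else aug

assert resolve_augment_size(500, 10000) == 500    # add exactly 500 samples
assert resolve_augment_size(-2, 10000) == 20000   # add 2x the training set
```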
42 | if augment_size < 0: 43 | augment_size = len(train_tokenized_texts) * (-augment_size) 44 | 45 | print(augment_size) 46 | 47 | train_tokenized_texts, labels_train = shuffle_augment( 48 | train_tokenized_texts, 49 | labels_train, 50 | n_increase = augment_size, 51 | min_length = aug_min_len 52 | ) 53 | 54 | embed_size, word_map, embedding_mat = sent_embedding( 55 | list(train_tokenized_texts) + list(val_tokenized_texts) + 56 | list(test_tokenizes_texts) if should_mix 57 | else list(train_tokenized_texts) + list(val_tokenized_texts), 58 | embedding_path, 59 | max_features 60 | ) 61 | 62 | texts_id_train = text_sents_to_sequences( 63 | train_tokenized_texts, 64 | word_map, 65 | max_nb_sent = max_nb_sent, 66 | max_sent_len = max_sent_len 67 | ) 68 | 69 | texts_id_val = text_sents_to_sequences( 70 | val_tokenized_texts, 71 | word_map, 72 | max_nb_sent = max_nb_sent, 73 | max_sent_len = max_sent_len 74 | ) 75 | 76 | 77 | # texts_id = text_sents_to_sequences( 78 | # train_tokenized_texts, 79 | # word_map, 80 | # max_nb_sent = max_nb_sent, 81 | # max_sent_len = max_sent_len 82 | # ) 83 | print('Number of train data: {}'.format(labels.shape)) 84 | 85 | # texts_id_train, texts_id_val, labels_train, labels_val = train_test_split( 86 | # texts_id, labels, test_size=0.05) 87 | 88 | model_path = './models/{}-version'.format(model_name) 89 | 90 | try: 91 | os.mkdir('./models') 92 | except: 93 | print('Folder already created') 94 | try: 95 | os.mkdir(model_path) 96 | except: 97 | print('Folder already created') 98 | 99 | checkpoint = ModelCheckpoint( 100 | filepath='{}/models.hdf5'.format(model_path), 101 | monitor='val_f1', verbose=1, 102 | mode='max', 103 | save_best_only=True 104 | ) 105 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5) 106 | callbacks_list = [checkpoint, early] 107 | batch_size = 16 108 | epochs = 100 109 | 110 | model = model( 111 | embeddingMatrix=embedding_mat, 112 | embed_size=embed_size, 113 | max_features=embedding_mat.shape[0], 114 | max_nb_sent = max_nb_sent, 115 | max_sent_len = max_sent_len, 116 | trainable = trainable, 117 | use_additive_emb = use_additive_emb 118 | ) 119 | if print_model: 120 | plot_model(model, to_file='{}.png'.format(model_high), show_shapes=True, show_layer_names=True) 121 | return 122 | model.fit( 123 | texts_id_train, labels_train, 124 | validation_data=(texts_id_val, labels_val), 125 | callbacks=callbacks_list, 126 | epochs=epochs, 127 | batch_size=batch_size 128 | ) 129 | 130 | model.load_weights('{}/models.hdf5'.format(model_path)) 131 | prediction_prob = model.predict(texts_id_val) 132 | 133 | if should_find_threshold: 134 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val) 135 | else: 136 | OPTIMAL_THRESHOLD = 0.5 137 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD)) 138 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8) 139 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val))) 140 | with open('{}/f1'.format(model_path), 'w') as fp: 141 | fp.write(str(f1_score(prediction, labels_val))) 142 | 143 | test_id_texts = text_sents_to_sequences( 144 | test_tokenizes_texts, 145 | word_map, 146 | max_nb_sent = max_nb_sent, 147 | max_sent_len = max_sent_len 148 | ) 149 | test_prediction = model.predict(test_id_texts) 150 | 151 | df_predicton = pd.read_csv("./data/sample_submission.csv") 152 | 153 | if return_prob: 154 | df_predicton["label"] = test_prediction 155 | else: 156 | df_predicton["label"] = ( 157 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8) 158 | 
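For reference, `find_threshold` (defined in `scripts/util.py` further down) can be read as a sweep over candidate cut-offs that keeps the one maximising validation F1; the chosen threshold is then reused above to binarise the test predictions. A self-contained sketch of that idea, with illustrative names:

```python
# Sketch of an F1-maximising threshold sweep; candidate thresholds are the
# midpoints between consecutive sorted probabilities, similar in spirit to
# find_threshold in scripts/util.py. All names here are illustrative.
import numpy as np
from sklearn.metrics import f1_score

def sweep_threshold(probs, y_true):
    probs = np.asarray(probs).ravel()
    candidates = np.sort(probs)
    best_f1, best_thr = 0.0, 0.5
    for lo, hi in zip(candidates[:-1], candidates[1:]):
        thr = (lo + hi) / 2.0
        score = f1_score(y_true, (probs > thr).astype(np.int8))
        if score > best_f1:
            best_f1, best_thr = score, thr
    return best_thr
```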
print('Number of test data: {}'.format(df_predicton.shape[0])) 159 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False) 160 | 161 | 162 | model_dict = { 163 | 'HRNN': HRNN, 164 | 'HRNNCPU': HRNNCPU, 165 | 'HARNN': HARNN, 166 | 'HARNNCPU': HARNNCPU, 167 | 'OriginalHARNN': OriginalHARNN, 168 | 'OriginalHARNNCPU':OriginalHARNNCPU 169 | } 170 | 171 | if __name__ == '__main__': 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument( 174 | '-m', 175 | '--model', 176 | help='Model use', 177 | default='HRNN' 178 | ) 179 | parser.add_argument( 180 | '-e', 181 | '--embedding', 182 | help='Model use', 183 | default='./embeddings/smallFasttext.vi.vec' 184 | ) 185 | parser.add_argument( 186 | '--max', 187 | help='Model use', 188 | default=DEFAULT_MAX_FEATURES 189 | ) 190 | parser.add_argument( 191 | '--nb_sent', 192 | help='Model use', 193 | default=3 194 | ) 195 | parser.add_argument( 196 | '--sent_len', 197 | help='Model use', 198 | default=50 199 | ) 200 | parser.add_argument( 201 | '--aug', 202 | help='Model use', 203 | default=0 204 | ) 205 | parser.add_argument( 206 | '--aug_min_len', 207 | help='Model use', 208 | default=1 209 | ) 210 | parser.add_argument( 211 | '--find_threshold', 212 | action='store_true', 213 | help='Model use' 214 | ) 215 | parser.add_argument( 216 | '--mix', 217 | action='store_true', 218 | help='Model use' 219 | ) 220 | parser.add_argument( 221 | '--prob', 222 | action='store_true', 223 | help='Model use' 224 | ) 225 | parser.add_argument( 226 | '--fix_embed', 227 | action='store_false', 228 | help='Model use' 229 | ) 230 | parser.add_argument( 231 | '--add_embed', 232 | action='store_true', 233 | help='Model use' 234 | ) 235 | parser.add_argument( 236 | '--print_model', 237 | action='store_true', 238 | help='Model use' 239 | ) 240 | args = parser.parse_args() 241 | if not args.model in model_dict: 242 | raise RuntimeError('Model not found') 243 | train_model( 244 | model_dict[args.model], args.embedding, 245 | int(args.max), args.nb_sent, args.sent_len, 246 | args.find_threshold, args.mix, args.prob, 247 | args.fix_embed, args.add_embed, args.aug, args.aug_min_len, args.print_model, args.model 248 | ) 249 | -------------------------------------------------------------------------------- /scripts/util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import copy 3 | import os 4 | import numpy as np 5 | import re 6 | import keras.backend as K 7 | 8 | from tqdm import tqdm 9 | from collections import defaultdict 10 | from os.path import abspath 11 | from spacy.lang.vi import Vietnamese 12 | from .constant import DEFAULT_MAX_LENGTH 13 | from gensim.models.keyedvectors import KeyedVectors 14 | from sklearn.metrics import f1_score 15 | import string 16 | 17 | 18 | def split_array(arr, condition): 19 | if len(arr) == 0: 20 | return [] 21 | result = [] 22 | accumulated = [arr[0]] 23 | for ele in arr[1:]: 24 | if condition(ele): 25 | result.append(copy.deepcopy(accumulated)) 26 | accumulated = [copy.deepcopy(ele)] 27 | else: 28 | accumulated.append(copy.deepcopy(ele)) 29 | result.append(copy.deepcopy(accumulated)) 30 | return result 31 | 32 | 33 | def read_file(file_path, is_train=True): 34 | file_path = abspath(file_path) 35 | data_lines = list( 36 | filter(lambda x: x != '', open(file_path).read().split('\n'))) 37 | pattern = ('train' if is_train else 'test') + '_[0-9]{5}' 38 | datas = split_array(data_lines, lambda x: bool(re.match(pattern, x))) 39 | if is_train: 40 | result_array 
= list(map( 41 | lambda x: [x[0], ' '.join(x[1:-1]), int(x[-1])], datas)) 42 | else: 43 | result_array = list(map(lambda x: [x[0], ' '.join(x[1:])], datas)) 44 | columns = ['name', 'text', 'label'] if is_train else ['name', 'text'] 45 | return pd.DataFrame(result_array, columns=columns) 46 | 47 | 48 | def tokenize(texts): 49 | nlp = Vietnamese() 50 | docs = [] 51 | for text in texts: 52 | tokens = np.array([postprocess_token(token.text) for token in nlp(text.lower())[1:-1]]) 53 | docs.append(tokens) 54 | 55 | return docs 56 | 57 | 58 | def postprocess_token(token): 59 | if token in string.punctuation: 60 | return '' 61 | elif token.isdigit(): 62 | return '' 63 | else: 64 | return token 65 | 66 | 67 | 68 | def make_embedding(texts, embedding_path, max_features): 69 | embedding_path = abspath(embedding_path) 70 | 71 | def get_coefs(word, *arr): 72 | return word, np.asarray(arr, dtype='float32') 73 | 74 | if embedding_path.endswith('.vec'): 75 | embedding_index = dict(get_coefs(*o.strip().split(" ")) 76 | for o in open(embedding_path)) 77 | mean_embedding = np.mean(np.array(list(embedding_index.values()))) 78 | elif embedding_path.endswith('bin'): 79 | embedding_index = KeyedVectors.load_word2vec_format( 80 | embedding_path, binary=True) 81 | mean_embedding = np.mean(embedding_index.vectors, axis=0) 82 | embed_size = mean_embedding.shape[0] 83 | word_index = sorted(list({word.lower() for sentence in texts for word in sentence})) 84 | nb_words = min(max_features, len(word_index)) 85 | embedding_matrix = np.zeros((nb_words + 1, embed_size)) 86 | i = 1 87 | word_map = defaultdict(lambda: nb_words) 88 | for word in word_index: 89 | if i >= max_features: 90 | continue 91 | if word in embedding_index: 92 | embedding_matrix[i] = embedding_index[word] 93 | else: 94 | embedding_matrix[i] = mean_embedding 95 | word_map[word] = i 96 | i += 1 97 | 98 | embedding_matrix[-1] = mean_embedding 99 | return embed_size, word_map, embedding_matrix 100 | 101 | def text_to_sequences(texts, word_map, max_len=DEFAULT_MAX_LENGTH): 102 | texts_id = [] 103 | for sentence in texts: 104 | sentence = [word_map[word.lower()] for word in sentence][:max_len] 105 | padded_setence = np.pad( 106 | sentence, (0, max(0, max_len - len(sentence))), 'constant', constant_values=0) 107 | texts_id.append(padded_setence) 108 | return np.array(texts_id) 109 | 110 | def find_threshold(pred_proba, y_true, metric = f1_score): 111 | cur_acc = 0 112 | cur_thres = 0 113 | for ind in range(len(pred_proba) - 1): 114 | threshold = (pred_proba[ind][0] + pred_proba[ind + 1][0]) / 2 115 | pred = (pred_proba > threshold).astype(np.int8) 116 | acc = metric(pred, y_true) 117 | if acc > cur_acc: 118 | cur_thres = threshold 119 | cur_acc = acc 120 | 121 | return cur_thres 122 | 123 | def f1(y_true, y_pred): 124 | def recall(y_true, y_pred): 125 | """Recall metric. 126 | 127 | Only computes a batch-wise average of recall. 128 | 129 | Computes the recall, a metric for multi-label classification of 130 | how many relevant items are selected. 131 | """ 132 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 133 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 134 | recall = true_positives / (possible_positives + K.epsilon()) 135 | return recall 136 | 137 | def precision(y_true, y_pred): 138 | """Precision metric. 139 | 140 | Only computes a batch-wise average of precision. 141 | 142 | Computes the precision, a metric for multi-label classification of 143 | how many selected items are relevant. 
144 | """ 145 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 146 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 147 | precision = true_positives / (predicted_positives + K.epsilon()) 148 | return precision 149 | precision = precision(y_true, y_pred) 150 | recall = recall(y_true, y_pred) 151 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 152 | 153 | def predictions_to_submission(test_data, predictor): 154 | tqdm.pandas() 155 | submission = test_data[['id']] 156 | submission['label'] = test_data['text'].progress_apply(predictor) 157 | return submission 158 | 159 | 160 | # HELPERS FOR HIERARCHICAL MODEL: 161 | def sent_tokenize(texts): 162 | nlp = Vietnamese() 163 | nlp.add_pipe(nlp.create_pipe('sentencizer')) 164 | docs = [] 165 | for text in texts: 166 | text_tokenized = [] 167 | if (len(text) > 3): 168 | for sentence in nlp(text.lower()[1:-1]).sents: 169 | sent_tokens = np.array([postprocess_token(token.text) for token in sentence]) 170 | text_tokenized.append(sent_tokens) 171 | else: 172 | text_tokenized.append([]) 173 | docs.append(text_tokenized) 174 | 175 | return docs 176 | 177 | 178 | def sent_embedding(tokenized_texts, embedding_path, max_features): 179 | embedding_path = abspath(embedding_path) 180 | 181 | def get_coefs(word, *arr): 182 | return word, np.asarray(arr, dtype='float32') 183 | 184 | if embedding_path.endswith('.vec'): 185 | embedding_index = dict(get_coefs(*o.strip().split(" ")) 186 | for o in open(embedding_path)) 187 | mean_embedding = np.mean(np.array(list(embedding_index.values()))) 188 | elif embedding_path.endswith('bin'): 189 | embedding_index = KeyedVectors.load_word2vec_format( 190 | embedding_path, binary=True) 191 | mean_embedding = np.mean(embedding_index.vectors, axis=0) 192 | embed_size = mean_embedding.shape[0] 193 | word_index = {word.lower() for text in tokenized_texts for sentence in text for word in sentence} 194 | nb_words = min(max_features, len(word_index)) 195 | embedding_matrix = np.zeros((nb_words + 1, embed_size)) 196 | 197 | i = 1 198 | word_map = defaultdict(lambda: nb_words) 199 | for word in word_index: 200 | if i >= max_features: 201 | continue 202 | if word in embedding_index: 203 | embedding_matrix[i] = embedding_index[word] 204 | else: 205 | embedding_matrix[i] = mean_embedding 206 | word_map[word] = i 207 | i += 1 208 | embedding_matrix[-1] = mean_embedding 209 | return embed_size, word_map, embedding_matrix 210 | 211 | def text_sents_to_sequences(texts, word_map, max_nb_sent, max_sent_len): 212 | ret = [] 213 | for i in range(len(texts)): 214 | text_vecs = [] 215 | for j in range(len(texts[i])): 216 | if (j < max_nb_sent): 217 | sent_vecs = [] 218 | for k in range(len(texts[i][j])): 219 | if (k < max_sent_len): 220 | sent_vecs.append(word_map[texts[i][j][k]]) 221 | if (len(sent_vecs) < max_sent_len): 222 | sent_vecs = np.pad( 223 | sent_vecs, 224 | (0, max(0, max_sent_len - len(sent_vecs))), 225 | 'constant', 226 | constant_values=0 227 | ) 228 | text_vecs.append(sent_vecs) 229 | 230 | 231 | if (len(text_vecs) < max_nb_sent): 232 | text_vecs = np.pad( 233 | text_vecs, 234 | ((0, max_nb_sent - len(text_vecs)), (0, 0)), 235 | 'constant', 236 | constant_values=0 237 | ) 238 | 239 | ret.append(text_vecs) 240 | 241 | return np.array(ret) 242 | -------------------------------------------------------------------------------- /scripts/stack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import KFold 3 
| from scripts.util import f1 4 | 5 | 6 | from keras.callbacks import EarlyStopping, ModelCheckpoint 7 | from keras.models import load_model 8 | 9 | from keras.models import Model 10 | from keras.layers import \ 11 | Dense, Embedding, Input, \ 12 | Conv1D, MaxPool1D, \ 13 | Dropout, BatchNormalization, \ 14 | Bidirectional, CuDNNLSTM, \ 15 | Concatenate, Flatten, Add 16 | 17 | 18 | 19 | class StackedGeneralizer: 20 | 21 | def __init__(self, models, meta_model): 22 | self._models = models 23 | self._meta_model = meta_model 24 | return 25 | 26 | 27 | def train_models(self, X, y, X_val, y_val, model_path, epochs, batch_size, patience): 28 | for ind in range(len(self._models)): 29 | checkpoint = ModelCheckpoint( 30 | filepath='{}/models.hdf5'.format(model_path), 31 | monitor='val_f1', verbose=1, 32 | mode='max', 33 | save_best_only=True 34 | ) 35 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience) 36 | callbacks_list = [checkpoint, early] 37 | self._models[ind].compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 38 | self._models[ind].fit( 39 | X, y, 40 | validation_data= (X_val, y_val), 41 | callbacks=callbacks_list, 42 | epochs=epochs, 43 | batch_size=batch_size 44 | ) 45 | self._models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path)) 46 | 47 | 48 | 49 | def train_meta_model(self, X, y, X_val, y_val, model_path, epochs, batch_size, patience): 50 | 51 | # Obtain level-1 input from each model: 52 | meta_input = np.zeros((len(X), len(self._models))) 53 | 54 | for ind in range(len(self._models)): 55 | pred = np.zeros(len(X)) 56 | kf = KFold(n_splits = 5, shuffle = False) 57 | model = self._models[ind] 58 | # model.save(filepath='{}/dumped.hdf5'.format(model_path)) 59 | weights = model.get_weights() 60 | 61 | 62 | for train_index, test_index in kf.split(X): 63 | X_train, X_test = X[train_index], X[test_index] 64 | y_train, y_test = y[train_index], y[test_index] 65 | 66 | 67 | checkpoint = ModelCheckpoint( 68 | filepath='{}/models.hdf5'.format(model_path), 69 | monitor='val_f1', verbose=1, 70 | mode='max', 71 | save_best_only=True 72 | ) 73 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience) 74 | callbacks_list = [checkpoint, early] 75 | model.fit( 76 | X_train, y_train, 77 | validation_data= (X_val, y_val), 78 | callbacks=callbacks_list, 79 | epochs=epochs, 80 | batch_size=batch_size 81 | ) 82 | 83 | model.set_weights(weights) 84 | pred[test_index] = model.predict(X_test).reshape(-1) 85 | 86 | # Reset model: 87 | model = load_model(filepath='{}/dumped.hdf5'.format(model_path)) 88 | # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1]) 89 | 90 | 91 | meta_input[:, ind] = pred 92 | 93 | 94 | self._meta_model.fit(meta_input, y) 95 | 96 | 97 | def predict(self, X): 98 | meta_input = self.compute_meta_data(X) 99 | return (self._meta_model.predict(meta_input) > 0.5).astype(np.int8) 100 | 101 | 102 | def compute_meta_data(self, X): 103 | prediction = np.zeros((len(X), len(self._models))) 104 | for ind in range(len(self._models)): 105 | pred = self._models[ind].predict(X).reshape(len(X), 1).reshape(-1) 106 | prediction[:, ind] = pred 107 | 108 | return prediction 109 | 110 | def load_weights(self, paths): 111 | for ind in range(len(self._models)): 112 | self._models[ind].load_weights(paths[ind]) 113 | 114 | 115 | class StackedGeneralizerWithHier: 116 | def __init__(self, models, hier_models, meta_model): 117 | self._models = models 118 | self._hier_models = hier_models 119 | 120 | 
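Both stacking classes rely on the same pattern when building out-of-fold predictions: the freshly compiled base model's weights are cached once with `get_weights()` and restored with `set_weights()` after each fold, so every fold starts from the same initialisation. A toy, self-contained sketch of that cache-and-restore pattern (the stand-in model and random data are illustrative, and checkpointing is omitted):

```python
import numpy as np
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense

# Toy stand-in model and data, only to show the cache-and-restore pattern.
model = Sequential([Dense(8, activation='relu', input_shape=(4,)),
                    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam')

X = np.random.rand(100, 4).astype('float32')
y = (np.random.rand(100) > 0.5).astype('float32')
oof_pred = np.zeros(len(X))

initial_weights = model.get_weights()            # cache once, before any fold
for train_idx, test_idx in KFold(n_splits=5).split(X):
    model.fit(X[train_idx], y[train_idx], epochs=1, verbose=0)
    oof_pred[test_idx] = model.predict(X[test_idx]).reshape(-1)
    model.set_weights(initial_weights)           # reset before the next fold
```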
self._meta_model = meta_model 121 | return 122 | 123 | def train_models(self, X, y, X_val, y_val, X_hier, X_hier_val, model_path, epochs, batch_size, 124 | patience): 125 | 126 | for ind in range(len(self._models)): 127 | checkpoint = ModelCheckpoint( 128 | filepath='{}/models.hdf5'.format(model_path), 129 | monitor='val_f1', verbose=1, 130 | mode='max', 131 | save_best_only=True 132 | ) 133 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience) 134 | callbacks_list = [checkpoint, early] 135 | self._models[ind].compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1]) 136 | self._models[ind].fit( 137 | X, y, 138 | validation_data=(X_val, y_val), 139 | callbacks=callbacks_list, 140 | epochs=epochs, 141 | batch_size=batch_size 142 | ) 143 | self._models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path)) 144 | 145 | for ind in range(len(self._hier_models)): 146 | checkpoint = ModelCheckpoint( 147 | filepath='{}/models.hdf5'.format(model_path), 148 | monitor='val_f1', verbose=1, 149 | mode='max', 150 | save_best_only=True 151 | ) 152 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience) 153 | callbacks_list = [checkpoint, early] 154 | self._hier_models[ind].compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1]) 155 | self._hier_models[ind].fit( 156 | X_hier, y, 157 | validation_data=(X_hier_val, y_val), 158 | callbacks=callbacks_list, 159 | epochs=epochs, 160 | batch_size=batch_size 161 | ) 162 | self._hier_models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path)) 163 | 164 | def train_meta_model(self, X, y, X_val, y_val, X_hier, X_hier_val, model_path, epochs, 165 | batch_size, patience): 166 | 167 | # Obtain level-1 input from each model: 168 | meta_input = np.zeros((len(X), len(self._models) + len(self._hier_models))) 169 | 170 | for ind in range(len(self._hier_models)): 171 | pred = np.zeros(len(X)) 172 | kf = KFold(n_splits=5, shuffle=False) 173 | model = self._hier_models[ind] 174 | weights = model.get_weights() 175 | 176 | 177 | for train_index, test_index in kf.split(X_hier): 178 | X_train, X_test = X_hier[train_index], X_hier[test_index] 179 | y_train, y_test = y[train_index], y[test_index] 180 | 181 | checkpoint = ModelCheckpoint( 182 | filepath='{}/models.hdf5'.format(model_path), 183 | monitor='val_f1', verbose=1, 184 | mode='max', 185 | save_best_only=True 186 | ) 187 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience) 188 | callbacks_list = [checkpoint, early] 189 | model.fit( 190 | X_train, y_train, 191 | validation_data=(X_hier_val, y_val), 192 | callbacks=callbacks_list, 193 | epochs=epochs, 194 | batch_size=batch_size 195 | ) 196 | 197 | model.load_weights(filepath='{}/models.hdf5'.format(model_path)) 198 | pred[test_index] = model.predict(X_test).reshape(-1) 199 | 200 | # Reset model: 201 | model = model.set_weights(weights) 202 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1]) 203 | 204 | meta_input[:, len(self._models) + ind] = pred 205 | 206 | 207 | for ind in range(len(self._models)): 208 | pred = np.zeros(len(X)) 209 | kf = KFold(n_splits=5, shuffle=False) 210 | model = self._models[ind] 211 | weights = model.get_weights() 212 | 213 | for train_index, test_index in kf.split(X): 214 | X_train, X_test = X[train_index], X[test_index] 215 | y_train, y_test = y[train_index], y[test_index] 216 | 217 | checkpoint = ModelCheckpoint( 218 | filepath='{}/models.hdf5'.format(model_path), 219 | monitor='val_f1', 
verbose=1, 220 | mode='max', 221 | save_best_only=True 222 | ) 223 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience) 224 | callbacks_list = [checkpoint, early] 225 | model.fit( 226 | X_train, y_train, 227 | validation_data=(X_val, y_val), 228 | callbacks=callbacks_list, 229 | epochs=epochs, 230 | batch_size=batch_size 231 | ) 232 | 233 | model.load_weights(filepath='{}/models.hdf5'.format(model_path)) 234 | pred[test_index] = model.predict(X_test).reshape(-1) 235 | 236 | # Reset model: 237 | model.set_weights(weights) 238 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1]) 239 | 240 | 241 | meta_input[:, ind] = pred 242 | 243 | 244 | self._meta_model.fit(meta_input, y) 245 | 246 | def predict(self, X, X_hier): 247 | meta_input = self.compute_meta_data(X, X_hier) 248 | return (self._meta_model.predict(meta_input) > 0.5).astype(np.int8) 249 | 250 | def compute_meta_data(self, X, X_hier): 251 | prediction = np.zeros((len(X), len(self._models) + len(self._hier_models))) 252 | for ind in range(len(self._models)): 253 | pred = self._models[ind].predict(X).reshape(len(X), 1).reshape(-1) 254 | prediction[:, ind] = pred 255 | 256 | for ind in range(len(self._hier_models)): 257 | pred = self._hier_models[ind].predict(X_hier).reshape(len(X_hier), 1).reshape(-1) 258 | prediction[:, len(self._models) + ind] = pred 259 | 260 | return prediction 261 | 262 | def load_weights(self, paths, paths_hier): 263 | for ind in range(len(self._models)): 264 | self._models[ind].load_weights(paths[ind]) 265 | 266 | for ind in range(len(self._hier_models)): 267 | self._hier_models[ind].load_weights(paths_hier[ind]) 268 | 269 | 270 | def StackMLP(n_model): 271 | inp = Input(shape = (n_model,)) 272 | op = Dense(10, activation = "relu")(inp) 273 | op = BatchNormalization()(op) 274 | op = Dense(1, activation = "sigmoid")(op) 275 | 276 | model = Model(inputs = inp, outputs = op) 277 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 278 | return model 279 | 280 | 281 | 282 | -------------------------------------------------------------------------------- /scripts/rnn.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers import \ 3 | Dense, Embedding, Input, \ 4 | CuDNNGRU, GRU, LSTM, Bidirectional, CuDNNLSTM, \ 5 | GlobalMaxPool1D, GlobalAveragePooling1D, Dropout, \ 6 | Lambda, Concatenate, TimeDistributed 7 | from .util import f1 8 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention 9 | from keras.activations import softmax 10 | from keras_layer_normalization import LayerNormalization 11 | from .net_components import AttLayer, AdditiveLayer 12 | from keras.utils.vis_utils import plot_model 13 | 14 | 15 | 16 | 17 | def RNNKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False): 18 | if use_fasttext: 19 | inp = Input(shape=(maxlen, embed_size)) 20 | x = inp 21 | else: 22 | inp = Input(shape = (maxlen, )) 23 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 24 | 25 | if use_additive_emb: 26 | x = AdditiveLayer()(x) 27 | x = Dropout(0.5)(x) 28 | 29 | x = Bidirectional(CuDNNGRU(128, return_sequences = True))(x) 30 | x = Dropout(0.5)(x) 31 | x = Bidirectional(CuDNNGRU(128, return_sequences = True))(x) 32 | x = Dropout(0.5)(x) 33 | 34 | max_pool = 
GlobalMaxPool1D()(x) 35 | avg_pool = GlobalAveragePooling1D()(x) 36 | last = Lambda(lambda x: x[:, 0, :])(x) 37 | concat_pool = Concatenate(axis = -1)([last, max_pool, avg_pool]) 38 | 39 | op = Dense(64, activation = "relu")(concat_pool) 40 | op = Dropout(0.5)(op) 41 | op = Dense(1, activation = "sigmoid")(op) 42 | 43 | model = Model(inputs = inp, outputs = op) 44 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 45 | return model 46 | 47 | def RNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False): 48 | if use_fasttext: 49 | inp = Input(shape=(maxlen, embed_size)) 50 | x = inp 51 | else: 52 | inp = Input(shape = (maxlen, )) 53 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 54 | 55 | if use_additive_emb: 56 | x = AdditiveLayer()(x) 57 | x = Dropout(0.5)(x) 58 | 59 | 60 | x = Bidirectional(GRU(128, return_sequences = True, recurrent_dropout = 0.5, dropout = 0.5))(x) 61 | # x = Dropout(0.5)(x) 62 | x = Bidirectional(GRU(128, return_sequences = True, recurrent_dropout = 0.5, dropout = 0.5))(x) 63 | # x = Dropout(0.5)(x) 64 | 65 | max_pool = GlobalMaxPool1D()(x) 66 | avg_pool = GlobalAveragePooling1D()(x) 67 | last = Lambda(lambda x: x[:, 0, :])(x) 68 | concat_pool = Concatenate(axis = -1)([last, max_pool, avg_pool]) 69 | 70 | op = Dense(64, activation = "relu")(concat_pool) 71 | op = Dropout(0.5)(op) 72 | op = Dense(1, activation = "sigmoid")(op) 73 | 74 | model = Model(inputs = inp, outputs = op) 75 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 76 | return model 77 | 78 | def LSTMKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100): 79 | inp = Input(shape = (maxlen, )) 80 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp) 81 | x = Bidirectional(CuDNNLSTM(50, return_sequences = True))(x) 82 | # x = Dropout(0.1)(x) 83 | x = Bidirectional(CuDNNLSTM(50, return_sequences = True))(x) 84 | x = Dropout(0.1)(x) 85 | x = GlobalMaxPool1D()(x) 86 | x = Dense(50, activation = "relu")(x) 87 | x = Dropout(0.1)(x) 88 | x = Dense(1, activation = "sigmoid")(x) 89 | model = Model(inputs = inp, outputs = x) 90 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 91 | return model 92 | 93 | 94 | def SARNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False): 95 | if use_fasttext: 96 | inp = Input(shape=(maxlen, embed_size)) 97 | x = inp 98 | else: 99 | inp = Input(shape = (maxlen, )) 100 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 101 | 102 | if use_additive_emb: 103 | x = AdditiveLayer()(x) 104 | x = Dropout(0.5)(x) 105 | 106 | 107 | x = Bidirectional(LSTM(128, return_sequences = True))(x) 108 | x = SeqSelfAttention( 109 | # attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL, 110 | attention_regularizer_weight=1e-4, 111 | )(x) 112 | # x = LayerNormalization()(x) 113 | x = Dropout(0.5)(x) 114 | 115 | x = Bidirectional(LSTM(128, return_sequences = True))(x) 116 | x = SeqWeightedAttention()(x) 117 | # x = LayerNormalization()(x) 118 | x = Dropout(0.5)(x) 119 | 120 | x = Dense(64, activation = "relu")(x) 121 | x = Dropout(0.5)(x) 122 | x = Dense(1, 
activation = "sigmoid")(x) 123 | model = Model(inputs = inp, outputs = x) 124 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 125 | return model 126 | 127 | def SARNNKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, rnn_type = CuDNNLSTM, use_fasttext = False, trainable = True, use_additive_emb = False): 128 | if use_fasttext: 129 | inp = Input(shape=(maxlen, embed_size)) 130 | x = inp 131 | else: 132 | inp = Input(shape = (maxlen, )) 133 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp) 134 | 135 | if use_additive_emb: 136 | x = AdditiveLayer()(x) 137 | x = Dropout(0.5)(x) 138 | 139 | 140 | x = Bidirectional(rnn_type(128, return_sequences = True))(x) 141 | x = SeqSelfAttention( 142 | # attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL, 143 | attention_regularizer_weight=1e-4, 144 | )(x) 145 | # x = LayerNormalization()(x) 146 | x = Dropout(0.5)(x) 147 | 148 | x = Bidirectional(rnn_type(128, return_sequences = True))(x) 149 | x = SeqWeightedAttention()(x) 150 | # x = LayerNormalization()(x) 151 | x = Dropout(0.5)(x) 152 | 153 | x = Dense(64, activation = "relu")(x) 154 | x = Dropout(0.5)(x) 155 | x = Dense(1, activation = "sigmoid")(x) 156 | model = Model(inputs = inp, outputs = x) 157 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 158 | return model 159 | 160 | 161 | def HRNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, trainable = True, use_additive_emb = False): 162 | sent_inp = Input(shape = (max_sent_len, embed_size)) 163 | embed = Embedding( 164 | input_dim = max_features, 165 | output_dim = embed_size, 166 | weights = [embeddingMatrix], 167 | trainable = trainable 168 | )(sent_inp) 169 | 170 | if use_additive_emb: 171 | embed = AdditiveLayer()(embed) 172 | embed = Dropout(0.5)(embed) 173 | 174 | word_lstm = Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))(embed) 175 | sent_encoder = Model(sent_inp, word_lstm) 176 | 177 | doc_input = Input(shape = (max_nb_sent, max_sent_len)) 178 | doc_encoder = TimeDistributed(sent_encoder)(doc_input) 179 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))(doc_encoder) 180 | preds = Dense(1, activation = "sigmoid")(sent_lstm) 181 | model = Model(inputs = doc_input, outputs = preds) 182 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 183 | return model 184 | 185 | def HRNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, trainable = True, use_additive_emb = False): 186 | sent_inp = Input(shape = (max_sent_len, embed_size)) 187 | embed = Embedding( 188 | input_dim = max_features, 189 | output_dim = embed_size, 190 | weights = [embeddingMatrix], 191 | trainable = trainable 192 | )(sent_inp) 193 | 194 | if use_additive_emb: 195 | embed = AdditiveLayer()(embed) 196 | embed = Dropout(0.5)(embed) 197 | 198 | word_lstm = Bidirectional(CuDNNLSTM(128))(embed) 199 | sent_encoder = Model(sent_inp, word_lstm) 200 | 201 | doc_input = Input(shape = (max_nb_sent, max_sent_len)) 202 | doc_encoder = TimeDistributed(sent_encoder)(doc_input) 203 | sent_lstm = Bidirectional(CuDNNLSTM(128))(doc_encoder) 204 | preds = Dense(1, activation = "sigmoid")(sent_lstm) 205 | model = Model(inputs = doc_input, outputs = preds) 206 | model.compile(loss = 'binary_crossentropy', optimizer = 
'adam', metrics = ['accuracy', f1]) 207 | return model 208 | 209 | 210 | def OriginalHARNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False): 211 | if use_fasttext: 212 | sent_inp = Input(shape = (max_sent_len, embed_size)) 213 | embed = sent_inp 214 | else: 215 | sent_inp = Input(shape = (max_sent_len, )) 216 | embed = Embedding( 217 | input_dim = max_features, 218 | output_dim = embed_size, 219 | weights = [embeddingMatrix], 220 | trainable = trainable 221 | )(sent_inp) 222 | 223 | if use_additive_emb: 224 | embed = AdditiveLayer()(embed) 225 | embed = Dropout(0.5)(embed) 226 | 227 | word_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(embed) 228 | word_att = AttLayer(context_size = 256)(word_lstm) 229 | sent_encoder = Model(sent_inp, word_att) 230 | 231 | doc_input = Input(shape = (max_nb_sent, max_sent_len)) 232 | doc_encoder = TimeDistributed(sent_encoder)(doc_input) 233 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(doc_encoder) 234 | sent_att = AttLayer(context_size = 256)(sent_lstm) 235 | preds = Dense(1, activation = "sigmoid")(sent_att) 236 | model = Model(inputs = doc_input, outputs = preds) 237 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 238 | return model 239 | 240 | def OriginalHARNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False): 241 | if use_fasttext: 242 | sent_inp = Input(shape = (max_sent_len, embed_size)) 243 | embed = sent_inp 244 | else: 245 | sent_inp = Input(shape = (max_sent_len, )) 246 | embed = Embedding( 247 | input_dim = max_features, 248 | output_dim = embed_size, 249 | weights = [embeddingMatrix], 250 | trainable = trainable 251 | )(sent_inp) 252 | 253 | if use_additive_emb: 254 | embed = AdditiveLayer()(embed) 255 | embed = Dropout(0.5)(embed) 256 | 257 | word_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embed) 258 | word_att = AttLayer(context_size = 256)(word_lstm) 259 | word_att = Dropout(0.5)(word_att) 260 | sent_encoder = Model(sent_inp, word_att) 261 | 262 | doc_input = Input(shape = (max_nb_sent, max_sent_len)) 263 | doc_encoder = TimeDistributed(sent_encoder)(doc_input) 264 | sent_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(doc_encoder) 265 | sent_att = AttLayer(context_size = 256)(sent_lstm) 266 | sent_att = Dropout(0.5)(sent_att) 267 | preds = Dense(1, activation = "sigmoid")(sent_att) 268 | model = Model(inputs = doc_input, outputs = preds) 269 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 270 | return model 271 | 272 | 273 | 274 | 275 | def HARNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False): 276 | if use_fasttext: 277 | sent_inp = Input(shape = (max_sent_len, embed_size)) 278 | embed = sent_inp 279 | else: 280 | sent_inp = Input(shape = (max_sent_len, )) 281 | embed = Embedding( 282 | input_dim = max_features, 283 | output_dim = embed_size, 284 | weights = [embeddingMatrix], 285 | trainable = trainable 286 | )(sent_inp) 287 | 288 | if use_additive_emb: 289 | embed = AdditiveLayer()(embed) 290 | embed = Dropout(0.5)(embed) 291 | 292 | 293 | word_lstm = 
Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(embed) 294 | word_att = SeqWeightedAttention()(word_lstm) 295 | sent_encoder = Model(sent_inp, word_att) 296 | 297 | doc_input = Input(shape = (max_nb_sent, max_sent_len)) 298 | doc_encoder = TimeDistributed(sent_encoder)(doc_input) 299 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(doc_encoder) 300 | sent_att = SeqWeightedAttention()(sent_lstm) 301 | preds = Dense(1, activation = "sigmoid")(sent_att) 302 | model = Model(inputs = doc_input, outputs = preds) 303 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 304 | return model 305 | 306 | 307 | 308 | def HARNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False): 309 | if use_fasttext: 310 | sent_inp = Input(shape = (max_sent_len, embed_size)) 311 | embed = sent_inp 312 | else: 313 | sent_inp = Input(shape = (max_sent_len, )) 314 | embed = Embedding( 315 | input_dim = max_features, 316 | output_dim = embed_size, 317 | weights = [embeddingMatrix], 318 | trainable = trainable 319 | )(sent_inp) 320 | 321 | if use_additive_emb: 322 | embed = AdditiveLayer()(embed) 323 | embed = Dropout(0.5)(embed) 324 | 325 | word_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embed) 326 | word_att = SeqWeightedAttention()(word_lstm) 327 | word_att = Dropout(0.5)(word_att) 328 | sent_encoder = Model(sent_inp, word_att) 329 | plot_model(sent_encoder, to_file='{}.png'.format("HARNN1"), show_shapes=True, show_layer_names=True) 330 | 331 | 332 | doc_input = Input(shape = (max_nb_sent, max_sent_len)) 333 | doc_encoder = TimeDistributed(sent_encoder)(doc_input) 334 | sent_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(doc_encoder) 335 | sent_att = SeqWeightedAttention()(sent_lstm) 336 | sent_att = Dropout(0.5)(sent_att) 337 | preds = Dense(1, activation = "sigmoid")(sent_att) 338 | model = Model(inputs = doc_input, outputs = preds) 339 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1]) 340 | return model 341 | 342 | 343 | 344 | --------------------------------------------------------------------------------
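To make the hierarchical input contract explicit: the HRNN/HARNN variants above consume documents encoded as `(max_nb_sent, max_sent_len)` integer matrices (produced by `text_sents_to_sequences`), wrap a word-level encoder in `TimeDistributed`, and run a second recurrent layer over the resulting sentence vectors. A toy, self-contained sketch of that shape flow; the sizes and the random embedding matrix are illustrative stand-ins:

```python
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense

# Toy hierarchical classifier in the spirit of HRNNCPU above; sizes and the
# random embedding matrix are illustrative stand-ins.
max_nb_sent, max_sent_len, max_features, embed_size = 3, 50, 1000, 32
embedding_mat = np.random.rand(max_features, embed_size).astype('float32')

sent_inp = Input(shape=(max_sent_len,))                      # one sentence of word ids
emb = Embedding(max_features, embed_size,
                weights=[embedding_mat], trainable=False)(sent_inp)
sent_vec = Bidirectional(LSTM(16))(emb)
sent_encoder = Model(sent_inp, sent_vec)                     # word-level encoder

doc_inp = Input(shape=(max_nb_sent, max_sent_len))           # a document = a stack of sentences
doc_seq = TimeDistributed(sent_encoder)(doc_inp)             # encode each sentence independently
doc_vec = Bidirectional(LSTM(16))(doc_seq)                   # sentence-level encoder
pred = Dense(1, activation='sigmoid')(doc_vec)

model = Model(doc_inp, pred)
model.compile(loss='binary_crossentropy', optimizer='adam')
model.predict(np.random.randint(0, max_features, size=(2, max_nb_sent, max_sent_len)))
```

The document-level recurrent layer only ever sees one vector per sentence, which is why padding to a fixed `max_nb_sent` and `max_sent_len` in `text_sents_to_sequences` is required.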