├── test_lib.py
├── scripts
│   ├── __init__.py
│   ├── constant.py
│   ├── net_components.py
│   ├── augment.py
│   ├── cnn.py
│   ├── util.py
│   ├── stack.py
│   └── rnn.py
├── HARNN.png
├── VDCNN.png
├── HARNN1.png
├── TextCNN.png
├── SARNNKeras.png
├── 2019-03-16-12:32:21.png
├── external_lib
│   └── install_lib.sh
├── requirements.txt
├── README.md
├── main_stack.py
├── test.py
├── 1st place solution.md
├── test_elmo.py
├── main_stack_hier.py
├── main_elmo.py
├── main.py
└── main_hierarchical.py
/test_lib.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/HARNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/HARNN.png
--------------------------------------------------------------------------------
/VDCNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/VDCNN.png
--------------------------------------------------------------------------------
/HARNN1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/HARNN1.png
--------------------------------------------------------------------------------
/TextCNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/TextCNN.png
--------------------------------------------------------------------------------
/SARNNKeras.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/SARNNKeras.png
--------------------------------------------------------------------------------
/2019-03-16-12:32:21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/2019-03-16-12:32:21.png
--------------------------------------------------------------------------------
/external_lib/install_lib.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # Install deepai_nlp
3 | cd deepai_nlp
4 | pip install -e .
5 | cd ..
6 | # Install elmo
7 | cd ELMoForManyLangs
8 | python setup.py install
9 | cd ..
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyvi
2 | pandas>=0.24.1
3 | spacy>=2.0.18
4 | gensim>=3.7.1
5 | scikit-learn>=0.20.2
6 | keras
7 | tensorflow
8 | keras-self-attention==0.35.0
9 | keras-multi-head==0.16.0
10 | keras-layer-normalization==0.10.0
11 | annoy==1.15.1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Aivivn_1
2 |
3 | Our submission for Aivivn Contest 1.
4 |
5 | By Nhat Pham and Hoang Phan.
6 |
7 | Set up the environment:
8 | ```bash
9 | conda install python=3.6
10 | ```
11 |
12 | Install the dependencies:
13 |
14 | ```bash
15 | pip install -r requirements.txt
16 | cd external_lib
17 | chmod a+x install_lib.sh
18 | ./install_lib.sh
19 | cd ..
20 | ```
21 | Test notebook on Google Colab:
22 | https://colab.research.google.com/drive/1fgtIYXkXKKmZVI2w62nCI22wiVSNEQxw
23 |
24 | Sample run command:
25 |
26 | ```bash
27 | python -m main -m VDCNN -e ./embeddings/baomoi.model.bin --max 40000 --mix --prob
28 | ```
29 |
--------------------------------------------------------------------------------
/scripts/constant.py:
--------------------------------------------------------------------------------
1 | # From spacy english model
2 | EMOTICONS = set("""
3 | :)
4 | :-)
5 | :))
6 | :-))
7 | :)))
8 | :-)))
9 | (:
10 | (-:
11 | =)
12 | (=
13 | ")
14 | :]
15 | :-]
16 | [:
17 | [-:
18 | :o)
19 | (o:
20 | :}
21 | :-}
22 | 8)
23 | 8-)
24 | (-8
25 | ;)
26 | ;-)
27 | (;
28 | (-;
29 | :(
30 | :-(
31 | :((
32 | :-((
33 | :(((
34 | :-(((
35 | ):
36 | )-:
37 | =(
38 | >:(
39 | :')
40 | :'-)
41 | :'(
42 | :'-(
43 | :/
44 | :-/
45 | =/
46 | =|
47 | :|
48 | :-|
49 | :1
50 | :P
51 | :-P
52 | :p
53 | :-p
54 | :O
55 | :-O
56 | :o
57 | :-o
58 | :0
59 | :-0
60 | :()
61 | >:o
62 | :*
63 | :-*
64 | :3
65 | :-3
66 | =3
67 | :>
68 | :->
69 | :X
70 | :-X
71 | :x
72 | :-x
73 | :D
74 | :-D
75 | ;D
76 | ;-D
77 | =D
78 | xD
79 | XD
80 | xDD
81 | XDD
82 | 8D
83 | 8-D
84 | ^_^
85 | ^__^
86 | ^___^
87 | >.<
88 | >.>
89 | <.<
90 | ._.
91 | ;_;
92 | -_-
93 | -__-
94 | v.v
95 | V.V
96 | v_v
97 | V_V
98 | o_o
99 | o_O
100 | O_o
101 | O_O
102 | 0_o
103 | o_0
104 | 0_0
105 | o.O
106 | O.o
107 | O.O
108 | o.o
109 | 0.0
110 | o.0
111 | 0.o
112 | @_@
113 | <3
114 | <33
115 | <333
116 | 3
117 | (^_^)
118 | (-_-)
119 | (._.)
120 | (>_<)
121 | (*_*)
122 | (¬_¬)
123 | ಠ_ಠ
124 | ಠ︵ಠ
125 | (ಠ_ಠ)
126 | ¯\(ツ)/¯
127 | (╯°□°)╯︵┻━┻
128 | ><(((*>
129 | """.split())
130 |
131 | DEFAULT_MAX_FEATURES = 12000
132 | DEFAULT_MAX_LENGTH = 100
133 |
--------------------------------------------------------------------------------
/scripts/net_components.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Layer
2 | import keras.backend as K
3 |
4 | class AttLayer(Layer):
5 | def __init__(self, context_size):
6 | self._context_size = context_size
7 | self.supports_masking = True
8 | # self._linear = Dense(context_size, activation = "tanh")
9 | super(AttLayer, self).__init__()
10 |
11 | def build(self, input_shape):
12 | self._W = self.add_weight(
13 | name = "W",
14 | shape = (input_shape[-1], self._context_size),
15 | initializer="he_normal",
16 | trainable=True
17 | )
18 | self._b = self.add_weight(
19 | name = "b",
20 | shape = (1, self._context_size),
21 | initializer="constant",
22 | trainable=True
23 | )
24 | self._context = self.add_weight(
25 | name = "context",
26 | shape = (self._context_size, 1),
27 | initializer = "he_normal",
28 | trainable = True
29 | )
30 | super(AttLayer, self).build(input_shape)
31 |
32 |
33 | def compute_mask(self, input, input_mask=None):
34 | return input_mask
35 |
36 |
37 | def call(self, input, mask = None):
38 | # input: (N, T, M)
39 | rep = K.tanh(K.dot(input, self._W) + self._b) # (N, T, C)
40 | score = K.squeeze(K.dot(rep, self._context), axis = -1) # (N, T)
41 |
42 | weight = K.exp(score)
43 | if mask is not None:
44 | weight *= K.cast(mask, K.floatx())
45 |
46 | weight /= K.cast(K.sum(weight, axis = 1, keepdims = True) + K.epsilon(), K.floatx())
47 |
48 |
49 | # weight = softmax(score, axis = -1) # (N, T)
50 | op = K.batch_dot(input, weight, axes = (1, 1)) # (N, M)
51 |
52 | return op
53 |
54 | def compute_output_shape(self, input_shape):
55 | return (input_shape[0], input_shape[-1])
56 |
57 |
58 |
59 | class AdditiveLayer(Layer):
60 | def __init__(self):
61 | super(AdditiveLayer, self).__init__()
62 |
63 | def build(self, input_shape):
64 | self._w = self.add_weight(
65 | name = "w",
66 | shape = (1, input_shape[-1]),
67 | initializer="constant",
68 | trainable=True
69 | )
70 | super(AdditiveLayer, self).build(input_shape)
71 |
72 |
73 |
74 | def call(self, input):
75 | return input + self._w
76 |
77 | def compute_output_shape(self, input_shape):
78 | return input_shape
79 |
--------------------------------------------------------------------------------
/scripts/augment.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gensim.models import KeyedVectors
3 | import copy
4 | import random
5 | from gensim.similarities.index import AnnoyIndexer
6 |
7 |
8 | def shuffle_augment(texts, labels, n_increase, min_length = 1):
9 | texts_long = []
10 | labels_long = []
11 |
12 | if min_length > 1:
13 | for ind in range(len(texts)):
14 | if len(texts[ind]) >= min_length:
15 | texts_long.append(texts[ind])
16 | labels_long.append(labels[ind])
17 | else:
18 | texts_long = texts
19 | labels_long = labels
20 |
21 |
22 | shuffle_ind = np.random.choice(len(texts_long), size = n_increase)
23 | for ind in shuffle_ind:
24 | text_copy = np.random.permutation(texts_long[ind])
25 | texts.append(text_copy)
26 | labels = np.append(labels, [labels_long[ind]])
27 |
28 |
29 | return texts, labels
30 |
31 |
32 | def similar_augment(texts, labels, n_increase, n_word_replace, model_path, similar_threshold = 0.5, use_annoy = True, annoy_path = None):
33 | w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
34 | texts_long = []
35 | labels_long = []
36 | if use_annoy:
37 | if annoy_path is None:
38 | indexer = AnnoyIndexer(w2v, 100)
39 | else:
40 | indexer = AnnoyIndexer()
41 | indexer.load(annoy_path)
42 |
43 | for ind in range(len(texts)):
44 | if len(texts[ind]) >= n_word_replace:
45 | texts_long.append(texts[ind])
46 | labels_long.append(labels[ind])
47 |
48 | shuffle_ind = np.random.choice(len(texts_long), size = n_increase)
49 | for ind in shuffle_ind:
50 | text_copy = copy.deepcopy(texts_long[ind])
51 | # if is_hier:
52 |
53 | replace_inds = np.random.choice(text_copy.shape[-1], size = n_word_replace, replace = False)
54 | for word_ind in replace_inds:
55 | word = text_copy[word_ind]
56 | try:
57 |
58 | closest, score = w2v.wv.most_similar(
59 | word, topn = 2,
60 | indexer = indexer if use_annoy else None
61 | )[1]
62 | if score > similar_threshold:
63 | text_copy[word_ind] = closest
64 | except:
65 | continue
66 |
67 | texts.append(text_copy)
68 | labels = np.append(labels, [labels_long[ind]])
69 |
70 | return texts, labels
71 |
72 |
73 |
74 |
75 | def create_sim_dict(word_map, model_path, similar_threshold = 0.5, use_annoy = True, annoy_path = None):
76 | w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
77 | if use_annoy:
78 | if annoy_path is None:
79 | indexer = AnnoyIndexer(w2v, 100)
80 | else:
81 | indexer = AnnoyIndexer()
82 | indexer.load(annoy_path)
83 |
84 | sim_dict = dict()
85 | for word in word_map:
86 | try:
87 | closest, score = w2v.wv.most_similar(
88 | word, topn=2,
89 | indexer=indexer if use_annoy else None
90 | )[1]
91 | if score > similar_threshold and closest in word_map:
92 | sim_dict[word_map[word]] = word_map[closest]
93 | except:
94 | continue
95 |
96 | return sim_dict
97 |
98 | def similar_augment_from_sim_dict(texts, labels, sim_dict, n_increase, keep_prob = 0.5):
99 | aug_ind = np.random.choice(len(texts), size = n_increase)
100 | i = -1
101 | for ind in aug_ind:
102 | i += 1
103 | text_aug = copy.deepcopy(texts[ind])
104 | for word_ind in range(len(text_aug)):
105 | word = text_aug[word_ind]
106 | if word in sim_dict:
107 | p = random.uniform(0, 1)
108 | if p > keep_prob:
109 | text_aug[word_ind] = sim_dict[word]
110 |
111 | texts = np.append(texts, [text_aug], axis = 0)
112 | labels = np.append(labels, [labels[ind]], axis = 0)
113 |
114 | return texts, labels
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/main_stack.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, f1
2 | from scripts.constant import DEFAULT_MAX_FEATURES
3 | from sklearn.model_selection import train_test_split
4 | from scripts.rnn import SARNNKeras
5 | from scripts.cnn import LSTMCNN, VDCNN
6 | from scripts.stack import StackedGeneralizer
7 | import argparse
8 | import os
9 | import numpy as np
10 | import datetime
11 | import pandas as pd
12 | from sklearn.metrics import f1_score
13 |
14 | from sklearn.linear_model import LogisticRegression
15 | from keras.utils import CustomObjectScope
16 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
17 |
18 |
19 | def stack(models_list, embedding_path, max_features, should_mix):
20 | model_name = '-'.join(
21 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
22 |
23 | train_data = read_file('./data/train.crash')
24 | test_data = read_file('./data/test.crash', is_train=False)
25 | train_tokenized_texts = tokenize(train_data['text'])
26 | test_tokenizes_texts = tokenize(test_data['text'])
27 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
28 |
29 | embed_size, word_map, embedding_mat = make_embedding(
30 | list(train_tokenized_texts) +
31 | list(test_tokenizes_texts) if should_mix else train_tokenized_texts,
32 | embedding_path,
33 | max_features
34 | )
35 |
36 | texts_id = text_to_sequences(train_tokenized_texts, word_map)
37 | print('Number of train data: {}'.format(labels.shape))
38 |
39 | texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
40 | texts_id, labels, test_size=0.05)
41 |
42 | model_path = './models/{}-version'.format(model_name)
43 |
44 | try:
45 | os.mkdir('./models')
46 | except:
47 | print('Folder already created')
48 | try:
49 | os.mkdir(model_path)
50 | except:
51 | print('Folder already created')
52 |
53 | batch_size = 16
54 | epochs = 100
55 | patience = 3
56 |
57 | meta_model = LogisticRegression()
58 | models = [
59 | model(
60 | embeddingMatrix=embedding_mat,
61 | embed_size=400,
62 | max_features=embedding_mat.shape[0]
63 | )
64 | for model in models_list
65 | ]
66 |
67 |
68 | stack = StackedGeneralizer(models, meta_model)
69 | stack.train_meta_model(
70 | texts_id_train, labels_train,
71 | texts_id_val, labels_val,
72 | model_path = model_path,
73 | epochs = epochs,
74 | batch_size = batch_size,
75 | patience = patience
76 | )
77 |
78 | stack.train_models(
79 | X = texts_id_train, y = labels_train,
80 | X_val = texts_id_val, y_val = labels_val,
81 | batch_size = batch_size,
82 | epochs = epochs,
83 | patience = patience,
84 | model_path = model_path
85 | )
86 |
87 | prediction = stack.predict(texts_id_val)
88 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
89 | with open('{}/f1'.format(model_path), 'w') as fp:
90 | fp.write(str(f1_score(prediction, labels_val)))
91 |
92 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map)
93 | test_prediction = stack.predict(test_id_texts)
94 |
95 | df_predicton = pd.read_csv("./data/sample_submission.csv")
96 | df_predicton["label"] = test_prediction
97 |
98 | print('Number of test data: {}'.format(df_predicton.shape[0]))
99 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
100 |
101 |
102 |
103 | if __name__ == '__main__':
104 | models_list = [
105 | SARNNKeras, LSTMCNN, VDCNN
106 | ]
107 | parser = argparse.ArgumentParser()
108 | parser.add_argument(
109 | '-e',
110 | '--embedding',
111 | help='Model use',
112 | default='./embeddings/smallFasttext.vi.vec'
113 | )
114 | parser.add_argument(
115 | '--max',
116 | help='Model use',
117 | default=DEFAULT_MAX_FEATURES
118 | )
119 | parser.add_argument(
120 | '--mix',
121 | action='store_true',
122 | help='Model use'
123 | )
124 | args = parser.parse_args()
125 |
126 | with CustomObjectScope({
127 | 'SeqSelfAttention': SeqSelfAttention,
128 | 'SeqWeightedAttention': SeqWeightedAttention,
129 | 'f1': f1}
130 | ):
131 | stack(
132 | models_list, args.embedding, int(args.max), args.mix
133 | )
134 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, find_threshold
2 | import numpy as np
3 | from scripts.constant import DEFAULT_MAX_FEATURES
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import f1_score
6 | import tensorflow as tf
7 | import random as rn
8 | import pandas as pd
9 |
10 |
11 |
12 | from keras.models import Model
13 | from keras.layers import Dense, Embedding, Input, GRU, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, Lambda
14 | from keras.callbacks import EarlyStopping, ModelCheckpoint
15 | import keras.backend as K
16 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
17 |
18 | # np random seed:
19 | np.random.seed(22)
20 |
21 | # # Setting the seed for python random numbers
22 | rn.seed(1254)
23 | #
24 | # # Setting the graph-level random seed.
25 | tf.set_random_seed(89)
26 |
27 | def SARNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100):
28 | inp = Input(shape = (maxlen, ))
29 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp)
30 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
31 | x = SeqSelfAttention(
32 | attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL,
33 | attention_regularizer_weight=1e-4,
34 | )(x)
35 | x = Dropout(0.5)(x)
36 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
37 | x = SeqWeightedAttention()(x)
38 | x = Dropout(0.5)(x)
39 | x = Dense(64, activation = "relu")(x)
40 | x = Dropout(0.5)(x)
41 | x = Dense(1, activation = "sigmoid")(x)
42 | model = Model(inputs = inp, outputs = x)
43 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
44 | return model
45 |
46 |
47 | def RNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100):
48 | inp = Input(shape = (maxlen, ))
49 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp)
50 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
51 | x = Dropout(0.5)(x)
52 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
53 | x = Dropout(0.5)(x)
54 | x = GlobalMaxPool1D()(x)
55 | x = Dense(64, activation = "relu")(x)
56 | x = Dropout(0.5)(x)
57 | x = Dense(1, activation = "sigmoid")(x)
58 | model = Model(inputs = inp, outputs = x)
59 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
60 | return model
61 |
62 |
63 |
64 |
65 |
66 | def f1(y_true, y_pred):
67 | def recall(y_true, y_pred):
68 | """Recall metric.
69 |
70 | Only computes a batch-wise average of recall.
71 |
72 | Computes the recall, a metric for multi-label classification of
73 | how many relevant items are selected.
74 | """
75 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
76 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
77 | recall = true_positives / (possible_positives + K.epsilon())
78 | return recall
79 |
80 | def precision(y_true, y_pred):
81 | """Precision metric.
82 |
83 | Only computes a batch-wise average of precision.
84 |
85 | Computes the precision, a metric for multi-label classification of
86 | how many selected items are relevant.
87 | """
88 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
89 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
90 | precision = true_positives / (predicted_positives + K.epsilon())
91 | return precision
92 | precision = precision(y_true, y_pred)
93 | recall = recall(y_true, y_pred)
94 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
95 |
96 |
97 |
98 |
99 |
100 | data = read_file("./data/train.crash")
101 | tokenized_texts = tokenize(data["text"])
102 | labels = data["label"].values.astype(np.float16).reshape(-1, 1)
103 |
104 | embed_size, word_map, embedding_mat = make_embedding(
105 | tokenized_texts,
106 | embedding_path = "./data/baomoi.model.bin",
107 | max_features = 40000
108 | )
109 |
110 |
111 |
112 | texts_id = text_to_sequences(tokenized_texts, word_map)
113 | print(labels.shape)
114 | print(texts_id.shape)
115 |
116 | texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
117 | texts_id, labels,
118 | test_size = 0.05
119 | )
120 |
121 | checkpoint = ModelCheckpoint(
122 | filepath = "./Weights/model_sa_2.hdf5",
123 | monitor = 'val_f1', verbose = 1,
124 | mode = 'max',
125 | save_best_only = True
126 | )
127 | early = EarlyStopping(monitor = "val_f1", mode = "max", patience = 3)
128 | callbacks_list = [checkpoint, early]
129 | batch_size = 16
130 | epochs = 100
131 |
132 |
133 | model = SARNNKerasCPU(
134 | embeddingMatrix = embedding_mat,
135 | embed_size = 400,
136 | max_features = embedding_mat.shape[0]
137 | )
138 | model.fit(
139 | texts_id_train, labels_train,
140 | validation_data = (texts_id_val, labels_val),
141 | callbacks = callbacks_list,
142 | epochs = epochs,
143 | batch_size = 16
144 | )
145 |
146 |
147 |
148 |
149 | model.load_weights("./Weights/model_sa_2.hdf5")
150 | prediction_prob = model.predict(texts_id_val)
151 |
152 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
153 | print(OPTIMAL_THRESHOLD)
154 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
155 | print(f1_score(
156 | y_true = labels_val.reshape(-1),
157 | y_pred = prediction.reshape(-1)
158 | ))
159 |
160 |
161 |
162 | data_test = read_file("./data/test.crash", is_train = False)
163 | tokenized_texts_test = tokenize(data_test["text"])
164 | texts_id_test = text_to_sequences(tokenized_texts_test, word_map)
165 | prediction_test = model.predict(texts_id_test)
166 | df_predicton = pd.read_csv("./data/sample_submission.csv")
167 | df_predicton["label"] = (prediction_test > OPTIMAL_THRESHOLD).astype(np.int8)
168 | print(df_predicton.shape[0])
169 | df_predicton.to_csv("./prediction/prediction_sa_2.csv", index = False)
--------------------------------------------------------------------------------
/1st place solution.md:
--------------------------------------------------------------------------------
1 | ---
2 |
3 |
4 | ---
5 |
6 | 1st place solution analysis
7 | Hello everyone, we are Hoang and Nhat from team HoangNhat2 on the leaderboard. First of all, we would like to thank Tiệp and everyone on the Aivivn team for organizing such an interesting Machine Learning contest on Vietnamese language processing. We learned a lot of new things through this contest.
8 | Summary of our approach:
9 | We do not have much NLP background, so we focused on experimenting with deep learning models to see which ones worked well. After trying a large number of models, we realized that no single model could get past 0.89x on the Public LB, even though some performed very well on local validation. From then on we stopped trying new models and instead experimented with ways to combine models or augment the training data.
10 | After experimenting to reach a suitable level of diversity, our top-1 solution is a Weighted Ensemble of the following models:
11 |
12 | - TextCNN (Weight: 0.1) source
13 | - Inspired VDCNN (Weight: 0.1) source
14 | - HARNN (Weight: 0.3) source
15 | - SARNN (Weight: 0.5) source
16 |
17 | The pretrained embeddings that we tested and used:
18 |
19 | - word2vecVN (window-size 5, 400dims) source
20 |
21 | We mainly trained the models on Google Colab using its GPU. Each model took about 10 - 20 minutes to train (models converge after roughly 5-10 epochs). The CNN models train much faster than the RNN models, probably because they are not sequential and can therefore make better use of the GPU during training.
22 | Details of our approach:
23 | 1. Models:
24 | 1.1 TextCNN:
25 | This is our CNN model for text classification.
26 | Architecture:
27 | 
28 | 1.2 VDCNN:
29 | Similar to TextCNN, but residual connections are added between the convolution layers to avoid vanishing gradients.
30 | Architecture:
31 | 
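
As a rough illustration (this mirrors the block pattern used in scripts/cnn.py; the 128-filter / kernel-size-5 choices are the ones used there), one residual block looks roughly like this:

```python
from keras.layers import Conv1D, BatchNormalization, Add, MaxPool1D

def residual_conv_block(x, filters=128, kernel_size=5):
    # Two same-padded convolutions keep the time dimension unchanged,
    # so the skip connection below can be added element-wise.
    # Note: `x` must already have `filters` channels for the Add to work.
    main = Conv1D(filters, kernel_size, activation='relu', padding='same')(x)
    main = BatchNormalization()(main)
    main = Conv1D(filters, kernel_size, activation='relu', padding='same')(main)
    main = BatchNormalization()(main)
    # Identity shortcut: gradients can flow around the convolutions.
    out = Add()([x, main])
    return MaxPool1D(pool_size=2, strides=2)(out)
```
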
32 | 1.3 HARNN:
33 | HARNN processes text at two levels:
34 | 
35 | - Compute an encoding for each sentence in the paragraph from its word embeddings with a BiLSTM
36 | - Use another BiLSTM to compute the document encoding from the sentence encodings.
37 | 
38 | Each of these levels is followed by an attention layer.
39 | Architecture Word2Sent
40 | 
41 | Architecture Sent2Doc:
42 | 
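
The full implementation is in scripts/rnn.py; the following is only a minimal sketch of the two-level idea, using the keras_self_attention layers the project already depends on, with illustrative layer sizes:

```python
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras_self_attention import SeqWeightedAttention

def tiny_harnn(max_nb_sent=3, max_sent_len=50, max_features=20000, embed_size=400):
    # Level 1: encode each sentence from its word embeddings with a BiLSTM,
    # then attend over the words to get one vector per sentence.
    sent_in = Input(shape=(max_sent_len,))
    w = Embedding(max_features, embed_size)(sent_in)
    w = Bidirectional(LSTM(64, return_sequences=True))(w)
    w = SeqWeightedAttention()(w)                  # attention over words
    sent_encoder = Model(sent_in, w)

    # Level 2: run a BiLSTM over the sentence vectors and attend over the
    # sentences to get the document encoding.
    doc_in = Input(shape=(max_nb_sent, max_sent_len))
    s = TimeDistributed(sent_encoder)(doc_in)
    s = Bidirectional(LSTM(64, return_sequences=True))(s)
    s = SeqWeightedAttention()(s)                  # attention over sentences
    out = Dense(1, activation='sigmoid')(s)

    model = Model(doc_in, out)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
```
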
43 | 1.4 SARNN:
44 | This is a BiLSTM model with an attention layer between the two BiLSTM layers.
45 | Architecture:
46 | 
47 | 2. Combine models:
48 | We tried different ways of combining the models, such as Stacking and Ensembling, and found that Ensembling gave the most promising results. To choose the weights, we looked at which model scored best on the Public LB and gave that model the highest weight. We kept the raw probabilities and used a threshold of 0.5 instead of searching for one, since searching did not improve the results much.
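
As a rough sketch of that ensembling step (the probability values below are made up; the weights and the 0.5 threshold are the ones described above):

```python
import numpy as np

# Illustrative per-model probabilities on the test set; in practice these are
# the predict() outputs of the four trained models, each of shape (n_samples,).
model_probs = {
    "TextCNN": np.array([0.20, 0.80, 0.55]),
    "VDCNN":   np.array([0.30, 0.70, 0.60]),
    "HARNN":   np.array([0.25, 0.90, 0.40]),
    "SARNN":   np.array([0.10, 0.95, 0.45]),
}
weights = {"TextCNN": 0.1, "VDCNN": 0.1, "HARNN": 0.3, "SARNN": 0.5}

# Weighted average of the probabilities, then a fixed 0.5 threshold.
ensemble_prob = sum(weights[name] * model_probs[name] for name in model_probs)
labels = (ensemble_prob > 0.5).astype(np.int8)
```
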
49 | Side notes:
50 | Difficulties:
51 | 
52 | - Perhaps the first problem the two of us ran into was hardware. With two 128GB MacBooks (one Air, one Pro), neither of us had enough disk space to download pretrained models for testing, and every new download meant deleting the previous one. Only after we moved everything to Google Colab and GitHub did things become faster and easier, so we recommend using Google Colab or a Kaggle instance.
53 | - We had a lot of trouble reproducing results, for two reasons: Keras's save_weight function has many issues and the model was usually broken after reloading, and several steps of the pipeline are non-deterministic (Python sets, Keras models). Our first model above 0.9 actually came from rerunning an old model xD.
54 | 
55 |
56 | Approaches we tried:
57 | 
58 | - Using a Language Model such as ELMo (source). This approach did not seem suitable because training took too long (training one epoch took as long as training a whole CNN or RNN model), and we also did not have time to re-preprocess the data into the format ELMo expects.
59 | - One problem we noticed is that the amount of data is not enough for a model to get past the 0.89 - 0.9 mark. We tried several ways to augment new data, such as:
60 | 
61 | 
62 | - Randomly replacing words in a sentence with synonyms. We did this by replacing each word with the word whose embedding is nearest to it in our vocabulary (nearest neighbour); a small sketch of this idea is shown after this list. Although this change did not bring a significant improvement, we think that with a good thesaurus or a better metric for choosing vectors it could give good results.
63 | - Shuffling the sentences within a document for the HARNN model in order to generate many different documents.
64 | - Translating from Vietnamese into other languages and back. But Google Translate has banned this now ;__;
65 |
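The actual implementation lives in scripts/augment.py; below is only a stripped-down sketch of the replacement step, assuming the word2vecVN binary is available at the path used elsewhere in this repo:

```python
import numpy as np
from gensim.models import KeyedVectors

# Pretrained Vietnamese word2vec embeddings (400 dims), loaded once.
w2v = KeyedVectors.load_word2vec_format("./embeddings/baomoi.model.bin", binary=True)

def replace_with_neighbours(tokens, n_word_replace=10, similar_threshold=0.5):
    """Copy `tokens` and swap up to n_word_replace random words for their
    nearest embedding neighbour, skipping out-of-vocabulary words."""
    tokens = list(tokens)
    n = min(n_word_replace, len(tokens))
    for i in np.random.choice(len(tokens), size=n, replace=False):
        try:
            neighbour, score = w2v.most_similar(tokens[i], topn=1)[0]
            if score > similar_threshold:
                tokens[i] = neighbour
        except KeyError:
            continue  # word not in the embedding vocabulary
    return tokens
```
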
66 | Closing:
67 | Once again, thank you to the organizers, and congratulations on completing Aivivn's first Datathon. We hope future contests will attract even more support and more discussion both during and outside the contest, since we felt there was a lack of discussion about baseline models during the competition.
68 |
69 |
--------------------------------------------------------------------------------
/scripts/cnn.py:
--------------------------------------------------------------------------------
1 | from keras.models import Model
2 | from keras.layers import \
3 | Dense, Embedding, Input, \
4 | Conv1D, MaxPool1D, \
5 | Dropout, BatchNormalization, \
6 | Bidirectional, CuDNNLSTM, \
7 | Concatenate, Flatten, Add
8 | from .util import f1
9 | from .net_components import AdditiveLayer
10 |
11 |
12 |
13 | # Based on https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/
14 | # https://www.aclweb.org/anthology/D14-1181
15 | def TextCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False):
16 | if use_fasttext:
17 | inp = Input(shape=(maxlen, embed_size))
18 | x = inp
19 | else:
20 | inp = Input(shape = (maxlen, ))
21 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
22 |
23 | if use_additive_emb:
24 | x = AdditiveLayer()(x)
25 | x = Dropout(0.5)(x)
26 |
27 |
28 | conv_ops = []
29 | for filter_size in filter_sizes:
30 | conv = Conv1D(128, filter_size, activation = 'relu')(x)
31 | pool = MaxPool1D(5)(conv)
32 | conv_ops.append(pool)
33 |
34 | concat = Concatenate(axis = 1)(conv_ops)
35 | # concat = Dropout(0.1)(concat)
36 | concat = BatchNormalization()(concat)
37 |
38 |
39 | conv_2 = Conv1D(128, 5, activation = 'relu')(concat)
40 | conv_2 = MaxPool1D(5)(conv_2)
41 | conv_2 = BatchNormalization()(conv_2)
42 | # conv_2 = Dropout(0.1)(conv_2)
43 |
44 | conv_3 = Conv1D(128, 5, activation = 'relu')(conv_2)
45 | conv_3 = MaxPool1D(5)(conv_3)
46 | conv_3 = BatchNormalization()(conv_3)
47 | # conv_3 = Dropout(0.1)(conv_3)
48 |
49 |
50 | flat = Flatten()(conv_3)
51 |
52 | op = Dense(64, activation = "relu")(flat)
53 | # op = Dropout(0.5)(op)
54 | op = BatchNormalization()(op)
55 | op = Dense(1, activation = "sigmoid")(op)
56 |
57 | model = Model(inputs = inp, outputs = op)
58 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
59 | return model
60 |
61 |
62 | def VDCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False):
63 | if use_fasttext:
64 | inp = Input(shape=(maxlen, embed_size))
65 | x = inp
66 | else:
67 | inp = Input(shape = (maxlen, ))
68 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
69 |
70 | if use_additive_emb:
71 | x = AdditiveLayer()(x)
72 | x = Dropout(0.5)(x)
73 |
74 | conv_ops = []
75 | for filter_size in filter_sizes:
76 | conv = Conv1D(128, filter_size, activation = 'relu')(x)
77 | pool = MaxPool1D(5)(conv)
78 | conv_ops.append(pool)
79 |
80 | concat = Concatenate(axis = 1)(conv_ops)
81 | # concat = Dropout(0.1)(concat)
82 | concat = BatchNormalization()(concat)
83 |
84 |
85 | conv_2_main = Conv1D(128, 5, activation = 'relu', padding='same')(concat)
86 | conv_2_main = BatchNormalization()(conv_2_main)
87 | conv_2_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_2_main)
88 | conv_2_main = BatchNormalization()(conv_2_main)
89 | conv_2 = Add()([concat, conv_2_main])
90 | conv_2 = MaxPool1D(pool_size = 2, strides = 2)(conv_2)
91 | # conv_2 = BatchNormalization()(conv_2)
92 | # conv_2 = Dropout(0.1)(conv_2)
93 |
94 | conv_3_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_2)
95 | conv_3_main = BatchNormalization()(conv_3_main)
96 | conv_3_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_3_main)
97 | conv_3_main = BatchNormalization()(conv_3_main)
98 | conv_3 = Add()([conv_2, conv_3_main])
99 | conv_3 = MaxPool1D(pool_size = 2, strides = 2)(conv_3)
100 | # conv_3 = BatchNormalization()(conv_3)
101 | # conv_3 = Dropout(0.1)(conv_3)
102 |
103 |
104 | flat = Flatten()(conv_3)
105 |
106 | op = Dense(64, activation = "relu")(flat)
107 | # op = Dropout(0.5)(op)
108 | op = BatchNormalization()(op)
109 | op = Dense(1, activation = "sigmoid")(op)
110 |
111 | model = Model(inputs = inp, outputs = op)
112 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
113 | return model
114 |
115 |
116 |
117 | # Based on http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
118 | def LSTMCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False):
119 | if use_fasttext:
120 | inp = Input(shape=(maxlen, embed_size))
121 | x = inp
122 | else:
123 | inp = Input(shape = (maxlen, ))
124 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
125 |
126 | if use_additive_emb:
127 | x = AdditiveLayer()(x)
128 | x = Dropout(0.5)(x)
129 |
130 | x = Bidirectional(CuDNNLSTM(128, return_sequences = True))(x)
131 |
132 |
133 | conv_ops = []
134 | for filter_size in filter_sizes:
135 | conv = Conv1D(128, filter_size, activation = 'relu')(x)
136 | pool = MaxPool1D(5)(conv)
137 | conv_ops.append(pool)
138 |
139 | concat = Concatenate(axis = 1)(conv_ops)
140 | concat = Dropout(0.5)(concat)
141 | # concat = BatchNormalization()(concat)
142 |
143 |
144 | conv_2 = Conv1D(128, 5, activation = 'relu')(concat)
145 | conv_2 = MaxPool1D(5)(conv_2)
146 | # conv_2 = BatchNormalization()(conv_2)
147 | conv_2 = Dropout(0.5)(conv_2)
148 |
149 | # conv_3 = Conv1D(128, 5, activation = 'relu')(conv_2)
150 | # conv_3 = MaxPool1D(5)(conv_3)
151 | # conv_3 = BatchNormalization()(conv_3)
152 | # conv_3 = Dropout(0.1)(conv_3)
153 |
154 |
155 | flat = Flatten()(conv_2)
156 |
157 | op = Dense(64, activation = "relu")(flat)
158 | op = Dropout(0.5)(op)
159 | # op = BatchNormalization()(op)
160 | op = Dense(1, activation = "sigmoid")(op)
161 |
162 | model = Model(inputs = inp, outputs = op)
163 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
164 | return model
165 |
--------------------------------------------------------------------------------
/test_elmo.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, find_threshold
2 | import numpy as np
3 | from scripts.constant import DEFAULT_MAX_FEATURES
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import f1_score
6 | from elmoformanylangs import Embedder
7 | import tensorflow as tf
8 | import random as rn
9 | import pandas as pd
10 | import timeit
11 |
12 |
13 |
14 | from keras.models import Model, load_model, model_from_json
15 | from keras.utils import Sequence
16 | from keras.layers import Dense, Embedding, Input, GRU, Bidirectional, GlobalMaxPool1D, Dropout, Lambda
17 | from keras.callbacks import EarlyStopping, ModelCheckpoint
18 | import keras.backend as K
19 |
20 | # np random seed:
21 | np.random.seed(22)
22 |
23 | # # Setting the seed for python random numbers
24 | rn.seed(1254)
25 | #
26 | # # Setting the graph-level random seed.
27 | tf.set_random_seed(89)
28 |
29 | elmo_path = "./data/elmo/"
30 |
31 |
32 | batch_size = 16
33 | epochs = 100
34 |
35 |
36 |
37 | elmo = Embedder(elmo_path, batch_size = batch_size)
38 |
39 |
40 | def to_length(texts, length):
41 | def pad_func(vector, pad_width, iaxis, kwargs):
42 | str = kwargs.get('padder', '')
43 | vector[:pad_width[0]] = str
44 | vector[-pad_width[1]:] = str
45 | return vector
46 |
47 | ret = []
48 | for sentence in texts:
49 | sentence = np.array(sentence, dtype = np.unicode)
50 | sentence = sentence[:min(length, len(sentence))]
51 | if length > len(sentence):
52 | sentence = np.pad(
53 | sentence, mode = pad_func,
54 | pad_width = (0, length - len(sentence))
55 | )
56 | ret.append(sentence)
57 |
58 | return np.array(ret)
59 |
60 |
61 | class TrainSeq(Sequence):
62 | def __init__(self, X, y, batch_size):
63 | self._X, self._y = X, y
64 | self._batch_size = batch_size
65 | self._indices = np.arange(len(self._X))
66 |
67 | def __len__(self):
68 | return len(self._X) // self._batch_size
69 |
70 | def __getitem__(self, idx):
71 | id = self._indices[idx * self._batch_size:(idx + 1) * self._batch_size]
72 | return np.array(elmo.sents2elmo(self._X[id])), self._y[id]
73 |
74 | def on_epoch_end(self):
75 | np.random.shuffle(self._indices)
76 |
77 |
78 | class TestSeq(Sequence):
79 | def __init__(self, x, batch_size):
80 | self._X = x
81 | self._batch_size = batch_size
82 |
83 | def __len__(self):
84 |         return len(self._X) // self._batch_size
85 |
86 | def __getitem__(self, idx):
87 | return np.array(elmo.sents2elmo(self._X[idx * self._batch_size:(idx + 1) * self._batch_size]))
88 |
89 |
90 |
91 |
92 | def RNNKerasCPUNoEmbedding(embed_size = 1024, maxlen = 100):
93 | inp = Input(shape = (maxlen, embed_size))
94 | x = Bidirectional(GRU(256, return_sequences = True))(inp)
95 | x = Dropout(0.5)(x)
96 | x = Bidirectional(GRU(256, return_sequences = True))(x)
97 | x = Dropout(0.5)(x)
98 | x = GlobalMaxPool1D()(x)
99 | x = Dropout(0.5)(x)
100 | x = Dense(64, activation = "relu")(x)
101 | x = Dropout(0.5)(x)
102 | x = Dense(1, activation = "sigmoid")(x)
103 | model = Model(inputs = inp, outputs = x)
104 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
105 | return model
106 |
107 |
108 |
109 |
110 |
111 | def f1(y_true, y_pred):
112 | def recall(y_true, y_pred):
113 | """Recall metric.
114 |
115 | Only computes a batch-wise average of recall.
116 |
117 | Computes the recall, a metric for multi-label classification of
118 | how many relevant items are selected.
119 | """
120 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
121 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
122 | recall = true_positives / (possible_positives + K.epsilon())
123 | return recall
124 |
125 | def precision(y_true, y_pred):
126 | """Precision metric.
127 |
128 | Only computes a batch-wise average of precision.
129 |
130 | Computes the precision, a metric for multi-label classification of
131 | how many selected items are relevant.
132 | """
133 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
134 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
135 | precision = true_positives / (predicted_positives + K.epsilon())
136 | return precision
137 | precision = precision(y_true, y_pred)
138 | recall = recall(y_true, y_pred)
139 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
140 |
141 |
142 |
143 |
144 |
145 | data = read_file("./data/train.crash")
146 | data_test = read_file("./data/test.crash", is_train = False)
147 |
148 | labels = data["label"].values.astype(np.float16).reshape(-1, 1)
149 | texts = tokenize(data["text"])
150 | texts_test = tokenize(data_test["text"])
151 |
152 |
153 | texts = to_length(texts, 100)
154 | texts_test = to_length(texts_test, 100)
155 |
156 | texts_train, texts_val, labels_train, labels_val = train_test_split(
157 | texts, labels,
158 | test_size = 0.05
159 | )
160 |
161 |
162 | checkpoint = ModelCheckpoint(
163 | filepath = "./Weights/model_elmo.hdf5",
164 | monitor = 'val_f1', verbose = 1,
165 | mode = 'max',
166 | save_best_only = True
167 | )
168 | early = EarlyStopping(monitor = "val_f1", mode = "max", patience = 3)
169 | callbacks_list = [checkpoint, early]
170 |
171 | train_seq = TrainSeq(texts_train, labels_train, batch_size = batch_size)
172 | val_seq = TrainSeq(texts_val, labels_val, batch_size = 1)
173 | test_seq = TestSeq(texts_test, batch_size = 1)
174 |
175 |
176 | model = RNNKerasCPUNoEmbedding()
177 | model.fit_generator(
178 | train_seq,
179 | validation_data = val_seq,
180 | callbacks = callbacks_list,
181 | epochs = epochs,
182 | workers = False
183 | )
184 |
185 |
186 |
187 |
188 | model.load_weights("./Weights/model_elmo.hdf5")
189 | prediction_prob = model.predict_generator(val_seq, workers = False)
190 |
191 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
192 | print(OPTIMAL_THRESHOLD)
193 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
194 | print(f1_score(
195 | y_true = labels_val.reshape(-1),
196 | y_pred = prediction.reshape(-1)
197 | ))
198 |
199 |
200 |
201 | prediction_test = model.predict_generator(test_seq, workers = False)
202 | df_predicton = pd.read_csv("./data/sample_submission.csv")
203 | df_predicton["label"] = (prediction_test > OPTIMAL_THRESHOLD).astype(np.int8)
204 | df_predicton.to_csv("./prediction/prediction_elmo.csv", index = False)
--------------------------------------------------------------------------------
/main_stack_hier.py:
--------------------------------------------------------------------------------
1 | from scripts.util import \
2 | read_file, \
3 | tokenize, make_embedding, text_to_sequences, \
4 | sent_embedding, sent_tokenize, text_sents_to_sequences, f1
5 | from scripts.constant import DEFAULT_MAX_FEATURES
6 | from sklearn.model_selection import train_test_split
7 | from scripts.rnn import SARNNKeras, HARNN, AttLayer, RNNKeras, OriginalHARNN, AdditiveLayer
8 | from scripts.cnn import VDCNN, TextCNN, LSTMCNN
9 | from scripts.stack import StackedGeneralizerWithHier
10 | import argparse
11 | import os
12 | import numpy as np
13 | import datetime
14 | import pandas as pd
15 | from sklearn.metrics import f1_score
16 |
17 | from sklearn.linear_model import LogisticRegression
18 | from sklearn.ensemble import RandomForestClassifier
19 | from sklearn.neural_network import MLPClassifier
20 |
21 | from keras.utils import CustomObjectScope
22 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
23 |
24 |
25 |
26 | def stack(models_list, hier_models_list, embedding_path, max_features, should_mix):
27 | model_name = '-'.join(
28 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
29 |
30 | train_data = read_file('./data/train.crash')
31 | test_data = read_file('./data/test.crash', is_train=False)
32 |
33 | train_tokenized_texts = tokenize(train_data['text'])
34 | test_tokenizes_texts = tokenize(test_data['text'])
35 |
36 | train_tokenized_texts_sent = sent_tokenize(train_data['text'])
37 | test_tokenizes_texts_sent = sent_tokenize(test_data['text'])
38 |
39 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
40 |
41 | embed_size, word_map, embedding_mat = make_embedding(
42 | list(train_tokenized_texts) +
43 | list(test_tokenizes_texts) if should_mix else train_tokenized_texts,
44 | embedding_path,
45 | max_features
46 | )
47 |
48 | embed_size_sent, word_map_sent, embedding_mat_sent = sent_embedding(
49 | list(train_tokenized_texts_sent) +
50 | list(test_tokenizes_texts_sent) if should_mix else train_tokenized_texts_sent,
51 | embedding_path,
52 | max_features
53 | )
54 |
55 |
56 | texts_id = text_to_sequences(train_tokenized_texts, word_map)
57 | texts_id_sent = text_sents_to_sequences(
58 | train_tokenized_texts_sent,
59 | word_map_sent,
60 | max_nb_sent = 3,
61 | max_sent_len = 50
62 | )
63 | print('Number of train data: {}'.format(labels.shape))
64 |
65 | texts_id_train, texts_id_val, texts_id_sent_train, texts_id_sent_val, labels_train, labels_val = train_test_split(
66 | texts_id, texts_id_sent, labels, test_size=0.05)
67 |
68 | model_path = './models/{}-version'.format(model_name)
69 |
70 | try:
71 | os.mkdir('./models')
72 | except:
73 | print('Folder already created')
74 | try:
75 | os.mkdir(model_path)
76 | except:
77 | print('Folder already created')
78 |
79 | batch_size = 16
80 | epochs = 100
81 | patience = 3
82 |
83 | # meta_model = RandomForestClassifier (
84 | # n_estimators=200,
85 | # criterion="entropy",
86 | # max_depth=5,
87 | # max_features=0.5
88 | # )
89 | # meta_model = MLPClassifier(
90 | # hidden_layer_sizes = (10),
91 | # early_stopping = True,
92 | # validation_fraction = 0.05,
93 | # batch_size = batch_size,
94 | # n_iter_no_change = patience
95 | # )
96 | meta_model = LogisticRegression()
97 |
98 | models = [
99 | model(
100 | embeddingMatrix=embedding_mat,
101 | embed_size=embed_size,
102 | max_features=embedding_mat.shape[0]
103 | )
104 | for model in models_list
105 | ]
106 |
107 | hier_models = [
108 | model(
109 | embeddingMatrix=embedding_mat_sent,
110 | embed_size=embed_size_sent,
111 | max_features=embedding_mat_sent.shape[0],
112 | max_nb_sent = 3,
113 | max_sent_len = 50
114 | )
115 | for model in hier_models_list
116 | ]
117 |
118 |
119 |
120 | stack = StackedGeneralizerWithHier(models, hier_models, meta_model)
121 | stack.train_meta_model(
122 | X = texts_id_train, y = labels_train,
123 | X_val = texts_id_val, y_val = labels_val,
124 | X_hier = texts_id_sent_train, X_hier_val = texts_id_sent_val,
125 | model_path = model_path,
126 | epochs = epochs,
127 | batch_size = batch_size,
128 | patience = patience
129 | )
130 |
131 | stack.train_models(
132 | X = texts_id_train, y = labels_train,
133 | X_val = texts_id_val, y_val = labels_val,
134 | X_hier = texts_id_sent_train, X_hier_val = texts_id_sent_val,
135 | batch_size = batch_size,
136 | epochs = epochs,
137 | patience = patience,
138 | model_path = model_path
139 | )
140 |
141 | prediction = stack.predict(texts_id_val, texts_id_sent_val)
142 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
143 | with open('{}/f1'.format(model_path), 'w') as fp:
144 | fp.write(str(f1_score(prediction, labels_val)))
145 |
146 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map)
147 | test_id_texts_sent = text_sents_to_sequences(test_tokenizes_texts_sent, word_map_sent, 3, 50)
148 | test_prediction = stack.predict(test_id_texts, test_id_texts_sent)
149 |
150 | df_predicton = pd.read_csv("./data/sample_submission.csv")
151 | df_predicton["label"] = test_prediction
152 |
153 | print('Number of test data: {}'.format(df_predicton.shape[0]))
154 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
155 |
156 |
157 |
158 | if __name__ == '__main__':
159 | models_list = [
160 | VDCNN, TextCNN, SARNNKeras, RNNKeras
161 | ]
162 | hier_models_list = [
163 | OriginalHARNN, HARNN
164 | ]
165 | parser = argparse.ArgumentParser()
166 | parser.add_argument(
167 | '-e',
168 | '--embedding',
169 |         help='Path to the pretrained embedding file',
170 | default='./embeddings/smallFasttext.vi.vec'
171 | )
172 | parser.add_argument(
173 | '--max',
174 |         help='Maximum number of words in the embedding vocabulary',
175 | default=DEFAULT_MAX_FEATURES
176 | )
177 | parser.add_argument(
178 | '--mix',
179 | action='store_true',
180 |         help='Build the vocabulary from train + test texts'
181 | )
182 | args = parser.parse_args()
183 | with CustomObjectScope({
184 | 'SeqSelfAttention': SeqSelfAttention,
185 | 'SeqWeightedAttention': SeqWeightedAttention,
186 | 'AttLayer': AttLayer,
187 | 'AdditiveLayer': AdditiveLayer,
188 | 'f1': f1
189 | }):
190 | stack(models_list, hier_models_list, args.embedding,
191 | int(args.max), args.mix)
192 |
--------------------------------------------------------------------------------
/main_elmo.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize
2 | from sklearn.model_selection import train_test_split
3 | from keras.callbacks import EarlyStopping, ModelCheckpoint
4 | from scripts.rnn import RNNKeras, RNNKerasCPU, LSTMKeras, SARNNKerasCPU, SARNNKeras
5 | from scripts.cnn import TextCNN, LSTMCNN, VDCNN
6 | import argparse
7 | import os
8 | import numpy as np
9 | import datetime
10 | import pandas as pd
11 | from scripts.util import find_threshold
12 | from sklearn.metrics import f1_score
13 | from keras.utils import Sequence
14 | from elmoformanylangs import Embedder
15 |
16 |
17 |
18 |
19 | def train_model(model, embedding_path, should_find_threshold, return_prob, use_additive_emb):
20 | batch_size = 16
21 | epochs = 100
22 | max_len = 100
23 |
24 | def to_length(texts, length):
25 | def pad_func(vector, pad_width, iaxis, kwargs):
26 | str = kwargs.get('padder', '')
27 | vector[:pad_width[0]] = str
28 | vector[-pad_width[1]:] = str
29 | return vector
30 |
31 | ret = []
32 | for sentence in texts:
33 | sentence = np.array([token.replace("_", " ") for token in sentence], dtype=np.unicode)
34 | sentence = sentence[:min(length, len(sentence))]
35 | if length > len(sentence):
36 | sentence = np.pad(
37 | sentence, mode=pad_func,
38 | pad_width=(0, length - len(sentence))
39 | )
40 | ret.append(sentence)
41 |
42 | return np.array(ret)
43 |
44 | class TrainSeq(Sequence):
45 | def __init__(self, X, y, batch_size, elmo):
46 | self._X, self._y = X, y
47 | self._batch_size = batch_size
48 | self._indices = np.arange(len(self._X))
49 | self._elmo = elmo
50 |
51 | def __len__(self):
52 | return int(np.ceil(len(self._X) / float(self._batch_size)))
53 |
54 | def __getitem__(self, idx):
55 | id = self._indices[idx * self._batch_size:(idx + 1) * self._batch_size]
56 | return np.array(self._elmo.sents2elmo(self._X[id])), self._y[id]
57 |
58 | def on_epoch_end(self):
59 | np.random.shuffle(self._indices)
60 |
61 | class TestSeq(Sequence):
62 | def __init__(self, x, batch_size, elmo):
63 | self._X = x
64 | self._batch_size = batch_size
65 | self._elmo = elmo
66 |
67 | def __len__(self):
68 | return int(np.ceil(len(self._X) / float(self._batch_size)))
69 |
70 | def __getitem__(self, idx):
71 | return np.array(self._elmo.sents2elmo(self._X[idx * self._batch_size:(idx + 1) * self._batch_size]))
72 |
73 | model_name = '-'.join(
74 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
75 |
76 | elmo = Embedder(embedding_path, batch_size=batch_size)
77 |
78 | train_data = read_file('./data/train.crash')
79 | test_data = read_file('./data/test.crash', is_train=False)
80 | train_tokenized_texts = tokenize(train_data['text'])
81 | test_tokenizes_texts = tokenize(test_data['text'])
82 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
83 |
84 | texts = to_length(train_tokenized_texts, max_len)
85 | texts_test = to_length(test_tokenizes_texts, max_len)
86 |
87 | print('Number of train data: {}'.format(labels.shape))
88 |
89 | texts_train, texts_val, labels_train, labels_val = train_test_split(
90 | texts, labels,
91 | test_size=0.05
92 | )
93 |
94 | model_path = './models/{}-version'.format(model_name)
95 |
96 | try:
97 | os.mkdir('./models')
98 | except:
99 | print('Folder already created')
100 | try:
101 | os.mkdir(model_path)
102 | except:
103 | print('Folder already created')
104 |
105 | checkpoint = ModelCheckpoint(
106 | filepath='{}/models.hdf5'.format(model_path),
107 | monitor='val_f1', verbose=1,
108 | mode='max',
109 | save_best_only=True
110 | )
111 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5)
112 | callbacks_list = [checkpoint, early]
113 |
114 | train_seq = TrainSeq(texts_train, labels_train, batch_size=batch_size, elmo = elmo)
115 | val_seq = TrainSeq(texts_val, labels_val, batch_size=min(batch_size, len(texts_val)), elmo = elmo)
116 | test_seq = TestSeq(texts_test, batch_size=min(batch_size, len(texts_test)), elmo = elmo)
117 |
118 | model = model(
119 | maxlen = max_len,
120 | embed_size=1024,
121 | use_fasttext = True,
122 | use_additive_emb = use_additive_emb
123 | )
124 | model.fit_generator(
125 | train_seq,
126 | validation_data=val_seq,
127 | callbacks=callbacks_list,
128 | epochs=epochs,
129 | workers=False
130 | )
131 |
132 | model.load_weights('{}/models.hdf5'.format(model_path))
133 | prediction_prob = model.predict_generator(val_seq, workers=False)
134 | if should_find_threshold:
135 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
136 | else:
137 | OPTIMAL_THRESHOLD = 0.5
138 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD))
139 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
140 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
141 | with open('{}/f1'.format(model_path), 'w') as fp:
142 | fp.write(str(f1_score(prediction, labels_val)))
143 |
144 | test_prediction = model.predict_generator(test_seq, workers=False)
145 |
146 | df_predicton = pd.read_csv("./data/sample_submission.csv")
147 | if return_prob:
148 | df_predicton["label"] = test_prediction
149 | else:
150 | df_predicton["label"] = (
151 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8)
152 |
153 | print('Number of test data: {}'.format(df_predicton.shape[0]))
154 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
155 |
156 |
157 | model_dict = {
158 | 'RNNKeras': RNNKeras,
159 | 'RNNKerasCPU': RNNKerasCPU,
160 | 'LSTMKeras': LSTMKeras,
161 | 'SARNNKerasCPU': SARNNKerasCPU,
162 | 'SARNNKeras': SARNNKeras,
163 | 'TextCNN': TextCNN,
164 | 'LSTMCNN': LSTMCNN,
165 | 'VDCNN': VDCNN
166 | }
167 |
168 | if __name__ == '__main__':
169 | parser = argparse.ArgumentParser()
170 | parser.add_argument(
171 | '-m',
172 | '--model',
173 |         help='Model to use',
174 | default='RNNKerasCPU'
175 | )
176 | parser.add_argument(
177 | '-e',
178 | '--embedding',
179 |         help='Path to the pretrained ELMo model directory',
180 | default='./embeddings/smallFasttext.vi.vec'
181 | )
182 | parser.add_argument(
183 | '--find_threshold',
184 | action='store_true',
185 |         help='Search for the best decision threshold on the validation set'
186 | )
187 | parser.add_argument(
188 | '--prob',
189 | action='store_true',
190 |         help='Write probabilities instead of hard labels to the submission file'
191 | )
192 | parser.add_argument(
193 | '--add_embed',
194 | action='store_true',
195 |         help='Add a learned additive bias to the embeddings'
196 | )
197 | args = parser.parse_args()
198 | if not args.model in model_dict:
199 | raise RuntimeError('Model not found')
200 | train_model(model_dict[args.model], args.embedding, args.find_threshold, args.prob, args.add_embed)
201 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences
2 | from scripts.rnn import RNNKeras
3 | from scripts.constant import DEFAULT_MAX_FEATURES
4 | from sklearn.model_selection import train_test_split
5 | from keras.callbacks import EarlyStopping, ModelCheckpoint
6 | from scripts.rnn import RNNKeras, RNNKerasCPU, LSTMKeras, SARNNKerasCPU, SARNNKeras
7 | from scripts.cnn import TextCNN, LSTMCNN, VDCNN
8 | import argparse
9 | import os
10 | import numpy as np
11 | import datetime
12 | import pandas as pd
13 | from scripts.util import find_threshold
14 | from scripts.augment import similar_augment, create_sim_dict, similar_augment_from_sim_dict
15 | from sklearn.metrics import f1_score
16 | from keras.utils.vis_utils import plot_model
17 |
18 |
19 | def train_model(
20 | model, embedding_path, annoy_path,
21 | max_features, should_find_threshold, should_mix,
22 | return_prob, trainable, use_additive_emb, augment_size, use_sim_dict,
23 | print_model, model_high
24 | ):
25 | model_name = '-'.join(
26 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
27 |
28 | augment_size = int(augment_size)
29 |
30 | train_data = read_file('./data/train.crash')
31 | test_data = read_file('./data/test.crash', is_train=False)
32 | train_tokenized_texts = tokenize(train_data['text'])
33 | test_tokenizes_texts = tokenize(test_data['text'])
34 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
35 |
36 | train_tokenized_texts, val_tokenized_texts, labels_train, labels_val = train_test_split(
37 | train_tokenized_texts, labels, test_size = 0.05
38 | )
39 |
40 |
41 | if augment_size != 0 and not use_sim_dict:
42 | if augment_size < 0:
43 | augment_size = len(train_tokenized_texts) * (-augment_size)
44 |
45 | print(augment_size)
46 |
47 | train_tokenized_texts, labels_train = similar_augment(
48 | train_tokenized_texts,
49 | labels_train,
50 | n_increase = augment_size,
51 | model_path = embedding_path,
52 | n_word_replace = 10,
53 | use_annoy=True,
54 | annoy_path=annoy_path
55 | )
56 |
57 |
58 | embed_size, word_map, embedding_mat = make_embedding(
59 | list(train_tokenized_texts) + list(val_tokenized_texts) +
60 | list(test_tokenizes_texts) if should_mix else list(train_tokenized_texts) + list(val_tokenized_texts),
61 | embedding_path,
62 | max_features
63 | )
64 |
65 | texts_id_train = text_to_sequences(train_tokenized_texts, word_map)
66 |
67 | if augment_size != 0 and use_sim_dict:
68 | if augment_size < 0:
69 | augment_size = len(train_tokenized_texts) * (-augment_size)
70 | sim_dict = create_sim_dict(word_map, model_path = embedding_path, annoy_path = annoy_path)
71 | print("Finish Creating sim dict")
72 | texts_id_train, labels_train = similar_augment_from_sim_dict(
73 | texts_id_train, labels_train, sim_dict,
74 | n_increase = augment_size
75 | )
76 |
77 | texts_id_val = text_to_sequences(val_tokenized_texts, word_map)
78 | print('Number of train data: {}'.format(labels.shape))
79 |
80 | # texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
81 | # texts_id, labels, test_size=0.05)
82 |
83 | model_path = './models/{}-version'.format(model_name)
84 |
85 | try:
86 | os.mkdir('./models')
87 | except:
88 | print('Folder already created')
89 | try:
90 | os.mkdir(model_path)
91 | except:
92 | print('Folder already created')
93 |
94 | checkpoint = ModelCheckpoint(
95 | filepath='{}/models.hdf5'.format(model_path),
96 | monitor='val_f1', verbose=1,
97 | mode='max',
98 | save_best_only=True
99 | )
100 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5)
101 | callbacks_list = [checkpoint, early]
102 | batch_size = 16
103 | epochs = 100
104 |
105 | model = model(
106 | embeddingMatrix=embedding_mat,
107 | embed_size=embed_size,
108 | max_features=embedding_mat.shape[0],
109 | trainable = trainable,
110 | use_additive_emb = use_additive_emb
111 | )
112 | if print_model:
113 | plot_model(model, to_file='{}.png'.format(model_high), show_shapes=True, show_layer_names=True)
114 | return
115 | model.fit(
116 | texts_id_train, labels_train,
117 | validation_data=(texts_id_val, labels_val),
118 | callbacks=callbacks_list,
119 | epochs=epochs,
120 | batch_size=batch_size
121 | )
122 |
123 | model.load_weights('{}/models.hdf5'.format(model_path))
124 | prediction_prob = model.predict(texts_id_val)
125 | if should_find_threshold:
126 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
127 | else:
128 | OPTIMAL_THRESHOLD = 0.5
129 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD))
130 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
131 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
132 | with open('{}/f1'.format(model_path), 'w') as fp:
133 | fp.write(str(f1_score(prediction, labels_val)))
134 |
135 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map)
136 | test_prediction = model.predict(test_id_texts)
137 |
138 | df_predicton = pd.read_csv("./data/sample_submission.csv")
139 | if return_prob:
140 | df_predicton["label"] = test_prediction
141 | else:
142 | df_predicton["label"] = (
143 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8)
144 |
145 | print('Number of test data: {}'.format(df_predicton.shape[0]))
146 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
147 |
148 |
149 | model_dict = {
150 | 'RNNKeras': RNNKeras,
151 | 'RNNKerasCPU': RNNKerasCPU,
152 | 'LSTMKeras': LSTMKeras,
153 | 'SARNNKerasCPU': SARNNKerasCPU,
154 | 'SARNNKeras': SARNNKeras,
155 | 'TextCNN': TextCNN,
156 | 'LSTMCNN': LSTMCNN,
157 | 'VDCNN': VDCNN
158 | }
159 |
160 | if __name__ == '__main__':
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument(
163 | '-m',
164 | '--model',
165 |         help='Model to use',
166 | default='RNNKerasCPU'
167 | )
168 | parser.add_argument(
169 | '-e',
170 | '--embedding',
171 |         help='Path to the word-embedding file (.vec or .bin)',
172 | default='./embeddings/smallFasttext.vi.vec'
173 | )
174 | parser.add_argument(
175 | '-annoy',
176 | '--annoy',
177 |         help='Path to the annoy index used for similarity-based augmentation',
178 | default='./embeddings/annoy.pkl'
179 | )
180 | parser.add_argument(
181 | '--max',
182 |         help='Maximum vocabulary size (max_features)',
183 | default=DEFAULT_MAX_FEATURES
184 | )
185 | parser.add_argument(
186 | '--aug',
187 |         help='Number of augmented samples to add (negative means a multiple of the training set)',
188 | default=0
189 | )
190 | parser.add_argument(
191 | '--use_sim_dict',
192 | action='store_true',
193 |         help='Augment using a precomputed similarity dictionary'
194 | )
195 | parser.add_argument(
196 | '--find_threshold',
197 | action='store_true',
198 |         help='Search for the optimal decision threshold on the validation set'
199 | )
200 | parser.add_argument(
201 | '--mix',
202 | action='store_true',
203 |         help='Include the test texts when building the embedding vocabulary'
204 | )
205 | parser.add_argument(
206 | '--prob',
207 | action='store_true',
208 |         help='Write probabilities instead of hard labels to the submission file'
209 | )
210 | parser.add_argument(
211 | '--fix_embed',
212 | action='store_false',
213 |         help='Freeze the embedding layer (disable fine-tuning)'
214 | )
215 | parser.add_argument(
216 | '--add_embed',
217 | action='store_true',
218 |         help='Add a trainable additive embedding layer (AdditiveLayer)'
219 | )
220 | parser.add_argument(
221 | '--print_model',
222 | action='store_true',
223 |         help='Plot the model architecture to a PNG and exit'
224 | )
225 | args = parser.parse_args()
226 |     if args.model not in model_dict:
227 | raise RuntimeError('Model not found')
228 | train_model(model_dict[args.model], args.embedding, args.annoy,
229 | int(args.max), args.find_threshold, args.mix, args.prob, args.fix_embed, args.add_embed, args.aug,
230 | args.use_sim_dict, args.print_model, args.model)
231 |
--------------------------------------------------------------------------------
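For reference, the command-line entry point above is a thin wrapper around `train_model`. The sketch below is an illustration only (not part of the repository): it mirrors the parser defaults and assumes the `./data` and `./embeddings` layout described in the README, with the project root on the Python path.

```python
# Hedged sketch: programmatic equivalent of running `main.py` with its default flags.
from main import train_model
from scripts.constant import DEFAULT_MAX_FEATURES
from scripts.rnn import RNNKerasCPU

train_model(
    RNNKerasCPU,                          # model                 (--model default)
    './embeddings/smallFasttext.vi.vec',  # embedding_path        (--embedding)
    './embeddings/annoy.pkl',             # annoy_path            (--annoy)
    DEFAULT_MAX_FEATURES,                 # max_features          (--max)
    False,                                # should_find_threshold (--find_threshold)
    False,                                # should_mix            (--mix)
    False,                                # return_prob           (--prob)
    True,                                 # trainable             (False only when --fix_embed is passed)
    False,                                # use_additive_emb      (--add_embed)
    0,                                    # augment_size          (--aug)
    False,                                # use_sim_dict          (--use_sim_dict)
    False,                                # print_model           (--print_model)
    'RNNKerasCPU',                        # model_high, used only to name the plotted PNG
)
```
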
/main_hierarchical.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, sent_tokenize, sent_embedding, text_sents_to_sequences
2 | from scripts.constant import DEFAULT_MAX_FEATURES
3 | from sklearn.model_selection import train_test_split
4 | from keras.callbacks import EarlyStopping, ModelCheckpoint
5 | from scripts.rnn import HRNN, HRNNCPU, OriginalHARNN, OriginalHARNNCPU, HARNN, HARNNCPU
6 | import argparse
7 | import os
8 | import numpy as np
9 | import datetime
10 | import pandas as pd
11 | from scripts.util import find_threshold
12 | from scripts.augment import shuffle_augment
13 | from sklearn.metrics import f1_score
14 | from keras.utils.vis_utils import plot_model
15 |
16 |
17 | def train_model(
18 | model, embedding_path,
19 | max_features, max_nb_sent, max_sent_len,
20 | should_find_threshold, should_mix,
21 | return_prob, trainable, use_additive_emb, augment_size, aug_min_len, print_model, model_high
22 | ):
23 | model_name = '-'.join(
24 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
25 |
26 | train_data = read_file('./data/train.crash')
27 | test_data = read_file('./data/test.crash', is_train=False)
28 | train_tokenized_texts = sent_tokenize(train_data['text'])
29 | test_tokenizes_texts = sent_tokenize(test_data['text'])
30 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
31 |
32 | train_tokenized_texts, val_tokenized_texts, labels_train, labels_val = train_test_split(
33 | train_tokenized_texts, labels, test_size=0.05
34 | )
35 |
36 | augment_size = int(augment_size)
37 | aug_min_len = int(aug_min_len)
38 | max_nb_sent = int(max_nb_sent)
39 | max_sent_len = int(max_sent_len)
40 |
41 | if augment_size != 0:
42 | if augment_size < 0:
43 | augment_size = len(train_tokenized_texts) * (-augment_size)
44 |
45 | print(augment_size)
46 |
47 | train_tokenized_texts, labels_train = shuffle_augment(
48 | train_tokenized_texts,
49 | labels_train,
50 | n_increase = augment_size,
51 | min_length = aug_min_len
52 | )
53 |
54 | embed_size, word_map, embedding_mat = sent_embedding(
55 | list(train_tokenized_texts) + list(val_tokenized_texts) +
56 | list(test_tokenizes_texts) if should_mix
57 | else list(train_tokenized_texts) + list(val_tokenized_texts),
58 | embedding_path,
59 | max_features
60 | )
61 |
62 | texts_id_train = text_sents_to_sequences(
63 | train_tokenized_texts,
64 | word_map,
65 | max_nb_sent = max_nb_sent,
66 | max_sent_len = max_sent_len
67 | )
68 |
69 | texts_id_val = text_sents_to_sequences(
70 | val_tokenized_texts,
71 | word_map,
72 | max_nb_sent = max_nb_sent,
73 | max_sent_len = max_sent_len
74 | )
75 |
76 |
77 | # texts_id = text_sents_to_sequences(
78 | # train_tokenized_texts,
79 | # word_map,
80 | # max_nb_sent = max_nb_sent,
81 | # max_sent_len = max_sent_len
82 | # )
83 | print('Number of train data: {}'.format(labels.shape))
84 |
85 | # texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
86 | # texts_id, labels, test_size=0.05)
87 |
88 | model_path = './models/{}-version'.format(model_name)
89 |
90 | try:
91 | os.mkdir('./models')
92 |     except FileExistsError:
93 | print('Folder already created')
94 | try:
95 | os.mkdir(model_path)
96 |     except FileExistsError:
97 | print('Folder already created')
98 |
99 | checkpoint = ModelCheckpoint(
100 | filepath='{}/models.hdf5'.format(model_path),
101 | monitor='val_f1', verbose=1,
102 | mode='max',
103 | save_best_only=True
104 | )
105 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5)
106 | callbacks_list = [checkpoint, early]
107 | batch_size = 16
108 | epochs = 100
109 |
110 | model = model(
111 | embeddingMatrix=embedding_mat,
112 | embed_size=embed_size,
113 | max_features=embedding_mat.shape[0],
114 | max_nb_sent = max_nb_sent,
115 | max_sent_len = max_sent_len,
116 | trainable = trainable,
117 | use_additive_emb = use_additive_emb
118 | )
119 | if print_model:
120 | plot_model(model, to_file='{}.png'.format(model_high), show_shapes=True, show_layer_names=True)
121 | return
122 | model.fit(
123 | texts_id_train, labels_train,
124 | validation_data=(texts_id_val, labels_val),
125 | callbacks=callbacks_list,
126 | epochs=epochs,
127 | batch_size=batch_size
128 | )
129 |
130 | model.load_weights('{}/models.hdf5'.format(model_path))
131 | prediction_prob = model.predict(texts_id_val)
132 |
133 | if should_find_threshold:
134 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
135 | else:
136 | OPTIMAL_THRESHOLD = 0.5
137 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD))
138 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
139 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
140 | with open('{}/f1'.format(model_path), 'w') as fp:
141 | fp.write(str(f1_score(prediction, labels_val)))
142 |
143 | test_id_texts = text_sents_to_sequences(
144 | test_tokenizes_texts,
145 | word_map,
146 | max_nb_sent = max_nb_sent,
147 | max_sent_len = max_sent_len
148 | )
149 | test_prediction = model.predict(test_id_texts)
150 |
151 | df_predicton = pd.read_csv("./data/sample_submission.csv")
152 |
153 | if return_prob:
154 | df_predicton["label"] = test_prediction
155 | else:
156 | df_predicton["label"] = (
157 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8)
158 | print('Number of test data: {}'.format(df_predicton.shape[0]))
159 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
160 |
161 |
162 | model_dict = {
163 | 'HRNN': HRNN,
164 | 'HRNNCPU': HRNNCPU,
165 | 'HARNN': HARNN,
166 | 'HARNNCPU': HARNNCPU,
167 | 'OriginalHARNN': OriginalHARNN,
168 | 'OriginalHARNNCPU':OriginalHARNNCPU
169 | }
170 |
171 | if __name__ == '__main__':
172 | parser = argparse.ArgumentParser()
173 | parser.add_argument(
174 | '-m',
175 | '--model',
176 |         help='Model to use',
177 | default='HRNN'
178 | )
179 | parser.add_argument(
180 | '-e',
181 | '--embedding',
182 |         help='Path to the word-embedding file (.vec or .bin)',
183 | default='./embeddings/smallFasttext.vi.vec'
184 | )
185 | parser.add_argument(
186 | '--max',
187 |         help='Maximum vocabulary size (max_features)',
188 | default=DEFAULT_MAX_FEATURES
189 | )
190 | parser.add_argument(
191 | '--nb_sent',
192 |         help='Maximum number of sentences kept per document',
193 | default=3
194 | )
195 | parser.add_argument(
196 | '--sent_len',
197 |         help='Maximum number of tokens kept per sentence',
198 | default=50
199 | )
200 | parser.add_argument(
201 | '--aug',
202 |         help='Number of augmented samples to add (negative means a multiple of the training set)',
203 | default=0
204 | )
205 | parser.add_argument(
206 | '--aug_min_len',
207 |         help='Minimum number of sentences a document needs for shuffle augmentation',
208 | default=1
209 | )
210 | parser.add_argument(
211 | '--find_threshold',
212 | action='store_true',
213 | help='Model use'
214 | )
215 | parser.add_argument(
216 | '--mix',
217 | action='store_true',
218 | help='Model use'
219 | )
220 | parser.add_argument(
221 | '--prob',
222 | action='store_true',
223 | help='Model use'
224 | )
225 | parser.add_argument(
226 | '--fix_embed',
227 | action='store_false',
228 | help='Model use'
229 | )
230 | parser.add_argument(
231 | '--add_embed',
232 | action='store_true',
233 | help='Model use'
234 | )
235 | parser.add_argument(
236 | '--print_model',
237 | action='store_true',
238 | help='Model use'
239 | )
240 | args = parser.parse_args()
241 |     if args.model not in model_dict:
242 | raise RuntimeError('Model not found')
243 | train_model(
244 | model_dict[args.model], args.embedding,
245 | int(args.max), args.nb_sent, args.sent_len,
246 | args.find_threshold, args.mix, args.prob,
247 | args.fix_embed, args.add_embed, args.aug, args.aug_min_len, args.print_model, args.model
248 | )
249 |
--------------------------------------------------------------------------------
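The hierarchical driver differs from `main.py` mainly in how texts are encoded: each review becomes a fixed-size matrix of word ids, one row per sentence. A toy illustration of what `text_sents_to_sequences` (defined in `scripts/util.py`) produces is shown below; the vocabulary and indices are made up for the example.

```python
# Toy example: one review with two sentences, padded to (max_nb_sent, max_sent_len).
from collections import defaultdict
from scripts.util import text_sents_to_sequences

# Hypothetical word map; in the real pipeline sent_embedding() builds it and
# sends out-of-vocabulary words to the mean-embedding row.
word_map = defaultdict(lambda: 5, {'phim': 1, 'hay': 2, 'qua': 3, 'dien_vien': 4})

docs = [[['phim', 'hay', 'qua'], ['dien_vien', 'ok']]]  # one review, two sentences
X = text_sents_to_sequences(docs, word_map, max_nb_sent=3, max_sent_len=5)
print(X.shape)  # (1, 3, 5): this 3-D tensor is what the TimeDistributed encoders consume
print(X[0])
```
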
/scripts/util.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import copy
3 | import os
4 | import numpy as np
5 | import re
6 | import keras.backend as K
7 |
8 | from tqdm import tqdm
9 | from collections import defaultdict
10 | from os.path import abspath
11 | from spacy.lang.vi import Vietnamese
12 | from .constant import DEFAULT_MAX_LENGTH
13 | from gensim.models.keyedvectors import KeyedVectors
14 | from sklearn.metrics import f1_score
15 | import string
16 |
17 |
18 | def split_array(arr, condition):
19 | if len(arr) == 0:
20 | return []
21 | result = []
22 | accumulated = [arr[0]]
23 | for ele in arr[1:]:
24 | if condition(ele):
25 | result.append(copy.deepcopy(accumulated))
26 | accumulated = [copy.deepcopy(ele)]
27 | else:
28 | accumulated.append(copy.deepcopy(ele))
29 | result.append(copy.deepcopy(accumulated))
30 | return result
31 |
32 |
33 | def read_file(file_path, is_train=True):
34 | file_path = abspath(file_path)
35 | data_lines = list(
36 | filter(lambda x: x != '', open(file_path).read().split('\n')))
37 | pattern = ('train' if is_train else 'test') + '_[0-9]{5}'
38 | datas = split_array(data_lines, lambda x: bool(re.match(pattern, x)))
39 | if is_train:
40 | result_array = list(map(
41 | lambda x: [x[0], ' '.join(x[1:-1]), int(x[-1])], datas))
42 | else:
43 | result_array = list(map(lambda x: [x[0], ' '.join(x[1:])], datas))
44 | columns = ['name', 'text', 'label'] if is_train else ['name', 'text']
45 | return pd.DataFrame(result_array, columns=columns)
46 |
47 |
48 | def tokenize(texts):
49 | nlp = Vietnamese()
50 | docs = []
51 | for text in texts:
52 | tokens = np.array([postprocess_token(token.text) for token in nlp(text.lower())[1:-1]])
53 | docs.append(tokens)
54 |
55 | return docs
56 |
57 |
58 | def postprocess_token(token):
59 | if token in string.punctuation:
60 | return ''
61 | elif token.isdigit():
62 | return ''
63 | else:
64 | return token
65 |
66 |
67 |
68 | def make_embedding(texts, embedding_path, max_features):
69 | embedding_path = abspath(embedding_path)
70 |
71 | def get_coefs(word, *arr):
72 | return word, np.asarray(arr, dtype='float32')
73 |
74 | if embedding_path.endswith('.vec'):
75 | embedding_index = dict(get_coefs(*o.strip().split(" "))
76 | for o in open(embedding_path))
77 |         mean_embedding = np.mean(np.array(list(embedding_index.values())), axis=0)
78 | elif embedding_path.endswith('bin'):
79 | embedding_index = KeyedVectors.load_word2vec_format(
80 | embedding_path, binary=True)
81 | mean_embedding = np.mean(embedding_index.vectors, axis=0)
82 | embed_size = mean_embedding.shape[0]
83 | word_index = sorted(list({word.lower() for sentence in texts for word in sentence}))
84 | nb_words = min(max_features, len(word_index))
85 | embedding_matrix = np.zeros((nb_words + 1, embed_size))
86 | i = 1
87 | word_map = defaultdict(lambda: nb_words)
88 | for word in word_index:
89 | if i >= max_features:
90 | continue
91 | if word in embedding_index:
92 | embedding_matrix[i] = embedding_index[word]
93 | else:
94 | embedding_matrix[i] = mean_embedding
95 | word_map[word] = i
96 | i += 1
97 |
98 | embedding_matrix[-1] = mean_embedding
99 | return embed_size, word_map, embedding_matrix
100 |
101 | def text_to_sequences(texts, word_map, max_len=DEFAULT_MAX_LENGTH):
102 | texts_id = []
103 | for sentence in texts:
104 | sentence = [word_map[word.lower()] for word in sentence][:max_len]
105 |         padded_sentence = np.pad(
106 |             sentence, (0, max(0, max_len - len(sentence))), 'constant', constant_values=0)
107 |         texts_id.append(padded_sentence)
108 | return np.array(texts_id)
109 |
110 | def find_threshold(pred_proba, y_true, metric = f1_score):
111 | cur_acc = 0
112 | cur_thres = 0
113 | for ind in range(len(pred_proba) - 1):
114 | threshold = (pred_proba[ind][0] + pred_proba[ind + 1][0]) / 2
115 | pred = (pred_proba > threshold).astype(np.int8)
116 | acc = metric(pred, y_true)
117 | if acc > cur_acc:
118 | cur_thres = threshold
119 | cur_acc = acc
120 |
121 | return cur_thres
122 |
123 | def f1(y_true, y_pred):
124 | def recall(y_true, y_pred):
125 | """Recall metric.
126 |
127 | Only computes a batch-wise average of recall.
128 |
129 | Computes the recall, a metric for multi-label classification of
130 | how many relevant items are selected.
131 | """
132 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
133 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
134 | recall = true_positives / (possible_positives + K.epsilon())
135 | return recall
136 |
137 | def precision(y_true, y_pred):
138 | """Precision metric.
139 |
140 | Only computes a batch-wise average of precision.
141 |
142 | Computes the precision, a metric for multi-label classification of
143 | how many selected items are relevant.
144 | """
145 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
146 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
147 | precision = true_positives / (predicted_positives + K.epsilon())
148 | return precision
149 | precision = precision(y_true, y_pred)
150 | recall = recall(y_true, y_pred)
151 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
152 |
153 | def predictions_to_submission(test_data, predictor):
154 | tqdm.pandas()
155 | submission = test_data[['id']]
156 | submission['label'] = test_data['text'].progress_apply(predictor)
157 | return submission
158 |
159 |
160 | # HELPERS FOR HIERARCHICAL MODEL:
161 | def sent_tokenize(texts):
162 | nlp = Vietnamese()
163 | nlp.add_pipe(nlp.create_pipe('sentencizer'))
164 | docs = []
165 | for text in texts:
166 | text_tokenized = []
167 | if (len(text) > 3):
168 | for sentence in nlp(text.lower()[1:-1]).sents:
169 | sent_tokens = np.array([postprocess_token(token.text) for token in sentence])
170 | text_tokenized.append(sent_tokens)
171 | else:
172 | text_tokenized.append([])
173 | docs.append(text_tokenized)
174 |
175 | return docs
176 |
177 |
178 | def sent_embedding(tokenized_texts, embedding_path, max_features):
179 | embedding_path = abspath(embedding_path)
180 |
181 | def get_coefs(word, *arr):
182 | return word, np.asarray(arr, dtype='float32')
183 |
184 | if embedding_path.endswith('.vec'):
185 | embedding_index = dict(get_coefs(*o.strip().split(" "))
186 | for o in open(embedding_path))
187 |         mean_embedding = np.mean(np.array(list(embedding_index.values())), axis=0)
188 | elif embedding_path.endswith('bin'):
189 | embedding_index = KeyedVectors.load_word2vec_format(
190 | embedding_path, binary=True)
191 | mean_embedding = np.mean(embedding_index.vectors, axis=0)
192 | embed_size = mean_embedding.shape[0]
193 |     word_index = sorted({word.lower() for text in tokenized_texts for sentence in text for word in sentence})
194 | nb_words = min(max_features, len(word_index))
195 | embedding_matrix = np.zeros((nb_words + 1, embed_size))
196 |
197 | i = 1
198 | word_map = defaultdict(lambda: nb_words)
199 | for word in word_index:
200 | if i >= max_features:
201 | continue
202 | if word in embedding_index:
203 | embedding_matrix[i] = embedding_index[word]
204 | else:
205 | embedding_matrix[i] = mean_embedding
206 | word_map[word] = i
207 | i += 1
208 | embedding_matrix[-1] = mean_embedding
209 | return embed_size, word_map, embedding_matrix
210 |
211 | def text_sents_to_sequences(texts, word_map, max_nb_sent, max_sent_len):
212 | ret = []
213 | for i in range(len(texts)):
214 | text_vecs = []
215 | for j in range(len(texts[i])):
216 | if (j < max_nb_sent):
217 | sent_vecs = []
218 | for k in range(len(texts[i][j])):
219 | if (k < max_sent_len):
220 | sent_vecs.append(word_map[texts[i][j][k]])
221 | if (len(sent_vecs) < max_sent_len):
222 | sent_vecs = np.pad(
223 | sent_vecs,
224 | (0, max(0, max_sent_len - len(sent_vecs))),
225 | 'constant',
226 | constant_values=0
227 | )
228 | text_vecs.append(sent_vecs)
229 |
230 |
231 | if (len(text_vecs) < max_nb_sent):
232 | text_vecs = np.pad(
233 | text_vecs,
234 | ((0, max_nb_sent - len(text_vecs)), (0, 0)),
235 | 'constant',
236 | constant_values=0
237 | )
238 |
239 | ret.append(text_vecs)
240 |
241 | return np.array(ret)
242 |
--------------------------------------------------------------------------------
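Two details of the helpers above are easy to miss: `word_map` is a `defaultdict` whose fallback index points at the extra row of `embedding_matrix` that holds the mean embedding, so every out-of-vocabulary word shares that vector, and `text_to_sequences` both truncates and zero-pads every text to `max_len`. A toy illustration (the ids are made up; in practice `make_embedding` builds the map):

```python
# Toy example of the id mapping and padding performed by text_to_sequences.
from collections import defaultdict
from scripts.util import text_to_sequences

word_map = defaultdict(lambda: 3, {'phim': 1, 'hay': 2})  # 3 = OOV / mean-embedding row
texts = [['phim', 'hay'], ['phim', 'chan', 'lam']]
print(text_to_sequences(texts, word_map, max_len=4))
# [[1 2 0 0]
#  [1 3 3 0]]  <- unknown words share the OOV id, short texts are zero-padded
```
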
/scripts/stack.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import KFold
3 | from scripts.util import f1
4 |
5 |
6 | from keras.callbacks import EarlyStopping, ModelCheckpoint
7 | from keras.models import load_model
8 |
9 | from keras.models import Model
10 | from keras.layers import \
11 | Dense, Embedding, Input, \
12 | Conv1D, MaxPool1D, \
13 | Dropout, BatchNormalization, \
14 | Bidirectional, CuDNNLSTM, \
15 | Concatenate, Flatten, Add
16 |
17 |
18 |
19 | class StackedGeneralizer:
20 |
21 | def __init__(self, models, meta_model):
22 | self._models = models
23 | self._meta_model = meta_model
24 | return
25 |
26 |
27 | def train_models(self, X, y, X_val, y_val, model_path, epochs, batch_size, patience):
28 | for ind in range(len(self._models)):
29 | checkpoint = ModelCheckpoint(
30 | filepath='{}/models.hdf5'.format(model_path),
31 | monitor='val_f1', verbose=1,
32 | mode='max',
33 | save_best_only=True
34 | )
35 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
36 | callbacks_list = [checkpoint, early]
37 | self._models[ind].compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
38 | self._models[ind].fit(
39 | X, y,
40 | validation_data= (X_val, y_val),
41 | callbacks=callbacks_list,
42 | epochs=epochs,
43 | batch_size=batch_size
44 | )
45 | self._models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path))
46 |
47 |
48 |
49 | def train_meta_model(self, X, y, X_val, y_val, model_path, epochs, batch_size, patience):
50 |
51 | # Obtain level-1 input from each model:
52 | meta_input = np.zeros((len(X), len(self._models)))
53 |
54 | for ind in range(len(self._models)):
55 | pred = np.zeros(len(X))
56 | kf = KFold(n_splits = 5, shuffle = False)
57 | model = self._models[ind]
58 | # model.save(filepath='{}/dumped.hdf5'.format(model_path))
59 | weights = model.get_weights()
60 |
61 |
62 | for train_index, test_index in kf.split(X):
63 | X_train, X_test = X[train_index], X[test_index]
64 | y_train, y_test = y[train_index], y[test_index]
65 |
66 |
67 | checkpoint = ModelCheckpoint(
68 | filepath='{}/models.hdf5'.format(model_path),
69 | monitor='val_f1', verbose=1,
70 | mode='max',
71 | save_best_only=True
72 | )
73 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
74 | callbacks_list = [checkpoint, early]
75 | model.fit(
76 | X_train, y_train,
77 | validation_data= (X_val, y_val),
78 | callbacks=callbacks_list,
79 | epochs=epochs,
80 | batch_size=batch_size
81 | )
82 |
83 |                 # Predict the held-out fold with the best checkpointed weights:
84 |                 model.load_weights(filepath='{}/models.hdf5'.format(model_path))
85 |                 pred[test_index] = model.predict(X_test).reshape(-1)
86 | 
87 |                 # Reset the model to its pre-fold weights before the next split:
88 |                 model.set_weights(weights)
89 |
90 |
91 | meta_input[:, ind] = pred
92 |
93 |
94 | self._meta_model.fit(meta_input, y)
95 |
96 |
97 | def predict(self, X):
98 | meta_input = self.compute_meta_data(X)
99 | return (self._meta_model.predict(meta_input) > 0.5).astype(np.int8)
100 |
101 |
102 | def compute_meta_data(self, X):
103 | prediction = np.zeros((len(X), len(self._models)))
104 | for ind in range(len(self._models)):
105 | pred = self._models[ind].predict(X).reshape(len(X), 1).reshape(-1)
106 | prediction[:, ind] = pred
107 |
108 | return prediction
109 |
110 | def load_weights(self, paths):
111 | for ind in range(len(self._models)):
112 | self._models[ind].load_weights(paths[ind])
113 |
114 |
115 | class StackedGeneralizerWithHier:
116 | def __init__(self, models, hier_models, meta_model):
117 | self._models = models
118 | self._hier_models = hier_models
119 |
120 | self._meta_model = meta_model
121 | return
122 |
123 | def train_models(self, X, y, X_val, y_val, X_hier, X_hier_val, model_path, epochs, batch_size,
124 | patience):
125 |
126 | for ind in range(len(self._models)):
127 | checkpoint = ModelCheckpoint(
128 | filepath='{}/models.hdf5'.format(model_path),
129 | monitor='val_f1', verbose=1,
130 | mode='max',
131 | save_best_only=True
132 | )
133 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
134 | callbacks_list = [checkpoint, early]
135 | self._models[ind].compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
136 | self._models[ind].fit(
137 | X, y,
138 | validation_data=(X_val, y_val),
139 | callbacks=callbacks_list,
140 | epochs=epochs,
141 | batch_size=batch_size
142 | )
143 | self._models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path))
144 |
145 | for ind in range(len(self._hier_models)):
146 | checkpoint = ModelCheckpoint(
147 | filepath='{}/models.hdf5'.format(model_path),
148 | monitor='val_f1', verbose=1,
149 | mode='max',
150 | save_best_only=True
151 | )
152 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
153 | callbacks_list = [checkpoint, early]
154 | self._hier_models[ind].compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
155 | self._hier_models[ind].fit(
156 | X_hier, y,
157 | validation_data=(X_hier_val, y_val),
158 | callbacks=callbacks_list,
159 | epochs=epochs,
160 | batch_size=batch_size
161 | )
162 | self._hier_models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path))
163 |
164 | def train_meta_model(self, X, y, X_val, y_val, X_hier, X_hier_val, model_path, epochs,
165 | batch_size, patience):
166 |
167 | # Obtain level-1 input from each model:
168 | meta_input = np.zeros((len(X), len(self._models) + len(self._hier_models)))
169 |
170 | for ind in range(len(self._hier_models)):
171 | pred = np.zeros(len(X))
172 | kf = KFold(n_splits=5, shuffle=False)
173 | model = self._hier_models[ind]
174 | weights = model.get_weights()
175 |
176 |
177 | for train_index, test_index in kf.split(X_hier):
178 | X_train, X_test = X_hier[train_index], X_hier[test_index]
179 | y_train, y_test = y[train_index], y[test_index]
180 |
181 | checkpoint = ModelCheckpoint(
182 | filepath='{}/models.hdf5'.format(model_path),
183 | monitor='val_f1', verbose=1,
184 | mode='max',
185 | save_best_only=True
186 | )
187 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
188 | callbacks_list = [checkpoint, early]
189 | model.fit(
190 | X_train, y_train,
191 | validation_data=(X_hier_val, y_val),
192 | callbacks=callbacks_list,
193 | epochs=epochs,
194 | batch_size=batch_size
195 | )
196 |
197 | model.load_weights(filepath='{}/models.hdf5'.format(model_path))
198 | pred[test_index] = model.predict(X_test).reshape(-1)
199 |
200 | # Reset model:
201 |                 model.set_weights(weights)
202 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
203 |
204 | meta_input[:, len(self._models) + ind] = pred
205 |
206 |
207 | for ind in range(len(self._models)):
208 | pred = np.zeros(len(X))
209 | kf = KFold(n_splits=5, shuffle=False)
210 | model = self._models[ind]
211 | weights = model.get_weights()
212 |
213 | for train_index, test_index in kf.split(X):
214 | X_train, X_test = X[train_index], X[test_index]
215 | y_train, y_test = y[train_index], y[test_index]
216 |
217 | checkpoint = ModelCheckpoint(
218 | filepath='{}/models.hdf5'.format(model_path),
219 | monitor='val_f1', verbose=1,
220 | mode='max',
221 | save_best_only=True
222 | )
223 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
224 | callbacks_list = [checkpoint, early]
225 | model.fit(
226 | X_train, y_train,
227 | validation_data=(X_val, y_val),
228 | callbacks=callbacks_list,
229 | epochs=epochs,
230 | batch_size=batch_size
231 | )
232 |
233 | model.load_weights(filepath='{}/models.hdf5'.format(model_path))
234 | pred[test_index] = model.predict(X_test).reshape(-1)
235 |
236 | # Reset model:
237 | model.set_weights(weights)
238 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
239 |
240 |
241 | meta_input[:, ind] = pred
242 |
243 |
244 | self._meta_model.fit(meta_input, y)
245 |
246 | def predict(self, X, X_hier):
247 | meta_input = self.compute_meta_data(X, X_hier)
248 | return (self._meta_model.predict(meta_input) > 0.5).astype(np.int8)
249 |
250 | def compute_meta_data(self, X, X_hier):
251 | prediction = np.zeros((len(X), len(self._models) + len(self._hier_models)))
252 | for ind in range(len(self._models)):
253 | pred = self._models[ind].predict(X).reshape(len(X), 1).reshape(-1)
254 | prediction[:, ind] = pred
255 |
256 | for ind in range(len(self._hier_models)):
257 | pred = self._hier_models[ind].predict(X_hier).reshape(len(X_hier), 1).reshape(-1)
258 | prediction[:, len(self._models) + ind] = pred
259 |
260 | return prediction
261 |
262 | def load_weights(self, paths, paths_hier):
263 | for ind in range(len(self._models)):
264 | self._models[ind].load_weights(paths[ind])
265 |
266 | for ind in range(len(self._hier_models)):
267 | self._hier_models[ind].load_weights(paths_hier[ind])
268 |
269 |
270 | def StackMLP(n_model):
271 | inp = Input(shape = (n_model,))
272 | op = Dense(10, activation = "relu")(inp)
273 | op = BatchNormalization()(op)
274 | op = Dense(1, activation = "sigmoid")(op)
275 |
276 | model = Model(inputs = inp, outputs = op)
277 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
278 | return model
279 |
280 |
281 |
282 |
--------------------------------------------------------------------------------
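A wiring sketch for the stacked generalizers above (a sketch under assumptions, not the authors' exact driver): `build_base_model` stands in for any builder from `scripts/rnn.py`, and `X_train`/`y_train`, `X_val`/`y_val`, `X_test` for the padded id matrices and labels prepared as in `main.py`; the `./models/stack` directory is assumed to exist for the checkpoints.

```python
# Hedged sketch: stack two base Keras models behind the StackMLP meta-learner.
from scripts.stack import StackedGeneralizer, StackMLP

base_models = [build_base_model(), build_base_model()]          # hypothetical factory
stack = StackedGeneralizer(base_models, StackMLP(n_model=len(base_models)))

# 1) Fit every base model on the full training split.
stack.train_models(X_train, y_train, X_val, y_val,
                   model_path='./models/stack', epochs=100, batch_size=16, patience=5)
# 2) Build out-of-fold level-1 features with 5-fold CV and fit the meta-model.
stack.train_meta_model(X_train, y_train, X_val, y_val,
                       model_path='./models/stack', epochs=100, batch_size=16, patience=5)
# 3) Hard 0/1 predictions through the meta-model.
labels = stack.predict(X_test)
```
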
/scripts/rnn.py:
--------------------------------------------------------------------------------
1 | from keras.models import Model
2 | from keras.layers import \
3 | Dense, Embedding, Input, \
4 | CuDNNGRU, GRU, LSTM, Bidirectional, CuDNNLSTM, \
5 | GlobalMaxPool1D, GlobalAveragePooling1D, Dropout, \
6 | Lambda, Concatenate, TimeDistributed
7 | from .util import f1
8 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
9 | from keras.activations import softmax
10 | from keras_layer_normalization import LayerNormalization
11 | from .net_components import AttLayer, AdditiveLayer
12 | from keras.utils.vis_utils import plot_model
13 |
14 |
15 |
16 |
17 | def RNNKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False):
18 | if use_fasttext:
19 | inp = Input(shape=(maxlen, embed_size))
20 | x = inp
21 | else:
22 | inp = Input(shape = (maxlen, ))
23 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
24 |
25 | if use_additive_emb:
26 | x = AdditiveLayer()(x)
27 | x = Dropout(0.5)(x)
28 |
29 | x = Bidirectional(CuDNNGRU(128, return_sequences = True))(x)
30 | x = Dropout(0.5)(x)
31 | x = Bidirectional(CuDNNGRU(128, return_sequences = True))(x)
32 | x = Dropout(0.5)(x)
33 |
34 | max_pool = GlobalMaxPool1D()(x)
35 | avg_pool = GlobalAveragePooling1D()(x)
36 | last = Lambda(lambda x: x[:, 0, :])(x)
37 | concat_pool = Concatenate(axis = -1)([last, max_pool, avg_pool])
38 |
39 | op = Dense(64, activation = "relu")(concat_pool)
40 | op = Dropout(0.5)(op)
41 | op = Dense(1, activation = "sigmoid")(op)
42 |
43 | model = Model(inputs = inp, outputs = op)
44 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
45 | return model
46 |
47 | def RNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False):
48 | if use_fasttext:
49 | inp = Input(shape=(maxlen, embed_size))
50 | x = inp
51 | else:
52 | inp = Input(shape = (maxlen, ))
53 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
54 |
55 | if use_additive_emb:
56 | x = AdditiveLayer()(x)
57 | x = Dropout(0.5)(x)
58 |
59 |
60 | x = Bidirectional(GRU(128, return_sequences = True, recurrent_dropout = 0.5, dropout = 0.5))(x)
61 | # x = Dropout(0.5)(x)
62 | x = Bidirectional(GRU(128, return_sequences = True, recurrent_dropout = 0.5, dropout = 0.5))(x)
63 | # x = Dropout(0.5)(x)
64 |
65 | max_pool = GlobalMaxPool1D()(x)
66 | avg_pool = GlobalAveragePooling1D()(x)
67 | last = Lambda(lambda x: x[:, 0, :])(x)
68 | concat_pool = Concatenate(axis = -1)([last, max_pool, avg_pool])
69 |
70 | op = Dense(64, activation = "relu")(concat_pool)
71 | op = Dropout(0.5)(op)
72 | op = Dense(1, activation = "sigmoid")(op)
73 |
74 | model = Model(inputs = inp, outputs = op)
75 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
76 | return model
77 |
78 | def LSTMKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100):
79 | inp = Input(shape = (maxlen, ))
80 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp)
81 | x = Bidirectional(CuDNNLSTM(50, return_sequences = True))(x)
82 | # x = Dropout(0.1)(x)
83 | x = Bidirectional(CuDNNLSTM(50, return_sequences = True))(x)
84 | x = Dropout(0.1)(x)
85 | x = GlobalMaxPool1D()(x)
86 | x = Dense(50, activation = "relu")(x)
87 | x = Dropout(0.1)(x)
88 | x = Dense(1, activation = "sigmoid")(x)
89 | model = Model(inputs = inp, outputs = x)
90 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
91 | return model
92 |
93 |
94 | def SARNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False):
95 | if use_fasttext:
96 | inp = Input(shape=(maxlen, embed_size))
97 | x = inp
98 | else:
99 | inp = Input(shape = (maxlen, ))
100 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
101 |
102 | if use_additive_emb:
103 | x = AdditiveLayer()(x)
104 | x = Dropout(0.5)(x)
105 |
106 |
107 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
108 | x = SeqSelfAttention(
109 | # attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL,
110 | attention_regularizer_weight=1e-4,
111 | )(x)
112 | # x = LayerNormalization()(x)
113 | x = Dropout(0.5)(x)
114 |
115 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
116 | x = SeqWeightedAttention()(x)
117 | # x = LayerNormalization()(x)
118 | x = Dropout(0.5)(x)
119 |
120 | x = Dense(64, activation = "relu")(x)
121 | x = Dropout(0.5)(x)
122 | x = Dense(1, activation = "sigmoid")(x)
123 | model = Model(inputs = inp, outputs = x)
124 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
125 | return model
126 |
127 | def SARNNKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, rnn_type = CuDNNLSTM, use_fasttext = False, trainable = True, use_additive_emb = False):
128 | if use_fasttext:
129 | inp = Input(shape=(maxlen, embed_size))
130 | x = inp
131 | else:
132 | inp = Input(shape = (maxlen, ))
133 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
134 |
135 | if use_additive_emb:
136 | x = AdditiveLayer()(x)
137 | x = Dropout(0.5)(x)
138 |
139 |
140 | x = Bidirectional(rnn_type(128, return_sequences = True))(x)
141 | x = SeqSelfAttention(
142 | # attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL,
143 | attention_regularizer_weight=1e-4,
144 | )(x)
145 | # x = LayerNormalization()(x)
146 | x = Dropout(0.5)(x)
147 |
148 | x = Bidirectional(rnn_type(128, return_sequences = True))(x)
149 | x = SeqWeightedAttention()(x)
150 | # x = LayerNormalization()(x)
151 | x = Dropout(0.5)(x)
152 |
153 | x = Dense(64, activation = "relu")(x)
154 | x = Dropout(0.5)(x)
155 | x = Dense(1, activation = "sigmoid")(x)
156 | model = Model(inputs = inp, outputs = x)
157 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
158 | return model
159 |
160 |
161 | def HRNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, trainable = True, use_additive_emb = False):
162 |     sent_inp = Input(shape = (max_sent_len, ))
163 | embed = Embedding(
164 | input_dim = max_features,
165 | output_dim = embed_size,
166 | weights = [embeddingMatrix],
167 | trainable = trainable
168 | )(sent_inp)
169 |
170 | if use_additive_emb:
171 | embed = AdditiveLayer()(embed)
172 | embed = Dropout(0.5)(embed)
173 |
174 | word_lstm = Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))(embed)
175 | sent_encoder = Model(sent_inp, word_lstm)
176 |
177 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
178 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
179 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))(doc_encoder)
180 | preds = Dense(1, activation = "sigmoid")(sent_lstm)
181 | model = Model(inputs = doc_input, outputs = preds)
182 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
183 | return model
184 |
185 | def HRNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, trainable = True, use_additive_emb = False):
186 |     sent_inp = Input(shape = (max_sent_len, ))
187 | embed = Embedding(
188 | input_dim = max_features,
189 | output_dim = embed_size,
190 | weights = [embeddingMatrix],
191 | trainable = trainable
192 | )(sent_inp)
193 |
194 | if use_additive_emb:
195 | embed = AdditiveLayer()(embed)
196 | embed = Dropout(0.5)(embed)
197 |
198 | word_lstm = Bidirectional(CuDNNLSTM(128))(embed)
199 | sent_encoder = Model(sent_inp, word_lstm)
200 |
201 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
202 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
203 | sent_lstm = Bidirectional(CuDNNLSTM(128))(doc_encoder)
204 | preds = Dense(1, activation = "sigmoid")(sent_lstm)
205 | model = Model(inputs = doc_input, outputs = preds)
206 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
207 | return model
208 |
209 |
210 | def OriginalHARNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
211 | if use_fasttext:
212 | sent_inp = Input(shape = (max_sent_len, embed_size))
213 | embed = sent_inp
214 | else:
215 | sent_inp = Input(shape = (max_sent_len, ))
216 | embed = Embedding(
217 | input_dim = max_features,
218 | output_dim = embed_size,
219 | weights = [embeddingMatrix],
220 | trainable = trainable
221 | )(sent_inp)
222 |
223 | if use_additive_emb:
224 | embed = AdditiveLayer()(embed)
225 | embed = Dropout(0.5)(embed)
226 |
227 | word_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(embed)
228 | word_att = AttLayer(context_size = 256)(word_lstm)
229 | sent_encoder = Model(sent_inp, word_att)
230 |
231 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
232 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
233 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(doc_encoder)
234 | sent_att = AttLayer(context_size = 256)(sent_lstm)
235 | preds = Dense(1, activation = "sigmoid")(sent_att)
236 | model = Model(inputs = doc_input, outputs = preds)
237 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
238 | return model
239 |
240 | def OriginalHARNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
241 | if use_fasttext:
242 | sent_inp = Input(shape = (max_sent_len, embed_size))
243 | embed = sent_inp
244 | else:
245 | sent_inp = Input(shape = (max_sent_len, ))
246 | embed = Embedding(
247 | input_dim = max_features,
248 | output_dim = embed_size,
249 | weights = [embeddingMatrix],
250 | trainable = trainable
251 | )(sent_inp)
252 |
253 | if use_additive_emb:
254 | embed = AdditiveLayer()(embed)
255 | embed = Dropout(0.5)(embed)
256 |
257 | word_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embed)
258 | word_att = AttLayer(context_size = 256)(word_lstm)
259 | word_att = Dropout(0.5)(word_att)
260 | sent_encoder = Model(sent_inp, word_att)
261 |
262 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
263 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
264 | sent_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(doc_encoder)
265 | sent_att = AttLayer(context_size = 256)(sent_lstm)
266 | sent_att = Dropout(0.5)(sent_att)
267 | preds = Dense(1, activation = "sigmoid")(sent_att)
268 | model = Model(inputs = doc_input, outputs = preds)
269 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
270 | return model
271 |
272 |
273 |
274 |
275 | def HARNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
276 | if use_fasttext:
277 | sent_inp = Input(shape = (max_sent_len, embed_size))
278 | embed = sent_inp
279 | else:
280 | sent_inp = Input(shape = (max_sent_len, ))
281 | embed = Embedding(
282 | input_dim = max_features,
283 | output_dim = embed_size,
284 | weights = [embeddingMatrix],
285 | trainable = trainable
286 | )(sent_inp)
287 |
288 | if use_additive_emb:
289 | embed = AdditiveLayer()(embed)
290 | embed = Dropout(0.5)(embed)
291 |
292 |
293 | word_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(embed)
294 | word_att = SeqWeightedAttention()(word_lstm)
295 | sent_encoder = Model(sent_inp, word_att)
296 |
297 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
298 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
299 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(doc_encoder)
300 | sent_att = SeqWeightedAttention()(sent_lstm)
301 | preds = Dense(1, activation = "sigmoid")(sent_att)
302 | model = Model(inputs = doc_input, outputs = preds)
303 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
304 | return model
305 |
306 |
307 |
308 | def HARNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
309 | if use_fasttext:
310 | sent_inp = Input(shape = (max_sent_len, embed_size))
311 | embed = sent_inp
312 | else:
313 | sent_inp = Input(shape = (max_sent_len, ))
314 | embed = Embedding(
315 | input_dim = max_features,
316 | output_dim = embed_size,
317 | weights = [embeddingMatrix],
318 | trainable = trainable
319 | )(sent_inp)
320 |
321 | if use_additive_emb:
322 | embed = AdditiveLayer()(embed)
323 | embed = Dropout(0.5)(embed)
324 |
325 | word_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embed)
326 | word_att = SeqWeightedAttention()(word_lstm)
327 | word_att = Dropout(0.5)(word_att)
328 | sent_encoder = Model(sent_inp, word_att)
329 | plot_model(sent_encoder, to_file='{}.png'.format("HARNN1"), show_shapes=True, show_layer_names=True)
330 |
331 |
332 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
333 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
334 | sent_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(doc_encoder)
335 | sent_att = SeqWeightedAttention()(sent_lstm)
336 | sent_att = Dropout(0.5)(sent_att)
337 | preds = Dense(1, activation = "sigmoid")(sent_att)
338 | model = Model(inputs = doc_input, outputs = preds)
339 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
340 | return model
341 |
342 |
343 |
344 |
--------------------------------------------------------------------------------
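Each builder in `scripts/rnn.py` returns a compiled Keras model; the only coupling to the rest of the pipeline is the shape of the embedding matrix. A minimal sketch (a random matrix stands in for the real fastText vectors) that instantiates the CPU self-attention variant and prints its layout:

```python
# Hedged sketch: build SARNNKerasCPU with a placeholder embedding matrix.
import numpy as np
from scripts.rnn import SARNNKerasCPU

embed_size, max_features, maxlen = 400, 20000, 100
embedding_mat = np.random.normal(size=(max_features, embed_size)).astype('float32')

model = SARNNKerasCPU(
    embeddingMatrix=embedding_mat,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
)
model.summary()  # two BiLSTM blocks with self-/weighted attention and a sigmoid head
```
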