├── test_lib.py
├── scripts
│   ├── __init__.py
│   ├── constant.py
│   ├── net_components.py
│   ├── augment.py
│   ├── cnn.py
│   ├── util.py
│   ├── stack.py
│   └── rnn.py
├── HARNN.png
├── VDCNN.png
├── HARNN1.png
├── TextCNN.png
├── SARNNKeras.png
├── 2019-03-16-12:32:21.png
├── external_lib
│   └── install_lib.sh
├── requirements.txt
├── README.md
├── main_stack.py
├── test.py
├── 1st place solution.md
├── test_elmo.py
├── main_stack_hier.py
├── main_elmo.py
├── main.py
└── main_hierarchical.py
/test_lib.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/HARNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/HARNN.png
--------------------------------------------------------------------------------
/VDCNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/VDCNN.png
--------------------------------------------------------------------------------
/HARNN1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/HARNN1.png
--------------------------------------------------------------------------------
/TextCNN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/TextCNN.png
--------------------------------------------------------------------------------
/SARNNKeras.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/SARNNKeras.png
--------------------------------------------------------------------------------
/2019-03-16-12:32:21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petrpan26/Aivivn_1/HEAD/2019-03-16-12:32:21.png
--------------------------------------------------------------------------------
/external_lib/install_lib.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # Install deepai_nlp
3 | cd deepai_nlp
4 | pip install -e .
5 | cd ..
6 | # Install elmo
7 | cd ELMoForManyLangs
8 | python setup.py install
9 | cd ..
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyvi
2 | pandas>=0.24.1
3 | spacy>=2.0.18
4 | gensim>=3.7.1
5 | scikit-learn>=0.20.2
6 | keras
7 | tensorflow
8 | keras-self-attention==0.35.0
9 | keras-multi-head==0.16.0
10 | keras-layer-normalization==0.10.0
11 | annoy==1.15.1
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Aivivn_1
2 |
3 | Our submission for Aivivn Contest 1.
4 |
5 | By Nhat Pham and Hoang Phan.
6 |
7 | Set up the environment:
8 | ```bash
9 | conda install python=3.6
10 | ```
11 |
12 | Install the dependencies:
13 |
14 | ```bash
15 | pip install -r requirements.txt
16 | cd external_lib
17 | chmod a+x install_lib.sh
18 | ./install_lib.sh
19 | cd ..
20 | ```
21 | Test notebook on Google Colab:
22 | https://colab.research.google.com/drive/1fgtIYXkXKKmZVI2w62nCI22wiVSNEQxw
23 |
24 | Sample run command:
25 |
26 | ```bash
27 | python -m main -m VDCNN -e ./embeddings/baomoi.model.bin --max 40000 --mix --prob
28 | ```
29 |
--------------------------------------------------------------------------------
/scripts/constant.py:
--------------------------------------------------------------------------------
1 | # From spacy english model
2 | EMOTICONS = set("""
3 | :)
4 | :-)
5 | :))
6 | :-))
7 | :)))
8 | :-)))
9 | (:
10 | (-:
11 | =)
12 | (=
13 | ")
14 | :]
15 | :-]
16 | [:
17 | [-:
18 | :o)
19 | (o:
20 | :}
21 | :-}
22 | 8)
23 | 8-)
24 | (-8
25 | ;)
26 | ;-)
27 | (;
28 | (-;
29 | :(
30 | :-(
31 | :((
32 | :-((
33 | :(((
34 | :-(((
35 | ):
36 | )-:
37 | =(
38 | >:(
39 | :')
40 | :'-)
41 | :'(
42 | :'-(
43 | :/
44 | :-/
45 | =/
46 | =|
47 | :|
48 | :-|
49 | :1
50 | :P
51 | :-P
52 | :p
53 | :-p
54 | :O
55 | :-O
56 | :o
57 | :-o
58 | :0
59 | :-0
60 | :()
61 | >:o
62 | :*
63 | :-*
64 | :3
65 | :-3
66 | =3
67 | :>
68 | :->
69 | :X
70 | :-X
71 | :x
72 | :-x
73 | :D
74 | :-D
75 | ;D
76 | ;-D
77 | =D
78 | xD
79 | XD
80 | xDD
81 | XDD
82 | 8D
83 | 8-D
84 | ^_^
85 | ^__^
86 | ^___^
87 | >.<
88 | >.>
89 | <.<
90 | ._.
91 | ;_;
92 | -_-
93 | -__-
94 | v.v
95 | V.V
96 | v_v
97 | V_V
98 | o_o
99 | o_O
100 | O_o
101 | O_O
102 | 0_o
103 | o_0
104 | 0_0
105 | o.O
106 | O.o
107 | O.O
108 | o.o
109 | 0.0
110 | o.0
111 | 0.o
112 | @_@
113 | <3
114 | <33
115 | <333
116 | 3
117 | (^_^)
118 | (-_-)
119 | (._.)
120 | (>_<)
121 | (*_*)
122 | (¬_¬)
123 | ಠ_ಠ
124 | ಠ︵ಠ
125 | (ಠ_ಠ)
126 | ¯\(ツ)/¯
127 | (╯°□°)╯︵┻━┻
128 | ><(((*>
129 | """.split())
130 |
131 | DEFAULT_MAX_FEATURES = 12000
132 | DEFAULT_MAX_LENGTH = 100
133 |
--------------------------------------------------------------------------------
/scripts/net_components.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Layer
2 | import keras.backend as K
3 |
4 | class AttLayer(Layer):
5 | def __init__(self, context_size):
6 | self._context_size = context_size
7 | self.supports_masking = True
8 | # self._linear = Dense(context_size, activation = "tanh")
9 | super(AttLayer, self).__init__()
10 |
11 | def build(self, input_shape):
12 | self._W = self.add_weight(
13 | name = "W",
14 | shape = (input_shape[-1], self._context_size),
15 | initializer="he_normal",
16 | trainable=True
17 | )
18 | self._b = self.add_weight(
19 | name = "b",
20 | shape = (1, self._context_size),
21 | initializer="constant",
22 | trainable=True
23 | )
24 | self._context = self.add_weight(
25 | name = "context",
26 | shape = (self._context_size, 1),
27 | initializer = "he_normal",
28 | trainable = True
29 | )
30 | super(AttLayer, self).build(input_shape)
31 |
32 |
33 | def compute_mask(self, input, input_mask=None):
34 | return input_mask
35 |
36 |
37 | def call(self, input, mask = None):
38 | # input: (N, T, M)
39 | rep = K.tanh(K.dot(input, self._W) + self._b) # (N, T, C)
40 | score = K.squeeze(K.dot(rep, self._context), axis = -1) # (N, T)
41 |
42 | weight = K.exp(score)
43 | if mask is not None:
44 | weight *= K.cast(mask, K.floatx())
45 |
46 | weight /= K.cast(K.sum(weight, axis = 1, keepdims = True) + K.epsilon(), K.floatx())
47 |
48 |
49 | # weight = softmax(score, axis = -1) # (N, T)
50 | op = K.batch_dot(input, weight, axes = (1, 1)) # (N, M)
51 |
52 | return op
53 |
54 | def compute_output_shape(self, input_shape):
55 | return (input_shape[0], input_shape[-1])
56 |
57 |
58 |
59 | class AdditiveLayer(Layer):
60 | def __init__(self):
61 | super(AdditiveLayer, self).__init__()
62 |
63 | def build(self, input_shape):
64 | self._w = self.add_weight(
65 | name = "w",
66 | shape = (1, input_shape[-1]),
67 | initializer="constant",
68 | trainable=True
69 | )
70 | super(AdditiveLayer, self).build(input_shape)
71 |
72 |
73 |
74 | def call(self, input):
75 | return input + self._w
76 |
77 | def compute_output_shape(self, input_shape):
78 | return input_shape
79 |
--------------------------------------------------------------------------------
/scripts/augment.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gensim.models import KeyedVectors
3 | import copy
4 | import random
5 | from gensim.similarities.index import AnnoyIndexer
6 |
7 |
8 | def shuffle_augment(texts, labels, n_increase, min_length = 1):
9 | texts_long = []
10 | labels_long = []
11 |
12 | if min_length > 1:
13 | for ind in range(len(texts)):
14 | if len(texts[ind]) >= min_length:
15 | texts_long.append(texts[ind])
16 | labels_long.append(labels[ind])
17 | else:
18 | texts_long = texts
19 | labels_long = labels
20 |
21 |
22 | shuffle_ind = np.random.choice(len(texts_long), size = n_increase)
23 | for ind in shuffle_ind:
24 | text_copy = np.random.permutation(texts_long[ind])
25 | texts.append(text_copy)
26 | labels = np.append(labels, [labels_long[ind]])
27 |
28 |
29 | return texts, labels
30 |
31 |
32 | def similar_augment(texts, labels, n_increase, n_word_replace, model_path, similar_threshold = 0.5, use_annoy = True, annoy_path = None):
33 | w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
34 | texts_long = []
35 | labels_long = []
36 | if use_annoy:
37 | if annoy_path is None:
38 | indexer = AnnoyIndexer(w2v, 100)
39 | else:
40 | indexer = AnnoyIndexer()
41 | indexer.load(annoy_path)
42 |
43 | for ind in range(len(texts)):
44 | if len(texts[ind]) >= n_word_replace:
45 | texts_long.append(texts[ind])
46 | labels_long.append(labels[ind])
47 |
48 | shuffle_ind = np.random.choice(len(texts_long), size = n_increase)
49 | for ind in shuffle_ind:
50 | text_copy = copy.deepcopy(texts_long[ind])
51 | # if is_hier:
52 |
53 | replace_inds = np.random.choice(text_copy.shape[-1], size = n_word_replace, replace = False)
54 | for word_ind in replace_inds:
55 | word = text_copy[word_ind]
56 | try:
57 |
58 | closest, score = w2v.wv.most_similar(
59 | word, topn = 2,
60 | indexer = indexer if use_annoy else None
61 | )[1]
62 | if score > similar_threshold:
63 | text_copy[word_ind] = closest
64 | except:
65 | continue
66 |
67 | texts.append(text_copy)
68 | labels = np.append(labels, [labels_long[ind]])
69 |
70 | return texts, labels
71 |
72 |
73 |
74 |
75 | def create_sim_dict(word_map, model_path, similar_threshold = 0.5, use_annoy = True, annoy_path = None):
76 | w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
77 | if use_annoy:
78 | if annoy_path is None:
79 | indexer = AnnoyIndexer(w2v, 100)
80 | else:
81 | indexer = AnnoyIndexer()
82 | indexer.load(annoy_path)
83 |
84 | sim_dict = dict()
85 | for word in word_map:
86 | try:
87 | closest, score = w2v.wv.most_similar(
88 | word, topn=2,
89 | indexer=indexer if use_annoy else None
90 | )[1]
91 | if score > similar_threshold and closest in word_map:
92 | sim_dict[word_map[word]] = word_map[closest]
93 | except:
94 | continue
95 |
96 | return sim_dict
97 |
98 | def similar_augment_from_sim_dict(texts, labels, sim_dict, n_increase, keep_prob = 0.5):
99 | aug_ind = np.random.choice(len(texts), size = n_increase)
100 | i = -1
101 | for ind in aug_ind:
102 | i += 1
103 | text_aug = copy.deepcopy(texts[ind])
104 | for word_ind in range(len(text_aug)):
105 | word = text_aug[word_ind]
106 | if word in sim_dict:
107 | p = random.uniform(0, 1)
108 | if p > keep_prob:
109 | text_aug[word_ind] = sim_dict[word]
110 |
111 | texts = np.append(texts, [text_aug], axis = 0)
112 | labels = np.append(labels, [labels[ind]], axis = 0)
113 |
114 | return texts, labels
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/main_stack.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, f1
2 | from scripts.constant import DEFAULT_MAX_FEATURES
3 | from sklearn.model_selection import train_test_split
4 | from scripts.rnn import SARNNKeras
5 | from scripts.cnn import LSTMCNN, VDCNN
6 | from scripts.stack import StackedGeneralizer
7 | import argparse
8 | import os
9 | import numpy as np
10 | import datetime
11 | import pandas as pd
12 | from sklearn.metrics import f1_score
13 |
14 | from sklearn.linear_model import LogisticRegression
15 | from keras.utils import CustomObjectScope
16 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
17 |
18 |
19 | def stack(models_list, embedding_path, max_features, should_mix):
20 | model_name = '-'.join(
21 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
22 |
23 | train_data = read_file('./data/train.crash')
24 | test_data = read_file('./data/test.crash', is_train=False)
25 | train_tokenized_texts = tokenize(train_data['text'])
26 | test_tokenizes_texts = tokenize(test_data['text'])
27 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
28 |
29 | embed_size, word_map, embedding_mat = make_embedding(
30 | list(train_tokenized_texts) +
31 | list(test_tokenizes_texts) if should_mix else train_tokenized_texts,
32 | embedding_path,
33 | max_features
34 | )
35 |
36 | texts_id = text_to_sequences(train_tokenized_texts, word_map)
37 | print('Number of train data: {}'.format(labels.shape))
38 |
39 | texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
40 | texts_id, labels, test_size=0.05)
41 |
42 | model_path = './models/{}-version'.format(model_name)
43 |
44 | try:
45 | os.mkdir('./models')
46 | except:
47 | print('Folder already created')
48 | try:
49 | os.mkdir(model_path)
50 | except:
51 | print('Folder already created')
52 |
53 | batch_size = 16
54 | epochs = 100
55 | patience = 3
56 |
57 | meta_model = LogisticRegression()
58 | models = [
59 | model(
60 | embeddingMatrix=embedding_mat,
61 | embed_size=400,
62 | max_features=embedding_mat.shape[0]
63 | )
64 | for model in models_list
65 | ]
66 |
67 |
68 | stack = StackedGeneralizer(models, meta_model)
69 | stack.train_meta_model(
70 | texts_id_train, labels_train,
71 | texts_id_val, labels_val,
72 | model_path = model_path,
73 | epochs = epochs,
74 | batch_size = batch_size,
75 | patience = patience
76 | )
77 |
78 | stack.train_models(
79 | X = texts_id_train, y = labels_train,
80 | X_val = texts_id_val, y_val = labels_val,
81 | batch_size = batch_size,
82 | epochs = epochs,
83 | patience = patience,
84 | model_path = model_path
85 | )
86 |
87 | prediction = stack.predict(texts_id_val)
88 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
89 | with open('{}/f1'.format(model_path), 'w') as fp:
90 | fp.write(str(f1_score(prediction, labels_val)))
91 |
92 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map)
93 | test_prediction = stack.predict(test_id_texts)
94 |
95 | df_predicton = pd.read_csv("./data/sample_submission.csv")
96 | df_predicton["label"] = test_prediction
97 |
98 | print('Number of test data: {}'.format(df_predicton.shape[0]))
99 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
100 |
101 |
102 |
103 | if __name__ == '__main__':
104 | models_list = [
105 | SARNNKeras, LSTMCNN, VDCNN
106 | ]
107 | parser = argparse.ArgumentParser()
108 | parser.add_argument(
109 | '-e',
110 | '--embedding',
111 | help='Model use',
112 | default='./embeddings/smallFasttext.vi.vec'
113 | )
114 | parser.add_argument(
115 | '--max',
116 | help='Model use',
117 | default=DEFAULT_MAX_FEATURES
118 | )
119 | parser.add_argument(
120 | '--mix',
121 | action='store_true',
122 | help='Model use'
123 | )
124 | args = parser.parse_args()
125 |
126 | with CustomObjectScope({
127 | 'SeqSelfAttention': SeqSelfAttention,
128 | 'SeqWeightedAttention': SeqWeightedAttention,
129 | 'f1': f1}
130 | ):
131 | stack(
132 | models_list, args.embedding, int(args.max), args.mix
133 | )
134 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, find_threshold
2 | import numpy as np
3 | from scripts.constant import DEFAULT_MAX_FEATURES
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import f1_score
6 | import tensorflow as tf
7 | import random as rn
8 | import pandas as pd
9 |
10 |
11 |
12 | from keras.models import Model
13 | from keras.layers import Dense, Embedding, Input, GRU, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, Lambda
14 | from keras.callbacks import EarlyStopping, ModelCheckpoint
15 | import keras.backend as K
16 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
17 |
18 | # np random seed:
19 | np.random.seed(22)
20 |
21 | # # Setting the seed for python random numbers
22 | rn.seed(1254)
23 | #
24 | # # Setting the graph-level random seed.
25 | tf.set_random_seed(89)
26 |
27 | def SARNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100):
28 | inp = Input(shape = (maxlen, ))
29 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp)
30 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
31 | x = SeqSelfAttention(
32 | attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL,
33 | attention_regularizer_weight=1e-4,
34 | )(x)
35 | x = Dropout(0.5)(x)
36 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
37 | x = SeqWeightedAttention()(x)
38 | x = Dropout(0.5)(x)
39 | x = Dense(64, activation = "relu")(x)
40 | x = Dropout(0.5)(x)
41 | x = Dense(1, activation = "sigmoid")(x)
42 | model = Model(inputs = inp, outputs = x)
43 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
44 | return model
45 |
46 |
47 | def RNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100):
48 | inp = Input(shape = (maxlen, ))
49 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp)
50 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
51 | x = Dropout(0.5)(x)
52 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
53 | x = Dropout(0.5)(x)
54 | x = GlobalMaxPool1D()(x)
55 | x = Dense(64, activation = "relu")(x)
56 | x = Dropout(0.5)(x)
57 | x = Dense(1, activation = "sigmoid")(x)
58 | model = Model(inputs = inp, outputs = x)
59 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
60 | return model
61 |
62 |
63 |
64 |
65 |
66 | def f1(y_true, y_pred):
67 | def recall(y_true, y_pred):
68 | """Recall metric.
69 |
70 | Only computes a batch-wise average of recall.
71 |
72 | Computes the recall, a metric for multi-label classification of
73 | how many relevant items are selected.
74 | """
75 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
76 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
77 | recall = true_positives / (possible_positives + K.epsilon())
78 | return recall
79 |
80 | def precision(y_true, y_pred):
81 | """Precision metric.
82 |
83 | Only computes a batch-wise average of precision.
84 |
85 | Computes the precision, a metric for multi-label classification of
86 | how many selected items are relevant.
87 | """
88 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
89 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
90 | precision = true_positives / (predicted_positives + K.epsilon())
91 | return precision
92 | precision = precision(y_true, y_pred)
93 | recall = recall(y_true, y_pred)
94 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
95 |
96 |
97 |
98 |
99 |
100 | data = read_file("./data/train.crash")
101 | tokenized_texts = tokenize(data["text"])
102 | labels = data["label"].values.astype(np.float16).reshape(-1, 1)
103 |
104 | embed_size, word_map, embedding_mat = make_embedding(
105 | tokenized_texts,
106 | embedding_path = "./data/baomoi.model.bin",
107 | max_features = 40000
108 | )
109 |
110 |
111 |
112 | texts_id = text_to_sequences(tokenized_texts, word_map)
113 | print(labels.shape)
114 | print(texts_id.shape)
115 |
116 | texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
117 | texts_id, labels,
118 | test_size = 0.05
119 | )
120 |
121 | checkpoint = ModelCheckpoint(
122 | filepath = "./Weights/model_sa_2.hdf5",
123 | monitor = 'val_f1', verbose = 1,
124 | mode = 'max',
125 | save_best_only = True
126 | )
127 | early = EarlyStopping(monitor = "val_f1", mode = "max", patience = 3)
128 | callbacks_list = [checkpoint, early]
129 | batch_size = 16
130 | epochs = 100
131 |
132 |
133 | model = SARNNKerasCPU(
134 | embeddingMatrix = embedding_mat,
135 | embed_size = 400,
136 | max_features = embedding_mat.shape[0]
137 | )
138 | model.fit(
139 | texts_id_train, labels_train,
140 | validation_data = (texts_id_val, labels_val),
141 | callbacks = callbacks_list,
142 | epochs = epochs,
143 | batch_size = 16
144 | )
145 |
146 |
147 |
148 |
149 | model.load_weights("./Weights/model_sa_2.hdf5")
150 | prediction_prob = model.predict(texts_id_val)
151 |
152 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
153 | print(OPTIMAL_THRESHOLD)
154 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
155 | print(f1_score(
156 | y_true = labels_val.reshape(-1),
157 | y_pred = prediction.reshape(-1)
158 | ))
159 |
160 |
161 |
162 | data_test = read_file("./data/test.crash", is_train = False)
163 | tokenized_texts_test = tokenize(data_test["text"])
164 | texts_id_test = text_to_sequences(tokenized_texts_test, word_map)
165 | prediction_test = model.predict(texts_id_test)
166 | df_predicton = pd.read_csv("./data/sample_submission.csv")
167 | df_predicton["label"] = (prediction_test > OPTIMAL_THRESHOLD).astype(np.int8)
168 | print(df_predicton.shape[0])
169 | df_predicton.to_csv("./prediction/prediction_sa_2.csv", index = False)
--------------------------------------------------------------------------------
/1st place solution.md:
--------------------------------------------------------------------------------
1 | ---
2 |
3 |
4 | ---
5 |
6 | 1st place solution analysis
7 | Hello everyone, we are Hoang and Nhat from team HoangNhat2 on the leaderboard. First of all, we would like to thank Tiệp and everyone on the Aivivn team for organizing such an interesting Machine Learning contest on Vietnamese language processing. We learned a lot of new things through this contest.
8 | Summary of our approach:
9 | We do not have much NLP background, so we focused on experimenting with deep learning models to see which ones worked well. After trying a large number of models, we realized that no single model could get past 0.89x on the Public LB, even though some performed very well on local validation. From then on we stopped trying new models and instead experimented with ways to combine models or augment the training data.
10 | After experimenting to reach a suitable level of diversity, our top-1 solution is a Weighted Ensemble of the following models:
11 |
12 | - TextCNN (Weight: 0.1) source
13 | - Inspired VDCNN (Weight: 0.1) source
14 | - HARNN (Weight: 0.3) source
15 | - SARNN (Weight: 0.5) source
16 |
17 | The pretrained embeddings that we tested and used:
18 |
19 | - word2vecVN (window-size 5, 400dims) source
20 |
21 | We mainly trained the models on Google Colab using its GPU. Each model took about 10 - 20 minutes to train (models converge after roughly 5-10 epochs). The CNN models train much faster than the RNN models, probably because they are not sequential and can therefore make better use of the GPU during training.
22 | Details of our approach:
23 | 1. Models:
24 | 1.1 TextCNN:
25 | This is our CNN model for text classification.
26 | Architecture:
27 | 
28 | 1.2 VDCNN:
29 | Similar to TextCNN, but residual connections are added between the convolution layers to avoid vanishing gradients.
30 | Architecture:
31 | 
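
As a rough illustration (this mirrors the block pattern used in scripts/cnn.py; the 128-filter / kernel-size-5 choices are the ones used there), one residual block looks roughly like this:

```python
from keras.layers import Conv1D, BatchNormalization, Add, MaxPool1D

def residual_conv_block(x, filters=128, kernel_size=5):
    # Two same-padded convolutions keep the time dimension unchanged,
    # so the skip connection below can be added element-wise.
    # Note: `x` must already have `filters` channels for the Add to work.
    main = Conv1D(filters, kernel_size, activation='relu', padding='same')(x)
    main = BatchNormalization()(main)
    main = Conv1D(filters, kernel_size, activation='relu', padding='same')(main)
    main = BatchNormalization()(main)
    # Identity shortcut: gradients can flow around the convolutions.
    out = Add()([x, main])
    return MaxPool1D(pool_size=2, strides=2)(out)
```
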
32 | 1.3 HARNN:
33 | HARNN processes text at two levels:
34 | 
35 | - Compute an encoding for each sentence in the paragraph from its word embeddings with a BiLSTM
36 | - Use another BiLSTM to compute the document encoding from the sentence encodings.
37 | 
38 | Each of these levels is followed by an attention layer.
39 | Architecture Word2Sent
40 | 
41 | Architecture Sent2Doc:
42 | 
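
The full implementation is in scripts/rnn.py; the following is only a minimal sketch of the two-level idea, using the keras_self_attention layers the project already depends on, with illustrative layer sizes:

```python
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras_self_attention import SeqWeightedAttention

def tiny_harnn(max_nb_sent=3, max_sent_len=50, max_features=20000, embed_size=400):
    # Level 1: encode each sentence from its word embeddings with a BiLSTM,
    # then attend over the words to get one vector per sentence.
    sent_in = Input(shape=(max_sent_len,))
    w = Embedding(max_features, embed_size)(sent_in)
    w = Bidirectional(LSTM(64, return_sequences=True))(w)
    w = SeqWeightedAttention()(w)                  # attention over words
    sent_encoder = Model(sent_in, w)

    # Level 2: run a BiLSTM over the sentence vectors and attend over the
    # sentences to get the document encoding.
    doc_in = Input(shape=(max_nb_sent, max_sent_len))
    s = TimeDistributed(sent_encoder)(doc_in)
    s = Bidirectional(LSTM(64, return_sequences=True))(s)
    s = SeqWeightedAttention()(s)                  # attention over sentences
    out = Dense(1, activation='sigmoid')(s)

    model = Model(doc_in, out)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
```
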
43 | 1.4 SARNN:
44 | This is a BiLSTM model with an attention layer between the two BiLSTM layers.
45 | Architecture:
46 | 
47 | 2. Combine models:
48 | We tried different ways of combining the models, such as Stacking and Ensembling, and found that Ensembling gave the most promising results. To choose the weights, we looked at which model scored best on the Public LB and gave that model the highest weight. We kept the raw probabilities and used a threshold of 0.5 instead of searching for one, since searching did not improve the results much.
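
As a rough sketch of that ensembling step (the probability values below are made up; the weights and the 0.5 threshold are the ones described above):

```python
import numpy as np

# Illustrative per-model probabilities on the test set; in practice these are
# the predict() outputs of the four trained models, each of shape (n_samples,).
model_probs = {
    "TextCNN": np.array([0.20, 0.80, 0.55]),
    "VDCNN":   np.array([0.30, 0.70, 0.60]),
    "HARNN":   np.array([0.25, 0.90, 0.40]),
    "SARNN":   np.array([0.10, 0.95, 0.45]),
}
weights = {"TextCNN": 0.1, "VDCNN": 0.1, "HARNN": 0.3, "SARNN": 0.5}

# Weighted average of the probabilities, then a fixed 0.5 threshold.
ensemble_prob = sum(weights[name] * model_probs[name] for name in model_probs)
labels = (ensemble_prob > 0.5).astype(np.int8)
```
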
49 | Side notes:
50 | Difficulties:
51 | 
52 | - Perhaps the first problem the two of us ran into was hardware. With two 128GB MacBooks (one Air, one Pro), neither of us had enough disk space to download pretrained models for testing, and every new download meant deleting the previous one. Only after we moved everything to Google Colab and GitHub did things become faster and easier, so we recommend using Google Colab or a Kaggle instance.
53 | - We had a lot of trouble reproducing results, for two reasons: Keras's save_weight function has many issues and the model was usually broken after reloading, and several steps of the pipeline are non-deterministic (Python sets, Keras models). Our first model above 0.9 actually came from rerunning an old model xD.
54 | 
55 |
56 | Approaches we tried:
57 | 
58 | - Using a Language Model such as ELMo (source). This approach did not seem suitable because training took too long (training one epoch took as long as training a whole CNN or RNN model), and we also did not have time to re-preprocess the data into the format ELMo expects.
59 | - One problem we noticed is that the amount of data is not enough for a model to get past the 0.89 - 0.9 mark. We tried several ways to augment new data, such as:
60 | 
61 | 
62 | - Randomly replacing words in a sentence with synonyms. We did this by replacing each word with the word whose embedding is nearest to it in our vocabulary (nearest neighbour); a small sketch of this idea is shown after this list. Although this change did not bring a significant improvement, we think that with a good thesaurus or a better metric for choosing vectors it could give good results.
63 | - Shuffling the sentences within a document for the HARNN model in order to generate many different documents.
64 | - Translating from Vietnamese into other languages and back. But Google Translate has banned this now ;__;
65 |
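The actual implementation lives in scripts/augment.py; below is only a stripped-down sketch of the replacement step, assuming the word2vecVN binary is available at the path used elsewhere in this repo:

```python
import numpy as np
from gensim.models import KeyedVectors

# Pretrained Vietnamese word2vec embeddings (400 dims), loaded once.
w2v = KeyedVectors.load_word2vec_format("./embeddings/baomoi.model.bin", binary=True)

def replace_with_neighbours(tokens, n_word_replace=10, similar_threshold=0.5):
    """Copy `tokens` and swap up to n_word_replace random words for their
    nearest embedding neighbour, skipping out-of-vocabulary words."""
    tokens = list(tokens)
    n = min(n_word_replace, len(tokens))
    for i in np.random.choice(len(tokens), size=n, replace=False):
        try:
            neighbour, score = w2v.most_similar(tokens[i], topn=1)[0]
            if score > similar_threshold:
                tokens[i] = neighbour
        except KeyError:
            continue  # word not in the embedding vocabulary
    return tokens
```
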
66 | Closing:
67 | Once again, thank you to the organizers, and congratulations on completing Aivivn's first Datathon. We hope future contests will attract even more support and more discussion both during and outside the contest, since we felt there was a lack of discussion about baseline models during the competition.
68 |
69 |
--------------------------------------------------------------------------------
/scripts/cnn.py:
--------------------------------------------------------------------------------
1 | from keras.models import Model
2 | from keras.layers import \
3 | Dense, Embedding, Input, \
4 | Conv1D, MaxPool1D, \
5 | Dropout, BatchNormalization, \
6 | Bidirectional, CuDNNLSTM, \
7 | Concatenate, Flatten, Add
8 | from .util import f1
9 | from .net_components import AdditiveLayer
10 |
11 |
12 |
13 | # Based on https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/
14 | # https://www.aclweb.org/anthology/D14-1181
15 | def TextCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False):
16 | if use_fasttext:
17 | inp = Input(shape=(maxlen, embed_size))
18 | x = inp
19 | else:
20 | inp = Input(shape = (maxlen, ))
21 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
22 |
23 | if use_additive_emb:
24 | x = AdditiveLayer()(x)
25 | x = Dropout(0.5)(x)
26 |
27 |
28 | conv_ops = []
29 | for filter_size in filter_sizes:
30 | conv = Conv1D(128, filter_size, activation = 'relu')(x)
31 | pool = MaxPool1D(5)(conv)
32 | conv_ops.append(pool)
33 |
34 | concat = Concatenate(axis = 1)(conv_ops)
35 | # concat = Dropout(0.1)(concat)
36 | concat = BatchNormalization()(concat)
37 |
38 |
39 | conv_2 = Conv1D(128, 5, activation = 'relu')(concat)
40 | conv_2 = MaxPool1D(5)(conv_2)
41 | conv_2 = BatchNormalization()(conv_2)
42 | # conv_2 = Dropout(0.1)(conv_2)
43 |
44 | conv_3 = Conv1D(128, 5, activation = 'relu')(conv_2)
45 | conv_3 = MaxPool1D(5)(conv_3)
46 | conv_3 = BatchNormalization()(conv_3)
47 | # conv_3 = Dropout(0.1)(conv_3)
48 |
49 |
50 | flat = Flatten()(conv_3)
51 |
52 | op = Dense(64, activation = "relu")(flat)
53 | # op = Dropout(0.5)(op)
54 | op = BatchNormalization()(op)
55 | op = Dense(1, activation = "sigmoid")(op)
56 |
57 | model = Model(inputs = inp, outputs = op)
58 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
59 | return model
60 |
61 |
62 | def VDCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False):
63 | if use_fasttext:
64 | inp = Input(shape=(maxlen, embed_size))
65 | x = inp
66 | else:
67 | inp = Input(shape = (maxlen, ))
68 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
69 |
70 | if use_additive_emb:
71 | x = AdditiveLayer()(x)
72 | x = Dropout(0.5)(x)
73 |
74 | conv_ops = []
75 | for filter_size in filter_sizes:
76 | conv = Conv1D(128, filter_size, activation = 'relu')(x)
77 | pool = MaxPool1D(5)(conv)
78 | conv_ops.append(pool)
79 |
80 | concat = Concatenate(axis = 1)(conv_ops)
81 | # concat = Dropout(0.1)(concat)
82 | concat = BatchNormalization()(concat)
83 |
84 |
85 | conv_2_main = Conv1D(128, 5, activation = 'relu', padding='same')(concat)
86 | conv_2_main = BatchNormalization()(conv_2_main)
87 | conv_2_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_2_main)
88 | conv_2_main = BatchNormalization()(conv_2_main)
89 | conv_2 = Add()([concat, conv_2_main])
90 | conv_2 = MaxPool1D(pool_size = 2, strides = 2)(conv_2)
91 | # conv_2 = BatchNormalization()(conv_2)
92 | # conv_2 = Dropout(0.1)(conv_2)
93 |
94 | conv_3_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_2)
95 | conv_3_main = BatchNormalization()(conv_3_main)
96 | conv_3_main = Conv1D(128, 5, activation = 'relu', padding='same')(conv_3_main)
97 | conv_3_main = BatchNormalization()(conv_3_main)
98 | conv_3 = Add()([conv_2, conv_3_main])
99 | conv_3 = MaxPool1D(pool_size = 2, strides = 2)(conv_3)
100 | # conv_3 = BatchNormalization()(conv_3)
101 | # conv_3 = Dropout(0.1)(conv_3)
102 |
103 |
104 | flat = Flatten()(conv_3)
105 |
106 | op = Dense(64, activation = "relu")(flat)
107 | # op = Dropout(0.5)(op)
108 | op = BatchNormalization()(op)
109 | op = Dense(1, activation = "sigmoid")(op)
110 |
111 | model = Model(inputs = inp, outputs = op)
112 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
113 | return model
114 |
115 |
116 |
117 | # Based on http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
118 | def LSTMCNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, filter_sizes = {2, 3, 4, 5}, use_fasttext = False, trainable = True, use_additive_emb = False):
119 | if use_fasttext:
120 | inp = Input(shape=(maxlen, embed_size))
121 | x = inp
122 | else:
123 | inp = Input(shape = (maxlen, ))
124 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
125 |
126 | if use_additive_emb:
127 | x = AdditiveLayer()(x)
128 | x = Dropout(0.5)(x)
129 |
130 | x = Bidirectional(CuDNNLSTM(128, return_sequences = True))(x)
131 |
132 |
133 | conv_ops = []
134 | for filter_size in filter_sizes:
135 | conv = Conv1D(128, filter_size, activation = 'relu')(x)
136 | pool = MaxPool1D(5)(conv)
137 | conv_ops.append(pool)
138 |
139 | concat = Concatenate(axis = 1)(conv_ops)
140 | concat = Dropout(0.5)(concat)
141 | # concat = BatchNormalization()(concat)
142 |
143 |
144 | conv_2 = Conv1D(128, 5, activation = 'relu')(concat)
145 | conv_2 = MaxPool1D(5)(conv_2)
146 | # conv_2 = BatchNormalization()(conv_2)
147 | conv_2 = Dropout(0.5)(conv_2)
148 |
149 | # conv_3 = Conv1D(128, 5, activation = 'relu')(conv_2)
150 | # conv_3 = MaxPool1D(5)(conv_3)
151 | # conv_3 = BatchNormalization()(conv_3)
152 | # conv_3 = Dropout(0.1)(conv_3)
153 |
154 |
155 | flat = Flatten()(conv_2)
156 |
157 | op = Dense(64, activation = "relu")(flat)
158 | op = Dropout(0.5)(op)
159 | # op = BatchNormalization()(op)
160 | op = Dense(1, activation = "sigmoid")(op)
161 |
162 | model = Model(inputs = inp, outputs = op)
163 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
164 | return model
165 |
--------------------------------------------------------------------------------
/test_elmo.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences, find_threshold
2 | import numpy as np
3 | from scripts.constant import DEFAULT_MAX_FEATURES
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import f1_score
6 | from elmoformanylangs import Embedder
7 | import tensorflow as tf
8 | import random as rn
9 | import pandas as pd
10 | import timeit
11 |
12 |
13 |
14 | from keras.models import Model, load_model, model_from_json
15 | from keras.utils import Sequence
16 | from keras.layers import Dense, Embedding, Input, GRU, Bidirectional, GlobalMaxPool1D, Dropout, Lambda
17 | from keras.callbacks import EarlyStopping, ModelCheckpoint
18 | import keras.backend as K
19 |
20 | # np random seed:
21 | np.random.seed(22)
22 |
23 | # # Setting the seed for python random numbers
24 | rn.seed(1254)
25 | #
26 | # # Setting the graph-level random seed.
27 | tf.set_random_seed(89)
28 |
29 | elmo_path = "./data/elmo/"
30 |
31 |
32 | batch_size = 16
33 | epochs = 100
34 |
35 |
36 |
37 | elmo = Embedder(elmo_path, batch_size = batch_size)
38 |
39 |
40 | def to_length(texts, length):
41 | def pad_func(vector, pad_width, iaxis, kwargs):
42 | str = kwargs.get('padder', '')
43 | vector[:pad_width[0]] = str
44 | vector[-pad_width[1]:] = str
45 | return vector
46 |
47 | ret = []
48 | for sentence in texts:
49 | sentence = np.array(sentence, dtype = np.unicode)
50 | sentence = sentence[:min(length, len(sentence))]
51 | if length > len(sentence):
52 | sentence = np.pad(
53 | sentence, mode = pad_func,
54 | pad_width = (0, length - len(sentence))
55 | )
56 | ret.append(sentence)
57 |
58 | return np.array(ret)
59 |
60 |
61 | class TrainSeq(Sequence):
62 | def __init__(self, X, y, batch_size):
63 | self._X, self._y = X, y
64 | self._batch_size = batch_size
65 | self._indices = np.arange(len(self._X))
66 |
67 | def __len__(self):
68 | return len(self._X) // self._batch_size
69 |
70 | def __getitem__(self, idx):
71 | id = self._indices[idx * self._batch_size:(idx + 1) * self._batch_size]
72 | return np.array(elmo.sents2elmo(self._X[id])), self._y[id]
73 |
74 | def on_epoch_end(self):
75 | np.random.shuffle(self._indices)
76 |
77 |
78 | class TestSeq(Sequence):
79 | def __init__(self, x, batch_size):
80 | self._X = x
81 | self._batch_size = batch_size
82 |
83 | def __len__(self):
84 |         return len(self._X) // self._batch_size
85 |
86 | def __getitem__(self, idx):
87 | return np.array(elmo.sents2elmo(self._X[idx * self._batch_size:(idx + 1) * self._batch_size]))
88 |
89 |
90 |
91 |
92 | def RNNKerasCPUNoEmbedding(embed_size = 1024, maxlen = 100):
93 | inp = Input(shape = (maxlen, embed_size))
94 | x = Bidirectional(GRU(256, return_sequences = True))(inp)
95 | x = Dropout(0.5)(x)
96 | x = Bidirectional(GRU(256, return_sequences = True))(x)
97 | x = Dropout(0.5)(x)
98 | x = GlobalMaxPool1D()(x)
99 | x = Dropout(0.5)(x)
100 | x = Dense(64, activation = "relu")(x)
101 | x = Dropout(0.5)(x)
102 | x = Dense(1, activation = "sigmoid")(x)
103 | model = Model(inputs = inp, outputs = x)
104 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
105 | return model
106 |
107 |
108 |
109 |
110 |
111 | def f1(y_true, y_pred):
112 | def recall(y_true, y_pred):
113 | """Recall metric.
114 |
115 | Only computes a batch-wise average of recall.
116 |
117 | Computes the recall, a metric for multi-label classification of
118 | how many relevant items are selected.
119 | """
120 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
121 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
122 | recall = true_positives / (possible_positives + K.epsilon())
123 | return recall
124 |
125 | def precision(y_true, y_pred):
126 | """Precision metric.
127 |
128 | Only computes a batch-wise average of precision.
129 |
130 | Computes the precision, a metric for multi-label classification of
131 | how many selected items are relevant.
132 | """
133 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
134 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
135 | precision = true_positives / (predicted_positives + K.epsilon())
136 | return precision
137 | precision = precision(y_true, y_pred)
138 | recall = recall(y_true, y_pred)
139 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
140 |
141 |
142 |
143 |
144 |
145 | data = read_file("./data/train.crash")
146 | data_test = read_file("./data/test.crash", is_train = False)
147 |
148 | labels = data["label"].values.astype(np.float16).reshape(-1, 1)
149 | texts = tokenize(data["text"])
150 | texts_test = tokenize(data_test["text"])
151 |
152 |
153 | texts = to_length(texts, 100)
154 | texts_test = to_length(texts_test, 100)
155 |
156 | texts_train, texts_val, labels_train, labels_val = train_test_split(
157 | texts, labels,
158 | test_size = 0.05
159 | )
160 |
161 |
162 | checkpoint = ModelCheckpoint(
163 | filepath = "./Weights/model_elmo.hdf5",
164 | monitor = 'val_f1', verbose = 1,
165 | mode = 'max',
166 | save_best_only = True
167 | )
168 | early = EarlyStopping(monitor = "val_f1", mode = "max", patience = 3)
169 | callbacks_list = [checkpoint, early]
170 |
171 | train_seq = TrainSeq(texts_train, labels_train, batch_size = batch_size)
172 | val_seq = TrainSeq(texts_val, labels_val, batch_size = 1)
173 | test_seq = TestSeq(texts_test, batch_size = 1)
174 |
175 |
176 | model = RNNKerasCPUNoEmbedding()
177 | model.fit_generator(
178 | train_seq,
179 | validation_data = val_seq,
180 | callbacks = callbacks_list,
181 | epochs = epochs,
182 | workers = False
183 | )
184 |
185 |
186 |
187 |
188 | model.load_weights("./Weights/model_elmo.hdf5")
189 | prediction_prob = model.predict_generator(val_seq, workers = False)
190 |
191 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
192 | print(OPTIMAL_THRESHOLD)
193 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
194 | print(f1_score(
195 | y_true = labels_val.reshape(-1),
196 | y_pred = prediction.reshape(-1)
197 | ))
198 |
199 |
200 |
201 | prediction_test = model.predict_generator(test_seq, workers = False)
202 | df_predicton = pd.read_csv("./data/sample_submission.csv")
203 | df_predicton["label"] = (prediction_test > OPTIMAL_THRESHOLD).astype(np.int8)
204 | df_predicton.to_csv("./prediction/prediction_elmo.csv", index = False)
--------------------------------------------------------------------------------
/main_stack_hier.py:
--------------------------------------------------------------------------------
1 | from scripts.util import \
2 | read_file, \
3 | tokenize, make_embedding, text_to_sequences, \
4 | sent_embedding, sent_tokenize, text_sents_to_sequences, f1
5 | from scripts.constant import DEFAULT_MAX_FEATURES
6 | from sklearn.model_selection import train_test_split
7 | from scripts.rnn import SARNNKeras, HARNN, AttLayer, RNNKeras, OriginalHARNN, AdditiveLayer
8 | from scripts.cnn import VDCNN, TextCNN, LSTMCNN
9 | from scripts.stack import StackedGeneralizerWithHier
10 | import argparse
11 | import os
12 | import numpy as np
13 | import datetime
14 | import pandas as pd
15 | from sklearn.metrics import f1_score
16 |
17 | from sklearn.linear_model import LogisticRegression
18 | from sklearn.ensemble import RandomForestClassifier
19 | from sklearn.neural_network import MLPClassifier
20 |
21 | from keras.utils import CustomObjectScope
22 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
23 |
24 |
25 |
26 | def stack(models_list, hier_models_list, embedding_path, max_features, should_mix):
27 | model_name = '-'.join(
28 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
29 |
30 | train_data = read_file('./data/train.crash')
31 | test_data = read_file('./data/test.crash', is_train=False)
32 |
33 | train_tokenized_texts = tokenize(train_data['text'])
34 | test_tokenizes_texts = tokenize(test_data['text'])
35 |
36 | train_tokenized_texts_sent = sent_tokenize(train_data['text'])
37 | test_tokenizes_texts_sent = sent_tokenize(test_data['text'])
38 |
39 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
40 |
41 | embed_size, word_map, embedding_mat = make_embedding(
42 | list(train_tokenized_texts) +
43 | list(test_tokenizes_texts) if should_mix else train_tokenized_texts,
44 | embedding_path,
45 | max_features
46 | )
47 |
48 | embed_size_sent, word_map_sent, embedding_mat_sent = sent_embedding(
49 | list(train_tokenized_texts_sent) +
50 | list(test_tokenizes_texts_sent) if should_mix else train_tokenized_texts_sent,
51 | embedding_path,
52 | max_features
53 | )
54 |
55 |
56 | texts_id = text_to_sequences(train_tokenized_texts, word_map)
57 | texts_id_sent = text_sents_to_sequences(
58 | train_tokenized_texts_sent,
59 | word_map_sent,
60 | max_nb_sent = 3,
61 | max_sent_len = 50
62 | )
63 | print('Number of train data: {}'.format(labels.shape))
64 |
65 | texts_id_train, texts_id_val, texts_id_sent_train, texts_id_sent_val, labels_train, labels_val = train_test_split(
66 | texts_id, texts_id_sent, labels, test_size=0.05)
67 |
68 | model_path = './models/{}-version'.format(model_name)
69 |
70 | try:
71 | os.mkdir('./models')
72 | except:
73 | print('Folder already created')
74 | try:
75 | os.mkdir(model_path)
76 | except:
77 | print('Folder already created')
78 |
79 | batch_size = 16
80 | epochs = 100
81 | patience = 3
82 |
83 | # meta_model = RandomForestClassifier (
84 | # n_estimators=200,
85 | # criterion="entropy",
86 | # max_depth=5,
87 | # max_features=0.5
88 | # )
89 | # meta_model = MLPClassifier(
90 | # hidden_layer_sizes = (10),
91 | # early_stopping = True,
92 | # validation_fraction = 0.05,
93 | # batch_size = batch_size,
94 | # n_iter_no_change = patience
95 | # )
96 | meta_model = LogisticRegression()
97 |
98 | models = [
99 | model(
100 | embeddingMatrix=embedding_mat,
101 | embed_size=embed_size,
102 | max_features=embedding_mat.shape[0]
103 | )
104 | for model in models_list
105 | ]
106 |
107 | hier_models = [
108 | model(
109 | embeddingMatrix=embedding_mat_sent,
110 | embed_size=embed_size_sent,
111 | max_features=embedding_mat_sent.shape[0],
112 | max_nb_sent = 3,
113 | max_sent_len = 50
114 | )
115 | for model in hier_models_list
116 | ]
117 |
118 |
119 |
120 | stack = StackedGeneralizerWithHier(models, hier_models, meta_model)
121 | stack.train_meta_model(
122 | X = texts_id_train, y = labels_train,
123 | X_val = texts_id_val, y_val = labels_val,
124 | X_hier = texts_id_sent_train, X_hier_val = texts_id_sent_val,
125 | model_path = model_path,
126 | epochs = epochs,
127 | batch_size = batch_size,
128 | patience = patience
129 | )
130 |
131 | stack.train_models(
132 | X = texts_id_train, y = labels_train,
133 | X_val = texts_id_val, y_val = labels_val,
134 | X_hier = texts_id_sent_train, X_hier_val = texts_id_sent_val,
135 | batch_size = batch_size,
136 | epochs = epochs,
137 | patience = patience,
138 | model_path = model_path
139 | )
140 |
141 | prediction = stack.predict(texts_id_val, texts_id_sent_val)
142 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
143 | with open('{}/f1'.format(model_path), 'w') as fp:
144 | fp.write(str(f1_score(prediction, labels_val)))
145 |
146 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map)
147 | test_id_texts_sent = text_sents_to_sequences(test_tokenizes_texts_sent, word_map_sent, 3, 50)
148 | test_prediction = stack.predict(test_id_texts, test_id_texts_sent)
149 |
150 | df_predicton = pd.read_csv("./data/sample_submission.csv")
151 | df_predicton["label"] = test_prediction
152 |
153 | print('Number of test data: {}'.format(df_predicton.shape[0]))
154 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
155 |
156 |
157 |
158 | if __name__ == '__main__':
159 | models_list = [
160 | VDCNN, TextCNN, SARNNKeras, RNNKeras
161 | ]
162 | hier_models_list = [
163 | OriginalHARNN, HARNN
164 | ]
165 | parser = argparse.ArgumentParser()
166 | parser.add_argument(
167 | '-e',
168 | '--embedding',
169 |         help='Path to the pretrained embedding file',
170 | default='./embeddings/smallFasttext.vi.vec'
171 | )
172 | parser.add_argument(
173 | '--max',
174 |         help='Maximum number of words in the embedding vocabulary',
175 | default=DEFAULT_MAX_FEATURES
176 | )
177 | parser.add_argument(
178 | '--mix',
179 | action='store_true',
180 |         help='Build the vocabulary from train + test texts'
181 | )
182 | args = parser.parse_args()
183 | with CustomObjectScope({
184 | 'SeqSelfAttention': SeqSelfAttention,
185 | 'SeqWeightedAttention': SeqWeightedAttention,
186 | 'AttLayer': AttLayer,
187 | 'AdditiveLayer': AdditiveLayer,
188 | 'f1': f1
189 | }):
190 | stack(models_list, hier_models_list, args.embedding,
191 | int(args.max), args.mix)
192 |
--------------------------------------------------------------------------------
/main_elmo.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize
2 | from sklearn.model_selection import train_test_split
3 | from keras.callbacks import EarlyStopping, ModelCheckpoint
4 | from scripts.rnn import RNNKeras, RNNKerasCPU, LSTMKeras, SARNNKerasCPU, SARNNKeras
5 | from scripts.cnn import TextCNN, LSTMCNN, VDCNN
6 | import argparse
7 | import os
8 | import numpy as np
9 | import datetime
10 | import pandas as pd
11 | from scripts.util import find_threshold
12 | from sklearn.metrics import f1_score
13 | from keras.utils import Sequence
14 | from elmoformanylangs import Embedder
15 |
16 |
17 |
18 |
19 | def train_model(model, embedding_path, should_find_threshold, return_prob, use_additive_emb):
20 | batch_size = 16
21 | epochs = 100
22 | max_len = 100
23 |
24 | def to_length(texts, length):
25 | def pad_func(vector, pad_width, iaxis, kwargs):
26 | str = kwargs.get('padder', '')
27 | vector[:pad_width[0]] = str
28 | vector[-pad_width[1]:] = str
29 | return vector
30 |
31 | ret = []
32 | for sentence in texts:
33 | sentence = np.array([token.replace("_", " ") for token in sentence], dtype=np.unicode)
34 | sentence = sentence[:min(length, len(sentence))]
35 | if length > len(sentence):
36 | sentence = np.pad(
37 | sentence, mode=pad_func,
38 | pad_width=(0, length - len(sentence))
39 | )
40 | ret.append(sentence)
41 |
42 | return np.array(ret)
43 |
44 | class TrainSeq(Sequence):
45 | def __init__(self, X, y, batch_size, elmo):
46 | self._X, self._y = X, y
47 | self._batch_size = batch_size
48 | self._indices = np.arange(len(self._X))
49 | self._elmo = elmo
50 |
51 | def __len__(self):
52 | return int(np.ceil(len(self._X) / float(self._batch_size)))
53 |
54 | def __getitem__(self, idx):
55 | id = self._indices[idx * self._batch_size:(idx + 1) * self._batch_size]
56 | return np.array(self._elmo.sents2elmo(self._X[id])), self._y[id]
57 |
58 | def on_epoch_end(self):
59 | np.random.shuffle(self._indices)
60 |
61 | class TestSeq(Sequence):
62 | def __init__(self, x, batch_size, elmo):
63 | self._X = x
64 | self._batch_size = batch_size
65 | self._elmo = elmo
66 |
67 | def __len__(self):
68 | return int(np.ceil(len(self._X) / float(self._batch_size)))
69 |
70 | def __getitem__(self, idx):
71 | return np.array(self._elmo.sents2elmo(self._X[idx * self._batch_size:(idx + 1) * self._batch_size]))
72 |
73 | model_name = '-'.join(
74 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
75 |
76 | elmo = Embedder(embedding_path, batch_size=batch_size)
77 |
78 | train_data = read_file('./data/train.crash')
79 | test_data = read_file('./data/test.crash', is_train=False)
80 | train_tokenized_texts = tokenize(train_data['text'])
81 | test_tokenizes_texts = tokenize(test_data['text'])
82 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
83 |
84 | texts = to_length(train_tokenized_texts, max_len)
85 | texts_test = to_length(test_tokenizes_texts, max_len)
86 |
87 | print('Number of train data: {}'.format(labels.shape))
88 |
89 | texts_train, texts_val, labels_train, labels_val = train_test_split(
90 | texts, labels,
91 | test_size=0.05
92 | )
93 |
94 | model_path = './models/{}-version'.format(model_name)
95 |
96 | try:
97 | os.mkdir('./models')
98 | except:
99 | print('Folder already created')
100 | try:
101 | os.mkdir(model_path)
102 | except:
103 | print('Folder already created')
104 |
105 | checkpoint = ModelCheckpoint(
106 | filepath='{}/models.hdf5'.format(model_path),
107 | monitor='val_f1', verbose=1,
108 | mode='max',
109 | save_best_only=True
110 | )
111 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5)
112 | callbacks_list = [checkpoint, early]
113 |
114 | train_seq = TrainSeq(texts_train, labels_train, batch_size=batch_size, elmo = elmo)
115 | val_seq = TrainSeq(texts_val, labels_val, batch_size=min(batch_size, len(texts_val)), elmo = elmo)
116 | test_seq = TestSeq(texts_test, batch_size=min(batch_size, len(texts_test)), elmo = elmo)
117 |
118 | model = model(
119 | maxlen = max_len,
120 | embed_size=1024,
121 | use_fasttext = True,
122 | use_additive_emb = use_additive_emb
123 | )
124 | model.fit_generator(
125 | train_seq,
126 | validation_data=val_seq,
127 | callbacks=callbacks_list,
128 | epochs=epochs,
129 | workers=False
130 | )
131 |
132 | model.load_weights('{}/models.hdf5'.format(model_path))
133 | prediction_prob = model.predict_generator(val_seq, workers=False)
134 | if should_find_threshold:
135 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
136 | else:
137 | OPTIMAL_THRESHOLD = 0.5
138 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD))
139 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
140 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
141 | with open('{}/f1'.format(model_path), 'w') as fp:
142 | fp.write(str(f1_score(prediction, labels_val)))
143 |
144 | test_prediction = model.predict_generator(test_seq, workers=False)
145 |
146 | df_predicton = pd.read_csv("./data/sample_submission.csv")
147 | if return_prob:
148 | df_predicton["label"] = test_prediction
149 | else:
150 | df_predicton["label"] = (
151 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8)
152 |
153 | print('Number of test data: {}'.format(df_predicton.shape[0]))
154 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
155 |
156 |
157 | model_dict = {
158 | 'RNNKeras': RNNKeras,
159 | 'RNNKerasCPU': RNNKerasCPU,
160 | 'LSTMKeras': LSTMKeras,
161 | 'SARNNKerasCPU': SARNNKerasCPU,
162 | 'SARNNKeras': SARNNKeras,
163 | 'TextCNN': TextCNN,
164 | 'LSTMCNN': LSTMCNN,
165 | 'VDCNN': VDCNN
166 | }
167 |
168 | if __name__ == '__main__':
169 | parser = argparse.ArgumentParser()
170 | parser.add_argument(
171 | '-m',
172 | '--model',
173 |         help='Model to use',
174 | default='RNNKerasCPU'
175 | )
176 | parser.add_argument(
177 | '-e',
178 | '--embedding',
179 |         help='Path to the pretrained ELMo model directory',
180 | default='./embeddings/smallFasttext.vi.vec'
181 | )
182 | parser.add_argument(
183 | '--find_threshold',
184 | action='store_true',
185 |         help='Search for the best decision threshold on the validation set'
186 | )
187 | parser.add_argument(
188 | '--prob',
189 | action='store_true',
190 |         help='Write probabilities instead of hard labels to the submission file'
191 | )
192 | parser.add_argument(
193 | '--add_embed',
194 | action='store_true',
195 |         help='Add a learned additive bias to the embeddings'
196 | )
197 | args = parser.parse_args()
198 | if not args.model in model_dict:
199 | raise RuntimeError('Model not found')
200 | train_model(model_dict[args.model], args.embedding, args.find_threshold, args.prob, args.add_embed)
201 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, tokenize, make_embedding, text_to_sequences
2 | from scripts.rnn import RNNKeras
3 | from scripts.constant import DEFAULT_MAX_FEATURES
4 | from sklearn.model_selection import train_test_split
5 | from keras.callbacks import EarlyStopping, ModelCheckpoint
6 | from scripts.rnn import RNNKeras, RNNKerasCPU, LSTMKeras, SARNNKerasCPU, SARNNKeras
7 | from scripts.cnn import TextCNN, LSTMCNN, VDCNN
8 | import argparse
9 | import os
10 | import numpy as np
11 | import datetime
12 | import pandas as pd
13 | from scripts.util import find_threshold
14 | from scripts.augment import similar_augment, create_sim_dict, similar_augment_from_sim_dict
15 | from sklearn.metrics import f1_score
16 | from keras.utils.vis_utils import plot_model
17 |
18 |
19 | def train_model(
20 | model, embedding_path, annoy_path,
21 | max_features, should_find_threshold, should_mix,
22 | return_prob, trainable, use_additive_emb, augment_size, use_sim_dict,
23 | print_model, model_high
24 | ):
25 | model_name = '-'.join(
26 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
27 |
28 | augment_size = int(augment_size)
29 |
30 | train_data = read_file('./data/train.crash')
31 | test_data = read_file('./data/test.crash', is_train=False)
32 | train_tokenized_texts = tokenize(train_data['text'])
33 | test_tokenizes_texts = tokenize(test_data['text'])
34 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
35 |
36 | train_tokenized_texts, val_tokenized_texts, labels_train, labels_val = train_test_split(
37 | train_tokenized_texts, labels, test_size = 0.05
38 | )
39 |
40 |
41 | if augment_size != 0 and not use_sim_dict:
42 | if augment_size < 0:
43 | augment_size = len(train_tokenized_texts) * (-augment_size)
44 |
45 | print(augment_size)
46 |
47 | train_tokenized_texts, labels_train = similar_augment(
48 | train_tokenized_texts,
49 | labels_train,
50 | n_increase = augment_size,
51 | model_path = embedding_path,
52 | n_word_replace = 10,
53 | use_annoy=True,
54 | annoy_path=annoy_path
55 | )
56 |
57 |
58 | embed_size, word_map, embedding_mat = make_embedding(
59 | list(train_tokenized_texts) + list(val_tokenized_texts) +
60 | list(test_tokenizes_texts) if should_mix else list(train_tokenized_texts) + list(val_tokenized_texts),
61 | embedding_path,
62 | max_features
63 | )
64 |
65 | texts_id_train = text_to_sequences(train_tokenized_texts, word_map)
66 |
67 | if augment_size != 0 and use_sim_dict:
68 | if augment_size < 0:
69 | augment_size = len(train_tokenized_texts) * (-augment_size)
70 | sim_dict = create_sim_dict(word_map, model_path = embedding_path, annoy_path = annoy_path)
71 | print("Finish Creating sim dict")
72 | texts_id_train, labels_train = similar_augment_from_sim_dict(
73 | texts_id_train, labels_train, sim_dict,
74 | n_increase = augment_size
75 | )
76 |
77 | texts_id_val = text_to_sequences(val_tokenized_texts, word_map)
78 | print('Number of train data: {}'.format(labels.shape))
79 |
80 | # texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
81 | # texts_id, labels, test_size=0.05)
82 |
83 | model_path = './models/{}-version'.format(model_name)
84 |
85 | try:
86 | os.mkdir('./models')
87 | except:
88 | print('Folder already created')
89 | try:
90 | os.mkdir(model_path)
91 | except:
92 | print('Folder already created')
93 |
94 | checkpoint = ModelCheckpoint(
95 | filepath='{}/models.hdf5'.format(model_path),
96 | monitor='val_f1', verbose=1,
97 | mode='max',
98 | save_best_only=True
99 | )
100 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5)
101 | callbacks_list = [checkpoint, early]
102 | batch_size = 16
103 | epochs = 100
104 |
105 | model = model(
106 | embeddingMatrix=embedding_mat,
107 | embed_size=embed_size,
108 | max_features=embedding_mat.shape[0],
109 | trainable = trainable,
110 | use_additive_emb = use_additive_emb
111 | )
112 | if print_model:
113 | plot_model(model, to_file='{}.png'.format(model_high), show_shapes=True, show_layer_names=True)
114 | return
115 | model.fit(
116 | texts_id_train, labels_train,
117 | validation_data=(texts_id_val, labels_val),
118 | callbacks=callbacks_list,
119 | epochs=epochs,
120 | batch_size=batch_size
121 | )
122 |
123 | model.load_weights('{}/models.hdf5'.format(model_path))
124 | prediction_prob = model.predict(texts_id_val)
125 | if should_find_threshold:
126 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
127 | else:
128 | OPTIMAL_THRESHOLD = 0.5
129 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD))
130 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
131 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
132 | with open('{}/f1'.format(model_path), 'w') as fp:
133 | fp.write(str(f1_score(prediction, labels_val)))
134 |
135 | test_id_texts = text_to_sequences(test_tokenizes_texts, word_map)
136 | test_prediction = model.predict(test_id_texts)
137 |
138 | df_predicton = pd.read_csv("./data/sample_submission.csv")
139 | if return_prob:
140 | df_predicton["label"] = test_prediction
141 | else:
142 | df_predicton["label"] = (
143 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8)
144 |
145 | print('Number of test data: {}'.format(df_predicton.shape[0]))
146 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
147 |
148 |
149 | model_dict = {
150 | 'RNNKeras': RNNKeras,
151 | 'RNNKerasCPU': RNNKerasCPU,
152 | 'LSTMKeras': LSTMKeras,
153 | 'SARNNKerasCPU': SARNNKerasCPU,
154 | 'SARNNKeras': SARNNKeras,
155 | 'TextCNN': TextCNN,
156 | 'LSTMCNN': LSTMCNN,
157 | 'VDCNN': VDCNN
158 | }
159 |
160 | if __name__ == '__main__':
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument(
163 | '-m',
164 | '--model',
165 |         help='Model to use',
166 | default='RNNKerasCPU'
167 | )
168 | parser.add_argument(
169 | '-e',
170 | '--embedding',
171 |         help='Path to the word-embedding file (.vec or .bin)',
172 | default='./embeddings/smallFasttext.vi.vec'
173 | )
174 | parser.add_argument(
175 | '-annoy',
176 | '--annoy',
177 |         help='Path to the annoy index used for similarity-based augmentation',
178 | default='./embeddings/annoy.pkl'
179 | )
180 | parser.add_argument(
181 | '--max',
182 |         help='Maximum vocabulary size (max_features)',
183 | default=DEFAULT_MAX_FEATURES
184 | )
185 | parser.add_argument(
186 | '--aug',
187 |         help='Number of augmented samples to add (negative means a multiple of the training set)',
188 | default=0
189 | )
190 | parser.add_argument(
191 | '--use_sim_dict',
192 | action='store_true',
193 |         help='Augment using a precomputed similarity dictionary'
194 | )
195 | parser.add_argument(
196 | '--find_threshold',
197 | action='store_true',
198 |         help='Search for the optimal decision threshold on the validation set'
199 | )
200 | parser.add_argument(
201 | '--mix',
202 | action='store_true',
203 |         help='Include the test texts when building the embedding vocabulary'
204 | )
205 | parser.add_argument(
206 | '--prob',
207 | action='store_true',
208 |         help='Write probabilities instead of hard labels to the submission file'
209 | )
210 | parser.add_argument(
211 | '--fix_embed',
212 | action='store_false',
213 |         help='Freeze the embedding layer (disable fine-tuning)'
214 | )
215 | parser.add_argument(
216 | '--add_embed',
217 | action='store_true',
218 |         help='Add a trainable additive embedding layer (AdditiveLayer)'
219 | )
220 | parser.add_argument(
221 | '--print_model',
222 | action='store_true',
223 |         help='Plot the model architecture to a PNG and exit'
224 | )
225 | args = parser.parse_args()
226 |     if args.model not in model_dict:
227 | raise RuntimeError('Model not found')
228 | train_model(model_dict[args.model], args.embedding, args.annoy,
229 | int(args.max), args.find_threshold, args.mix, args.prob, args.fix_embed, args.add_embed, args.aug,
230 | args.use_sim_dict, args.print_model, args.model)
231 |
--------------------------------------------------------------------------------
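For reference, the command-line entry point above is a thin wrapper around `train_model`. The sketch below is an illustration only (not part of the repository): it mirrors the parser defaults and assumes the `./data` and `./embeddings` layout described in the README, with the project root on the Python path.

```python
# Hedged sketch: programmatic equivalent of running `main.py` with its default flags.
from main import train_model
from scripts.constant import DEFAULT_MAX_FEATURES
from scripts.rnn import RNNKerasCPU

train_model(
    RNNKerasCPU,                          # model                 (--model default)
    './embeddings/smallFasttext.vi.vec',  # embedding_path        (--embedding)
    './embeddings/annoy.pkl',             # annoy_path            (--annoy)
    DEFAULT_MAX_FEATURES,                 # max_features          (--max)
    False,                                # should_find_threshold (--find_threshold)
    False,                                # should_mix            (--mix)
    False,                                # return_prob           (--prob)
    True,                                 # trainable             (False only when --fix_embed is passed)
    False,                                # use_additive_emb      (--add_embed)
    0,                                    # augment_size          (--aug)
    False,                                # use_sim_dict          (--use_sim_dict)
    False,                                # print_model           (--print_model)
    'RNNKerasCPU',                        # model_high, used only to name the plotted PNG
)
```
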
/main_hierarchical.py:
--------------------------------------------------------------------------------
1 | from scripts.util import read_file, sent_tokenize, sent_embedding, text_sents_to_sequences
2 | from scripts.constant import DEFAULT_MAX_FEATURES
3 | from sklearn.model_selection import train_test_split
4 | from keras.callbacks import EarlyStopping, ModelCheckpoint
5 | from scripts.rnn import HRNN, HRNNCPU, OriginalHARNN, OriginalHARNNCPU, HARNN, HARNNCPU
6 | import argparse
7 | import os
8 | import numpy as np
9 | import datetime
10 | import pandas as pd
11 | from scripts.util import find_threshold
12 | from scripts.augment import shuffle_augment
13 | from sklearn.metrics import f1_score
14 | from keras.utils.vis_utils import plot_model
15 |
16 |
17 | def train_model(
18 | model, embedding_path,
19 | max_features, max_nb_sent, max_sent_len,
20 | should_find_threshold, should_mix,
21 | return_prob, trainable, use_additive_emb, augment_size, aug_min_len, print_model, model_high
22 | ):
23 | model_name = '-'.join(
24 | '.'.join(str(datetime.datetime.now()).split('.')[:-1]).split(' '))
25 |
26 | train_data = read_file('./data/train.crash')
27 | test_data = read_file('./data/test.crash', is_train=False)
28 | train_tokenized_texts = sent_tokenize(train_data['text'])
29 | test_tokenizes_texts = sent_tokenize(test_data['text'])
30 | labels = train_data['label'].values.astype(np.float16).reshape(-1, 1)
31 |
32 | train_tokenized_texts, val_tokenized_texts, labels_train, labels_val = train_test_split(
33 | train_tokenized_texts, labels, test_size=0.05
34 | )
35 |
36 | augment_size = int(augment_size)
37 | aug_min_len = int(aug_min_len)
38 | max_nb_sent = int(max_nb_sent)
39 | max_sent_len = int(max_sent_len)
40 |
41 | if augment_size != 0:
42 | if augment_size < 0:
43 | augment_size = len(train_tokenized_texts) * (-augment_size)
44 |
45 | print(augment_size)
46 |
47 | train_tokenized_texts, labels_train = shuffle_augment(
48 | train_tokenized_texts,
49 | labels_train,
50 | n_increase = augment_size,
51 | min_length = aug_min_len
52 | )
53 |
54 | embed_size, word_map, embedding_mat = sent_embedding(
55 | list(train_tokenized_texts) + list(val_tokenized_texts) +
56 | list(test_tokenizes_texts) if should_mix
57 | else list(train_tokenized_texts) + list(val_tokenized_texts),
58 | embedding_path,
59 | max_features
60 | )
61 |
62 | texts_id_train = text_sents_to_sequences(
63 | train_tokenized_texts,
64 | word_map,
65 | max_nb_sent = max_nb_sent,
66 | max_sent_len = max_sent_len
67 | )
68 |
69 | texts_id_val = text_sents_to_sequences(
70 | val_tokenized_texts,
71 | word_map,
72 | max_nb_sent = max_nb_sent,
73 | max_sent_len = max_sent_len
74 | )
75 |
76 |
77 | # texts_id = text_sents_to_sequences(
78 | # train_tokenized_texts,
79 | # word_map,
80 | # max_nb_sent = max_nb_sent,
81 | # max_sent_len = max_sent_len
82 | # )
83 | print('Number of train data: {}'.format(labels.shape))
84 |
85 | # texts_id_train, texts_id_val, labels_train, labels_val = train_test_split(
86 | # texts_id, labels, test_size=0.05)
87 |
88 | model_path = './models/{}-version'.format(model_name)
89 |
90 | try:
91 | os.mkdir('./models')
92 |     except FileExistsError:
93 | print('Folder already created')
94 | try:
95 | os.mkdir(model_path)
96 |     except FileExistsError:
97 | print('Folder already created')
98 |
99 | checkpoint = ModelCheckpoint(
100 | filepath='{}/models.hdf5'.format(model_path),
101 | monitor='val_f1', verbose=1,
102 | mode='max',
103 | save_best_only=True
104 | )
105 | early = EarlyStopping(monitor='val_f1', mode='max', patience=5)
106 | callbacks_list = [checkpoint, early]
107 | batch_size = 16
108 | epochs = 100
109 |
110 | model = model(
111 | embeddingMatrix=embedding_mat,
112 | embed_size=embed_size,
113 | max_features=embedding_mat.shape[0],
114 | max_nb_sent = max_nb_sent,
115 | max_sent_len = max_sent_len,
116 | trainable = trainable,
117 | use_additive_emb = use_additive_emb
118 | )
119 | if print_model:
120 | plot_model(model, to_file='{}.png'.format(model_high), show_shapes=True, show_layer_names=True)
121 | return
122 | model.fit(
123 | texts_id_train, labels_train,
124 | validation_data=(texts_id_val, labels_val),
125 | callbacks=callbacks_list,
126 | epochs=epochs,
127 | batch_size=batch_size
128 | )
129 |
130 | model.load_weights('{}/models.hdf5'.format(model_path))
131 | prediction_prob = model.predict(texts_id_val)
132 |
133 | if should_find_threshold:
134 | OPTIMAL_THRESHOLD = find_threshold(prediction_prob, labels_val)
135 | else:
136 | OPTIMAL_THRESHOLD = 0.5
137 | print('OPTIMAL_THRESHOLD: {}'.format(OPTIMAL_THRESHOLD))
138 | prediction = (prediction_prob > OPTIMAL_THRESHOLD).astype(np.int8)
139 | print('F1 validation score: {}'.format(f1_score(prediction, labels_val)))
140 | with open('{}/f1'.format(model_path), 'w') as fp:
141 | fp.write(str(f1_score(prediction, labels_val)))
142 |
143 | test_id_texts = text_sents_to_sequences(
144 | test_tokenizes_texts,
145 | word_map,
146 | max_nb_sent = max_nb_sent,
147 | max_sent_len = max_sent_len
148 | )
149 | test_prediction = model.predict(test_id_texts)
150 |
151 | df_predicton = pd.read_csv("./data/sample_submission.csv")
152 |
153 | if return_prob:
154 | df_predicton["label"] = test_prediction
155 | else:
156 | df_predicton["label"] = (
157 | test_prediction > OPTIMAL_THRESHOLD).astype(np.int8)
158 | print('Number of test data: {}'.format(df_predicton.shape[0]))
159 | df_predicton.to_csv('{}/prediction.csv'.format(model_path), index=False)
160 |
161 |
162 | model_dict = {
163 | 'HRNN': HRNN,
164 | 'HRNNCPU': HRNNCPU,
165 | 'HARNN': HARNN,
166 | 'HARNNCPU': HARNNCPU,
167 | 'OriginalHARNN': OriginalHARNN,
168 | 'OriginalHARNNCPU':OriginalHARNNCPU
169 | }
170 |
171 | if __name__ == '__main__':
172 | parser = argparse.ArgumentParser()
173 | parser.add_argument(
174 | '-m',
175 | '--model',
176 |         help='Model to use',
177 | default='HRNN'
178 | )
179 | parser.add_argument(
180 | '-e',
181 | '--embedding',
182 |         help='Path to the word-embedding file (.vec or .bin)',
183 | default='./embeddings/smallFasttext.vi.vec'
184 | )
185 | parser.add_argument(
186 | '--max',
187 |         help='Maximum vocabulary size (max_features)',
188 | default=DEFAULT_MAX_FEATURES
189 | )
190 | parser.add_argument(
191 | '--nb_sent',
192 |         help='Maximum number of sentences kept per document',
193 | default=3
194 | )
195 | parser.add_argument(
196 | '--sent_len',
197 |         help='Maximum number of tokens kept per sentence',
198 | default=50
199 | )
200 | parser.add_argument(
201 | '--aug',
202 |         help='Number of augmented samples to add (negative means a multiple of the training set)',
203 | default=0
204 | )
205 | parser.add_argument(
206 | '--aug_min_len',
207 |         help='Minimum number of sentences a document needs for shuffle augmentation',
208 | default=1
209 | )
210 | parser.add_argument(
211 | '--find_threshold',
212 | action='store_true',
213 | help='Model use'
214 | )
215 | parser.add_argument(
216 | '--mix',
217 | action='store_true',
218 | help='Model use'
219 | )
220 | parser.add_argument(
221 | '--prob',
222 | action='store_true',
223 | help='Model use'
224 | )
225 | parser.add_argument(
226 | '--fix_embed',
227 | action='store_false',
228 | help='Model use'
229 | )
230 | parser.add_argument(
231 | '--add_embed',
232 | action='store_true',
233 | help='Model use'
234 | )
235 | parser.add_argument(
236 | '--print_model',
237 | action='store_true',
238 | help='Model use'
239 | )
240 | args = parser.parse_args()
241 |     if args.model not in model_dict:
242 | raise RuntimeError('Model not found')
243 | train_model(
244 | model_dict[args.model], args.embedding,
245 | int(args.max), args.nb_sent, args.sent_len,
246 | args.find_threshold, args.mix, args.prob,
247 | args.fix_embed, args.add_embed, args.aug, args.aug_min_len, args.print_model, args.model
248 | )
249 |
--------------------------------------------------------------------------------
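The hierarchical driver differs from `main.py` mainly in how texts are encoded: each review becomes a fixed-size matrix of word ids, one row per sentence. A toy illustration of what `text_sents_to_sequences` (defined in `scripts/util.py`) produces is shown below; the vocabulary and indices are made up for the example.

```python
# Toy example: one review with two sentences, padded to (max_nb_sent, max_sent_len).
from collections import defaultdict
from scripts.util import text_sents_to_sequences

# Hypothetical word map; in the real pipeline sent_embedding() builds it and
# sends out-of-vocabulary words to the mean-embedding row.
word_map = defaultdict(lambda: 5, {'phim': 1, 'hay': 2, 'qua': 3, 'dien_vien': 4})

docs = [[['phim', 'hay', 'qua'], ['dien_vien', 'ok']]]  # one review, two sentences
X = text_sents_to_sequences(docs, word_map, max_nb_sent=3, max_sent_len=5)
print(X.shape)  # (1, 3, 5): this 3-D tensor is what the TimeDistributed encoders consume
print(X[0])
```
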
/scripts/util.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import copy
3 | import os
4 | import numpy as np
5 | import re
6 | import keras.backend as K
7 |
8 | from tqdm import tqdm
9 | from collections import defaultdict
10 | from os.path import abspath
11 | from spacy.lang.vi import Vietnamese
12 | from .constant import DEFAULT_MAX_LENGTH
13 | from gensim.models.keyedvectors import KeyedVectors
14 | from sklearn.metrics import f1_score
15 | import string
16 |
17 |
18 | def split_array(arr, condition):
19 | if len(arr) == 0:
20 | return []
21 | result = []
22 | accumulated = [arr[0]]
23 | for ele in arr[1:]:
24 | if condition(ele):
25 | result.append(copy.deepcopy(accumulated))
26 | accumulated = [copy.deepcopy(ele)]
27 | else:
28 | accumulated.append(copy.deepcopy(ele))
29 | result.append(copy.deepcopy(accumulated))
30 | return result
31 |
32 |
33 | def read_file(file_path, is_train=True):
34 | file_path = abspath(file_path)
35 | data_lines = list(
36 | filter(lambda x: x != '', open(file_path).read().split('\n')))
37 | pattern = ('train' if is_train else 'test') + '_[0-9]{5}'
38 | datas = split_array(data_lines, lambda x: bool(re.match(pattern, x)))
39 | if is_train:
40 | result_array = list(map(
41 | lambda x: [x[0], ' '.join(x[1:-1]), int(x[-1])], datas))
42 | else:
43 | result_array = list(map(lambda x: [x[0], ' '.join(x[1:])], datas))
44 | columns = ['name', 'text', 'label'] if is_train else ['name', 'text']
45 | return pd.DataFrame(result_array, columns=columns)
46 |
47 |
48 | def tokenize(texts):
49 | nlp = Vietnamese()
50 | docs = []
51 | for text in texts:
52 | tokens = np.array([postprocess_token(token.text) for token in nlp(text.lower())[1:-1]])
53 | docs.append(tokens)
54 |
55 | return docs
56 |
57 |
58 | def postprocess_token(token):
59 | if token in string.punctuation:
60 | return ''
61 | elif token.isdigit():
62 | return ''
63 | else:
64 | return token
65 |
66 |
67 |
68 | def make_embedding(texts, embedding_path, max_features):
69 | embedding_path = abspath(embedding_path)
70 |
71 | def get_coefs(word, *arr):
72 | return word, np.asarray(arr, dtype='float32')
73 |
74 | if embedding_path.endswith('.vec'):
75 | embedding_index = dict(get_coefs(*o.strip().split(" "))
76 | for o in open(embedding_path))
77 |         mean_embedding = np.mean(np.array(list(embedding_index.values())), axis=0)
78 | elif embedding_path.endswith('bin'):
79 | embedding_index = KeyedVectors.load_word2vec_format(
80 | embedding_path, binary=True)
81 | mean_embedding = np.mean(embedding_index.vectors, axis=0)
82 | embed_size = mean_embedding.shape[0]
83 | word_index = sorted(list({word.lower() for sentence in texts for word in sentence}))
84 | nb_words = min(max_features, len(word_index))
85 | embedding_matrix = np.zeros((nb_words + 1, embed_size))
86 | i = 1
87 | word_map = defaultdict(lambda: nb_words)
88 | for word in word_index:
89 | if i >= max_features:
90 | continue
91 | if word in embedding_index:
92 | embedding_matrix[i] = embedding_index[word]
93 | else:
94 | embedding_matrix[i] = mean_embedding
95 | word_map[word] = i
96 | i += 1
97 |
98 | embedding_matrix[-1] = mean_embedding
99 | return embed_size, word_map, embedding_matrix
100 |
101 | def text_to_sequences(texts, word_map, max_len=DEFAULT_MAX_LENGTH):
102 | texts_id = []
103 | for sentence in texts:
104 | sentence = [word_map[word.lower()] for word in sentence][:max_len]
105 |         padded_sentence = np.pad(
106 |             sentence, (0, max(0, max_len - len(sentence))), 'constant', constant_values=0)
107 |         texts_id.append(padded_sentence)
108 | return np.array(texts_id)
109 |
110 | def find_threshold(pred_proba, y_true, metric = f1_score):
111 | cur_acc = 0
112 | cur_thres = 0
113 | for ind in range(len(pred_proba) - 1):
114 | threshold = (pred_proba[ind][0] + pred_proba[ind + 1][0]) / 2
115 | pred = (pred_proba > threshold).astype(np.int8)
116 | acc = metric(pred, y_true)
117 | if acc > cur_acc:
118 | cur_thres = threshold
119 | cur_acc = acc
120 |
121 | return cur_thres
122 |
123 | def f1(y_true, y_pred):
124 | def recall(y_true, y_pred):
125 | """Recall metric.
126 |
127 | Only computes a batch-wise average of recall.
128 |
129 | Computes the recall, a metric for multi-label classification of
130 | how many relevant items are selected.
131 | """
132 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
133 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
134 | recall = true_positives / (possible_positives + K.epsilon())
135 | return recall
136 |
137 | def precision(y_true, y_pred):
138 | """Precision metric.
139 |
140 | Only computes a batch-wise average of precision.
141 |
142 | Computes the precision, a metric for multi-label classification of
143 | how many selected items are relevant.
144 | """
145 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
146 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
147 | precision = true_positives / (predicted_positives + K.epsilon())
148 | return precision
149 | precision = precision(y_true, y_pred)
150 | recall = recall(y_true, y_pred)
151 | return 2*((precision*recall)/(precision+recall+K.epsilon()))
152 |
153 | def predictions_to_submission(test_data, predictor):
154 | tqdm.pandas()
155 | submission = test_data[['id']]
156 | submission['label'] = test_data['text'].progress_apply(predictor)
157 | return submission
158 |
159 |
160 | # HELPERS FOR HIERARCHICAL MODEL:
161 | def sent_tokenize(texts):
162 | nlp = Vietnamese()
163 | nlp.add_pipe(nlp.create_pipe('sentencizer'))
164 | docs = []
165 | for text in texts:
166 | text_tokenized = []
167 | if (len(text) > 3):
168 | for sentence in nlp(text.lower()[1:-1]).sents:
169 | sent_tokens = np.array([postprocess_token(token.text) for token in sentence])
170 | text_tokenized.append(sent_tokens)
171 | else:
172 | text_tokenized.append([])
173 | docs.append(text_tokenized)
174 |
175 | return docs
176 |
177 |
178 | def sent_embedding(tokenized_texts, embedding_path, max_features):
179 | embedding_path = abspath(embedding_path)
180 |
181 | def get_coefs(word, *arr):
182 | return word, np.asarray(arr, dtype='float32')
183 |
184 | if embedding_path.endswith('.vec'):
185 | embedding_index = dict(get_coefs(*o.strip().split(" "))
186 | for o in open(embedding_path))
187 |         mean_embedding = np.mean(np.array(list(embedding_index.values())), axis=0)
188 | elif embedding_path.endswith('bin'):
189 | embedding_index = KeyedVectors.load_word2vec_format(
190 | embedding_path, binary=True)
191 | mean_embedding = np.mean(embedding_index.vectors, axis=0)
192 | embed_size = mean_embedding.shape[0]
193 |     word_index = sorted({word.lower() for text in tokenized_texts for sentence in text for word in sentence})
194 | nb_words = min(max_features, len(word_index))
195 | embedding_matrix = np.zeros((nb_words + 1, embed_size))
196 |
197 | i = 1
198 | word_map = defaultdict(lambda: nb_words)
199 | for word in word_index:
200 | if i >= max_features:
201 | continue
202 | if word in embedding_index:
203 | embedding_matrix[i] = embedding_index[word]
204 | else:
205 | embedding_matrix[i] = mean_embedding
206 | word_map[word] = i
207 | i += 1
208 | embedding_matrix[-1] = mean_embedding
209 | return embed_size, word_map, embedding_matrix
210 |
211 | def text_sents_to_sequences(texts, word_map, max_nb_sent, max_sent_len):
212 | ret = []
213 | for i in range(len(texts)):
214 | text_vecs = []
215 | for j in range(len(texts[i])):
216 | if (j < max_nb_sent):
217 | sent_vecs = []
218 | for k in range(len(texts[i][j])):
219 | if (k < max_sent_len):
220 | sent_vecs.append(word_map[texts[i][j][k]])
221 | if (len(sent_vecs) < max_sent_len):
222 | sent_vecs = np.pad(
223 | sent_vecs,
224 | (0, max(0, max_sent_len - len(sent_vecs))),
225 | 'constant',
226 | constant_values=0
227 | )
228 | text_vecs.append(sent_vecs)
229 |
230 |
231 | if (len(text_vecs) < max_nb_sent):
232 | text_vecs = np.pad(
233 | text_vecs,
234 | ((0, max_nb_sent - len(text_vecs)), (0, 0)),
235 | 'constant',
236 | constant_values=0
237 | )
238 |
239 | ret.append(text_vecs)
240 |
241 | return np.array(ret)
242 |
--------------------------------------------------------------------------------
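Two details of the helpers above are easy to miss: `word_map` is a `defaultdict` whose fallback index points at the extra row of `embedding_matrix` that holds the mean embedding, so every out-of-vocabulary word shares that vector, and `text_to_sequences` both truncates and zero-pads every text to `max_len`. A toy illustration (the ids are made up; in practice `make_embedding` builds the map):

```python
# Toy example of the id mapping and padding performed by text_to_sequences.
from collections import defaultdict
from scripts.util import text_to_sequences

word_map = defaultdict(lambda: 3, {'phim': 1, 'hay': 2})  # 3 = OOV / mean-embedding row
texts = [['phim', 'hay'], ['phim', 'chan', 'lam']]
print(text_to_sequences(texts, word_map, max_len=4))
# [[1 2 0 0]
#  [1 3 3 0]]  <- unknown words share the OOV id, short texts are zero-padded
```
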
/scripts/stack.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import KFold
3 | from scripts.util import f1
4 |
5 |
6 | from keras.callbacks import EarlyStopping, ModelCheckpoint
7 | from keras.models import load_model
8 |
9 | from keras.models import Model
10 | from keras.layers import \
11 | Dense, Embedding, Input, \
12 | Conv1D, MaxPool1D, \
13 | Dropout, BatchNormalization, \
14 | Bidirectional, CuDNNLSTM, \
15 | Concatenate, Flatten, Add
16 |
17 |
18 |
19 | class StackedGeneralizer:
20 |
21 | def __init__(self, models, meta_model):
22 | self._models = models
23 | self._meta_model = meta_model
24 | return
25 |
26 |
27 | def train_models(self, X, y, X_val, y_val, model_path, epochs, batch_size, patience):
28 | for ind in range(len(self._models)):
29 | checkpoint = ModelCheckpoint(
30 | filepath='{}/models.hdf5'.format(model_path),
31 | monitor='val_f1', verbose=1,
32 | mode='max',
33 | save_best_only=True
34 | )
35 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
36 | callbacks_list = [checkpoint, early]
37 | self._models[ind].compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
38 | self._models[ind].fit(
39 | X, y,
40 | validation_data= (X_val, y_val),
41 | callbacks=callbacks_list,
42 | epochs=epochs,
43 | batch_size=batch_size
44 | )
45 | self._models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path))
46 |
47 |
48 |
49 | def train_meta_model(self, X, y, X_val, y_val, model_path, epochs, batch_size, patience):
50 |
51 | # Obtain level-1 input from each model:
52 | meta_input = np.zeros((len(X), len(self._models)))
53 |
54 | for ind in range(len(self._models)):
55 | pred = np.zeros(len(X))
56 | kf = KFold(n_splits = 5, shuffle = False)
57 | model = self._models[ind]
58 | # model.save(filepath='{}/dumped.hdf5'.format(model_path))
59 | weights = model.get_weights()
60 |
61 |
62 | for train_index, test_index in kf.split(X):
63 | X_train, X_test = X[train_index], X[test_index]
64 | y_train, y_test = y[train_index], y[test_index]
65 |
66 |
67 | checkpoint = ModelCheckpoint(
68 | filepath='{}/models.hdf5'.format(model_path),
69 | monitor='val_f1', verbose=1,
70 | mode='max',
71 | save_best_only=True
72 | )
73 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
74 | callbacks_list = [checkpoint, early]
75 | model.fit(
76 | X_train, y_train,
77 | validation_data= (X_val, y_val),
78 | callbacks=callbacks_list,
79 | epochs=epochs,
80 | batch_size=batch_size
81 | )
82 |
83 |                 # Predict the held-out fold with the best checkpointed weights:
84 |                 model.load_weights(filepath='{}/models.hdf5'.format(model_path))
85 |                 pred[test_index] = model.predict(X_test).reshape(-1)
86 | 
87 |                 # Reset the model to its pre-fold weights before the next split:
88 |                 model.set_weights(weights)
89 |
90 |
91 | meta_input[:, ind] = pred
92 |
93 |
94 | self._meta_model.fit(meta_input, y)
95 |
96 |
97 | def predict(self, X):
98 | meta_input = self.compute_meta_data(X)
99 | return (self._meta_model.predict(meta_input) > 0.5).astype(np.int8)
100 |
101 |
102 | def compute_meta_data(self, X):
103 | prediction = np.zeros((len(X), len(self._models)))
104 | for ind in range(len(self._models)):
105 | pred = self._models[ind].predict(X).reshape(len(X), 1).reshape(-1)
106 | prediction[:, ind] = pred
107 |
108 | return prediction
109 |
110 | def load_weights(self, paths):
111 | for ind in range(len(self._models)):
112 | self._models[ind].load_weights(paths[ind])
113 |
114 |
115 | class StackedGeneralizerWithHier:
116 | def __init__(self, models, hier_models, meta_model):
117 | self._models = models
118 | self._hier_models = hier_models
119 |
120 | self._meta_model = meta_model
121 | return
122 |
123 | def train_models(self, X, y, X_val, y_val, X_hier, X_hier_val, model_path, epochs, batch_size,
124 | patience):
125 |
126 | for ind in range(len(self._models)):
127 | checkpoint = ModelCheckpoint(
128 | filepath='{}/models.hdf5'.format(model_path),
129 | monitor='val_f1', verbose=1,
130 | mode='max',
131 | save_best_only=True
132 | )
133 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
134 | callbacks_list = [checkpoint, early]
135 | self._models[ind].compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
136 | self._models[ind].fit(
137 | X, y,
138 | validation_data=(X_val, y_val),
139 | callbacks=callbacks_list,
140 | epochs=epochs,
141 | batch_size=batch_size
142 | )
143 | self._models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path))
144 |
145 | for ind in range(len(self._hier_models)):
146 | checkpoint = ModelCheckpoint(
147 | filepath='{}/models.hdf5'.format(model_path),
148 | monitor='val_f1', verbose=1,
149 | mode='max',
150 | save_best_only=True
151 | )
152 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
153 | callbacks_list = [checkpoint, early]
154 | self._hier_models[ind].compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
155 | self._hier_models[ind].fit(
156 | X_hier, y,
157 | validation_data=(X_hier_val, y_val),
158 | callbacks=callbacks_list,
159 | epochs=epochs,
160 | batch_size=batch_size
161 | )
162 | self._hier_models[ind].load_weights(filepath='{}/models.hdf5'.format(model_path))
163 |
164 | def train_meta_model(self, X, y, X_val, y_val, X_hier, X_hier_val, model_path, epochs,
165 | batch_size, patience):
166 |
167 | # Obtain level-1 input from each model:
168 | meta_input = np.zeros((len(X), len(self._models) + len(self._hier_models)))
169 |
170 | for ind in range(len(self._hier_models)):
171 | pred = np.zeros(len(X))
172 | kf = KFold(n_splits=5, shuffle=False)
173 | model = self._hier_models[ind]
174 | weights = model.get_weights()
175 |
176 |
177 | for train_index, test_index in kf.split(X_hier):
178 | X_train, X_test = X_hier[train_index], X_hier[test_index]
179 | y_train, y_test = y[train_index], y[test_index]
180 |
181 | checkpoint = ModelCheckpoint(
182 | filepath='{}/models.hdf5'.format(model_path),
183 | monitor='val_f1', verbose=1,
184 | mode='max',
185 | save_best_only=True
186 | )
187 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
188 | callbacks_list = [checkpoint, early]
189 | model.fit(
190 | X_train, y_train,
191 | validation_data=(X_hier_val, y_val),
192 | callbacks=callbacks_list,
193 | epochs=epochs,
194 | batch_size=batch_size
195 | )
196 |
197 | model.load_weights(filepath='{}/models.hdf5'.format(model_path))
198 | pred[test_index] = model.predict(X_test).reshape(-1)
199 |
200 | # Reset model:
201 |                 model.set_weights(weights)
202 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
203 |
204 | meta_input[:, len(self._models) + ind] = pred
205 |
206 |
207 | for ind in range(len(self._models)):
208 | pred = np.zeros(len(X))
209 | kf = KFold(n_splits=5, shuffle=False)
210 | model = self._models[ind]
211 | weights = model.get_weights()
212 |
213 | for train_index, test_index in kf.split(X):
214 | X_train, X_test = X[train_index], X[test_index]
215 | y_train, y_test = y[train_index], y[test_index]
216 |
217 | checkpoint = ModelCheckpoint(
218 | filepath='{}/models.hdf5'.format(model_path),
219 | monitor='val_f1', verbose=1,
220 | mode='max',
221 | save_best_only=True
222 | )
223 | early = EarlyStopping(monitor='val_f1', mode='max', patience=patience)
224 | callbacks_list = [checkpoint, early]
225 | model.fit(
226 | X_train, y_train,
227 | validation_data=(X_val, y_val),
228 | callbacks=callbacks_list,
229 | epochs=epochs,
230 | batch_size=batch_size
231 | )
232 |
233 | model.load_weights(filepath='{}/models.hdf5'.format(model_path))
234 | pred[test_index] = model.predict(X_test).reshape(-1)
235 |
236 | # Reset model:
237 | model.set_weights(weights)
238 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])
239 |
240 |
241 | meta_input[:, ind] = pred
242 |
243 |
244 | self._meta_model.fit(meta_input, y)
245 |
246 | def predict(self, X, X_hier):
247 | meta_input = self.compute_meta_data(X, X_hier)
248 | return (self._meta_model.predict(meta_input) > 0.5).astype(np.int8)
249 |
250 | def compute_meta_data(self, X, X_hier):
251 | prediction = np.zeros((len(X), len(self._models) + len(self._hier_models)))
252 | for ind in range(len(self._models)):
253 | pred = self._models[ind].predict(X).reshape(len(X), 1).reshape(-1)
254 | prediction[:, ind] = pred
255 |
256 | for ind in range(len(self._hier_models)):
257 | pred = self._hier_models[ind].predict(X_hier).reshape(len(X_hier), 1).reshape(-1)
258 | prediction[:, len(self._models) + ind] = pred
259 |
260 | return prediction
261 |
262 | def load_weights(self, paths, paths_hier):
263 | for ind in range(len(self._models)):
264 | self._models[ind].load_weights(paths[ind])
265 |
266 | for ind in range(len(self._hier_models)):
267 | self._hier_models[ind].load_weights(paths_hier[ind])
268 |
269 |
270 | def StackMLP(n_model):
271 | inp = Input(shape = (n_model,))
272 | op = Dense(10, activation = "relu")(inp)
273 | op = BatchNormalization()(op)
274 | op = Dense(1, activation = "sigmoid")(op)
275 |
276 | model = Model(inputs = inp, outputs = op)
277 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
278 | return model
279 |
280 |
281 |
282 |
--------------------------------------------------------------------------------
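A wiring sketch for the stacked generalizers above (a sketch under assumptions, not the authors' exact driver): `build_base_model` stands in for any builder from `scripts/rnn.py`, and `X_train`/`y_train`, `X_val`/`y_val`, `X_test` for the padded id matrices and labels prepared as in `main.py`; the `./models/stack` directory is assumed to exist for the checkpoints.

```python
# Hedged sketch: stack two base Keras models behind the StackMLP meta-learner.
from scripts.stack import StackedGeneralizer, StackMLP

base_models = [build_base_model(), build_base_model()]          # hypothetical factory
stack = StackedGeneralizer(base_models, StackMLP(n_model=len(base_models)))

# 1) Fit every base model on the full training split.
stack.train_models(X_train, y_train, X_val, y_val,
                   model_path='./models/stack', epochs=100, batch_size=16, patience=5)
# 2) Build out-of-fold level-1 features with 5-fold CV and fit the meta-model.
stack.train_meta_model(X_train, y_train, X_val, y_val,
                       model_path='./models/stack', epochs=100, batch_size=16, patience=5)
# 3) Hard 0/1 predictions through the meta-model.
labels = stack.predict(X_test)
```
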
/scripts/rnn.py:
--------------------------------------------------------------------------------
1 | from keras.models import Model
2 | from keras.layers import \
3 | Dense, Embedding, Input, \
4 | CuDNNGRU, GRU, LSTM, Bidirectional, CuDNNLSTM, \
5 | GlobalMaxPool1D, GlobalAveragePooling1D, Dropout, \
6 | Lambda, Concatenate, TimeDistributed
7 | from .util import f1
8 | from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
9 | from keras.activations import softmax
10 | from keras_layer_normalization import LayerNormalization
11 | from .net_components import AttLayer, AdditiveLayer
12 | from keras.utils.vis_utils import plot_model
13 |
14 |
15 |
16 |
17 | def RNNKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False):
18 | if use_fasttext:
19 | inp = Input(shape=(maxlen, embed_size))
20 | x = inp
21 | else:
22 | inp = Input(shape = (maxlen, ))
23 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
24 |
25 | if use_additive_emb:
26 | x = AdditiveLayer()(x)
27 | x = Dropout(0.5)(x)
28 |
29 | x = Bidirectional(CuDNNGRU(128, return_sequences = True))(x)
30 | x = Dropout(0.5)(x)
31 | x = Bidirectional(CuDNNGRU(128, return_sequences = True))(x)
32 | x = Dropout(0.5)(x)
33 |
34 | max_pool = GlobalMaxPool1D()(x)
35 | avg_pool = GlobalAveragePooling1D()(x)
36 | last = Lambda(lambda x: x[:, 0, :])(x)
37 | concat_pool = Concatenate(axis = -1)([last, max_pool, avg_pool])
38 |
39 | op = Dense(64, activation = "relu")(concat_pool)
40 | op = Dropout(0.5)(op)
41 | op = Dense(1, activation = "sigmoid")(op)
42 |
43 | model = Model(inputs = inp, outputs = op)
44 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
45 | return model
46 |
47 | def RNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False):
48 | if use_fasttext:
49 | inp = Input(shape=(maxlen, embed_size))
50 | x = inp
51 | else:
52 | inp = Input(shape = (maxlen, ))
53 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
54 |
55 | if use_additive_emb:
56 | x = AdditiveLayer()(x)
57 | x = Dropout(0.5)(x)
58 |
59 |
60 | x = Bidirectional(GRU(128, return_sequences = True, recurrent_dropout = 0.5, dropout = 0.5))(x)
61 | # x = Dropout(0.5)(x)
62 | x = Bidirectional(GRU(128, return_sequences = True, recurrent_dropout = 0.5, dropout = 0.5))(x)
63 | # x = Dropout(0.5)(x)
64 |
65 | max_pool = GlobalMaxPool1D()(x)
66 | avg_pool = GlobalAveragePooling1D()(x)
67 | last = Lambda(lambda x: x[:, 0, :])(x)
68 | concat_pool = Concatenate(axis = -1)([last, max_pool, avg_pool])
69 |
70 | op = Dense(64, activation = "relu")(concat_pool)
71 | op = Dropout(0.5)(op)
72 | op = Dense(1, activation = "sigmoid")(op)
73 |
74 | model = Model(inputs = inp, outputs = op)
75 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
76 | return model
77 |
78 | def LSTMKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100):
79 | inp = Input(shape = (maxlen, ))
80 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix])(inp)
81 | x = Bidirectional(CuDNNLSTM(50, return_sequences = True))(x)
82 | # x = Dropout(0.1)(x)
83 | x = Bidirectional(CuDNNLSTM(50, return_sequences = True))(x)
84 | x = Dropout(0.1)(x)
85 | x = GlobalMaxPool1D()(x)
86 | x = Dense(50, activation = "relu")(x)
87 | x = Dropout(0.1)(x)
88 | x = Dense(1, activation = "sigmoid")(x)
89 | model = Model(inputs = inp, outputs = x)
90 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
91 | return model
92 |
93 |
94 | def SARNNKerasCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, use_fasttext = False, trainable = True, use_additive_emb = False):
95 | if use_fasttext:
96 | inp = Input(shape=(maxlen, embed_size))
97 | x = inp
98 | else:
99 | inp = Input(shape = (maxlen, ))
100 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
101 |
102 | if use_additive_emb:
103 | x = AdditiveLayer()(x)
104 | x = Dropout(0.5)(x)
105 |
106 |
107 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
108 | x = SeqSelfAttention(
109 | # attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL,
110 | attention_regularizer_weight=1e-4,
111 | )(x)
112 | # x = LayerNormalization()(x)
113 | x = Dropout(0.5)(x)
114 |
115 | x = Bidirectional(LSTM(128, return_sequences = True))(x)
116 | x = SeqWeightedAttention()(x)
117 | # x = LayerNormalization()(x)
118 | x = Dropout(0.5)(x)
119 |
120 | x = Dense(64, activation = "relu")(x)
121 | x = Dropout(0.5)(x)
122 | x = Dense(1, activation = "sigmoid")(x)
123 | model = Model(inputs = inp, outputs = x)
124 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
125 | return model
126 |
127 | def SARNNKeras(embeddingMatrix = None, embed_size = 400, max_features = 20000, maxlen = 100, rnn_type = CuDNNLSTM, use_fasttext = False, trainable = True, use_additive_emb = False):
128 | if use_fasttext:
129 | inp = Input(shape=(maxlen, embed_size))
130 | x = inp
131 | else:
132 | inp = Input(shape = (maxlen, ))
133 | x = Embedding(input_dim = max_features, output_dim = embed_size, weights = [embeddingMatrix], trainable = trainable)(inp)
134 |
135 | if use_additive_emb:
136 | x = AdditiveLayer()(x)
137 | x = Dropout(0.5)(x)
138 |
139 |
140 | x = Bidirectional(rnn_type(128, return_sequences = True))(x)
141 | x = SeqSelfAttention(
142 | # attention_type = SeqSelfAttention.ATTENTION_TYPE_MUL,
143 | attention_regularizer_weight=1e-4,
144 | )(x)
145 | # x = LayerNormalization()(x)
146 | x = Dropout(0.5)(x)
147 |
148 | x = Bidirectional(rnn_type(128, return_sequences = True))(x)
149 | x = SeqWeightedAttention()(x)
150 | # x = LayerNormalization()(x)
151 | x = Dropout(0.5)(x)
152 |
153 | x = Dense(64, activation = "relu")(x)
154 | x = Dropout(0.5)(x)
155 | x = Dense(1, activation = "sigmoid")(x)
156 | model = Model(inputs = inp, outputs = x)
157 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
158 | return model
159 |
160 |
161 | def HRNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, trainable = True, use_additive_emb = False):
162 |     sent_inp = Input(shape = (max_sent_len, ))
163 | embed = Embedding(
164 | input_dim = max_features,
165 | output_dim = embed_size,
166 | weights = [embeddingMatrix],
167 | trainable = trainable
168 | )(sent_inp)
169 |
170 | if use_additive_emb:
171 | embed = AdditiveLayer()(embed)
172 | embed = Dropout(0.5)(embed)
173 |
174 | word_lstm = Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))(embed)
175 | sent_encoder = Model(sent_inp, word_lstm)
176 |
177 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
178 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
179 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))(doc_encoder)
180 | preds = Dense(1, activation = "sigmoid")(sent_lstm)
181 | model = Model(inputs = doc_input, outputs = preds)
182 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
183 | return model
184 |
185 | def HRNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, trainable = True, use_additive_emb = False):
186 |     sent_inp = Input(shape = (max_sent_len, ))
187 | embed = Embedding(
188 | input_dim = max_features,
189 | output_dim = embed_size,
190 | weights = [embeddingMatrix],
191 | trainable = trainable
192 | )(sent_inp)
193 |
194 | if use_additive_emb:
195 | embed = AdditiveLayer()(embed)
196 | embed = Dropout(0.5)(embed)
197 |
198 | word_lstm = Bidirectional(CuDNNLSTM(128))(embed)
199 | sent_encoder = Model(sent_inp, word_lstm)
200 |
201 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
202 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
203 | sent_lstm = Bidirectional(CuDNNLSTM(128))(doc_encoder)
204 | preds = Dense(1, activation = "sigmoid")(sent_lstm)
205 | model = Model(inputs = doc_input, outputs = preds)
206 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
207 | return model
208 |
209 |
210 | def OriginalHARNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
211 | if use_fasttext:
212 | sent_inp = Input(shape = (max_sent_len, embed_size))
213 | embed = sent_inp
214 | else:
215 | sent_inp = Input(shape = (max_sent_len, ))
216 | embed = Embedding(
217 | input_dim = max_features,
218 | output_dim = embed_size,
219 | weights = [embeddingMatrix],
220 | trainable = trainable
221 | )(sent_inp)
222 |
223 | if use_additive_emb:
224 | embed = AdditiveLayer()(embed)
225 | embed = Dropout(0.5)(embed)
226 |
227 | word_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(embed)
228 | word_att = AttLayer(context_size = 256)(word_lstm)
229 | sent_encoder = Model(sent_inp, word_att)
230 |
231 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
232 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
233 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(doc_encoder)
234 | sent_att = AttLayer(context_size = 256)(sent_lstm)
235 | preds = Dense(1, activation = "sigmoid")(sent_att)
236 | model = Model(inputs = doc_input, outputs = preds)
237 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
238 | return model
239 |
240 | def OriginalHARNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
241 | if use_fasttext:
242 | sent_inp = Input(shape = (max_sent_len, embed_size))
243 | embed = sent_inp
244 | else:
245 | sent_inp = Input(shape = (max_sent_len, ))
246 | embed = Embedding(
247 | input_dim = max_features,
248 | output_dim = embed_size,
249 | weights = [embeddingMatrix],
250 | trainable = trainable
251 | )(sent_inp)
252 |
253 | if use_additive_emb:
254 | embed = AdditiveLayer()(embed)
255 | embed = Dropout(0.5)(embed)
256 |
257 | word_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embed)
258 | word_att = AttLayer(context_size = 256)(word_lstm)
259 | word_att = Dropout(0.5)(word_att)
260 | sent_encoder = Model(sent_inp, word_att)
261 |
262 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
263 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
264 | sent_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(doc_encoder)
265 | sent_att = AttLayer(context_size = 256)(sent_lstm)
266 | sent_att = Dropout(0.5)(sent_att)
267 | preds = Dense(1, activation = "sigmoid")(sent_att)
268 | model = Model(inputs = doc_input, outputs = preds)
269 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
270 | return model
271 |
272 |
273 |
274 |
275 | def HARNNCPU(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
276 | if use_fasttext:
277 | sent_inp = Input(shape = (max_sent_len, embed_size))
278 | embed = sent_inp
279 | else:
280 | sent_inp = Input(shape = (max_sent_len, ))
281 | embed = Embedding(
282 | input_dim = max_features,
283 | output_dim = embed_size,
284 | weights = [embeddingMatrix],
285 | trainable = trainable
286 | )(sent_inp)
287 |
288 | if use_additive_emb:
289 | embed = AdditiveLayer()(embed)
290 | embed = Dropout(0.5)(embed)
291 |
292 |
293 | word_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(embed)
294 | word_att = SeqWeightedAttention()(word_lstm)
295 | sent_encoder = Model(sent_inp, word_att)
296 |
297 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
298 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
299 | sent_lstm = Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.5, return_sequences = True))(doc_encoder)
300 | sent_att = SeqWeightedAttention()(sent_lstm)
301 | preds = Dense(1, activation = "sigmoid")(sent_att)
302 | model = Model(inputs = doc_input, outputs = preds)
303 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
304 | return model
305 |
306 |
307 |
308 | def HARNN(embeddingMatrix = None, embed_size = 400, max_features = 20000, max_nb_sent = 3, max_sent_len = 40, use_fasttext = False, trainable = True, use_additive_emb = False):
309 | if use_fasttext:
310 | sent_inp = Input(shape = (max_sent_len, embed_size))
311 | embed = sent_inp
312 | else:
313 | sent_inp = Input(shape = (max_sent_len, ))
314 | embed = Embedding(
315 | input_dim = max_features,
316 | output_dim = embed_size,
317 | weights = [embeddingMatrix],
318 | trainable = trainable
319 | )(sent_inp)
320 |
321 | if use_additive_emb:
322 | embed = AdditiveLayer()(embed)
323 | embed = Dropout(0.5)(embed)
324 |
325 | word_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(embed)
326 | word_att = SeqWeightedAttention()(word_lstm)
327 | word_att = Dropout(0.5)(word_att)
328 | sent_encoder = Model(sent_inp, word_att)
329 | plot_model(sent_encoder, to_file='{}.png'.format("HARNN1"), show_shapes=True, show_layer_names=True)
330 |
331 |
332 | doc_input = Input(shape = (max_nb_sent, max_sent_len))
333 | doc_encoder = TimeDistributed(sent_encoder)(doc_input)
334 | sent_lstm = Bidirectional(CuDNNLSTM(128, return_sequences = True))(doc_encoder)
335 | sent_att = SeqWeightedAttention()(sent_lstm)
336 | sent_att = Dropout(0.5)(sent_att)
337 | preds = Dense(1, activation = "sigmoid")(sent_att)
338 | model = Model(inputs = doc_input, outputs = preds)
339 | model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1])
340 | return model
341 |
342 |
343 |
344 |
--------------------------------------------------------------------------------
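Each builder in `scripts/rnn.py` returns a compiled Keras model; the only coupling to the rest of the pipeline is the shape of the embedding matrix. A minimal sketch (a random matrix stands in for the real fastText vectors) that instantiates the CPU self-attention variant and prints its layout:

```python
# Hedged sketch: build SARNNKerasCPU with a placeholder embedding matrix.
import numpy as np
from scripts.rnn import SARNNKerasCPU

embed_size, max_features, maxlen = 400, 20000, 100
embedding_mat = np.random.normal(size=(max_features, embed_size)).astype('float32')

model = SARNNKerasCPU(
    embeddingMatrix=embedding_mat,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
)
model.summary()  # two BiLSTM blocks with self-/weighted attention and a sigmoid head
```
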