├── .gitignore
├── bin
│   ├── config.py
│   ├── nb_features.py
│   ├── imports.py
│   ├── utils.py
│   ├── text_cleaner.py
│   ├── models.py
│   └── contractions.py
├── README.md
├── requirements.txt
└── modelling.ipynb

/.gitignore:
--------------------------------------------------------------------------------
# Jupyter Notebook
.ipynb_checkpoints

# dotenv
.env

# virtualenv
.venv
venv/
ENV/
--------------------------------------------------------------------------------
/bin/config.py:
--------------------------------------------------------------------------------
config = dict(
    HOME='/home/ser/DL/toxic/',
    data=dict(
        train='train.csv.zip',
        test='test.csv.zip',
        sample='sample_submission.csv.zip'
    )
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# jigsaw-toxic-comment-classification-challenge

Result: 19/4551

Kaggle kernels:

* [Words polarity based on LR weights](https://www.kaggle.com/sermakarevich/words-polarity-based-on-lr-weights)
* [Hierarchical Attention Network](https://www.kaggle.com/sermakarevich/hierarchical-attention-network)
* [Sklearn pipelines tutorial](https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial)
--------------------------------------------------------------------------------
/bin/nb_features.py:
--------------------------------------------------------------------------------
from imports import *


class NBFeaturer(BaseEstimator, ClassifierMixin):
    """Rescales a sparse count/tf-idf matrix by naive Bayes log-count ratios."""

    def __init__(self, alpha):
        self.alpha = alpha  # additive smoothing

    def preprocess_x(self, x, r):
        return x.multiply(r)

    def pr(self, x, y_i, y):
        # smoothed sum of feature values over the documents of class y_i
        p = x[y == y_i].sum(0)
        return (p + self.alpha) / ((y == y_i).sum() + self.alpha)

    def fit(self, x, y=None):
        # log ratio of per-feature probabilities between the positive and negative class
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) / self.pr(x, 0, y)))
        return self

    def transform(self, x):
        x_nb = self.preprocess_x(x, self._r)
        return x_nb
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.1.11 2 | appnope==0.1.0 3 | astor==0.6.2 4 | bleach==1.5.0 5 | cycler==0.10.0 6 | decorator==4.2.1 7 | entrypoints==0.2.3 8 | gast==0.2.0 9 | grpcio==1.10.0 10 | html5lib==0.9999999 11 | ipykernel==4.8.2 12 | ipython==6.2.1 13 | ipython-genutils==0.2.0 14 | ipywidgets==7.1.2 15 | jedi==0.11.1 16 | Jinja2==2.10 17 | joblib==0.11 18 | jsonschema==2.6.0 19 | jupyter==1.0.0 20 | jupyter-client==5.2.3 21 | jupyter-console==5.2.0 22 | jupyter-core==4.4.0 23 | Keras==2.1.5 24 | kiwisolver==1.0.1 25 | Markdown==2.6.11 26 | MarkupSafe==1.0 27 | matplotlib==2.2.2 28 | mistune==0.8.3 29 | nbconvert==5.3.1 30 | nbformat==4.4.0 31 | nltk==3.2.5 32 | notebook==5.7.8 33 | numpy==1.14.2 34 | pandas==0.22.0 35 | pandocfilters==1.4.2 36 | parso==0.1.1 37 | pexpect==4.4.0 38 | pickleshare==0.7.4 39 | prompt-toolkit==1.0.15 40 | protobuf==3.5.2.post1 41 | ptyprocess==0.5.2 42 | Pygments==2.2.0 43 | pyparsing==2.2.0 44 | python-dateutil==2.7.0 45 | pytz==2018.3 46 | PyYAML==5.1 47 | pyzmq==17.0.0 48 | qtconsole==4.3.1 49 | scikit-learn==0.19.1 50 | scipy==1.0.0 51 | Send2Trash==1.5.0 52 | simplegeneric==0.8.1 53 | six==1.11.0 54 | sklearn==0.0 55 | tensorboard==1.6.0 56 | tensorflow==1.12.2 57 | termcolor==1.1.0 58 | terminado==0.8.1 59 |
testpath==0.3.1 60 | tornado==5.0.1 61 | traitlets==4.3.2 62 | wcwidth==0.1.7 63 | webencodings==0.5.1 64 | Werkzeug==0.15.3 65 | widgetsnbextension==3.1.4 66 | -------------------------------------------------------------------------------- /bin/imports.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import string 4 | from string import digits 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from scipy import sparse 9 | 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import roc_auc_score 12 | 13 | from keras.models import Model, Sequential 14 | from keras.layers import (Input, Dense, Embedding, SpatialDropout1D, concatenate, RepeatVector, Flatten, Conv1D, 15 | GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU, CuDNNLSTM, MaxPooling1D, Layer, 16 | Dropout, K, Activation, BatchNormalization, PReLU, add, Reshape) 17 | from keras.preprocessing import text, sequence 18 | from keras import optimizers 19 | from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint 20 | 21 | 22 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 23 | from sklearn.base import ClassifierMixin, BaseEstimator 24 | from sklearn.linear_model import LogisticRegression 25 | from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline 26 | from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold 27 | from sklearn.metrics import roc_auc_score 28 | 29 | import nltk 30 | from nltk.stem import WordNetLemmatizer 31 | from nltk.corpus import wordnet, stopwords 32 | 33 | import pandas as pd 34 | from joblib import Parallel, delayed 35 | import multiprocessing 36 | from multiprocessing import Pool -------------------------------------------------------------------------------- /bin/utils.py: -------------------------------------------------------------------------------- 1 | from imports import * 2 | 3 | 4 | re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])') 5 | def tokenize(s): 6 | return re_tok.sub(r' \1 ', s).split() 7 | 8 | 9 | def get_coefs(word, *arr): 10 | return word.lower(), np.asarray(arr, dtype='float32') 11 | 12 | def get_emb_dict(word, *arr): 13 | return word.lower(), 1 14 | 15 | 16 | def substitute(word, neg, pos): 17 | for n in neg: 18 | if n.lower() in word.lower(): 19 | return n 20 | for p in pos: 21 | if p.lower() in word.lower(): 22 | return p 23 | return None 24 | 25 | 26 | def parallelize_dataframe(df, func): 27 | df_split = np.array_split(df, multiprocessing.cpu_count()) 28 | pool = Pool(multiprocessing.cpu_count()) 29 | df = pd.concat(pool.map(func, df_split)) 30 | pool.close() 31 | pool.join() 32 | return df 33 | 34 | 35 | def applyParallel(df, func): 36 | retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(val) for val in df) 37 | return retLst 38 | 39 | 40 | class RocAucEvaluation(Callback): 41 | def __init__(self, validation_data=(), interval=1): 42 | super(Callback, self).__init__() 43 | 44 | self.interval = interval 45 | self.X_val, self.y_val = validation_data 46 | 47 | def on_epoch_end(self, epoch, logs={}): 48 | if epoch % self.interval == 0: 49 | y_pred = self.model.predict(self.X_val, verbose=0) 50 | score = roc_auc_score(self.y_val, y_pred) 51 | print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score)) 52 | -------------------------------------------------------------------------------- /bin/text_cleaner.py: 
-------------------------------------------------------------------------------- 1 | from imports import * 2 | 3 | 4 | class TextCleaner(BaseEstimator): 5 | def __init__(self, contractions): 6 | self.wl = WordNetLemmatizer().lemmatize 7 | self.wn = wordnet.morphy 8 | self.wt = nltk.word_tokenize 9 | self.c_s = contractions 10 | self.ss = "'\":-.,=`*/|—~\\•" 11 | self.tp = re.compile('\w{1,}') 12 | self.tp2 = re.compile('([!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~“”¨«»®´·º½¾¿¡§£₤‘’])') 13 | 14 | def remove_digits(self, x): 15 | rd = x.maketrans(' ', ' ', digits) 16 | x = x.translate(rd) 17 | return x 18 | 19 | def lemmatizer(self, x): 20 | return [self.wl(self.wl('%s'%i, pos='v'), pos='a') for i in x] 21 | 22 | 23 | def morphy(self, x): 24 | m = self.wn(x) 25 | if m is None: 26 | return x 27 | else: 28 | return m 29 | 30 | def tokenize(self, s): 31 | return self.tp.findall(s) 32 | 33 | def tokenize2(self, s): 34 | return self.tp2.sub(r' \1 ', s).split() 35 | 36 | def morphy_list(self, x): 37 | return [self.morphy(i) for i in x] 38 | 39 | def contr(self, x): 40 | for k, v in self.c_s.items(): 41 | x = x.replace(k, v) 42 | return x 43 | 44 | def special_symbols(self, x): 45 | for ss in self.ss: 46 | if len(x) > 1: 47 | x = x.replace(ss, '') 48 | return x 49 | 50 | def remove_stopwords(self, x): 51 | return [i for i in x if i not in stopwords.words('english')] 52 | 53 | def fit(self, x, y=None): 54 | return self 55 | 56 | def transform(self, x): 57 | x = map(lambda r: r.replace('_', ' '), x) 58 | x = map(lambda r: r.replace('`', '\''), x) 59 | x = map(lambda r: self.remove_digits(r), x) 60 | x = map(self.contr, x) 61 | # x = map(lambda r: r.lower(), x) 62 | # x = map(self.contr, x) 63 | x = map(self.special_symbols, x) 64 | # x = map(self.wt, x) 65 | x = map(self.tokenize2, x) 66 | # x = map(self.remove_stopwords, x) 67 | # x = map(self.lemmatizer, x) 68 | # x = list(map(self.morphy_list, x)) 69 | # x = map(lambda i: ' '.join(i), x) 70 | x = list(x) 71 | return x -------------------------------------------------------------------------------- /bin/models.py: -------------------------------------------------------------------------------- 1 | from imports import * 2 | 3 | 4 | ############################################## 5 | #### GRU/LSTM ############################## 6 | ############################################## 7 | 8 | def GRU_LSTM_model(CuDNN, maxlen, max_features, embed_size, embedding_matrix): 9 | inp = Input(shape=(maxlen, )) 10 | x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = False)(inp) 11 | x = SpatialDropout1D(0.2)(x) 12 | x = Bidirectional(CuDNN(128, return_sequences=True))(x) 13 | x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x) 14 | 15 | avg_pool = GlobalAveragePooling1D()(x) 16 | max_pool = GlobalMaxPooling1D()(x) 17 | 18 | conc = concatenate([avg_pool, max_pool]) 19 | outp = Dense(6, activation="sigmoid")(conc) 20 | 21 | model = Model(inputs=inp, outputs=outp) 22 | model.compile(loss='binary_crossentropy', 23 | optimizer='adam', 24 | metrics=['accuracy']) 25 | 26 | return model 27 | 28 | 29 | ############################################## 30 | #### CAPSULE ############################### 31 | ############################################## 32 | 33 | def squash(x, axis=-1): 34 | # s_squared_norm is really small 35 | # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon() 36 | # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm) 37 | # return scale * x 38 | s_squared_norm = K.sum(K.square(x), 
axis, keepdims=True) 39 | scale = K.sqrt(s_squared_norm + K.epsilon()) 40 | return x / scale 41 | 42 | 43 | # A Capsule Implement with Pure Keras 44 | class Capsule(Layer): 45 | def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, 46 | activation='default', **kwargs): 47 | super(Capsule, self).__init__(**kwargs) 48 | self.num_capsule = num_capsule 49 | self.dim_capsule = dim_capsule 50 | self.routings = routings 51 | self.kernel_size = kernel_size 52 | self.share_weights = share_weights 53 | if activation == 'default': 54 | self.activation = squash 55 | else: 56 | self.activation = Activation(activation) 57 | 58 | def build(self, input_shape): 59 | super(Capsule, self).build(input_shape) 60 | input_dim_capsule = input_shape[-1] 61 | if self.share_weights: 62 | self.W = self.add_weight(name='capsule_kernel', 63 | shape=(1, input_dim_capsule, 64 | self.num_capsule * self.dim_capsule), 65 | # shape=self.kernel_size, 66 | initializer='glorot_uniform', 67 | trainable=True) 68 | else: 69 | input_num_capsule = input_shape[-2] 70 | self.W = self.add_weight(name='capsule_kernel', 71 | shape=(input_num_capsule, 72 | input_dim_capsule, 73 | self.num_capsule * self.dim_capsule), 74 | initializer='glorot_uniform', 75 | trainable=True) 76 | 77 | def call(self, u_vecs): 78 | if self.share_weights: 79 | u_hat_vecs = K.conv1d(u_vecs, self.W) 80 | else: 81 | u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1]) 82 | 83 | batch_size = K.shape(u_vecs)[0] 84 | input_num_capsule = K.shape(u_vecs)[1] 85 | u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule, 86 | self.num_capsule, self.dim_capsule)) 87 | u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3)) 88 | # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule] 89 | 90 | b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule] 91 | for i in range(self.routings): 92 | b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule] 93 | c = K.softmax(b) 94 | c = K.permute_dimensions(c, (0, 2, 1)) 95 | b = K.permute_dimensions(b, (0, 2, 1)) 96 | outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2])) 97 | if i < self.routings - 1: 98 | b = K.batch_dot(outputs, u_hat_vecs, [2, 3]) 99 | 100 | return outputs 101 | 102 | def compute_output_shape(self, input_shape): 103 | return (None, self.num_capsule, self.dim_capsule) 104 | 105 | 106 | def CAPSULE_model(maxlen, max_features, embed_size, embedding_matrix, rate_drop_dense, 107 | Num_capsule, Dim_capsule, Routings, gru_len): 108 | input1 = Input(shape=(maxlen,)) 109 | embed_layer = Embedding(max_features, 110 | embed_size, 111 | input_length=maxlen, 112 | weights=[embedding_matrix], 113 | trainable=False)(input1) 114 | embed_layer = SpatialDropout1D(rate_drop_dense)(embed_layer) 115 | 116 | # x = Bidirectional( 117 | # GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))( 118 | # embed_layer) 119 | 120 | x = Bidirectional( 121 | CuDNNGRU(gru_len,return_sequences=True))(embed_layer) 122 | x = Dropout(rate_drop_dense)(x) 123 | capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, 124 | share_weights=True)(x) 125 | # output_capsule = Lambda(lambda x: K.sqrt(K.sum(K.square(x), 2)))(capsule) 126 | capsule = Flatten()(capsule) 127 | capsule = Dropout(rate_drop_dense)(capsule) 128 | output = Dense(6, activation='sigmoid')(capsule) 129 | model = Model(inputs=input1, outputs=output) 130 | 
model.compile( 131 | loss='binary_crossentropy', 132 | optimizer='adam', 133 | metrics=['accuracy']) 134 | return model 135 | 136 | 137 | ############################################## 138 | #### DCNN ############################### 139 | ############################################## 140 | 141 | def DPCNN_model(maxlen, max_features, embed_size, embedding_matrix, spatial_dropout, 142 | filter_nr, filter_size, max_pool_size, max_pool_strides, dense_nr, dense_dropout): 143 | comment = Input(shape=(maxlen,)) 144 | emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(comment) 145 | emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment) 146 | 147 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(emb_comment) 148 | block1 = BatchNormalization()(block1) 149 | block1 = PReLU()(block1) 150 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1) 151 | block1 = BatchNormalization()(block1) 152 | block1 = PReLU()(block1) 153 | 154 | #we pass embedded comment through conv1d with filter size 1 because it needs 155 | # to have the same shape as block output 156 | #if you choose filter_nr = embed_size (300 in this case) you don't have 157 | # to do this part and can add emb_comment directly to block1_output 158 | resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(emb_comment) 159 | resize_emb = PReLU()(resize_emb) 160 | 161 | block1_output = add([block1, resize_emb]) 162 | block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output) 163 | 164 | block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1_output) 165 | block2 = BatchNormalization()(block2) 166 | block2 = PReLU()(block2) 167 | block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block2) 168 | block2 = BatchNormalization()(block2) 169 | block2 = PReLU()(block2) 170 | 171 | block2_output = add([block2, block1_output]) 172 | block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output) 173 | 174 | block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block2_output) 175 | block3 = BatchNormalization()(block3) 176 | block3 = PReLU()(block3) 177 | block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block3) 178 | block3 = BatchNormalization()(block3) 179 | block3 = PReLU()(block3) 180 | 181 | block3_output = add([block3, block2_output]) 182 | block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output) 183 | 184 | block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block3_output) 185 | block4 = BatchNormalization()(block4) 186 | block4 = PReLU()(block4) 187 | block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block4) 188 | block4 = BatchNormalization()(block4) 189 | block4 = PReLU()(block4) 190 | 191 | output = add([block4, block3_output]) 192 | output = GlobalMaxPooling1D()(output) 193 | output = Dense(dense_nr, activation='linear')(output) 194 | output = BatchNormalization()(output) 195 | output = PReLU()(output) 196 | output = Dropout(dense_dropout)(output) 197 | output = Dense(6, activation='sigmoid')(output) 198 | 199 | model = Model(comment, output) 200 | 201 | model.compile(loss='binary_crossentropy', 202 | optimizer=optimizers.Adam(), 203 | metrics=['accuracy']) 204 
| return model


##############################################
####  CV PREDICTOR  ##########################
##############################################


class CV_predictor():
    '''
    Runs K-fold cross-validation for a Keras model factory: collects out-of-fold
    predictions on the train set and fold-averaged predictions on the test set
    from a tuned pipeline.
    '''
    def __init__(self, get_model, x_train, y_train, x_test,
                 n_splits, batch_size, epochs, col_names,
                 model_kwargs):
        self.get_model = get_model
        self.cv = KFold(n_splits=n_splits, shuffle=True, random_state=1)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.scorer = roc_auc_score
        self.train_predictions = []
        self.test_predictions = []
        self.score = []
        self.epochs = epochs
        self.batch_size = batch_size
        self.col_names = col_names
        self.model_kwargs = model_kwargs

    def predict(self):
        fold_number = 1
        for train_i, valid_i in self.cv.split(self.x_train, self.y_train):
            # a fresh model per fold, trained one epoch at a time so the score can be tracked
            model = self.get_model(**self.model_kwargs)
            x_train = self.x_train[train_i]
            y_train = self.y_train[train_i]
            x_valid = self.x_train[valid_i]
            y_valid = self.y_train[valid_i]
            for i in self.epochs:
                model.fit(x_train, y_train, epochs=1, batch_size=self.batch_size)
                train_prediction = model.predict(x_valid, self.batch_size * 2)
                print(f'fold: {fold_number}, epoch: {i}, score: {self.scorer(y_valid, train_prediction)}')
            test_prediction = model.predict(self.x_test, self.batch_size * 2)
            self.train_predictions.append([train_prediction, valid_i])
            self.test_predictions.append(test_prediction)
            self.score.append(self.scorer(y_valid, train_prediction))
            print(f"fold: {fold_number}, final score: {self.score[-1]}")
            fold_number += 1
        print(f'mean CV score: {np.mean(self.score)}')
        # out-of-fold train predictions, reassembled in the original row order
        self.train_predictions = (
            pd.concat([pd.DataFrame(data=i[0], index=i[1], columns=[self.col_names])
                       for i in self.train_predictions]).sort_index())
        # test predictions averaged over folds
        self.test_predictions = pd.DataFrame(data=np.mean(self.test_predictions, axis=0), columns=[self.col_names])
--------------------------------------------------------------------------------
/bin/contractions.py:
--------------------------------------------------------------------------------
1 | contractions = { 2 | "ain't": "am not", 3 | "aren't": "are not", 4 | "can't": "cannot", 5 | "can't've": "cannot have", 6 | "'cause": "because", 7 | "could've": "could have", 8 | "couldn't": "could not", 9 | "couldn't've": "could not have", 10 | "didn't": "did not", 11 | "doesn't": "does not", 12 | "don't": "do not", 13 | "hadn't": "had not", 14 | "hadn't've": "had not have", 15 | "hasn't": "has not", 16 | "haven't": "have not", 17 | "he'd": "he had", 18 | "he'd've": "he would have", 19 | "he'll": "he shall", 20 | "he'll've": "he shall have", 21 | "he's": "he has", 22 | "how'd": "how did", 23 | "how'd'y": "how do you", 24 | "how'll": "how will", 25 | "how's": "how has", 26 | "I'd": "I had", 27 | "I'd've": "I would have", 28 | "I'll": "I will", 29 | "I'll've": "I will have", 30 | "I'm": "I am", 31 | "I've": "I have", 32 | "isn't": "is not", 33 | "it'd": "it would", 34 | "it'd've": "it would have", 35 | "it'll": "it will", 36 | "it'll've": "it will have", 37 | "it's": "it is", 38 | "let's": "let us", 39 | "ma'am": "madam", 40 | "mayn't": "may not", 41 | "might've": "might have", 42 | "mightn't": "might
not", 43 | "mightn't've": "might not have", 44 | "must've": "must have", 45 | "mustn't": "must not", 46 | "mustn't've": "must not have", 47 | "needn't": "need not", 48 | "needn't've": "need not have", 49 | "o'clock": "of the clock", 50 | "oughtn't": "ought not", 51 | "oughtn't've": "ought not have", 52 | "shan't": "shall not", 53 | "sha'n't": "shall not", 54 | "shan't've": "shall not have", 55 | "she'd": "she would", 56 | "she'd've": "she would have", 57 | "she'll": "she will", 58 | "she'll've": "she will have", 59 | "she's": "she is", 60 | "should've": "should have", 61 | "shouldn't": "should not", 62 | "shouldn't've": "should not have", 63 | "so've": "so have", 64 | "so's": "so is", 65 | "that'd": "that had", 66 | "that'd've": "that would have", 67 | "that's": "that is", 68 | "there'd": "there would", 69 | "there'd've": "there would have", 70 | "there's": "there is", 71 | "they'd": "they would", 72 | "they'd've": "they would have", 73 | "they'll": "they will", 74 | "they'll've": "they will have", 75 | "they're": "they are", 76 | "they've": "they have", 77 | "to've": "to have", 78 | "wasn't": "was not", 79 | "we'd": "we would", 80 | "we'd've": "we would have", 81 | "we'll": "we will", 82 | "we'll've": "we will have", 83 | "we're": "we are", 84 | "we've": "we have", 85 | "weren't": "were not", 86 | "what'll": "what will", 87 | "what'll've": "what will have", 88 | "what're": "what are", 89 | "what's": "what is", 90 | "what've": "what have", 91 | "when's": "when is", 92 | "when've": "when have", 93 | "where'd": "where did", 94 | "where's": "where is", 95 | "where've": "where have", 96 | "who'll": "who will", 97 | "who'll've": "who will have", 98 | "who's": "who is", 99 | "who've": "who have", 100 | "why's": "why is", 101 | "why've": "why have", 102 | "will've": "will have", 103 | "won't": "will not", 104 | "won't've": "will not have", 105 | "would've": "would have", 106 | "wouldn't": "would not", 107 | "wouldn't've": "would not have", 108 | "y'all": "you all", 109 | "y'all'd": "you all would", 110 | "y'all'd've": "you all would have", 111 | "y'all're": "you all are", 112 | "y'all've": "you all have", 113 | "you'd": "you would", 114 | "you'd've": "you would have", 115 | "you'll": " you will", 116 | "you'll've": "you will have", 117 | "you're": "you are", 118 | "you've": "you have", 119 | "f*ck": "fuck", 120 | "f**k": "fuck", 121 | 'f*cking': 'fucking', 122 | 'f**king': 'fucking', 123 | "fucking": " fucking ", 124 | "fcuk": "fuck", 125 | "fucksex": "fuck sex", 126 | "MOTHJER": "mother", 127 | "OFFFUCK": "OFF FUCK", 128 | "MarcolFuck": "Marcol Fuck", 129 | "SECURITYFUCK": "security FUCK", 130 | "CUNTBAG": "CUNT BAG", 131 | "ancestryFuck": "ancestry Fuck", 132 | "shitFuck": "shit Fuck", 133 | "CENTRALISTSTUPID": "centralists STUPID", 134 | "bitchMattythewhite": "bitch Matty the white", 135 | "HAAHHAHAHAH": "hahahaha", 136 | "SHITHOLE": "shithole", 137 | "cuntLiz": "cunt Liz", 138 | "PenIS": "penis", 139 | "pennnis": "penis", 140 | "pneis": "penis", 141 | "pensnsnnienSNsn": "penis", 142 | "peNis": "penis", 143 | "itsuck": "it suck", 144 | "gayfrozen": "gay frozen", 145 | "GAYFAG": "GAY FAG", 146 | "CUNTFRANKS": "CUNT FRANKS", 147 | "ahahahahahahahahahahahahahahahahahahaha": "hahahaha", 148 | "FoReVeR": "forever", 149 | "ReSPeCT": "respect", 150 | 'PaTHeTiC': "pathetic", 151 | "FUCKINGABF" : "FUCKING", 152 | "misterwiki" : "mister wiki", 153 | "MUAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHA" : "hahahaha", 154 | "DICKED": 'dicked', 155 | "FUCKBAGS": "FUCK bags", 156 | "DreamGuy": "Dream Guy", 157 | 
"accessdate": "access date", 158 | "wikistalking": "wiki talking", 159 | "ashol": "asshole", 160 | 'HAHAHAHAHAHAHAHAHAHAHAHAHA': "hahahaha", 161 | 'BUTTHEAD': "butthead", 162 | 'bitchMother': 'bitch Mother', 163 | "FUCKK" : "FUCK", 164 | "DICKFACE": "dickface", 165 | "sUcks": "sucks", 166 | "suCks": "sucks", 167 | "sucKs": "sucks", 168 | "suckS": "sucks", 169 | 'ASSWHOLE': 'ASSHOLE', 170 | 'faggotMONGO': 'faggot MONGO', 171 | 'AULAHEHELALELALALA': "hahahaha", 172 | 'fuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuck': "FUCK", 173 | 'suckersyou': 'sucker you', 174 | 'shiot': "shit", 175 | "YOUFUCK": 'YOU FUCK', 176 | 'youfuck': "you fuck", 177 | 'néger':'niger', 178 | 'youFuck': 'you Fuck', 179 | 'NIGGORS': "nigers", 180 | 'DONKEYSEX': "donkey sex", 181 | 'BeCauSe': "because", 182 | "CUCKS": "sucks", 183 | "NIGGERJEW": "nigger jew", 184 | "ANALANAL": "anal anal", 185 | 'SEXBUTT': "sex butt", 186 | 'bitchesfuck': "bitches fuck", 187 | 'ARSEHOLE': "asshole", 188 | 'MOTHERFUCKERDIE': "motherfucker die", 189 | 'BUTTSLUTS': "butt sluts", 190 | 'penisSmall': "penis small", 191 | 'FuckoffJewish': 'Fuck off Jewish', 192 | 'TURKEYFUCK': "turkey FUCK", 193 | 'faggotgay': "faggot gay", 194 | 'headsdick': 'heads dick' 195 | } 196 | 197 | negative_100 = ['fuck', 'shit', 'idiot', 'stupid', 'suck', 'bitch', 'dick', 'crap', 198 | 'faggot', 'hell', 'penis', 'moron', 'pathetic', 'cunt', 'bastard', 199 | 'gay', 'shut', 'dumb', 'jerk', 'nigger', 'ass', 'die', 'kill', 200 | 'fag', 'cock', 'sex', 'loser', 'racist', 'damn', 'pussy', 'hate', 201 | 'fool', 'sick', 'liar', 'retard', 'nazi', 'retarded', 'pig', 'piss', 202 | 'wtf', 'morons', 'fat', 'niggers', 'garbage', 'ignorant', 203 | 'disgusting', 'bloody', 'arse', 'whore', 'scum', 'vagina', 'dirty', 204 | 'homosexual', 'ridiculous', 'nazis', 'hypocrite', 'your', 'ugly', 205 | 'hole', 'wanker', 'douchebag', 'balls', 'pedophile', 'butt', 206 | 'fascist', 'anal', 'bastards', 'porn', 'shame', 'arrogant', 'losers', 207 | 'bunch', 'worst', 'nerd', 'prick', 'rubbish', 'monkey', 'cum', 'screw', 208 | 'troll', 'silly', 'lies', 'twat', 'freaking', 'rape', 'goddamn', 209 | 'homo', 'anus', 'coward', 'disgrace', 'poop', 'crazy', 'trash', 'blood', 210 | 'aids', 211 | 'moronic', 'hates', 'nigga', 'donkey', 'sexual', 'hitler', 'douche', 212 | 'cking', 'eat', 'fuk', 'piece', 'mentally', 'mother', 'shove', 213 | 'evil', 'shoot', 'kiss', 'mouth', 'lazy', 'fart', 'little', 214 | 'terrorists', 'youre', 'blow', 'queer', 'black', 'brain', 215 | 'racists', 'mom', 'heck', 'lick', 'sockpuppet', 'nonsense', 'bully', 216 | 'pervert', 'bag', 'killed', 'boobs', 'nothing', 'stfu', 217 | 'prostitute', 'fukkin', 'jew', 'hater', 'stick', 'homosexuality', 218 | 'stinks', 'dumbest', 'arsehole', 'useless', 'annoying', 'goats', 219 | 'insane', 'child', 'smells', 'destroy', 'fags', 'horrible', 220 | 'hating', 'burn','nut', 'filth', 'masturbate', 'poo', 221 | 'face', 'nerds', 'rapist', 'slut', 'stop', 'rude', 'imbecile', 222 | 'donkeys', 'guts', 'fatty', 'pissed', 'lame', 'mental', 'toilet', 223 | 'terrorist', 'freak', 'smell', 'fuc', 'liars', 'retards', 'death', 224 | 'waste', 'yourself', 'blah', 'fools', 'big', 'fggt', 'homosexuals', 225 | 'looser', 'punk', 'cunts', 'scumbag', 'poor', 'cowards', 'blacks', 226 | 'like', 'life', 'smelly', 'dare', 'foolish', 'fake', 'killing', 227 | 'penises', 'ever', 'rat', 'mad', 'kissing', 228 | 'testicles', 'jerks', 'babies', 'spit', 'puppet', 'naughty', 'ggot', 229 | 'filthy', 'fock', 'quit', 'idiocy', 'damned', 
'vomit', 'violent', 230 | 'pompous', 'illiterate', 'worm', 'terrible', 'dead', 'fire', 'my', 231 | 'licking', 'cancer', 'fascists', 'ill', 'raped', 'smoke', 232 | 'feces', 'ignorance', 'cougar', 'hairy', 'semen', 233 | 'dummy', 'gtfo', 'crook', 'immature', 'site', 234 | 'pigs', 'hypocrisy', 'vandals', 'wikinazi', 'bull', 'tits', 235 | 'hypocrites', 'weed', 'murder', 'stinky', 'laugh', 'awful', 'freaky', 236 | 'lying', 'oh', 'hurt', 'chicken', 'admins', 'nation', 'hermaphrodite', 237 | 'nutcase', 'communist', 'children', 'maggot', 'utter', 'leftist', 238 | 'ban', 'joke'] 239 | 240 | positive_100 = ['thank', 'thanks', 'please', 'redirect', 'talk', 'utc', 241 | 'may', 'best', 'help', 'interested', 'sorry', 'wikiproject', 242 | 'agree', 'welcome', 'consensus', 'there', 'could', 'appreciate', 243 | 'cheers', 'request', 'continue', 244 | 'considered', 'importance', 'editing', 'apologize', 'however', 245 | 'think', 'section', 'sure', 'source', 'sources', 'good', 'article', 246 | 'deletion', 'would', 'title', 'case', 'template', 'review', 247 | 'friend', 'email', 'references', 'contribs', 'where', 'books', 'find', 248 | 'category', 'idea', 'interesting', 'used', 'believe', 'wondering', 249 | 'reverted', 'regards', 'point', 'topic', 'barnstar', 'february', 'can', 250 | 'talkpage', 'removed', 'issue', 'popular', 'did', 'mean', 'issues', 251 | 'early', 'discussion', 'april', 'provide', 'version', 'lead', 'further', 252 | 'correct', 'reply', 'great', 'promise', 'from', 'ask', 'also', 'moved', 253 | 'published', 'well', 'work', 'pov', 'noticed', 'archive', 'continues', 254 | 'unblocked', 'wikipedia', 255 | 'support', 'disagree', 'january', 'apologies', 'reported', 'better', 256 | 'september', 'standard', 'sounds', 'book', 'company', 'which', 'asked', 257 | 'need', 'made', 'michael', 'after', 'into', 'might', 'copy', 258 | 'process', 'perhaps', 'tag', 'armenian', 'not', 'against', 259 | 'added', 'involved', 'listas', 'mentioned', 'done', 'john', 260 | 'given', 'evidence', 'long', 'know', 'learned', 'science', 'often', 261 | 'definitely', 'questions', 'mention', 'seems', 'future', 'edits', 262 | 'reasonable', 'both', 'notice', 'opinions', 'central', 'exactly', 263 | 'list', 'sourced', 'incorrect', 'faith', 'context', 'example', 264 | 'policy', 'recommend', 'under', 'changed', 'about', 265 | 'saying', 'showing', 'laurent', 'image', 'season', 'first', 266 | 'didn', 'details', 'episode', 'thoughts', 'type', 'state', 267 | 'though', 'surely', 'looks', 'link', 'start', 268 | 'experiment', 'examples', 'suggestions', 'number', 'each', 'box', 269 | 'happy', 'explain', 'been', 'paragraph', 'username', 'semi', 'infobox', 'when', 270 | 'december', 'more', 'related', 'report', 'that', 'links', 'responded', 271 | 'regarding', 'requests', 'trust', 'according', 'merge', 'week', 272 | 'french', 'term', 'reference', 'listed', 'question', 'understand', 273 | 'very', 'paper', 'rfc', 'mistake', 'recently', 'convince', 274 | 'simply', 'consider', 'rather', 'chat', 'talking', 'background', 275 | 'seeing', 'latest', 'quality', 'creating', 'major', 'share', 'input', 276 | 'through', 'july', 'west', 'month', 'fair', 'therefore', 'general', 277 | 'thought', 'former', 'states', 'yesterday', 'dispute', 'proposal', 278 | 'unfair', 'fixed', 'smith', 'violations', 'familiar', 'beyond', 279 | 'football', 'themselves', 'luck', 'replied', 'copyright', 280 | 'views', 'stub', 'original', 'appreciated', 'parents', 281 | 'certainly', 'mail', 'fine', 'thinking', 'october', 'letting', 282 | 'proposed', 'follow', 'compromise', 'unless', 
'warring', 'test', 283 | 'wish', 'static', 'usage', 'same', 'message', 'named', 'reconsider', 284 | 'likely'] -------------------------------------------------------------------------------- /modelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import warnings\n", 14 | "warnings.filterwarnings('ignore')\n", 15 | "\n", 16 | "import sys\n", 17 | "sys.path.append('/home/ser/DL/toxic/solution/bin')\n", 18 | "sys.path.append('/home/ser/DL/toxic/solution/')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "Using TensorFlow backend.\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "import datetime\n", 36 | "from imports import *\n", 37 | "from config import *\n", 38 | "from utils import *\n", 39 | "from models import GRU_LSTM_model, CV_predictor, CAPSULE_model, DPCNN_model\n", 40 | "from text_cleaner import TextCleaner\n", 41 | "from contractions import contractions, negative_100, positive_100" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "negative_100 = [i for i in negative_100 if len(i) > 3][:200]\n", 51 | "positive_100 = [i for i in positive_100 if len(i) > 3][:200]\n", 52 | "# negative_100 = sorted(negative_100, key= lambda x: -len(x))\n", 53 | "# positive_100 = sorted(positive_100, key= lambda x: -len(x))\n", 54 | "\n", 55 | "valuable_words = negative_100 + positive_100" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def unify_tokens(comment):\n", 65 | " nl = []\n", 66 | " wl = WordNetLemmatizer().lemmatize\n", 67 | " for word in comment:\n", 68 | " word = wl(wl(word, pos='v'), pos='a')\n", 69 | " # token in original form, exact matching\n", 70 | " in_dict = emb_keys.get(word)\n", 71 | " if in_dict is not None:\n", 72 | " nl.append(word) \n", 73 | " else:\n", 74 | " word = word.lower()\n", 75 | " word = wl(wl(word, pos='v'), pos='a')\n", 76 | " # token lowercased, exact matching\n", 77 | " in_dict = emb_keys.get(word)\n", 78 | " if in_dict is not None:\n", 79 | " nl.append(word)\n", 80 | " else:\n", 81 | " # break if work consist of < 3 symbols as non reliable solution\n", 82 | " if len(word) < 3:\n", 83 | " continue\n", 84 | " # top pos/neg words by LR weights lowercased, partial matching\n", 85 | " for w in valuable_words:\n", 86 | " if w in word:\n", 87 | " word = word.replace(w, '')\n", 88 | " nl.append(w)\n", 89 | " if len(word) < 3:\n", 90 | " continue\n", 91 | " # embedding keys lowercased, partial matching\n", 92 | " for w in emb_sorted:\n", 93 | " if w.lower() in word:\n", 94 | " word = word.replace(w, '')\n", 95 | " nl.append(w.lower())\n", 96 | " if len(word) < 3:\n", 97 | " continue\n", 98 | " # words which were not found in dict gonna be excluded from the comment\n", 99 | " return nl\n", 100 | "\n", 101 | "def process_comment(df):\n", 102 | " df['comment_text'] = df['comment_text'].apply(unify_tokens).values\n", 103 | " return df" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | 
"EMBEDDING_FILE = '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'\n", 113 | "emb_keys = dict(get_emb_dict(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))\n", 114 | "emb_sorted = [i for i in sorted(emb_keys, key=lambda x: -len(x)) if len(i) < 15 and len(i)>2]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "text cleaner processing: 2018-03-19 19:09:42.072307\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "train = pd.read_csv('../data/train.csv.zip').fillna(\"fillna\")\n", 132 | "test = pd.read_csv('../data/test.csv.zip').fillna(\"fillna\")\n", 133 | "submission = pd.read_csv('../data/sample_submission.csv.zip')\n", 134 | "\n", 135 | "print (f'text cleaner processing: {datetime.datetime.now()}')\n", 136 | "tc = TextCleaner(contractions)\n", 137 | "train['comment_text'] = tc.transform(train['comment_text'].fillna('na').values)\n", 138 | "test['comment_text'] = tc.transform(test['comment_text'].fillna('na').values)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "train cleaning: 2018-03-19 19:10:07.279024\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "print (f'train cleaning: {datetime.datetime.now()}')\n", 156 | "train = parallelize_dataframe(train, process_comment)\n", 157 | "print (f'test cleaning: {datetime.datetime.now()}')\n", 158 | "test = parallelize_dataframe(test, process_comment)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "EMBEDDING_FILE = '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'\n", 168 | "embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "max_features = 150000\n", 178 | "maxlen = 150\n", 179 | "embed_size = 300\n", 180 | "\n", 181 | "X_train = train[\"comment_text\"].values\n", 182 | "y_train = train[[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]].values\n", 183 | "X_test = test[\"comment_text\"].values\n", 184 | "\n", 185 | "tokenizer = text.Tokenizer(num_words=max_features, lower=False)\n", 186 | "tokenizer.fit_on_texts(list(X_train) + list(X_test))\n", 187 | "X_train = tokenizer.texts_to_sequences(X_train)\n", 188 | "X_test = tokenizer.texts_to_sequences(X_test)\n", 189 | "x_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n", 190 | "x_test = sequence.pad_sequences(X_test, maxlen=maxlen)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "word_index = tokenizer.word_index\n", 200 | "nb_words = min(max_features, len(word_index))\n", 201 | "embedding_matrix = np.zeros((nb_words, embed_size))\n", 202 | "missed = []\n", 203 | "for word, i in word_index.items():\n", 204 | " if i >= max_features: continue\n", 205 | " embedding_vector = embeddings_index.get(word)\n", 206 | " if embedding_vector is not None: \n", 207 | " embedding_matrix[i] = embedding_vector \n", 208 | " else:\n", 209 | " missed.append(word)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | 
"metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "len(missed)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "missed[:10]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Tests" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 18, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "model = GRU_LSTM_model(CuDNNLSTM, maxlen, max_features, embed_size, embedding_matrix)\n", 244 | "\n", 245 | "batch_size = 128\n", 246 | "epochs = 10\n", 247 | "\n", 248 | "X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)\n", 249 | "RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)\n", 250 | "model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),\n", 251 | " callbacks=[RocAuc], verbose=1)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 58, 257 | "metadata": { 258 | "scrolled": true 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "Train on 143613 samples, validate on 15958 samples\n", 266 | "Epoch 1/15\n", 267 | "143613/143613 [==============================] - 20s 140us/step - loss: 0.0733 - acc: 0.9742 - val_loss: 0.0500 - val_acc: 0.9814\n", 268 | "\n", 269 | " ROC-AUC - epoch: 1 - score: 0.971133 \n", 270 | "\n", 271 | "Epoch 2/15\n", 272 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0495 - acc: 0.9815 - val_loss: 0.0463 - val_acc: 0.9825\n", 273 | "\n", 274 | " ROC-AUC - epoch: 2 - score: 0.979113 \n", 275 | "\n", 276 | "Epoch 3/15\n", 277 | "143613/143613 [==============================] - 18s 122us/step - loss: 0.0460 - acc: 0.9824 - val_loss: 0.0473 - val_acc: 0.9824\n", 278 | "\n", 279 | " ROC-AUC - epoch: 3 - score: 0.981239 \n", 280 | "\n", 281 | "Epoch 4/15\n", 282 | "143613/143613 [==============================] - 18s 123us/step - loss: 0.0438 - acc: 0.9831 - val_loss: 0.0431 - val_acc: 0.9835\n", 283 | "\n", 284 | " ROC-AUC - epoch: 4 - score: 0.983787 \n", 285 | "\n", 286 | "Epoch 5/15\n", 287 | "143613/143613 [==============================] - 17s 122us/step - loss: 0.0413 - acc: 0.9839 - val_loss: 0.0430 - val_acc: 0.9834\n", 288 | "\n", 289 | " ROC-AUC - epoch: 5 - score: 0.984971 \n", 290 | "\n", 291 | "Epoch 6/15\n", 292 | "143613/143613 [==============================] - 17s 118us/step - loss: 0.0397 - acc: 0.9843 - val_loss: 0.0438 - val_acc: 0.9837\n", 293 | "\n", 294 | " ROC-AUC - epoch: 6 - score: 0.984433 \n", 295 | "\n", 296 | "Epoch 7/15\n", 297 | "143613/143613 [==============================] - 18s 122us/step - loss: 0.0379 - acc: 0.9848 - val_loss: 0.0425 - val_acc: 0.9830\n", 298 | "\n", 299 | " ROC-AUC - epoch: 7 - score: 0.986733 \n", 300 | "\n", 301 | "Epoch 8/15\n", 302 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0365 - acc: 0.9852 - val_loss: 0.0426 - val_acc: 0.9838\n", 303 | "\n", 304 | " ROC-AUC - epoch: 8 - score: 0.985978 \n", 305 | "\n", 306 | "Epoch 9/15\n", 307 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0350 - acc: 0.9857 - val_loss: 0.0429 - val_acc: 0.9837\n", 308 | "\n", 309 | " ROC-AUC - epoch: 9 - score: 0.987321 \n", 310 | "\n", 311 | "Epoch 10/15\n", 312 | "143613/143613 [==============================] - 17s 121us/step - loss: 0.0337 - acc: 0.9862 - 
val_loss: 0.0473 - val_acc: 0.9807\n", 313 | "\n", 314 | " ROC-AUC - epoch: 10 - score: 0.987668 \n", 315 | "\n", 316 | "Epoch 11/15\n", 317 | "143613/143613 [==============================] - 17s 121us/step - loss: 0.0324 - acc: 0.9866 - val_loss: 0.0475 - val_acc: 0.9832\n", 318 | "\n", 319 | " ROC-AUC - epoch: 11 - score: 0.986771 \n", 320 | "\n", 321 | "Epoch 12/15\n", 322 | "143613/143613 [==============================] - 17s 121us/step - loss: 0.0315 - acc: 0.9869 - val_loss: 0.0439 - val_acc: 0.9832\n", 323 | "\n", 324 | " ROC-AUC - epoch: 12 - score: 0.987882 \n", 325 | "\n", 326 | "Epoch 13/15\n", 327 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0303 - acc: 0.9874 - val_loss: 0.0454 - val_acc: 0.9823\n", 328 | "\n", 329 | " ROC-AUC - epoch: 13 - score: 0.987454 \n", 330 | "\n", 331 | "Epoch 14/15\n", 332 | "143613/143613 [==============================] - 18s 122us/step - loss: 0.0299 - acc: 0.9876 - val_loss: 0.0456 - val_acc: 0.9834\n", 333 | "\n", 334 | " ROC-AUC - epoch: 14 - score: 0.987154 \n", 335 | "\n", 336 | "Epoch 15/15\n", 337 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0287 - acc: 0.9881 - val_loss: 0.0516 - val_acc: 0.9828\n", 338 | "\n", 339 | " ROC-AUC - epoch: 15 - score: 0.985625 \n", 340 | "\n" 341 | ] 342 | }, 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "" 347 | ] 348 | }, 349 | "execution_count": 58, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "dpcnn_kwargs = {\n", 356 | " 'maxlen': maxlen,\n", 357 | " 'max_features': max_features,\n", 358 | " 'embed_size': embed_size,\n", 359 | " 'embedding_matrix': embedding_matrix,\n", 360 | " 'spatial_dropout': 0.25,\n", 361 | " 'filter_nr': 64,\n", 362 | " 'filter_size': 3, \n", 363 | " 'max_pool_size': 3, \n", 364 | " 'max_pool_strides': 2,\n", 365 | " 'dense_nr': 256,\n", 366 | " 'dense_dropout': 0.5\n", 367 | "}\n", 368 | "\n", 369 | "model = DPCNN_model(**dpcnn_kwargs)\n", 370 | "\n", 371 | "batch_size = 128\n", 372 | "epochs = 15\n", 373 | "\n", 374 | "X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)\n", 375 | "RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)\n", 376 | "model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),\n", 377 | " callbacks=[RocAuc], verbose=1)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### Predictions" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "#### DPCNN" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "scrolled": true 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "dpcnn_kwargs = {\n", 403 | " 'maxlen': maxlen,\n", 404 | " 'max_features': max_features,\n", 405 | " 'embed_size': embed_size,\n", 406 | " 'embedding_matrix': embedding_matrix,\n", 407 | " 'spatial_dropout': 0.25,\n", 408 | " 'filter_nr': 64,\n", 409 | " 'filter_size': 3, \n", 410 | " 'max_pool_size': 3, \n", 411 | " 'max_pool_strides': 2,\n", 412 | " 'dense_nr': 256,\n", 413 | " 'dense_dropout': 0.5\n", 414 | "}\n", 415 | "\n", 416 | "batch_size = 128\n", 417 | "n_splits = 10\n", 418 | "epochs = range(10)\n", 419 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 420 | "\n", 421 | "cv = CV_predictor(DPCNN_model, x_train, y_train, x_test, \n", 422 | " 
n_splits, batch_size, epochs, list_classes, dpcnn_kwargs)\n", 423 | "cv.predict()\n", 424 | "\n", 425 | "train_p = cv.train_predictions\n", 426 | "test_p = cv.test_predictions\n", 427 | "test_p.index = test['id']\n", 428 | "\n", 429 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_dpcnn.csv', index=False)\n", 430 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_dpcnn.csv', index=False)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "#### GRU" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "scrolled": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "gru_kwargs = {\n", 449 | " 'CuDNN': CuDNNGRU,\n", 450 | " 'maxlen': maxlen, \n", 451 | " 'max_features': max_features, \n", 452 | " 'embed_size': embed_size, \n", 453 | " 'embedding_matrix' : embedding_matrix\n", 454 | "}\n", 455 | "\n", 456 | "batch_size = 128\n", 457 | "n_splits = 10\n", 458 | "epochs = range(4)\n", 459 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 460 | "\n", 461 | "cv = CV_predictor(GRU_LSTM_model, x_train, y_train, x_test, \n", 462 | " n_splits, batch_size, epochs, list_classes, gru_kwargs)\n", 463 | "cv.predict()\n", 464 | "\n", 465 | "train_p = cv.train_predictions\n", 466 | "test_p = cv.test_predictions\n", 467 | "test_p.index = test['id']\n", 468 | "\n", 469 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_gru_lem_low.csv', index=False)\n", 470 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_gru_lem_low.csv', index=False)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "#### LSTM" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "scrolled": false 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "gru_kwargs = {\n", 489 | " 'CuDNN': CuDNNLSTM, \n", 490 | " 'maxlen': maxlen, \n", 491 | " 'max_features': max_features, \n", 492 | " 'embed_size': embed_size, \n", 493 | " 'embedding_matrix' : embedding_matrix\n", 494 | "}\n", 495 | "\n", 496 | "batch_size = 128\n", 497 | "n_splits = 10\n", 498 | "epochs = range(4)\n", 499 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 500 | "\n", 501 | "cv = CV_predictor(GRU_LSTM_model, x_train, y_train, x_test, \n", 502 | " n_splits, batch_size, epochs, list_classes, gru_kwargs)\n", 503 | "cv.predict()\n", 504 | "\n", 505 | "train_p = cv.train_predictions\n", 506 | "test_p = cv.test_predictions\n", 507 | "test_p.index = test['id']\n", 508 | "\n", 509 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_lstm_lem_low.csv', index=False)\n", 510 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_lstm_lem_low.csv', index=False)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "#### Capsule" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": { 524 | "scrolled": false 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "capsule_kwargs = {\n", 529 | " 'maxlen': maxlen, \n", 530 | " 'max_features': max_features, \n", 531 | " 'embed_size': embed_size, \n", 532 | " 'embedding_matrix' : embedding_matrix, \n", 533 | " 'rate_drop_dense': 0.3,\n", 534 | " 'Num_capsule': 10, \n", 535 | " 'Dim_capsule': 16, \n", 536 | " 'Routings': 5,\n", 537 | " 
'gru_len': 128\n", 538 | "}\n", 539 | "\n", 540 | "batch_size = 128\n", 541 | "n_splits = 10\n", 542 | "epochs = range(4)\n", 543 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 544 | "\n", 545 | "cv = CV_predictor(CAPSULE_model, x_train, y_train, x_test, \n", 546 | " n_splits, batch_size, epochs, list_classes, capsule_kwargs)\n", 547 | "cv.predict()\n", 548 | "\n", 549 | "train_p = cv.train_predictions\n", 550 | "test_p = cv.test_predictions\n", 551 | "test_p.index = test['id']\n", 552 | "\n", 553 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_capsule_lem_low.csv', index=False)\n", 554 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_capsule_lem_low.csv', index=False)" 555 | ] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.6.3" 575 | }, 576 | "latex_envs": { 577 | "LaTeX_envs_menu_present": true, 578 | "autoclose": false, 579 | "autocomplete": true, 580 | "bibliofile": "biblio.bib", 581 | "cite_by": "apalike", 582 | "current_citInitial": 1, 583 | "eqLabelWithNumbers": true, 584 | "eqNumInitial": 1, 585 | "hotkeys": { 586 | "equation": "Ctrl-E", 587 | "itemize": "Ctrl-I" 588 | }, 589 | "labels_anchors": false, 590 | "latex_user_defs": false, 591 | "report_style_numbering": false, 592 | "user_envs_cfg": false 593 | }, 594 | "toc": { 595 | "nav_menu": {}, 596 | "number_sections": true, 597 | "sideBar": true, 598 | "skip_h1_title": false, 599 | "title_cell": "Table of Contents", 600 | "title_sidebar": "Contents", 601 | "toc_cell": false, 602 | "toc_position": {}, 603 | "toc_section_display": true, 604 | "toc_window_display": false 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 2 609 | } 610 | --------------------------------------------------------------------------------
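A note on bin/nb_features.py: nothing in modelling.ipynb above actually uses NBFeaturer, so it is easy to miss how it is meant to be wired up. The following is a minimal sketch, not code from this repository: it assumes the standard competition train.csv layout (a comment_text column plus the six label columns used throughout the notebook), and the tf-idf settings, alpha and C values are illustrative guesses that only show where NBFeaturer would sit in a scikit-learn pipeline.

# Hypothetical usage sketch for bin/nb_features.py (not taken from this repository).
# Tf-idf features are rescaled by NB log-count ratios, then fed to a per-label
# logistic regression; all hyperparameters below are illustrative.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

from nb_features import NBFeaturer  # assumes bin/ is on sys.path, as in modelling.ipynb

train = pd.read_csv('../data/train.csv.zip').fillna('fillna')
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

for label in list_classes:
    y = train[label].values
    # NBFeaturer needs y at fit time, so it lives inside the pipeline: during
    # cross-validation it only ever sees the training fold's labels.
    pipe = make_pipeline(
        TfidfVectorizer(ngram_range=(1, 2), min_df=3, sublinear_tf=True),
        NBFeaturer(alpha=1.0),
        LogisticRegression(C=4.0),
    )
    score = cross_val_score(pipe, train['comment_text'], y, cv=3, scoring='roc_auc').mean()
    print(f'{label}: CV ROC-AUC = {score:.4f}')

Keeping NBFeaturer inside the pipeline matters because its fit uses the labels; fitting it once on the full training set and then cross-validating would leak validation labels into the features.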