├── .gitignore
├── bin
│   ├── config.py
│   ├── nb_features.py
│   ├── imports.py
│   ├── utils.py
│   ├── text_cleaner.py
│   ├── models.py
│   └── contractions.py
├── README.md
├── requirements.txt
└── modelling.ipynb

/.gitignore:
--------------------------------------------------------------------------------
# Jupyter Notebook
.ipynb_checkpoints

# dotenv
.env

# virtualenv
.venv
venv/
ENV/
--------------------------------------------------------------------------------
/bin/config.py:
--------------------------------------------------------------------------------
config = dict(
    HOME='/home/ser/DL/toxic/',
    data=dict(
        train='train.csv.zip',
        test='test.csv.zip',
        sample='sample_submission.csv.zip'
    )
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# jigsaw-toxic-comment-classification-challenge

Result: 19/4551

Kaggle kernels:

* [Words polarity based on LR weights](https://www.kaggle.com/sermakarevich/words-polarity-based-on-lr-weights)
* [Hierarchical Attention Network](https://www.kaggle.com/sermakarevich/hierarchical-attention-network)
* [Sklearn pipelines tutorial](https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial)
--------------------------------------------------------------------------------
/bin/nb_features.py:
--------------------------------------------------------------------------------
from imports import *


class NBFeaturer(BaseEstimator, ClassifierMixin):
    """Rescales a sparse count/tf-idf matrix by naive Bayes log-count ratios."""

    def __init__(self, alpha):
        self.alpha = alpha  # additive smoothing

    def preprocess_x(self, x, r):
        return x.multiply(r)

    def pr(self, x, y_i, y):
        # smoothed sum of feature values over the documents of class y_i
        p = x[y == y_i].sum(0)
        return (p + self.alpha) / ((y == y_i).sum() + self.alpha)

    def fit(self, x, y=None):
        # log ratio of per-feature probabilities between the positive and negative class
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) / self.pr(x, 0, y)))
        return self

    def transform(self, x):
        x_nb = self.preprocess_x(x, self._r)
        return x_nb
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.1.11 2 | appnope==0.1.0 3 | astor==0.6.2 4 | bleach==1.5.0 5 | cycler==0.10.0 6 | decorator==4.2.1 7 | entrypoints==0.2.3 8 | gast==0.2.0 9 | grpcio==1.10.0 10 | html5lib==0.9999999 11 | ipykernel==4.8.2 12 | ipython==6.2.1 13 | ipython-genutils==0.2.0 14 | ipywidgets==7.1.2 15 | jedi==0.11.1 16 | Jinja2==2.10 17 | joblib==0.11 18 | jsonschema==2.6.0 19 | jupyter==1.0.0 20 | jupyter-client==5.2.3 21 | jupyter-console==5.2.0 22 | jupyter-core==4.4.0 23 | Keras==2.1.5 24 | kiwisolver==1.0.1 25 | Markdown==2.6.11 26 | MarkupSafe==1.0 27 | matplotlib==2.2.2 28 | mistune==0.8.3 29 | nbconvert==5.3.1 30 | nbformat==4.4.0 31 | nltk==3.2.5 32 | notebook==5.7.8 33 | numpy==1.14.2 34 | pandas==0.22.0 35 | pandocfilters==1.4.2 36 | parso==0.1.1 37 | pexpect==4.4.0 38 | pickleshare==0.7.4 39 | prompt-toolkit==1.0.15 40 | protobuf==3.5.2.post1 41 | ptyprocess==0.5.2 42 | Pygments==2.2.0 43 | pyparsing==2.2.0 44 | python-dateutil==2.7.0 45 | pytz==2018.3 46 | PyYAML==5.1 47 | pyzmq==17.0.0 48 | qtconsole==4.3.1 49 | scikit-learn==0.19.1 50 | scipy==1.0.0 51 | Send2Trash==1.5.0 52 | simplegeneric==0.8.1 53 | six==1.11.0 54 | sklearn==0.0 55 | tensorboard==1.6.0 56 | tensorflow==1.12.2 57 | termcolor==1.1.0 58 | terminado==0.8.1 59 |
testpath==0.3.1 60 | tornado==5.0.1 61 | traitlets==4.3.2 62 | wcwidth==0.1.7 63 | webencodings==0.5.1 64 | Werkzeug==0.15.3 65 | widgetsnbextension==3.1.4 66 | -------------------------------------------------------------------------------- /bin/imports.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import string 4 | from string import digits 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from scipy import sparse 9 | 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import roc_auc_score 12 | 13 | from keras.models import Model, Sequential 14 | from keras.layers import (Input, Dense, Embedding, SpatialDropout1D, concatenate, RepeatVector, Flatten, Conv1D, 15 | GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU, CuDNNLSTM, MaxPooling1D, Layer, 16 | Dropout, K, Activation, BatchNormalization, PReLU, add, Reshape) 17 | from keras.preprocessing import text, sequence 18 | from keras import optimizers 19 | from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint 20 | 21 | 22 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 23 | from sklearn.base import ClassifierMixin, BaseEstimator 24 | from sklearn.linear_model import LogisticRegression 25 | from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline 26 | from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold 27 | from sklearn.metrics import roc_auc_score 28 | 29 | import nltk 30 | from nltk.stem import WordNetLemmatizer 31 | from nltk.corpus import wordnet, stopwords 32 | 33 | import pandas as pd 34 | from joblib import Parallel, delayed 35 | import multiprocessing 36 | from multiprocessing import Pool -------------------------------------------------------------------------------- /bin/utils.py: -------------------------------------------------------------------------------- 1 | from imports import * 2 | 3 | 4 | re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])') 5 | def tokenize(s): 6 | return re_tok.sub(r' \1 ', s).split() 7 | 8 | 9 | def get_coefs(word, *arr): 10 | return word.lower(), np.asarray(arr, dtype='float32') 11 | 12 | def get_emb_dict(word, *arr): 13 | return word.lower(), 1 14 | 15 | 16 | def substitute(word, neg, pos): 17 | for n in neg: 18 | if n.lower() in word.lower(): 19 | return n 20 | for p in pos: 21 | if p.lower() in word.lower(): 22 | return p 23 | return None 24 | 25 | 26 | def parallelize_dataframe(df, func): 27 | df_split = np.array_split(df, multiprocessing.cpu_count()) 28 | pool = Pool(multiprocessing.cpu_count()) 29 | df = pd.concat(pool.map(func, df_split)) 30 | pool.close() 31 | pool.join() 32 | return df 33 | 34 | 35 | def applyParallel(df, func): 36 | retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(val) for val in df) 37 | return retLst 38 | 39 | 40 | class RocAucEvaluation(Callback): 41 | def __init__(self, validation_data=(), interval=1): 42 | super(Callback, self).__init__() 43 | 44 | self.interval = interval 45 | self.X_val, self.y_val = validation_data 46 | 47 | def on_epoch_end(self, epoch, logs={}): 48 | if epoch % self.interval == 0: 49 | y_pred = self.model.predict(self.X_val, verbose=0) 50 | score = roc_auc_score(self.y_val, y_pred) 51 | print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score)) 52 | -------------------------------------------------------------------------------- /bin/text_cleaner.py: 
-------------------------------------------------------------------------------- 1 | from imports import * 2 | 3 | 4 | class TextCleaner(BaseEstimator): 5 | def __init__(self, contractions): 6 | self.wl = WordNetLemmatizer().lemmatize 7 | self.wn = wordnet.morphy 8 | self.wt = nltk.word_tokenize 9 | self.c_s = contractions 10 | self.ss = "'\":-.,=`*/|—~\\•" 11 | self.tp = re.compile('\w{1,}') 12 | self.tp2 = re.compile('([!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~“”¨«»®´·º½¾¿¡§£₤‘’])') 13 | 14 | def remove_digits(self, x): 15 | rd = x.maketrans(' ', ' ', digits) 16 | x = x.translate(rd) 17 | return x 18 | 19 | def lemmatizer(self, x): 20 | return [self.wl(self.wl('%s'%i, pos='v'), pos='a') for i in x] 21 | 22 | 23 | def morphy(self, x): 24 | m = self.wn(x) 25 | if m is None: 26 | return x 27 | else: 28 | return m 29 | 30 | def tokenize(self, s): 31 | return self.tp.findall(s) 32 | 33 | def tokenize2(self, s): 34 | return self.tp2.sub(r' \1 ', s).split() 35 | 36 | def morphy_list(self, x): 37 | return [self.morphy(i) for i in x] 38 | 39 | def contr(self, x): 40 | for k, v in self.c_s.items(): 41 | x = x.replace(k, v) 42 | return x 43 | 44 | def special_symbols(self, x): 45 | for ss in self.ss: 46 | if len(x) > 1: 47 | x = x.replace(ss, '') 48 | return x 49 | 50 | def remove_stopwords(self, x): 51 | return [i for i in x if i not in stopwords.words('english')] 52 | 53 | def fit(self, x, y=None): 54 | return self 55 | 56 | def transform(self, x): 57 | x = map(lambda r: r.replace('_', ' '), x) 58 | x = map(lambda r: r.replace('`', '\''), x) 59 | x = map(lambda r: self.remove_digits(r), x) 60 | x = map(self.contr, x) 61 | # x = map(lambda r: r.lower(), x) 62 | # x = map(self.contr, x) 63 | x = map(self.special_symbols, x) 64 | # x = map(self.wt, x) 65 | x = map(self.tokenize2, x) 66 | # x = map(self.remove_stopwords, x) 67 | # x = map(self.lemmatizer, x) 68 | # x = list(map(self.morphy_list, x)) 69 | # x = map(lambda i: ' '.join(i), x) 70 | x = list(x) 71 | return x -------------------------------------------------------------------------------- /bin/models.py: -------------------------------------------------------------------------------- 1 | from imports import * 2 | 3 | 4 | ############################################## 5 | #### GRU/LSTM ############################## 6 | ############################################## 7 | 8 | def GRU_LSTM_model(CuDNN, maxlen, max_features, embed_size, embedding_matrix): 9 | inp = Input(shape=(maxlen, )) 10 | x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = False)(inp) 11 | x = SpatialDropout1D(0.2)(x) 12 | x = Bidirectional(CuDNN(128, return_sequences=True))(x) 13 | x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x) 14 | 15 | avg_pool = GlobalAveragePooling1D()(x) 16 | max_pool = GlobalMaxPooling1D()(x) 17 | 18 | conc = concatenate([avg_pool, max_pool]) 19 | outp = Dense(6, activation="sigmoid")(conc) 20 | 21 | model = Model(inputs=inp, outputs=outp) 22 | model.compile(loss='binary_crossentropy', 23 | optimizer='adam', 24 | metrics=['accuracy']) 25 | 26 | return model 27 | 28 | 29 | ############################################## 30 | #### CAPSULE ############################### 31 | ############################################## 32 | 33 | def squash(x, axis=-1): 34 | # s_squared_norm is really small 35 | # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon() 36 | # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm) 37 | # return scale * x 38 | s_squared_norm = K.sum(K.square(x), 
axis, keepdims=True) 39 | scale = K.sqrt(s_squared_norm + K.epsilon()) 40 | return x / scale 41 | 42 | 43 | # A Capsule Implement with Pure Keras 44 | class Capsule(Layer): 45 | def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True, 46 | activation='default', **kwargs): 47 | super(Capsule, self).__init__(**kwargs) 48 | self.num_capsule = num_capsule 49 | self.dim_capsule = dim_capsule 50 | self.routings = routings 51 | self.kernel_size = kernel_size 52 | self.share_weights = share_weights 53 | if activation == 'default': 54 | self.activation = squash 55 | else: 56 | self.activation = Activation(activation) 57 | 58 | def build(self, input_shape): 59 | super(Capsule, self).build(input_shape) 60 | input_dim_capsule = input_shape[-1] 61 | if self.share_weights: 62 | self.W = self.add_weight(name='capsule_kernel', 63 | shape=(1, input_dim_capsule, 64 | self.num_capsule * self.dim_capsule), 65 | # shape=self.kernel_size, 66 | initializer='glorot_uniform', 67 | trainable=True) 68 | else: 69 | input_num_capsule = input_shape[-2] 70 | self.W = self.add_weight(name='capsule_kernel', 71 | shape=(input_num_capsule, 72 | input_dim_capsule, 73 | self.num_capsule * self.dim_capsule), 74 | initializer='glorot_uniform', 75 | trainable=True) 76 | 77 | def call(self, u_vecs): 78 | if self.share_weights: 79 | u_hat_vecs = K.conv1d(u_vecs, self.W) 80 | else: 81 | u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1]) 82 | 83 | batch_size = K.shape(u_vecs)[0] 84 | input_num_capsule = K.shape(u_vecs)[1] 85 | u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule, 86 | self.num_capsule, self.dim_capsule)) 87 | u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3)) 88 | # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule] 89 | 90 | b = K.zeros_like(u_hat_vecs[:, :, :, 0]) # shape = [None, num_capsule, input_num_capsule] 91 | for i in range(self.routings): 92 | b = K.permute_dimensions(b, (0, 2, 1)) # shape = [None, input_num_capsule, num_capsule] 93 | c = K.softmax(b) 94 | c = K.permute_dimensions(c, (0, 2, 1)) 95 | b = K.permute_dimensions(b, (0, 2, 1)) 96 | outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2])) 97 | if i < self.routings - 1: 98 | b = K.batch_dot(outputs, u_hat_vecs, [2, 3]) 99 | 100 | return outputs 101 | 102 | def compute_output_shape(self, input_shape): 103 | return (None, self.num_capsule, self.dim_capsule) 104 | 105 | 106 | def CAPSULE_model(maxlen, max_features, embed_size, embedding_matrix, rate_drop_dense, 107 | Num_capsule, Dim_capsule, Routings, gru_len): 108 | input1 = Input(shape=(maxlen,)) 109 | embed_layer = Embedding(max_features, 110 | embed_size, 111 | input_length=maxlen, 112 | weights=[embedding_matrix], 113 | trainable=False)(input1) 114 | embed_layer = SpatialDropout1D(rate_drop_dense)(embed_layer) 115 | 116 | # x = Bidirectional( 117 | # GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))( 118 | # embed_layer) 119 | 120 | x = Bidirectional( 121 | CuDNNGRU(gru_len,return_sequences=True))(embed_layer) 122 | x = Dropout(rate_drop_dense)(x) 123 | capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings, 124 | share_weights=True)(x) 125 | # output_capsule = Lambda(lambda x: K.sqrt(K.sum(K.square(x), 2)))(capsule) 126 | capsule = Flatten()(capsule) 127 | capsule = Dropout(rate_drop_dense)(capsule) 128 | output = Dense(6, activation='sigmoid')(capsule) 129 | model = Model(inputs=input1, outputs=output) 130 | 
model.compile( 131 | loss='binary_crossentropy', 132 | optimizer='adam', 133 | metrics=['accuracy']) 134 | return model 135 | 136 | 137 | ############################################## 138 | #### DCNN ############################### 139 | ############################################## 140 | 141 | def DPCNN_model(maxlen, max_features, embed_size, embedding_matrix, spatial_dropout, 142 | filter_nr, filter_size, max_pool_size, max_pool_strides, dense_nr, dense_dropout): 143 | comment = Input(shape=(maxlen,)) 144 | emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(comment) 145 | emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment) 146 | 147 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(emb_comment) 148 | block1 = BatchNormalization()(block1) 149 | block1 = PReLU()(block1) 150 | block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1) 151 | block1 = BatchNormalization()(block1) 152 | block1 = PReLU()(block1) 153 | 154 | #we pass embedded comment through conv1d with filter size 1 because it needs 155 | # to have the same shape as block output 156 | #if you choose filter_nr = embed_size (300 in this case) you don't have 157 | # to do this part and can add emb_comment directly to block1_output 158 | resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(emb_comment) 159 | resize_emb = PReLU()(resize_emb) 160 | 161 | block1_output = add([block1, resize_emb]) 162 | block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output) 163 | 164 | block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1_output) 165 | block2 = BatchNormalization()(block2) 166 | block2 = PReLU()(block2) 167 | block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block2) 168 | block2 = BatchNormalization()(block2) 169 | block2 = PReLU()(block2) 170 | 171 | block2_output = add([block2, block1_output]) 172 | block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output) 173 | 174 | block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block2_output) 175 | block3 = BatchNormalization()(block3) 176 | block3 = PReLU()(block3) 177 | block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block3) 178 | block3 = BatchNormalization()(block3) 179 | block3 = PReLU()(block3) 180 | 181 | block3_output = add([block3, block2_output]) 182 | block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output) 183 | 184 | block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block3_output) 185 | block4 = BatchNormalization()(block4) 186 | block4 = PReLU()(block4) 187 | block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block4) 188 | block4 = BatchNormalization()(block4) 189 | block4 = PReLU()(block4) 190 | 191 | output = add([block4, block3_output]) 192 | output = GlobalMaxPooling1D()(output) 193 | output = Dense(dense_nr, activation='linear')(output) 194 | output = BatchNormalization()(output) 195 | output = PReLU()(output) 196 | output = Dropout(dense_dropout)(output) 197 | output = Dense(6, activation='sigmoid')(output) 198 | 199 | model = Model(comment, output) 200 | 201 | model.compile(loss='binary_crossentropy', 202 | optimizer=optimizers.Adam(), 203 | metrics=['accuracy']) 204 
| return model


##############################################
####  CV PREDICTOR  ##########################
##############################################


class CV_predictor():
    '''
    Runs K-fold cross-validation for a Keras model factory: collects out-of-fold
    predictions on the train set and fold-averaged predictions on the test set
    from a tuned pipeline.
    '''
    def __init__(self, get_model, x_train, y_train, x_test,
                 n_splits, batch_size, epochs, col_names,
                 model_kwargs):
        self.get_model = get_model
        self.cv = KFold(n_splits=n_splits, shuffle=True, random_state=1)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.scorer = roc_auc_score
        self.train_predictions = []
        self.test_predictions = []
        self.score = []
        self.epochs = epochs
        self.batch_size = batch_size
        self.col_names = col_names
        self.model_kwargs = model_kwargs

    def predict(self):
        fold_number = 1
        for train_i, valid_i in self.cv.split(self.x_train, self.y_train):
            # a fresh model per fold, trained one epoch at a time so the score can be tracked
            model = self.get_model(**self.model_kwargs)
            x_train = self.x_train[train_i]
            y_train = self.y_train[train_i]
            x_valid = self.x_train[valid_i]
            y_valid = self.y_train[valid_i]
            for i in self.epochs:
                model.fit(x_train, y_train, epochs=1, batch_size=self.batch_size)
                train_prediction = model.predict(x_valid, self.batch_size * 2)
                print(f'fold: {fold_number}, epoch: {i}, score: {self.scorer(y_valid, train_prediction)}')
            test_prediction = model.predict(self.x_test, self.batch_size * 2)
            self.train_predictions.append([train_prediction, valid_i])
            self.test_predictions.append(test_prediction)
            self.score.append(self.scorer(y_valid, train_prediction))
            print(f"fold: {fold_number}, final score: {self.score[-1]}")
            fold_number += 1
        print(f'mean CV score: {np.mean(self.score)}')
        # out-of-fold train predictions, reassembled in the original row order
        self.train_predictions = (
            pd.concat([pd.DataFrame(data=i[0], index=i[1], columns=[self.col_names])
                       for i in self.train_predictions]).sort_index())
        # test predictions averaged over folds
        self.test_predictions = pd.DataFrame(data=np.mean(self.test_predictions, axis=0), columns=[self.col_names])
--------------------------------------------------------------------------------
/bin/contractions.py:
--------------------------------------------------------------------------------
1 | contractions = { 2 | "ain't": "am not", 3 | "aren't": "are not", 4 | "can't": "cannot", 5 | "can't've": "cannot have", 6 | "'cause": "because", 7 | "could've": "could have", 8 | "couldn't": "could not", 9 | "couldn't've": "could not have", 10 | "didn't": "did not", 11 | "doesn't": "does not", 12 | "don't": "do not", 13 | "hadn't": "had not", 14 | "hadn't've": "had not have", 15 | "hasn't": "has not", 16 | "haven't": "have not", 17 | "he'd": "he had", 18 | "he'd've": "he would have", 19 | "he'll": "he shall", 20 | "he'll've": "he shall have", 21 | "he's": "he has", 22 | "how'd": "how did", 23 | "how'd'y": "how do you", 24 | "how'll": "how will", 25 | "how's": "how has", 26 | "I'd": "I had", 27 | "I'd've": "I would have", 28 | "I'll": "I will", 29 | "I'll've": "I will have", 30 | "I'm": "I am", 31 | "I've": "I have", 32 | "isn't": "is not", 33 | "it'd": "it would", 34 | "it'd've": "it would have", 35 | "it'll": "it will", 36 | "it'll've": "it will have", 37 | "it's": "it is", 38 | "let's": "let us", 39 | "ma'am": "madam", 40 | "mayn't": "may not", 41 | "might've": "might have", 42 | "mightn't": "might
not", 43 | "mightn't've": "might not have", 44 | "must've": "must have", 45 | "mustn't": "must not", 46 | "mustn't've": "must not have", 47 | "needn't": "need not", 48 | "needn't've": "need not have", 49 | "o'clock": "of the clock", 50 | "oughtn't": "ought not", 51 | "oughtn't've": "ought not have", 52 | "shan't": "shall not", 53 | "sha'n't": "shall not", 54 | "shan't've": "shall not have", 55 | "she'd": "she would", 56 | "she'd've": "she would have", 57 | "she'll": "she will", 58 | "she'll've": "she will have", 59 | "she's": "she is", 60 | "should've": "should have", 61 | "shouldn't": "should not", 62 | "shouldn't've": "should not have", 63 | "so've": "so have", 64 | "so's": "so is", 65 | "that'd": "that had", 66 | "that'd've": "that would have", 67 | "that's": "that is", 68 | "there'd": "there would", 69 | "there'd've": "there would have", 70 | "there's": "there is", 71 | "they'd": "they would", 72 | "they'd've": "they would have", 73 | "they'll": "they will", 74 | "they'll've": "they will have", 75 | "they're": "they are", 76 | "they've": "they have", 77 | "to've": "to have", 78 | "wasn't": "was not", 79 | "we'd": "we would", 80 | "we'd've": "we would have", 81 | "we'll": "we will", 82 | "we'll've": "we will have", 83 | "we're": "we are", 84 | "we've": "we have", 85 | "weren't": "were not", 86 | "what'll": "what will", 87 | "what'll've": "what will have", 88 | "what're": "what are", 89 | "what's": "what is", 90 | "what've": "what have", 91 | "when's": "when is", 92 | "when've": "when have", 93 | "where'd": "where did", 94 | "where's": "where is", 95 | "where've": "where have", 96 | "who'll": "who will", 97 | "who'll've": "who will have", 98 | "who's": "who is", 99 | "who've": "who have", 100 | "why's": "why is", 101 | "why've": "why have", 102 | "will've": "will have", 103 | "won't": "will not", 104 | "won't've": "will not have", 105 | "would've": "would have", 106 | "wouldn't": "would not", 107 | "wouldn't've": "would not have", 108 | "y'all": "you all", 109 | "y'all'd": "you all would", 110 | "y'all'd've": "you all would have", 111 | "y'all're": "you all are", 112 | "y'all've": "you all have", 113 | "you'd": "you would", 114 | "you'd've": "you would have", 115 | "you'll": " you will", 116 | "you'll've": "you will have", 117 | "you're": "you are", 118 | "you've": "you have", 119 | "f*ck": "fuck", 120 | "f**k": "fuck", 121 | 'f*cking': 'fucking', 122 | 'f**king': 'fucking', 123 | "fucking": " fucking ", 124 | "fcuk": "fuck", 125 | "fucksex": "fuck sex", 126 | "MOTHJER": "mother", 127 | "OFFFUCK": "OFF FUCK", 128 | "MarcolFuck": "Marcol Fuck", 129 | "SECURITYFUCK": "security FUCK", 130 | "CUNTBAG": "CUNT BAG", 131 | "ancestryFuck": "ancestry Fuck", 132 | "shitFuck": "shit Fuck", 133 | "CENTRALISTSTUPID": "centralists STUPID", 134 | "bitchMattythewhite": "bitch Matty the white", 135 | "HAAHHAHAHAH": "hahahaha", 136 | "SHITHOLE": "shithole", 137 | "cuntLiz": "cunt Liz", 138 | "PenIS": "penis", 139 | "pennnis": "penis", 140 | "pneis": "penis", 141 | "pensnsnnienSNsn": "penis", 142 | "peNis": "penis", 143 | "itsuck": "it suck", 144 | "gayfrozen": "gay frozen", 145 | "GAYFAG": "GAY FAG", 146 | "CUNTFRANKS": "CUNT FRANKS", 147 | "ahahahahahahahahahahahahahahahahahahaha": "hahahaha", 148 | "FoReVeR": "forever", 149 | "ReSPeCT": "respect", 150 | 'PaTHeTiC': "pathetic", 151 | "FUCKINGABF" : "FUCKING", 152 | "misterwiki" : "mister wiki", 153 | "MUAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHAHA" : "hahahaha", 154 | "DICKED": 'dicked', 155 | "FUCKBAGS": "FUCK bags", 156 | "DreamGuy": "Dream Guy", 157 | 
"accessdate": "access date", 158 | "wikistalking": "wiki talking", 159 | "ashol": "asshole", 160 | 'HAHAHAHAHAHAHAHAHAHAHAHAHA': "hahahaha", 161 | 'BUTTHEAD': "butthead", 162 | 'bitchMother': 'bitch Mother', 163 | "FUCKK" : "FUCK", 164 | "DICKFACE": "dickface", 165 | "sUcks": "sucks", 166 | "suCks": "sucks", 167 | "sucKs": "sucks", 168 | "suckS": "sucks", 169 | 'ASSWHOLE': 'ASSHOLE', 170 | 'faggotMONGO': 'faggot MONGO', 171 | 'AULAHEHELALELALALA': "hahahaha", 172 | 'fuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuckfuck': "FUCK", 173 | 'suckersyou': 'sucker you', 174 | 'shiot': "shit", 175 | "YOUFUCK": 'YOU FUCK', 176 | 'youfuck': "you fuck", 177 | 'néger':'niger', 178 | 'youFuck': 'you Fuck', 179 | 'NIGGORS': "nigers", 180 | 'DONKEYSEX': "donkey sex", 181 | 'BeCauSe': "because", 182 | "CUCKS": "sucks", 183 | "NIGGERJEW": "nigger jew", 184 | "ANALANAL": "anal anal", 185 | 'SEXBUTT': "sex butt", 186 | 'bitchesfuck': "bitches fuck", 187 | 'ARSEHOLE': "asshole", 188 | 'MOTHERFUCKERDIE': "motherfucker die", 189 | 'BUTTSLUTS': "butt sluts", 190 | 'penisSmall': "penis small", 191 | 'FuckoffJewish': 'Fuck off Jewish', 192 | 'TURKEYFUCK': "turkey FUCK", 193 | 'faggotgay': "faggot gay", 194 | 'headsdick': 'heads dick' 195 | } 196 | 197 | negative_100 = ['fuck', 'shit', 'idiot', 'stupid', 'suck', 'bitch', 'dick', 'crap', 198 | 'faggot', 'hell', 'penis', 'moron', 'pathetic', 'cunt', 'bastard', 199 | 'gay', 'shut', 'dumb', 'jerk', 'nigger', 'ass', 'die', 'kill', 200 | 'fag', 'cock', 'sex', 'loser', 'racist', 'damn', 'pussy', 'hate', 201 | 'fool', 'sick', 'liar', 'retard', 'nazi', 'retarded', 'pig', 'piss', 202 | 'wtf', 'morons', 'fat', 'niggers', 'garbage', 'ignorant', 203 | 'disgusting', 'bloody', 'arse', 'whore', 'scum', 'vagina', 'dirty', 204 | 'homosexual', 'ridiculous', 'nazis', 'hypocrite', 'your', 'ugly', 205 | 'hole', 'wanker', 'douchebag', 'balls', 'pedophile', 'butt', 206 | 'fascist', 'anal', 'bastards', 'porn', 'shame', 'arrogant', 'losers', 207 | 'bunch', 'worst', 'nerd', 'prick', 'rubbish', 'monkey', 'cum', 'screw', 208 | 'troll', 'silly', 'lies', 'twat', 'freaking', 'rape', 'goddamn', 209 | 'homo', 'anus', 'coward', 'disgrace', 'poop', 'crazy', 'trash', 'blood', 210 | 'aids', 211 | 'moronic', 'hates', 'nigga', 'donkey', 'sexual', 'hitler', 'douche', 212 | 'cking', 'eat', 'fuk', 'piece', 'mentally', 'mother', 'shove', 213 | 'evil', 'shoot', 'kiss', 'mouth', 'lazy', 'fart', 'little', 214 | 'terrorists', 'youre', 'blow', 'queer', 'black', 'brain', 215 | 'racists', 'mom', 'heck', 'lick', 'sockpuppet', 'nonsense', 'bully', 216 | 'pervert', 'bag', 'killed', 'boobs', 'nothing', 'stfu', 217 | 'prostitute', 'fukkin', 'jew', 'hater', 'stick', 'homosexuality', 218 | 'stinks', 'dumbest', 'arsehole', 'useless', 'annoying', 'goats', 219 | 'insane', 'child', 'smells', 'destroy', 'fags', 'horrible', 220 | 'hating', 'burn','nut', 'filth', 'masturbate', 'poo', 221 | 'face', 'nerds', 'rapist', 'slut', 'stop', 'rude', 'imbecile', 222 | 'donkeys', 'guts', 'fatty', 'pissed', 'lame', 'mental', 'toilet', 223 | 'terrorist', 'freak', 'smell', 'fuc', 'liars', 'retards', 'death', 224 | 'waste', 'yourself', 'blah', 'fools', 'big', 'fggt', 'homosexuals', 225 | 'looser', 'punk', 'cunts', 'scumbag', 'poor', 'cowards', 'blacks', 226 | 'like', 'life', 'smelly', 'dare', 'foolish', 'fake', 'killing', 227 | 'penises', 'ever', 'rat', 'mad', 'kissing', 228 | 'testicles', 'jerks', 'babies', 'spit', 'puppet', 'naughty', 'ggot', 229 | 'filthy', 'fock', 'quit', 'idiocy', 'damned', 
'vomit', 'violent', 230 | 'pompous', 'illiterate', 'worm', 'terrible', 'dead', 'fire', 'my', 231 | 'licking', 'cancer', 'fascists', 'ill', 'raped', 'smoke', 232 | 'feces', 'ignorance', 'cougar', 'hairy', 'semen', 233 | 'dummy', 'gtfo', 'crook', 'immature', 'site', 234 | 'pigs', 'hypocrisy', 'vandals', 'wikinazi', 'bull', 'tits', 235 | 'hypocrites', 'weed', 'murder', 'stinky', 'laugh', 'awful', 'freaky', 236 | 'lying', 'oh', 'hurt', 'chicken', 'admins', 'nation', 'hermaphrodite', 237 | 'nutcase', 'communist', 'children', 'maggot', 'utter', 'leftist', 238 | 'ban', 'joke'] 239 | 240 | positive_100 = ['thank', 'thanks', 'please', 'redirect', 'talk', 'utc', 241 | 'may', 'best', 'help', 'interested', 'sorry', 'wikiproject', 242 | 'agree', 'welcome', 'consensus', 'there', 'could', 'appreciate', 243 | 'cheers', 'request', 'continue', 244 | 'considered', 'importance', 'editing', 'apologize', 'however', 245 | 'think', 'section', 'sure', 'source', 'sources', 'good', 'article', 246 | 'deletion', 'would', 'title', 'case', 'template', 'review', 247 | 'friend', 'email', 'references', 'contribs', 'where', 'books', 'find', 248 | 'category', 'idea', 'interesting', 'used', 'believe', 'wondering', 249 | 'reverted', 'regards', 'point', 'topic', 'barnstar', 'february', 'can', 250 | 'talkpage', 'removed', 'issue', 'popular', 'did', 'mean', 'issues', 251 | 'early', 'discussion', 'april', 'provide', 'version', 'lead', 'further', 252 | 'correct', 'reply', 'great', 'promise', 'from', 'ask', 'also', 'moved', 253 | 'published', 'well', 'work', 'pov', 'noticed', 'archive', 'continues', 254 | 'unblocked', 'wikipedia', 255 | 'support', 'disagree', 'january', 'apologies', 'reported', 'better', 256 | 'september', 'standard', 'sounds', 'book', 'company', 'which', 'asked', 257 | 'need', 'made', 'michael', 'after', 'into', 'might', 'copy', 258 | 'process', 'perhaps', 'tag', 'armenian', 'not', 'against', 259 | 'added', 'involved', 'listas', 'mentioned', 'done', 'john', 260 | 'given', 'evidence', 'long', 'know', 'learned', 'science', 'often', 261 | 'definitely', 'questions', 'mention', 'seems', 'future', 'edits', 262 | 'reasonable', 'both', 'notice', 'opinions', 'central', 'exactly', 263 | 'list', 'sourced', 'incorrect', 'faith', 'context', 'example', 264 | 'policy', 'recommend', 'under', 'changed', 'about', 265 | 'saying', 'showing', 'laurent', 'image', 'season', 'first', 266 | 'didn', 'details', 'episode', 'thoughts', 'type', 'state', 267 | 'though', 'surely', 'looks', 'link', 'start', 268 | 'experiment', 'examples', 'suggestions', 'number', 'each', 'box', 269 | 'happy', 'explain', 'been', 'paragraph', 'username', 'semi', 'infobox', 'when', 270 | 'december', 'more', 'related', 'report', 'that', 'links', 'responded', 271 | 'regarding', 'requests', 'trust', 'according', 'merge', 'week', 272 | 'french', 'term', 'reference', 'listed', 'question', 'understand', 273 | 'very', 'paper', 'rfc', 'mistake', 'recently', 'convince', 274 | 'simply', 'consider', 'rather', 'chat', 'talking', 'background', 275 | 'seeing', 'latest', 'quality', 'creating', 'major', 'share', 'input', 276 | 'through', 'july', 'west', 'month', 'fair', 'therefore', 'general', 277 | 'thought', 'former', 'states', 'yesterday', 'dispute', 'proposal', 278 | 'unfair', 'fixed', 'smith', 'violations', 'familiar', 'beyond', 279 | 'football', 'themselves', 'luck', 'replied', 'copyright', 280 | 'views', 'stub', 'original', 'appreciated', 'parents', 281 | 'certainly', 'mail', 'fine', 'thinking', 'october', 'letting', 282 | 'proposed', 'follow', 'compromise', 'unless', 
'warring', 'test', 283 | 'wish', 'static', 'usage', 'same', 'message', 'named', 'reconsider', 284 | 'likely'] -------------------------------------------------------------------------------- /modelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "\n", 13 | "import warnings\n", 14 | "warnings.filterwarnings('ignore')\n", 15 | "\n", 16 | "import sys\n", 17 | "sys.path.append('/home/ser/DL/toxic/solution/bin')\n", 18 | "sys.path.append('/home/ser/DL/toxic/solution/')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "Using TensorFlow backend.\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "import datetime\n", 36 | "from imports import *\n", 37 | "from config import *\n", 38 | "from utils import *\n", 39 | "from models import GRU_LSTM_model, CV_predictor, CAPSULE_model, DPCNN_model\n", 40 | "from text_cleaner import TextCleaner\n", 41 | "from contractions import contractions, negative_100, positive_100" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "negative_100 = [i for i in negative_100 if len(i) > 3][:200]\n", 51 | "positive_100 = [i for i in positive_100 if len(i) > 3][:200]\n", 52 | "# negative_100 = sorted(negative_100, key= lambda x: -len(x))\n", 53 | "# positive_100 = sorted(positive_100, key= lambda x: -len(x))\n", 54 | "\n", 55 | "valuable_words = negative_100 + positive_100" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def unify_tokens(comment):\n", 65 | " nl = []\n", 66 | " wl = WordNetLemmatizer().lemmatize\n", 67 | " for word in comment:\n", 68 | " word = wl(wl(word, pos='v'), pos='a')\n", 69 | " # token in original form, exact matching\n", 70 | " in_dict = emb_keys.get(word)\n", 71 | " if in_dict is not None:\n", 72 | " nl.append(word) \n", 73 | " else:\n", 74 | " word = word.lower()\n", 75 | " word = wl(wl(word, pos='v'), pos='a')\n", 76 | " # token lowercased, exact matching\n", 77 | " in_dict = emb_keys.get(word)\n", 78 | " if in_dict is not None:\n", 79 | " nl.append(word)\n", 80 | " else:\n", 81 | " # break if work consist of < 3 symbols as non reliable solution\n", 82 | " if len(word) < 3:\n", 83 | " continue\n", 84 | " # top pos/neg words by LR weights lowercased, partial matching\n", 85 | " for w in valuable_words:\n", 86 | " if w in word:\n", 87 | " word = word.replace(w, '')\n", 88 | " nl.append(w)\n", 89 | " if len(word) < 3:\n", 90 | " continue\n", 91 | " # embedding keys lowercased, partial matching\n", 92 | " for w in emb_sorted:\n", 93 | " if w.lower() in word:\n", 94 | " word = word.replace(w, '')\n", 95 | " nl.append(w.lower())\n", 96 | " if len(word) < 3:\n", 97 | " continue\n", 98 | " # words which were not found in dict gonna be excluded from the comment\n", 99 | " return nl\n", 100 | "\n", 101 | "def process_comment(df):\n", 102 | " df['comment_text'] = df['comment_text'].apply(unify_tokens).values\n", 103 | " return df" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | 
"EMBEDDING_FILE = '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'\n", 113 | "emb_keys = dict(get_emb_dict(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))\n", 114 | "emb_sorted = [i for i in sorted(emb_keys, key=lambda x: -len(x)) if len(i) < 15 and len(i)>2]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "text cleaner processing: 2018-03-19 19:09:42.072307\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "train = pd.read_csv('../data/train.csv.zip').fillna(\"fillna\")\n", 132 | "test = pd.read_csv('../data/test.csv.zip').fillna(\"fillna\")\n", 133 | "submission = pd.read_csv('../data/sample_submission.csv.zip')\n", 134 | "\n", 135 | "print (f'text cleaner processing: {datetime.datetime.now()}')\n", 136 | "tc = TextCleaner(contractions)\n", 137 | "train['comment_text'] = tc.transform(train['comment_text'].fillna('na').values)\n", 138 | "test['comment_text'] = tc.transform(test['comment_text'].fillna('na').values)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "train cleaning: 2018-03-19 19:10:07.279024\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "print (f'train cleaning: {datetime.datetime.now()}')\n", 156 | "train = parallelize_dataframe(train, process_comment)\n", 157 | "print (f'test cleaning: {datetime.datetime.now()}')\n", 158 | "test = parallelize_dataframe(test, process_comment)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "EMBEDDING_FILE = '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'\n", 168 | "embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "max_features = 150000\n", 178 | "maxlen = 150\n", 179 | "embed_size = 300\n", 180 | "\n", 181 | "X_train = train[\"comment_text\"].values\n", 182 | "y_train = train[[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]].values\n", 183 | "X_test = test[\"comment_text\"].values\n", 184 | "\n", 185 | "tokenizer = text.Tokenizer(num_words=max_features, lower=False)\n", 186 | "tokenizer.fit_on_texts(list(X_train) + list(X_test))\n", 187 | "X_train = tokenizer.texts_to_sequences(X_train)\n", 188 | "X_test = tokenizer.texts_to_sequences(X_test)\n", 189 | "x_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n", 190 | "x_test = sequence.pad_sequences(X_test, maxlen=maxlen)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "word_index = tokenizer.word_index\n", 200 | "nb_words = min(max_features, len(word_index))\n", 201 | "embedding_matrix = np.zeros((nb_words, embed_size))\n", 202 | "missed = []\n", 203 | "for word, i in word_index.items():\n", 204 | " if i >= max_features: continue\n", 205 | " embedding_vector = embeddings_index.get(word)\n", 206 | " if embedding_vector is not None: \n", 207 | " embedding_matrix[i] = embedding_vector \n", 208 | " else:\n", 209 | " missed.append(word)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | 
"metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "len(missed)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "missed[:10]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Tests" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 18, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "model = GRU_LSTM_model(CuDNNLSTM, maxlen, max_features, embed_size, embedding_matrix)\n", 244 | "\n", 245 | "batch_size = 128\n", 246 | "epochs = 10\n", 247 | "\n", 248 | "X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)\n", 249 | "RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)\n", 250 | "model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),\n", 251 | " callbacks=[RocAuc], verbose=1)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 58, 257 | "metadata": { 258 | "scrolled": true 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "Train on 143613 samples, validate on 15958 samples\n", 266 | "Epoch 1/15\n", 267 | "143613/143613 [==============================] - 20s 140us/step - loss: 0.0733 - acc: 0.9742 - val_loss: 0.0500 - val_acc: 0.9814\n", 268 | "\n", 269 | " ROC-AUC - epoch: 1 - score: 0.971133 \n", 270 | "\n", 271 | "Epoch 2/15\n", 272 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0495 - acc: 0.9815 - val_loss: 0.0463 - val_acc: 0.9825\n", 273 | "\n", 274 | " ROC-AUC - epoch: 2 - score: 0.979113 \n", 275 | "\n", 276 | "Epoch 3/15\n", 277 | "143613/143613 [==============================] - 18s 122us/step - loss: 0.0460 - acc: 0.9824 - val_loss: 0.0473 - val_acc: 0.9824\n", 278 | "\n", 279 | " ROC-AUC - epoch: 3 - score: 0.981239 \n", 280 | "\n", 281 | "Epoch 4/15\n", 282 | "143613/143613 [==============================] - 18s 123us/step - loss: 0.0438 - acc: 0.9831 - val_loss: 0.0431 - val_acc: 0.9835\n", 283 | "\n", 284 | " ROC-AUC - epoch: 4 - score: 0.983787 \n", 285 | "\n", 286 | "Epoch 5/15\n", 287 | "143613/143613 [==============================] - 17s 122us/step - loss: 0.0413 - acc: 0.9839 - val_loss: 0.0430 - val_acc: 0.9834\n", 288 | "\n", 289 | " ROC-AUC - epoch: 5 - score: 0.984971 \n", 290 | "\n", 291 | "Epoch 6/15\n", 292 | "143613/143613 [==============================] - 17s 118us/step - loss: 0.0397 - acc: 0.9843 - val_loss: 0.0438 - val_acc: 0.9837\n", 293 | "\n", 294 | " ROC-AUC - epoch: 6 - score: 0.984433 \n", 295 | "\n", 296 | "Epoch 7/15\n", 297 | "143613/143613 [==============================] - 18s 122us/step - loss: 0.0379 - acc: 0.9848 - val_loss: 0.0425 - val_acc: 0.9830\n", 298 | "\n", 299 | " ROC-AUC - epoch: 7 - score: 0.986733 \n", 300 | "\n", 301 | "Epoch 8/15\n", 302 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0365 - acc: 0.9852 - val_loss: 0.0426 - val_acc: 0.9838\n", 303 | "\n", 304 | " ROC-AUC - epoch: 8 - score: 0.985978 \n", 305 | "\n", 306 | "Epoch 9/15\n", 307 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0350 - acc: 0.9857 - val_loss: 0.0429 - val_acc: 0.9837\n", 308 | "\n", 309 | " ROC-AUC - epoch: 9 - score: 0.987321 \n", 310 | "\n", 311 | "Epoch 10/15\n", 312 | "143613/143613 [==============================] - 17s 121us/step - loss: 0.0337 - acc: 0.9862 - 
val_loss: 0.0473 - val_acc: 0.9807\n", 313 | "\n", 314 | " ROC-AUC - epoch: 10 - score: 0.987668 \n", 315 | "\n", 316 | "Epoch 11/15\n", 317 | "143613/143613 [==============================] - 17s 121us/step - loss: 0.0324 - acc: 0.9866 - val_loss: 0.0475 - val_acc: 0.9832\n", 318 | "\n", 319 | " ROC-AUC - epoch: 11 - score: 0.986771 \n", 320 | "\n", 321 | "Epoch 12/15\n", 322 | "143613/143613 [==============================] - 17s 121us/step - loss: 0.0315 - acc: 0.9869 - val_loss: 0.0439 - val_acc: 0.9832\n", 323 | "\n", 324 | " ROC-AUC - epoch: 12 - score: 0.987882 \n", 325 | "\n", 326 | "Epoch 13/15\n", 327 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0303 - acc: 0.9874 - val_loss: 0.0454 - val_acc: 0.9823\n", 328 | "\n", 329 | " ROC-AUC - epoch: 13 - score: 0.987454 \n", 330 | "\n", 331 | "Epoch 14/15\n", 332 | "143613/143613 [==============================] - 18s 122us/step - loss: 0.0299 - acc: 0.9876 - val_loss: 0.0456 - val_acc: 0.9834\n", 333 | "\n", 334 | " ROC-AUC - epoch: 14 - score: 0.987154 \n", 335 | "\n", 336 | "Epoch 15/15\n", 337 | "143613/143613 [==============================] - 17s 120us/step - loss: 0.0287 - acc: 0.9881 - val_loss: 0.0516 - val_acc: 0.9828\n", 338 | "\n", 339 | " ROC-AUC - epoch: 15 - score: 0.985625 \n", 340 | "\n" 341 | ] 342 | }, 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "" 347 | ] 348 | }, 349 | "execution_count": 58, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "dpcnn_kwargs = {\n", 356 | " 'maxlen': maxlen,\n", 357 | " 'max_features': max_features,\n", 358 | " 'embed_size': embed_size,\n", 359 | " 'embedding_matrix': embedding_matrix,\n", 360 | " 'spatial_dropout': 0.25,\n", 361 | " 'filter_nr': 64,\n", 362 | " 'filter_size': 3, \n", 363 | " 'max_pool_size': 3, \n", 364 | " 'max_pool_strides': 2,\n", 365 | " 'dense_nr': 256,\n", 366 | " 'dense_dropout': 0.5\n", 367 | "}\n", 368 | "\n", 369 | "model = DPCNN_model(**dpcnn_kwargs)\n", 370 | "\n", 371 | "batch_size = 128\n", 372 | "epochs = 15\n", 373 | "\n", 374 | "X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)\n", 375 | "RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)\n", 376 | "model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),\n", 377 | " callbacks=[RocAuc], verbose=1)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### Predictions" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "#### DPCNN" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "scrolled": true 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "dpcnn_kwargs = {\n", 403 | " 'maxlen': maxlen,\n", 404 | " 'max_features': max_features,\n", 405 | " 'embed_size': embed_size,\n", 406 | " 'embedding_matrix': embedding_matrix,\n", 407 | " 'spatial_dropout': 0.25,\n", 408 | " 'filter_nr': 64,\n", 409 | " 'filter_size': 3, \n", 410 | " 'max_pool_size': 3, \n", 411 | " 'max_pool_strides': 2,\n", 412 | " 'dense_nr': 256,\n", 413 | " 'dense_dropout': 0.5\n", 414 | "}\n", 415 | "\n", 416 | "batch_size = 128\n", 417 | "n_splits = 10\n", 418 | "epochs = range(10)\n", 419 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 420 | "\n", 421 | "cv = CV_predictor(DPCNN_model, x_train, y_train, x_test, \n", 422 | " 
n_splits, batch_size, epochs, list_classes, dpcnn_kwargs)\n", 423 | "cv.predict()\n", 424 | "\n", 425 | "train_p = cv.train_predictions\n", 426 | "test_p = cv.test_predictions\n", 427 | "test_p.index = test['id']\n", 428 | "\n", 429 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_dpcnn.csv', index=False)\n", 430 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_dpcnn.csv', index=False)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "#### GRU" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "scrolled": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "gru_kwargs = {\n", 449 | " 'CuDNN': CuDNNGRU,\n", 450 | " 'maxlen': maxlen, \n", 451 | " 'max_features': max_features, \n", 452 | " 'embed_size': embed_size, \n", 453 | " 'embedding_matrix' : embedding_matrix\n", 454 | "}\n", 455 | "\n", 456 | "batch_size = 128\n", 457 | "n_splits = 10\n", 458 | "epochs = range(4)\n", 459 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 460 | "\n", 461 | "cv = CV_predictor(GRU_LSTM_model, x_train, y_train, x_test, \n", 462 | " n_splits, batch_size, epochs, list_classes, gru_kwargs)\n", 463 | "cv.predict()\n", 464 | "\n", 465 | "train_p = cv.train_predictions\n", 466 | "test_p = cv.test_predictions\n", 467 | "test_p.index = test['id']\n", 468 | "\n", 469 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_gru_lem_low.csv', index=False)\n", 470 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_gru_lem_low.csv', index=False)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "#### LSTM" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "scrolled": false 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "gru_kwargs = {\n", 489 | " 'CuDNN': CuDNNLSTM, \n", 490 | " 'maxlen': maxlen, \n", 491 | " 'max_features': max_features, \n", 492 | " 'embed_size': embed_size, \n", 493 | " 'embedding_matrix' : embedding_matrix\n", 494 | "}\n", 495 | "\n", 496 | "batch_size = 128\n", 497 | "n_splits = 10\n", 498 | "epochs = range(4)\n", 499 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 500 | "\n", 501 | "cv = CV_predictor(GRU_LSTM_model, x_train, y_train, x_test, \n", 502 | " n_splits, batch_size, epochs, list_classes, gru_kwargs)\n", 503 | "cv.predict()\n", 504 | "\n", 505 | "train_p = cv.train_predictions\n", 506 | "test_p = cv.test_predictions\n", 507 | "test_p.index = test['id']\n", 508 | "\n", 509 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_lstm_lem_low.csv', index=False)\n", 510 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_lstm_lem_low.csv', index=False)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "#### Capsule" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": { 524 | "scrolled": false 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "capsule_kwargs = {\n", 529 | " 'maxlen': maxlen, \n", 530 | " 'max_features': max_features, \n", 531 | " 'embed_size': embed_size, \n", 532 | " 'embedding_matrix' : embedding_matrix, \n", 533 | " 'rate_drop_dense': 0.3,\n", 534 | " 'Num_capsule': 10, \n", 535 | " 'Dim_capsule': 16, \n", 536 | " 'Routings': 5,\n", 537 | " 
'gru_len': 128\n", 538 | "}\n", 539 | "\n", 540 | "batch_size = 128\n", 541 | "n_splits = 10\n", 542 | "epochs = range(4)\n", 543 | "list_classes = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", 544 | "\n", 545 | "cv = CV_predictor(CAPSULE_model, x_train, y_train, x_test, \n", 546 | " n_splits, batch_size, epochs, list_classes, capsule_kwargs)\n", 547 | "cv.predict()\n", 548 | "\n", 549 | "train_p = cv.train_predictions\n", 550 | "test_p = cv.test_predictions\n", 551 | "test_p.index = test['id']\n", 552 | "\n", 553 | "train_p.to_csv('/home/ser/DL/toxic/train_predictions/f_capsule_lem_low.csv', index=False)\n", 554 | "test_p.reset_index().to_csv('/home/ser/DL/toxic/test_predictions/f_capsule_lem_low.csv', index=False)" 555 | ] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.6.3" 575 | }, 576 | "latex_envs": { 577 | "LaTeX_envs_menu_present": true, 578 | "autoclose": false, 579 | "autocomplete": true, 580 | "bibliofile": "biblio.bib", 581 | "cite_by": "apalike", 582 | "current_citInitial": 1, 583 | "eqLabelWithNumbers": true, 584 | "eqNumInitial": 1, 585 | "hotkeys": { 586 | "equation": "Ctrl-E", 587 | "itemize": "Ctrl-I" 588 | }, 589 | "labels_anchors": false, 590 | "latex_user_defs": false, 591 | "report_style_numbering": false, 592 | "user_envs_cfg": false 593 | }, 594 | "toc": { 595 | "nav_menu": {}, 596 | "number_sections": true, 597 | "sideBar": true, 598 | "skip_h1_title": false, 599 | "title_cell": "Table of Contents", 600 | "title_sidebar": "Contents", 601 | "toc_cell": false, 602 | "toc_position": {}, 603 | "toc_section_display": true, 604 | "toc_window_display": false 605 | } 606 | }, 607 | "nbformat": 4, 608 | "nbformat_minor": 2 609 | } 610 | --------------------------------------------------------------------------------
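A note on bin/nb_features.py: nothing in modelling.ipynb above actually uses NBFeaturer, so it is easy to miss how it is meant to be wired up. The following is a minimal sketch, not code from this repository: it assumes the standard competition train.csv layout (a comment_text column plus the six label columns used throughout the notebook), and the tf-idf settings, alpha and C values are illustrative guesses that only show where NBFeaturer would sit in a scikit-learn pipeline.

# Hypothetical usage sketch for bin/nb_features.py (not taken from this repository).
# Tf-idf features are rescaled by NB log-count ratios, then fed to a per-label
# logistic regression; all hyperparameters below are illustrative.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

from nb_features import NBFeaturer  # assumes bin/ is on sys.path, as in modelling.ipynb

train = pd.read_csv('../data/train.csv.zip').fillna('fillna')
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

for label in list_classes:
    y = train[label].values
    # NBFeaturer needs y at fit time, so it lives inside the pipeline: during
    # cross-validation it only ever sees the training fold's labels.
    pipe = make_pipeline(
        TfidfVectorizer(ngram_range=(1, 2), min_df=3, sublinear_tf=True),
        NBFeaturer(alpha=1.0),
        LogisticRegression(C=4.0),
    )
    score = cross_val_score(pipe, train['comment_text'], y, cv=3, scoring='roc_auc').mean()
    print(f'{label}: CV ROC-AUC = {score:.4f}')

Keeping NBFeaturer inside the pipeline matters because its fit uses the labels; fitting it once on the full training set and then cross-validating would leak validation labels into the features.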