├── Boosted Late-Fusion.ipynb
├── LICENSE
├── README.md
├── SEResnext50_train_predict.ipynb
├── camembert_train_predict.ipynb
├── flaubert_train_predict.ipynb
├── multi-modal_concatenate_fusion.ipynb
└── multi_modal_addition_fusion.ipynb
/Boosted Late-Fusion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
8 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
9 | },
10 | "outputs": [],
11 | "source": [
12 | "import pandas as pd\n",
13 | "import numpy as np\n",
14 | "from tqdm import tqdm\n",
15 | "tqdm.pandas()\n",
16 | "\n",
17 | "import os, time, datetime\n",
18 | "from sklearn.model_selection import train_test_split\n",
19 | "from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc\n",
20 | "import lightgbm as lgb\n",
21 | "import xgboost as xgb"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "def format_time(elapsed):\n",
31 | " '''\n",
32 | " Takes a time in seconds and returns a string hh:mm:ss\n",
33 | " '''\n",
34 | " # Round to the nearest second.\n",
35 | " elapsed_rounded = int(round((elapsed)))\n",
36 | " \n",
37 | " # Format as hh:mm:ss\n",
38 | " return str(datetime.timedelta(seconds=elapsed_rounded))\n",
39 | "\n",
40 | "class SigirPreprocess():\n",
41 | " \n",
42 | " def __init__(self, text_data_path):\n",
43 | " self.text_data_path = text_data_path\n",
44 | " self.train = None\n",
45 | " self.dict_code_to_id = {}\n",
46 | " self.dict_id_to_code = {}\n",
47 | " self.list_tags = {}\n",
48 | " self.sentences = []\n",
49 | " self.labels = []\n",
50 | " self.text_col = None\n",
51 | " self.X_test = None\n",
52 | " \n",
53 | " def prepare_data(self ):\n",
54 | " catalog_eng= pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n",
55 | " X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n",
56 | " Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n",
57 | " \n",
58 | " self.list_tags = list(Y_train['Prdtypecode'].unique())\n",
59 | " for i,tag in enumerate(self.list_tags):\n",
60 | " self.dict_code_to_id[tag] = i \n",
61 | " self.dict_id_to_code[i]=tag\n",
62 | " print(self.dict_code_to_id)\n",
63 | " \n",
64 | " Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n",
65 | " train=pd.merge(left=X_train,right=Y_train,\n",
66 | " how='left',left_on=['Integer_id','Image_id','Product_id'],\n",
67 | " right_on=['Integer_id','Image_id','Product_id'])\n",
68 | " prod_map=pd.Series(catalog_eng['Top level category'].values,\n",
69 | " index=catalog_eng['Prdtypecode']).to_dict()\n",
70 | "\n",
71 | " train['product'] = train['Prdtypecode'].map(prod_map)\n",
72 | " train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n",
73 | " train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n",
74 | " train['title_desc_len']=train['title_len'] + train['desc_len']\n",
75 | " train.loc[train['Description'].isnull(), 'Description'] = \" \"\n",
76 | " train['title_desc'] = train['Title'] + \" \" + train['Description']\n",
77 | " \n",
78 | " self.train = train\n",
79 | " \n",
80 | " def get_sentences(self, text_col, remove_null_rows=False):\n",
81 | " self.text_col = text_col\n",
82 | " if remove_null_rows==True:\n",
83 | " new_train = self.train[self.train[text_col].notnull()]\n",
84 | "\n",
85 | " else:\n",
86 | " new_train = self.train.copy()\n",
87 | " \n",
88 | " self.sentences = new_train[text_col].values\n",
89 | " self.labels = new_train['labels'].values\n",
90 | " \n",
91 | " def prepare_test(self, text_col, test_data_path, phase=1):\n",
92 | " X_test=pd.read_csv(test_data_path+f\"data/x_test_task1_phase{phase}.tsv\",sep=\"\\t\")\n",
93 | " X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n",
94 | " X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n",
95 | " self.X_test = X_test\n",
96 | " self.test_sentences = X_test[text_col].values\n",
97 | " "
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "text_col = 'title_desc'\n",
107 | "val_size = 0.1\n",
108 | "random_state=2020\n",
109 | "num_class = 27\n",
110 | "do_gridsearch = False"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "kwargs = {'add_logits':['cam', 'fla']}\n",
120 | "\n",
121 | "\n",
122 | "cam_path = '/../input/camembert-vec-256m768-10ep/'\n",
123 | "flau_path = '/../input/flaubertlogits2107/' \n",
124 | "res_path = '/../input/resnextfinal/'\n",
125 | "cms_path = '/../input/crossmodal-v0/'\n",
126 | "vca_path = '/../input/vec-concat-9093/'\n",
127 | "vca_path_phase2 = '/../input/predictions-test-phase2-vec-fusion/'\n",
128 | "aem_path = '/../input/addition-ensemble-latest/'\n",
129 | "\n",
130 | "\n",
131 | "val_logits_path = {'cam':cam_path + 'validation_set_softmax_logits.npy',\n",
132 | " 'fla':flau_path + 'validation_set_softmax_logits.npy',\n",
133 | " 'res':res_path + 'Valid_resnext50_32x4d_phase1_softmax_logits.npy',\n",
134 | " 'vca':vca_path + 'softmax_logits_val_9093.npy',\n",
135 | " 'aem':aem_path + 'softmax_logits_val_add.npy'}\n",
136 | "\n",
137 | "test_logits_path_phase1 = {'cam':cam_path+f'X_test_phase1_softmax_logits.npy',\n",
138 | " 'fla':flau_path + f'X_test_phase1_softmax_logits.npy', \n",
139 | " 'res':res_path + f'Test_resnext50_32x4d_phase1_softmax_logits.npy',\n",
140 | " 'vca':vca_path + f'softmax_logits_test_9093.npy'}\n",
141 | "\n",
142 | "test_logits_path_phase2 = {'cam':cam_path+f'X_test_phase2_softmax_logits.npy',\n",
143 | " 'fla':flau_path + f'X_test_phase2_softmax_logits.npy', \n",
144 | " 'res':res_path + f'Test_resnext50_32x4d_phase2_softmax_logits.npy',\n",
145 | " 'vca':vca_path_phase2 + f'softmax_logits_test_phase2_9093.npy'}\n",
146 | " \n",
147 | "\n"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "## Get validation dataset from original train dataset\n",
157 | "Preprocess = SigirPreprocess(\"/../input/textphase1/\")\n",
158 | "Preprocess.prepare_data()\n",
159 | "Preprocess.get_sentences(text_col, True)\n",
160 | "\n",
161 | "full_data = Preprocess.train\n",
162 | "labels = Preprocess.labels\n",
163 | "index = full_data.Integer_id\n",
164 | "\n",
165 | "\n",
166 | "tr_index, val_index, tr_labels, val_labels = train_test_split(index, labels,\n",
167 | " stratify=labels,\n",
168 | " random_state=random_state, \n",
169 | " test_size=val_size)\n",
170 | "\n",
171 | "train_data = full_data.loc[tr_index, :]\n",
172 | "train_data.reset_index(inplace=True, drop=True)\n",
173 | "val_data = full_data.loc[val_index, :]\n",
174 | "val_data.reset_index(inplace=True, drop=True)\n",
175 | "\n",
176 | "full_data.loc[val_index, 'sample'] = 'val'\n",
177 | "full_data['sample'].fillna('train', inplace=True)"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "def preparelogits_df(logit_paths, df=None, val_labels=None, **kwargs):\n",
187 | " ### Prepare and combine Logits data with original validation dataset\n",
188 | " logits_dict = {}\n",
189 | " dfs_dict = {}\n",
190 | " for key, logit_path in logit_paths.items():\n",
191 | " logits_dict[key] = np.load(logit_path)\n",
192 | " \n",
193 | " dfs_dict[key] = pd.DataFrame(logits_dict[key], \n",
194 | " columns=[key + \"_\" + str(i) for i in range(1,28)])\n",
195 | " print(\"Shape of logit arrays: {}\", logits_dict[key].shape)\n",
196 | " \n",
197 | " if kwargs['add_logits']:\n",
198 | " if len(kwargs['add_logits'])>0:\n",
199 | " add_str = '_'.join(kwargs['add_logits'])\n",
200 | " logits_dict[add_str] = logits_dict[kwargs['add_logits'][0]]\n",
201 | " for k in kwargs['add_logits'][1:]:\n",
202 | " logits_dict[add_str] += logits_dict[k]\n",
203 | " logits_dict[add_str] = logits_dict[add_str]/len(kwargs['add_logits'])\n",
204 | " dfs_dict[add_str] = pd.DataFrame(logits_dict[add_str], \n",
205 | " columns=[add_str + \"_\" + str(i) for i in range(1,28)])\n",
206 | " print(\"Shape of logit arrays: {}\", logits_dict[add_str].shape)\n",
207 | "\n",
208 | "\n",
209 | " \n",
210 | " if type(val_labels) == np.ndarray:\n",
211 | " for key,logits in logits_dict.items():\n",
212 | " print(\"\"\"Validation F1 scores for {} logits: {} \"\"\".format(key, \n",
213 | " f1_score(val_labels, np.argmax(logits, axis=1), average='macro')))\n",
214 | " \n",
215 | " \n",
216 | "\n",
217 | " df = pd.concat([df] + list(dfs_dict.values()), axis=1)\n",
218 | " \n",
219 | " return df"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "val_data = preparelogits_df(val_logits_path, df=val_data, \n",
229 | " val_labels=val_labels, **kwargs)"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "# Model Data Prep"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "df_log = val_data.copy()\n",
246 | "\n",
247 | "probas_cols = [\"fla_\" + str(i) for i in range(1,28)] + [\"cam_\" + str(i) for i in range(1,28)] +\\\n",
248 | "[\"res_\" + str(i) for i in range(1,28)] \\\n",
249 | "+ [\"vca_\" + str(i) for i in range(1,28)] \\\n",
250 | "\n",
251 | "X = df_log[probas_cols]\n",
252 | "y = df_log['labels'].values\n",
253 | "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=random_state)\n"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "from scipy.stats import randint as sp_randint\n",
263 | "from scipy.stats import uniform as sp_uniform\n",
264 | "\n",
265 | "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV\n",
266 | "n_HP_points_to_test = 100\n",
267 | "\n",
268 | "\n",
269 | "param_test ={'num_leaves': sp_randint(6, 50), \n",
270 | " 'min_child_samples': sp_randint(100, 500), \n",
271 | " 'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],\n",
272 | " 'subsample': sp_uniform(loc=0.2, scale=0.8), \n",
273 | " 'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),\n",
274 | " 'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],\n",
275 | " 'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],\n",
276 | "# \"bagging_fraction\" : [0.5, 0.6, 0.7, 0.8, 0.9],\n",
277 | "# \"feature_fraction\":[0.5, 0.6, 0.7, 0.8, 0.9]\n",
278 | " }\n",
279 | "\n",
280 | "\n",
281 | "\n",
282 | "\n",
283 | "fit_params={\n",
284 | " \"early_stopping_rounds\":100, \n",
285 | " \"eval_metric\" : 'multi_logloss', \n",
286 | " \"eval_set\" : [(X_test,y_test)],\n",
287 | " 'eval_names': ['valid'],\n",
288 | " #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],\n",
289 | " 'verbose': 100,\n",
290 | " 'categorical_feature': 'auto'}\n",
291 | "\n",
292 | "\n",
293 | "clf = lgb.LGBMClassifier(num_iteration=1000, max_depth=-1, random_state=314, silent=True,\n",
294 | " metric='multi_logloss', n_jobs=4, early_stopping_rounds=100,\n",
295 | " num_class=num_class, objective= \"multiclass\")\n",
296 | "gs = RandomizedSearchCV(\n",
297 | " estimator=clf, param_distributions=param_test, \n",
298 | " n_iter=n_HP_points_to_test,\n",
299 | " cv=3,\n",
300 | " refit=True,\n",
301 | " random_state=314,\n",
302 | " verbose=True)\n",
303 | "\n",
304 | "if do_gridsearch==True:\n",
305 | " gs.fit(X_train, y_train, **fit_params)\n",
306 | " print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "# opt_parameters = gs.best_params_\n",
316 | "opt_parameters = {'colsample_bytree': 0.5284213741879101, 'min_child_samples': 125, \n",
317 | " 'min_child_weight': 10.0, 'num_leaves': 22, \n",
318 | " 'reg_alpha': 0.1, 'reg_lambda': 20, 'subsample': 0.3080033455431848} \n"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "# Model Training"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "### Run lightgbm to get weights for different class logits\n",
335 | "\n",
336 | "t0 = time.time()\n",
337 | "\n",
338 | "model_met = 'fit' #'xgb'#'train' #fit\n",
339 | "\n",
340 | "params = {\n",
341 | " \"objective\" : \"multiclass\",\n",
342 | " \"num_class\" : num_class,\n",
343 | " \"num_leaves\" : 60,\n",
344 | " \"max_depth\": -1,\n",
345 | " \"learning_rate\" : 0.01,\n",
346 | " \"bagging_fraction\" : 0.9, # subsample\n",
347 | " \"feature_fraction\" : 0.9, # colsample_bytree\n",
348 | " \"bagging_freq\" : 5, # subsample_freq\n",
349 | " \"bagging_seed\" : 2018,\n",
350 | " \"verbosity\" : -1 }\n",
351 | "\n",
352 | "lgtrain, lgval = lgb.Dataset(X_train, y_train), lgb.Dataset(X_test, y_test)\n",
353 | "\n",
354 | "if model_met == 'train':\n",
355 | " params.update(opt_parameters)\n",
356 | " params.update(fit_params)\n",
357 | " \n",
358 | " lgbmodel = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval], \n",
359 | " num_iterations = 1000, metric= 'multi_logloss')\n",
360 | " train_logits = lgbmodel.predict(X_train) \n",
361 | " test_logits = lgbmodel.predict(X_test)\n",
362 | "\n",
363 | " train_pred = np.argmax(train_logits, axis=1) \n",
364 | " test_pred = np.argmax(test_logits, axis=1) \n",
365 | "elif model_met == 'xgb':\n",
366 | " dtrain = xgb.DMatrix(X_train, label=y_train)\n",
367 | " dtrain.save_binary('xgb_train.buffer')\n",
368 | " dtest = xgb.DMatrix(X_test, label=y_test)\n",
369 | " \n",
370 | " num_round = 200\n",
371 | " xgb_param = {'max_depth': 5, 'eta': 0.1, 'seed':2020, 'verbosity':1,\n",
372 | " 'objective': 'multi:softmax', 'num_class':num_class}\n",
373 | " xgb_param['nthread'] = 4\n",
374 | " xgb_param['eval_metric'] = 'mlogloss'\n",
375 | " evallist = [(dtest, 'eval'), (dtrain, 'train')]\n",
376 | " bst = xgb.train(xgb_param, dtrain, num_round, evallist\n",
377 | " , early_stopping_rounds=10\n",
378 | " )\n",
379 | " \n",
380 | " train_logits = bst.predict(xgb.DMatrix(X_train), ntree_limit=bst.best_ntree_limit) \n",
381 | " test_logits = bst.predict(xgb.DMatrix(X_test), ntree_limit=bst.best_ntree_limit)\n",
382 | "\n",
383 | " train_pred = train_logits \n",
384 | " test_pred = test_logits \n",
385 | " \n",
386 | "else:\n",
387 | "\n",
388 | " lgbmodel = lgb.LGBMClassifier(**clf.get_params())\n",
389 | " #set optimal parameters\n",
390 | " lgbmodel.set_params(**opt_parameters)\n",
391 | " lgbmodel.fit(X_train, y_train, **fit_params)\n",
392 | " \n",
393 | " train_logits = lgbmodel.predict(X_train) \n",
394 | " test_logits = lgbmodel.predict(X_test)\n",
395 | "\n",
396 | " train_pred = train_logits \n",
397 | " test_pred = test_logits \n",
398 | " \n",
399 | "print(\"Validation F1: {} and Training F1: {} \".format(\n",
400 | " f1_score(y_test, test_pred, average='macro'), \n",
401 | " f1_score(y_train, train_pred, average='macro')))\n",
402 | "\n",
403 | "if model_met == 'train':\n",
404 | " feat_imp = pd.DataFrame({'feature':probas_cols, \n",
405 | " 'logit_kind': [i.split('_')[0] for i in probas_cols],\n",
406 | " 'imp':lgbmodel.feature_importance()/sum(lgbmodel.feature_importance())})\n",
407 | "\n",
408 | "\n",
409 | " lgbmodel.save_model('lgb_classifier_81feats.txt', num_iteration=lgbmodel.best_iteration) \n",
410 | " print(\"\"\"Feature Importances by logits group: \n",
411 | " \"\"\", feat_imp.groupby(['logit_kind'])['imp'].sum())\n",
412 | "else:\n",
413 | " feat_imp = pd.DataFrame({'feature':probas_cols, \n",
414 | " 'logit_kind': [i.split('_')[0] for i in probas_cols],\n",
415 | " 'imp':lgbmodel.feature_importances_/sum(lgbmodel.feature_importances_)})\n",
416 | "\n",
417 | " print(\"\"\"Feature Importances by logits group: \n",
418 | " \"\"\", feat_imp.groupby(['logit_kind'])['imp'].sum())\n",
419 | " \n",
420 | "import shap\n",
421 | "explainer = shap.TreeExplainer(lgbmodel)\n",
422 | "shap_values = explainer.shap_values(X)\n",
423 | "print(\"Time Elapsed: {:}.\".format(format_time(time.time() - t0)))"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "for n, path in enumerate(['/kaggle/input/textphase1/', \n",
433 | " '/kaggle/input/testphase2/']):\n",
434 | " phase = n+1\n",
435 | " if phase==1:\n",
436 | " test_logits_path = test_logits_path_phase1\n",
437 | " else:\n",
438 | " test_logits_path = test_logits_path_phase2\n",
439 | " Preprocess.prepare_test(text_col, path, phase)\n",
440 | " X_test_phase1= Preprocess.X_test\n",
441 | "\n",
442 | " test_phase1 = preparelogits_df(test_logits_path,\n",
443 | " df=X_test_phase1, val_labels=None, **kwargs)\n",
444 | " \n",
445 | " phase1_logits = lgbmodel.predict(test_phase1[probas_cols].values) \n",
446 | " if model_met == 'train':\n",
447 | " predictions = np.argmax(phase1_logits, axis=1) \n",
448 | " elif model_met == 'xgb':\n",
449 | " phase1_logits = bst.predict(xgb.DMatrix(test_phase1[probas_cols]), \n",
450 | " ntree_limit=bst.best_ntree_limit) \n",
451 | " predictions = phase1_logits\n",
452 | " else:\n",
453 | " predictions = phase1_logits\n",
454 | " X_test_phase1['prediction_model']= predictions\n",
455 | " X_test_phase1['Prdtypecode']=X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)\n",
456 | " print(X_test_phase1['Prdtypecode'].value_counts())\n",
457 | " X_test_phase1=X_test_phase1.drop(['prediction_model','Title','Description'],axis=1)\n",
458 | " X_test_phase1.to_csv(f'y_test_task1_phase{phase}_pred_.tsv',sep='\\t',index=False)"
459 | ]
460 | }
461 | ],
462 | "metadata": {
463 | "kernelspec": {
464 | "display_name": "Python 3",
465 | "language": "python",
466 | "name": "python3"
467 | },
468 | "language_info": {
469 | "codemirror_mode": {
470 | "name": "ipython",
471 | "version": 3
472 | },
473 | "file_extension": ".py",
474 | "mimetype": "text/x-python",
475 | "name": "python",
476 | "nbconvert_exporter": "python",
477 | "pygments_lexer": "ipython3",
478 | "version": "3.7.7"
479 | },
480 | "toc": {
481 | "base_numbering": 1,
482 | "nav_menu": {},
483 | "number_sections": true,
484 | "sideBar": true,
485 | "skip_h1_title": false,
486 | "title_cell": "Table of Contents",
487 | "title_sidebar": "Contents",
488 | "toc_cell": false,
489 | "toc_position": {},
490 | "toc_section_display": true,
491 | "toc_window_display": false
492 | }
493 | },
494 | "nbformat": 4,
495 | "nbformat_minor": 4
496 | }
497 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 depshad
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Learning Framework for Multi-modal Product Classification
2 | Code repository for Rakuten Data Challenge : Multimodal Product Classification and Retrieval.
3 |
4 | Team Transformer's solution : Deep Multi-level Boosted Fusion Learning Framework for Multi-modal Product Classification
5 |
6 | Paper Link : https://sigir-ecom.github.io/ecom20DCPapers/SIGIR_eCom20_DC_paper_8.pdf
7 |
8 |
9 | Data challenge link : https://sigir-ecom.github.io/data-task.html
10 |
11 | ## Abstract
12 |
13 | In this paper, we present our approach for the 'Multimodal Product
14 | Classification' task as a part of the 2020 SIGIR Workshop On eCommerce (ECOM20). The specific objective of this task is to build and
15 | submit systems that classify previously unseen products into their
16 | corresponding product type codes. We propose a deep Multi-Modal
17 | Multi-level Boosted Fusion Learning Framework used to categorize
18 | large-scale multi-modal (text and image) product data into product
19 | type codes. Our proposed final methodology achieved a macro F1-
20 | score of 91.94 on the phase 1 test dataset, which was the top-scoring
21 | submission, and placed third on the scoreboard for the phase 2 test
22 | dataset with a macro F1-score of 90.53.
23 |
24 | ## Code Usage
25 |
26 | ### Unimodal Model Training and Prediction Scripts
27 |
28 | 1. SEResnext50_train_predict.ipynb : Fine tune the pre-trained SEResnext50 model on Rakuten images
29 |
30 | 2. camembert_train_predict.ipynb : Fine tune the pre-trained Camembert model on French text; Custom Camembert model with vector output (used later for feature fusion)
31 |
32 | 3. flaubert_train_predict.ipynb : Fine tune the pre-trained Flaubert model on French text; Custom Flaubert model with vector output (used later for feature fusion)
33 |
34 | ### Multimodal Feature Level Fusion
35 | 1. multi-modal_concatenate_fusion.ipynb : Concatenate the extracted features and train an NN module on top
36 |
37 | ### Probability Level Fusion
38 | 1. Boosted Late-Fusion.ipynb : Train a LightGBM model with class probabilities as input
39 |
40 |
41 |
42 |
Multi-modal Joint Representation Learning
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | Late Fusion Model
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/camembert_train_predict.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
8 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
9 | },
10 | "outputs": [],
11 | "source": [
12 | "import os, time, datetime\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "from tqdm import tqdm\n",
16 | "import random\n",
17 | "import logging\n",
18 | "tqdm.pandas()\n",
19 | "import seaborn as sns\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "\n",
22 | "#NN Packages\n",
23 | "import torch\n",
24 | "import torch.nn as nn\n",
25 | "from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler\n",
26 | "\n",
27 | "logger = logging.getLogger(__name__)\n",
28 | "\n",
29 | "\n",
30 | "if torch.cuda.is_available(): \n",
31 | "\n",
32 | " # Tell PyTorch to use the GPU. \n",
33 | " device = torch.device(\"cuda\")\n",
34 | "\n",
35 | " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
36 | "\n",
37 | " print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
38 | "\n",
39 | "# If not...\n",
40 | "else:\n",
41 | " print('No GPU available, using the CPU instead.')\n",
42 | " device = torch.device(\"cpu\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "def format_time(elapsed):\n",
52 | " '''\n",
53 | " Takes a time in seconds and returns a string hh:mm:ss\n",
54 | " '''\n",
55 | " # Round to the nearest second.\n",
56 | " elapsed_rounded = int(round((elapsed)))\n",
57 | " \n",
58 | " # Format as hh:mm:ss\n",
59 | " return str(datetime.timedelta(seconds=elapsed_rounded))\n",
60 | "\n",
61 | "class SigirPreprocess():\n",
62 | " \n",
63 | " def __init__(self, text_data_path):\n",
64 | " self.text_data_path = text_data_path\n",
65 | " self.train = None\n",
66 | " self.dict_code_to_id = {}\n",
67 | " self.dict_id_to_code = {}\n",
68 | " self.list_tags = {}\n",
69 | " self.sentences = []\n",
70 | " self.labels = []\n",
71 | " self.text_col = None\n",
72 | " self.X_test = None\n",
73 | " def prepare_data(self ):\n",
74 | " catalog_eng= pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n",
75 | " X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n",
76 | " Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n",
77 | " \n",
78 | " self.list_tags = list(Y_train['Prdtypecode'].unique())\n",
79 | " for i,tag in enumerate(self.list_tags):\n",
80 | " self.dict_code_to_id[tag] = i \n",
81 | " self.dict_id_to_code[i]=tag\n",
82 | " print(self.dict_code_to_id)\n",
83 | " \n",
84 | " Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n",
85 | " train=pd.merge(left=X_train,right=Y_train,\n",
86 | " how='left',left_on=['Integer_id','Image_id','Product_id'],\n",
87 | " right_on=['Integer_id','Image_id','Product_id'])\n",
88 | " prod_map=pd.Series(catalog_eng['Top level category'].values,\n",
89 | " index=catalog_eng['Prdtypecode']).to_dict()\n",
90 | "\n",
91 | " train['product'] = train['Prdtypecode'].map(prod_map)\n",
92 | " train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n",
93 | " train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n",
94 | " train['title_desc_len']=train['title_len'] + train['desc_len']\n",
95 | " train.loc[train['Description'].isnull(), 'Description'] = \" \"\n",
96 | " train['title_desc'] = train['Title'] + \" \" + train['Description']\n",
97 | " \n",
98 | " self.train = train\n",
99 | " \n",
100 | " def get_sentences(self, text_col, remove_null_rows=False):\n",
101 | " self.text_col = text_col\n",
102 | " if remove_null_rows==True:\n",
103 | " new_train = self.train[self.train[text_col].notnull()]\n",
104 | "\n",
105 | " else:\n",
106 | " new_train = self.train.copy()\n",
107 | " \n",
108 | " self.sentences = new_train[text_col].values\n",
109 | " self.labels = new_train['labels'].values\n",
110 | " \n",
111 | " def prepare_test(self, text_col):\n",
112 | " X_test=pd.read_csv(self.text_data_path+\"data/x_test_task1_phase1.tsv\",sep=\"\\t\")\n",
113 | " X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n",
114 | " X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n",
115 | " self.X_test = X_test\n",
116 | " self.test_sentences = X_test[text_col].values\n",
117 | " "
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "text_col = 'title_desc'\n",
127 | "max_len = 256\n",
128 | "val_size = 0.1\n",
129 | "\n"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "Preprocess = SigirPreprocess(\"/kaggle/input/textphase1/\")\n",
139 | "Preprocess.prepare_data()\n",
140 | "Preprocess.get_sentences(text_col, True)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "sentences = Preprocess.sentences\n",
150 | "labels = Preprocess.labels\n",
151 | "print(\"Total number of sentences:{}, labels:{}\".format(len(sentences), len(labels)))"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "from transformers import CamembertConfig, CamembertTokenizer, CamembertModel, CamembertForSequenceClassification, AdamW\n",
161 | "from transformers.modeling_roberta import RobertaClassificationHead\n",
162 | "print('Using Camembert')\n",
163 | "modelname = 'camembert-base'\n",
164 | "tokenizer = CamembertTokenizer.from_pretrained(modelname, do_lowercase=False)\n"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "#function to prepare input for model training\n",
174 | "def prep_input(sentences,labels, max_len):\n",
175 | " input_ids = []\n",
176 | " attention_masks = []\n",
177 | "\n",
178 | " # For every sentence...\n",
179 | " for sent in tqdm(sentences):\n",
180 | " # `encode_plus` will:\n",
181 | " # (1) Tokenize the sentence.\n",
182 | " # (2) Prepend the `[CLS]` token to the start.\n",
183 | " # (3) Append the `[SEP]` token to the end.\n",
184 | " # (4) Map tokens to their IDs.\n",
185 | " # (5) Pad or truncate the sentence to `max_length`\n",
186 | " # (6) Create attention masks for [PAD] tokens.\n",
187 | " encoded_dict = tokenizer.encode_plus(\n",
188 | " sent, # Sentence to encode.\n",
189 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
190 | " max_length = max_len, # Pad & truncate all sentences.\n",
191 | " pad_to_max_length = True,\n",
192 | " return_attention_mask = True, # Construct attn. masks.\n",
193 | " return_tensors = 'pt', # Return pytorch tensors.\n",
194 | " )\n",
195 | "\n",
196 | " # Add the encoded sentence to the list. \n",
197 | " input_ids.append(encoded_dict['input_ids'])\n",
198 | "\n",
199 | " # And its attention mask (simply differentiates padding from non-padding).\n",
200 | " attention_masks.append(encoded_dict['attention_mask'])\n",
201 | "\n",
202 | " # Convert the lists into tensors.\n",
203 | " input_ids = torch.cat(input_ids, dim=0)\n",
204 | " attention_masks = torch.cat(attention_masks, dim=0)\n",
205 | " if labels is not None:\n",
206 | " labels = torch.tensor(labels)\n",
207 | " return input_ids,attention_masks,labels\n",
208 | " else:\n",
209 | " return input_ids,attention_masks\n",
210 | " "
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "input_ids,attention_masks,labels=prep_input(sentences,labels, max_len=max_len)\n",
220 | "print('Original: ', sentences[0])\n",
221 | "print('Token IDs:', input_ids[0]) "
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "### Camembert Model with Vector Output"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "# class RobertaClassificationHead(nn.Module):\n",
238 | "# \"\"\"Head for sentence-level classification tasks.\"\"\"\n",
239 | "\n",
240 | "# def __init__(self, config):\n",
241 | "# super().__init__()\n",
242 | "# self.dense = nn.Linear(config.hidden_size, config.hidden_size)\n",
243 | "# self.dropout = nn.Dropout(config.hidden_dropout_prob)\n",
244 | "# self.out_proj = nn.Linear(config.hidden_size, config.num_labels)\n",
245 | "\n",
246 | "# def forward(self, features, **kwargs):\n",
247 | "# x = features[:, 0, :] # take <s> token (equiv. to [CLS])\n",
248 | "# x = self.dropout(x)\n",
249 | "# x = self.dense(x)\n",
250 | "# x = torch.tanh(x)\n",
251 | "# feat = self.dropout(x)\n",
252 | "# x = self.out_proj(feat)\n",
253 | "# return x,feat"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "class vec_output_CamembertForSequenceClassification(CamembertModel):\n",
263 | "    \"\"\"CamemBERT encoder with a dense head that returns logits AND features.\n",
264 | "\n",
265 | "    All token vectors of the encoder output are flattened, projected to\n",
266 | "    `hidden_size`, squashed with tanh (the feature vector) and mapped to\n",
267 | "    `num_labels` logits.\n",
268 | "\n",
269 | "    NOTE(review): the class inherits from `CamembertModel` and also builds\n",
270 | "    `self.roberta`, which looks like a second encoder copy; left as is so the\n",
271 | "    parameter names of already saved checkpoints do not change.\n",
272 | "    \"\"\"\n",
273 | "    config_class = CamembertConfig\n",
274 | "\n",
275 | "    def __init__(self, config):\n",
276 | "        super().__init__(config)\n",
277 | "        self.num_labels = config.num_labels\n",
278 | "\n",
279 | "        self.roberta = CamembertModel(config)\n",
280 | "        # The head expects 256 flattened token vectors per sample.\n",
281 | "        self.dense = nn.Linear(256*config.hidden_size, config.hidden_size)\n",
282 | "        self.dropout = nn.Dropout(0.1)  # defined but not applied in forward()\n",
283 | "        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)\n",
284 | "        self.init_weights()\n",
285 | "\n",
286 | "    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,\n",
287 | "                position_ids=None, head_mask=None, inputs_embeds=None, labels=None,\n",
288 | "                output_attentions=None, output_hidden_states=None):\n",
289 | "        \"\"\"Return `((logits, *extra encoder outputs), feat)`; no loss is computed.\n",
290 | "\n",
291 | "        `labels`, `output_attentions` and `output_hidden_states` are accepted\n",
292 | "        but not used.\n",
293 | "        \"\"\"\n",
294 | "        outputs = self.roberta(\n",
295 | "            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,\n",
296 | "            position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds,\n",
297 | "        )\n",
298 | "        sequence_output = outputs[0]  # (batch, seq_len, hidden_size)\n",
299 | "        # Flatten each sample.  The width is inferred (-1) rather than the former\n",
300 | "        # hard-coded 256*768, which failed for any hidden size other than 768.\n",
301 | "        x = sequence_output.reshape(sequence_output.shape[0], -1)\n",
302 | "        x = self.dense(x)\n",
303 | "        feat = torch.tanh(x)  # feature vector returned alongside the logits\n",
304 | "        logits = self.out_proj(feat)\n",
305 | "        outputs = (logits,) + outputs[2:]\n",
306 | "        return outputs, feat"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "\n",
316 | "num_classes = len(Preprocess.dict_code_to_id)  # one logit per entry of the code -> id mapping\n",
317 | "model = vec_output_CamembertForSequenceClassification.from_pretrained(\n",
318 | "    modelname,                   # checkpoint identifier held in `modelname`\n",
319 | "    num_labels=num_classes,\n",
320 | "    output_attentions=False,     # do not return attention weights\n",
321 | "    output_hidden_states=False,  # do not return all hidden states\n",
322 | ")\n",
323 | "model.cuda()"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "# Import first: the samplers were previously referenced above this import line.\n",
333 | "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset\n",
334 | "\n",
335 | "# Split ids, masks and labels in ONE call so the three stay row-aligned;\n",
336 | "# stratifying on the labels keeps the class proportions in both parts.\n",
337 | "tr_inputs, val_inputs, tr_masks, val_masks, tr_labels, val_labels = train_test_split(\n",
338 | "    input_ids, attention_masks, labels,\n",
339 | "    stratify=labels, random_state=2020, test_size=val_size,\n",
340 | ")\n",
341 | "\n",
342 | "train_dataset = TensorDataset(tr_inputs, tr_masks, tr_labels)\n",
343 | "val_dataset = TensorDataset(val_inputs, val_masks, val_labels)\n",
344 | "\n",
345 | "# Mini-batch size used by both dataloaders below.\n",
346 | "batch_size = 32\n",
347 | "\n",
348 | "# Training batches are drawn in random order ...\n",
349 | "train_sampler = RandomSampler(train_dataset)\n",
350 | "train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)\n",
351 | "\n",
352 | "# ... validation batches are read sequentially, so outputs keep dataset order.\n",
353 | "valid_sampler = SequentialSampler(val_dataset)\n",
354 | "validation_dataloader = DataLoader(val_dataset, sampler=valid_sampler, batch_size=batch_size)"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": null,
368 | "metadata": {},
369 | "outputs": [],
370 | "source": [
371 | "# Optimizer for fine-tuning: AdamW over all model parameters.\n",
372 | "learning_rate = 2e-5  # args.learning_rate default is 5e-5; this notebook uses 2e-5\n",
373 | "adam_epsilon = 1e-8   # args.adam_epsilon default\n",
374 | "optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)\n"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {},
381 | "outputs": [],
382 | "source": [
383 | "from transformers import get_linear_schedule_with_warmup\n",
384 | "\n",
385 | "# Number of passes over the training set.  The BERT authors recommend 2-4\n",
386 | "# epochs; this notebook trains for 10, so keep an eye on the validation\n",
387 | "# metrics for signs of over-fitting.\n",
388 | "epochs = 10\n",
389 | "\n",
390 | "# The scheduler is stepped once per batch, so the schedule spans\n",
391 | "# (batches per epoch) x (epochs) steps -- not the number of training samples.\n",
392 | "batches_per_epoch = len(train_dataloader)\n",
393 | "total_steps = batches_per_epoch * epochs\n",
394 | "\n",
395 | "# Linear learning-rate schedule with no warm-up steps (run_glue.py default).\n",
396 | "scheduler = get_linear_schedule_with_warmup(\n",
397 | "    optimizer, num_warmup_steps=0, num_training_steps=total_steps)"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "def flat_accuracy(preds, labels):\n",
407 | "    \"\"\"Return the fraction of rows whose arg-max over axis 1 of `preds` equals the label.\"\"\"\n",
408 | "    predicted = np.argmax(preds, axis=1).ravel()\n",
409 | "    expected = labels.ravel()\n",
410 | "    return np.sum(predicted == expected) / expected.size"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "import torch.nn as nn\n",
420 | "loss_criterion = nn.CrossEntropyLoss()"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "from sklearn.metrics import f1_score\n",
430 | "# This training code is based on the `run_glue.py` script here:\n",
431 | "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n",
432 | "\n",
433 | "# Seed every random number generator in use so the run is reproducible.\n",
434 | "seed_val = 42\n",
435 | "\n",
436 | "random.seed(seed_val)\n",
437 | "np.random.seed(seed_val)\n",
438 | "torch.manual_seed(seed_val)\n",
439 | "torch.cuda.manual_seed_all(seed_val)\n",
440 | "\n",
441 | "# Placeholder for per-epoch statistics (nothing is appended to it below).\n",
442 | "training_stats = []\n",
443 | "\n",
444 | "# Best validation macro-F1 seen so far.  This has to be initialised ONCE,\n",
445 | "# before the epoch loop: resetting it inside the loop made every epoch look\n",
446 | "# like an improvement, so 'best_model.pt' always ended up being the last epoch.\n",
447 | "best_f1 = 0\n",
448 | "\n",
449 | "# Measure the total training time for the whole run.\n",
450 | "total_t0 = time.time()\n",
451 | "\n",
452 | "for epoch_i in range(0, epochs):\n",
453 | "\n",
454 | "    # ========================================\n",
455 | "    #               Training\n",
456 | "    # ========================================\n",
457 | "    print(\"\")\n",
458 | "    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n",
459 | "    print('Training...')\n",
460 | "\n",
461 | "    # Feature vectors collected during this epoch.  The training ones follow\n",
462 | "    # the shuffled batch order of `train_dataloader`.\n",
463 | "    vec_output_tr = []\n",
464 | "    vec_output_val = []\n",
465 | "\n",
466 | "    t0 = time.time()  # start of the training pass\n",
467 | "    total_train_loss = 0\n",
468 | "\n",
469 | "    # `train()` only switches the mode (dropout etc. active); it trains nothing.\n",
470 | "    model.train()\n",
471 | "\n",
472 | "    for batch in tqdm(train_dataloader):\n",
473 | "        # `batch` holds: [0] input ids, [1] attention masks, [2] labels.\n",
474 | "        b_input_ids = batch[0].to(device)\n",
475 | "        b_input_mask = batch[1].to(device)\n",
476 | "        b_labels = batch[2].to(device)\n",
477 | "\n",
478 | "        # Drop gradients accumulated by the previous step.\n",
479 | "        model.zero_grad()\n",
480 | "\n",
481 | "        # The model returns ((logits, ...), feature_vectors).\n",
482 | "        logits, vec = model(b_input_ids,\n",
483 | "                            token_type_ids=None,\n",
484 | "                            attention_mask=b_input_mask)\n",
485 | "        logits = logits[0]\n",
486 | "\n",
487 | "        loss = loss_criterion(logits, b_labels)\n",
488 | "        total_train_loss += loss.item()\n",
489 | "\n",
490 | "        # Keep this batch's feature vectors.\n",
491 | "        vec_output_tr.extend(vec.detach().cpu().numpy())\n",
492 | "\n",
493 | "        loss.backward()\n",
494 | "\n",
495 | "        # Clip the gradient norm to 1.0 to guard against exploding gradients.\n",
496 | "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
497 | "\n",
498 | "        optimizer.step()\n",
499 | "        scheduler.step()  # the learning-rate schedule advances once per batch\n",
500 | "\n",
501 | "    avg_train_loss = total_train_loss / len(train_dataloader)\n",
502 | "    training_time = format_time(time.time() - t0)\n",
503 | "\n",
504 | "    print(\"\")\n",
505 | "    print(\" Average training loss: {0:.2f} \".format(avg_train_loss))\n",
506 | "    print(\" Training epcoh took: {:} \".format(training_time))\n",
507 | "\n",
508 | "    # ========================================\n",
509 | "    #               Validation\n",
510 | "    # ========================================\n",
511 | "    print(\"\")\n",
512 | "    print(\"Running Validation...\")\n",
513 | "\n",
514 | "    t0 = time.time()  # start of the validation pass\n",
515 | "\n",
516 | "    # Evaluation mode: dropout is switched off.\n",
517 | "    model.eval()\n",
518 | "\n",
519 | "    total_eval_accuracy = 0\n",
520 | "    total_eval_loss = 0\n",
521 | "    predictions = []\n",
522 | "    true_labels = []\n",
523 | "\n",
524 | "    for batch in tqdm(validation_dataloader):\n",
525 | "        b_input_ids = batch[0].to(device)\n",
526 | "        b_input_mask = batch[1].to(device)\n",
527 | "        b_labels = batch[2].to(device)\n",
528 | "\n",
529 | "        # No autograd graph is needed for evaluation.\n",
530 | "        with torch.no_grad():\n",
531 | "            logits, vec = model(b_input_ids,\n",
532 | "                                token_type_ids=None,\n",
533 | "                                attention_mask=b_input_mask)\n",
534 | "\n",
535 | "        logits = logits[0]\n",
536 | "        loss = loss_criterion(logits, b_labels)\n",
537 | "        total_eval_loss += loss.item()\n",
538 | "\n",
539 | "        # Move logits and labels to the CPU as numpy arrays.\n",
540 | "        logits = logits.detach().cpu().numpy()\n",
541 | "        label_ids = b_labels.to('cpu').numpy()\n",
542 | "\n",
543 | "        predictions.extend(np.argmax(logits, axis=1))\n",
544 | "        true_labels.extend(label_ids)\n",
545 | "        vec_output_val.extend(vec.detach().cpu().numpy())\n",
546 | "\n",
547 | "        # Per-batch accuracy; averaged over the batches after the loop.\n",
548 | "        total_eval_accuracy += flat_accuracy(logits, label_ids)\n",
549 | "\n",
550 | "    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n",
551 | "    print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n",
552 | "\n",
553 | "    avg_val_loss = total_eval_loss / len(validation_dataloader)\n",
554 | "    validation_time = format_time(time.time() - t0)\n",
555 | "\n",
556 | "    print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n",
557 | "    print(\" Validation took: {:}\".format(validation_time))\n",
558 | "\n",
559 | "    curr_f1 = f1_score(true_labels, predictions, average='macro')\n",
560 | "    print(\"Validation F1-Score: {}\".format(curr_f1))\n",
561 | "\n",
562 | "    # Checkpoint weights and features only when the macro-F1 improves.\n",
563 | "    if curr_f1 > best_f1:\n",
564 | "        best_f1 = curr_f1\n",
565 | "        torch.save(model.state_dict(), 'best_model.pt')\n",
566 | "        np.save('best_vec_train_model_train.npy', vec_output_tr)\n",
567 | "        np.save('best_vec_val.npy', vec_output_val)\n",
568 | "\n",
569 | "print(\"\")\n",
570 | "print(\"Training complete!\")\n",
571 | "\n",
572 | "print(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))\n"
668 | ]
669 | },
670 | {
671 | "cell_type": "markdown",
672 | "metadata": {},
673 | "source": [
674 | "## Predictions"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": [
683 | "# The training loop saves to the relative path 'best_model.pt'; load that same file.\n",
684 | "model_path = 'best_model.pt'\n",
685 | "checkpoint = torch.load(model_path, map_location=device)\n",
686 | "model.load_state_dict(checkpoint)"
687 | ]
688 | },
689 | {
690 | "cell_type": "code",
691 | "execution_count": null,
692 | "metadata": {},
693 | "outputs": [],
694 | "source": [
695 | "def predict_pyt(model, prediction_dataloader):\n",
696 | "    \"\"\"Run `model` in eval mode over a dataloader and collect its outputs.\n",
697 | "\n",
698 | "    model: pytorch model whose call returns ((logits, ...), features)\n",
699 | "    prediction_dataloader: DataLoader of (input_ids, attention_mask) batches;\n",
700 | "        a third tensor (labels), if present, is ignored.\n",
701 | "    return:\n",
702 | "        predictions:- arg-max class index of every sample\n",
703 | "        softmax_logits:- one CPU tensor of softmax-normalised logits per sample\n",
704 | "        vec_outputs:- one numpy feature vector per sample\n",
705 | "    \"\"\"\n",
706 | "    model.eval()\n",
707 | "\n",
708 | "    predictions = []\n",
709 | "    softmax_logits = []\n",
710 | "    vec_outputs = []\n",
711 | "\n",
712 | "    # Build the softmax layer once rather than once per batch.\n",
713 | "    softmax = nn.Softmax(dim=1)\n",
714 | "\n",
715 | "    for batch in tqdm(prediction_dataloader):\n",
716 | "        # Index instead of unpacking so 2- and 3-tensor batches both work.\n",
717 | "        b_input_ids = batch[0].to(device)\n",
718 | "        b_input_mask = batch[1].to(device)\n",
719 | "\n",
720 | "        # Inference only: no gradients need to be tracked.\n",
721 | "        with torch.no_grad():\n",
722 | "            logits, vec = model(b_input_ids, token_type_ids=None,\n",
723 | "                                attention_mask=b_input_mask)\n",
724 | "\n",
725 | "        logits = logits[0]\n",
726 | "\n",
727 | "        # Copy the probabilities to the CPU immediately.  Storing the GPU rows\n",
728 | "        # kept every batch's output alive on the device for the whole dataset.\n",
729 | "        probabilities = softmax(logits).detach().cpu()\n",
730 | "        softmax_logits.extend(probabilities)\n",
731 | "\n",
732 | "        predictions.extend(np.argmax(logits.detach().cpu().numpy(), axis=1))\n",
733 | "        vec_outputs.extend(vec.detach().cpu().numpy())\n",
734 | "\n",
735 | "    print('DONE')\n",
736 | "    return predictions, softmax_logits, vec_outputs\n",
749 | "\n",
750 | "def predict_wrapper(model, sentences, max_len=max_len, batch_size = batch_size ):\n",
751 | "    \"\"\"Tokenize raw `sentences`, wrap them in a sequential DataLoader and\n",
752 | "    return the result of `predict_pyt` for them.\"\"\"\n",
753 | "    token_ids, masks = prep_input(sentences, labels=None, max_len=max_len)\n",
754 | "    dataset = TensorDataset(token_ids, masks)\n",
755 | "    loader = DataLoader(dataset,\n",
756 | "                        sampler=SequentialSampler(dataset),\n",
757 | "                        batch_size=batch_size)\n",
758 | "    return predict_pyt(model, loader)\n",
759 | "\n",
760 | "\n",
761 | "\n"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": null,
767 | "metadata": {},
768 | "outputs": [],
769 | "source": [
770 | "# Batch size for prediction (same value as used for the training dataloaders).\n",
771 | "batch_size = 32\n",
772 | "\n",
773 | "# The two attributes read below are expected to be filled in by `prepare_test`.\n",
774 | "Preprocess.prepare_test(text_col)\n",
775 | "test_sentences, X_test_phase1 = Preprocess.test_sentences, Preprocess.X_test"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": null,
781 | "metadata": {},
782 | "outputs": [],
783 | "source": [
784 | "## Predictions of train dataset\n",
785 | "# `train_dataloader` shuffles (RandomSampler), so predicting through it saved\n",
786 | "# features/probabilities in a random order that matches neither\n",
787 | "# `train_dataset` nor `tr_labels`.  Use a sequential pass over the same data.\n",
788 | "train_eval_dataloader = DataLoader(train_dataset,\n",
789 | "                                   sampler=SequentialSampler(train_dataset),\n",
790 | "                                   batch_size=batch_size)\n",
791 | "start = time.time()\n",
792 | "predictions, softmax_logits , vec_outputs = predict_pyt(model, train_eval_dataloader)\n",
793 | "np.save('best_vec_train_model_eval.npy',vec_outputs)\n",
794 | "softmax_logits = np.array([ten.detach().cpu().numpy() for ten in softmax_logits])\n",
795 | "np.save('train_set_softmax_logits.npy',softmax_logits)\n",
796 | "print('length of predictions {}'.format(len(predictions)))\n",
797 | "print('Time Taken Predict for train set: {:}'.format(format_time(time.time() - start) ))"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": null,
803 | "metadata": {},
804 | "outputs": [],
805 | "source": [
806 | "# Predict on the validation split and persist its features and probabilities.\n",
807 | "start = time.time()\n",
808 | "predictions, val_softmax_logits, vec_outputs = predict_pyt(model, validation_dataloader)\n",
809 | "np.save('best_vec_val_model_eval.npy', vec_outputs)\n",
810 | "val_softmax_logits = np.array([row.detach().cpu().numpy() for row in val_softmax_logits])\n",
811 | "np.save('validation_set_softmax_logits.npy', val_softmax_logits)\n",
812 | "print('Time Taken Predict for val set: {:}'.format(format_time(time.time() - start)))"
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {},
819 | "outputs": [],
820 | "source": [
821 | "# Predict on the test sentences; `predictions` is reused by the submission cell.\n",
822 | "start = time.time()\n",
823 | "predictions, softmax_logits, vec_outputs = predict_wrapper(model, test_sentences)\n",
824 | "\n",
825 | "# Per-sample feature vectors.\n",
826 | "np.save('best_vec_test.npy', vec_outputs)\n",
827 | "# Per-sample softmax outputs gathered into a single array.\n",
828 | "softmax_logits = np.array([row.detach().cpu().numpy() for row in softmax_logits])\n",
829 | "np.save('X_test_phase1_softmax_logits.npy', softmax_logits)\n",
830 | "print('length of predictions {}'.format(len(predictions)))\n",
831 | "print('Time Taken Predict for test set: {:}'.format(format_time(time.time() - start) ))"
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "execution_count": null,
837 | "metadata": {},
838 | "outputs": [],
839 | "source": [
840 | "X_test_phase1['prediction_model'] = predictions  # predicted class ids\n",
841 | "X_test_phase1['Prdtypecode'] = X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)  # ids -> codes\n",
842 | "print(X_test_phase1['Prdtypecode'].value_counts())\n",
843 | "X_test_phase1 = X_test_phase1.drop(columns=['prediction_model', 'Title', 'Description'])"
844 | ]
845 | },
846 | {
847 | "cell_type": "code",
848 | "execution_count": null,
849 | "metadata": {},
850 | "outputs": [],
851 | "source": [
852 | "X_test_phase1.to_csv('y_test_task1_phase1_pred.tsv',sep='\\t',index=False)\n"
853 | ]
854 | }
855 | ],
856 | "metadata": {
857 | "kernelspec": {
858 | "display_name": "Python 3",
859 | "language": "python",
860 | "name": "python3"
861 | },
862 | "language_info": {
863 | "codemirror_mode": {
864 | "name": "ipython",
865 | "version": 3
866 | },
867 | "file_extension": ".py",
868 | "mimetype": "text/x-python",
869 | "name": "python",
870 | "nbconvert_exporter": "python",
871 | "pygments_lexer": "ipython3",
872 | "version": "3.7.6"
873 | },
874 | "toc": {
875 | "base_numbering": 1,
876 | "nav_menu": {},
877 | "number_sections": true,
878 | "sideBar": true,
879 | "skip_h1_title": false,
880 | "title_cell": "Table of Contents",
881 | "title_sidebar": "Contents",
882 | "toc_cell": false,
883 | "toc_position": {},
884 | "toc_section_display": true,
885 | "toc_window_display": false
886 | },
887 | "widgets": {
888 | "application/vnd.jupyter.widget-state+json": {
889 | "state": {
890 | "04e2caaecb124a14945c845ca6e62aad": {
891 | "model_module": "@jupyter-widgets/controls",
892 | "model_module_version": "1.5.0",
893 | "model_name": "ProgressStyleModel",
894 | "state": {
895 | "_model_module": "@jupyter-widgets/controls",
896 | "_model_module_version": "1.5.0",
897 | "_model_name": "ProgressStyleModel",
898 | "_view_count": null,
899 | "_view_module": "@jupyter-widgets/base",
900 | "_view_module_version": "1.2.0",
901 | "_view_name": "StyleView",
902 | "bar_color": null,
903 | "description_width": "initial"
904 | }
905 | },
906 | "0d47bfa702554bd68f25bb94db8a8811": {
907 | "model_module": "@jupyter-widgets/controls",
908 | "model_module_version": "1.5.0",
909 | "model_name": "DescriptionStyleModel",
910 | "state": {
911 | "_model_module": "@jupyter-widgets/controls",
912 | "_model_module_version": "1.5.0",
913 | "_model_name": "DescriptionStyleModel",
914 | "_view_count": null,
915 | "_view_module": "@jupyter-widgets/base",
916 | "_view_module_version": "1.2.0",
917 | "_view_name": "StyleView",
918 | "description_width": ""
919 | }
920 | },
921 | "1e4ce92ff6a44d89b65e7917319266eb": {
922 | "model_module": "@jupyter-widgets/controls",
923 | "model_module_version": "1.5.0",
924 | "model_name": "DescriptionStyleModel",
925 | "state": {
926 | "_model_module": "@jupyter-widgets/controls",
927 | "_model_module_version": "1.5.0",
928 | "_model_name": "DescriptionStyleModel",
929 | "_view_count": null,
930 | "_view_module": "@jupyter-widgets/base",
931 | "_view_module_version": "1.2.0",
932 | "_view_name": "StyleView",
933 | "description_width": ""
934 | }
935 | },
936 | "212f4750f35d4bc2b4272e6d070fce89": {
937 | "model_module": "@jupyter-widgets/controls",
938 | "model_module_version": "1.5.0",
939 | "model_name": "HBoxModel",
940 | "state": {
941 | "_dom_classes": [],
942 | "_model_module": "@jupyter-widgets/controls",
943 | "_model_module_version": "1.5.0",
944 | "_model_name": "HBoxModel",
945 | "_view_count": null,
946 | "_view_module": "@jupyter-widgets/controls",
947 | "_view_module_version": "1.5.0",
948 | "_view_name": "HBoxView",
949 | "box_style": "",
950 | "children": [
951 | "IPY_MODEL_85588758dedc4b8bbc6ee33178593140",
952 | "IPY_MODEL_a39feb5a6e374ea2ab65be2fe8b75b00"
953 | ],
954 | "layout": "IPY_MODEL_5a054222842941dab063a8db8ede0ff2"
955 | }
956 | },
957 | "4bc03bf5ab334fc590007e48be4dd318": {
958 | "model_module": "@jupyter-widgets/base",
959 | "model_module_version": "1.2.0",
960 | "model_name": "LayoutModel",
961 | "state": {
962 | "_model_module": "@jupyter-widgets/base",
963 | "_model_module_version": "1.2.0",
964 | "_model_name": "LayoutModel",
965 | "_view_count": null,
966 | "_view_module": "@jupyter-widgets/base",
967 | "_view_module_version": "1.2.0",
968 | "_view_name": "LayoutView",
969 | "align_content": null,
970 | "align_items": null,
971 | "align_self": null,
972 | "border": null,
973 | "bottom": null,
974 | "display": null,
975 | "flex": null,
976 | "flex_flow": null,
977 | "grid_area": null,
978 | "grid_auto_columns": null,
979 | "grid_auto_flow": null,
980 | "grid_auto_rows": null,
981 | "grid_column": null,
982 | "grid_gap": null,
983 | "grid_row": null,
984 | "grid_template_areas": null,
985 | "grid_template_columns": null,
986 | "grid_template_rows": null,
987 | "height": null,
988 | "justify_content": null,
989 | "justify_items": null,
990 | "left": null,
991 | "margin": null,
992 | "max_height": null,
993 | "max_width": null,
994 | "min_height": null,
995 | "min_width": null,
996 | "object_fit": null,
997 | "object_position": null,
998 | "order": null,
999 | "overflow": null,
1000 | "overflow_x": null,
1001 | "overflow_y": null,
1002 | "padding": null,
1003 | "right": null,
1004 | "top": null,
1005 | "visibility": null,
1006 | "width": null
1007 | }
1008 | },
1009 | "4ec5441ef13241dcb0af2d40a4036e6e": {
1010 | "model_module": "@jupyter-widgets/controls",
1011 | "model_module_version": "1.5.0",
1012 | "model_name": "HBoxModel",
1013 | "state": {
1014 | "_dom_classes": [],
1015 | "_model_module": "@jupyter-widgets/controls",
1016 | "_model_module_version": "1.5.0",
1017 | "_model_name": "HBoxModel",
1018 | "_view_count": null,
1019 | "_view_module": "@jupyter-widgets/controls",
1020 | "_view_module_version": "1.5.0",
1021 | "_view_name": "HBoxView",
1022 | "box_style": "",
1023 | "children": [
1024 | "IPY_MODEL_8cd310281c3e4133b3776f69196bef32",
1025 | "IPY_MODEL_702441bdd088466d8e1d264071baca75"
1026 | ],
1027 | "layout": "IPY_MODEL_dcc661b801f940139925b83564e8f282"
1028 | }
1029 | },
1030 | "4f92279308be48e2a1b543fdb441246c": {
1031 | "model_module": "@jupyter-widgets/base",
1032 | "model_module_version": "1.2.0",
1033 | "model_name": "LayoutModel",
1034 | "state": {
1035 | "_model_module": "@jupyter-widgets/base",
1036 | "_model_module_version": "1.2.0",
1037 | "_model_name": "LayoutModel",
1038 | "_view_count": null,
1039 | "_view_module": "@jupyter-widgets/base",
1040 | "_view_module_version": "1.2.0",
1041 | "_view_name": "LayoutView",
1042 | "align_content": null,
1043 | "align_items": null,
1044 | "align_self": null,
1045 | "border": null,
1046 | "bottom": null,
1047 | "display": null,
1048 | "flex": null,
1049 | "flex_flow": null,
1050 | "grid_area": null,
1051 | "grid_auto_columns": null,
1052 | "grid_auto_flow": null,
1053 | "grid_auto_rows": null,
1054 | "grid_column": null,
1055 | "grid_gap": null,
1056 | "grid_row": null,
1057 | "grid_template_areas": null,
1058 | "grid_template_columns": null,
1059 | "grid_template_rows": null,
1060 | "height": null,
1061 | "justify_content": null,
1062 | "justify_items": null,
1063 | "left": null,
1064 | "margin": null,
1065 | "max_height": null,
1066 | "max_width": null,
1067 | "min_height": null,
1068 | "min_width": null,
1069 | "object_fit": null,
1070 | "object_position": null,
1071 | "order": null,
1072 | "overflow": null,
1073 | "overflow_x": null,
1074 | "overflow_y": null,
1075 | "padding": null,
1076 | "right": null,
1077 | "top": null,
1078 | "visibility": null,
1079 | "width": null
1080 | }
1081 | },
1082 | "57be58d12ce0415590cd75f529dc8a06": {
1083 | "model_module": "@jupyter-widgets/base",
1084 | "model_module_version": "1.2.0",
1085 | "model_name": "LayoutModel",
1086 | "state": {
1087 | "_model_module": "@jupyter-widgets/base",
1088 | "_model_module_version": "1.2.0",
1089 | "_model_name": "LayoutModel",
1090 | "_view_count": null,
1091 | "_view_module": "@jupyter-widgets/base",
1092 | "_view_module_version": "1.2.0",
1093 | "_view_name": "LayoutView",
1094 | "align_content": null,
1095 | "align_items": null,
1096 | "align_self": null,
1097 | "border": null,
1098 | "bottom": null,
1099 | "display": null,
1100 | "flex": null,
1101 | "flex_flow": null,
1102 | "grid_area": null,
1103 | "grid_auto_columns": null,
1104 | "grid_auto_flow": null,
1105 | "grid_auto_rows": null,
1106 | "grid_column": null,
1107 | "grid_gap": null,
1108 | "grid_row": null,
1109 | "grid_template_areas": null,
1110 | "grid_template_columns": null,
1111 | "grid_template_rows": null,
1112 | "height": null,
1113 | "justify_content": null,
1114 | "justify_items": null,
1115 | "left": null,
1116 | "margin": null,
1117 | "max_height": null,
1118 | "max_width": null,
1119 | "min_height": null,
1120 | "min_width": null,
1121 | "object_fit": null,
1122 | "object_position": null,
1123 | "order": null,
1124 | "overflow": null,
1125 | "overflow_x": null,
1126 | "overflow_y": null,
1127 | "padding": null,
1128 | "right": null,
1129 | "top": null,
1130 | "visibility": null,
1131 | "width": null
1132 | }
1133 | },
1134 | "5a054222842941dab063a8db8ede0ff2": {
1135 | "model_module": "@jupyter-widgets/base",
1136 | "model_module_version": "1.2.0",
1137 | "model_name": "LayoutModel",
1138 | "state": {
1139 | "_model_module": "@jupyter-widgets/base",
1140 | "_model_module_version": "1.2.0",
1141 | "_model_name": "LayoutModel",
1142 | "_view_count": null,
1143 | "_view_module": "@jupyter-widgets/base",
1144 | "_view_module_version": "1.2.0",
1145 | "_view_name": "LayoutView",
1146 | "align_content": null,
1147 | "align_items": null,
1148 | "align_self": null,
1149 | "border": null,
1150 | "bottom": null,
1151 | "display": null,
1152 | "flex": null,
1153 | "flex_flow": null,
1154 | "grid_area": null,
1155 | "grid_auto_columns": null,
1156 | "grid_auto_flow": null,
1157 | "grid_auto_rows": null,
1158 | "grid_column": null,
1159 | "grid_gap": null,
1160 | "grid_row": null,
1161 | "grid_template_areas": null,
1162 | "grid_template_columns": null,
1163 | "grid_template_rows": null,
1164 | "height": null,
1165 | "justify_content": null,
1166 | "justify_items": null,
1167 | "left": null,
1168 | "margin": null,
1169 | "max_height": null,
1170 | "max_width": null,
1171 | "min_height": null,
1172 | "min_width": null,
1173 | "object_fit": null,
1174 | "object_position": null,
1175 | "order": null,
1176 | "overflow": null,
1177 | "overflow_x": null,
1178 | "overflow_y": null,
1179 | "padding": null,
1180 | "right": null,
1181 | "top": null,
1182 | "visibility": null,
1183 | "width": null
1184 | }
1185 | },
1186 | "6a940c5ee47e4a0fa6bd17899077b04c": {
1187 | "model_module": "@jupyter-widgets/base",
1188 | "model_module_version": "1.2.0",
1189 | "model_name": "LayoutModel",
1190 | "state": {
1191 | "_model_module": "@jupyter-widgets/base",
1192 | "_model_module_version": "1.2.0",
1193 | "_model_name": "LayoutModel",
1194 | "_view_count": null,
1195 | "_view_module": "@jupyter-widgets/base",
1196 | "_view_module_version": "1.2.0",
1197 | "_view_name": "LayoutView",
1198 | "align_content": null,
1199 | "align_items": null,
1200 | "align_self": null,
1201 | "border": null,
1202 | "bottom": null,
1203 | "display": null,
1204 | "flex": null,
1205 | "flex_flow": null,
1206 | "grid_area": null,
1207 | "grid_auto_columns": null,
1208 | "grid_auto_flow": null,
1209 | "grid_auto_rows": null,
1210 | "grid_column": null,
1211 | "grid_gap": null,
1212 | "grid_row": null,
1213 | "grid_template_areas": null,
1214 | "grid_template_columns": null,
1215 | "grid_template_rows": null,
1216 | "height": null,
1217 | "justify_content": null,
1218 | "justify_items": null,
1219 | "left": null,
1220 | "margin": null,
1221 | "max_height": null,
1222 | "max_width": null,
1223 | "min_height": null,
1224 | "min_width": null,
1225 | "object_fit": null,
1226 | "object_position": null,
1227 | "order": null,
1228 | "overflow": null,
1229 | "overflow_x": null,
1230 | "overflow_y": null,
1231 | "padding": null,
1232 | "right": null,
1233 | "top": null,
1234 | "visibility": null,
1235 | "width": null
1236 | }
1237 | },
1238 | "702441bdd088466d8e1d264071baca75": {
1239 | "model_module": "@jupyter-widgets/controls",
1240 | "model_module_version": "1.5.0",
1241 | "model_name": "HTMLModel",
1242 | "state": {
1243 | "_dom_classes": [],
1244 | "_model_module": "@jupyter-widgets/controls",
1245 | "_model_module_version": "1.5.0",
1246 | "_model_name": "HTMLModel",
1247 | "_view_count": null,
1248 | "_view_module": "@jupyter-widgets/controls",
1249 | "_view_module_version": "1.5.0",
1250 | "_view_name": "HTMLView",
1251 | "description": "",
1252 | "description_tooltip": null,
1253 | "layout": "IPY_MODEL_94ed9026bc664a81a39ea16f09293c7c",
1254 | "placeholder": "",
1255 | "style": "IPY_MODEL_0d47bfa702554bd68f25bb94db8a8811",
1256 | "value": " 811k/811k [00:01<00:00, 648kB/s]"
1257 | }
1258 | },
1259 | "82320d113b0b40e1b038d3cf321b3433": {
1260 | "model_module": "@jupyter-widgets/controls",
1261 | "model_module_version": "1.5.0",
1262 | "model_name": "HTMLModel",
1263 | "state": {
1264 | "_dom_classes": [],
1265 | "_model_module": "@jupyter-widgets/controls",
1266 | "_model_module_version": "1.5.0",
1267 | "_model_name": "HTMLModel",
1268 | "_view_count": null,
1269 | "_view_module": "@jupyter-widgets/controls",
1270 | "_view_module_version": "1.5.0",
1271 | "_view_name": "HTMLView",
1272 | "description": "",
1273 | "description_tooltip": null,
1274 | "layout": "IPY_MODEL_4bc03bf5ab334fc590007e48be4dd318",
1275 | "placeholder": "",
1276 | "style": "IPY_MODEL_895e9b60a3974711883bcd1d827de8a6",
1277 | "value": " 508/508 [00:00<00:00, 1.38kB/s]"
1278 | }
1279 | },
1280 | "85588758dedc4b8bbc6ee33178593140": {
1281 | "model_module": "@jupyter-widgets/controls",
1282 | "model_module_version": "1.5.0",
1283 | "model_name": "FloatProgressModel",
1284 | "state": {
1285 | "_dom_classes": [],
1286 | "_model_module": "@jupyter-widgets/controls",
1287 | "_model_module_version": "1.5.0",
1288 | "_model_name": "FloatProgressModel",
1289 | "_view_count": null,
1290 | "_view_module": "@jupyter-widgets/controls",
1291 | "_view_module_version": "1.5.0",
1292 | "_view_name": "ProgressView",
1293 | "bar_style": "success",
1294 | "description": "Downloading: 100%",
1295 | "description_tooltip": null,
1296 | "layout": "IPY_MODEL_4f92279308be48e2a1b543fdb441246c",
1297 | "max": 445032417,
1298 | "min": 0,
1299 | "orientation": "horizontal",
1300 | "style": "IPY_MODEL_04e2caaecb124a14945c845ca6e62aad",
1301 | "value": 445032417
1302 | }
1303 | },
1304 | "895e9b60a3974711883bcd1d827de8a6": {
1305 | "model_module": "@jupyter-widgets/controls",
1306 | "model_module_version": "1.5.0",
1307 | "model_name": "DescriptionStyleModel",
1308 | "state": {
1309 | "_model_module": "@jupyter-widgets/controls",
1310 | "_model_module_version": "1.5.0",
1311 | "_model_name": "DescriptionStyleModel",
1312 | "_view_count": null,
1313 | "_view_module": "@jupyter-widgets/base",
1314 | "_view_module_version": "1.2.0",
1315 | "_view_name": "StyleView",
1316 | "description_width": ""
1317 | }
1318 | },
1319 | "8cd310281c3e4133b3776f69196bef32": {
1320 | "model_module": "@jupyter-widgets/controls",
1321 | "model_module_version": "1.5.0",
1322 | "model_name": "FloatProgressModel",
1323 | "state": {
1324 | "_dom_classes": [],
1325 | "_model_module": "@jupyter-widgets/controls",
1326 | "_model_module_version": "1.5.0",
1327 | "_model_name": "FloatProgressModel",
1328 | "_view_count": null,
1329 | "_view_module": "@jupyter-widgets/controls",
1330 | "_view_module_version": "1.5.0",
1331 | "_view_name": "ProgressView",
1332 | "bar_style": "success",
1333 | "description": "Downloading: 100%",
1334 | "description_tooltip": null,
1335 | "layout": "IPY_MODEL_fb5ba4132e1e455ea0b38556501346c8",
1336 | "max": 810912,
1337 | "min": 0,
1338 | "orientation": "horizontal",
1339 | "style": "IPY_MODEL_a717d5b6e71341408ed3a51d679f1ed6",
1340 | "value": 810912
1341 | }
1342 | },
1343 | "94ed9026bc664a81a39ea16f09293c7c": {
1344 | "model_module": "@jupyter-widgets/base",
1345 | "model_module_version": "1.2.0",
1346 | "model_name": "LayoutModel",
1347 | "state": {
1348 | "_model_module": "@jupyter-widgets/base",
1349 | "_model_module_version": "1.2.0",
1350 | "_model_name": "LayoutModel",
1351 | "_view_count": null,
1352 | "_view_module": "@jupyter-widgets/base",
1353 | "_view_module_version": "1.2.0",
1354 | "_view_name": "LayoutView",
1355 | "align_content": null,
1356 | "align_items": null,
1357 | "align_self": null,
1358 | "border": null,
1359 | "bottom": null,
1360 | "display": null,
1361 | "flex": null,
1362 | "flex_flow": null,
1363 | "grid_area": null,
1364 | "grid_auto_columns": null,
1365 | "grid_auto_flow": null,
1366 | "grid_auto_rows": null,
1367 | "grid_column": null,
1368 | "grid_gap": null,
1369 | "grid_row": null,
1370 | "grid_template_areas": null,
1371 | "grid_template_columns": null,
1372 | "grid_template_rows": null,
1373 | "height": null,
1374 | "justify_content": null,
1375 | "justify_items": null,
1376 | "left": null,
1377 | "margin": null,
1378 | "max_height": null,
1379 | "max_width": null,
1380 | "min_height": null,
1381 | "min_width": null,
1382 | "object_fit": null,
1383 | "object_position": null,
1384 | "order": null,
1385 | "overflow": null,
1386 | "overflow_x": null,
1387 | "overflow_y": null,
1388 | "padding": null,
1389 | "right": null,
1390 | "top": null,
1391 | "visibility": null,
1392 | "width": null
1393 | }
1394 | },
1395 | "a39feb5a6e374ea2ab65be2fe8b75b00": {
1396 | "model_module": "@jupyter-widgets/controls",
1397 | "model_module_version": "1.5.0",
1398 | "model_name": "HTMLModel",
1399 | "state": {
1400 | "_dom_classes": [],
1401 | "_model_module": "@jupyter-widgets/controls",
1402 | "_model_module_version": "1.5.0",
1403 | "_model_name": "HTMLModel",
1404 | "_view_count": null,
1405 | "_view_module": "@jupyter-widgets/controls",
1406 | "_view_module_version": "1.5.0",
1407 | "_view_name": "HTMLView",
1408 | "description": "",
1409 | "description_tooltip": null,
1410 | "layout": "IPY_MODEL_57be58d12ce0415590cd75f529dc8a06",
1411 | "placeholder": "",
1412 | "style": "IPY_MODEL_1e4ce92ff6a44d89b65e7917319266eb",
1413 | "value": " 445M/445M [00:12<00:00, 35.8MB/s]"
1414 | }
1415 | },
1416 | "a717d5b6e71341408ed3a51d679f1ed6": {
1417 | "model_module": "@jupyter-widgets/controls",
1418 | "model_module_version": "1.5.0",
1419 | "model_name": "ProgressStyleModel",
1420 | "state": {
1421 | "_model_module": "@jupyter-widgets/controls",
1422 | "_model_module_version": "1.5.0",
1423 | "_model_name": "ProgressStyleModel",
1424 | "_view_count": null,
1425 | "_view_module": "@jupyter-widgets/base",
1426 | "_view_module_version": "1.2.0",
1427 | "_view_name": "StyleView",
1428 | "bar_color": null,
1429 | "description_width": "initial"
1430 | }
1431 | },
1432 | "ba22ce2585f54900b21f7f31ed15e78a": {
1433 | "model_module": "@jupyter-widgets/controls",
1434 | "model_module_version": "1.5.0",
1435 | "model_name": "FloatProgressModel",
1436 | "state": {
1437 | "_dom_classes": [],
1438 | "_model_module": "@jupyter-widgets/controls",
1439 | "_model_module_version": "1.5.0",
1440 | "_model_name": "FloatProgressModel",
1441 | "_view_count": null,
1442 | "_view_module": "@jupyter-widgets/controls",
1443 | "_view_module_version": "1.5.0",
1444 | "_view_name": "ProgressView",
1445 | "bar_style": "success",
1446 | "description": "Downloading: 100%",
1447 | "description_tooltip": null,
1448 | "layout": "IPY_MODEL_e8db38407d4f4525ba87dafb35c67a7d",
1449 | "max": 508,
1450 | "min": 0,
1451 | "orientation": "horizontal",
1452 | "style": "IPY_MODEL_ff8421ceeeb84863a79a95137d57e3a7",
1453 | "value": 508
1454 | }
1455 | },
1456 | "dcc661b801f940139925b83564e8f282": {
1457 | "model_module": "@jupyter-widgets/base",
1458 | "model_module_version": "1.2.0",
1459 | "model_name": "LayoutModel",
1460 | "state": {
1461 | "_model_module": "@jupyter-widgets/base",
1462 | "_model_module_version": "1.2.0",
1463 | "_model_name": "LayoutModel",
1464 | "_view_count": null,
1465 | "_view_module": "@jupyter-widgets/base",
1466 | "_view_module_version": "1.2.0",
1467 | "_view_name": "LayoutView",
1468 | "align_content": null,
1469 | "align_items": null,
1470 | "align_self": null,
1471 | "border": null,
1472 | "bottom": null,
1473 | "display": null,
1474 | "flex": null,
1475 | "flex_flow": null,
1476 | "grid_area": null,
1477 | "grid_auto_columns": null,
1478 | "grid_auto_flow": null,
1479 | "grid_auto_rows": null,
1480 | "grid_column": null,
1481 | "grid_gap": null,
1482 | "grid_row": null,
1483 | "grid_template_areas": null,
1484 | "grid_template_columns": null,
1485 | "grid_template_rows": null,
1486 | "height": null,
1487 | "justify_content": null,
1488 | "justify_items": null,
1489 | "left": null,
1490 | "margin": null,
1491 | "max_height": null,
1492 | "max_width": null,
1493 | "min_height": null,
1494 | "min_width": null,
1495 | "object_fit": null,
1496 | "object_position": null,
1497 | "order": null,
1498 | "overflow": null,
1499 | "overflow_x": null,
1500 | "overflow_y": null,
1501 | "padding": null,
1502 | "right": null,
1503 | "top": null,
1504 | "visibility": null,
1505 | "width": null
1506 | }
1507 | },
1508 | "dd232800d1994d96816b47b1eb042df7": {
1509 | "model_module": "@jupyter-widgets/controls",
1510 | "model_module_version": "1.5.0",
1511 | "model_name": "HBoxModel",
1512 | "state": {
1513 | "_dom_classes": [],
1514 | "_model_module": "@jupyter-widgets/controls",
1515 | "_model_module_version": "1.5.0",
1516 | "_model_name": "HBoxModel",
1517 | "_view_count": null,
1518 | "_view_module": "@jupyter-widgets/controls",
1519 | "_view_module_version": "1.5.0",
1520 | "_view_name": "HBoxView",
1521 | "box_style": "",
1522 | "children": [
1523 | "IPY_MODEL_ba22ce2585f54900b21f7f31ed15e78a",
1524 | "IPY_MODEL_82320d113b0b40e1b038d3cf321b3433"
1525 | ],
1526 | "layout": "IPY_MODEL_6a940c5ee47e4a0fa6bd17899077b04c"
1527 | }
1528 | },
1529 | "e8db38407d4f4525ba87dafb35c67a7d": {
1530 | "model_module": "@jupyter-widgets/base",
1531 | "model_module_version": "1.2.0",
1532 | "model_name": "LayoutModel",
1533 | "state": {
1534 | "_model_module": "@jupyter-widgets/base",
1535 | "_model_module_version": "1.2.0",
1536 | "_model_name": "LayoutModel",
1537 | "_view_count": null,
1538 | "_view_module": "@jupyter-widgets/base",
1539 | "_view_module_version": "1.2.0",
1540 | "_view_name": "LayoutView",
1541 | "align_content": null,
1542 | "align_items": null,
1543 | "align_self": null,
1544 | "border": null,
1545 | "bottom": null,
1546 | "display": null,
1547 | "flex": null,
1548 | "flex_flow": null,
1549 | "grid_area": null,
1550 | "grid_auto_columns": null,
1551 | "grid_auto_flow": null,
1552 | "grid_auto_rows": null,
1553 | "grid_column": null,
1554 | "grid_gap": null,
1555 | "grid_row": null,
1556 | "grid_template_areas": null,
1557 | "grid_template_columns": null,
1558 | "grid_template_rows": null,
1559 | "height": null,
1560 | "justify_content": null,
1561 | "justify_items": null,
1562 | "left": null,
1563 | "margin": null,
1564 | "max_height": null,
1565 | "max_width": null,
1566 | "min_height": null,
1567 | "min_width": null,
1568 | "object_fit": null,
1569 | "object_position": null,
1570 | "order": null,
1571 | "overflow": null,
1572 | "overflow_x": null,
1573 | "overflow_y": null,
1574 | "padding": null,
1575 | "right": null,
1576 | "top": null,
1577 | "visibility": null,
1578 | "width": null
1579 | }
1580 | },
1581 | "fb5ba4132e1e455ea0b38556501346c8": {
1582 | "model_module": "@jupyter-widgets/base",
1583 | "model_module_version": "1.2.0",
1584 | "model_name": "LayoutModel",
1585 | "state": {
1586 | "_model_module": "@jupyter-widgets/base",
1587 | "_model_module_version": "1.2.0",
1588 | "_model_name": "LayoutModel",
1589 | "_view_count": null,
1590 | "_view_module": "@jupyter-widgets/base",
1591 | "_view_module_version": "1.2.0",
1592 | "_view_name": "LayoutView",
1593 | "align_content": null,
1594 | "align_items": null,
1595 | "align_self": null,
1596 | "border": null,
1597 | "bottom": null,
1598 | "display": null,
1599 | "flex": null,
1600 | "flex_flow": null,
1601 | "grid_area": null,
1602 | "grid_auto_columns": null,
1603 | "grid_auto_flow": null,
1604 | "grid_auto_rows": null,
1605 | "grid_column": null,
1606 | "grid_gap": null,
1607 | "grid_row": null,
1608 | "grid_template_areas": null,
1609 | "grid_template_columns": null,
1610 | "grid_template_rows": null,
1611 | "height": null,
1612 | "justify_content": null,
1613 | "justify_items": null,
1614 | "left": null,
1615 | "margin": null,
1616 | "max_height": null,
1617 | "max_width": null,
1618 | "min_height": null,
1619 | "min_width": null,
1620 | "object_fit": null,
1621 | "object_position": null,
1622 | "order": null,
1623 | "overflow": null,
1624 | "overflow_x": null,
1625 | "overflow_y": null,
1626 | "padding": null,
1627 | "right": null,
1628 | "top": null,
1629 | "visibility": null,
1630 | "width": null
1631 | }
1632 | },
1633 | "ff8421ceeeb84863a79a95137d57e3a7": {
1634 | "model_module": "@jupyter-widgets/controls",
1635 | "model_module_version": "1.5.0",
1636 | "model_name": "ProgressStyleModel",
1637 | "state": {
1638 | "_model_module": "@jupyter-widgets/controls",
1639 | "_model_module_version": "1.5.0",
1640 | "_model_name": "ProgressStyleModel",
1641 | "_view_count": null,
1642 | "_view_module": "@jupyter-widgets/base",
1643 | "_view_module_version": "1.2.0",
1644 | "_view_name": "StyleView",
1645 | "bar_color": null,
1646 | "description_width": "initial"
1647 | }
1648 | }
1649 | },
1650 | "version_major": 2,
1651 | "version_minor": 0
1652 | }
1653 | }
1654 | },
1655 | "nbformat": 4,
1656 | "nbformat_minor": 4
1657 | }
1658 |
--------------------------------------------------------------------------------
/flaubert_train_predict.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
8 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
9 | },
10 | "outputs": [],
11 | "source": [
12 | "import os, time, datetime\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "from tqdm import tqdm\n",
16 | "import random\n",
17 | "import logging\n",
18 | "tqdm.pandas()\n",
19 | "import seaborn as sns\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "\n",
22 | "#NN Packages\n",
23 | "import torch\n",
24 | "import torch.nn as nn\n",
25 | "from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler\n",
26 | "\n",
27 | "logger = logging.getLogger(__name__)\n",
28 | "\n",
29 | "\n",
30 | "if torch.cuda.is_available(): \n",
31 | "\n",
32 | " # Tell PyTorch to use the GPU. \n",
33 | " device = torch.device(\"cuda\")\n",
34 | "\n",
35 | " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n",
36 | "\n",
37 | " print('We will use the GPU:', torch.cuda.get_device_name(0))\n",
38 | "\n",
39 | "# If not...\n",
40 | "else:\n",
41 | " print('No GPU available, using the CPU instead.')\n",
42 | " device = torch.device(\"cpu\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "# Processing text data"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "def format_time(elapsed):\n",
59 | " '''\n",
60 | " Takes a time in seconds and returns a string hh:mm:ss\n",
61 | " '''\n",
62 | " # Round to the nearest second.\n",
63 | " elapsed_rounded = int(round((elapsed)))\n",
64 | " \n",
65 | " # Format as hh:mm:ss\n",
66 | " return str(datetime.timedelta(seconds=elapsed_rounded))\n",
67 | "\n",
68 | "class SigirPreprocess():\n",
69 | " \n",
70 | " \n",
71 | " def __init__(self, text_data_path):\n",
72 | " self.text_data_path = text_data_path\n",
73 | " self.train = None\n",
74 | " self.dict_code_to_id = {}\n",
75 | " self.dict_id_to_code = {}\n",
76 | " self.list_tags = {}\n",
77 | " self.sentences = []\n",
78 | " self.labels = []\n",
79 | " self.text_col = None\n",
80 | " self.X_test = None\n",
81 | " \n",
82 | " \n",
83 | " def prepare_data(self ):\n",
84 | " \n",
85 | " #loading the train data and test data\n",
86 | " catalog_eng = pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n",
87 | " X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n",
88 | " Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n",
89 | " self.list_tags = list(Y_train['Prdtypecode'].unique())\n",
90 | " \n",
91 | " for i,tag in enumerate(self.list_tags):\n",
92 | " self.dict_code_to_id[tag] = i \n",
93 | " self.dict_id_to_code[i]=tag\n",
94 | " \n",
95 | " #map \n",
96 | " Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n",
97 | " \n",
98 | " #merge the train\n",
99 | " train=pd.merge(left=X_train,right=Y_train,\n",
100 | " how='left',left_on=['Integer_id','Image_id','Product_id'],\n",
101 | " right_on=['Integer_id','Image_id','Product_id'])\n",
102 | " prod_map=pd.Series(catalog_eng['Top level category'].values,\n",
103 | " index=catalog_eng['Prdtypecode']).to_dict()\n",
104 | " \n",
105 | " #creating the mapping\n",
106 | " train['product'] = train['Prdtypecode'].map(prod_map)\n",
107 | " train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n",
108 | " train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n",
109 | " train['title_desc_len']=train['title_len'] + train['desc_len']\n",
110 | " train.loc[train['Description'].isnull(), 'Description'] = \" \"\n",
111 | " train['title_desc'] = train['Title'] + \" \" + train['Description']\n",
112 | " \n",
113 | " self.train = train\n",
114 | " \n",
115 | " def get_sentences(self, text_col, remove_null_rows=False):\n",
116 | " self.text_col = text_col\n",
117 | " if remove_null_rows==True:\n",
118 | " new_train = self.train[self.train[text_col].notnull()]\n",
119 | "\n",
120 | " else:\n",
121 | " new_train = self.train.copy()\n",
122 | " \n",
123 | " self.sentences = new_train[text_col].values\n",
124 | " self.labels = new_train['labels'].values\n",
125 | " \n",
126 | " def prepare_test(self, text_col):\n",
127 | " X_test=pd.read_csv(self.text_data_path+\"data/x_test_task1_phase1.tsv\",sep=\"\\t\")\n",
128 | " X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n",
129 | " X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n",
130 | " self.X_test = X_test\n",
131 | " self.test_sentences = X_test[text_col].values\n",
132 | " "
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "text_col = 'title_desc'\n",
142 | "max_len = 256\n",
143 | "val_size = 0.1\n",
144 | "\n",
145 | "# model_str_dict = {'c':'camembert',\n",
146 | "# 'f':'flaubert'}\n",
147 | "# # 'f' for flaubert & 'c' for camembert\n",
148 | "# case='f' \n",
149 | "# model_str = model_str_dict[case]"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "Preprocess = SigirPreprocess(\"/../input/textphase1/\")\n",
159 | "Preprocess.prepare_data()\n",
160 | "Preprocess.get_sentences(text_col, True)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "sentences = Preprocess.sentences\n",
170 | "labels = Preprocess.labels\n",
171 | "print(\"Total number of sentences:{}, labels:{}\".format(len(sentences), len(labels)))"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "# sns.countplot(x='product', data=self.train)\n",
181 | "# sns.countplot(x='Prdtypecode', data=self.train)\n",
182 | "# sns.distplot(Preprocess.train['title_len'])\n",
183 | "# sns.distplot(Preprocess.train['title_desc_len'])\n",
184 | "# np.percentile(Preprocess.train['title_desc_len'], 99)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "len(Preprocess.dict_code_to_id)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "from transformers import XLMForSequenceClassification\n",
203 | "from transformers import FlaubertModel, FlaubertTokenizer,FlaubertForSequenceClassification,AdamW, FlaubertConfig \n",
204 | "from torch.nn import Dropout,Conv1d, Linear\n",
205 | "from transformers.modeling_utils import SequenceSummary"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "# a1 = sentences[0]\n",
215 | "# max_len = 40\n",
216 | "# modelname = 'flaubert-base-cased'\n",
217 | "# tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)\n",
218 | "\n",
219 | "# encoded_dict = tokenizer.encode_plus(\n",
220 | "# a1, # Sentence to encode.\n",
221 | "# add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
222 | "# max_length = max_len, # Pad & truncate all sentences.\n",
223 | "# pad_to_max_length = True,\n",
224 | "# return_attention_mask = True, # Construct attn. masks.\n",
225 | "# return_tensors = 'pt', # Return pytorch tensors.\n",
226 | "# )\n",
227 | "\n",
228 | "\n",
229 | "# iid = encoded_dict['input_ids']\n",
230 | "# mask = encoded_dict['attention_mask']\n",
231 | "\n",
232 | "# iid,mask\n",
233 | "\n",
234 | "# # modelname = 'flaubert-base-cased'\n",
235 | "\n",
236 | "# model = CustFlaubertForSequenceClassification.from_pretrained(\n",
237 | "# modelname, # Use the 12-layer BERT model, with an uncased vocab.\n",
238 | "# # num_labels = len(Preprocess.dict_code_to_id), # The number of output labels--2 for binary classification.\n",
239 | "# # You can increase this for multi-class tasks. \n",
240 | "# output_attentions = False, # Whether the model returns attentions weights.\n",
241 | "# output_hidden_states = False, # Whether the model returns all hidden-states.\n",
242 | "# )\n",
243 | "\n",
244 | "# outputs, embed1 = model(iid, token_type_ids=None, attention_mask=mask, \n",
245 | "# )"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "# #max length after tokenization\n",
255 | "# _max_len = 0\n",
256 | "# # For every sentence...\n",
257 | "# for sent in tqdm(sentences):\n",
258 | "\n",
259 | "# # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n",
260 | "# input_ids = tokenizer.encode(sent, add_special_tokens=True)\n",
261 | "\n",
262 | "# # Update the maximum sentence length.\n",
263 | "# _max_len = max(_max_len, len(input_ids))\n",
264 | "\n",
265 | "# print('Max sentence length: ', _max_len)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "modelname = 'flaubert-base-cased'\n",
275 | "tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "#function to prepare input for model training\n",
285 | "def prep_input(sentences,labels, max_len):\n",
286 | " input_ids = []\n",
287 | " attention_masks = []\n",
288 | "\n",
289 | " # For every sentence...\n",
290 | " for sent in tqdm(sentences):\n",
291 | " # `encode_plus` will:\n",
292 | " # (1) Tokenize the sentence.\n",
293 | " # (2) Prepend the tokenizer's start-of-sequence special token.\n",
294 | " # (3) Append the tokenizer's end-of-sequence special token.\n",
295 | " # (4) Map tokens to their IDs.\n",
296 | " # (5) Pad or truncate the sentence to `max_length`\n",
297 | " # (6) Create attention masks for [PAD] tokens.\n",
298 | " encoded_dict = tokenizer.encode_plus(\n",
299 | " sent, # Sentence to encode.\n",
300 | " add_special_tokens = True, # Add the tokenizer's special start/end tokens.\n",
301 | " max_length = max_len, # Pad & truncate all sentences.\n",
302 | " pad_to_max_length = True,\n",
303 | " return_attention_mask = True, # Construct attn. masks.\n",
304 | " return_tensors = 'pt', # Return pytorch tensors.\n",
305 | " )\n",
306 | "\n",
307 | " # Add the encoded sentence to the list. \n",
308 | " input_ids.append(encoded_dict['input_ids'])\n",
309 | "\n",
310 | " # And its attention mask (simply differentiates padding from non-padding).\n",
311 | " attention_masks.append(encoded_dict['attention_mask'])\n",
312 | "\n",
313 | " # Convert the lists into tensors.\n",
314 | " input_ids = torch.cat(input_ids, dim=0)\n",
315 | " attention_masks = torch.cat(attention_masks, dim=0)\n",
316 | " if labels is not None:\n",
317 | " labels = torch.tensor(labels)\n",
318 | " return input_ids,attention_masks,labels\n",
319 | " else:\n",
320 | " return input_ids,attention_masks\n",
321 | " "
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "input_ids,attention_masks,labels=prep_input(sentences,labels, max_len=max_len)\n",
331 | "# print('Original: ', sentences[0])\n",
332 | "# print('Token IDs:', input_ids[0])"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": [
341 | "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n",
342 | "\n",
343 | "\n",
344 | "\n",
345 | "#Validation split\n",
346 | "tr_inputs, val_inputs, tr_labels, val_labels = train_test_split(input_ids, labels,stratify=labels,\n",
347 | " random_state=2020, test_size=val_size)\n",
348 | "\n",
349 | "\n",
350 | "tr_masks, val_masks, u,v = train_test_split(attention_masks, labels,stratify=labels,\n",
351 | " random_state=2020, test_size=val_size)\n",
352 | "\n",
353 | "\n",
354 | "train_dataset=TensorDataset(tr_inputs, tr_masks, tr_labels)\n",
355 | "val_dataset=TensorDataset(val_inputs, val_masks, val_labels)\n",
356 | "train_sampler = RandomSampler(train_dataset) \n",
357 | "valid_sampler = SequentialSampler(val_dataset)\n",
358 | "\n",
359 | "\n",
360 | "# The DataLoader needs to know our batch size for training, so we specify it \n",
361 | "# here. For fine-tuning BERT on a specific task, the authors recommend a batch \n",
362 | "# size of 16 or 32.\n",
363 | "batch_size = 32\n",
364 | "\n",
365 | "# Create the DataLoaders for our training and validation sets.\n",
366 | "# We'll take training samples in random order. \n",
367 | "train_dataloader = DataLoader(\n",
368 | " train_dataset, # The training samples.\n",
369 | " sampler = train_sampler, # Select batches randomly\n",
370 | " batch_size = batch_size # Trains with this batch size.\n",
371 | " )\n",
372 | "\n",
373 | "# For validation the order doesn't matter, so we'll just read them sequentially.\n",
374 | "validation_dataloader = DataLoader(\n",
375 | " val_dataset, # The validation samples.\n",
376 | " sampler = valid_sampler, # Pull out batches sequentially.\n",
377 | " batch_size = batch_size # Evaluate with this batch size.\n",
378 | " )"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "# Function to calculate the accuracy of our predictions vs labels\n",
388 | "def flat_accuracy(preds, labels):\n",
389 | " pred_flat = np.argmax(preds, axis=1).flatten()\n",
390 | " labels_flat = labels.flatten()\n",
391 | " return np.sum(pred_flat == labels_flat) / len(labels_flat)"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {},
398 | "outputs": [],
399 | "source": [
400 | "num_classes = 27"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "class vec_output_FlaubertForSequenceClassification(FlaubertModel):\n",
410 | "    '''Flaubert encoder plus a linear head; forward also returns the vector fed to that head.'''\n",
411 | "\n",
412 | "    config_class = FlaubertConfig\n",
413 | "\n",
414 | "    def __init__(self, config):\n",
415 | "        super().__init__(config)\n",
416 | "        self.transformer = FlaubertModel(config)\n",
417 | "        self.sequence_summary = SequenceSummary(config)\n",
418 | "        self.init_weights()\n",
419 | "        # Created after init_weights(), so that call does not touch these two layers.\n",
420 | "        self.dropout = torch.nn.Dropout(0.1)\n",
421 | "        self.classifier = torch.nn.Linear(config.hidden_size, num_classes)\n",
422 | "\n",
423 | "    def forward(\n",
424 | "        self,\n",
425 | "        input_ids=None,\n",
426 | "        attention_mask=None,\n",
427 | "        langs=None,\n",
428 | "        token_type_ids=None,\n",
429 | "        position_ids=None,\n",
430 | "        lengths=None,\n",
431 | "        cache=None,\n",
432 | "        head_mask=None,\n",
433 | "        inputs_embeds=None,\n",
434 | "        labels=None,\n",
435 | "    ):\n",
436 | "        '''Encode the batch and classify it from position 0 of the encoder output.\n",
437 | "\n",
438 | "        `labels` is accepted but not used. Returns `(outputs, dense)`: `outputs`\n",
439 | "        is a tuple whose first item is the logits, followed by any further encoder\n",
440 | "        outputs; `dense` is the position-0 vector after dropout.\n",
441 | "        '''\n",
442 | "        encoder_kwargs = dict(\n",
443 | "            attention_mask=attention_mask,\n",
444 | "            langs=langs,\n",
445 | "            token_type_ids=token_type_ids,\n",
446 | "            position_ids=position_ids,\n",
447 | "            lengths=lengths,\n",
448 | "            cache=cache,\n",
449 | "            head_mask=head_mask,\n",
450 | "            inputs_embeds=inputs_embeds,\n",
451 | "        )\n",
452 | "        encoder_out = self.transformer(input_ids, **encoder_kwargs)\n",
453 | "\n",
454 | "        # Keep sequence position 0 of the encoder's first output as the per-example vector.\n",
455 | "        first_token = encoder_out[0][:, 0]\n",
456 | "\n",
457 | "        # Dropout is applied before the head; the same dropped-out vector is returned to the caller.\n",
458 | "        dense = self.dropout(first_token)\n",
459 | "        logits = self.classifier(dense)\n",
460 | "\n",
461 | "        # Keep whatever else the encoder returned (e.g. hidden states / attentions) after the logits.\n",
462 | "        outputs = (logits,) + encoder_out[1:]\n",
463 | "\n",
464 | "        return outputs, dense"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "len(Preprocess.dict_code_to_id)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {},
480 | "outputs": [],
481 | "source": [
482 | "modelname = 'flaubert-base-cased'\n",
483 | "\n",
484 | "model = vec_output_FlaubertForSequenceClassification.from_pretrained(\n",
485 | " modelname, # Name of the pretrained Flaubert checkpoint to load (set above).\n",
486 | " num_labels = len(Preprocess.dict_code_to_id), # The number of output labels: one per entry in the code-to-id mapping.\n",
487 | " # NOTE(review): the classifier head above is sized from `num_classes`; confirm it matches this value.\n",
488 | " output_attentions = False, # Whether the model returns attentions weights.\n",
489 | " output_hidden_states = False, # Whether the model returns all hidden-states.\n",
490 | ")\n",
491 | "\n",
492 | "model.cuda()"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": null,
498 | "metadata": {},
499 | "outputs": [],
500 | "source": [
501 | "optimizer = AdamW(model.parameters(),\n",
502 | " lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n",
503 | " eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n",
504 | " )\n"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": null,
510 | "metadata": {},
511 | "outputs": [],
512 | "source": [
513 | "from transformers import get_linear_schedule_with_warmup\n",
514 | "# Number of training epochs. The BERT authors recommend between 2 and 4.\n",
515 | "# This notebook trains for 12 epochs; watch the validation metrics printed\n",
516 | "# after each epoch for signs of over-fitting the training data.\n",
517 | "epochs = 12\n",
518 | "\n",
519 | "# Total number of training steps is [number of batches] x [number of epochs]. \n",
520 | "# (Note that this is not the same as the number of training samples).\n",
521 | "total_steps = len(train_dataloader) * epochs\n",
522 | "\n",
523 | "# Create the learning rate scheduler.\n",
524 | "scheduler = get_linear_schedule_with_warmup(optimizer, \n",
525 | " num_warmup_steps = 0, # Default value in run_glue.py\n",
526 | " num_training_steps = total_steps)"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {},
533 | "outputs": [],
534 | "source": [
535 | "import torch.nn as nn\n",
536 | "loss_criterion = nn.CrossEntropyLoss()\n"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": null,
542 | "metadata": {},
543 | "outputs": [],
544 | "source": [
545 | "from sklearn.metrics import f1_score\n",
546 | "# This training code is based on the `run_glue.py` script here:\n",
547 | "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n",
548 | "\n",
549 | "# Set the seed value all over the place to make this reproducible.\n",
550 | "seed_val = 42\n",
551 | "\n",
552 | "random.seed(seed_val)\n",
553 | "np.random.seed(seed_val)\n",
554 | "torch.manual_seed(seed_val)\n",
555 | "torch.cuda.manual_seed_all(seed_val)\n",
556 | "\n",
557 | "# Container for per-epoch statistics (the append at the end of the epoch loop is commented out).\n",
558 | "training_stats = []\n",
559 | "\n",
560 | "# Measure the total training time for the whole run.\n",
561 | "total_t0 = time.time()\n",
562 | "\n",
563 | "# Best validation macro-F1 so far; initialised once, before the loop, so it persists across epochs.\n",
564 | "best_f1 = 0\n",
565 | "\n",
566 | "for epoch_i in range(0, epochs):\n",
567 | "\n",
568 | "    # ========================================\n",
569 | "    #               Training\n",
570 | "    # ========================================\n",
571 | "\n",
572 | "    # Perform one full pass over the training set.\n",
573 | "\n",
574 | "    print(\"\")\n",
575 | "    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n",
576 | "    print('Training...')\n",
577 | "\n",
578 | "    # Feature vectors collected during this epoch (training / validation).\n",
579 | "    vec_output_tr = []\n",
580 | "    vec_output_val = []\n",
581 | "\n",
582 | "    # Measure how long the training epoch takes.\n",
583 | "    t0 = time.time()\n",
584 | "\n",
585 | "    # Reset the total loss for this epoch.\n",
586 | "    total_train_loss = 0\n",
587 | "\n",
588 | "    # Put the model into training mode. Don't be misled--the call to\n",
589 | "    # `train` just changes the *mode*, it doesn't *perform* the training.\n",
590 | "    # `dropout` and `batchnorm` layers behave differently during training\n",
591 | "    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n",
592 | "    # NOTE(review): the vectors stored in vec_output_tr below are therefore produced with dropout active.\n",
593 | "    model.train()\n",
594 | "\n",
595 | "    # For each batch of training data...\n",
596 | "    for step, batch in enumerate(train_dataloader):\n",
597 | "\n",
598 | "        # Progress update every 40 batches.\n",
599 | "        if step % 40 == 0 and not step == 0:\n",
600 | "            # Format the elapsed time as a readable string.\n",
601 | "            elapsed = format_time(time.time() - t0)\n",
602 | "\n",
603 | "            # Report progress.\n",
604 | "            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n",
605 | "\n",
606 | "        # Unpack this training batch from our dataloader.\n",
607 | "        #\n",
608 | "        # As we unpack the batch, we'll also copy each tensor to the GPU using the\n",
609 | "        # `to` method.\n",
610 | "        #\n",
611 | "        # `batch` contains three pytorch tensors:\n",
612 | "        #   [0]: input ids\n",
613 | "        #   [1]: attention masks\n",
614 | "        #   [2]: labels\n",
615 | "        b_input_ids = batch[0].to(device)\n",
616 | "        b_input_mask = batch[1].to(device)\n",
617 | "        b_labels = batch[2].to(device)\n",
618 | "\n",
619 | "        # Always clear any previously calculated gradients before performing a\n",
620 | "        # backward pass. PyTorch doesn't do this automatically because\n",
621 | "        # accumulating the gradients is \"convenient while training RNNs\".\n",
622 | "        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n",
623 | "        model.zero_grad()\n",
624 | "\n",
625 | "        # Perform a forward pass (evaluate the model on this training batch).\n",
626 | "        # The custom model defined earlier in this notebook returns a pair:\n",
627 | "        #   - a tuple whose first element is the logits (outputs prior to\n",
628 | "        #     activation), and\n",
629 | "        #   - the feature vector that was fed to the classifier.\n",
630 | "        # No loss is returned by the model; it is computed explicitly below\n",
631 | "        # with `loss_criterion`.\n",
632 | "        logits, vec = model(b_input_ids,\n",
633 | "                            token_type_ids=None,\n",
634 | "                            attention_mask=b_input_mask\n",
635 | "                            )\n",
636 | "        # The logits are the first element of the returned tuple.\n",
637 | "        logits = logits[0]\n",
638 | "\n",
639 | "        # Compute the training loss for this batch.\n",
640 | "        loss = loss_criterion(logits, b_labels)\n",
641 | "\n",
642 | "        # Save the feature vectors of this training batch.\n",
643 | "        # NOTE(review): rows are appended in the dataloader's iteration order; if it\n",
644 | "        # shuffles, they will not line up with the original row order -- confirm.\n",
645 | "        vec = vec.detach().cpu().numpy()\n",
646 | "        vec_output_tr.extend(vec)\n",
647 | "\n",
648 | "        # Accumulate the training loss over all of the batches so that we can\n",
649 | "        # calculate the average loss at the end. `loss` is a Tensor containing a\n",
650 | "        # single value; the `.item()` function just returns the Python value\n",
651 | "        # from the tensor.\n",
652 | "        total_train_loss += loss.item()\n",
653 | "\n",
654 | "        # Perform a backward pass to calculate the gradients.\n",
655 | "        loss.backward()\n",
656 | "\n",
657 | "        # Clip the norm of the gradients to 1.0.\n",
658 | "        # This is to help prevent the \"exploding gradients\" problem.\n",
659 | "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
660 | "\n",
661 | "        # Update parameters and take a step using the computed gradient.\n",
662 | "        # The optimizer dictates the \"update rule\"--how the parameters are\n",
663 | "        # modified based on their gradients, the learning rate, etc.\n",
664 | "        optimizer.step()\n",
665 | "\n",
666 | "        # Update the learning rate (the linear schedule is stepped once per\n",
667 | "        # batch, matching total_steps = batches x epochs).\n",
668 | "        scheduler.step()\n",
669 | "\n",
670 | "    # Calculate the average loss over all of the batches.\n",
671 | "    avg_train_loss = total_train_loss / len(train_dataloader)\n",
672 | "\n",
673 | "    # Measure how long this epoch took.\n",
674 | "    training_time = format_time(time.time() - t0)\n",
675 | "\n",
676 | "    print(\"\")\n",
677 | "    print(\" Average training loss: {0:.2f} \".format(avg_train_loss))\n",
678 | "    print(\" Training epcoh took: {:} \".format(training_time))\n",
679 | "\n",
680 | "    # ========================================\n",
681 | "    #               Validation\n",
682 | "    # ========================================\n",
683 | "    # After the completion of each training epoch, measure our performance on\n",
684 | "    # our validation set.\n",
685 | "\n",
686 | "    print(\"\")\n",
687 | "    print(\"Running Validation...\")\n",
688 | "\n",
689 | "    t0 = time.time()\n",
690 | "\n",
691 | "    # Put the model in evaluation mode--the dropout layers behave differently\n",
692 | "    # during evaluation.\n",
693 | "    model.eval()\n",
694 | "\n",
695 | "    # Tracking variables\n",
696 | "    total_eval_accuracy = 0\n",
697 | "    total_eval_loss = 0\n",
698 | "    nb_eval_steps = 0\n",
699 | "    predictions = []\n",
700 | "    true_labels = []\n",
701 | "\n",
702 | "\n",
703 | "    # Evaluate data for one epoch\n",
704 | "    for batch in validation_dataloader:\n",
705 | "\n",
706 | "        # Unpack this validation batch from our dataloader.\n",
707 | "        #\n",
708 | "        # As we unpack the batch, we'll also copy each tensor to the GPU using\n",
709 | "        # the `to` method.\n",
710 | "        #\n",
711 | "        # `batch` contains three pytorch tensors:\n",
712 | "        #   [0]: input ids\n",
713 | "        #   [1]: attention masks\n",
714 | "        #   [2]: labels\n",
715 | "        b_input_ids = batch[0].to(device)\n",
716 | "        b_input_mask = batch[1].to(device)\n",
717 | "        b_labels = batch[2].to(device)\n",
718 | "\n",
719 | "        # Tell pytorch not to bother with constructing the compute graph during\n",
720 | "        # the forward pass, since this is only needed for backprop (training).\n",
721 | "        with torch.no_grad():\n",
722 | "\n",
723 | "            # Forward pass, calculate logit predictions.\n",
724 | "            # token_type_ids is the same as the \"segment ids\", which\n",
725 | "            # differentiates sentence 1 and 2 in 2-sentence tasks.\n",
726 | "            # The model returns a pair: a tuple holding the logits first, and the\n",
727 | "            # feature vector that was fed to the classifier.\n",
728 | "            # The \"logits\" are the output values prior to applying an activation\n",
729 | "            # function like the softmax.\n",
730 | "            logits, vec = model(b_input_ids,\n",
731 | "                                token_type_ids=None,\n",
732 | "                                attention_mask=b_input_mask\n",
733 | "                                )\n",
734 | "\n",
735 | "            # The logits are the first element of the returned tuple.\n",
736 | "            logits = logits[0]\n",
737 | "\n",
738 | "            # Compute the validation loss for this batch.\n",
739 | "            loss = loss_criterion(logits, b_labels)\n",
740 | "\n",
741 | "\n",
742 | "        # Accumulate the validation loss.\n",
743 | "        total_eval_loss += loss.item()\n",
744 | "\n",
745 | "        # Move logits to CPU\n",
746 | "        logits = logits.detach().cpu().numpy()\n",
747 | "\n",
748 | "        # Predicted class = argmax over the logits; also move the labels to CPU\n",
749 | "        predicted_labels = np.argmax(logits, axis=1)\n",
750 | "        predictions.extend(predicted_labels)\n",
751 | "        label_ids = b_labels.to('cpu').numpy()\n",
752 | "        true_labels.extend(label_ids)\n",
753 | "\n",
754 | "        # Save the feature vectors of this validation batch.\n",
755 | "        vec = vec.detach().cpu().numpy()\n",
756 | "        vec_output_val.extend(vec)\n",
757 | "\n",
758 | "\n",
759 | "        # Calculate the accuracy for this batch of validation sentences, and\n",
760 | "        # accumulate it over all batches.\n",
761 | "        total_eval_accuracy += flat_accuracy(logits, label_ids)\n",
762 | "\n",
763 | "\n",
764 | "    # Report the final accuracy for this validation run.\n",
765 | "    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n",
766 | "    print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n",
767 | "\n",
768 | "    # Calculate the average loss over all of the batches.\n",
769 | "    avg_val_loss = total_eval_loss / len(validation_dataloader)\n",
770 | "\n",
771 | "    # Measure how long the validation run took.\n",
772 | "    validation_time = format_time(time.time() - t0)\n",
773 | "\n",
774 | "    print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n",
775 | "    print(\" Validation took: {:}\".format(validation_time))\n",
776 | "    curr_f1 = f1_score(true_labels, predictions, average='macro')\n",
777 | "    print(\"Validation F1-Score: {}\".format(curr_f1))\n",
778 | "    # Keep the weights and feature vectors only when this epoch improves on the best F1 so far.\n",
779 | "    if curr_f1 > best_f1:\n",
780 | "        best_f1 = curr_f1\n",
781 | "        torch.save(model.state_dict(), 'best_model.pt')\n",
782 | "        np.save('best_vec_train.npy', vec_output_tr)\n",
783 | "        np.save('best_vec_val.npy', vec_output_val)\n",
784 | "    # Record all statistics from this epoch (currently disabled).\n",
785 | "#     training_stats.append({\n",
786 | "#             'epoch': epoch_i + 1,\n",
787 | "#             'Training Loss': avg_train_loss,\n",
788 | "#             'Valid. Loss': avg_val_loss,\n",
789 | "#             'Valid. Accur.': avg_val_accuracy,\n",
790 | "#             'Training Time': training_time,\n",
791 | "#             'Validation Time': validation_time\n",
792 | "#         }\n",
793 | "#     )\n",
794 | "\n",
795 | "print(\"\")\n",
796 | "print(\"Training complete!\")\n",
797 | "\n",
798 | "print(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))"
799 | ]
800 | },
801 | {
802 | "cell_type": "code",
803 | "execution_count": null,
804 | "metadata": {},
805 | "outputs": [],
806 | "source": [
807 | "# Save model\n",
808 | "# try:\n",
809 | "# model_state = {'model': model,\n",
810 | "# 'state_dict': model.state_dict(),\n",
811 | "# 'optimizer' : optimizer.state_dict()}\n",
812 | "\n",
813 | "# torch.save(model_state, 'saved_model.pth')\n",
814 | "# except:\n",
815 | "# print('Error in saving model')"
816 | ]
817 | },
818 | {
819 | "cell_type": "markdown",
820 | "metadata": {},
821 | "source": [
822 | "**Test model on unseen data**"
823 | ]
824 | },
825 | {
826 | "cell_type": "markdown",
827 | "metadata": {},
828 | "source": [
829 | "# PREDICTIONS"
830 | ]
831 | },
832 | {
833 | "cell_type": "markdown",
834 | "metadata": {},
835 | "source": [
836 | "**UNCOMMENT THE BELOW CELL IF TRAINING IS NOT PERFORMED IN THIS RUN**"
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": null,
842 | "metadata": {},
843 | "outputs": [],
844 | "source": [
845 | "model_path = '/../working/best_model.pt'"
846 | ]
847 | },
848 | {
849 | "cell_type": "code",
850 | "execution_count": null,
851 | "metadata": {},
852 | "outputs": [],
853 | "source": [
854 | "## Change the **model path** accordingly\n",
855 | "# model_str = 'flaubert'\n",
856 | "# model_path_dict = {'camembert':'/../input/camembertvinodh/saved_model.pth',\n",
857 | "# 'flaubert':'/../input/flaubertekansh/saved_model.pth'}\n",
858 | "\n",
859 | "# model_path = model_path_dict[model_str]\n",
860 | "checkpoint = torch.load(model_path)\n",
861 | "# model = checkpoint['model']\n",
862 | "model.load_state_dict(checkpoint)"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": null,
868 | "metadata": {},
869 | "outputs": [],
870 | "source": [
871 | "def predict_pyt(model, prediction_dataloader):\n",
872 | "    \"\"\"Run `model` over every batch of `prediction_dataloader` without gradients.\n",
873 | "\n",
874 | "    model: pytorch model whose call returns `(outputs, vec)`, with the logits\n",
875 | "        at `outputs[0]`\n",
876 | "    prediction_dataloader: DataLoader object for which the predictions have to be made;\n",
877 | "        each batch starts with (input ids, attention mask), further items are ignored\n",
878 | "    return:\n",
879 | "        predictions:- Direct predicted labels (argmax of the logits)\n",
880 | "        softmax_logits:- logits which are normalized with softmax on output,\n",
881 | "            one CPU tensor per example\n",
882 | "        vec_outputs:- the `vec` rows returned by the model, as numpy arrays\"\"\"\n",
883 | "\n",
884 | "    # Put model in evaluation mode\n",
885 | "    model.eval()\n",
886 | "\n",
887 | "    # Tracking variables\n",
888 | "    predictions = []\n",
889 | "    softmax_logits = []\n",
890 | "    vec_outputs = []\n",
891 | "\n",
892 | "    # Predict\n",
893 | "    for batch in prediction_dataloader:\n",
894 | "\n",
895 | "        # Batches hold (ids, mask) or (ids, mask, labels); only the first two\n",
896 | "        # tensors are needed, so only those are moved to the device.\n",
897 | "        b_input_ids, b_input_mask = (t.to(device) for t in batch[:2])\n",
898 | "\n",
899 | "        # Telling the model not to compute or store gradients, saving memory and\n",
900 | "        # speeding up prediction\n",
901 | "        with torch.no_grad():\n",
902 | "            # Forward pass, calculate logit predictions\n",
903 | "            outputs, vec = model(b_input_ids, token_type_ids=None,\n",
904 | "                                 attention_mask=b_input_mask)\n",
905 | "\n",
906 | "        # The logits are the first element of the returned tuple.\n",
907 | "        logits = outputs[0]\n",
908 | "\n",
909 | "        # Softmax over the class dimension. The result is moved to the CPU right\n",
910 | "        # away so that the outputs of every batch are not kept in device memory;\n",
911 | "        # the rows stay torch tensors, so callers can still call .detach().cpu().numpy().\n",
912 | "        probs = torch.softmax(logits, dim=1)\n",
913 | "        probs = probs.detach().cpu()\n",
914 | "        softmax_logits.extend(probs)\n",
915 | "\n",
916 | "        # Predicted label = index of the largest logit\n",
917 | "        logits = logits.detach().cpu().numpy()\n",
918 | "        predictions.extend(np.argmax(logits, axis=1))\n",
919 | "\n",
920 | "        # vec_outputs saving\n",
921 | "        vec_outputs.extend(vec.detach().cpu().numpy())\n",
922 | "\n",
923 | "    print('DONE')\n",
924 | "    return predictions, softmax_logits, vec_outputs\n",
925 | "\n",
926 | "def predict_wrapper(model, sentences, max_len=max_len, batch_size=batch_size):\n",
927 | "    \"\"\"Build model inputs from `sentences` with `prep_input`, wrap them in a DataLoader and run `predict_pyt`.\"\"\"\n",
928 | "    ids, masks = prep_input(sentences, labels=None, max_len=max_len)\n",
929 | "    dataset = TensorDataset(ids, masks)\n",
930 | "    # A sequential sampler is used, so batches are drawn in dataset order.\n",
931 | "    loader = DataLoader(\n",
932 | "        dataset, sampler=SequentialSampler(dataset), batch_size=batch_size\n",
933 | "    )\n",
934 | "    return predict_pyt(model, loader)"
935 | ]
936 | },
937 | {
938 | "cell_type": "code",
939 | "execution_count": null,
940 | "metadata": {},
941 | "outputs": [],
942 | "source": [
943 | "## Prepare the test dataset\n",
944 | "batch_size = 32 \n",
945 | "\n",
946 | "Preprocess.prepare_test(text_col)\n",
947 | "test_sentences = Preprocess.test_sentences\n",
948 | "X_test_phase1= Preprocess.X_test"
949 | ]
950 | },
951 | {
952 | "cell_type": "code",
953 | "execution_count": null,
954 | "metadata": {},
955 | "outputs": [],
956 | "source": [
957 | "# Predictions of validation set which is randomly separated from train dataset\n",
958 | "start = time.time()\n",
959 | "predictions, val_softmax_logits , vec_outputs= predict_pyt(model, validation_dataloader)\n",
960 | "val_softmax_logits = np.array([ten.detach().cpu().numpy() for ten in val_softmax_logits])\n",
961 | "np.save('validation_set_softmax_logits.npy',val_softmax_logits)\n",
962 | "print('Time Taken Predict for val set: {:}'.format(format_time(time.time() - start)))"
963 | ]
964 | },
965 | {
966 | "cell_type": "code",
967 | "execution_count": null,
968 | "metadata": {},
969 | "outputs": [],
970 | "source": [
971 | "## Predictions of test dataset\n",
972 | "\n",
973 | "start = time.time()\n",
974 | "predictions, softmax_logits, vec_outputs = predict_wrapper(model, test_sentences)\n",
975 | "\n",
976 | "# Save the feature vectors and the softmax-normalized logits of the test set.\n",
977 | "np.save('best_vec_test.npy',vec_outputs)\n",
978 | "softmax_logits = np.array([ten.detach().cpu().numpy() for ten in softmax_logits])\n",
979 | "np.save('X_test_phase1_softmax_logits.npy',softmax_logits)\n",
980 | "print('length of predictions {}'.format(len(predictions)))\n",
981 | "print('Time Taken Predict for test set: {:}'.format(format_time(time.time() - start)))"
982 | ]
983 | },
984 | {
985 | "cell_type": "code",
986 | "execution_count": null,
987 | "metadata": {},
988 | "outputs": [],
989 | "source": [
990 | "X_test_phase1['prediction_model']= predictions\n",
991 | "X_test_phase1['Prdtypecode']=X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)\n",
992 | "print(X_test_phase1['Prdtypecode'].value_counts())\n",
993 | "X_test_phase1=X_test_phase1.drop(['prediction_model','Title','Description'],axis=1)"
994 | ]
995 | },
996 | {
997 | "cell_type": "code",
998 | "execution_count": null,
999 | "metadata": {},
1000 | "outputs": [],
1001 | "source": [
1002 | "X_test_phase1.to_csv('y_test_task1_phase1_pred.tsv',sep='\\t',index=False)"
1003 | ]
1004 | }
1005 | ],
1006 | "metadata": {
1007 | "kernelspec": {
1008 | "display_name": "Python 3",
1009 | "language": "python",
1010 | "name": "python3"
1011 | },
1012 | "language_info": {
1013 | "codemirror_mode": {
1014 | "name": "ipython",
1015 | "version": 3
1016 | },
1017 | "file_extension": ".py",
1018 | "mimetype": "text/x-python",
1019 | "name": "python",
1020 | "nbconvert_exporter": "python",
1021 | "pygments_lexer": "ipython3",
1022 | "version": "3.7.7"
1023 | },
1024 | "toc": {
1025 | "base_numbering": 1,
1026 | "nav_menu": {},
1027 | "number_sections": true,
1028 | "sideBar": true,
1029 | "skip_h1_title": false,
1030 | "title_cell": "Table of Contents",
1031 | "title_sidebar": "Contents",
1032 | "toc_cell": false,
1033 | "toc_position": {},
1034 | "toc_section_display": true,
1035 | "toc_window_display": false
1036 | },
1037 | "widgets": {
1038 | "application/vnd.jupyter.widget-state+json": {
1039 | "state": {
1040 | "06d2301a3d34440eb19a887fb51d562c": {
1041 | "model_module": "@jupyter-widgets/controls",
1042 | "model_module_version": "1.5.0",
1043 | "model_name": "FloatProgressModel",
1044 | "state": {
1045 | "_dom_classes": [],
1046 | "_model_module": "@jupyter-widgets/controls",
1047 | "_model_module_version": "1.5.0",
1048 | "_model_name": "FloatProgressModel",
1049 | "_view_count": null,
1050 | "_view_module": "@jupyter-widgets/controls",
1051 | "_view_module_version": "1.5.0",
1052 | "_view_name": "ProgressView",
1053 | "bar_style": "success",
1054 | "description": "Downloading: 100%",
1055 | "description_tooltip": null,
1056 | "layout": "IPY_MODEL_18fe6f38ca234379a17e44cd1fad50d4",
1057 | "max": 553238687,
1058 | "min": 0,
1059 | "orientation": "horizontal",
1060 | "style": "IPY_MODEL_54ca2b8f204b4760bccb27572ff7b74a",
1061 | "value": 553238687
1062 | }
1063 | },
1064 | "0f7e8f4e75fc4889a2dd464588c0516d": {
1065 | "model_module": "@jupyter-widgets/controls",
1066 | "model_module_version": "1.5.0",
1067 | "model_name": "HBoxModel",
1068 | "state": {
1069 | "_dom_classes": [],
1070 | "_model_module": "@jupyter-widgets/controls",
1071 | "_model_module_version": "1.5.0",
1072 | "_model_name": "HBoxModel",
1073 | "_view_count": null,
1074 | "_view_module": "@jupyter-widgets/controls",
1075 | "_view_module_version": "1.5.0",
1076 | "_view_name": "HBoxView",
1077 | "box_style": "",
1078 | "children": [
1079 | "IPY_MODEL_33dc04e218864811a4fe0c62ca737d83",
1080 | "IPY_MODEL_274606aec69a461f8c4259316b86c4af"
1081 | ],
1082 | "layout": "IPY_MODEL_f5c9c24c01114e168e90ed555fb9f05a"
1083 | }
1084 | },
1085 | "12a4548fdad44ea181868776c7616455": {
1086 | "model_module": "@jupyter-widgets/base",
1087 | "model_module_version": "1.2.0",
1088 | "model_name": "LayoutModel",
1089 | "state": {
1090 | "_model_module": "@jupyter-widgets/base",
1091 | "_model_module_version": "1.2.0",
1092 | "_model_name": "LayoutModel",
1093 | "_view_count": null,
1094 | "_view_module": "@jupyter-widgets/base",
1095 | "_view_module_version": "1.2.0",
1096 | "_view_name": "LayoutView",
1097 | "align_content": null,
1098 | "align_items": null,
1099 | "align_self": null,
1100 | "border": null,
1101 | "bottom": null,
1102 | "display": null,
1103 | "flex": null,
1104 | "flex_flow": null,
1105 | "grid_area": null,
1106 | "grid_auto_columns": null,
1107 | "grid_auto_flow": null,
1108 | "grid_auto_rows": null,
1109 | "grid_column": null,
1110 | "grid_gap": null,
1111 | "grid_row": null,
1112 | "grid_template_areas": null,
1113 | "grid_template_columns": null,
1114 | "grid_template_rows": null,
1115 | "height": null,
1116 | "justify_content": null,
1117 | "justify_items": null,
1118 | "left": null,
1119 | "margin": null,
1120 | "max_height": null,
1121 | "max_width": null,
1122 | "min_height": null,
1123 | "min_width": null,
1124 | "object_fit": null,
1125 | "object_position": null,
1126 | "order": null,
1127 | "overflow": null,
1128 | "overflow_x": null,
1129 | "overflow_y": null,
1130 | "padding": null,
1131 | "right": null,
1132 | "top": null,
1133 | "visibility": null,
1134 | "width": null
1135 | }
1136 | },
1137 | "18fe6f38ca234379a17e44cd1fad50d4": {
1138 | "model_module": "@jupyter-widgets/base",
1139 | "model_module_version": "1.2.0",
1140 | "model_name": "LayoutModel",
1141 | "state": {
1142 | "_model_module": "@jupyter-widgets/base",
1143 | "_model_module_version": "1.2.0",
1144 | "_model_name": "LayoutModel",
1145 | "_view_count": null,
1146 | "_view_module": "@jupyter-widgets/base",
1147 | "_view_module_version": "1.2.0",
1148 | "_view_name": "LayoutView",
1149 | "align_content": null,
1150 | "align_items": null,
1151 | "align_self": null,
1152 | "border": null,
1153 | "bottom": null,
1154 | "display": null,
1155 | "flex": null,
1156 | "flex_flow": null,
1157 | "grid_area": null,
1158 | "grid_auto_columns": null,
1159 | "grid_auto_flow": null,
1160 | "grid_auto_rows": null,
1161 | "grid_column": null,
1162 | "grid_gap": null,
1163 | "grid_row": null,
1164 | "grid_template_areas": null,
1165 | "grid_template_columns": null,
1166 | "grid_template_rows": null,
1167 | "height": null,
1168 | "justify_content": null,
1169 | "justify_items": null,
1170 | "left": null,
1171 | "margin": null,
1172 | "max_height": null,
1173 | "max_width": null,
1174 | "min_height": null,
1175 | "min_width": null,
1176 | "object_fit": null,
1177 | "object_position": null,
1178 | "order": null,
1179 | "overflow": null,
1180 | "overflow_x": null,
1181 | "overflow_y": null,
1182 | "padding": null,
1183 | "right": null,
1184 | "top": null,
1185 | "visibility": null,
1186 | "width": null
1187 | }
1188 | },
1189 | "26cc77465c0e4f30b086bf93a81f9386": {
1190 | "model_module": "@jupyter-widgets/controls",
1191 | "model_module_version": "1.5.0",
1192 | "model_name": "DescriptionStyleModel",
1193 | "state": {
1194 | "_model_module": "@jupyter-widgets/controls",
1195 | "_model_module_version": "1.5.0",
1196 | "_model_name": "DescriptionStyleModel",
1197 | "_view_count": null,
1198 | "_view_module": "@jupyter-widgets/base",
1199 | "_view_module_version": "1.2.0",
1200 | "_view_name": "StyleView",
1201 | "description_width": ""
1202 | }
1203 | },
1204 | "274606aec69a461f8c4259316b86c4af": {
1205 | "model_module": "@jupyter-widgets/controls",
1206 | "model_module_version": "1.5.0",
1207 | "model_name": "HTMLModel",
1208 | "state": {
1209 | "_dom_classes": [],
1210 | "_model_module": "@jupyter-widgets/controls",
1211 | "_model_module_version": "1.5.0",
1212 | "_model_name": "HTMLModel",
1213 | "_view_count": null,
1214 | "_view_module": "@jupyter-widgets/controls",
1215 | "_view_module_version": "1.5.0",
1216 | "_view_name": "HTMLView",
1217 | "description": "",
1218 | "description_tooltip": null,
1219 | "layout": "IPY_MODEL_ed03788fd9b14684b1d339664f56bfd5",
1220 | "placeholder": "",
1221 | "style": "IPY_MODEL_5fb92f13f2a5410b84cc9a7573e7da0a",
1222 | "value": " 896k/896k [00:01<00:00, 770kB/s]"
1223 | }
1224 | },
1225 | "2a6633db6b2946d6a6c8a66065e394cd": {
1226 | "model_module": "@jupyter-widgets/base",
1227 | "model_module_version": "1.2.0",
1228 | "model_name": "LayoutModel",
1229 | "state": {
1230 | "_model_module": "@jupyter-widgets/base",
1231 | "_model_module_version": "1.2.0",
1232 | "_model_name": "LayoutModel",
1233 | "_view_count": null,
1234 | "_view_module": "@jupyter-widgets/base",
1235 | "_view_module_version": "1.2.0",
1236 | "_view_name": "LayoutView",
1237 | "align_content": null,
1238 | "align_items": null,
1239 | "align_self": null,
1240 | "border": null,
1241 | "bottom": null,
1242 | "display": null,
1243 | "flex": null,
1244 | "flex_flow": null,
1245 | "grid_area": null,
1246 | "grid_auto_columns": null,
1247 | "grid_auto_flow": null,
1248 | "grid_auto_rows": null,
1249 | "grid_column": null,
1250 | "grid_gap": null,
1251 | "grid_row": null,
1252 | "grid_template_areas": null,
1253 | "grid_template_columns": null,
1254 | "grid_template_rows": null,
1255 | "height": null,
1256 | "justify_content": null,
1257 | "justify_items": null,
1258 | "left": null,
1259 | "margin": null,
1260 | "max_height": null,
1261 | "max_width": null,
1262 | "min_height": null,
1263 | "min_width": null,
1264 | "object_fit": null,
1265 | "object_position": null,
1266 | "order": null,
1267 | "overflow": null,
1268 | "overflow_x": null,
1269 | "overflow_y": null,
1270 | "padding": null,
1271 | "right": null,
1272 | "top": null,
1273 | "visibility": null,
1274 | "width": null
1275 | }
1276 | },
1277 | "33dc04e218864811a4fe0c62ca737d83": {
1278 | "model_module": "@jupyter-widgets/controls",
1279 | "model_module_version": "1.5.0",
1280 | "model_name": "FloatProgressModel",
1281 | "state": {
1282 | "_dom_classes": [],
1283 | "_model_module": "@jupyter-widgets/controls",
1284 | "_model_module_version": "1.5.0",
1285 | "_model_name": "FloatProgressModel",
1286 | "_view_count": null,
1287 | "_view_module": "@jupyter-widgets/controls",
1288 | "_view_module_version": "1.5.0",
1289 | "_view_name": "ProgressView",
1290 | "bar_style": "success",
1291 | "description": "Downloading: 100%",
1292 | "description_tooltip": null,
1293 | "layout": "IPY_MODEL_703da9466c0241519229161cb6ec5d87",
1294 | "max": 895731,
1295 | "min": 0,
1296 | "orientation": "horizontal",
1297 | "style": "IPY_MODEL_55d928f692d04a008a85a77abf0e46a0",
1298 | "value": 895731
1299 | }
1300 | },
1301 | "432e4a857a5d4151a4d1b5bc7b6bb4fb": {
1302 | "model_module": "@jupyter-widgets/controls",
1303 | "model_module_version": "1.5.0",
1304 | "model_name": "DescriptionStyleModel",
1305 | "state": {
1306 | "_model_module": "@jupyter-widgets/controls",
1307 | "_model_module_version": "1.5.0",
1308 | "_model_name": "DescriptionStyleModel",
1309 | "_view_count": null,
1310 | "_view_module": "@jupyter-widgets/base",
1311 | "_view_module_version": "1.2.0",
1312 | "_view_name": "StyleView",
1313 | "description_width": ""
1314 | }
1315 | },
1316 | "4c498d5d31d543ad9d4f63df61ce9332": {
1317 | "model_module": "@jupyter-widgets/controls",
1318 | "model_module_version": "1.5.0",
1319 | "model_name": "HBoxModel",
1320 | "state": {
1321 | "_dom_classes": [],
1322 | "_model_module": "@jupyter-widgets/controls",
1323 | "_model_module_version": "1.5.0",
1324 | "_model_name": "HBoxModel",
1325 | "_view_count": null,
1326 | "_view_module": "@jupyter-widgets/controls",
1327 | "_view_module_version": "1.5.0",
1328 | "_view_name": "HBoxView",
1329 | "box_style": "",
1330 | "children": [
1331 | "IPY_MODEL_7be53379597948ed83e132d9014abf53",
1332 | "IPY_MODEL_ddb8a6c765504d6ea10daf2da84a5c83"
1333 | ],
1334 | "layout": "IPY_MODEL_d8565e4e46924bbfbabe7b3eb8df8b79"
1335 | }
1336 | },
1337 | "54ca2b8f204b4760bccb27572ff7b74a": {
1338 | "model_module": "@jupyter-widgets/controls",
1339 | "model_module_version": "1.5.0",
1340 | "model_name": "ProgressStyleModel",
1341 | "state": {
1342 | "_model_module": "@jupyter-widgets/controls",
1343 | "_model_module_version": "1.5.0",
1344 | "_model_name": "ProgressStyleModel",
1345 | "_view_count": null,
1346 | "_view_module": "@jupyter-widgets/base",
1347 | "_view_module_version": "1.2.0",
1348 | "_view_name": "StyleView",
1349 | "bar_color": null,
1350 | "description_width": "initial"
1351 | }
1352 | },
1353 | "55d928f692d04a008a85a77abf0e46a0": {
1354 | "model_module": "@jupyter-widgets/controls",
1355 | "model_module_version": "1.5.0",
1356 | "model_name": "ProgressStyleModel",
1357 | "state": {
1358 | "_model_module": "@jupyter-widgets/controls",
1359 | "_model_module_version": "1.5.0",
1360 | "_model_name": "ProgressStyleModel",
1361 | "_view_count": null,
1362 | "_view_module": "@jupyter-widgets/base",
1363 | "_view_module_version": "1.2.0",
1364 | "_view_name": "StyleView",
1365 | "bar_color": null,
1366 | "description_width": "initial"
1367 | }
1368 | },
1369 | "5bc9b6ada49a4642a1cba622c93f8b62": {
1370 | "model_module": "@jupyter-widgets/base",
1371 | "model_module_version": "1.2.0",
1372 | "model_name": "LayoutModel",
1373 | "state": {
1374 | "_model_module": "@jupyter-widgets/base",
1375 | "_model_module_version": "1.2.0",
1376 | "_model_name": "LayoutModel",
1377 | "_view_count": null,
1378 | "_view_module": "@jupyter-widgets/base",
1379 | "_view_module_version": "1.2.0",
1380 | "_view_name": "LayoutView",
1381 | "align_content": null,
1382 | "align_items": null,
1383 | "align_self": null,
1384 | "border": null,
1385 | "bottom": null,
1386 | "display": null,
1387 | "flex": null,
1388 | "flex_flow": null,
1389 | "grid_area": null,
1390 | "grid_auto_columns": null,
1391 | "grid_auto_flow": null,
1392 | "grid_auto_rows": null,
1393 | "grid_column": null,
1394 | "grid_gap": null,
1395 | "grid_row": null,
1396 | "grid_template_areas": null,
1397 | "grid_template_columns": null,
1398 | "grid_template_rows": null,
1399 | "height": null,
1400 | "justify_content": null,
1401 | "justify_items": null,
1402 | "left": null,
1403 | "margin": null,
1404 | "max_height": null,
1405 | "max_width": null,
1406 | "min_height": null,
1407 | "min_width": null,
1408 | "object_fit": null,
1409 | "object_position": null,
1410 | "order": null,
1411 | "overflow": null,
1412 | "overflow_x": null,
1413 | "overflow_y": null,
1414 | "padding": null,
1415 | "right": null,
1416 | "top": null,
1417 | "visibility": null,
1418 | "width": null
1419 | }
1420 | },
1421 | "5fb92f13f2a5410b84cc9a7573e7da0a": {
1422 | "model_module": "@jupyter-widgets/controls",
1423 | "model_module_version": "1.5.0",
1424 | "model_name": "DescriptionStyleModel",
1425 | "state": {
1426 | "_model_module": "@jupyter-widgets/controls",
1427 | "_model_module_version": "1.5.0",
1428 | "_model_name": "DescriptionStyleModel",
1429 | "_view_count": null,
1430 | "_view_module": "@jupyter-widgets/base",
1431 | "_view_module_version": "1.2.0",
1432 | "_view_name": "StyleView",
1433 | "description_width": ""
1434 | }
1435 | },
1436 | "67f9de54d6e5434190bd07b7151d23b7": {
1437 | "model_module": "@jupyter-widgets/controls",
1438 | "model_module_version": "1.5.0",
1439 | "model_name": "FloatProgressModel",
1440 | "state": {
1441 | "_dom_classes": [],
1442 | "_model_module": "@jupyter-widgets/controls",
1443 | "_model_module_version": "1.5.0",
1444 | "_model_name": "FloatProgressModel",
1445 | "_view_count": null,
1446 | "_view_module": "@jupyter-widgets/controls",
1447 | "_view_module_version": "1.5.0",
1448 | "_view_name": "ProgressView",
1449 | "bar_style": "success",
1450 | "description": "Downloading: 100%",
1451 | "description_tooltip": null,
1452 | "layout": "IPY_MODEL_edcc338999ac45feaab03a86e2af75a9",
1453 | "max": 1496,
1454 | "min": 0,
1455 | "orientation": "horizontal",
1456 | "style": "IPY_MODEL_f575006dc6624157bfb408cced4e6ae6",
1457 | "value": 1496
1458 | }
1459 | },
1460 | "68cf808ab7e1428fa9acf6a9fd435b49": {
1461 | "model_module": "@jupyter-widgets/base",
1462 | "model_module_version": "1.2.0",
1463 | "model_name": "LayoutModel",
1464 | "state": {
1465 | "_model_module": "@jupyter-widgets/base",
1466 | "_model_module_version": "1.2.0",
1467 | "_model_name": "LayoutModel",
1468 | "_view_count": null,
1469 | "_view_module": "@jupyter-widgets/base",
1470 | "_view_module_version": "1.2.0",
1471 | "_view_name": "LayoutView",
1472 | "align_content": null,
1473 | "align_items": null,
1474 | "align_self": null,
1475 | "border": null,
1476 | "bottom": null,
1477 | "display": null,
1478 | "flex": null,
1479 | "flex_flow": null,
1480 | "grid_area": null,
1481 | "grid_auto_columns": null,
1482 | "grid_auto_flow": null,
1483 | "grid_auto_rows": null,
1484 | "grid_column": null,
1485 | "grid_gap": null,
1486 | "grid_row": null,
1487 | "grid_template_areas": null,
1488 | "grid_template_columns": null,
1489 | "grid_template_rows": null,
1490 | "height": null,
1491 | "justify_content": null,
1492 | "justify_items": null,
1493 | "left": null,
1494 | "margin": null,
1495 | "max_height": null,
1496 | "max_width": null,
1497 | "min_height": null,
1498 | "min_width": null,
1499 | "object_fit": null,
1500 | "object_position": null,
1501 | "order": null,
1502 | "overflow": null,
1503 | "overflow_x": null,
1504 | "overflow_y": null,
1505 | "padding": null,
1506 | "right": null,
1507 | "top": null,
1508 | "visibility": null,
1509 | "width": null
1510 | }
1511 | },
1512 | "703da9466c0241519229161cb6ec5d87": {
1513 | "model_module": "@jupyter-widgets/base",
1514 | "model_module_version": "1.2.0",
1515 | "model_name": "LayoutModel",
1516 | "state": {
1517 | "_model_module": "@jupyter-widgets/base",
1518 | "_model_module_version": "1.2.0",
1519 | "_model_name": "LayoutModel",
1520 | "_view_count": null,
1521 | "_view_module": "@jupyter-widgets/base",
1522 | "_view_module_version": "1.2.0",
1523 | "_view_name": "LayoutView",
1524 | "align_content": null,
1525 | "align_items": null,
1526 | "align_self": null,
1527 | "border": null,
1528 | "bottom": null,
1529 | "display": null,
1530 | "flex": null,
1531 | "flex_flow": null,
1532 | "grid_area": null,
1533 | "grid_auto_columns": null,
1534 | "grid_auto_flow": null,
1535 | "grid_auto_rows": null,
1536 | "grid_column": null,
1537 | "grid_gap": null,
1538 | "grid_row": null,
1539 | "grid_template_areas": null,
1540 | "grid_template_columns": null,
1541 | "grid_template_rows": null,
1542 | "height": null,
1543 | "justify_content": null,
1544 | "justify_items": null,
1545 | "left": null,
1546 | "margin": null,
1547 | "max_height": null,
1548 | "max_width": null,
1549 | "min_height": null,
1550 | "min_width": null,
1551 | "object_fit": null,
1552 | "object_position": null,
1553 | "order": null,
1554 | "overflow": null,
1555 | "overflow_x": null,
1556 | "overflow_y": null,
1557 | "padding": null,
1558 | "right": null,
1559 | "top": null,
1560 | "visibility": null,
1561 | "width": null
1562 | }
1563 | },
1564 | "7be53379597948ed83e132d9014abf53": {
1565 | "model_module": "@jupyter-widgets/controls",
1566 | "model_module_version": "1.5.0",
1567 | "model_name": "FloatProgressModel",
1568 | "state": {
1569 | "_dom_classes": [],
1570 | "_model_module": "@jupyter-widgets/controls",
1571 | "_model_module_version": "1.5.0",
1572 | "_model_name": "FloatProgressModel",
1573 | "_view_count": null,
1574 | "_view_module": "@jupyter-widgets/controls",
1575 | "_view_module_version": "1.5.0",
1576 | "_view_name": "ProgressView",
1577 | "bar_style": "success",
1578 | "description": "Downloading: 100%",
1579 | "description_tooltip": null,
1580 | "layout": "IPY_MODEL_91e5603df33e4f6fad3bc0b8fe67cb47",
1581 | "max": 1561415,
1582 | "min": 0,
1583 | "orientation": "horizontal",
1584 | "style": "IPY_MODEL_b4807859ba084927b1c01ad25559e790",
1585 | "value": 1561415
1586 | }
1587 | },
1588 | "91e35ac677a045c49b3f7dbf243b2e6a": {
1589 | "model_module": "@jupyter-widgets/base",
1590 | "model_module_version": "1.2.0",
1591 | "model_name": "LayoutModel",
1592 | "state": {
1593 | "_model_module": "@jupyter-widgets/base",
1594 | "_model_module_version": "1.2.0",
1595 | "_model_name": "LayoutModel",
1596 | "_view_count": null,
1597 | "_view_module": "@jupyter-widgets/base",
1598 | "_view_module_version": "1.2.0",
1599 | "_view_name": "LayoutView",
1600 | "align_content": null,
1601 | "align_items": null,
1602 | "align_self": null,
1603 | "border": null,
1604 | "bottom": null,
1605 | "display": null,
1606 | "flex": null,
1607 | "flex_flow": null,
1608 | "grid_area": null,
1609 | "grid_auto_columns": null,
1610 | "grid_auto_flow": null,
1611 | "grid_auto_rows": null,
1612 | "grid_column": null,
1613 | "grid_gap": null,
1614 | "grid_row": null,
1615 | "grid_template_areas": null,
1616 | "grid_template_columns": null,
1617 | "grid_template_rows": null,
1618 | "height": null,
1619 | "justify_content": null,
1620 | "justify_items": null,
1621 | "left": null,
1622 | "margin": null,
1623 | "max_height": null,
1624 | "max_width": null,
1625 | "min_height": null,
1626 | "min_width": null,
1627 | "object_fit": null,
1628 | "object_position": null,
1629 | "order": null,
1630 | "overflow": null,
1631 | "overflow_x": null,
1632 | "overflow_y": null,
1633 | "padding": null,
1634 | "right": null,
1635 | "top": null,
1636 | "visibility": null,
1637 | "width": null
1638 | }
1639 | },
1640 | "91e5603df33e4f6fad3bc0b8fe67cb47": {
1641 | "model_module": "@jupyter-widgets/base",
1642 | "model_module_version": "1.2.0",
1643 | "model_name": "LayoutModel",
1644 | "state": {
1645 | "_model_module": "@jupyter-widgets/base",
1646 | "_model_module_version": "1.2.0",
1647 | "_model_name": "LayoutModel",
1648 | "_view_count": null,
1649 | "_view_module": "@jupyter-widgets/base",
1650 | "_view_module_version": "1.2.0",
1651 | "_view_name": "LayoutView",
1652 | "align_content": null,
1653 | "align_items": null,
1654 | "align_self": null,
1655 | "border": null,
1656 | "bottom": null,
1657 | "display": null,
1658 | "flex": null,
1659 | "flex_flow": null,
1660 | "grid_area": null,
1661 | "grid_auto_columns": null,
1662 | "grid_auto_flow": null,
1663 | "grid_auto_rows": null,
1664 | "grid_column": null,
1665 | "grid_gap": null,
1666 | "grid_row": null,
1667 | "grid_template_areas": null,
1668 | "grid_template_columns": null,
1669 | "grid_template_rows": null,
1670 | "height": null,
1671 | "justify_content": null,
1672 | "justify_items": null,
1673 | "left": null,
1674 | "margin": null,
1675 | "max_height": null,
1676 | "max_width": null,
1677 | "min_height": null,
1678 | "min_width": null,
1679 | "object_fit": null,
1680 | "object_position": null,
1681 | "order": null,
1682 | "overflow": null,
1683 | "overflow_x": null,
1684 | "overflow_y": null,
1685 | "padding": null,
1686 | "right": null,
1687 | "top": null,
1688 | "visibility": null,
1689 | "width": null
1690 | }
1691 | },
1692 | "94ef6e8f88bb498783522af9621bf811": {
1693 | "model_module": "@jupyter-widgets/controls",
1694 | "model_module_version": "1.5.0",
1695 | "model_name": "HTMLModel",
1696 | "state": {
1697 | "_dom_classes": [],
1698 | "_model_module": "@jupyter-widgets/controls",
1699 | "_model_module_version": "1.5.0",
1700 | "_model_name": "HTMLModel",
1701 | "_view_count": null,
1702 | "_view_module": "@jupyter-widgets/controls",
1703 | "_view_module_version": "1.5.0",
1704 | "_view_name": "HTMLView",
1705 | "description": "",
1706 | "description_tooltip": null,
1707 | "layout": "IPY_MODEL_91e35ac677a045c49b3f7dbf243b2e6a",
1708 | "placeholder": "",
1709 | "style": "IPY_MODEL_432e4a857a5d4151a4d1b5bc7b6bb4fb",
1710 | "value": " 1.50k/1.50k [00:01<00:00, 1.23kB/s]"
1711 | }
1712 | },
1713 | "9f4e9ae9b7fc4e89ac6aa81af567a678": {
1714 | "model_module": "@jupyter-widgets/controls",
1715 | "model_module_version": "1.5.0",
1716 | "model_name": "HBoxModel",
1717 | "state": {
1718 | "_dom_classes": [],
1719 | "_model_module": "@jupyter-widgets/controls",
1720 | "_model_module_version": "1.5.0",
1721 | "_model_name": "HBoxModel",
1722 | "_view_count": null,
1723 | "_view_module": "@jupyter-widgets/controls",
1724 | "_view_module_version": "1.5.0",
1725 | "_view_name": "HBoxView",
1726 | "box_style": "",
1727 | "children": [
1728 | "IPY_MODEL_06d2301a3d34440eb19a887fb51d562c",
1729 | "IPY_MODEL_b31585a6d0574a0cb973bb3679a8168a"
1730 | ],
1731 | "layout": "IPY_MODEL_68cf808ab7e1428fa9acf6a9fd435b49"
1732 | }
1733 | },
1734 | "b31585a6d0574a0cb973bb3679a8168a": {
1735 | "model_module": "@jupyter-widgets/controls",
1736 | "model_module_version": "1.5.0",
1737 | "model_name": "HTMLModel",
1738 | "state": {
1739 | "_dom_classes": [],
1740 | "_model_module": "@jupyter-widgets/controls",
1741 | "_model_module_version": "1.5.0",
1742 | "_model_name": "HTMLModel",
1743 | "_view_count": null,
1744 | "_view_module": "@jupyter-widgets/controls",
1745 | "_view_module_version": "1.5.0",
1746 | "_view_name": "HTMLView",
1747 | "description": "",
1748 | "description_tooltip": null,
1749 | "layout": "IPY_MODEL_12a4548fdad44ea181868776c7616455",
1750 | "placeholder": "",
1751 | "style": "IPY_MODEL_d422f60f5607443da23dba147889e3b7",
1752 | "value": " 553M/553M [00:16<00:00, 34.0MB/s]"
1753 | }
1754 | },
1755 | "b4807859ba084927b1c01ad25559e790": {
1756 | "model_module": "@jupyter-widgets/controls",
1757 | "model_module_version": "1.5.0",
1758 | "model_name": "ProgressStyleModel",
1759 | "state": {
1760 | "_model_module": "@jupyter-widgets/controls",
1761 | "_model_module_version": "1.5.0",
1762 | "_model_name": "ProgressStyleModel",
1763 | "_view_count": null,
1764 | "_view_module": "@jupyter-widgets/base",
1765 | "_view_module_version": "1.2.0",
1766 | "_view_name": "StyleView",
1767 | "bar_color": null,
1768 | "description_width": "initial"
1769 | }
1770 | },
1771 | "d422f60f5607443da23dba147889e3b7": {
1772 | "model_module": "@jupyter-widgets/controls",
1773 | "model_module_version": "1.5.0",
1774 | "model_name": "DescriptionStyleModel",
1775 | "state": {
1776 | "_model_module": "@jupyter-widgets/controls",
1777 | "_model_module_version": "1.5.0",
1778 | "_model_name": "DescriptionStyleModel",
1779 | "_view_count": null,
1780 | "_view_module": "@jupyter-widgets/base",
1781 | "_view_module_version": "1.2.0",
1782 | "_view_name": "StyleView",
1783 | "description_width": ""
1784 | }
1785 | },
1786 | "d8565e4e46924bbfbabe7b3eb8df8b79": {
1787 | "model_module": "@jupyter-widgets/base",
1788 | "model_module_version": "1.2.0",
1789 | "model_name": "LayoutModel",
1790 | "state": {
1791 | "_model_module": "@jupyter-widgets/base",
1792 | "_model_module_version": "1.2.0",
1793 | "_model_name": "LayoutModel",
1794 | "_view_count": null,
1795 | "_view_module": "@jupyter-widgets/base",
1796 | "_view_module_version": "1.2.0",
1797 | "_view_name": "LayoutView",
1798 | "align_content": null,
1799 | "align_items": null,
1800 | "align_self": null,
1801 | "border": null,
1802 | "bottom": null,
1803 | "display": null,
1804 | "flex": null,
1805 | "flex_flow": null,
1806 | "grid_area": null,
1807 | "grid_auto_columns": null,
1808 | "grid_auto_flow": null,
1809 | "grid_auto_rows": null,
1810 | "grid_column": null,
1811 | "grid_gap": null,
1812 | "grid_row": null,
1813 | "grid_template_areas": null,
1814 | "grid_template_columns": null,
1815 | "grid_template_rows": null,
1816 | "height": null,
1817 | "justify_content": null,
1818 | "justify_items": null,
1819 | "left": null,
1820 | "margin": null,
1821 | "max_height": null,
1822 | "max_width": null,
1823 | "min_height": null,
1824 | "min_width": null,
1825 | "object_fit": null,
1826 | "object_position": null,
1827 | "order": null,
1828 | "overflow": null,
1829 | "overflow_x": null,
1830 | "overflow_y": null,
1831 | "padding": null,
1832 | "right": null,
1833 | "top": null,
1834 | "visibility": null,
1835 | "width": null
1836 | }
1837 | },
1838 | "ddb8a6c765504d6ea10daf2da84a5c83": {
1839 | "model_module": "@jupyter-widgets/controls",
1840 | "model_module_version": "1.5.0",
1841 | "model_name": "HTMLModel",
1842 | "state": {
1843 | "_dom_classes": [],
1844 | "_model_module": "@jupyter-widgets/controls",
1845 | "_model_module_version": "1.5.0",
1846 | "_model_name": "HTMLModel",
1847 | "_view_count": null,
1848 | "_view_module": "@jupyter-widgets/controls",
1849 | "_view_module_version": "1.5.0",
1850 | "_view_name": "HTMLView",
1851 | "description": "",
1852 | "description_tooltip": null,
1853 | "layout": "IPY_MODEL_5bc9b6ada49a4642a1cba622c93f8b62",
1854 | "placeholder": "",
1855 | "style": "IPY_MODEL_26cc77465c0e4f30b086bf93a81f9386",
1856 | "value": " 1.56M/1.56M [00:02<00:00, 629kB/s]"
1857 | }
1858 | },
1859 | "ed03788fd9b14684b1d339664f56bfd5": {
1860 | "model_module": "@jupyter-widgets/base",
1861 | "model_module_version": "1.2.0",
1862 | "model_name": "LayoutModel",
1863 | "state": {
1864 | "_model_module": "@jupyter-widgets/base",
1865 | "_model_module_version": "1.2.0",
1866 | "_model_name": "LayoutModel",
1867 | "_view_count": null,
1868 | "_view_module": "@jupyter-widgets/base",
1869 | "_view_module_version": "1.2.0",
1870 | "_view_name": "LayoutView",
1871 | "align_content": null,
1872 | "align_items": null,
1873 | "align_self": null,
1874 | "border": null,
1875 | "bottom": null,
1876 | "display": null,
1877 | "flex": null,
1878 | "flex_flow": null,
1879 | "grid_area": null,
1880 | "grid_auto_columns": null,
1881 | "grid_auto_flow": null,
1882 | "grid_auto_rows": null,
1883 | "grid_column": null,
1884 | "grid_gap": null,
1885 | "grid_row": null,
1886 | "grid_template_areas": null,
1887 | "grid_template_columns": null,
1888 | "grid_template_rows": null,
1889 | "height": null,
1890 | "justify_content": null,
1891 | "justify_items": null,
1892 | "left": null,
1893 | "margin": null,
1894 | "max_height": null,
1895 | "max_width": null,
1896 | "min_height": null,
1897 | "min_width": null,
1898 | "object_fit": null,
1899 | "object_position": null,
1900 | "order": null,
1901 | "overflow": null,
1902 | "overflow_x": null,
1903 | "overflow_y": null,
1904 | "padding": null,
1905 | "right": null,
1906 | "top": null,
1907 | "visibility": null,
1908 | "width": null
1909 | }
1910 | },
1911 | "edcc338999ac45feaab03a86e2af75a9": {
1912 | "model_module": "@jupyter-widgets/base",
1913 | "model_module_version": "1.2.0",
1914 | "model_name": "LayoutModel",
1915 | "state": {
1916 | "_model_module": "@jupyter-widgets/base",
1917 | "_model_module_version": "1.2.0",
1918 | "_model_name": "LayoutModel",
1919 | "_view_count": null,
1920 | "_view_module": "@jupyter-widgets/base",
1921 | "_view_module_version": "1.2.0",
1922 | "_view_name": "LayoutView",
1923 | "align_content": null,
1924 | "align_items": null,
1925 | "align_self": null,
1926 | "border": null,
1927 | "bottom": null,
1928 | "display": null,
1929 | "flex": null,
1930 | "flex_flow": null,
1931 | "grid_area": null,
1932 | "grid_auto_columns": null,
1933 | "grid_auto_flow": null,
1934 | "grid_auto_rows": null,
1935 | "grid_column": null,
1936 | "grid_gap": null,
1937 | "grid_row": null,
1938 | "grid_template_areas": null,
1939 | "grid_template_columns": null,
1940 | "grid_template_rows": null,
1941 | "height": null,
1942 | "justify_content": null,
1943 | "justify_items": null,
1944 | "left": null,
1945 | "margin": null,
1946 | "max_height": null,
1947 | "max_width": null,
1948 | "min_height": null,
1949 | "min_width": null,
1950 | "object_fit": null,
1951 | "object_position": null,
1952 | "order": null,
1953 | "overflow": null,
1954 | "overflow_x": null,
1955 | "overflow_y": null,
1956 | "padding": null,
1957 | "right": null,
1958 | "top": null,
1959 | "visibility": null,
1960 | "width": null
1961 | }
1962 | },
1963 | "f575006dc6624157bfb408cced4e6ae6": {
1964 | "model_module": "@jupyter-widgets/controls",
1965 | "model_module_version": "1.5.0",
1966 | "model_name": "ProgressStyleModel",
1967 | "state": {
1968 | "_model_module": "@jupyter-widgets/controls",
1969 | "_model_module_version": "1.5.0",
1970 | "_model_name": "ProgressStyleModel",
1971 | "_view_count": null,
1972 | "_view_module": "@jupyter-widgets/base",
1973 | "_view_module_version": "1.2.0",
1974 | "_view_name": "StyleView",
1975 | "bar_color": null,
1976 | "description_width": "initial"
1977 | }
1978 | },
1979 | "f5c9c24c01114e168e90ed555fb9f05a": {
1980 | "model_module": "@jupyter-widgets/base",
1981 | "model_module_version": "1.2.0",
1982 | "model_name": "LayoutModel",
1983 | "state": {
1984 | "_model_module": "@jupyter-widgets/base",
1985 | "_model_module_version": "1.2.0",
1986 | "_model_name": "LayoutModel",
1987 | "_view_count": null,
1988 | "_view_module": "@jupyter-widgets/base",
1989 | "_view_module_version": "1.2.0",
1990 | "_view_name": "LayoutView",
1991 | "align_content": null,
1992 | "align_items": null,
1993 | "align_self": null,
1994 | "border": null,
1995 | "bottom": null,
1996 | "display": null,
1997 | "flex": null,
1998 | "flex_flow": null,
1999 | "grid_area": null,
2000 | "grid_auto_columns": null,
2001 | "grid_auto_flow": null,
2002 | "grid_auto_rows": null,
2003 | "grid_column": null,
2004 | "grid_gap": null,
2005 | "grid_row": null,
2006 | "grid_template_areas": null,
2007 | "grid_template_columns": null,
2008 | "grid_template_rows": null,
2009 | "height": null,
2010 | "justify_content": null,
2011 | "justify_items": null,
2012 | "left": null,
2013 | "margin": null,
2014 | "max_height": null,
2015 | "max_width": null,
2016 | "min_height": null,
2017 | "min_width": null,
2018 | "object_fit": null,
2019 | "object_position": null,
2020 | "order": null,
2021 | "overflow": null,
2022 | "overflow_x": null,
2023 | "overflow_y": null,
2024 | "padding": null,
2025 | "right": null,
2026 | "top": null,
2027 | "visibility": null,
2028 | "width": null
2029 | }
2030 | },
2031 | "f5d5b2c07f5745538d04f3968c244002": {
2032 | "model_module": "@jupyter-widgets/controls",
2033 | "model_module_version": "1.5.0",
2034 | "model_name": "HBoxModel",
2035 | "state": {
2036 | "_dom_classes": [],
2037 | "_model_module": "@jupyter-widgets/controls",
2038 | "_model_module_version": "1.5.0",
2039 | "_model_name": "HBoxModel",
2040 | "_view_count": null,
2041 | "_view_module": "@jupyter-widgets/controls",
2042 | "_view_module_version": "1.5.0",
2043 | "_view_name": "HBoxView",
2044 | "box_style": "",
2045 | "children": [
2046 | "IPY_MODEL_67f9de54d6e5434190bd07b7151d23b7",
2047 | "IPY_MODEL_94ef6e8f88bb498783522af9621bf811"
2048 | ],
2049 | "layout": "IPY_MODEL_2a6633db6b2946d6a6c8a66065e394cd"
2050 | }
2051 | }
2052 | },
2053 | "version_major": 2,
2054 | "version_minor": 0
2055 | }
2056 | }
2057 | },
2058 | "nbformat": 4,
2059 | "nbformat_minor": 4
2060 | }
2061 |
--------------------------------------------------------------------------------
/multi_modal_addition_fusion.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","trusted":true},"cell_type":"code","source":"import os, time, datetime\nimport numpy as np\nimport pandas as pd\nfrom tqdm.notebook import tqdm\nimport random\nimport logging\ntqdm.pandas()\nimport seaborn as sns\nfrom sklearn.model_selection import train_test_split\n\n#NN Packages\nimport torch\nimport torch.nn as nn\nfrom torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler\n\nlogger = logging.getLogger(__name__)\n\n\nif torch.cuda.is_available(): \n\n # Tell PyTorch to use the GPU. \n device = torch.device(\"cuda\")\n\n print('There are %d GPU(s) available.' % torch.cuda.device_count())\n\n print('We will use the GPU:', torch.cuda.get_device_name(0))\n\n# If not...\nelse:\n print('No GPU available, using the CPU instead.')\n device = torch.device(\"cpu\")\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def format_time(elapsed):\n '''\n Takes a time in seconds and returns a string hh:mm:ss\n '''\n # Round to the nearest second.\n elapsed_rounded = int(round((elapsed)))\n \n # Format as hh:mm:ss\n return str(datetime.timedelta(seconds=elapsed_rounded))\n\nclass SigirPreprocess():\n \n def __init__(self, text_data_path):\n self.text_data_path = text_data_path\n self.train = None\n self.dict_code_to_id = {}\n self.dict_id_to_code = {}\n self.list_tags = {}\n self.sentences = []\n self.labels = []\n self.text_col = None\n self.X_test = None\n def prepare_data(self ):\n catalog_eng= pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n \n self.list_tags = list(Y_train['Prdtypecode'].unique())\n for i,tag in enumerate(self.list_tags):\n self.dict_code_to_id[tag] = i 
\n self.dict_id_to_code[i]=tag\n print(self.dict_code_to_id)\n \n Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n train=pd.merge(left=X_train,right=Y_train,\n how='left',left_on=['Integer_id','Image_id','Product_id'],\n right_on=['Integer_id','Image_id','Product_id'])\n prod_map=pd.Series(catalog_eng['Top level category'].values,\n index=catalog_eng['Prdtypecode']).to_dict()\n\n train['product'] = train['Prdtypecode'].map(prod_map)\n train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n train['title_desc_len']=train['title_len'] + train['desc_len']\n train.loc[train['Description'].isnull(), 'Description'] = \" \"\n train['title_desc'] = train['Title'] + \" \" + train['Description']\n \n self.train = train\n \n def get_sentences(self, text_col, remove_null_rows=False):\n self.text_col = text_col\n if remove_null_rows==True:\n new_train = self.train[self.train[text_col].notnull()]\n\n else:\n new_train = self.train.copy()\n \n self.sentences = new_train[text_col].values\n self.labels = new_train['labels'].values\n \n def prepare_test(self, text_col):\n X_test=pd.read_csv(self.text_data_path+\"data/x_test_task1_phase1.tsv\",sep=\"\\t\")\n X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n self.X_test = X_test\n self.test_sentences = X_test[text_col].values\n ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"text_col = 'title_desc'\nmax_len = 256\nval_size = 0.1","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"Preprocess = SigirPreprocess(\"/kaggle/input/textphase1/\")\nPreprocess.prepare_data()\nPreprocess.get_sentences(text_col, 
True)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"sentences = Preprocess.sentences\nlabels = Preprocess.labels\nprint(\"Total number of sentences:{}, labels:{}\".format(len(sentences), len(labels)))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\n# print('Using Camembert')\n# tokenizer_cam = CamembertTokenizer.from_pretrained('camembert-base', do_lowercase=False)\n# print('Using Flaubert')\n# tokenizer_flau = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#function to prepare input for model training\ndef prep_input(sentences,labels, max_len,tokenizer):\n input_ids = []\n attention_masks = []\n\n # For every sentence...\n for sent in tqdm(sentences):\n # `encode_plus` will:\n # (1) Tokenize the sentence.\n # (2) Prepend the `[CLS]` token to the start.\n # (3) Append the `[SEP]` token to the end.\n # (4) Map tokens to their IDs.\n # (5) Pad or truncate the sentence to `max_length`\n # (6) Create attention masks for [PAD] tokens.\n encoded_dict = tokenizer.encode_plus(\n sent, # Sentence to encode.\n add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n max_length = max_len, # Pad & truncate all sentences.\n pad_to_max_length = True,\n return_attention_mask = True, # Construct attn. masks.\n return_tensors = 'pt', # Return pytorch tensors.\n )\n\n # Add the encoded sentence to the list. 
\n input_ids.append(encoded_dict['input_ids'])\n\n # And its attention mask (simply differentiates padding from non-padding).\n attention_masks.append(encoded_dict['attention_mask'])\n\n # Convert the lists into tensors.\n input_ids = torch.cat(input_ids, dim=0)\n attention_masks = torch.cat(attention_masks, dim=0)\n if labels is not None:\n labels = torch.tensor(labels)\n return input_ids,attention_masks,labels\n else:\n return input_ids,attention_masks\n ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# input_ids_cam,attention_masks_cam,labels_cam=prep_input(sentences,labels, max_len,tokenizer_cam)\n# # print('Original: ', sentences[0])\n# # print('Token IDs:', input_ids[0]) ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# input_ids_flau,attention_masks_flau,labels_flau=prep_input(sentences,labels, max_len,tokenizer_flau)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# tr_inputs_cam, val_inputs_cam, _,_ = train_test_split(input_ids_cam, labels_cam,stratify=labels_cam,\n# random_state=2020, test_size=val_size)\n# tr_masks_cam, val_masks_cam, _,_ = train_test_split(attention_masks_cam, labels,stratify=labels,\n# random_state=2020, test_size=val_size)\n\n# tr_inputs_flau, val_inputs_flau, _,_ = train_test_split(input_ids_flau, labels,stratify=labels,\n# random_state=2020, test_size=val_size)\n# tr_masks_flau, val_masks_flau, _,_ = train_test_split(attention_masks_flau, labels,stratify=labels_flau,\n# random_state=2020, test_size=val_size)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# torch.save(tr_inputs_cam, \"tr_inputs_cam.pt\")\n# torch.save(val_inputs_cam, \"val_inputs_cam.pt\")\n# torch.save(tr_masks_cam, \"tr_masks_cam.pt\")\n# torch.save(val_masks_cam, \"val_masks_cam.pt\")\n\n# torch.save(tr_inputs_flau, \"tr_inputs_flau.pt\")\n# torch.save(val_inputs_flau, 
\"val_inputs_flau.pt\")\n# torch.save(tr_masks_flau, \"tr_masks_flau.pt\")\n# torch.save(val_masks_flau, \"val_masks_flau.pt\")\n\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# !mkdir -p /root/.kaggle/\n# !cp ../input/myjson/kaggle.json /root/.kaggle/\n# !chmod 600 /root/.kaggle/kaggle.json","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# data = '''{\n# \"title\": \"Multi_modal_input_text\",\n# \"id\": \"deepbugger/Multi-modal-input-text\",\n# \"licenses\": [\n# {\n# \"name\": \"CC0-1.0\"\n# }\n# ]\n# }\n# '''\n# text_file = open(\"/kaggle/working/dataset-metadata.json\", 'w+')\n# n = text_file.write(data)\n# text_file.close()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# !kaggle datasets create -p /kaggle/working\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"text_input='../input/multi-modal-input-text/'\ntr_inputs_cam=torch.load(text_input+\"tr_inputs_cam.pt\")\nval_inputs_cam=torch.load(text_input+\"val_inputs_cam.pt\")\ntr_masks_cam=torch.load( text_input+\"tr_masks_cam.pt\")\nval_masks_cam=torch.load( text_input+\"val_masks_cam.pt\")\n\ntr_inputs_flau=torch.load(text_input+\"tr_inputs_flau.pt\")\nval_inputs_flau=torch.load(text_input+\"val_inputs_flau.pt\")\ntr_masks_flau=torch.load(text_input+\"tr_masks_flau.pt\")\nval_masks_flau=torch.load(text_input+\"val_masks_flau.pt\")","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install pretrainedmodels","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from transformers import CamembertConfig, CamembertTokenizer, CamembertModel, CamembertForSequenceClassification, AdamW\nfrom transformers import FlaubertModel, FlaubertTokenizer,FlaubertForSequenceClassification,AdamW, FlaubertConfig \nfrom transformers.modeling_roberta 
import RobertaClassificationHead\nfrom transformers.modeling_utils import SequenceSummary","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from torch.nn import functional as F\nimport torch.nn as nn\nimport pretrainedmodels\n\nclass SEResnext50_32x4d(nn.Module):\n def __init__(self, pretrained='imagenet'):\n super(SEResnext50_32x4d, self).__init__()\n \n self.base_model = pretrainedmodels.__dict__[\"se_resnext50_32x4d\"](pretrained=None)\n if pretrained is not None:\n self.base_model.load_state_dict(\n torch.load(\"../input/pretrained-model-weights-pytorch/se_resnext50_32x4d-a260b3a4.pth\"\n )\n )\n self.l0 = nn.Linear(2048, 27)\n \n def forward(self, image):\n batch_size, _, _, _ = image.shape\n \n x = self.base_model.features(image)\n x = F.adaptive_avg_pool2d(x, 1).reshape(batch_size, -1)\n \n out = self.l0(x)\n\n return out","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class Identity(nn.Module):\n def __init__(self):\n super(Identity, self).__init__()\n \n def forward(self, x):\n return x","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# num_classes=27\n# img_model = SEResnext50_32x4d(pretrained=None)\n# img_model.load_state_dict(torch.load('../input/seresnext2048/best_model.pt'))\n# img_model.cuda()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# img_model.l0=Identity()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# img_model","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# for params in img_model.parameters():\n# params.requires_grad=False\n ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class vec_output_CamembertForSequenceClassification(CamembertModel):\n config_class = CamembertConfig\n\n def __init__(self, config):\n 
super().__init__(config)\n self.num_labels = config.num_labels\n\n self.roberta = CamembertModel(config)\n self.dense = nn.Linear(256*config.hidden_size, config.hidden_size)\n self.dropout = nn.Dropout(0.1)\n self.out_proj = nn.Linear(config.hidden_size, config.num_labels)\n self.init_weights()\n\n\n def forward(\n self,\n input_ids=None,\n attention_mask=None,\n token_type_ids=None,\n position_ids=None,\n head_mask=None,\n inputs_embeds=None,\n labels=None,\n output_attentions=None,\n output_hidden_states=None,\n ):\n outputs = self.roberta(\n input_ids,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n# output_attentions=output_attentions,\n# output_hidden_states=output_hidden_states,\n )\n sequence_output = outputs[0] #(B,256,768)\n x = sequence_output.view(sequence_output.shape[0], 256*768)\n x = self.dense(x) # 256*768 -> 768\n feat= torch.tanh(x) \n logits = self.out_proj(feat) # 768 -> 27\n outputs = (logits,) + outputs[2:]\n \n return outputs # logits, (hidden_states), (attentions) -- no loss is computed here; `labels` is ignored","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"num_classes = 27\n\nclass vec_output_FlaubertForSequenceClassification(FlaubertModel):\n \n config_class = FlaubertConfig\n \n\n def __init__(self, config):\n super().__init__(config)\n self.transformer = FlaubertModel(config)\n self.sequence_summary = SequenceSummary(config)\n self.init_weights()\n self.dropout = torch.nn.Dropout(0.1)\n self.classifier = torch.nn.Linear(config.hidden_size, num_classes)\n\n\n def forward(\n self,\n input_ids=None,\n attention_mask=None,\n langs=None,\n token_type_ids=None,\n position_ids=None,\n lengths=None,\n cache=None,\n head_mask=None,\n inputs_embeds=None,\n labels=None,\n ):\n \n \n transformer_outputs = self.transformer(\n input_ids,\n attention_mask=attention_mask,\n langs=langs,\n token_type_ids=token_type_ids,\n 
position_ids=position_ids,\n lengths=lengths,\n cache=cache,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n )\n\n #output = self.dropout(output)\n output = transformer_outputs[0]\n vec = output[:,0]\n \n \n #logits\n dense = self.dropout(vec)\n \n #classifier\n logits = self.classifier(dense)\n \n outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here\n \n \n return outputs\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model = vec_output_CamembertForSequenceClassification.from_pretrained(\n# modelname, # Use the 12-layer BERT model, with an uncased vocab.\n# num_labels = len(Preprocess.dict_code_to_id), # The number of output labels--2 for binary classification.\n# # You can increase this for multi-class tasks. \n# output_attentions = False, # Whether the model returns attentions weights.\n# output_hidden_states = False, # Whether the model returns all hidden-states.\n# )","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model_path = '../input/camembert-vec-256m768-10ep/best_model.pt'\n# checkpoint = torch.load(model_path)\n# # model = checkpoint['model']\n# model.load_state_dict(checkpoint)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model.cuda()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# for param in model.parameters():\n# param.requires_grad=False","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model.out_proj=Identity()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"### Image data prep"},{"metadata":{"trusted":true},"cell_type":"code","source":"catalog_eng= pd.read_csv(\"/kaggle/input/textphase1/data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\nX_train= 
pd.read_csv(\"/kaggle/input/textphase1/data/X_train.tsv\",sep=\"\\t\")\nY_train= pd.read_csv(\"/kaggle/input/textphase1/data/Y_train.tsv\",sep=\"\\t\")\nX_test=pd.read_csv(\"/kaggle/input/textphase1/data/x_test_task1_phase1.tsv\",sep=\"\\t\")\ndict_code_to_id = {}\ndict_id_to_code={}\nlist_tags = list(Y_train['Prdtypecode'].unique())\n\nfor i,tag in enumerate(list_tags):\n dict_code_to_id[tag] = i \n dict_id_to_code[i]=tag\nY_train['labels']=Y_train['Prdtypecode'].map(dict_code_to_id)\ntrain=pd.merge(left=X_train,right=Y_train,\n how='left',left_on=['Integer_id','Image_id','Product_id'],\n right_on=['Integer_id','Image_id','Product_id'])\nprod_map=pd.Series(catalog_eng['Top level category'].values,index=catalog_eng['Prdtypecode']).to_dict()\ntrain['product']=train['Prdtypecode'].map(prod_map)\n\ndef get_img_path(img_id,prd_id,path):\n \n pattern = 'image'+'_'+str(img_id)+'_'+'product'+'_'+str(prd_id)+'.jpg'\n return path + pattern\ntrain_img = train[['Image_id','Product_id','labels','product']]\n\ntrain_img['image_path']=train_img.progress_apply(lambda x: get_img_path(x['Image_id'],x['Product_id'],\n path = '/kaggle/input/imagetrain/image_training/'),axis=1)\nX_test['image_path']=X_test.progress_apply(lambda x: get_img_path(x['Image_id'],x['Product_id'],\n path='/kaggle/input/imagetest/image_test/image_test_task1_phase1/'),axis=1)\ntrain_df, val_df, _, _ = train_test_split(train_img, train_img['labels'],random_state=2020, test_size = 0.1, stratify=train_img['labels'])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"input_size = 224 # for Resnt\n# Applying Transforms to the Data\nfrom torchvision import datasets, models, transforms\n\nimage_transforms = { \n 'train': transforms.Compose([\n transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),\n transforms.RandomRotation(degrees=15),\n transforms.RandomHorizontalFlip(),\n transforms.Resize(size=256),\n transforms.CenterCrop(size=input_size),\n 
transforms.ToTensor(),\n transforms.Normalize([0.485, 0.456, 0.406],\n [0.229, 0.224, 0.225])\n ]),\n 'valid': transforms.Compose([\n transforms.Resize(size=256),\n transforms.CenterCrop(size=input_size),\n transforms.ToTensor(),\n transforms.Normalize([0.485, 0.456, 0.406],\n [0.229, 0.224, 0.225])\n ]),\n 'test': transforms.Compose([\n transforms.Resize(size=256),\n transforms.CenterCrop(size=input_size),\n transforms.ToTensor(),\n transforms.Normalize([0.485, 0.456, 0.406],\n [0.229, 0.224, 0.225])\n ])\n}","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from torch.utils.data import Dataset, DataLoader, Subset\nimport cv2\nfrom PIL import Image\n\nclass FusionDataset(Dataset):\n \n def __init__(self,df,inputs_cam,masks_cam,inputs_flau,masks_flau,transform=None,mode='train'):\n self.df = df\n self.transform=transform\n self.mode=mode\n self.inputs_cam=inputs_cam\n self.masks_cam=masks_cam\n self.inputs_flau=inputs_flau\n self.masks_flau=masks_flau\n \n def __len__(self):\n return len(self.df)\n \n def __getitem__(self,idx):\n \n im_path = self.df.iloc[idx]['image_path']\n img = cv2.imread(im_path)\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n img=Image.fromarray(img)\n if self.transform is not None:\n img = self.transform(img)\n img=img.cuda()\n input_id_cam=self.inputs_cam[idx].cuda()\n input_mask_cam=self.masks_cam[idx].cuda()\n input_id_flau=self.inputs_flau[idx].cuda()\n input_mask_flau=self.masks_flau[idx].cuda()\n \n if self.mode=='test':\n return img,input_id_cam,input_mask_cam,input_id_flau,input_mask_flau\n else:\n# labels = torch.tensor(self.df.iloc[idx]['labels'])\n labels = torch.tensor(self.df.iloc[idx]['labels']).cuda() \n\n return img,input_id_cam,input_mask_cam,input_id_flau,input_mask_flau,labels","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"batch_size = 32 \nPreprocess.prepare_test(text_col)\ntest_sentences = 
Preprocess.test_sentences\nX_test_phase1= Preprocess.X_test","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# print('Using Camembert')\ntokenizer_cam = CamembertTokenizer.from_pretrained('camembert-base', do_lowercase=False)\n# print('Using Flaubert')\ntokenizer_flau = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)\n\ninput_ids_test_flau,attention_masks_test_flau=prep_input(test_sentences,labels=None, max_len=max_len,tokenizer = tokenizer_flau)\ninput_ids_test_cam,attention_masks_test_cam=prep_input(test_sentences,labels=None, max_len=max_len,tokenizer = tokenizer_cam)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class vector_fusion(nn.Module):\n \n def __init__(self):\n super(vector_fusion, self).__init__()\n self.img_model = SEResnext50_32x4d(pretrained=None)\n self.img_model.load_state_dict(torch.load('../input/seresnext2048/best_model.pt'))\n self.img_model.l0=Identity()\n for params in self.img_model.parameters():\n params.requires_grad=False\n\n self.cam_model= vec_output_CamembertForSequenceClassification.from_pretrained(\n 'camembert-base', # Use the 12-layer BERT model, with an uncased vocab.\n num_labels = len(Preprocess.dict_code_to_id), # The number of output labels--2 for binary classification.\n # You can increase this for multi-class tasks. 
\n output_attentions = False, # Whether the model returns attentions weights.\n output_hidden_states = False,) # Whether the model returns all hidden-states.\n \n \n cam_model_path = '../input/camembert-vec-256m768-10ep/best_model.pt'\n checkpoint = torch.load(cam_model_path)\n # model = checkpoint['model']\n self.cam_model.load_state_dict(checkpoint)\n for param in self.cam_model.parameters():\n param.requires_grad=False\n self.cam_model.out_proj=Identity()\n \n self.flau_model=vec_output_FlaubertForSequenceClassification.from_pretrained(\n 'flaubert/flaubert_base_cased', \n num_labels = len(Preprocess.dict_code_to_id), \n output_attentions = False,\n output_hidden_states = False,)\n flau_model_path='../input/flaubert-8933/best_model.pt'\n checkpoint = torch.load(flau_model_path)\n self.flau_model.load_state_dict(checkpoint)\n for param in self.flau_model.parameters():\n param.requires_grad=False\n self.flau_model.classifier=Identity()\n \n \n #reducing the dimensionality\n self.reduce_dim=nn.Conv1d(in_channels = 2048 , out_channels = 768 , kernel_size= 1)\n \n #output\n self.out=nn.Linear(768, 27)\n \n\n \n \n def forward(self,img,input_id_cam,input_mask_cam,input_id_flau,input_mask_flau):\n \n cam_emb =self.cam_model(input_id_cam, \n token_type_ids=None, ###### bs * 768 \n attention_mask=input_mask_cam)\n \n #alignment\n #cam_emb1 = cam_emb[0]\n \n \n flau_emb =self.flau_model(input_id_flau, \n token_type_ids=None, ###### bs * 768 \n attention_mask=input_mask_flau)\n \n #alignment\n #flau_emb1 = flau_emb[0]\n \n #Projecting the image embedding to lower dimension\n img_emb=self.img_model(img)\n img_emb=img_emb.view(img_emb.shape[0],img_emb.shape[1],1) \n img_emb=self.reduce_dim(img_emb) \n img_emb=img_emb.view(img_emb.shape[0],img_emb.shape[1]) ###### bs * 768 \n \n #adding\n fuse= img_emb + cam_emb[0] + flau_emb[0]\n \n logits=self.out(fuse)\n return logits","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"model = 
vector_fusion()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"model.cuda()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"train_dataset=FusionDataset(train_df,tr_inputs_cam,tr_masks_cam,tr_inputs_flau,tr_masks_flau,\n transform=image_transforms['test'])\nval_dataset=FusionDataset(val_df,val_inputs_cam,val_masks_cam,val_inputs_flau,val_masks_flau,\n transform=image_transforms['test'])\ntest_dataset=FusionDataset(X_test,input_ids_test_cam,attention_masks_test_cam,input_ids_test_flau,attention_masks_test_flau\n ,transform=image_transforms['test'],mode = 'test')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"batch_size=64\ntrain_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)\nvalidation_dataloader=DataLoader(val_dataset,batch_size=batch_size,shuffle=False)\ntest_dataloader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# tr_inputs, val_inputs, tr_labels, val_labels = train_test_split(input_ids, labels,stratify=labels,\n# random_state=2020, test_size=val_size)\n# tr_masks, val_masks, u,v = train_test_split(attention_masks, labels,stratify=labels,\n# random_state=2020, test_size=val_size)\n\n\n# train_dataset=TensorDataset(tr_inputs, tr_masks, tr_labels)\n# val_dataset=TensorDataset(val_inputs, val_masks, val_labels)\n# train_sampler = RandomSampler(train_dataset) \n# valid_sampler = SequentialSampler(val_dataset)\n# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n\n# # The DataLoader needs to know our batch size for training, so we specify it \n# # here. 
For fine-tuning BERT on a specific task, the authors recommend a batch \n# # size of 16 or 32.\n# batch_size = 32\n\n# # Create the DataLoaders for our training and validation sets.\n# # We'll take training samples in random order. \n# train_dataloader = DataLoader(\n# train_dataset, # The training samples.\n# sampler = train_sampler, # Select batches randomly\n# batch_size = batch_size # Trains with this batch size.\n# )\n\n# # For validation the order doesn't matter, so we'll just read them sequentially.\n# validation_dataloader = DataLoader(\n# val_dataset, # The validation samples.\n# sampler = valid_sampler, # Pull out batches sequentially.\n# batch_size = batch_size # Evaluate with this batch size.\n# )","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"optimizer = AdamW(model.parameters(),\n lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n )","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def count_parameters(model):\n return sum(p.numel() for p in model.parameters() if p.requires_grad)\ncount_parameters(model)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from transformers import get_linear_schedule_with_warmup\n\n# Number of training epochs. The BERT authors recommend between 2 and 4. \n# We chose to run for 4, but we'll see later that this may be over-fitting the\n# training data.\nepochs = 6\n\n# Total number of training steps is [number of batches] x [number of epochs]. 
\n# (Note that this is not the same as the number of training samples).\ntotal_steps = len(train_dataloader) * epochs\n\n# Create the learning rate scheduler.\nscheduler = get_linear_schedule_with_warmup(optimizer, \n num_warmup_steps = 0, # Default value in run_glue.py\n num_training_steps = total_steps)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"import torch.nn as nn\nloss_criterion = nn.CrossEntropyLoss()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def flat_accuracy(preds, labels):\n pred_flat = np.argmax(preds, axis=1).flatten()\n labels_flat = labels.flatten()\n return np.sum(pred_flat == labels_flat) / len(labels_flat)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import f1_score\n\nseed_val = 42\n\nrandom.seed(seed_val)\nnp.random.seed(seed_val)\ntorch.manual_seed(seed_val)\ntorch.cuda.manual_seed_all(seed_val)\n\n# We'll store a number of quantities such as training and validation loss, \n# validation accuracy, and timings.\ntraining_stats = []\n\n# Measure the total training time for the whole run.\ntotal_t0 = time.time()\n\n# Best validation macro-F1 seen so far. Initialised once, before the epoch\n# loop, so 'best_model.pt' is only overwritten when an epoch beats every\n# earlier epoch (resetting it per epoch would save every epoch's weights).\nbest_f1 = 0\n\n\n# For each epoch...\nfor epoch_i in range(0, epochs):\n \n # ========================================\n # Training\n # ========================================\n \n # Perform one full pass over the training set.\n\n print(\"\")\n print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n print('Training...')\n \n #tr and val\n# vec_output_tr = []\n# vec_output_val =[]\n\n # Measure how long the training epoch takes.\n t0 = time.time()\n\n # Reset the total loss for this epoch.\n total_train_loss = 0\n\n # Put the model into training mode. Don't be misled--the call to \n # `train` just changes the *mode*, it doesn't *perform* the training.\n # `dropout` and `batchnorm` layers behave differently during training\n # vs. 
test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n model.train()\n\n # For each batch of training data...\n for step, batch in tqdm(enumerate(train_dataloader)):\n \n # Unpack this training batch from our dataloader. \n #\n \n # As we unpack the batch, we'll also copy each tensor to the GPU using the \n # `to` method.\n #\n # `batch` holds the six tensors returned by FusionDataset.__getitem__:\n # [0]: image \n # [1]: CamemBERT input ids, [2]: CamemBERT attention mask\n # [3]: FlauBERT input ids, [4]: FlauBERT attention mask\n # [5]: labels \n\n b_img=batch[0].to(device)\n\n b_input_id_cam = batch[1].to(device)\n b_input_mask_cam = batch[2].to(device)\n b_input_id_flau = batch[3].to(device)\n b_input_mask_flau = batch[4].to(device)\n\n b_labels = batch[5].to(device)\n \n \n model.zero_grad() \n\n \n logits = model(b_img,b_input_id_cam ,b_input_mask_cam,b_input_id_flau,b_input_mask_flau)\n \n #Defining the loss\n loss = loss_criterion(logits, b_labels)\n \n #saving the features_tr\n# vec = vec.detach().cpu().numpy()\n# vec_output_tr.extend(vec)\n \n # Accumulate the training loss over all of the batches so that we can\n # calculate the average loss at the end. 
`loss` is a Tensor containing a\n # single value; the `.item()` function just returns the Python value \n # from the tensor.\n total_train_loss += loss.item()\n\n # Perform a backward pass to calculate the gradients.\n loss.backward()\n\n # Clip the norm of the gradients to 1.0.\n # This is to help prevent the \"exploding gradients\" problem.\n torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n\n # Update parameters and take a step using the computed gradient.\n # The optimizer dictates the \"update rule\"--how the parameters are\n # modified based on their gradients, the learning rate, etc.\n optimizer.step()\n\n # Update the learning rate.\n scheduler.step()\n \n \n \n\n # Calculate the average loss over all of the batches.\n avg_train_loss = total_train_loss / len(train_dataloader) \n \n # Measure how long this epoch took.\n training_time = format_time(time.time() - t0)\n\n print(\"\")\n print(\" Average training loss: {0:.2f} \".format(avg_train_loss))\n print(\" Training epcoh took: {:} \".format(training_time))\n \n # ========================================\n # Validation\n # ========================================\n # After the completion of each training epoch, measure our performance on\n # our validation set.\n\n print(\"\")\n print(\"Running Validation...\")\n\n t0 = time.time()\n\n # Put the model in evaluation mode--the dropout layers behave differently\n # during evaluation.\n model.eval()\n\n # Tracking variables \n total_eval_accuracy = 0\n total_eval_loss = 0\n nb_eval_steps = 0\n predictions=[]\n true_labels=[]\n \n\n # Evaluate data for one epoch\n for batch in tqdm(validation_dataloader):\n \n # Unpack this training batch from our dataloader. 
\n #\n # As we unpack the batch, we'll also copy each tensor to the GPU using \n # the `to` method.\n #\n # `batch` holds the six tensors returned by FusionDataset.__getitem__:\n # [0]: image \n # [1]: CamemBERT input ids, [2]: CamemBERT attention mask\n # [3]: FlauBERT input ids, [4]: FlauBERT attention mask\n # [5]: labels \n b_img=batch[0].to(device)\n\n b_input_id_cam = batch[1].to(device)\n b_input_mask_cam = batch[2].to(device)\n b_input_id_flau = batch[3].to(device)\n b_input_mask_flau = batch[4].to(device)\n\n b_labels = batch[5].to(device)\n \n \n # Tell pytorch not to bother with constructing the compute graph during\n # the forward pass, since this is only needed for backprop (training).\n with torch.no_grad(): \n \n\n # Forward pass, calculate logit predictions.\n # `model` is the fusion model built in this notebook; it takes the image \n # together with the CamemBERT and FlauBERT input ids and attention masks\n # and returns the class logits directly. \n # (It is not a HuggingFace `BertForSequenceClassification`.)\n # Get the \"logits\" output by the model. The \"logits\" are the output\n # values prior to applying an activation function like the softmax.\n logits = model(b_img,b_input_id_cam ,b_input_mask_cam,b_input_id_flau,b_input_mask_flau)\n \n #new\n \n #defining the val loss\n loss = loss_criterion(logits, b_labels)\n \n \n # Accumulate the validation loss.\n total_eval_loss += loss.item()\n\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n\n # Collect predicted and true labels (as numpy arrays on CPU) for the F1 score\n predicted_labels=np.argmax(logits,axis=1)\n predictions.extend(predicted_labels)\n label_ids = b_labels.to('cpu').numpy()\n true_labels.extend(label_ids)\n \n #saving the features_tr\n# vec = vec.detach().cpu().numpy()\n# vec_output_val.extend(vec)\n \n\n # Calculate the accuracy for this batch of test sentences, and\n # accumulate it over all batches.\n total_eval_accuracy += flat_accuracy(logits, label_ids)\n \n\n # Report the final accuracy for this validation run.\n avg_val_accuracy = total_eval_accuracy / 
len(validation_dataloader)\n print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n\n # Calculate the average loss over all of the batches.\n avg_val_loss = total_eval_loss / len(validation_dataloader)\n \n # Measure how long the validation run took.\n validation_time = format_time(time.time() - t0)\n \n print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n print(\" Validation took: {:}\".format(validation_time))\n print(\"Validation F1-Score: {}\".format(f1_score(true_labels,predictions,average='macro')))\n curr_f1=f1_score(true_labels,predictions,average='macro')\n if curr_f1 > best_f1:\n best_f1=curr_f1\n torch.save(model.state_dict(), 'best_model.pt')\n# np.save('best_vec_train_model_train.npy',vec_output_tr)\n# np.save('best_vec_val.npy',vec_output_val)\n \n # Record all statistics from this epoch.\n# training_stats.append(\n# {\n# 'epoch': epoch_i + 1,\n# 'Training Loss': avg_train_loss,\n# 'Valid. Loss': avg_val_loss,\n# 'Valid. Accur.': avg_val_accuracy,\n# 'Training Time': training_time,\n# 'Validation Time': validation_time\n# }\n# )\n\nprint(\"\")\nprint(\"Training complete!\")\n\nprint(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"ckpt = '../input/vec-fusion-9093/best_model.pt'\nmodel.load_state_dict(torch.load(ckpt))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def predict_pyt(model, prediction_dataloader):\n \"\"\"\n model: pytorch model\n prediction_dataloader: DataLoader object for which the predictions has to be made.\n return:\n predictions:- Direct predicted labels\n softmax_logits:- logits which are normalized with softmax on output\"\"\"\n # Put model in evaluation mode\n model.eval()\n # Tracking variables \n predictions = []\n softmax_logits=[]\n # Predict \n \n for batch in tqdm(prediction_dataloader):\n \n # Add batch to GPU\n b_img=batch[0].to(device)\n 
b_input_id_cam = batch[1].to(device)\n b_input_mask_cam = batch[2].to(device)\n b_input_id_flau = batch[3].to(device)\n b_input_mask_flau = batch[4].to(device)\n \n \n # Telling the model not to compute or store gradients, saving memory and \n # speeding up prediction\n with torch.no_grad():\n # Forward pass, calculate logit predictions\n logits = model(b_img,b_input_id_cam ,b_input_mask_cam,b_input_id_flau,b_input_mask_flau)\n \n \n #find logits\n #----- Add softmax--- \n m = nn.Softmax(dim=1)\n # # input = torch.randn(2, 3)\n output = m(logits)\n #-------#------\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n predicted_labels=np.argmax(logits,axis=1)\n predictions.extend(predicted_labels)\n softmax_logits.extend(output)\n print('DONE')\n return predictions, softmax_logits\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#validation predictions\npredictions_val, softmax_logits_val = predict_pyt(model, validation_dataloader)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"softmax_logits_val = np.array([ten.detach().cpu().numpy() for ten in softmax_logits_val])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"np.save('predictions_val_add.npy',np.array(predictions_val))\nnp.save('softmax_logits_val_add.npy',softmax_logits_val)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#test_predictions\n#predictions_test, softmax_logits_test = predict_pyt(model, test_dataloader)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#softmax_logits_test = np.array([ten.detach().cpu().numpy() for ten in softmax_logits_test])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# np.save('predictions_test_9093.npy',np.array(predictions_test))\n# 
np.save('softmax_logits_test_9093.npy',softmax_logits_test)","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4}
--------------------------------------------------------------------------------