├── Boosted Late-Fusion.ipynb ├── LICENSE ├── README.md ├── SEResnext50_train_predict.ipynb ├── camembert_train_predict.ipynb ├── flaubert_train_predict.ipynb ├── multi-modal_concatenate_fusion.ipynb └── multi_modal_addition_fusion.ipynb /Boosted Late-Fusion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 8 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "from tqdm import tqdm\n", 15 | "tqdm.pandas()\n", 16 | "\n", 17 | "import os, time, datetime\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc\n", 20 | "import lightgbm as lgb\n", 21 | "import xgboost as xgb" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "def format_time(elapsed):\n", 31 | " '''\n", 32 | " Takes a time in seconds and returns a string hh:mm:ss\n", 33 | " '''\n", 34 | " # Round to the nearest second.\n", 35 | " elapsed_rounded = int(round((elapsed)))\n", 36 | " \n", 37 | " # Format as hh:mm:ss\n", 38 | " return str(datetime.timedelta(seconds=elapsed_rounded))\n", 39 | "\n", 40 | "class SigirPreprocess():\n", 41 | " \n", 42 | " def __init__(self, text_data_path):\n", 43 | " self.text_data_path = text_data_path\n", 44 | " self.train = None\n", 45 | " self.dict_code_to_id = {}\n", 46 | " self.dict_id_to_code = {}\n", 47 | " self.list_tags = {}\n", 48 | " self.sentences = []\n", 49 | " self.labels = []\n", 50 | " self.text_col = None\n", 51 | " self.X_test = None\n", 52 | " \n", 53 | " def prepare_data(self ):\n", 54 | " catalog_eng= pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n", 55 | " X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n", 56 | " Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n", 57 | " \n", 58 | " self.list_tags = list(Y_train['Prdtypecode'].unique())\n", 59 | " for i,tag in enumerate(self.list_tags):\n", 60 | " self.dict_code_to_id[tag] = i \n", 61 | " self.dict_id_to_code[i]=tag\n", 62 | " print(self.dict_code_to_id)\n", 63 | " \n", 64 | " Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n", 65 | " train=pd.merge(left=X_train,right=Y_train,\n", 66 | " how='left',left_on=['Integer_id','Image_id','Product_id'],\n", 67 | " right_on=['Integer_id','Image_id','Product_id'])\n", 68 | " prod_map=pd.Series(catalog_eng['Top level category'].values,\n", 69 | " index=catalog_eng['Prdtypecode']).to_dict()\n", 70 | "\n", 71 | " train['product'] = train['Prdtypecode'].map(prod_map)\n", 72 | " train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n", 73 | " train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n", 74 | " train['title_desc_len']=train['title_len'] + train['desc_len']\n", 75 | " train.loc[train['Description'].isnull(), 'Description'] = \" \"\n", 76 | " train['title_desc'] = train['Title'] + \" \" + train['Description']\n", 77 | " \n", 78 | " self.train = train\n", 79 | " \n", 80 | " def get_sentences(self, text_col, remove_null_rows=False):\n", 81 | " self.text_col = text_col\n", 82 | " if remove_null_rows==True:\n", 83 | 
" new_train = self.train[self.train[text_col].notnull()]\n", 84 | "\n", 85 | " else:\n", 86 | " new_train = self.train.copy()\n", 87 | " \n", 88 | " self.sentences = new_train[text_col].values\n", 89 | " self.labels = new_train['labels'].values\n", 90 | " \n", 91 | " def prepare_test(self, text_col, test_data_path, phase=1):\n", 92 | " X_test=pd.read_csv(test_data_path+f\"data/x_test_task1_phase{phase}.tsv\",sep=\"\\t\")\n", 93 | " X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n", 94 | " X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n", 95 | " self.X_test = X_test\n", 96 | " self.test_sentences = X_test[text_col].values\n", 97 | " " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "text_col = 'title_desc'\n", 107 | "val_size = 0.1\n", 108 | "random_state=2020\n", 109 | "num_class = 27\n", 110 | "do_gridsearch = False" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "kwargs = {'add_logits':['cam', 'fla']}\n", 120 | "\n", 121 | "\n", 122 | "cam_path = '/../input/camembert-vec-256m768-10ep/'\n", 123 | "flau_path = '/../input/flaubertlogits2107/' \n", 124 | "res_path = '/../input/resnextfinal/'\n", 125 | "cms_path = '/../input/crossmodal-v0/'\n", 126 | "vca_path = '/../input/vec-concat-9093/'\n", 127 | "vca_path_phase2 = '/../input/predictions-test-phase2-vec-fusion/'\n", 128 | "aem_path = '/../input/addition-ensemble-latest/'\n", 129 | "\n", 130 | "\n", 131 | "val_logits_path = {'cam':cam_path + 'validation_set_softmax_logits.npy',\n", 132 | " 'fla':flau_path + 'validation_set_softmax_logits.npy',\n", 133 | " 'res':res_path + 'Valid_resnext50_32x4d_phase1_softmax_logits.npy',\n", 134 | " 'vca':vca_path + 'softmax_logits_val_9093.npy',\n", 135 | " 'aem':aem_path + 'softmax_logits_val_add.npy'}\n", 136 | "\n", 137 | "test_logits_path_phase1 = {'cam':cam_path+f'X_test_phase1_softmax_logits.npy',\n", 138 | " 'fla':flau_path + f'X_test_phase1_softmax_logits.npy', \n", 139 | " 'res':res_path + f'Test_resnext50_32x4d_phase1_softmax_logits.npy',\n", 140 | " 'vca':vca_path + f'softmax_logits_test_9093.npy'}\n", 141 | "\n", 142 | "test_logits_path_phase2 = {'cam':cam_path+f'X_test_phase2_softmax_logits.npy',\n", 143 | " 'fla':flau_path + f'X_test_phase2_softmax_logits.npy', \n", 144 | " 'res':res_path + f'Test_resnext50_32x4d_phase2_softmax_logits.npy',\n", 145 | " 'vca':vca_path_phase2 + f'softmax_logits_test_phase2_9093.npy'}\n", 146 | " \n", 147 | "\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "## Get valdation dataset from original train dataset\n", 157 | "Preprocess = SigirPreprocess(\"/../input/textphase1/\")\n", 158 | "Preprocess.prepare_data()\n", 159 | "Preprocess.get_sentences(text_col, True)\n", 160 | "\n", 161 | "full_data = Preprocess.train\n", 162 | "labels = Preprocess.labels\n", 163 | "index = full_data.Integer_id\n", 164 | "\n", 165 | "\n", 166 | "tr_index, val_index, tr_labels, val_labels = train_test_split(index, labels,\n", 167 | " stratify=labels,\n", 168 | " random_state=random_state, \n", 169 | " test_size=val_size)\n", 170 | "\n", 171 | "train_data = full_data.loc[tr_index, :]\n", 172 | "train_data.reset_index(inplace=True, drop=True)\n", 173 | "val_data = full_data.loc[val_index, :]\n", 174 | "val_data.reset_index(inplace=True, 
drop=True)\n", 175 | "\n", 176 | "full_data.loc[val_index, 'sample'] = 'val'\n", 177 | "full_data['sample'].fillna('train', inplace=True)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "def preparelogits_df(logit_paths, df=None, val_labels=None, **kwargs):\n", 187 | " ### Prepare and combine Logits data with original validation dataset\n", 188 | " logits_dict = {}\n", 189 | " dfs_dict = {}\n", 190 | " for key, logit_path in logit_paths.items():\n", 191 | " logits_dict[key] = np.load(logit_path)\n", 192 | " \n", 193 | " dfs_dict[key] = pd.DataFrame(logits_dict[key], \n", 194 | " columns=[key + \"_\" + str(i) for i in range(1,28)])\n", 195 | " print(\"Shape of logit arrays: {}\", logits_dict[key].shape)\n", 196 | " \n", 197 | " if kwargs['add_logits']:\n", 198 | " if len(kwargs['add_logits'])>0:\n", 199 | " add_str = '_'.join(kwargs['add_logits'])\n", 200 | " logits_dict[add_str] = logits_dict[kwargs['add_logits'][0]]\n", 201 | " for k in kwargs['add_logits'][1:]:\n", 202 | " logits_dict[add_str] += logits_dict[k]\n", 203 | " logits_dict[add_str] = logits_dict[add_str]/len(kwargs['add_logits'])\n", 204 | " dfs_dict[add_str] = pd.DataFrame(logits_dict[add_str], \n", 205 | " columns=[add_str + \"_\" + str(i) for i in range(1,28)])\n", 206 | " print(\"Shape of logit arrays: {}\", logits_dict[add_str].shape)\n", 207 | "\n", 208 | "\n", 209 | " \n", 210 | " if type(val_labels) == np.ndarray:\n", 211 | " for key,logits in logits_dict.items():\n", 212 | " print(\"\"\"Validation F1 scores for {} logits: {} \"\"\".format(key, \n", 213 | " f1_score(val_labels, np.argmax(logits, axis=1), average='macro')))\n", 214 | " \n", 215 | " \n", 216 | "\n", 217 | " df = pd.concat([df] + list(dfs_dict.values()), axis=1)\n", 218 | " \n", 219 | " return df" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "val_data = preparelogits_df(val_logits_path, df=val_data, \n", 229 | " val_labels=val_labels, **kwargs)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "# Model Data Prep" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "df_log = val_data.copy()\n", 246 | "\n", 247 | "probas_cols = [\"fla_\" + str(i) for i in range(1,28)] + [\"cam_\" + str(i) for i in range(1,28)] +\\\n", 248 | "[\"res_\" + str(i) for i in range(1,28)] \\\n", 249 | "+ [\"vca_\" + str(i) for i in range(1,28)] \\\n", 250 | "\n", 251 | "X = df_log[probas_cols]\n", 252 | "y = df_log['labels'].values\n", 253 | "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=random_state)\n" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "from scipy.stats import randint as sp_randint\n", 263 | "from scipy.stats import uniform as sp_uniform\n", 264 | "\n", 265 | "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV\n", 266 | "n_HP_points_to_test = 100\n", 267 | "\n", 268 | "\n", 269 | "param_test ={'num_leaves': sp_randint(6, 50), \n", 270 | " 'min_child_samples': sp_randint(100, 500), \n", 271 | " 'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],\n", 272 | " 'subsample': sp_uniform(loc=0.2, scale=0.8), \n", 273 | " 
'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),\n", 274 | " 'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],\n", 275 | " 'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],\n", 276 | "# \"bagging_fraction\" : [0.5, 0.6, 0.7, 0.8, 0.9],\n", 277 | "# \"feature_fraction\":[0.5, 0.6, 0.7, 0.8, 0.9]\n", 278 | " }\n", 279 | "\n", 280 | "\n", 281 | "\n", 282 | "\n", 283 | "fit_params={\n", 284 | " \"early_stopping_rounds\":100, \n", 285 | " \"eval_metric\" : 'multi_logloss', \n", 286 | " \"eval_set\" : [(X_test,y_test)],\n", 287 | " 'eval_names': ['valid'],\n", 288 | " #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],\n", 289 | " 'verbose': 100,\n", 290 | " 'categorical_feature': 'auto'}\n", 291 | "\n", 292 | "\n", 293 | "clf = lgb.LGBMClassifier(num_iteration=1000, max_depth=-1, random_state=314, silent=True,\n", 294 | " metric='multi_logloss', n_jobs=4, early_stopping_rounds=100,\n", 295 | " num_class=num_class, objective= \"multiclass\")\n", 296 | "gs = RandomizedSearchCV(\n", 297 | " estimator=clf, param_distributions=param_test, \n", 298 | " n_iter=n_HP_points_to_test,\n", 299 | " cv=3,\n", 300 | " refit=True,\n", 301 | " random_state=314,\n", 302 | " verbose=True)\n", 303 | "\n", 304 | "if do_gridsearch==True:\n", 305 | " gs.fit(X_train, y_train, **fit_params)\n", 306 | " print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "# opt_parameters = gs.best_params_\n", 316 | "opt_parameters = {'colsample_bytree': 0.5284213741879101, 'min_child_samples': 125, \n", 317 | " 'min_child_weight': 10.0, 'num_leaves': 22, \n", 318 | " 'reg_alpha': 0.1, 'reg_lambda': 20, 'subsample': 0.3080033455431848} \n" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "# Model Training" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "### Run lightgbm to get weights for different class logits\n", 335 | "\n", 336 | "t0 = time.time()\n", 337 | "\n", 338 | "model_met = 'fit' #'xgb'#'train' #fit\n", 339 | "\n", 340 | "params = {\n", 341 | " \"objective\" : \"multiclass\",\n", 342 | " \"num_class\" : num_class,\n", 343 | " \"num_leaves\" : 60,\n", 344 | " \"max_depth\": -1,\n", 345 | " \"learning_rate\" : 0.01,\n", 346 | " \"bagging_fraction\" : 0.9, # subsample\n", 347 | " \"feature_fraction\" : 0.9, # colsample_bytree\n", 348 | " \"bagging_freq\" : 5, # subsample_freq\n", 349 | " \"bagging_seed\" : 2018,\n", 350 | " \"verbosity\" : -1 }\n", 351 | "\n", 352 | "lgtrain, lgval = lgb.Dataset(X_train, y_train), lgb.Dataset(X_test, y_test)\n", 353 | "\n", 354 | "if model_met == 'train':\n", 355 | " params.update(opt_parameters)\n", 356 | " params.update(fit_params)\n", 357 | " \n", 358 | " lgbmodel = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval], \n", 359 | " num_iterations = 1000, metric= 'multi_logloss')\n", 360 | " train_logits = lgbmodel.predict(X_train) \n", 361 | " test_logits = lgbmodel.predict(X_test)\n", 362 | "\n", 363 | " train_pred = np.argmax(train_logits, axis=1) \n", 364 | " test_pred = np.argmax(test_logits, axis=1) \n", 365 | "elif model_met == 'xgb':\n", 366 | " dtrain = xgb.DMatrix(X_train, label=y_train)\n", 367 | " dtrain.save_binary('xgb_train.buffer')\n", 368 | " dtest = xgb.DMatrix(X_test, label=y_test)\n", 369 | " \n", 370 
| " num_round = 200\n", 371 | " xgb_param = {'max_depth': 5, 'eta': 0.1, 'seed':2020, 'verbosity':1,\n", 372 | " 'objective': 'multi:softmax', 'num_class':num_class}\n", 373 | " xgb_param['nthread'] = 4\n", 374 | " xgb_param['eval_metric'] = 'mlogloss'\n", 375 | " evallist = [(dtest, 'eval'), (dtrain, 'train')]\n", 376 | " bst = xgb.train(xgb_param, dtrain, num_round, evallist\n", 377 | " , early_stopping_rounds=10\n", 378 | " )\n", 379 | " \n", 380 | " train_logits = bst.predict(xgb.DMatrix(X_train), ntree_limit=bst.best_ntree_limit) \n", 381 | " test_logits = bst.predict(xgb.DMatrix(X_test), ntree_limit=bst.best_ntree_limit)\n", 382 | "\n", 383 | " train_pred = train_logits \n", 384 | " test_pred = test_logits \n", 385 | " \n", 386 | "else:\n", 387 | "\n", 388 | " lgbmodel = lgb.LGBMClassifier(**clf.get_params())\n", 389 | " #set optimal parameters\n", 390 | " lgbmodel.set_params(**opt_parameters)\n", 391 | " lgbmodel.fit(X_train, y_train, **fit_params)\n", 392 | " \n", 393 | " train_logits = lgbmodel.predict(X_train) \n", 394 | " test_logits = lgbmodel.predict(X_test)\n", 395 | "\n", 396 | " train_pred = train_logits \n", 397 | " test_pred = test_logits \n", 398 | " \n", 399 | "print(\"Validation F1: {} and Training F1: {} \".format(\n", 400 | " f1_score(y_test, test_pred, average='macro'), \n", 401 | " f1_score(y_train, train_pred, average='macro')))\n", 402 | "\n", 403 | "if model_met == 'train':\n", 404 | " feat_imp = pd.DataFrame({'feature':probas_cols, \n", 405 | " 'logit_kind': [i.split('_')[0] for i in probas_cols],\n", 406 | " 'imp':lgbmodel.feature_importance()/sum(lgbmodel.feature_importance())})\n", 407 | "\n", 408 | "\n", 409 | " lgbmodel.save_model('lgb_classifier_81feats.txt', num_iteration=lgbmodel.best_iteration) \n", 410 | " print(\"\"\"Feature Importances by logits group: \n", 411 | " \"\"\", feat_imp.groupby(['logit_kind'])['imp'].sum())\n", 412 | "else:\n", 413 | " feat_imp = pd.DataFrame({'feature':probas_cols, \n", 414 | " 'logit_kind': [i.split('_')[0] for i in probas_cols],\n", 415 | " 'imp':lgbmodel.feature_importances_/sum(lgbmodel.feature_importances_)})\n", 416 | "\n", 417 | " print(\"\"\"Feature Importances by logits group: \n", 418 | " \"\"\", feat_imp.groupby(['logit_kind'])['imp'].sum())\n", 419 | " \n", 420 | "import shap\n", 421 | "explainer = shap.TreeExplainer(lgbmodel)\n", 422 | "shap_values = explainer.shap_values(X)\n", 423 | "print(\"Time Elapsed: {:}.\".format(format_time(time.time() - t0)))" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "for n, path in enumerate(['/kaggle/input/textphase1/', \n", 433 | " '/kaggle/input/testphase2/']):\n", 434 | " phase = n+1\n", 435 | " if phase==1:\n", 436 | " test_logits_path = test_logits_path_phase1\n", 437 | " else:\n", 438 | " test_logits_path = test_logits_path_phase2\n", 439 | " Preprocess.prepare_test(text_col, path, phase)\n", 440 | " X_test_phase1= Preprocess.X_test\n", 441 | "\n", 442 | " test_phase1 = preparelogits_df(test_logits_path,\n", 443 | " df=X_test_phase1, val_labels=None, **kwargs)\n", 444 | " \n", 445 | " phase1_logits = lgbmodel.predict(test_phase1[probas_cols].values) \n", 446 | " if model_met == 'train':\n", 447 | " predictions = np.argmax(phase1_logits, axis=1) \n", 448 | " elif model_met == 'xgb':\n", 449 | " phase1_logits = bst.predict(xgb.DMatrix(test_phase1[probas_cols]), \n", 450 | " ntree_limit=bst.best_ntree_limit) \n", 451 | " predictions = phase1_logits\n", 452 | " 
else:\n", 453 | " predictions = phase1_logits\n", 454 | " X_test_phase1['prediction_model']= predictions\n", 455 | " X_test_phase1['Prdtypecode']=X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)\n", 456 | " print(X_test_phase1['Prdtypecode'].value_counts())\n", 457 | " X_test_phase1=X_test_phase1.drop(['prediction_model','Title','Description'],axis=1)\n", 458 | " X_test_phase1.to_csv(f'y_test_task1_phase{phase}_pred_.tsv',sep='\\t',index=False)" 459 | ] 460 | } 461 | ], 462 | "metadata": { 463 | "kernelspec": { 464 | "display_name": "Python 3", 465 | "language": "python", 466 | "name": "python3" 467 | }, 468 | "language_info": { 469 | "codemirror_mode": { 470 | "name": "ipython", 471 | "version": 3 472 | }, 473 | "file_extension": ".py", 474 | "mimetype": "text/x-python", 475 | "name": "python", 476 | "nbconvert_exporter": "python", 477 | "pygments_lexer": "ipython3", 478 | "version": "3.7.7" 479 | }, 480 | "toc": { 481 | "base_numbering": 1, 482 | "nav_menu": {}, 483 | "number_sections": true, 484 | "sideBar": true, 485 | "skip_h1_title": false, 486 | "title_cell": "Table of Contents", 487 | "title_sidebar": "Contents", 488 | "toc_cell": false, 489 | "toc_position": {}, 490 | "toc_section_display": true, 491 | "toc_window_display": false 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 4 496 | } 497 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 depshad 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Framework for Multi-modal Product Classification 2 | Code repository for Rakuten Data Challenge : Multimodal Product Classification and Retrieval. 3 | 4 | Team Transformer's solution : Deep Multi-level Boosted Fusion Learning Framework for Multi-modal Product Classification 5 | 6 | Paper Link : https://sigir-ecom.github.io/ecom20DCPapers/SIGIR_eCom20_DC_paper_8.pdf 7 | 8 | 9 | Data challenge link : https://sigir-ecom.github.io/data-task.html 10 | 11 | ## Abstract 12 | 13 | In this paper, we present our approach for the ’Multimodal Product 14 | Classification’ task as a part of the 2020 SIGIR Workshop On eCommerce (ECOM20). 
The specific objective of this task is to build and
15 | submit systems that classify previously unseen products into their
16 | corresponding product type codes. We propose a deep Multi-Modal
17 | Multi-level Boosted Fusion Learning Framework used to categorize
18 | large-scale multi-modal (text and image) product data into product
19 | type codes. Our proposed final methodology achieved a macro F1-
20 | score of 91.94 on the phase 1 test dataset, which was the top-scoring
21 | submission, and took third position on the leaderboard for the phase 2 test
22 | dataset with a macro F1-score of 90.53.
23 | 
24 | ## Code Usage
25 | 
26 | ### Unimodal Model Training and Prediction Scripts
27 | 
28 | 1. SEResnext50_train_predict.ipynb : Fine-tune the pre-trained SE-ResNeXt50 model on Rakuten product images
29 | 
30 | 2. camembert_train_predict.ipynb : Fine-tune the pre-trained CamemBERT model on the French text; includes a custom CamemBERT model with vector output (used later for feature fusion)
31 | 
32 | 3. flaubert_train_predict.ipynb : Fine-tune the pre-trained FlauBERT model on the French text; includes a custom FlauBERT model with vector output (used later for feature fusion)
33 | 
34 | ### Multimodal Feature Level Fusion
35 | 1. multi-modal_concatenate_fusion.ipynb : Concatenate the extracted text and image features and train an NN module on top (see the sketch under the architecture diagrams below)
36 | 
37 | ### Probability Level Fusion
38 | 1. Boosted Late-Fusion.ipynb : Train a LightGBM model that takes the unimodal class probabilities as input (see the sketch below)
39 | 
40 | 
41 | 
42 | 
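To make the probability-level fusion concrete, here is a minimal sketch of the idea behind Boosted Late-Fusion.ipynb. It is an illustration rather than the full notebook: the `.npy` file names below are hypothetical placeholders for the softmax-probability arrays the unimodal notebooks export, and the real notebook additionally stacks the fused vector-model probabilities and tunes the LightGBM meta-learner with RandomizedSearchCV.

```python
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# One softmax-probability matrix of shape (n_samples, 27) per unimodal model,
# saved by the corresponding notebook (these file names are placeholders).
cam = np.load('camembert_val_softmax_logits.npy')   # CamemBERT text model
fla = np.load('flaubert_val_softmax_logits.npy')    # FlauBERT text model
res = np.load('resnext_val_softmax_logits.npy')     # SE-ResNeXt50 image model
y = np.load('val_labels.npy')                       # integer labels in [0, 26]

# Stack the per-class probabilities side by side: 3 models x 27 classes = 81 meta-features.
X = np.hstack([cam, fla, res])

X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y,
                                          test_size=0.2, random_state=2020)

# LightGBM acts as the meta-learner that re-weights each model's class probabilities.
clf = lgb.LGBMClassifier(objective='multiclass', n_estimators=500, random_state=2020)
clf.fit(X_tr, y_tr)

print('late-fusion macro F1:', f1_score(y_te, clf.predict(X_te), average='macro'))
```

Stacking the three 27-way probability vectors gives 81 meta-features per product, and the boosted trees learn, per class, how much to trust each modality.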
43 | 
44 | ### Multi-modal Joint Representation Learning
45 | 
46 | [architecture diagram]
47 | 
48 | 
49 | 
50 | ### Late Fusion Model
51 | 
52 | [architecture diagram]
53 | 
54 | 
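To read the two architecture diagrams above, here is a minimal PyTorch sketch of the feature-level (concatenation) fusion, assuming the 768-d text vectors produced by the custom CamemBERT/FlauBERT heads and the 2048-d SE-ResNeXt50 image vector referenced in the notebooks; the hidden width, activation, and dropout here are illustrative choices, not the repository's exact head.

```python
import torch
import torch.nn as nn

class ConcatFusionHead(nn.Module):
    """Joint classifier over concatenated unimodal feature vectors (illustrative sizes)."""

    def __init__(self, text_dim=768, img_dim=2048, hidden=512, num_class=27):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(text_dim + img_dim, hidden),  # fuse 768-d text + 2048-d image
            nn.Tanh(),
            nn.Dropout(0.1),
            nn.Linear(hidden, num_class),           # 27 product type codes
        )

    def forward(self, text_vec, img_vec):
        # Feature-level fusion: concatenate the unimodal vectors, then classify jointly.
        return self.classifier(torch.cat([text_vec, img_vec], dim=1))

# e.g. ConcatFusionHead()(torch.randn(8, 768), torch.randn(8, 2048)).shape -> (8, 27)
```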
55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /camembert_train_predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 8 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import os, time, datetime\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "from tqdm import tqdm\n", 16 | "import random\n", 17 | "import logging\n", 18 | "tqdm.pandas()\n", 19 | "import seaborn as sns\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "\n", 22 | "#NN Packages\n", 23 | "import torch\n", 24 | "import torch.nn as nn\n", 25 | "from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler\n", 26 | "\n", 27 | "logger = logging.getLogger(__name__)\n", 28 | "\n", 29 | "\n", 30 | "if torch.cuda.is_available(): \n", 31 | "\n", 32 | " # Tell PyTorch to use the GPU. \n", 33 | " device = torch.device(\"cuda\")\n", 34 | "\n", 35 | " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", 36 | "\n", 37 | " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", 38 | "\n", 39 | "# If not...\n", 40 | "else:\n", 41 | " print('No GPU available, using the CPU instead.')\n", 42 | " device = torch.device(\"cpu\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def format_time(elapsed):\n", 52 | " '''\n", 53 | " Takes a time in seconds and returns a string hh:mm:ss\n", 54 | " '''\n", 55 | " # Round to the nearest second.\n", 56 | " elapsed_rounded = int(round((elapsed)))\n", 57 | " \n", 58 | " # Format as hh:mm:ss\n", 59 | " return str(datetime.timedelta(seconds=elapsed_rounded))\n", 60 | "\n", 61 | "class SigirPreprocess():\n", 62 | " \n", 63 | " def __init__(self, text_data_path):\n", 64 | " self.text_data_path = text_data_path\n", 65 | " self.train = None\n", 66 | " self.dict_code_to_id = {}\n", 67 | " self.dict_id_to_code = {}\n", 68 | " self.list_tags = {}\n", 69 | " self.sentences = []\n", 70 | " self.labels = []\n", 71 | " self.text_col = None\n", 72 | " self.X_test = None\n", 73 | " def prepare_data(self ):\n", 74 | " catalog_eng= pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n", 75 | " X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n", 76 | " Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n", 77 | " \n", 78 | " self.list_tags = list(Y_train['Prdtypecode'].unique())\n", 79 | " for i,tag in enumerate(self.list_tags):\n", 80 | " self.dict_code_to_id[tag] = i \n", 81 | " self.dict_id_to_code[i]=tag\n", 82 | " print(self.dict_code_to_id)\n", 83 | " \n", 84 | " Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n", 85 | " train=pd.merge(left=X_train,right=Y_train,\n", 86 | " how='left',left_on=['Integer_id','Image_id','Product_id'],\n", 87 | " right_on=['Integer_id','Image_id','Product_id'])\n", 88 | " prod_map=pd.Series(catalog_eng['Top level category'].values,\n", 89 | " index=catalog_eng['Prdtypecode']).to_dict()\n", 90 | "\n", 91 | " train['product'] = train['Prdtypecode'].map(prod_map)\n", 92 | " train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n", 93 | " 
train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n", 94 | " train['title_desc_len']=train['title_len'] + train['desc_len']\n", 95 | " train.loc[train['Description'].isnull(), 'Description'] = \" \"\n", 96 | " train['title_desc'] = train['Title'] + \" \" + train['Description']\n", 97 | " \n", 98 | " self.train = train\n", 99 | " \n", 100 | " def get_sentences(self, text_col, remove_null_rows=False):\n", 101 | " self.text_col = text_col\n", 102 | " if remove_null_rows==True:\n", 103 | " new_train = self.train[self.train[text_col].notnull()]\n", 104 | "\n", 105 | " else:\n", 106 | " new_train = self.train.copy()\n", 107 | " \n", 108 | " self.sentences = new_train[text_col].values\n", 109 | " self.labels = new_train['labels'].values\n", 110 | " \n", 111 | " def prepare_test(self, text_col):\n", 112 | " X_test=pd.read_csv(self.text_data_path+\"data/x_test_task1_phase1.tsv\",sep=\"\\t\")\n", 113 | " X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n", 114 | " X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n", 115 | " self.X_test = X_test\n", 116 | " self.test_sentences = X_test[text_col].values\n", 117 | " " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "text_col = 'title_desc'\n", 127 | "max_len = 256\n", 128 | "val_size = 0.1\n", 129 | "\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "Preprocess = SigirPreprocess(\"/kaggle/input/textphase1/\")\n", 139 | "Preprocess.prepare_data()\n", 140 | "Preprocess.get_sentences(text_col, True)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "sentences = Preprocess.sentences\n", 150 | "labels = Preprocess.labels\n", 151 | "print(\"Total number of sentences:{}, labels:{}\".format(len(sentences), len(labels)))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from transformers import CamembertConfig, CamembertTokenizer, CamembertModel, CamembertForSequenceClassification, AdamW\n", 161 | "from transformers.modeling_roberta import RobertaClassificationHead\n", 162 | "print('Using Camembert')\n", 163 | "modelname = 'camembert-base'\n", 164 | "tokenizer = CamembertTokenizer.from_pretrained(modelname, do_lowercase=False)\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "#function to prepare input for model training\n", 174 | "def prep_input(sentences,labels, max_len):\n", 175 | " input_ids = []\n", 176 | " attention_masks = []\n", 177 | "\n", 178 | " # For every sentence...\n", 179 | " for sent in tqdm(sentences):\n", 180 | " # `encode_plus` will:\n", 181 | " # (1) Tokenize the sentence.\n", 182 | " # (2) Prepend the `[CLS]` token to the start.\n", 183 | " # (3) Append the `[SEP]` token to the end.\n", 184 | " # (4) Map tokens to their IDs.\n", 185 | " # (5) Pad or truncate the sentence to `max_length`\n", 186 | " # (6) Create attention masks for [PAD] tokens.\n", 187 | " encoded_dict = tokenizer.encode_plus(\n", 188 | " sent, # Sentence to encode.\n", 189 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", 190 | " max_length = max_len, # Pad & truncate 
all sentences.\n", 191 | " pad_to_max_length = True,\n", 192 | " return_attention_mask = True, # Construct attn. masks.\n", 193 | " return_tensors = 'pt', # Return pytorch tensors.\n", 194 | " )\n", 195 | "\n", 196 | " # Add the encoded sentence to the list. \n", 197 | " input_ids.append(encoded_dict['input_ids'])\n", 198 | "\n", 199 | " # And its attention mask (simply differentiates padding from non-padding).\n", 200 | " attention_masks.append(encoded_dict['attention_mask'])\n", 201 | "\n", 202 | " # Convert the lists into tensors.\n", 203 | " input_ids = torch.cat(input_ids, dim=0)\n", 204 | " attention_masks = torch.cat(attention_masks, dim=0)\n", 205 | " if labels is not None:\n", 206 | " labels = torch.tensor(labels)\n", 207 | " return input_ids,attention_masks,labels\n", 208 | " else:\n", 209 | " return input_ids,attention_masks\n", 210 | " " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "input_ids,attention_masks,labels=prep_input(sentences,labels, max_len=max_len)\n", 220 | "print('Original: ', sentences[0])\n", 221 | "print('Token IDs:', input_ids[0]) " 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Camembert Model with Vector Output" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# class RobertaClassificationHead(nn.Module):\n", 238 | "# \"\"\"Head for sentence-level classification tasks.\"\"\"\n", 239 | "\n", 240 | "# def __init__(self, config):\n", 241 | "# super().__init__()\n", 242 | "# self.dense = nn.Linear(config.hidden_size, config.hidden_size)\n", 243 | "# self.dropout = nn.Dropout(config.hidden_dropout_prob)\n", 244 | "# self.out_proj = nn.Linear(config.hidden_size, config.num_labels)\n", 245 | "\n", 246 | "# def forward(self, features, **kwargs):\n", 247 | "# x = features[:, 0, :] # take token (equiv. 
to [CLS])\n", 248 | "# x = self.dropout(x)\n", 249 | "# x = self.dense(x)\n", 250 | "# x = torch.tanh(x)\n", 251 | "# feat = self.dropout(x)\n", 252 | "# x = self.out_proj(feat)\n", 253 | "# return x,feat" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "class vec_output_CamembertForSequenceClassification(CamembertModel):\n", 263 | " config_class = CamembertConfig\n", 264 | "\n", 265 | " def __init__(self, config):\n", 266 | " super().__init__(config)\n", 267 | " self.num_labels = config.num_labels\n", 268 | "\n", 269 | " self.roberta = CamembertModel(config)\n", 270 | " self.dense = nn.Linear(256*config.hidden_size, config.hidden_size)\n", 271 | " self.dropout = nn.Dropout(0.1)\n", 272 | " self.out_proj = nn.Linear(config.hidden_size, config.num_labels)\n", 273 | " self.init_weights()\n", 274 | "\n", 275 | "\n", 276 | " def forward(\n", 277 | " self,\n", 278 | " input_ids=None,\n", 279 | " attention_mask=None,\n", 280 | " token_type_ids=None,\n", 281 | " position_ids=None,\n", 282 | " head_mask=None,\n", 283 | " inputs_embeds=None,\n", 284 | " labels=None,\n", 285 | " output_attentions=None,\n", 286 | " output_hidden_states=None,\n", 287 | " ):\n", 288 | " outputs = self.roberta(\n", 289 | " input_ids,\n", 290 | " attention_mask=attention_mask,\n", 291 | " token_type_ids=token_type_ids,\n", 292 | " position_ids=position_ids,\n", 293 | " head_mask=head_mask,\n", 294 | " inputs_embeds=inputs_embeds,\n", 295 | "# output_attentions=output_attentions,\n", 296 | "# output_hidden_states=output_hidden_states,\n", 297 | " )\n", 298 | " sequence_output = outputs[0] #(B,256,768)\n", 299 | " x = sequence_output.view(sequence_output.shape[0], 256*768)\n", 300 | "# x = sequence_output[:, 0, :] # take token (equiv. to [CLS])-> #(B,768) Image -> (B,2048)\n", 301 | " x = self.dense(x) # 768 -> 768\n", 302 | " feat= torch.tanh(x) \n", 303 | " logits = self.out_proj(feat) # 768 -> 27\n", 304 | " outputs = (logits,) + outputs[2:]\n", 305 | "\n", 306 | " return outputs,feat # (loss), logits, (hidden_states), (attentions)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "\n", 316 | "model = vec_output_CamembertForSequenceClassification.from_pretrained(\n", 317 | " modelname, # Use the 12-layer BERT model, with an uncased vocab.\n", 318 | " num_labels = len(Preprocess.dict_code_to_id), # The number of output labels--2 for binary classification.\n", 319 | " # You can increase this for multi-class tasks. 
\n", 320 | " output_attentions = False, # Whether the model returns attentions weights.\n", 321 | " output_hidden_states = False, # Whether the model returns all hidden-states.\n", 322 | ")\n", 323 | "model.cuda()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "tr_inputs, val_inputs, tr_labels, val_labels = train_test_split(input_ids, labels,stratify=labels,\n", 333 | " random_state=2020, test_size=val_size)\n", 334 | "tr_masks, val_masks, u,v = train_test_split(attention_masks, labels,stratify=labels,\n", 335 | " random_state=2020, test_size=val_size)\n", 336 | "\n", 337 | "\n", 338 | "train_dataset=TensorDataset(tr_inputs, tr_masks, tr_labels)\n", 339 | "val_dataset=TensorDataset(val_inputs, val_masks, val_labels)\n", 340 | "train_sampler = RandomSampler(train_dataset) \n", 341 | "valid_sampler = SequentialSampler(val_dataset)\n", 342 | "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n", 343 | "\n", 344 | "# The DataLoader needs to know our batch size for training, so we specify it \n", 345 | "# here. For fine-tuning BERT on a specific task, the authors recommend a batch \n", 346 | "# size of 16 or 32.\n", 347 | "batch_size = 32\n", 348 | "\n", 349 | "# Create the DataLoaders for our training and validation sets.\n", 350 | "# We'll take training samples in random order. \n", 351 | "train_dataloader = DataLoader(\n", 352 | " train_dataset, # The training samples.\n", 353 | " sampler = train_sampler, # Select batches randomly\n", 354 | " batch_size = batch_size # Trains with this batch size.\n", 355 | " )\n", 356 | "\n", 357 | "# For validation the order doesn't matter, so we'll just read them sequentially.\n", 358 | "validation_dataloader = DataLoader(\n", 359 | " val_dataset, # The validation samples.\n", 360 | " sampler = valid_sampler, # Pull out batches sequentially.\n", 361 | " batch_size = batch_size # Evaluate with this batch size.\n", 362 | " )" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "optimizer = AdamW(model.parameters(),\n", 372 | " lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n", 373 | " eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n", 374 | " )\n" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "from transformers import get_linear_schedule_with_warmup\n", 384 | "\n", 385 | "# Number of training epochs. The BERT authors recommend between 2 and 4. \n", 386 | "# We chose to run for 4, but we'll see later that this may be over-fitting the\n", 387 | "# training data.\n", 388 | "epochs = 10\n", 389 | "\n", 390 | "# Total number of training steps is [number of batches] x [number of epochs]. 
\n", 391 | "# (Note that this is not the same as the number of training samples).\n", 392 | "total_steps = len(train_dataloader) * epochs\n", 393 | "\n", 394 | "# Create the learning rate scheduler.\n", 395 | "scheduler = get_linear_schedule_with_warmup(optimizer, \n", 396 | " num_warmup_steps = 0, # Default value in run_glue.py\n", 397 | " num_training_steps = total_steps)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "# Function to calculate the accuracy of our predictions vs labels\n", 407 | "def flat_accuracy(preds, labels):\n", 408 | " pred_flat = np.argmax(preds, axis=1).flatten()\n", 409 | " labels_flat = labels.flatten()\n", 410 | " return np.sum(pred_flat == labels_flat) / len(labels_flat)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "import torch.nn as nn\n", 420 | "loss_criterion = nn.CrossEntropyLoss()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "from sklearn.metrics import f1_score\n", 430 | "# This training code is based on the `run_glue.py` script here:\n", 431 | "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", 432 | "\n", 433 | "# Set the seed value all over the place to make this reproducible.\n", 434 | "seed_val = 42\n", 435 | "\n", 436 | "random.seed(seed_val)\n", 437 | "np.random.seed(seed_val)\n", 438 | "torch.manual_seed(seed_val)\n", 439 | "torch.cuda.manual_seed_all(seed_val)\n", 440 | "\n", 441 | "# We'll store a number of quantities such as training and validation loss, \n", 442 | "# validation accuracy, and timings.\n", 443 | "training_stats = []\n", 444 | "\n", 445 | "# Measure the total training time for the whole run.\n", 446 | "total_t0 = time.time()\n", 447 | "\n", 448 | "\n", 449 | "# For each epoch...\n", 450 | "for epoch_i in range(0, epochs):\n", 451 | " \n", 452 | " # ========================================\n", 453 | " # Training\n", 454 | " # ========================================\n", 455 | " \n", 456 | " # Perform one full pass over the training set.\n", 457 | "\n", 458 | " print(\"\")\n", 459 | " print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", 460 | " print('Training...')\n", 461 | " \n", 462 | " #tr and val\n", 463 | " vec_output_tr = []\n", 464 | " vec_output_val =[]\n", 465 | "\n", 466 | " # Measure how long the training epoch takes.\n", 467 | " t0 = time.time()\n", 468 | "\n", 469 | " # Reset the total loss for this epoch.\n", 470 | " total_train_loss = 0\n", 471 | "\n", 472 | " # Put the model into training mode. Don't be mislead--the call to \n", 473 | " # `train` just changes the *mode*, it doesn't *perform* the training.\n", 474 | " # `dropout` and `batchnorm` layers behave differently during training\n", 475 | " # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n", 476 | " best_f1 = 0\n", 477 | " model.train()\n", 478 | "\n", 479 | " # For each batch of training data...\n", 480 | " for step, batch in tqdm(enumerate(train_dataloader)):\n", 481 | " \n", 482 | " # Unpack this training batch from our dataloader. 
\n", 483 | " #\n", 484 | " \n", 485 | " # As we unpack the batch, we'll also copy each tensor to the GPU using the \n", 486 | " # `to` method.\n", 487 | " #\n", 488 | " # `batch` contains three pytorch tensors:\n", 489 | " # [0]: input ids \n", 490 | " # [1]: attention masks\n", 491 | " # [2]: labels \n", 492 | " b_input_ids = batch[0].to(device)\n", 493 | " b_input_mask = batch[1].to(device)\n", 494 | " b_labels = batch[2].to(device)\n", 495 | "\n", 496 | " \n", 497 | " model.zero_grad() \n", 498 | "\n", 499 | " \n", 500 | " logits,vec = model(b_input_ids, \n", 501 | " token_type_ids=None, \n", 502 | " attention_mask=b_input_mask\n", 503 | " )\n", 504 | " #new\n", 505 | " logits = logits[0]\n", 506 | " \n", 507 | " #Defining the loss\n", 508 | " loss = loss_criterion(logits, b_labels)\n", 509 | " \n", 510 | " #saving the features_tr\n", 511 | " vec = vec.detach().cpu().numpy()\n", 512 | " vec_output_tr.extend(vec)\n", 513 | " \n", 514 | " # Accumulate the training loss over all of the batches so that we can\n", 515 | " # calculate the average loss at the end. `loss` is a Tensor containing a\n", 516 | " # single value; the `.item()` function just returns the Python value \n", 517 | " # from the tensor.\n", 518 | " total_train_loss += loss.item()\n", 519 | "\n", 520 | " # Perform a backward pass to calculate the gradients.\n", 521 | " loss.backward()\n", 522 | "\n", 523 | " # Clip the norm of the gradients to 1.0.\n", 524 | " # This is to help prevent the \"exploding gradients\" problem.\n", 525 | " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", 526 | "\n", 527 | " # Update parameters and take a step using the computed gradient.\n", 528 | " # The optimizer dictates the \"update rule\"--how the parameters are\n", 529 | " # modified based on their gradients, the learning rate, etc.\n", 530 | " optimizer.step()\n", 531 | "\n", 532 | " # Update the learning rate.\n", 533 | " scheduler.step()\n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | "\n", 538 | " # Calculate the average loss over all of the batches.\n", 539 | " avg_train_loss = total_train_loss / len(train_dataloader) \n", 540 | " \n", 541 | " # Measure how long this epoch took.\n", 542 | " training_time = format_time(time.time() - t0)\n", 543 | "\n", 544 | " print(\"\")\n", 545 | " print(\" Average training loss: {0:.2f} \".format(avg_train_loss))\n", 546 | " print(\" Training epcoh took: {:} \".format(training_time))\n", 547 | " \n", 548 | " # ========================================\n", 549 | " # Validation\n", 550 | " # ========================================\n", 551 | " # After the completion of each training epoch, measure our performance on\n", 552 | " # our validation set.\n", 553 | "\n", 554 | " print(\"\")\n", 555 | " print(\"Running Validation...\")\n", 556 | "\n", 557 | " t0 = time.time()\n", 558 | "\n", 559 | " # Put the model in evaluation mode--the dropout layers behave differently\n", 560 | " # during evaluation.\n", 561 | " model.eval()\n", 562 | "\n", 563 | " # Tracking variables \n", 564 | " total_eval_accuracy = 0\n", 565 | " total_eval_loss = 0\n", 566 | " nb_eval_steps = 0\n", 567 | " predictions=[]\n", 568 | " true_labels=[]\n", 569 | " \n", 570 | "\n", 571 | " # Evaluate data for one epoch\n", 572 | " for batch in tqdm(validation_dataloader):\n", 573 | " \n", 574 | " # Unpack this training batch from our dataloader. 
\n", 575 | " #\n", 576 | " # As we unpack the batch, we'll also copy each tensor to the GPU using \n", 577 | " # the `to` method.\n", 578 | " #\n", 579 | " # `batch` contains three pytorch tensors:\n", 580 | " # [0]: input ids \n", 581 | " # [1]: attention masks\n", 582 | " # [2]: labels \n", 583 | " b_input_ids = batch[0].to(device)\n", 584 | " b_input_mask = batch[1].to(device)\n", 585 | " b_labels = batch[2].to(device)\n", 586 | " \n", 587 | " # Tell pytorch not to bother with constructing the compute graph during\n", 588 | " # the forward pass, since this is only needed for backprop (training).\n", 589 | " with torch.no_grad(): \n", 590 | "\n", 591 | " # Forward pass, calculate logit predictions.\n", 592 | " # token_type_ids is the same as the \"segment ids\", which \n", 593 | " # differentiates sentence 1 and 2 in 2-sentence tasks.\n", 594 | " # The documentation for this `model` function is here: \n", 595 | " # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n", 596 | " # Get the \"logits\" output by the model. The \"logits\" are the output\n", 597 | " # values prior to applying an activation function like the softmax.\n", 598 | " logits,vec = model(b_input_ids, \n", 599 | " token_type_ids=None, \n", 600 | " attention_mask=b_input_mask\n", 601 | " )\n", 602 | " \n", 603 | " #new\n", 604 | " logits = logits[0]\n", 605 | " \n", 606 | " #defining the val loss\n", 607 | " loss = loss_criterion(logits, b_labels)\n", 608 | " \n", 609 | " \n", 610 | " # Accumulate the validation loss.\n", 611 | " total_eval_loss += loss.item()\n", 612 | "\n", 613 | " # Move logits and labels to CPU\n", 614 | " logits = logits.detach().cpu().numpy()\n", 615 | "\n", 616 | " # Move logits and labels to CPU\n", 617 | " predicted_labels=np.argmax(logits,axis=1)\n", 618 | " predictions.extend(predicted_labels)\n", 619 | " label_ids = b_labels.to('cpu').numpy()\n", 620 | " true_labels.extend(label_ids)\n", 621 | " \n", 622 | " #saving the features_tr\n", 623 | " vec = vec.detach().cpu().numpy()\n", 624 | " vec_output_val.extend(vec)\n", 625 | " \n", 626 | "\n", 627 | " # Calculate the accuracy for this batch of test sentences, and\n", 628 | " # accumulate it over all batches.\n", 629 | " total_eval_accuracy += flat_accuracy(logits, label_ids)\n", 630 | " \n", 631 | "\n", 632 | " # Report the final accuracy for this validation run.\n", 633 | " avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n", 634 | " print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n", 635 | "\n", 636 | " # Calculate the average loss over all of the batches.\n", 637 | " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", 638 | " \n", 639 | " # Measure how long the validation run took.\n", 640 | " validation_time = format_time(time.time() - t0)\n", 641 | " \n", 642 | " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", 643 | " print(\" Validation took: {:}\".format(validation_time))\n", 644 | " print(\"Validation F1-Score: {}\".format(f1_score(true_labels,predictions,average='macro')))\n", 645 | " curr_f1=f1_score(true_labels,predictions,average='macro')\n", 646 | " if curr_f1 > best_f1:\n", 647 | " best_f1=curr_f1\n", 648 | " torch.save(model.state_dict(), 'best_model.pt')\n", 649 | " np.save('best_vec_train_model_train.npy',vec_output_tr)\n", 650 | " np.save('best_vec_val.npy',vec_output_val)\n", 651 | " \n", 652 | " # Record all statistics from this epoch.\n", 653 | "# training_stats.append(\n", 654 | "# {\n", 655 | "# 'epoch': epoch_i + 
1,\n", 656 | "# 'Training Loss': avg_train_loss,\n", 657 | "# 'Valid. Loss': avg_val_loss,\n", 658 | "# 'Valid. Accur.': avg_val_accuracy,\n", 659 | "# 'Training Time': training_time,\n", 660 | "# 'Validation Time': validation_time\n", 661 | "# }\n", 662 | "# )\n", 663 | "\n", 664 | "print(\"\")\n", 665 | "print(\"Training complete!\")\n", 666 | "\n", 667 | "print(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))\n" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "## Predictions" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "model_path = '/kaggle/working/best_model.pt'\n", 684 | "checkpoint = torch.load(model_path)\n", 685 | "# model = checkpoint['model']\n", 686 | "model.load_state_dict(checkpoint)" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "def predict_pyt(model, prediction_dataloader):\n", 696 | " \"\"\"\n", 697 | " model: pytorch model\n", 698 | " prediction_dataloader: DataLoader object for which the predictions has to be made.\n", 699 | " return:\n", 700 | " predictions:- Direct predicted labels\n", 701 | " softmax_logits:- logits which are normalized with softmax on output\"\"\"\n", 702 | " \n", 703 | " # Put model in evaluation mode\n", 704 | " model.eval()\n", 705 | "\n", 706 | " # Tracking variables \n", 707 | " predictions = []\n", 708 | " softmax_logits=[]\n", 709 | " vec_outputs = []\n", 710 | " \n", 711 | " # Predict \n", 712 | " for batch in tqdm(prediction_dataloader):\n", 713 | " \n", 714 | " # Add batch to GPU\n", 715 | " batch = tuple(t.to(device) for t in batch)\n", 716 | " # Unpack the inputs from our dataloader\n", 717 | " try:\n", 718 | " b_input_ids, b_input_mask = batch\n", 719 | " except ValueError:\n", 720 | " b_input_ids, b_input_mask, _ = batch\n", 721 | " # Telling the model not to compute or store gradients, saving memory and \n", 722 | " # speeding up prediction\n", 723 | " with torch.no_grad():\n", 724 | " # Forward pass, calculate logit predictions\n", 725 | " logits,vec = model(b_input_ids, token_type_ids=None, \n", 726 | " attention_mask=b_input_mask)\n", 727 | " \n", 728 | " logits = logits[0]\n", 729 | "\n", 730 | " \n", 731 | " #----- Add softmax--- \n", 732 | " m = nn.Softmax(dim=1)\n", 733 | " # # input = torch.randn(2, 3)\n", 734 | " output = m(logits)\n", 735 | " #-------#------\n", 736 | " \n", 737 | " # Move logits and labels to CPU\n", 738 | " logits = logits.detach().cpu().numpy()\n", 739 | " predicted_labels=np.argmax(logits,axis=1)\n", 740 | " predictions.extend(predicted_labels)\n", 741 | " softmax_logits.extend(output)\n", 742 | " \n", 743 | " #vec_outputs saving\n", 744 | " vec = vec.detach().cpu().numpy()\n", 745 | " vec_outputs.extend(vec)\n", 746 | "\n", 747 | " print('DONE')\n", 748 | " return predictions, softmax_logits , vec_outputs\n", 749 | "\n", 750 | "def predict_wrapper(model, sentences, max_len=max_len, batch_size = batch_size ):\n", 751 | " \"\"\"\n", 752 | " Wrapper to create DataLoader object and predict, \n", 753 | " this is if model and sentences are passed\"\"\"\n", 754 | " input_ids,attention_masks=prep_input(sentences,labels=None, max_len=max_len)\n", 755 | " prediction_data = TensorDataset(input_ids, attention_masks)\n", 756 | " prediction_sampler = SequentialSampler(prediction_data)\n", 757 | " prediction_dataloader = 
DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n", 758 | " return predict_pyt(model, prediction_dataloader)\n", 759 | "\n", 760 | "\n", 761 | "\n" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": null, 767 | "metadata": {}, 768 | "outputs": [], 769 | "source": [ 770 | "## Prepare the test dataset\n", 771 | "batch_size = 32 \n", 772 | "\n", 773 | "Preprocess.prepare_test(text_col)\n", 774 | "test_sentences = Preprocess.test_sentences\n", 775 | "X_test_phase1= Preprocess.X_test" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "## Predictions of train dataset \n", 785 | "# model_path = '../input/camembertvectoroutput/best_model.pt'\n", 786 | "# checkpoint = torch.load(model_path)\n", 787 | "# model = checkpoint['model']\n", 788 | "# model.load_state_dict(checkpoint)\n", 789 | "start = time.time()\n", 790 | "predictions, softmax_logits , vec_outputs = predict_pyt(model, train_dataloader)\n", 791 | "\n", 792 | "#saving\n", 793 | "np.save('best_vec_train_model_eval.npy',vec_outputs)\n", 794 | "softmax_logits = np.array([ten.detach().cpu().numpy() for ten in softmax_logits])\n", 795 | "np.save('train_set_softmax_logits.npy',softmax_logits)\n", 796 | "print('length of predictions {}'.format(len(predictions)))\n", 797 | "print('Time Taken Predict for train set: {:}'.format(format_time(time.time() - start) ))" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [ 806 | "# Predictions of validation set which is randomly separated from train dataset\n", 807 | "start = time.time()\n", 808 | "predictions, val_softmax_logits , vec_outputs= predict_pyt(model, validation_dataloader)\n", 809 | "np.save('best_vec_val_model_eval.npy',vec_outputs)\n", 810 | "val_softmax_logits = np.array([ten.detach().cpu().numpy() for ten in val_softmax_logits])\n", 811 | "np.save('validation_set_softmax_logits.npy',val_softmax_logits)\n", 812 | "print('Time Taken Predict for val set: {:}'.format(format_time(time.time() - start)))" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "## Predictions of test dataset \n", 822 | "\n", 823 | "start = time.time()\n", 824 | "predictions, softmax_logits , vec_outputs = predict_wrapper(model, test_sentences)\n", 825 | "\n", 826 | "#saving\n", 827 | "np.save('best_vec_test.npy',vec_outputs)\n", 828 | "softmax_logits = np.array([ten.detach().cpu().numpy() for ten in softmax_logits])\n", 829 | "np.save('X_test_phase1_softmax_logits.npy',softmax_logits)\n", 830 | "print('length of predictions {}'.format(len(predictions)))\n", 831 | "print('Time Taken Predict for test set: {:}'.format(format_time(time.time() - start) ))" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "metadata": {}, 838 | "outputs": [], 839 | "source": [ 840 | "X_test_phase1['prediction_model']= predictions\n", 841 | "X_test_phase1['Prdtypecode']=X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)\n", 842 | "print(X_test_phase1['Prdtypecode'].value_counts())\n", 843 | "X_test_phase1=X_test_phase1.drop(['prediction_model','Title','Description'],axis=1)" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": {}, 850 | "outputs": [], 851 | "source": [ 852 | 
"X_test_phase1.to_csv('y_test_task1_phase1_pred.tsv',sep='\\t',index=False)\n" 853 | ] 854 | } 855 | ], 856 | "metadata": { 857 | "kernelspec": { 858 | "display_name": "Python 3", 859 | "language": "python", 860 | "name": "python3" 861 | }, 862 | "language_info": { 863 | "codemirror_mode": { 864 | "name": "ipython", 865 | "version": 3 866 | }, 867 | "file_extension": ".py", 868 | "mimetype": "text/x-python", 869 | "name": "python", 870 | "nbconvert_exporter": "python", 871 | "pygments_lexer": "ipython3", 872 | "version": "3.7.6" 873 | }, 874 | "toc": { 875 | "base_numbering": 1, 876 | "nav_menu": {}, 877 | "number_sections": true, 878 | "sideBar": true, 879 | "skip_h1_title": false, 880 | "title_cell": "Table of Contents", 881 | "title_sidebar": "Contents", 882 | "toc_cell": false, 883 | "toc_position": {}, 884 | "toc_section_display": true, 885 | "toc_window_display": false 886 | }, 887 | "widgets": { 888 | "application/vnd.jupyter.widget-state+json": { 889 | "state": { 890 | "04e2caaecb124a14945c845ca6e62aad": { 891 | "model_module": "@jupyter-widgets/controls", 892 | "model_module_version": "1.5.0", 893 | "model_name": "ProgressStyleModel", 894 | "state": { 895 | "_model_module": "@jupyter-widgets/controls", 896 | "_model_module_version": "1.5.0", 897 | "_model_name": "ProgressStyleModel", 898 | "_view_count": null, 899 | "_view_module": "@jupyter-widgets/base", 900 | "_view_module_version": "1.2.0", 901 | "_view_name": "StyleView", 902 | "bar_color": null, 903 | "description_width": "initial" 904 | } 905 | }, 906 | "0d47bfa702554bd68f25bb94db8a8811": { 907 | "model_module": "@jupyter-widgets/controls", 908 | "model_module_version": "1.5.0", 909 | "model_name": "DescriptionStyleModel", 910 | "state": { 911 | "_model_module": "@jupyter-widgets/controls", 912 | "_model_module_version": "1.5.0", 913 | "_model_name": "DescriptionStyleModel", 914 | "_view_count": null, 915 | "_view_module": "@jupyter-widgets/base", 916 | "_view_module_version": "1.2.0", 917 | "_view_name": "StyleView", 918 | "description_width": "" 919 | } 920 | }, 921 | "1e4ce92ff6a44d89b65e7917319266eb": { 922 | "model_module": "@jupyter-widgets/controls", 923 | "model_module_version": "1.5.0", 924 | "model_name": "DescriptionStyleModel", 925 | "state": { 926 | "_model_module": "@jupyter-widgets/controls", 927 | "_model_module_version": "1.5.0", 928 | "_model_name": "DescriptionStyleModel", 929 | "_view_count": null, 930 | "_view_module": "@jupyter-widgets/base", 931 | "_view_module_version": "1.2.0", 932 | "_view_name": "StyleView", 933 | "description_width": "" 934 | } 935 | }, 936 | "212f4750f35d4bc2b4272e6d070fce89": { 937 | "model_module": "@jupyter-widgets/controls", 938 | "model_module_version": "1.5.0", 939 | "model_name": "HBoxModel", 940 | "state": { 941 | "_dom_classes": [], 942 | "_model_module": "@jupyter-widgets/controls", 943 | "_model_module_version": "1.5.0", 944 | "_model_name": "HBoxModel", 945 | "_view_count": null, 946 | "_view_module": "@jupyter-widgets/controls", 947 | "_view_module_version": "1.5.0", 948 | "_view_name": "HBoxView", 949 | "box_style": "", 950 | "children": [ 951 | "IPY_MODEL_85588758dedc4b8bbc6ee33178593140", 952 | "IPY_MODEL_a39feb5a6e374ea2ab65be2fe8b75b00" 953 | ], 954 | "layout": "IPY_MODEL_5a054222842941dab063a8db8ede0ff2" 955 | } 956 | }, 957 | "4bc03bf5ab334fc590007e48be4dd318": { 958 | "model_module": "@jupyter-widgets/base", 959 | "model_module_version": "1.2.0", 960 | "model_name": "LayoutModel", 961 | "state": { 962 | "_model_module": "@jupyter-widgets/base", 963 | 
"_model_module_version": "1.2.0", 964 | "_model_name": "LayoutModel", 965 | "_view_count": null, 966 | "_view_module": "@jupyter-widgets/base", 967 | "_view_module_version": "1.2.0", 968 | "_view_name": "LayoutView", 969 | "align_content": null, 970 | "align_items": null, 971 | "align_self": null, 972 | "border": null, 973 | "bottom": null, 974 | "display": null, 975 | "flex": null, 976 | "flex_flow": null, 977 | "grid_area": null, 978 | "grid_auto_columns": null, 979 | "grid_auto_flow": null, 980 | "grid_auto_rows": null, 981 | "grid_column": null, 982 | "grid_gap": null, 983 | "grid_row": null, 984 | "grid_template_areas": null, 985 | "grid_template_columns": null, 986 | "grid_template_rows": null, 987 | "height": null, 988 | "justify_content": null, 989 | "justify_items": null, 990 | "left": null, 991 | "margin": null, 992 | "max_height": null, 993 | "max_width": null, 994 | "min_height": null, 995 | "min_width": null, 996 | "object_fit": null, 997 | "object_position": null, 998 | "order": null, 999 | "overflow": null, 1000 | "overflow_x": null, 1001 | "overflow_y": null, 1002 | "padding": null, 1003 | "right": null, 1004 | "top": null, 1005 | "visibility": null, 1006 | "width": null 1007 | } 1008 | }, 1009 | "4ec5441ef13241dcb0af2d40a4036e6e": { 1010 | "model_module": "@jupyter-widgets/controls", 1011 | "model_module_version": "1.5.0", 1012 | "model_name": "HBoxModel", 1013 | "state": { 1014 | "_dom_classes": [], 1015 | "_model_module": "@jupyter-widgets/controls", 1016 | "_model_module_version": "1.5.0", 1017 | "_model_name": "HBoxModel", 1018 | "_view_count": null, 1019 | "_view_module": "@jupyter-widgets/controls", 1020 | "_view_module_version": "1.5.0", 1021 | "_view_name": "HBoxView", 1022 | "box_style": "", 1023 | "children": [ 1024 | "IPY_MODEL_8cd310281c3e4133b3776f69196bef32", 1025 | "IPY_MODEL_702441bdd088466d8e1d264071baca75" 1026 | ], 1027 | "layout": "IPY_MODEL_dcc661b801f940139925b83564e8f282" 1028 | } 1029 | }, 1030 | "4f92279308be48e2a1b543fdb441246c": { 1031 | "model_module": "@jupyter-widgets/base", 1032 | "model_module_version": "1.2.0", 1033 | "model_name": "LayoutModel", 1034 | "state": { 1035 | "_model_module": "@jupyter-widgets/base", 1036 | "_model_module_version": "1.2.0", 1037 | "_model_name": "LayoutModel", 1038 | "_view_count": null, 1039 | "_view_module": "@jupyter-widgets/base", 1040 | "_view_module_version": "1.2.0", 1041 | "_view_name": "LayoutView", 1042 | "align_content": null, 1043 | "align_items": null, 1044 | "align_self": null, 1045 | "border": null, 1046 | "bottom": null, 1047 | "display": null, 1048 | "flex": null, 1049 | "flex_flow": null, 1050 | "grid_area": null, 1051 | "grid_auto_columns": null, 1052 | "grid_auto_flow": null, 1053 | "grid_auto_rows": null, 1054 | "grid_column": null, 1055 | "grid_gap": null, 1056 | "grid_row": null, 1057 | "grid_template_areas": null, 1058 | "grid_template_columns": null, 1059 | "grid_template_rows": null, 1060 | "height": null, 1061 | "justify_content": null, 1062 | "justify_items": null, 1063 | "left": null, 1064 | "margin": null, 1065 | "max_height": null, 1066 | "max_width": null, 1067 | "min_height": null, 1068 | "min_width": null, 1069 | "object_fit": null, 1070 | "object_position": null, 1071 | "order": null, 1072 | "overflow": null, 1073 | "overflow_x": null, 1074 | "overflow_y": null, 1075 | "padding": null, 1076 | "right": null, 1077 | "top": null, 1078 | "visibility": null, 1079 | "width": null 1080 | } 1081 | }, 1082 | "57be58d12ce0415590cd75f529dc8a06": { 1083 | "model_module": 
"@jupyter-widgets/base", 1084 | "model_module_version": "1.2.0", 1085 | "model_name": "LayoutModel", 1086 | "state": { 1087 | "_model_module": "@jupyter-widgets/base", 1088 | "_model_module_version": "1.2.0", 1089 | "_model_name": "LayoutModel", 1090 | "_view_count": null, 1091 | "_view_module": "@jupyter-widgets/base", 1092 | "_view_module_version": "1.2.0", 1093 | "_view_name": "LayoutView", 1094 | "align_content": null, 1095 | "align_items": null, 1096 | "align_self": null, 1097 | "border": null, 1098 | "bottom": null, 1099 | "display": null, 1100 | "flex": null, 1101 | "flex_flow": null, 1102 | "grid_area": null, 1103 | "grid_auto_columns": null, 1104 | "grid_auto_flow": null, 1105 | "grid_auto_rows": null, 1106 | "grid_column": null, 1107 | "grid_gap": null, 1108 | "grid_row": null, 1109 | "grid_template_areas": null, 1110 | "grid_template_columns": null, 1111 | "grid_template_rows": null, 1112 | "height": null, 1113 | "justify_content": null, 1114 | "justify_items": null, 1115 | "left": null, 1116 | "margin": null, 1117 | "max_height": null, 1118 | "max_width": null, 1119 | "min_height": null, 1120 | "min_width": null, 1121 | "object_fit": null, 1122 | "object_position": null, 1123 | "order": null, 1124 | "overflow": null, 1125 | "overflow_x": null, 1126 | "overflow_y": null, 1127 | "padding": null, 1128 | "right": null, 1129 | "top": null, 1130 | "visibility": null, 1131 | "width": null 1132 | } 1133 | }, 1134 | "5a054222842941dab063a8db8ede0ff2": { 1135 | "model_module": "@jupyter-widgets/base", 1136 | "model_module_version": "1.2.0", 1137 | "model_name": "LayoutModel", 1138 | "state": { 1139 | "_model_module": "@jupyter-widgets/base", 1140 | "_model_module_version": "1.2.0", 1141 | "_model_name": "LayoutModel", 1142 | "_view_count": null, 1143 | "_view_module": "@jupyter-widgets/base", 1144 | "_view_module_version": "1.2.0", 1145 | "_view_name": "LayoutView", 1146 | "align_content": null, 1147 | "align_items": null, 1148 | "align_self": null, 1149 | "border": null, 1150 | "bottom": null, 1151 | "display": null, 1152 | "flex": null, 1153 | "flex_flow": null, 1154 | "grid_area": null, 1155 | "grid_auto_columns": null, 1156 | "grid_auto_flow": null, 1157 | "grid_auto_rows": null, 1158 | "grid_column": null, 1159 | "grid_gap": null, 1160 | "grid_row": null, 1161 | "grid_template_areas": null, 1162 | "grid_template_columns": null, 1163 | "grid_template_rows": null, 1164 | "height": null, 1165 | "justify_content": null, 1166 | "justify_items": null, 1167 | "left": null, 1168 | "margin": null, 1169 | "max_height": null, 1170 | "max_width": null, 1171 | "min_height": null, 1172 | "min_width": null, 1173 | "object_fit": null, 1174 | "object_position": null, 1175 | "order": null, 1176 | "overflow": null, 1177 | "overflow_x": null, 1178 | "overflow_y": null, 1179 | "padding": null, 1180 | "right": null, 1181 | "top": null, 1182 | "visibility": null, 1183 | "width": null 1184 | } 1185 | }, 1186 | "6a940c5ee47e4a0fa6bd17899077b04c": { 1187 | "model_module": "@jupyter-widgets/base", 1188 | "model_module_version": "1.2.0", 1189 | "model_name": "LayoutModel", 1190 | "state": { 1191 | "_model_module": "@jupyter-widgets/base", 1192 | "_model_module_version": "1.2.0", 1193 | "_model_name": "LayoutModel", 1194 | "_view_count": null, 1195 | "_view_module": "@jupyter-widgets/base", 1196 | "_view_module_version": "1.2.0", 1197 | "_view_name": "LayoutView", 1198 | "align_content": null, 1199 | "align_items": null, 1200 | "align_self": null, 1201 | "border": null, 1202 | "bottom": null, 1203 | "display": 
null, 1204 | "flex": null, 1205 | "flex_flow": null, 1206 | "grid_area": null, 1207 | "grid_auto_columns": null, 1208 | "grid_auto_flow": null, 1209 | "grid_auto_rows": null, 1210 | "grid_column": null, 1211 | "grid_gap": null, 1212 | "grid_row": null, 1213 | "grid_template_areas": null, 1214 | "grid_template_columns": null, 1215 | "grid_template_rows": null, 1216 | "height": null, 1217 | "justify_content": null, 1218 | "justify_items": null, 1219 | "left": null, 1220 | "margin": null, 1221 | "max_height": null, 1222 | "max_width": null, 1223 | "min_height": null, 1224 | "min_width": null, 1225 | "object_fit": null, 1226 | "object_position": null, 1227 | "order": null, 1228 | "overflow": null, 1229 | "overflow_x": null, 1230 | "overflow_y": null, 1231 | "padding": null, 1232 | "right": null, 1233 | "top": null, 1234 | "visibility": null, 1235 | "width": null 1236 | } 1237 | }, 1238 | "702441bdd088466d8e1d264071baca75": { 1239 | "model_module": "@jupyter-widgets/controls", 1240 | "model_module_version": "1.5.0", 1241 | "model_name": "HTMLModel", 1242 | "state": { 1243 | "_dom_classes": [], 1244 | "_model_module": "@jupyter-widgets/controls", 1245 | "_model_module_version": "1.5.0", 1246 | "_model_name": "HTMLModel", 1247 | "_view_count": null, 1248 | "_view_module": "@jupyter-widgets/controls", 1249 | "_view_module_version": "1.5.0", 1250 | "_view_name": "HTMLView", 1251 | "description": "", 1252 | "description_tooltip": null, 1253 | "layout": "IPY_MODEL_94ed9026bc664a81a39ea16f09293c7c", 1254 | "placeholder": "​", 1255 | "style": "IPY_MODEL_0d47bfa702554bd68f25bb94db8a8811", 1256 | "value": " 811k/811k [00:01<00:00, 648kB/s]" 1257 | } 1258 | }, 1259 | "82320d113b0b40e1b038d3cf321b3433": { 1260 | "model_module": "@jupyter-widgets/controls", 1261 | "model_module_version": "1.5.0", 1262 | "model_name": "HTMLModel", 1263 | "state": { 1264 | "_dom_classes": [], 1265 | "_model_module": "@jupyter-widgets/controls", 1266 | "_model_module_version": "1.5.0", 1267 | "_model_name": "HTMLModel", 1268 | "_view_count": null, 1269 | "_view_module": "@jupyter-widgets/controls", 1270 | "_view_module_version": "1.5.0", 1271 | "_view_name": "HTMLView", 1272 | "description": "", 1273 | "description_tooltip": null, 1274 | "layout": "IPY_MODEL_4bc03bf5ab334fc590007e48be4dd318", 1275 | "placeholder": "​", 1276 | "style": "IPY_MODEL_895e9b60a3974711883bcd1d827de8a6", 1277 | "value": " 508/508 [00:00<00:00, 1.38kB/s]" 1278 | } 1279 | }, 1280 | "85588758dedc4b8bbc6ee33178593140": { 1281 | "model_module": "@jupyter-widgets/controls", 1282 | "model_module_version": "1.5.0", 1283 | "model_name": "FloatProgressModel", 1284 | "state": { 1285 | "_dom_classes": [], 1286 | "_model_module": "@jupyter-widgets/controls", 1287 | "_model_module_version": "1.5.0", 1288 | "_model_name": "FloatProgressModel", 1289 | "_view_count": null, 1290 | "_view_module": "@jupyter-widgets/controls", 1291 | "_view_module_version": "1.5.0", 1292 | "_view_name": "ProgressView", 1293 | "bar_style": "success", 1294 | "description": "Downloading: 100%", 1295 | "description_tooltip": null, 1296 | "layout": "IPY_MODEL_4f92279308be48e2a1b543fdb441246c", 1297 | "max": 445032417, 1298 | "min": 0, 1299 | "orientation": "horizontal", 1300 | "style": "IPY_MODEL_04e2caaecb124a14945c845ca6e62aad", 1301 | "value": 445032417 1302 | } 1303 | }, 1304 | "895e9b60a3974711883bcd1d827de8a6": { 1305 | "model_module": "@jupyter-widgets/controls", 1306 | "model_module_version": "1.5.0", 1307 | "model_name": "DescriptionStyleModel", 1308 | "state": { 1309 | 
"_model_module": "@jupyter-widgets/controls", 1310 | "_model_module_version": "1.5.0", 1311 | "_model_name": "DescriptionStyleModel", 1312 | "_view_count": null, 1313 | "_view_module": "@jupyter-widgets/base", 1314 | "_view_module_version": "1.2.0", 1315 | "_view_name": "StyleView", 1316 | "description_width": "" 1317 | } 1318 | }, 1319 | "8cd310281c3e4133b3776f69196bef32": { 1320 | "model_module": "@jupyter-widgets/controls", 1321 | "model_module_version": "1.5.0", 1322 | "model_name": "FloatProgressModel", 1323 | "state": { 1324 | "_dom_classes": [], 1325 | "_model_module": "@jupyter-widgets/controls", 1326 | "_model_module_version": "1.5.0", 1327 | "_model_name": "FloatProgressModel", 1328 | "_view_count": null, 1329 | "_view_module": "@jupyter-widgets/controls", 1330 | "_view_module_version": "1.5.0", 1331 | "_view_name": "ProgressView", 1332 | "bar_style": "success", 1333 | "description": "Downloading: 100%", 1334 | "description_tooltip": null, 1335 | "layout": "IPY_MODEL_fb5ba4132e1e455ea0b38556501346c8", 1336 | "max": 810912, 1337 | "min": 0, 1338 | "orientation": "horizontal", 1339 | "style": "IPY_MODEL_a717d5b6e71341408ed3a51d679f1ed6", 1340 | "value": 810912 1341 | } 1342 | }, 1343 | "94ed9026bc664a81a39ea16f09293c7c": { 1344 | "model_module": "@jupyter-widgets/base", 1345 | "model_module_version": "1.2.0", 1346 | "model_name": "LayoutModel", 1347 | "state": { 1348 | "_model_module": "@jupyter-widgets/base", 1349 | "_model_module_version": "1.2.0", 1350 | "_model_name": "LayoutModel", 1351 | "_view_count": null, 1352 | "_view_module": "@jupyter-widgets/base", 1353 | "_view_module_version": "1.2.0", 1354 | "_view_name": "LayoutView", 1355 | "align_content": null, 1356 | "align_items": null, 1357 | "align_self": null, 1358 | "border": null, 1359 | "bottom": null, 1360 | "display": null, 1361 | "flex": null, 1362 | "flex_flow": null, 1363 | "grid_area": null, 1364 | "grid_auto_columns": null, 1365 | "grid_auto_flow": null, 1366 | "grid_auto_rows": null, 1367 | "grid_column": null, 1368 | "grid_gap": null, 1369 | "grid_row": null, 1370 | "grid_template_areas": null, 1371 | "grid_template_columns": null, 1372 | "grid_template_rows": null, 1373 | "height": null, 1374 | "justify_content": null, 1375 | "justify_items": null, 1376 | "left": null, 1377 | "margin": null, 1378 | "max_height": null, 1379 | "max_width": null, 1380 | "min_height": null, 1381 | "min_width": null, 1382 | "object_fit": null, 1383 | "object_position": null, 1384 | "order": null, 1385 | "overflow": null, 1386 | "overflow_x": null, 1387 | "overflow_y": null, 1388 | "padding": null, 1389 | "right": null, 1390 | "top": null, 1391 | "visibility": null, 1392 | "width": null 1393 | } 1394 | }, 1395 | "a39feb5a6e374ea2ab65be2fe8b75b00": { 1396 | "model_module": "@jupyter-widgets/controls", 1397 | "model_module_version": "1.5.0", 1398 | "model_name": "HTMLModel", 1399 | "state": { 1400 | "_dom_classes": [], 1401 | "_model_module": "@jupyter-widgets/controls", 1402 | "_model_module_version": "1.5.0", 1403 | "_model_name": "HTMLModel", 1404 | "_view_count": null, 1405 | "_view_module": "@jupyter-widgets/controls", 1406 | "_view_module_version": "1.5.0", 1407 | "_view_name": "HTMLView", 1408 | "description": "", 1409 | "description_tooltip": null, 1410 | "layout": "IPY_MODEL_57be58d12ce0415590cd75f529dc8a06", 1411 | "placeholder": "​", 1412 | "style": "IPY_MODEL_1e4ce92ff6a44d89b65e7917319266eb", 1413 | "value": " 445M/445M [00:12<00:00, 35.8MB/s]" 1414 | } 1415 | }, 1416 | "a717d5b6e71341408ed3a51d679f1ed6": { 1417 | 
"model_module": "@jupyter-widgets/controls", 1418 | "model_module_version": "1.5.0", 1419 | "model_name": "ProgressStyleModel", 1420 | "state": { 1421 | "_model_module": "@jupyter-widgets/controls", 1422 | "_model_module_version": "1.5.0", 1423 | "_model_name": "ProgressStyleModel", 1424 | "_view_count": null, 1425 | "_view_module": "@jupyter-widgets/base", 1426 | "_view_module_version": "1.2.0", 1427 | "_view_name": "StyleView", 1428 | "bar_color": null, 1429 | "description_width": "initial" 1430 | } 1431 | }, 1432 | "ba22ce2585f54900b21f7f31ed15e78a": { 1433 | "model_module": "@jupyter-widgets/controls", 1434 | "model_module_version": "1.5.0", 1435 | "model_name": "FloatProgressModel", 1436 | "state": { 1437 | "_dom_classes": [], 1438 | "_model_module": "@jupyter-widgets/controls", 1439 | "_model_module_version": "1.5.0", 1440 | "_model_name": "FloatProgressModel", 1441 | "_view_count": null, 1442 | "_view_module": "@jupyter-widgets/controls", 1443 | "_view_module_version": "1.5.0", 1444 | "_view_name": "ProgressView", 1445 | "bar_style": "success", 1446 | "description": "Downloading: 100%", 1447 | "description_tooltip": null, 1448 | "layout": "IPY_MODEL_e8db38407d4f4525ba87dafb35c67a7d", 1449 | "max": 508, 1450 | "min": 0, 1451 | "orientation": "horizontal", 1452 | "style": "IPY_MODEL_ff8421ceeeb84863a79a95137d57e3a7", 1453 | "value": 508 1454 | } 1455 | }, 1456 | "dcc661b801f940139925b83564e8f282": { 1457 | "model_module": "@jupyter-widgets/base", 1458 | "model_module_version": "1.2.0", 1459 | "model_name": "LayoutModel", 1460 | "state": { 1461 | "_model_module": "@jupyter-widgets/base", 1462 | "_model_module_version": "1.2.0", 1463 | "_model_name": "LayoutModel", 1464 | "_view_count": null, 1465 | "_view_module": "@jupyter-widgets/base", 1466 | "_view_module_version": "1.2.0", 1467 | "_view_name": "LayoutView", 1468 | "align_content": null, 1469 | "align_items": null, 1470 | "align_self": null, 1471 | "border": null, 1472 | "bottom": null, 1473 | "display": null, 1474 | "flex": null, 1475 | "flex_flow": null, 1476 | "grid_area": null, 1477 | "grid_auto_columns": null, 1478 | "grid_auto_flow": null, 1479 | "grid_auto_rows": null, 1480 | "grid_column": null, 1481 | "grid_gap": null, 1482 | "grid_row": null, 1483 | "grid_template_areas": null, 1484 | "grid_template_columns": null, 1485 | "grid_template_rows": null, 1486 | "height": null, 1487 | "justify_content": null, 1488 | "justify_items": null, 1489 | "left": null, 1490 | "margin": null, 1491 | "max_height": null, 1492 | "max_width": null, 1493 | "min_height": null, 1494 | "min_width": null, 1495 | "object_fit": null, 1496 | "object_position": null, 1497 | "order": null, 1498 | "overflow": null, 1499 | "overflow_x": null, 1500 | "overflow_y": null, 1501 | "padding": null, 1502 | "right": null, 1503 | "top": null, 1504 | "visibility": null, 1505 | "width": null 1506 | } 1507 | }, 1508 | "dd232800d1994d96816b47b1eb042df7": { 1509 | "model_module": "@jupyter-widgets/controls", 1510 | "model_module_version": "1.5.0", 1511 | "model_name": "HBoxModel", 1512 | "state": { 1513 | "_dom_classes": [], 1514 | "_model_module": "@jupyter-widgets/controls", 1515 | "_model_module_version": "1.5.0", 1516 | "_model_name": "HBoxModel", 1517 | "_view_count": null, 1518 | "_view_module": "@jupyter-widgets/controls", 1519 | "_view_module_version": "1.5.0", 1520 | "_view_name": "HBoxView", 1521 | "box_style": "", 1522 | "children": [ 1523 | "IPY_MODEL_ba22ce2585f54900b21f7f31ed15e78a", 1524 | "IPY_MODEL_82320d113b0b40e1b038d3cf321b3433" 1525 | ], 1526 | 
"layout": "IPY_MODEL_6a940c5ee47e4a0fa6bd17899077b04c" 1527 | } 1528 | }, 1529 | "e8db38407d4f4525ba87dafb35c67a7d": { 1530 | "model_module": "@jupyter-widgets/base", 1531 | "model_module_version": "1.2.0", 1532 | "model_name": "LayoutModel", 1533 | "state": { 1534 | "_model_module": "@jupyter-widgets/base", 1535 | "_model_module_version": "1.2.0", 1536 | "_model_name": "LayoutModel", 1537 | "_view_count": null, 1538 | "_view_module": "@jupyter-widgets/base", 1539 | "_view_module_version": "1.2.0", 1540 | "_view_name": "LayoutView", 1541 | "align_content": null, 1542 | "align_items": null, 1543 | "align_self": null, 1544 | "border": null, 1545 | "bottom": null, 1546 | "display": null, 1547 | "flex": null, 1548 | "flex_flow": null, 1549 | "grid_area": null, 1550 | "grid_auto_columns": null, 1551 | "grid_auto_flow": null, 1552 | "grid_auto_rows": null, 1553 | "grid_column": null, 1554 | "grid_gap": null, 1555 | "grid_row": null, 1556 | "grid_template_areas": null, 1557 | "grid_template_columns": null, 1558 | "grid_template_rows": null, 1559 | "height": null, 1560 | "justify_content": null, 1561 | "justify_items": null, 1562 | "left": null, 1563 | "margin": null, 1564 | "max_height": null, 1565 | "max_width": null, 1566 | "min_height": null, 1567 | "min_width": null, 1568 | "object_fit": null, 1569 | "object_position": null, 1570 | "order": null, 1571 | "overflow": null, 1572 | "overflow_x": null, 1573 | "overflow_y": null, 1574 | "padding": null, 1575 | "right": null, 1576 | "top": null, 1577 | "visibility": null, 1578 | "width": null 1579 | } 1580 | }, 1581 | "fb5ba4132e1e455ea0b38556501346c8": { 1582 | "model_module": "@jupyter-widgets/base", 1583 | "model_module_version": "1.2.0", 1584 | "model_name": "LayoutModel", 1585 | "state": { 1586 | "_model_module": "@jupyter-widgets/base", 1587 | "_model_module_version": "1.2.0", 1588 | "_model_name": "LayoutModel", 1589 | "_view_count": null, 1590 | "_view_module": "@jupyter-widgets/base", 1591 | "_view_module_version": "1.2.0", 1592 | "_view_name": "LayoutView", 1593 | "align_content": null, 1594 | "align_items": null, 1595 | "align_self": null, 1596 | "border": null, 1597 | "bottom": null, 1598 | "display": null, 1599 | "flex": null, 1600 | "flex_flow": null, 1601 | "grid_area": null, 1602 | "grid_auto_columns": null, 1603 | "grid_auto_flow": null, 1604 | "grid_auto_rows": null, 1605 | "grid_column": null, 1606 | "grid_gap": null, 1607 | "grid_row": null, 1608 | "grid_template_areas": null, 1609 | "grid_template_columns": null, 1610 | "grid_template_rows": null, 1611 | "height": null, 1612 | "justify_content": null, 1613 | "justify_items": null, 1614 | "left": null, 1615 | "margin": null, 1616 | "max_height": null, 1617 | "max_width": null, 1618 | "min_height": null, 1619 | "min_width": null, 1620 | "object_fit": null, 1621 | "object_position": null, 1622 | "order": null, 1623 | "overflow": null, 1624 | "overflow_x": null, 1625 | "overflow_y": null, 1626 | "padding": null, 1627 | "right": null, 1628 | "top": null, 1629 | "visibility": null, 1630 | "width": null 1631 | } 1632 | }, 1633 | "ff8421ceeeb84863a79a95137d57e3a7": { 1634 | "model_module": "@jupyter-widgets/controls", 1635 | "model_module_version": "1.5.0", 1636 | "model_name": "ProgressStyleModel", 1637 | "state": { 1638 | "_model_module": "@jupyter-widgets/controls", 1639 | "_model_module_version": "1.5.0", 1640 | "_model_name": "ProgressStyleModel", 1641 | "_view_count": null, 1642 | "_view_module": "@jupyter-widgets/base", 1643 | "_view_module_version": "1.2.0", 1644 | "_view_name": 
"StyleView", 1645 | "bar_color": null, 1646 | "description_width": "initial" 1647 | } 1648 | } 1649 | }, 1650 | "version_major": 2, 1651 | "version_minor": 0 1652 | } 1653 | } 1654 | }, 1655 | "nbformat": 4, 1656 | "nbformat_minor": 4 1657 | } 1658 | -------------------------------------------------------------------------------- /flaubert_train_predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 8 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import os, time, datetime\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "from tqdm import tqdm\n", 16 | "import random\n", 17 | "import logging\n", 18 | "tqdm.pandas()\n", 19 | "import seaborn as sns\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "\n", 22 | "#NN Packages\n", 23 | "import torch\n", 24 | "import torch.nn as nn\n", 25 | "from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler\n", 26 | "\n", 27 | "logger = logging.getLogger(__name__)\n", 28 | "\n", 29 | "\n", 30 | "if torch.cuda.is_available(): \n", 31 | "\n", 32 | " # Tell PyTorch to use the GPU. \n", 33 | " device = torch.device(\"cuda\")\n", 34 | "\n", 35 | " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", 36 | "\n", 37 | " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", 38 | "\n", 39 | "# If not...\n", 40 | "else:\n", 41 | " print('No GPU available, using the CPU instead.')\n", 42 | " device = torch.device(\"cpu\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Processing text data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "def format_time(elapsed):\n", 59 | " '''\n", 60 | " Takes a time in seconds and returns a string hh:mm:ss\n", 61 | " '''\n", 62 | " # Round to the nearest second.\n", 63 | " elapsed_rounded = int(round((elapsed)))\n", 64 | " \n", 65 | " # Format as hh:mm:ss\n", 66 | " return str(datetime.timedelta(seconds=elapsed_rounded))\n", 67 | "\n", 68 | "class SigirPreprocess():\n", 69 | " \n", 70 | " \n", 71 | " def __init__(self, text_data_path):\n", 72 | " self.text_data_path = text_data_path\n", 73 | " self.train = None\n", 74 | " self.dict_code_to_id = {}\n", 75 | " self.dict_id_to_code = {}\n", 76 | " self.list_tags = {}\n", 77 | " self.sentences = []\n", 78 | " self.labels = []\n", 79 | " self.text_col = None\n", 80 | " self.X_test = None\n", 81 | " \n", 82 | " \n", 83 | " def prepare_data(self ):\n", 84 | " \n", 85 | " #loading the train data and test data\n", 86 | " catalog_eng = pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n", 87 | " X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n", 88 | " Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n", 89 | " self.list_tags = list(Y_train['Prdtypecode'].unique())\n", 90 | " \n", 91 | " for i,tag in enumerate(self.list_tags):\n", 92 | " self.dict_code_to_id[tag] = i \n", 93 | " self.dict_id_to_code[i]=tag\n", 94 | " \n", 95 | " #map \n", 96 | " Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n", 97 | " \n", 98 | " #merge the train\n", 99 | " 
train=pd.merge(left=X_train,right=Y_train,\n", 100 | " how='left',left_on=['Integer_id','Image_id','Product_id'],\n", 101 | " right_on=['Integer_id','Image_id','Product_id'])\n", 102 | " prod_map=pd.Series(catalog_eng['Top level category'].values,\n", 103 | " index=catalog_eng['Prdtypecode']).to_dict()\n", 104 | " \n", 105 | " #creating the mapping\n", 106 | " train['product'] = train['Prdtypecode'].map(prod_map)\n", 107 | " train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n", 108 | " train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n", 109 | " train['title_desc_len']=train['title_len'] + train['desc_len']\n", 110 | " train.loc[train['Description'].isnull(), 'Description'] = \" \"\n", 111 | " train['title_desc'] = train['Title'] + \" \" + train['Description']\n", 112 | " \n", 113 | " self.train = train\n", 114 | " \n", 115 | " def get_sentences(self, text_col, remove_null_rows=False):\n", 116 | " self.text_col = text_col\n", 117 | " if remove_null_rows==True:\n", 118 | " new_train = self.train[self.train[text_col].notnull()]\n", 119 | "\n", 120 | " else:\n", 121 | " new_train = self.train.copy()\n", 122 | " \n", 123 | " self.sentences = new_train[text_col].values\n", 124 | " self.labels = new_train['labels'].values\n", 125 | " \n", 126 | " def prepare_test(self, text_col):\n", 127 | " X_test=pd.read_csv(self.text_data_path+\"data/x_test_task1_phase1.tsv\",sep=\"\\t\")\n", 128 | " X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n", 129 | " X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n", 130 | " self.X_test = X_test\n", 131 | " self.test_sentences = X_test[text_col].values\n", 132 | " " 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "text_col = 'title_desc'\n", 142 | "max_len = 256\n", 143 | "val_size = 0.1\n", 144 | "\n", 145 | "# model_str_dict = {'c':'camembert',\n", 146 | "# 'f':'flaubert'}\n", 147 | "# # 'f' for flaubert & 'c' for camembert\n", 148 | "# case='f' \n", 149 | "# model_str = model_str_dict[case]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "Preprocess = SigirPreprocess(\"/../input/textphase1/\")\n", 159 | "Preprocess.prepare_data()\n", 160 | "Preprocess.get_sentences(text_col, True)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "sentences = Preprocess.sentences\n", 170 | "labels = Preprocess.labels\n", 171 | "print(\"Total number of sentences:{}, labels:{}\".format(len(sentences), len(labels)))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# sns.countplot(x='product', data=self.train)\n", 181 | "# sns.countplot(x='Prdtypecode', data=self.train)\n", 182 | "# sns.distplot(Preprocess.train['title_len'])\n", 183 | "# sns.distplot(Preprocess.train['title_desc_len'])\n", 184 | "# np.percentile(Preprocess.train['title_desc_len'], 99)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "len(Preprocess.dict_code_to_id)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | 
"outputs": [], 201 | "source": [ 202 | "from transformers import XLMForSequenceClassification\n", 203 | "from transformers import FlaubertModel, FlaubertTokenizer,FlaubertForSequenceClassification,AdamW, FlaubertConfig \n", 204 | "from torch.nn import Dropout,Conv1d, Linear\n", 205 | "from transformers.modeling_utils import SequenceSummary" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# a1 = sentences[0]\n", 215 | "# max_len = 40\n", 216 | "# modelname = 'flaubert-base-cased'\n", 217 | "# tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)\n", 218 | "\n", 219 | "# encoded_dict = tokenizer.encode_plus(\n", 220 | "# a1, # Sentence to encode.\n", 221 | "# add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", 222 | "# max_length = max_len, # Pad & truncate all sentences.\n", 223 | "# pad_to_max_length = True,\n", 224 | "# return_attention_mask = True, # Construct attn. masks.\n", 225 | "# return_tensors = 'pt', # Return pytorch tensors.\n", 226 | "# )\n", 227 | "\n", 228 | "\n", 229 | "# iid = encoded_dict['input_ids']\n", 230 | "# mask = encoded_dict['attention_mask']\n", 231 | "\n", 232 | "# iid,mask\n", 233 | "\n", 234 | "# # modelname = 'flaubert-base-cased'\n", 235 | "\n", 236 | "# model = CustFlaubertForSequenceClassification.from_pretrained(\n", 237 | "# modelname, # Use the 12-layer BERT model, with an uncased vocab.\n", 238 | "# # num_labels = len(Preprocess.dict_code_to_id), # The number of output labels--2 for binary classification.\n", 239 | "# # You can increase this for multi-class tasks. \n", 240 | "# output_attentions = False, # Whether the model returns attentions weights.\n", 241 | "# output_hidden_states = False, # Whether the model returns all hidden-states.\n", 242 | "# )\n", 243 | "\n", 244 | "# outputs, embed1 = model(iid, token_type_ids=None, attention_mask=mask, \n", 245 | "# )" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# #max length after tokenization\n", 255 | "# _max_len = 0\n", 256 | "# # For every sentence...\n", 257 | "# for sent in tqdm(sentences):\n", 258 | "\n", 259 | "# # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n", 260 | "# input_ids = tokenizer.encode(sent, add_special_tokens=True)\n", 261 | "\n", 262 | "# # Update the maximum sentence length.\n", 263 | "# _max_len = max(_max_len, len(input_ids))\n", 264 | "\n", 265 | "# print('Max sentence length: ', _max_len)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "modelname = 'flaubert-base-cased'\n", 275 | "tokenizer = FlaubertTokenizer.from_pretrained(modelname, do_lowercase=False)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "#function to prepare input for model training\n", 285 | "def prep_input(sentences,labels, max_len):\n", 286 | " input_ids = []\n", 287 | " attention_masks = []\n", 288 | "\n", 289 | " # For every sentence...\n", 290 | " for sent in tqdm(sentences):\n", 291 | " # `encode_plus` will:\n", 292 | " # (1) Tokenize the sentence.\n", 293 | " # (2) Prepend the `[CLS]` token to the start.\n", 294 | " # (3) Append the `[SEP]` token to the end.\n", 295 | " # (4) Map tokens to their IDs.\n", 296 | " # (5) Pad or truncate the sentence to 
`max_length`\n", 297 | " # (6) Create attention masks for [PAD] tokens.\n", 298 | " encoded_dict = tokenizer.encode_plus(\n", 299 | " sent, # Sentence to encode.\n", 300 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", 301 | " max_length = max_len, # Pad & truncate all sentences.\n", 302 | " pad_to_max_length = True,\n", 303 | " return_attention_mask = True, # Construct attn. masks.\n", 304 | " return_tensors = 'pt', # Return pytorch tensors.\n", 305 | " )\n", 306 | "\n", 307 | " # Add the encoded sentence to the list. \n", 308 | " input_ids.append(encoded_dict['input_ids'])\n", 309 | "\n", 310 | " # And its attention mask (simply differentiates padding from non-padding).\n", 311 | " attention_masks.append(encoded_dict['attention_mask'])\n", 312 | "\n", 313 | " # Convert the lists into tensors.\n", 314 | " input_ids = torch.cat(input_ids, dim=0)\n", 315 | " attention_masks = torch.cat(attention_masks, dim=0)\n", 316 | " if labels is not None:\n", 317 | " labels = torch.tensor(labels)\n", 318 | " return input_ids,attention_masks,labels\n", 319 | " else:\n", 320 | " return input_ids,attention_masks\n", 321 | " " 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "input_ids,attention_masks,labels=prep_input(sentences,labels, max_len=max_len)\n", 331 | "# print('Original: ', sentences[0])\n", 332 | "# print('Token IDs:', input_ids[0])" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n", 342 | "\n", 343 | "\n", 344 | "\n", 345 | "#Validation split\n", 346 | "tr_inputs, val_inputs, tr_labels, val_labels = train_test_split(input_ids, labels,stratify=labels,\n", 347 | " random_state=2020, test_size=val_size)\n", 348 | "\n", 349 | "\n", 350 | "tr_masks, val_masks, u,v = train_test_split(attention_masks, labels,stratify=labels,\n", 351 | " random_state=2020, test_size=val_size)\n", 352 | "\n", 353 | "\n", 354 | "train_dataset=TensorDataset(tr_inputs, tr_masks, tr_labels)\n", 355 | "val_dataset=TensorDataset(val_inputs, val_masks, val_labels)\n", 356 | "train_sampler = RandomSampler(train_dataset) \n", 357 | "valid_sampler = SequentialSampler(val_dataset)\n", 358 | "\n", 359 | "\n", 360 | "# The DataLoader needs to know our batch size for training, so we specify it \n", 361 | "# here. For fine-tuning BERT on a specific task, the authors recommend a batch \n", 362 | "# size of 16 or 32.\n", 363 | "batch_size = 32\n", 364 | "\n", 365 | "# Create the DataLoaders for our training and validation sets.\n", 366 | "# We'll take training samples in random order. 
\n", 367 | "train_dataloader = DataLoader(\n", 368 | " train_dataset, # The training samples.\n", 369 | " sampler = train_sampler, # Select batches randomly\n", 370 | " batch_size = batch_size # Trains with this batch size.\n", 371 | " )\n", 372 | "\n", 373 | "# For validation the order doesn't matter, so we'll just read them sequentially.\n", 374 | "validation_dataloader = DataLoader(\n", 375 | " val_dataset, # The validation samples.\n", 376 | " sampler = valid_sampler, # Pull out batches sequentially.\n", 377 | " batch_size = batch_size # Evaluate with this batch size.\n", 378 | " )" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "# Function to calculate the accuracy of our predictions vs labels\n", 388 | "def flat_accuracy(preds, labels):\n", 389 | " pred_flat = np.argmax(preds, axis=1).flatten()\n", 390 | " labels_flat = labels.flatten()\n", 391 | " return np.sum(pred_flat == labels_flat) / len(labels_flat)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "num_classes = 27" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "class vec_output_FlaubertForSequenceClassification(FlaubertModel):\n", 410 | " \n", 411 | " config_class = FlaubertConfig\n", 412 | " \n", 413 | "\n", 414 | " def __init__(self, config):\n", 415 | " super().__init__(config)\n", 416 | " self.transformer = FlaubertModel(config)\n", 417 | " self.sequence_summary = SequenceSummary(config)\n", 418 | " self.init_weights()\n", 419 | " self.dropout = torch.nn.Dropout(0.1)\n", 420 | " self.classifier = torch.nn.Linear(config.hidden_size, num_classes)\n", 421 | "\n", 422 | "\n", 423 | " def forward(\n", 424 | " self,\n", 425 | " input_ids=None,\n", 426 | " attention_mask=None,\n", 427 | " langs=None,\n", 428 | " token_type_ids=None,\n", 429 | " position_ids=None,\n", 430 | " lengths=None,\n", 431 | " cache=None,\n", 432 | " head_mask=None,\n", 433 | " inputs_embeds=None,\n", 434 | " labels=None,\n", 435 | " ):\n", 436 | " \n", 437 | " \n", 438 | " transformer_outputs = self.transformer(\n", 439 | " input_ids,\n", 440 | " attention_mask=attention_mask,\n", 441 | " langs=langs,\n", 442 | " token_type_ids=token_type_ids,\n", 443 | " position_ids=position_ids,\n", 444 | " lengths=lengths,\n", 445 | " cache=cache,\n", 446 | " head_mask=head_mask,\n", 447 | " inputs_embeds=inputs_embeds,\n", 448 | " )\n", 449 | "\n", 450 | " #output = self.dropout(output)\n", 451 | " output = transformer_outputs[0]\n", 452 | " vec = output[:,0]\n", 453 | " \n", 454 | " \n", 455 | " #logits\n", 456 | " dense = self.dropout(vec)\n", 457 | " \n", 458 | " #classifier\n", 459 | " logits = self.classifier(dense)\n", 460 | " \n", 461 | " outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here\n", 462 | " \n", 463 | " \n", 464 | " return outputs,dense" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "len(Preprocess.dict_code_to_id)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "modelname = 'flaubert-base-cased'\n", 483 | "\n", 484 | "model = 
vec_output_FlaubertForSequenceClassification.from_pretrained(\n", 485 | "    modelname, # the 12-layer FlauBERT base model, with a cased vocab.\n", 486 | "    num_labels = len(Preprocess.dict_code_to_id), # The number of output labels -- the 27 product type codes.\n", 487 | "                    # (kept in the config; the custom head itself uses num_classes.)  \n", 488 | "    output_attentions = False, # Whether the model returns attentions weights.\n", 489 | "    output_hidden_states = False, # Whether the model returns all hidden-states.\n", 490 | ")\n", 491 | "\n", 492 | "model.cuda()" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "optimizer = AdamW(model.parameters(),\n", 502 | "                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n", 503 | "                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.\n", 504 | "                )\n" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "from transformers import get_linear_schedule_with_warmup\n", 514 | "# Number of training epochs. The BERT authors recommend between 2 and 4; \n", 515 | "# we run for 12 here and keep only the checkpoint with the best validation\n", 516 | "# F1-score, since later epochs may over-fit the training data.\n", 517 | "epochs = 12\n", 518 | "\n", 519 | "# Total number of training steps is [number of batches] x [number of epochs]. \n", 520 | "# (Note that this is not the same as the number of training samples).\n", 521 | "total_steps = len(train_dataloader) * epochs\n", 522 | "\n", 523 | "# Create the learning rate scheduler.\n", 524 | "scheduler = get_linear_schedule_with_warmup(optimizer, \n", 525 | "                                            num_warmup_steps = 0, # Default value in run_glue.py\n", 526 | "                                            num_training_steps = total_steps)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "import torch.nn as nn\n", 536 | "loss_criterion = nn.CrossEntropyLoss()\n" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "from sklearn.metrics import f1_score\n", 546 | "# This training code is based on the `run_glue.py` script here:\n", 547 | "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", 548 | "\n", 549 | "# Set the seed value all over the place to make this reproducible.\n", 550 | "seed_val = 42\n", 551 | "\n", 552 | "random.seed(seed_val)\n", 553 | "np.random.seed(seed_val)\n", 554 | "torch.manual_seed(seed_val)\n", 555 | "torch.cuda.manual_seed_all(seed_val)\n", 556 | "\n", 557 | "# We'll store a number of quantities such as training and validation loss, \n", 558 | "# validation accuracy, and timings.\n", 559 | "training_stats = []\n", 560 | "\n", 561 | "# Measure the total training time for the whole run.\n", 562 | "total_t0 = time.time()\n", 563 | "# Track the best validation macro-F1 across epochs; initialised once here so\n", 564 | "best_f1 = 0  # the checkpoint is only overwritten on a genuine improvement.\n", 565 | "# For each epoch...\n", 566 | "for epoch_i in range(0, epochs):\n", 567 | "    \n", 568 | "    # ========================================\n", 569 | "    #               Training\n", 570 | "    # ========================================\n", 571 | "    \n", 572 | "    # Perform one full pass over the training set.\n", 573 | "\n", 574 | "    print(\"\")\n", 575 | "    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", 576 | "    print('Training...')\n", 577 | "    \n", 578 | "    #tr and val\n", 579 | "    vec_output_tr = []\n", 580 | "    vec_output_val =[]\n",
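"    # These buffers collect the pooled CLS feature vector for every training\n",
"    # and validation batch; on the best epoch they are saved as\n",
"    # best_vec_train.npy / best_vec_val.npy, presumably as the text features\n",
"    # consumed by the multi-modal fusion notebooks in this repo.\n",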
581 | "\n", 582 | "    # Measure how long the training epoch takes.\n", 583 | "    t0 = time.time()\n", 584 | "\n", 585 | "    # Reset the total loss for this epoch.\n", 586 | "    total_train_loss = 0\n", 587 | "\n", 588 | "    # Put the model into training mode. Don't be misled--the call to \n", 589 | "    # `train` just changes the *mode*, it doesn't *perform* the training.\n", 590 | "    # `dropout` and `batchnorm` layers behave differently during training\n", 591 | "    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n", 592 | "\n", 593 | "    model.train()\n", 594 | "\n", 595 | "    # For each batch of training data...\n", 596 | "    for step, batch in enumerate(train_dataloader):\n", 597 | "\n", 598 | "        # Progress update every 40 batches.\n", 599 | "        if step % 40 == 0 and not step == 0:\n", 600 | "            # Calculate elapsed time in minutes.\n", 601 | "            elapsed = format_time(time.time() - t0)\n", 602 | "            \n", 603 | "            # Report progress.\n", 604 | "            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", 605 | "\n", 606 | "        # Unpack this training batch from our dataloader. \n", 607 | "        #\n", 608 | "        # As we unpack the batch, we'll also copy each tensor to the GPU using the \n", 609 | "        # `to` method.\n", 610 | "        #\n", 611 | "        # `batch` contains three pytorch tensors:\n", 612 | "        #   [0]: input ids \n", 613 | "        #   [1]: attention masks\n", 614 | "        #   [2]: labels \n", 615 | "        b_input_ids = batch[0].to(device)\n", 616 | "        b_input_mask = batch[1].to(device)\n", 617 | "        b_labels = batch[2].to(device)\n", 618 | "\n", 619 | "        # Always clear any previously calculated gradients before performing a\n", 620 | "        # backward pass. PyTorch doesn't do this automatically because \n", 621 | "        # accumulating the gradients is \"convenient while training RNNs\". \n", 622 | "        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n", 623 | "        model.zero_grad()        \n", 624 | "\n", 625 | "        # Perform a forward pass (evaluate the model on this training batch).\n", 626 | "        # Unlike the stock FlaubertForSequenceClassification head, this custom\n", 627 | "        # model does not compute a loss internally: it returns the transformer\n", 628 | "        # outputs, with the \"logits\"--the model outputs prior to activation--\n", 629 | "        # first, together with the pooled CLS vector. The cross-entropy loss\n", 630 | "        # is computed explicitly below, so the same head can serve both\n", 631 | "        # classification and feature extraction.\n", 632 | "        logits,vec = model(b_input_ids, \n", 633 | "                             token_type_ids=None, \n", 634 | "                             attention_mask=b_input_mask\n", 635 | "                           )\n", 636 | "        # the first element of the returned tuple holds the logits\n", 637 | "        logits = logits[0]\n", 638 | "        \n", 639 | "        #Compute the cross-entropy loss\n", 640 | "        loss = loss_criterion(logits, b_labels)\n", 641 | "        \n", 642 | "        #save the CLS features for this training batch\n", 643 | "        vec = vec.detach().cpu().numpy()\n", 644 | "        vec_output_tr.extend(vec)\n", 645 | "        \n", 646 | "        # Accumulate the training loss over all of the batches so that we can\n", 647 | "        # calculate the average loss at the end. `loss` is a Tensor containing a\n", 648 | "        # single value; the `.item()` function just returns the Python value \n", 649 | "        # from the tensor.\n", 650 | "        total_train_loss += loss.item()\n", 651 | "\n", 652 | "        # Perform a backward pass to calculate the gradients.\n", 653 | "        loss.backward()\n", 654 | "\n", 655 | "        # Clip the norm of the gradients to 1.0.\n", 656 | "        # This is to help prevent the \"exploding gradients\" problem.\n", 657 | "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", 658 | "\n", 659 | "        # Update parameters and take a step using the computed gradient.\n", 660 | "        # The optimizer dictates the \"update rule\"--how the parameters are\n", 661 | "        # modified based on their gradients, the learning rate, etc.\n", 662 | "        optimizer.step()\n", 663 | "\n", 664 | "        # Update the learning rate.\n", 665 | "        scheduler.step()\n", 666 | "        \n", 667 | "        \n", 668 | "        \n", 669 | "\n", 670 | "    # Calculate the average loss over all of the batches.\n", 671 | "    avg_train_loss = total_train_loss / len(train_dataloader)            \n", 672 | "    \n", 673 | "    # Measure how long this epoch took.\n", 674 | "    training_time = format_time(time.time() - t0)\n", 675 | "\n", 676 | "    print(\"\")\n", 677 | "    print(\"  Average training loss: {0:.2f} \".format(avg_train_loss))\n", 678 | "    print(\"  Training epoch took: {:} \".format(training_time))\n", 679 | "    \n", 680 | "    # ========================================\n", 681 | "    #               Validation\n", 682 | "    # ========================================\n", 683 | "    # After the completion of each training epoch, measure our performance on\n", 684 | "    # our validation set.\n", 685 | "\n", 686 | "    print(\"\")\n", 687 | "    print(\"Running Validation...\")\n", 688 | "\n", 689 | "    t0 = time.time()\n", 690 | "\n", 691 | "    # Put the model in evaluation mode--the dropout layers behave differently\n", 692 | "    # during evaluation.\n", 693 | "    model.eval()\n", 694 | "\n", 695 | "    # Tracking variables \n", 696 | "    total_eval_accuracy = 0\n", 697 | "    total_eval_loss = 0\n", 698 | "    nb_eval_steps = 0\n", 699 | "    predictions=[]\n", 700 | "    true_labels=[]\n", 701 | "    \n", 702 | "\n", 703 | "    # Evaluate data for one epoch\n", 704 | "    for batch in validation_dataloader:\n", 705 | "        \n", 706 | "        # Unpack this validation batch from our dataloader. \n", 707 | "        #\n", 708 | "        # As we unpack the batch, we'll also copy each tensor to the GPU using \n", 709 | "        # the `to` method.\n", 710 | "        #\n", 711 | "        # `batch` contains three pytorch tensors:\n", 712 | "        #   [0]: input ids \n", 713 | "        #   [1]: attention masks\n", 714 | "        #   [2]: labels \n", 715 | "        b_input_ids = batch[0].to(device)\n", 716 | "        b_input_mask = batch[1].to(device)\n", 717 | "        b_labels = batch[2].to(device)\n", 718 | "        \n", 719 | "        # Tell pytorch not to bother with constructing the compute graph during\n", 720 | "        # the forward pass, since this is only needed for backprop (training).\n", 721 | "        with torch.no_grad():        \n", 722 | "\n", 723 | "            # Forward pass, calculate logit predictions.\n", 724 | "            # token_type_ids is the same as the \"segment ids\", which \n", 725 | "            # differentiates sentence 1 and 2 in 2-sentence tasks.\n", 726 | "            # As in the training loop, the custom head returns the transformer\n", 727 | "            # outputs together with the pooled CLS vector.\n", 728 | "            
The \"logits\" are the output\n", 729 | " # values prior to applying an activation function like the softmax.\n", 730 | " logits,vec = model(b_input_ids, \n", 731 | " token_type_ids=None, \n", 732 | " attention_mask=b_input_mask\n", 733 | " )\n", 734 | " \n", 735 | " #new\n", 736 | " logits = logits[0]\n", 737 | " \n", 738 | " #defining the val loss\n", 739 | " loss = loss_criterion(logits, b_labels)\n", 740 | " \n", 741 | " \n", 742 | " # Accumulate the validation loss.\n", 743 | " total_eval_loss += loss.item()\n", 744 | "\n", 745 | " # Move logits and labels to CPU\n", 746 | " logits = logits.detach().cpu().numpy()\n", 747 | "\n", 748 | " # Move logits and labels to CPU\n", 749 | " predicted_labels=np.argmax(logits,axis=1)\n", 750 | " predictions.extend(predicted_labels)\n", 751 | " label_ids = b_labels.to('cpu').numpy()\n", 752 | " true_labels.extend(label_ids)\n", 753 | " \n", 754 | " #saving the features_tr\n", 755 | " vec = vec.detach().cpu().numpy()\n", 756 | " vec_output_val.extend(vec)\n", 757 | " \n", 758 | "\n", 759 | " # Calculate the accuracy for this batch of test sentences, and\n", 760 | " # accumulate it over all batches.\n", 761 | " total_eval_accuracy += flat_accuracy(logits, label_ids)\n", 762 | " \n", 763 | "\n", 764 | " # Report the final accuracy for this validation run.\n", 765 | " avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n", 766 | " print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n", 767 | "\n", 768 | " # Calculate the average loss over all of the batches.\n", 769 | " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", 770 | " \n", 771 | " # Measure how long the validation run took.\n", 772 | " validation_time = format_time(time.time() - t0)\n", 773 | " \n", 774 | " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", 775 | " print(\" Validation took: {:}\".format(validation_time))\n", 776 | " print(\"Validation F1-Score: {}\".format(f1_score(true_labels,predictions,average='macro')))\n", 777 | " curr_f1=f1_score(true_labels,predictions,average='macro')\n", 778 | " if curr_f1 > best_f1:\n", 779 | " best_f1=curr_f1\n", 780 | " torch.save(model.state_dict(), 'best_model.pt')\n", 781 | " np.save('best_vec_train.npy',vec_output_tr)\n", 782 | " np.save('best_vec_val.npy',vec_output_val)\n", 783 | " # Record all statistics from this epoch.\n", 784 | "# training_stats.append(\n", 785 | "# {\n", 786 | "# 'epoch': epoch_i + 1,\n", 787 | "# 'Training Loss': avg_train_loss,\n", 788 | "# 'Valid. Loss': avg_val_loss,\n", 789 | "# 'Valid. 
783 | "    # Record all statistics from this epoch.\n", 784 | "#     training_stats.append(\n", 785 | "#         {\n", 786 | "#             'epoch': epoch_i + 1,\n", 787 | "#             'Training Loss': avg_train_loss,\n", 788 | "#             'Valid. Loss': avg_val_loss,\n", 789 | "#             'Valid. Accur.': avg_val_accuracy,\n", 790 | "#             'Training Time': training_time,\n", 791 | "#             'Validation Time': validation_time\n", 792 | "#         }\n", 793 | "#     )\n", 794 | "\n", 795 | "print(\"\")\n", 796 | "print(\"Training complete!\")\n", 797 | "\n", 798 | "print(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "# Save model\n", 808 | "# try:\n", 809 | "#     model_state = {'model': model,\n", 810 | "#                  'state_dict': model.state_dict(),\n", 811 | "#                  'optimizer' : optimizer.state_dict()}\n", 812 | "\n", 813 | "#     torch.save(model_state, 'saved_model.pth')\n", 814 | "# except:\n", 815 | "#     print('Error in saving model')" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "**Test model on unseen data**" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "# PREDICTIONS" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "**UNCOMMENT THE BELOW CELL IF TRAINING IS NOT PERFORMED IN THIS RUN**" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "model_path = '/../working/best_model.pt'" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [ 854 | "## Change the **model path** accordingly\n", 855 | "# model_str = 'flaubert'\n", 856 | "# model_path_dict = {'camembert':'/../input/camembertvinodh/saved_model.pth',\n", 857 | "#                    'flaubert':'/../input/flaubertekansh/saved_model.pth'}\n", 858 | "\n", 859 | "# model_path = model_path_dict[model_str]\n", 860 | "checkpoint = torch.load(model_path)\n", 861 | "# model = checkpoint['model']\n", 862 | "model.load_state_dict(checkpoint)" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": {}, 869 | "outputs": [], 870 | "source": [ 871 | "def predict_pyt(model, prediction_dataloader):\n", 872 | "    \"\"\"\n", 873 | "    model: pytorch model\n", 874 | "    prediction_dataloader: DataLoader object for which predictions are to be made.\n", 875 | "    return:\n", 876 | "    predictions:- predicted label ids (argmax of the logits)\n", 877 | "    softmax_logits:- logits normalized with a softmax over the classes\"\"\"\n", 878 | "    \n", 879 | "    # Put model in evaluation mode\n", 880 | "    model.eval()\n", 881 | "\n", 882 | "    # Tracking variables \n", 883 | "    predictions = []\n", 884 | "    softmax_logits=[]\n", 885 | "    vec_outputs = []\n", 886 | "    \n", 887 | "    # Predict \n", 888 | "    for batch in prediction_dataloader:\n", 889 | "        \n", 890 | "        # Add batch to GPU\n", 891 | "        batch = tuple(t.to(device) for t in batch)\n", 892 | "        # Unpack the inputs; a labelled DataLoader also yields labels, ignored here\n", 893 | "        try:\n", 894 | "            b_input_ids, b_input_mask = batch\n", 895 | "        except ValueError:\n", 896 | "            b_input_ids, b_input_mask, _ = batch\n", 897 | "        # Telling the model not to compute or store gradients, saving memory and \n", 898 | "        # speeding up prediction\n", 899 | "        with torch.no_grad():\n", 900 | "            # Forward pass, calculate logit predictions\n", 901 | "            logits,vec = model(b_input_ids, token_type_ids=None, \n", 902 | "                                  attention_mask=b_input_mask)\n", 903 | "            \n", 904 | "            logits = logits[0]\n", 905 | "\n", 906 | "            \n", 907 | "            #----- Add softmax--- \n", 908 | "            m = nn.Softmax(dim=1)\n",
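"            # Softmax turns the raw logits into per-class probabilities; these\n",
"            # are what get saved to the *_softmax_logits.npy files, presumably\n",
"            # so the boosted late-fusion stage can stack them across models.\n",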
909 | "            \n", 910 | "            output = m(logits)\n", 911 | "            #-------#------\n", 912 | "            \n", 913 | "        # Move logits to CPU\n", 914 | "        logits = logits.detach().cpu().numpy()\n", 915 | "        predicted_labels=np.argmax(logits,axis=1)\n", 916 | "        predictions.extend(predicted_labels)\n", 917 | "        softmax_logits.extend(output)\n", 918 | "        \n", 919 | "        #vec_outputs saving\n", 920 | "        vec = vec.detach().cpu().numpy()\n", 921 | "        vec_outputs.extend(vec)\n", 922 | "\n", 923 | "    print('DONE')\n", 924 | "    return predictions, softmax_logits , vec_outputs\n", 925 | "\n", 926 | "def predict_wrapper(model, sentences, max_len=max_len, batch_size = batch_size ):\n", 927 | "    \"\"\"\n", 928 | "    Wrapper that builds a DataLoader from raw sentences and runs predict_pyt;\n", 929 | "    use this when only the model and the sentences are at hand\"\"\"\n", 930 | "    input_ids,attention_masks=prep_input(sentences,labels=None, max_len=max_len)\n", 931 | "    prediction_data = TensorDataset(input_ids, attention_masks)\n", 932 | "    prediction_sampler = SequentialSampler(prediction_data)\n", 933 | "    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n", 934 | "    return predict_pyt(model, prediction_dataloader)" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": null, 940 | "metadata": {}, 941 | "outputs": [], 942 | "source": [ 943 | "## Prepare the test dataset\n", 944 | "batch_size = 32 \n", 945 | "\n", 946 | "Preprocess.prepare_test(text_col)\n", 947 | "test_sentences = Preprocess.test_sentences\n", 948 | "X_test_phase1= Preprocess.X_test" 949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": null, 954 | "metadata": {}, 955 | "outputs": [], 956 | "source": [ 957 | "# Predictions on the validation set, which was randomly split off from the train dataset\n", 958 | "start = time.time()\n", 959 | "predictions, val_softmax_logits , vec_outputs= predict_pyt(model, validation_dataloader)\n", 960 | "val_softmax_logits = np.array([ten.detach().cpu().numpy() for ten in val_softmax_logits])\n", 961 | "np.save('validation_set_softmax_logits.npy',val_softmax_logits)\n", 962 | "print('Time Taken Predict for val set: {:}'.format(format_time(time.time() - start)))" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": null, 968 | "metadata": {}, 969 | "outputs": [], 970 | "source": [ 971 | "## Predictions of test dataset \n", 972 | "\n", 973 | "start = time.time()\n", 974 | "predictions, softmax_logits , vec_outputs = predict_wrapper(model, test_sentences)\n", 975 | "\n", 976 | "#saving\n", 977 | "np.save('best_vec_test.npy',vec_outputs)\n", 978 | "softmax_logits = np.array([ten.detach().cpu().numpy() for ten in softmax_logits])\n", 979 | "np.save('X_test_phase1_softmax_logits.npy',softmax_logits)\n", 980 | "print('length of predictions {}'.format(len(predictions)))\n", 981 | "print('Time Taken Predict for test set: {:}'.format(format_time(time.time() - start) ))" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": {}, 988 | "outputs": [], 989 | "source": [ 990 | "X_test_phase1['prediction_model']= predictions\n", 991 | "X_test_phase1['Prdtypecode']=X_test_phase1['prediction_model'].map(Preprocess.dict_id_to_code)\n", 992 | "print(X_test_phase1['Prdtypecode'].value_counts())\n", 993 | "X_test_phase1=X_test_phase1.drop(['prediction_model','Title','Description'],axis=1)" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": null, 999 | "metadata": {}, 1000 | "outputs": [], 1001 
| "source": [ 1002 | "X_test_phase1.to_csv('y_test_task1_phase1_pred.tsv',sep='\\t',index=False)" 1003 | ] 1004 | } 1005 | ], 1006 | "metadata": { 1007 | "kernelspec": { 1008 | "display_name": "Python 3", 1009 | "language": "python", 1010 | "name": "python3" 1011 | }, 1012 | "language_info": { 1013 | "codemirror_mode": { 1014 | "name": "ipython", 1015 | "version": 3 1016 | }, 1017 | "file_extension": ".py", 1018 | "mimetype": "text/x-python", 1019 | "name": "python", 1020 | "nbconvert_exporter": "python", 1021 | "pygments_lexer": "ipython3", 1022 | "version": "3.7.7" 1023 | }, 1024 | "toc": { 1025 | "base_numbering": 1, 1026 | "nav_menu": {}, 1027 | "number_sections": true, 1028 | "sideBar": true, 1029 | "skip_h1_title": false, 1030 | "title_cell": "Table of Contents", 1031 | "title_sidebar": "Contents", 1032 | "toc_cell": false, 1033 | "toc_position": {}, 1034 | "toc_section_display": true, 1035 | "toc_window_display": false 1036 | }, 1037 | "widgets": { 1038 | "application/vnd.jupyter.widget-state+json": { 1039 | "state": { 1040 | "06d2301a3d34440eb19a887fb51d562c": { 1041 | "model_module": "@jupyter-widgets/controls", 1042 | "model_module_version": "1.5.0", 1043 | "model_name": "FloatProgressModel", 1044 | "state": { 1045 | "_dom_classes": [], 1046 | "_model_module": "@jupyter-widgets/controls", 1047 | "_model_module_version": "1.5.0", 1048 | "_model_name": "FloatProgressModel", 1049 | "_view_count": null, 1050 | "_view_module": "@jupyter-widgets/controls", 1051 | "_view_module_version": "1.5.0", 1052 | "_view_name": "ProgressView", 1053 | "bar_style": "success", 1054 | "description": "Downloading: 100%", 1055 | "description_tooltip": null, 1056 | "layout": "IPY_MODEL_18fe6f38ca234379a17e44cd1fad50d4", 1057 | "max": 553238687, 1058 | "min": 0, 1059 | "orientation": "horizontal", 1060 | "style": "IPY_MODEL_54ca2b8f204b4760bccb27572ff7b74a", 1061 | "value": 553238687 1062 | } 1063 | }, 1064 | "0f7e8f4e75fc4889a2dd464588c0516d": { 1065 | "model_module": "@jupyter-widgets/controls", 1066 | "model_module_version": "1.5.0", 1067 | "model_name": "HBoxModel", 1068 | "state": { 1069 | "_dom_classes": [], 1070 | "_model_module": "@jupyter-widgets/controls", 1071 | "_model_module_version": "1.5.0", 1072 | "_model_name": "HBoxModel", 1073 | "_view_count": null, 1074 | "_view_module": "@jupyter-widgets/controls", 1075 | "_view_module_version": "1.5.0", 1076 | "_view_name": "HBoxView", 1077 | "box_style": "", 1078 | "children": [ 1079 | "IPY_MODEL_33dc04e218864811a4fe0c62ca737d83", 1080 | "IPY_MODEL_274606aec69a461f8c4259316b86c4af" 1081 | ], 1082 | "layout": "IPY_MODEL_f5c9c24c01114e168e90ed555fb9f05a" 1083 | } 1084 | }, 1085 | "12a4548fdad44ea181868776c7616455": { 1086 | "model_module": "@jupyter-widgets/base", 1087 | "model_module_version": "1.2.0", 1088 | "model_name": "LayoutModel", 1089 | "state": { 1090 | "_model_module": "@jupyter-widgets/base", 1091 | "_model_module_version": "1.2.0", 1092 | "_model_name": "LayoutModel", 1093 | "_view_count": null, 1094 | "_view_module": "@jupyter-widgets/base", 1095 | "_view_module_version": "1.2.0", 1096 | "_view_name": "LayoutView", 1097 | "align_content": null, 1098 | "align_items": null, 1099 | "align_self": null, 1100 | "border": null, 1101 | "bottom": null, 1102 | "display": null, 1103 | "flex": null, 1104 | "flex_flow": null, 1105 | "grid_area": null, 1106 | "grid_auto_columns": null, 1107 | "grid_auto_flow": null, 1108 | "grid_auto_rows": null, 1109 | "grid_column": null, 1110 | "grid_gap": null, 1111 | "grid_row": null, 1112 | 
"grid_template_areas": null, 1113 | "grid_template_columns": null, 1114 | "grid_template_rows": null, 1115 | "height": null, 1116 | "justify_content": null, 1117 | "justify_items": null, 1118 | "left": null, 1119 | "margin": null, 1120 | "max_height": null, 1121 | "max_width": null, 1122 | "min_height": null, 1123 | "min_width": null, 1124 | "object_fit": null, 1125 | "object_position": null, 1126 | "order": null, 1127 | "overflow": null, 1128 | "overflow_x": null, 1129 | "overflow_y": null, 1130 | "padding": null, 1131 | "right": null, 1132 | "top": null, 1133 | "visibility": null, 1134 | "width": null 1135 | } 1136 | }, 1137 | "18fe6f38ca234379a17e44cd1fad50d4": { 1138 | "model_module": "@jupyter-widgets/base", 1139 | "model_module_version": "1.2.0", 1140 | "model_name": "LayoutModel", 1141 | "state": { 1142 | "_model_module": "@jupyter-widgets/base", 1143 | "_model_module_version": "1.2.0", 1144 | "_model_name": "LayoutModel", 1145 | "_view_count": null, 1146 | "_view_module": "@jupyter-widgets/base", 1147 | "_view_module_version": "1.2.0", 1148 | "_view_name": "LayoutView", 1149 | "align_content": null, 1150 | "align_items": null, 1151 | "align_self": null, 1152 | "border": null, 1153 | "bottom": null, 1154 | "display": null, 1155 | "flex": null, 1156 | "flex_flow": null, 1157 | "grid_area": null, 1158 | "grid_auto_columns": null, 1159 | "grid_auto_flow": null, 1160 | "grid_auto_rows": null, 1161 | "grid_column": null, 1162 | "grid_gap": null, 1163 | "grid_row": null, 1164 | "grid_template_areas": null, 1165 | "grid_template_columns": null, 1166 | "grid_template_rows": null, 1167 | "height": null, 1168 | "justify_content": null, 1169 | "justify_items": null, 1170 | "left": null, 1171 | "margin": null, 1172 | "max_height": null, 1173 | "max_width": null, 1174 | "min_height": null, 1175 | "min_width": null, 1176 | "object_fit": null, 1177 | "object_position": null, 1178 | "order": null, 1179 | "overflow": null, 1180 | "overflow_x": null, 1181 | "overflow_y": null, 1182 | "padding": null, 1183 | "right": null, 1184 | "top": null, 1185 | "visibility": null, 1186 | "width": null 1187 | } 1188 | }, 1189 | "26cc77465c0e4f30b086bf93a81f9386": { 1190 | "model_module": "@jupyter-widgets/controls", 1191 | "model_module_version": "1.5.0", 1192 | "model_name": "DescriptionStyleModel", 1193 | "state": { 1194 | "_model_module": "@jupyter-widgets/controls", 1195 | "_model_module_version": "1.5.0", 1196 | "_model_name": "DescriptionStyleModel", 1197 | "_view_count": null, 1198 | "_view_module": "@jupyter-widgets/base", 1199 | "_view_module_version": "1.2.0", 1200 | "_view_name": "StyleView", 1201 | "description_width": "" 1202 | } 1203 | }, 1204 | "274606aec69a461f8c4259316b86c4af": { 1205 | "model_module": "@jupyter-widgets/controls", 1206 | "model_module_version": "1.5.0", 1207 | "model_name": "HTMLModel", 1208 | "state": { 1209 | "_dom_classes": [], 1210 | "_model_module": "@jupyter-widgets/controls", 1211 | "_model_module_version": "1.5.0", 1212 | "_model_name": "HTMLModel", 1213 | "_view_count": null, 1214 | "_view_module": "@jupyter-widgets/controls", 1215 | "_view_module_version": "1.5.0", 1216 | "_view_name": "HTMLView", 1217 | "description": "", 1218 | "description_tooltip": null, 1219 | "layout": "IPY_MODEL_ed03788fd9b14684b1d339664f56bfd5", 1220 | "placeholder": "​", 1221 | "style": "IPY_MODEL_5fb92f13f2a5410b84cc9a7573e7da0a", 1222 | "value": " 896k/896k [00:01<00:00, 770kB/s]" 1223 | } 1224 | }, 1225 | "2a6633db6b2946d6a6c8a66065e394cd": { 1226 | "model_module": "@jupyter-widgets/base", 
1227 | "model_module_version": "1.2.0", 1228 | "model_name": "LayoutModel", 1229 | "state": { 1230 | "_model_module": "@jupyter-widgets/base", 1231 | "_model_module_version": "1.2.0", 1232 | "_model_name": "LayoutModel", 1233 | "_view_count": null, 1234 | "_view_module": "@jupyter-widgets/base", 1235 | "_view_module_version": "1.2.0", 1236 | "_view_name": "LayoutView", 1237 | "align_content": null, 1238 | "align_items": null, 1239 | "align_self": null, 1240 | "border": null, 1241 | "bottom": null, 1242 | "display": null, 1243 | "flex": null, 1244 | "flex_flow": null, 1245 | "grid_area": null, 1246 | "grid_auto_columns": null, 1247 | "grid_auto_flow": null, 1248 | "grid_auto_rows": null, 1249 | "grid_column": null, 1250 | "grid_gap": null, 1251 | "grid_row": null, 1252 | "grid_template_areas": null, 1253 | "grid_template_columns": null, 1254 | "grid_template_rows": null, 1255 | "height": null, 1256 | "justify_content": null, 1257 | "justify_items": null, 1258 | "left": null, 1259 | "margin": null, 1260 | "max_height": null, 1261 | "max_width": null, 1262 | "min_height": null, 1263 | "min_width": null, 1264 | "object_fit": null, 1265 | "object_position": null, 1266 | "order": null, 1267 | "overflow": null, 1268 | "overflow_x": null, 1269 | "overflow_y": null, 1270 | "padding": null, 1271 | "right": null, 1272 | "top": null, 1273 | "visibility": null, 1274 | "width": null 1275 | } 1276 | }, 1277 | "33dc04e218864811a4fe0c62ca737d83": { 1278 | "model_module": "@jupyter-widgets/controls", 1279 | "model_module_version": "1.5.0", 1280 | "model_name": "FloatProgressModel", 1281 | "state": { 1282 | "_dom_classes": [], 1283 | "_model_module": "@jupyter-widgets/controls", 1284 | "_model_module_version": "1.5.0", 1285 | "_model_name": "FloatProgressModel", 1286 | "_view_count": null, 1287 | "_view_module": "@jupyter-widgets/controls", 1288 | "_view_module_version": "1.5.0", 1289 | "_view_name": "ProgressView", 1290 | "bar_style": "success", 1291 | "description": "Downloading: 100%", 1292 | "description_tooltip": null, 1293 | "layout": "IPY_MODEL_703da9466c0241519229161cb6ec5d87", 1294 | "max": 895731, 1295 | "min": 0, 1296 | "orientation": "horizontal", 1297 | "style": "IPY_MODEL_55d928f692d04a008a85a77abf0e46a0", 1298 | "value": 895731 1299 | } 1300 | }, 1301 | "432e4a857a5d4151a4d1b5bc7b6bb4fb": { 1302 | "model_module": "@jupyter-widgets/controls", 1303 | "model_module_version": "1.5.0", 1304 | "model_name": "DescriptionStyleModel", 1305 | "state": { 1306 | "_model_module": "@jupyter-widgets/controls", 1307 | "_model_module_version": "1.5.0", 1308 | "_model_name": "DescriptionStyleModel", 1309 | "_view_count": null, 1310 | "_view_module": "@jupyter-widgets/base", 1311 | "_view_module_version": "1.2.0", 1312 | "_view_name": "StyleView", 1313 | "description_width": "" 1314 | } 1315 | }, 1316 | "4c498d5d31d543ad9d4f63df61ce9332": { 1317 | "model_module": "@jupyter-widgets/controls", 1318 | "model_module_version": "1.5.0", 1319 | "model_name": "HBoxModel", 1320 | "state": { 1321 | "_dom_classes": [], 1322 | "_model_module": "@jupyter-widgets/controls", 1323 | "_model_module_version": "1.5.0", 1324 | "_model_name": "HBoxModel", 1325 | "_view_count": null, 1326 | "_view_module": "@jupyter-widgets/controls", 1327 | "_view_module_version": "1.5.0", 1328 | "_view_name": "HBoxView", 1329 | "box_style": "", 1330 | "children": [ 1331 | "IPY_MODEL_7be53379597948ed83e132d9014abf53", 1332 | "IPY_MODEL_ddb8a6c765504d6ea10daf2da84a5c83" 1333 | ], 1334 | "layout": "IPY_MODEL_d8565e4e46924bbfbabe7b3eb8df8b79" 1335 | } 
1336 | }, 1337 | "54ca2b8f204b4760bccb27572ff7b74a": { 1338 | "model_module": "@jupyter-widgets/controls", 1339 | "model_module_version": "1.5.0", 1340 | "model_name": "ProgressStyleModel", 1341 | "state": { 1342 | "_model_module": "@jupyter-widgets/controls", 1343 | "_model_module_version": "1.5.0", 1344 | "_model_name": "ProgressStyleModel", 1345 | "_view_count": null, 1346 | "_view_module": "@jupyter-widgets/base", 1347 | "_view_module_version": "1.2.0", 1348 | "_view_name": "StyleView", 1349 | "bar_color": null, 1350 | "description_width": "initial" 1351 | } 1352 | }, 1353 | "55d928f692d04a008a85a77abf0e46a0": { 1354 | "model_module": "@jupyter-widgets/controls", 1355 | "model_module_version": "1.5.0", 1356 | "model_name": "ProgressStyleModel", 1357 | "state": { 1358 | "_model_module": "@jupyter-widgets/controls", 1359 | "_model_module_version": "1.5.0", 1360 | "_model_name": "ProgressStyleModel", 1361 | "_view_count": null, 1362 | "_view_module": "@jupyter-widgets/base", 1363 | "_view_module_version": "1.2.0", 1364 | "_view_name": "StyleView", 1365 | "bar_color": null, 1366 | "description_width": "initial" 1367 | } 1368 | }, 1369 | "5bc9b6ada49a4642a1cba622c93f8b62": { 1370 | "model_module": "@jupyter-widgets/base", 1371 | "model_module_version": "1.2.0", 1372 | "model_name": "LayoutModel", 1373 | "state": { 1374 | "_model_module": "@jupyter-widgets/base", 1375 | "_model_module_version": "1.2.0", 1376 | "_model_name": "LayoutModel", 1377 | "_view_count": null, 1378 | "_view_module": "@jupyter-widgets/base", 1379 | "_view_module_version": "1.2.0", 1380 | "_view_name": "LayoutView", 1381 | "align_content": null, 1382 | "align_items": null, 1383 | "align_self": null, 1384 | "border": null, 1385 | "bottom": null, 1386 | "display": null, 1387 | "flex": null, 1388 | "flex_flow": null, 1389 | "grid_area": null, 1390 | "grid_auto_columns": null, 1391 | "grid_auto_flow": null, 1392 | "grid_auto_rows": null, 1393 | "grid_column": null, 1394 | "grid_gap": null, 1395 | "grid_row": null, 1396 | "grid_template_areas": null, 1397 | "grid_template_columns": null, 1398 | "grid_template_rows": null, 1399 | "height": null, 1400 | "justify_content": null, 1401 | "justify_items": null, 1402 | "left": null, 1403 | "margin": null, 1404 | "max_height": null, 1405 | "max_width": null, 1406 | "min_height": null, 1407 | "min_width": null, 1408 | "object_fit": null, 1409 | "object_position": null, 1410 | "order": null, 1411 | "overflow": null, 1412 | "overflow_x": null, 1413 | "overflow_y": null, 1414 | "padding": null, 1415 | "right": null, 1416 | "top": null, 1417 | "visibility": null, 1418 | "width": null 1419 | } 1420 | }, 1421 | "5fb92f13f2a5410b84cc9a7573e7da0a": { 1422 | "model_module": "@jupyter-widgets/controls", 1423 | "model_module_version": "1.5.0", 1424 | "model_name": "DescriptionStyleModel", 1425 | "state": { 1426 | "_model_module": "@jupyter-widgets/controls", 1427 | "_model_module_version": "1.5.0", 1428 | "_model_name": "DescriptionStyleModel", 1429 | "_view_count": null, 1430 | "_view_module": "@jupyter-widgets/base", 1431 | "_view_module_version": "1.2.0", 1432 | "_view_name": "StyleView", 1433 | "description_width": "" 1434 | } 1435 | }, 1436 | "67f9de54d6e5434190bd07b7151d23b7": { 1437 | "model_module": "@jupyter-widgets/controls", 1438 | "model_module_version": "1.5.0", 1439 | "model_name": "FloatProgressModel", 1440 | "state": { 1441 | "_dom_classes": [], 1442 | "_model_module": "@jupyter-widgets/controls", 1443 | "_model_module_version": "1.5.0", 1444 | "_model_name": 
"FloatProgressModel", 1445 | "_view_count": null, 1446 | "_view_module": "@jupyter-widgets/controls", 1447 | "_view_module_version": "1.5.0", 1448 | "_view_name": "ProgressView", 1449 | "bar_style": "success", 1450 | "description": "Downloading: 100%", 1451 | "description_tooltip": null, 1452 | "layout": "IPY_MODEL_edcc338999ac45feaab03a86e2af75a9", 1453 | "max": 1496, 1454 | "min": 0, 1455 | "orientation": "horizontal", 1456 | "style": "IPY_MODEL_f575006dc6624157bfb408cced4e6ae6", 1457 | "value": 1496 1458 | } 1459 | }, 1460 | "68cf808ab7e1428fa9acf6a9fd435b49": { 1461 | "model_module": "@jupyter-widgets/base", 1462 | "model_module_version": "1.2.0", 1463 | "model_name": "LayoutModel", 1464 | "state": { 1465 | "_model_module": "@jupyter-widgets/base", 1466 | "_model_module_version": "1.2.0", 1467 | "_model_name": "LayoutModel", 1468 | "_view_count": null, 1469 | "_view_module": "@jupyter-widgets/base", 1470 | "_view_module_version": "1.2.0", 1471 | "_view_name": "LayoutView", 1472 | "align_content": null, 1473 | "align_items": null, 1474 | "align_self": null, 1475 | "border": null, 1476 | "bottom": null, 1477 | "display": null, 1478 | "flex": null, 1479 | "flex_flow": null, 1480 | "grid_area": null, 1481 | "grid_auto_columns": null, 1482 | "grid_auto_flow": null, 1483 | "grid_auto_rows": null, 1484 | "grid_column": null, 1485 | "grid_gap": null, 1486 | "grid_row": null, 1487 | "grid_template_areas": null, 1488 | "grid_template_columns": null, 1489 | "grid_template_rows": null, 1490 | "height": null, 1491 | "justify_content": null, 1492 | "justify_items": null, 1493 | "left": null, 1494 | "margin": null, 1495 | "max_height": null, 1496 | "max_width": null, 1497 | "min_height": null, 1498 | "min_width": null, 1499 | "object_fit": null, 1500 | "object_position": null, 1501 | "order": null, 1502 | "overflow": null, 1503 | "overflow_x": null, 1504 | "overflow_y": null, 1505 | "padding": null, 1506 | "right": null, 1507 | "top": null, 1508 | "visibility": null, 1509 | "width": null 1510 | } 1511 | }, 1512 | "703da9466c0241519229161cb6ec5d87": { 1513 | "model_module": "@jupyter-widgets/base", 1514 | "model_module_version": "1.2.0", 1515 | "model_name": "LayoutModel", 1516 | "state": { 1517 | "_model_module": "@jupyter-widgets/base", 1518 | "_model_module_version": "1.2.0", 1519 | "_model_name": "LayoutModel", 1520 | "_view_count": null, 1521 | "_view_module": "@jupyter-widgets/base", 1522 | "_view_module_version": "1.2.0", 1523 | "_view_name": "LayoutView", 1524 | "align_content": null, 1525 | "align_items": null, 1526 | "align_self": null, 1527 | "border": null, 1528 | "bottom": null, 1529 | "display": null, 1530 | "flex": null, 1531 | "flex_flow": null, 1532 | "grid_area": null, 1533 | "grid_auto_columns": null, 1534 | "grid_auto_flow": null, 1535 | "grid_auto_rows": null, 1536 | "grid_column": null, 1537 | "grid_gap": null, 1538 | "grid_row": null, 1539 | "grid_template_areas": null, 1540 | "grid_template_columns": null, 1541 | "grid_template_rows": null, 1542 | "height": null, 1543 | "justify_content": null, 1544 | "justify_items": null, 1545 | "left": null, 1546 | "margin": null, 1547 | "max_height": null, 1548 | "max_width": null, 1549 | "min_height": null, 1550 | "min_width": null, 1551 | "object_fit": null, 1552 | "object_position": null, 1553 | "order": null, 1554 | "overflow": null, 1555 | "overflow_x": null, 1556 | "overflow_y": null, 1557 | "padding": null, 1558 | "right": null, 1559 | "top": null, 1560 | "visibility": null, 1561 | "width": null 1562 | } 1563 | }, 1564 | 
"7be53379597948ed83e132d9014abf53": { 1565 | "model_module": "@jupyter-widgets/controls", 1566 | "model_module_version": "1.5.0", 1567 | "model_name": "FloatProgressModel", 1568 | "state": { 1569 | "_dom_classes": [], 1570 | "_model_module": "@jupyter-widgets/controls", 1571 | "_model_module_version": "1.5.0", 1572 | "_model_name": "FloatProgressModel", 1573 | "_view_count": null, 1574 | "_view_module": "@jupyter-widgets/controls", 1575 | "_view_module_version": "1.5.0", 1576 | "_view_name": "ProgressView", 1577 | "bar_style": "success", 1578 | "description": "Downloading: 100%", 1579 | "description_tooltip": null, 1580 | "layout": "IPY_MODEL_91e5603df33e4f6fad3bc0b8fe67cb47", 1581 | "max": 1561415, 1582 | "min": 0, 1583 | "orientation": "horizontal", 1584 | "style": "IPY_MODEL_b4807859ba084927b1c01ad25559e790", 1585 | "value": 1561415 1586 | } 1587 | }, 1588 | "91e35ac677a045c49b3f7dbf243b2e6a": { 1589 | "model_module": "@jupyter-widgets/base", 1590 | "model_module_version": "1.2.0", 1591 | "model_name": "LayoutModel", 1592 | "state": { 1593 | "_model_module": "@jupyter-widgets/base", 1594 | "_model_module_version": "1.2.0", 1595 | "_model_name": "LayoutModel", 1596 | "_view_count": null, 1597 | "_view_module": "@jupyter-widgets/base", 1598 | "_view_module_version": "1.2.0", 1599 | "_view_name": "LayoutView", 1600 | "align_content": null, 1601 | "align_items": null, 1602 | "align_self": null, 1603 | "border": null, 1604 | "bottom": null, 1605 | "display": null, 1606 | "flex": null, 1607 | "flex_flow": null, 1608 | "grid_area": null, 1609 | "grid_auto_columns": null, 1610 | "grid_auto_flow": null, 1611 | "grid_auto_rows": null, 1612 | "grid_column": null, 1613 | "grid_gap": null, 1614 | "grid_row": null, 1615 | "grid_template_areas": null, 1616 | "grid_template_columns": null, 1617 | "grid_template_rows": null, 1618 | "height": null, 1619 | "justify_content": null, 1620 | "justify_items": null, 1621 | "left": null, 1622 | "margin": null, 1623 | "max_height": null, 1624 | "max_width": null, 1625 | "min_height": null, 1626 | "min_width": null, 1627 | "object_fit": null, 1628 | "object_position": null, 1629 | "order": null, 1630 | "overflow": null, 1631 | "overflow_x": null, 1632 | "overflow_y": null, 1633 | "padding": null, 1634 | "right": null, 1635 | "top": null, 1636 | "visibility": null, 1637 | "width": null 1638 | } 1639 | }, 1640 | "91e5603df33e4f6fad3bc0b8fe67cb47": { 1641 | "model_module": "@jupyter-widgets/base", 1642 | "model_module_version": "1.2.0", 1643 | "model_name": "LayoutModel", 1644 | "state": { 1645 | "_model_module": "@jupyter-widgets/base", 1646 | "_model_module_version": "1.2.0", 1647 | "_model_name": "LayoutModel", 1648 | "_view_count": null, 1649 | "_view_module": "@jupyter-widgets/base", 1650 | "_view_module_version": "1.2.0", 1651 | "_view_name": "LayoutView", 1652 | "align_content": null, 1653 | "align_items": null, 1654 | "align_self": null, 1655 | "border": null, 1656 | "bottom": null, 1657 | "display": null, 1658 | "flex": null, 1659 | "flex_flow": null, 1660 | "grid_area": null, 1661 | "grid_auto_columns": null, 1662 | "grid_auto_flow": null, 1663 | "grid_auto_rows": null, 1664 | "grid_column": null, 1665 | "grid_gap": null, 1666 | "grid_row": null, 1667 | "grid_template_areas": null, 1668 | "grid_template_columns": null, 1669 | "grid_template_rows": null, 1670 | "height": null, 1671 | "justify_content": null, 1672 | "justify_items": null, 1673 | "left": null, 1674 | "margin": null, 1675 | "max_height": null, 1676 | "max_width": null, 1677 | "min_height": 
null, 1678 | "min_width": null, 1679 | "object_fit": null, 1680 | "object_position": null, 1681 | "order": null, 1682 | "overflow": null, 1683 | "overflow_x": null, 1684 | "overflow_y": null, 1685 | "padding": null, 1686 | "right": null, 1687 | "top": null, 1688 | "visibility": null, 1689 | "width": null 1690 | } 1691 | }, 1692 | "94ef6e8f88bb498783522af9621bf811": { 1693 | "model_module": "@jupyter-widgets/controls", 1694 | "model_module_version": "1.5.0", 1695 | "model_name": "HTMLModel", 1696 | "state": { 1697 | "_dom_classes": [], 1698 | "_model_module": "@jupyter-widgets/controls", 1699 | "_model_module_version": "1.5.0", 1700 | "_model_name": "HTMLModel", 1701 | "_view_count": null, 1702 | "_view_module": "@jupyter-widgets/controls", 1703 | "_view_module_version": "1.5.0", 1704 | "_view_name": "HTMLView", 1705 | "description": "", 1706 | "description_tooltip": null, 1707 | "layout": "IPY_MODEL_91e35ac677a045c49b3f7dbf243b2e6a", 1708 | "placeholder": "​", 1709 | "style": "IPY_MODEL_432e4a857a5d4151a4d1b5bc7b6bb4fb", 1710 | "value": " 1.50k/1.50k [00:01<00:00, 1.23kB/s]" 1711 | } 1712 | }, 1713 | "9f4e9ae9b7fc4e89ac6aa81af567a678": { 1714 | "model_module": "@jupyter-widgets/controls", 1715 | "model_module_version": "1.5.0", 1716 | "model_name": "HBoxModel", 1717 | "state": { 1718 | "_dom_classes": [], 1719 | "_model_module": "@jupyter-widgets/controls", 1720 | "_model_module_version": "1.5.0", 1721 | "_model_name": "HBoxModel", 1722 | "_view_count": null, 1723 | "_view_module": "@jupyter-widgets/controls", 1724 | "_view_module_version": "1.5.0", 1725 | "_view_name": "HBoxView", 1726 | "box_style": "", 1727 | "children": [ 1728 | "IPY_MODEL_06d2301a3d34440eb19a887fb51d562c", 1729 | "IPY_MODEL_b31585a6d0574a0cb973bb3679a8168a" 1730 | ], 1731 | "layout": "IPY_MODEL_68cf808ab7e1428fa9acf6a9fd435b49" 1732 | } 1733 | }, 1734 | "b31585a6d0574a0cb973bb3679a8168a": { 1735 | "model_module": "@jupyter-widgets/controls", 1736 | "model_module_version": "1.5.0", 1737 | "model_name": "HTMLModel", 1738 | "state": { 1739 | "_dom_classes": [], 1740 | "_model_module": "@jupyter-widgets/controls", 1741 | "_model_module_version": "1.5.0", 1742 | "_model_name": "HTMLModel", 1743 | "_view_count": null, 1744 | "_view_module": "@jupyter-widgets/controls", 1745 | "_view_module_version": "1.5.0", 1746 | "_view_name": "HTMLView", 1747 | "description": "", 1748 | "description_tooltip": null, 1749 | "layout": "IPY_MODEL_12a4548fdad44ea181868776c7616455", 1750 | "placeholder": "​", 1751 | "style": "IPY_MODEL_d422f60f5607443da23dba147889e3b7", 1752 | "value": " 553M/553M [00:16<00:00, 34.0MB/s]" 1753 | } 1754 | }, 1755 | "b4807859ba084927b1c01ad25559e790": { 1756 | "model_module": "@jupyter-widgets/controls", 1757 | "model_module_version": "1.5.0", 1758 | "model_name": "ProgressStyleModel", 1759 | "state": { 1760 | "_model_module": "@jupyter-widgets/controls", 1761 | "_model_module_version": "1.5.0", 1762 | "_model_name": "ProgressStyleModel", 1763 | "_view_count": null, 1764 | "_view_module": "@jupyter-widgets/base", 1765 | "_view_module_version": "1.2.0", 1766 | "_view_name": "StyleView", 1767 | "bar_color": null, 1768 | "description_width": "initial" 1769 | } 1770 | }, 1771 | "d422f60f5607443da23dba147889e3b7": { 1772 | "model_module": "@jupyter-widgets/controls", 1773 | "model_module_version": "1.5.0", 1774 | "model_name": "DescriptionStyleModel", 1775 | "state": { 1776 | "_model_module": "@jupyter-widgets/controls", 1777 | "_model_module_version": "1.5.0", 1778 | "_model_name": "DescriptionStyleModel", 1779 | 
"_view_count": null, 1780 | "_view_module": "@jupyter-widgets/base", 1781 | "_view_module_version": "1.2.0", 1782 | "_view_name": "StyleView", 1783 | "description_width": "" 1784 | } 1785 | }, 1786 | "d8565e4e46924bbfbabe7b3eb8df8b79": { 1787 | "model_module": "@jupyter-widgets/base", 1788 | "model_module_version": "1.2.0", 1789 | "model_name": "LayoutModel", 1790 | "state": { 1791 | "_model_module": "@jupyter-widgets/base", 1792 | "_model_module_version": "1.2.0", 1793 | "_model_name": "LayoutModel", 1794 | "_view_count": null, 1795 | "_view_module": "@jupyter-widgets/base", 1796 | "_view_module_version": "1.2.0", 1797 | "_view_name": "LayoutView", 1798 | "align_content": null, 1799 | "align_items": null, 1800 | "align_self": null, 1801 | "border": null, 1802 | "bottom": null, 1803 | "display": null, 1804 | "flex": null, 1805 | "flex_flow": null, 1806 | "grid_area": null, 1807 | "grid_auto_columns": null, 1808 | "grid_auto_flow": null, 1809 | "grid_auto_rows": null, 1810 | "grid_column": null, 1811 | "grid_gap": null, 1812 | "grid_row": null, 1813 | "grid_template_areas": null, 1814 | "grid_template_columns": null, 1815 | "grid_template_rows": null, 1816 | "height": null, 1817 | "justify_content": null, 1818 | "justify_items": null, 1819 | "left": null, 1820 | "margin": null, 1821 | "max_height": null, 1822 | "max_width": null, 1823 | "min_height": null, 1824 | "min_width": null, 1825 | "object_fit": null, 1826 | "object_position": null, 1827 | "order": null, 1828 | "overflow": null, 1829 | "overflow_x": null, 1830 | "overflow_y": null, 1831 | "padding": null, 1832 | "right": null, 1833 | "top": null, 1834 | "visibility": null, 1835 | "width": null 1836 | } 1837 | }, 1838 | "ddb8a6c765504d6ea10daf2da84a5c83": { 1839 | "model_module": "@jupyter-widgets/controls", 1840 | "model_module_version": "1.5.0", 1841 | "model_name": "HTMLModel", 1842 | "state": { 1843 | "_dom_classes": [], 1844 | "_model_module": "@jupyter-widgets/controls", 1845 | "_model_module_version": "1.5.0", 1846 | "_model_name": "HTMLModel", 1847 | "_view_count": null, 1848 | "_view_module": "@jupyter-widgets/controls", 1849 | "_view_module_version": "1.5.0", 1850 | "_view_name": "HTMLView", 1851 | "description": "", 1852 | "description_tooltip": null, 1853 | "layout": "IPY_MODEL_5bc9b6ada49a4642a1cba622c93f8b62", 1854 | "placeholder": "​", 1855 | "style": "IPY_MODEL_26cc77465c0e4f30b086bf93a81f9386", 1856 | "value": " 1.56M/1.56M [00:02<00:00, 629kB/s]" 1857 | } 1858 | }, 1859 | "ed03788fd9b14684b1d339664f56bfd5": { 1860 | "model_module": "@jupyter-widgets/base", 1861 | "model_module_version": "1.2.0", 1862 | "model_name": "LayoutModel", 1863 | "state": { 1864 | "_model_module": "@jupyter-widgets/base", 1865 | "_model_module_version": "1.2.0", 1866 | "_model_name": "LayoutModel", 1867 | "_view_count": null, 1868 | "_view_module": "@jupyter-widgets/base", 1869 | "_view_module_version": "1.2.0", 1870 | "_view_name": "LayoutView", 1871 | "align_content": null, 1872 | "align_items": null, 1873 | "align_self": null, 1874 | "border": null, 1875 | "bottom": null, 1876 | "display": null, 1877 | "flex": null, 1878 | "flex_flow": null, 1879 | "grid_area": null, 1880 | "grid_auto_columns": null, 1881 | "grid_auto_flow": null, 1882 | "grid_auto_rows": null, 1883 | "grid_column": null, 1884 | "grid_gap": null, 1885 | "grid_row": null, 1886 | "grid_template_areas": null, 1887 | "grid_template_columns": null, 1888 | "grid_template_rows": null, 1889 | "height": null, 1890 | "justify_content": null, 1891 | "justify_items": null, 1892 | 
"left": null, 1893 | "margin": null, 1894 | "max_height": null, 1895 | "max_width": null, 1896 | "min_height": null, 1897 | "min_width": null, 1898 | "object_fit": null, 1899 | "object_position": null, 1900 | "order": null, 1901 | "overflow": null, 1902 | "overflow_x": null, 1903 | "overflow_y": null, 1904 | "padding": null, 1905 | "right": null, 1906 | "top": null, 1907 | "visibility": null, 1908 | "width": null 1909 | } 1910 | }, 1911 | "edcc338999ac45feaab03a86e2af75a9": { 1912 | "model_module": "@jupyter-widgets/base", 1913 | "model_module_version": "1.2.0", 1914 | "model_name": "LayoutModel", 1915 | "state": { 1916 | "_model_module": "@jupyter-widgets/base", 1917 | "_model_module_version": "1.2.0", 1918 | "_model_name": "LayoutModel", 1919 | "_view_count": null, 1920 | "_view_module": "@jupyter-widgets/base", 1921 | "_view_module_version": "1.2.0", 1922 | "_view_name": "LayoutView", 1923 | "align_content": null, 1924 | "align_items": null, 1925 | "align_self": null, 1926 | "border": null, 1927 | "bottom": null, 1928 | "display": null, 1929 | "flex": null, 1930 | "flex_flow": null, 1931 | "grid_area": null, 1932 | "grid_auto_columns": null, 1933 | "grid_auto_flow": null, 1934 | "grid_auto_rows": null, 1935 | "grid_column": null, 1936 | "grid_gap": null, 1937 | "grid_row": null, 1938 | "grid_template_areas": null, 1939 | "grid_template_columns": null, 1940 | "grid_template_rows": null, 1941 | "height": null, 1942 | "justify_content": null, 1943 | "justify_items": null, 1944 | "left": null, 1945 | "margin": null, 1946 | "max_height": null, 1947 | "max_width": null, 1948 | "min_height": null, 1949 | "min_width": null, 1950 | "object_fit": null, 1951 | "object_position": null, 1952 | "order": null, 1953 | "overflow": null, 1954 | "overflow_x": null, 1955 | "overflow_y": null, 1956 | "padding": null, 1957 | "right": null, 1958 | "top": null, 1959 | "visibility": null, 1960 | "width": null 1961 | } 1962 | }, 1963 | "f575006dc6624157bfb408cced4e6ae6": { 1964 | "model_module": "@jupyter-widgets/controls", 1965 | "model_module_version": "1.5.0", 1966 | "model_name": "ProgressStyleModel", 1967 | "state": { 1968 | "_model_module": "@jupyter-widgets/controls", 1969 | "_model_module_version": "1.5.0", 1970 | "_model_name": "ProgressStyleModel", 1971 | "_view_count": null, 1972 | "_view_module": "@jupyter-widgets/base", 1973 | "_view_module_version": "1.2.0", 1974 | "_view_name": "StyleView", 1975 | "bar_color": null, 1976 | "description_width": "initial" 1977 | } 1978 | }, 1979 | "f5c9c24c01114e168e90ed555fb9f05a": { 1980 | "model_module": "@jupyter-widgets/base", 1981 | "model_module_version": "1.2.0", 1982 | "model_name": "LayoutModel", 1983 | "state": { 1984 | "_model_module": "@jupyter-widgets/base", 1985 | "_model_module_version": "1.2.0", 1986 | "_model_name": "LayoutModel", 1987 | "_view_count": null, 1988 | "_view_module": "@jupyter-widgets/base", 1989 | "_view_module_version": "1.2.0", 1990 | "_view_name": "LayoutView", 1991 | "align_content": null, 1992 | "align_items": null, 1993 | "align_self": null, 1994 | "border": null, 1995 | "bottom": null, 1996 | "display": null, 1997 | "flex": null, 1998 | "flex_flow": null, 1999 | "grid_area": null, 2000 | "grid_auto_columns": null, 2001 | "grid_auto_flow": null, 2002 | "grid_auto_rows": null, 2003 | "grid_column": null, 2004 | "grid_gap": null, 2005 | "grid_row": null, 2006 | "grid_template_areas": null, 2007 | "grid_template_columns": null, 2008 | "grid_template_rows": null, 2009 | "height": null, 2010 | "justify_content": null, 2011 | 
"justify_items": null, 2012 | "left": null, 2013 | "margin": null, 2014 | "max_height": null, 2015 | "max_width": null, 2016 | "min_height": null, 2017 | "min_width": null, 2018 | "object_fit": null, 2019 | "object_position": null, 2020 | "order": null, 2021 | "overflow": null, 2022 | "overflow_x": null, 2023 | "overflow_y": null, 2024 | "padding": null, 2025 | "right": null, 2026 | "top": null, 2027 | "visibility": null, 2028 | "width": null 2029 | } 2030 | }, 2031 | "f5d5b2c07f5745538d04f3968c244002": { 2032 | "model_module": "@jupyter-widgets/controls", 2033 | "model_module_version": "1.5.0", 2034 | "model_name": "HBoxModel", 2035 | "state": { 2036 | "_dom_classes": [], 2037 | "_model_module": "@jupyter-widgets/controls", 2038 | "_model_module_version": "1.5.0", 2039 | "_model_name": "HBoxModel", 2040 | "_view_count": null, 2041 | "_view_module": "@jupyter-widgets/controls", 2042 | "_view_module_version": "1.5.0", 2043 | "_view_name": "HBoxView", 2044 | "box_style": "", 2045 | "children": [ 2046 | "IPY_MODEL_67f9de54d6e5434190bd07b7151d23b7", 2047 | "IPY_MODEL_94ef6e8f88bb498783522af9621bf811" 2048 | ], 2049 | "layout": "IPY_MODEL_2a6633db6b2946d6a6c8a66065e394cd" 2050 | } 2051 | } 2052 | }, 2053 | "version_major": 2, 2054 | "version_minor": 0 2055 | } 2056 | } 2057 | }, 2058 | "nbformat": 4, 2059 | "nbformat_minor": 4 2060 | } 2061 | -------------------------------------------------------------------------------- /multi_modal_addition_fusion.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","trusted":true},"cell_type":"code","source":"import os, time, datetime\nimport numpy as np\nimport pandas as pd\nfrom tqdm.notebook import tqdm\nimport random\nimport logging\ntqdm.pandas()\nimport seaborn as sns\nfrom sklearn.model_selection import train_test_split\n\n#NN Packages\nimport torch\nimport torch.nn as nn\nfrom torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler\n\nlogger = logging.getLogger(__name__)\n\n\nif torch.cuda.is_available(): \n\n # Tell PyTorch to use the GPU. \n device = torch.device(\"cuda\")\n\n print('There are %d GPU(s) available.' 
% torch.cuda.device_count())\n\n print('We will use the GPU:', torch.cuda.get_device_name(0))\n\n# If not...\nelse:\n print('No GPU available, using the CPU instead.')\n device = torch.device(\"cpu\")\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def format_time(elapsed):\n '''\n Takes a time in seconds and returns a string hh:mm:ss\n '''\n # Round to the nearest second.\n elapsed_rounded = int(round((elapsed)))\n \n # Format as hh:mm:ss\n return str(datetime.timedelta(seconds=elapsed_rounded))\n\nclass SigirPreprocess():\n \n def __init__(self, text_data_path):\n self.text_data_path = text_data_path\n self.train = None\n self.dict_code_to_id = {}\n self.dict_id_to_code = {}\n self.list_tags = {}\n self.sentences = []\n self.labels = []\n self.text_col = None\n self.X_test = None\n def prepare_data(self ):\n catalog_eng= pd.read_csv(self.text_data_path+\"data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\n X_train= pd.read_csv(self.text_data_path+\"data/X_train.tsv\",sep=\"\\t\")\n Y_train= pd.read_csv(self.text_data_path+\"data/Y_train.tsv\",sep=\"\\t\")\n \n self.list_tags = list(Y_train['Prdtypecode'].unique())\n for i,tag in enumerate(self.list_tags):\n self.dict_code_to_id[tag] = i \n self.dict_id_to_code[i]=tag\n print(self.dict_code_to_id)\n \n Y_train['labels']=Y_train['Prdtypecode'].map(self.dict_code_to_id)\n train=pd.merge(left=X_train,right=Y_train,\n how='left',left_on=['Integer_id','Image_id','Product_id'],\n right_on=['Integer_id','Image_id','Product_id'])\n prod_map=pd.Series(catalog_eng['Top level category'].values,\n index=catalog_eng['Prdtypecode']).to_dict()\n\n train['product'] = train['Prdtypecode'].map(prod_map)\n train['title_len']=train['Title'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n train['desc_len']=train['Description'].progress_apply(lambda x : len(x.split()) if pd.notna(x) else 0)\n train['title_desc_len']=train['title_len'] + train['desc_len']\n train.loc[train['Description'].isnull(), 'Description'] = \" \"\n train['title_desc'] = train['Title'] + \" \" + train['Description']\n \n self.train = train\n \n def get_sentences(self, text_col, remove_null_rows=False):\n self.text_col = text_col\n if remove_null_rows==True:\n new_train = self.train[self.train[text_col].notnull()]\n\n else:\n new_train = self.train.copy()\n \n self.sentences = new_train[text_col].values\n self.labels = new_train['labels'].values\n \n def prepare_test(self, text_col):\n X_test=pd.read_csv(self.text_data_path+\"data/x_test_task1_phase1.tsv\",sep=\"\\t\")\n X_test.loc[X_test['Description'].isnull(), 'Description'] = \" \"\n X_test['title_desc'] = X_test['Title'] + \" \" + X_test['Description']\n self.X_test = X_test\n self.test_sentences = X_test[text_col].values\n ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"text_col = 'title_desc'\nmax_len = 256\nval_size = 0.1","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"Preprocess = SigirPreprocess(\"/kaggle/input/textphase1/\")\nPreprocess.prepare_data()\nPreprocess.get_sentences(text_col, True)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"sentences = Preprocess.sentences\nlabels = Preprocess.labels\nprint(\"Total number of sentences:{}, labels:{}\".format(len(sentences), len(labels)))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\n# print('Using Camembert')\n# 
tokenizer_cam = CamembertTokenizer.from_pretrained('camembert-base', do_lowercase=False)\n# print('Using Flaubert')\n# tokenizer_flau = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Function to tokenize, pad, and build attention masks for model input\ndef prep_input(sentences,labels, max_len,tokenizer):\n input_ids = []\n attention_masks = []\n\n # For every sentence...\n for sent in tqdm(sentences):\n # `encode_plus` will:\n # (1) Tokenize the sentence.\n # (2) Prepend the `[CLS]` token to the start.\n # (3) Append the `[SEP]` token to the end.\n # (4) Map tokens to their IDs.\n # (5) Pad or truncate the sentence to `max_length`\n # (6) Create attention masks for [PAD] tokens.\n encoded_dict = tokenizer.encode_plus(\n sent, # Sentence to encode.\n add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n max_length = max_len, # Pad & truncate all sentences.\n pad_to_max_length = True,\n return_attention_mask = True, # Construct attn. masks.\n return_tensors = 'pt', # Return pytorch tensors.\n )\n\n # Add the encoded sentence to the list. \n input_ids.append(encoded_dict['input_ids'])\n\n # And its attention mask (simply differentiates padding from non-padding).\n attention_masks.append(encoded_dict['attention_mask'])\n\n # Convert the lists into tensors.\n input_ids = torch.cat(input_ids, dim=0)\n attention_masks = torch.cat(attention_masks, dim=0)\n if labels is not None:\n labels = torch.tensor(labels)\n return input_ids,attention_masks,labels\n else:\n return input_ids,attention_masks\n ","execution_count":null,"outputs":[]},
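{"metadata":{},"cell_type":"markdown","source":"A minimal usage sketch for `prep_input` (the `_`-prefixed names below are illustrative, not part of the pipeline): encoding two sentences at `max_len` 256 should give id and mask tensors of shape `(2, 256)`."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Minimal sketch: check prep_input output shapes on two sentences.\nfrom transformers import CamembertTokenizer\n_tok = CamembertTokenizer.from_pretrained('camembert-base', do_lowercase=False)\n_ids, _masks, _lbls = prep_input(sentences[:2], labels[:2], max_len, _tok)\nprint(_ids.shape, _masks.shape, _lbls.shape) # expected: torch.Size([2, 256]) twice, torch.Size([2])","execution_count":null,"outputs":[]},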
{"metadata":{"trusted":true},"cell_type":"code","source":"# input_ids_cam,attention_masks_cam,labels_cam=prep_input(sentences,labels, max_len,tokenizer_cam)\n# # print('Original: ', sentences[0])\n# # print('Token IDs:', input_ids[0]) ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# input_ids_flau,attention_masks_flau,labels_flau=prep_input(sentences,labels, max_len,tokenizer_flau)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# tr_inputs_cam, val_inputs_cam, _,_ = train_test_split(input_ids_cam, labels_cam,stratify=labels_cam,\n# random_state=2020, test_size=val_size)\n# tr_masks_cam, val_masks_cam, _,_ = train_test_split(attention_masks_cam, labels,stratify=labels,\n# random_state=2020, test_size=val_size)\n\n# tr_inputs_flau, val_inputs_flau, _,_ = train_test_split(input_ids_flau, labels,stratify=labels,\n# random_state=2020, test_size=val_size)\n# tr_masks_flau, val_masks_flau, _,_ = train_test_split(attention_masks_flau, labels,stratify=labels_flau,\n# random_state=2020, test_size=val_size)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# torch.save(tr_inputs_cam, \"tr_inputs_cam.pt\")\n# torch.save(val_inputs_cam, \"val_inputs_cam.pt\")\n# torch.save(tr_masks_cam, \"tr_masks_cam.pt\")\n# torch.save(val_masks_cam, \"val_masks_cam.pt\")\n\n# torch.save(tr_inputs_flau, \"tr_inputs_flau.pt\")\n# torch.save(val_inputs_flau, \"val_inputs_flau.pt\")\n# torch.save(tr_masks_flau, \"tr_masks_flau.pt\")\n# torch.save(val_masks_flau, \"val_masks_flau.pt\")\n\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# !mkdir -p /root/.kaggle/\n# !cp ../input/myjson/kaggle.json /root/.kaggle/\n# !chmod 600 /root/.kaggle/kaggle.json","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# data = '''{\n# \"title\": \"Multi_modal_input_text\",\n# \"id\": \"deepbugger/Multi-modal-input-text\",\n# \"licenses\": [\n# {\n# \"name\": \"CC0-1.0\"\n# }\n# ]\n# }\n# '''\n# text_file = open(\"/kaggle/working/dataset-metadata.json\", 'w+')\n# n = text_file.write(data)\n# text_file.close()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# !kaggle datasets create -p /kaggle/working\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"text_input='../input/multi-modal-input-text/'\ntr_inputs_cam=torch.load(text_input+\"tr_inputs_cam.pt\")\nval_inputs_cam=torch.load(text_input+\"val_inputs_cam.pt\")\ntr_masks_cam=torch.load( text_input+\"tr_masks_cam.pt\")\nval_masks_cam=torch.load( text_input+\"val_masks_cam.pt\")\n\ntr_inputs_flau=torch.load(text_input+\"tr_inputs_flau.pt\")\nval_inputs_flau=torch.load(text_input+\"val_inputs_flau.pt\")\ntr_masks_flau=torch.load(text_input+\"tr_masks_flau.pt\")\nval_masks_flau=torch.load(text_input+\"val_masks_flau.pt\")","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install pretrainedmodels","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from transformers import CamembertConfig, CamembertTokenizer, CamembertModel, CamembertForSequenceClassification, AdamW\nfrom transformers import FlaubertModel, FlaubertTokenizer,FlaubertForSequenceClassification,AdamW, FlaubertConfig \nfrom transformers.modeling_roberta import RobertaClassificationHead\nfrom transformers.modeling_utils import SequenceSummary","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from torch.nn import functional as F\nimport torch.nn as nn\nimport pretrainedmodels\n\nclass SEResnext50_32x4d(nn.Module):\n def __init__(self, pretrained='imagenet'):\n super(SEResnext50_32x4d, self).__init__()\n \n self.base_model = pretrainedmodels.__dict__[\"se_resnext50_32x4d\"](pretrained=None)\n if pretrained is not None:\n self.base_model.load_state_dict(\n torch.load(\"../input/pretrained-model-weights-pytorch/se_resnext50_32x4d-a260b3a4.pth\"\n )\n )\n self.l0 = nn.Linear(2048, 27)\n \n def forward(self, image):\n batch_size, _, _, _ = image.shape\n \n x = self.base_model.features(image)\n x = F.adaptive_avg_pool2d(x, 1).reshape(batch_size, -1)\n \n out = self.l0(x)\n\n return out","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class Identity(nn.Module):\n def __init__(self):\n super(Identity, self).__init__()\n \n def forward(self, x):\n return x","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# num_classes=27\n# img_model = SEResnext50_32x4d(pretrained=None)\n# img_model.load_state_dict(torch.load('../input/seresnext2048/best_model.pt'))\n# img_model.cuda()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# img_model.l0=Identity()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# img_model","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# for params in img_model.parameters():\n# params.requires_grad=False\n ","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class 
vec_output_CamembertForSequenceClassification(CamembertModel):\n config_class = CamembertConfig\n\n def __init__(self, config):\n super().__init__(config)\n self.num_labels = config.num_labels\n\n self.roberta = CamembertModel(config)\n self.dense = nn.Linear(256*config.hidden_size, config.hidden_size)\n self.dropout = nn.Dropout(0.1)\n self.out_proj = nn.Linear(config.hidden_size, config.num_labels)\n self.init_weights()\n\n\n def forward(\n self,\n input_ids=None,\n attention_mask=None,\n token_type_ids=None,\n position_ids=None,\n head_mask=None,\n inputs_embeds=None,\n labels=None,\n output_attentions=None,\n output_hidden_states=None,\n ):\n outputs = self.roberta(\n input_ids,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n# output_attentions=output_attentions,\n# output_hidden_states=output_hidden_states,\n )\n sequence_output = outputs[0] #(B,256,768)\n x = sequence_output.view(sequence_output.shape[0], 256*768)\n x = self.dense(x) # 256*768 -> 768\n feat= torch.tanh(x) \n logits = self.out_proj(feat) # 768 -> 27\n outputs = (logits,) + outputs[2:]\n \n return outputs # (loss), logits, (hidden_states), (attentions)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"num_classes = 27\n\nclass vec_output_FlaubertForSequenceClassification(FlaubertModel):\n \n config_class = FlaubertConfig\n \n\n def __init__(self, config):\n super().__init__(config)\n self.transformer = FlaubertModel(config)\n self.sequence_summary = SequenceSummary(config)\n self.init_weights()\n self.dropout = torch.nn.Dropout(0.1)\n self.classifier = torch.nn.Linear(config.hidden_size, num_classes)\n\n\n def forward(\n self,\n input_ids=None,\n attention_mask=None,\n langs=None,\n token_type_ids=None,\n position_ids=None,\n lengths=None,\n cache=None,\n head_mask=None,\n inputs_embeds=None,\n labels=None,\n ):\n \n \n transformer_outputs = self.transformer(\n input_ids,\n attention_mask=attention_mask,\n langs=langs,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n lengths=lengths,\n cache=cache,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n )\n\n #output = self.dropout(output)\n output = transformer_outputs[0]\n vec = output[:,0]\n \n \n #logits\n dense = self.dropout(vec)\n \n #classifier\n logits = self.classifier(dense)\n \n outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here\n \n \n return outputs\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model = vec_output_CamembertForSequenceClassification.from_pretrained(\n# modelname, # Use the 12-layer BERT model, with an uncased vocab.\n# num_labels = len(Preprocess.dict_code_to_id), # The number of output labels--2 for binary classification.\n# # You can increase this for multi-class tasks. 
\n# output_attentions = False, # Whether the model returns attentions weights.\n# output_hidden_states = False, # Whether the model returns all hidden-states.\n# )","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model_path = '../input/camembert-vec-256m768-10ep/best_model.pt'\n# checkpoint = torch.load(model_path)\n# # model = checkpoint['model']\n# model.load_state_dict(checkpoint)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model.cuda()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# for param in model.parameters():\n# param.requires_grad=False","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# model.out_proj=Identity()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"### Image data prep"},{"metadata":{"trusted":true},"cell_type":"code","source":"catalog_eng= pd.read_csv(\"/kaggle/input/textphase1/data/catalog_english_taxonomy.tsv\",sep=\"\\t\")\nX_train= pd.read_csv(\"/kaggle/input/textphase1/data/X_train.tsv\",sep=\"\\t\")\nY_train= pd.read_csv(\"/kaggle/input/textphase1/data/Y_train.tsv\",sep=\"\\t\")\nX_test=pd.read_csv(\"/kaggle/input/textphase1/data/x_test_task1_phase1.tsv\",sep=\"\\t\")\ndict_code_to_id = {}\ndict_id_to_code={}\nlist_tags = list(Y_train['Prdtypecode'].unique())\n\nfor i,tag in enumerate(list_tags):\n dict_code_to_id[tag] = i \n dict_id_to_code[i]=tag\nY_train['labels']=Y_train['Prdtypecode'].map(dict_code_to_id)\ntrain=pd.merge(left=X_train,right=Y_train,\n how='left',left_on=['Integer_id','Image_id','Product_id'],\n right_on=['Integer_id','Image_id','Product_id'])\nprod_map=pd.Series(catalog_eng['Top level category'].values,index=catalog_eng['Prdtypecode']).to_dict()\ntrain['product']=train['Prdtypecode'].map(prod_map)\n\ndef get_img_path(img_id,prd_id,path):\n \n pattern = 'image'+'_'+str(img_id)+'_'+'product'+'_'+str(prd_id)+'.jpg'\n return path + pattern\ntrain_img = train[['Image_id','Product_id','labels','product']]\n\ntrain_img['image_path']=train_img.progress_apply(lambda x: get_img_path(x['Image_id'],x['Product_id'],\n path = '/kaggle/input/imagetrain/image_training/'),axis=1)\nX_test['image_path']=X_test.progress_apply(lambda x: get_img_path(x['Image_id'],x['Product_id'],\n path='/kaggle/input/imagetest/image_test/image_test_task1_phase1/'),axis=1)\ntrain_df, val_df, _, _ = train_test_split(train_img, train_img['labels'],random_state=2020, test_size = 0.1, stratify=train_img['labels'])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"input_size = 224 # input size for the ResNet-family (SE-ResNeXt) backbone\n# Applying Transforms to the Data\nfrom torchvision import datasets, models, transforms\n\nimage_transforms = { \n 'train': transforms.Compose([\n transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),\n transforms.RandomRotation(degrees=15),\n transforms.RandomHorizontalFlip(),\n transforms.Resize(size=256),\n transforms.CenterCrop(size=input_size),\n transforms.ToTensor(),\n transforms.Normalize([0.485, 0.456, 0.406],\n [0.229, 0.224, 0.225])\n ]),\n 'valid': transforms.Compose([\n transforms.Resize(size=256),\n transforms.CenterCrop(size=input_size),\n transforms.ToTensor(),\n transforms.Normalize([0.485, 0.456, 0.406],\n [0.229, 0.224, 0.225])\n ]),\n 'test': transforms.Compose([\n transforms.Resize(size=256),\n transforms.CenterCrop(size=input_size),\n transforms.ToTensor(),\n transforms.Normalize([0.485, 
0.456, 0.406],\n [0.229, 0.224, 0.225])\n ])\n}","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from torch.utils.data import Dataset, DataLoader, Subset\nimport cv2\nfrom PIL import Image\n\nclass FusionDataset(Dataset):\n \n def __init__(self,df,inputs_cam,masks_cam,inputs_flau,masks_flau,transform=None,mode='train'):\n self.df = df\n self.transform=transform\n self.mode=mode\n self.inputs_cam=inputs_cam\n self.masks_cam=masks_cam\n self.inputs_flau=inputs_flau\n self.masks_flau=masks_flau\n \n def __len__(self):\n return len(self.df)\n \n def __getitem__(self,idx):\n \n im_path = self.df.iloc[idx]['image_path']\n img = cv2.imread(im_path)\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n img=Image.fromarray(img)\n if self.transform is not None:\n img = self.transform(img)\n img=img.cuda()\n input_id_cam=self.inputs_cam[idx].cuda()\n input_mask_cam=self.masks_cam[idx].cuda()\n input_id_flau=self.inputs_flau[idx].cuda()\n input_mask_flau=self.masks_flau[idx].cuda()\n \n if self.mode=='test':\n return img,input_id_cam,input_mask_cam,input_id_flau,input_mask_flau\n else:\n# labels = torch.tensor(self.df.iloc[idx]['labels'])\n labels = torch.tensor(self.df.iloc[idx]['labels']).cuda() \n\n return img,input_id_cam,input_mask_cam,input_id_flau,input_mask_flau,labels","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"batch_size = 32 \nPreprocess.prepare_test(text_col)\ntest_sentences = Preprocess.test_sentences\nX_test_phase1= Preprocess.X_test","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# print('Using Camembert')\ntokenizer_cam = CamembertTokenizer.from_pretrained('camembert-base', do_lowercase=False)\n# print('Using Flaubert')\ntokenizer_flau = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)\n\ninput_ids_test_flau,attention_masks_test_flau=prep_input(test_sentences,labels=None, max_len=max_len,tokenizer = tokenizer_flau)\ninput_ids_test_cam,attention_masks_test_cam=prep_input(test_sentences,labels=None, max_len=max_len,tokenizer = tokenizer_cam)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class vector_fusion(nn.Module):\n \n def __init__(self):\n super(vector_fusion, self).__init__()\n self.img_model = SEResnext50_32x4d(pretrained=None)\n self.img_model.load_state_dict(torch.load('../input/seresnext2048/best_model.pt'))\n self.img_model.l0=Identity()\n for params in self.img_model.parameters():\n params.requires_grad=False\n\n self.cam_model= vec_output_CamembertForSequenceClassification.from_pretrained(\n 'camembert-base', # Load the pretrained CamemBERT base weights.\n num_labels = len(Preprocess.dict_code_to_id), # Number of output labels (27 product type codes).
\n output_attentions = False, # Whether the model returns attentions weights.\n output_hidden_states = False,) # Whether the model returns all hidden-states.\n \n \n cam_model_path = '../input/camembert-vec-256m768-10ep/best_model.pt'\n checkpoint = torch.load(cam_model_path)\n # model = checkpoint['model']\n self.cam_model.load_state_dict(checkpoint)\n for param in self.cam_model.parameters():\n param.requires_grad=False\n self.cam_model.out_proj=Identity()\n \n self.flau_model=vec_output_FlaubertForSequenceClassification.from_pretrained(\n 'flaubert/flaubert_base_cased', \n num_labels = len(Preprocess.dict_code_to_id), \n output_attentions = False,\n output_hidden_states = False,)\n flau_model_path='../input/flaubert-8933/best_model.pt'\n checkpoint = torch.load(flau_model_path)\n self.flau_model.load_state_dict(checkpoint)\n for param in self.flau_model.parameters():\n param.requires_grad=False\n self.flau_model.classifier=Identity()\n \n \n # reduce the image embedding from 2048 to 768 dims with a 1x1 convolution\n self.reduce_dim=nn.Conv1d(in_channels = 2048 , out_channels = 768 , kernel_size= 1)\n \n #output\n self.out=nn.Linear(768, 27)\n \n\n \n \n def forward(self,img,input_id_cam,input_mask_cam,input_id_flau,input_mask_flau):\n \n cam_emb =self.cam_model(input_id_cam, \n token_type_ids=None, ###### bs * 768 \n attention_mask=input_mask_cam)\n \n #alignment\n #cam_emb1 = cam_emb[0]\n \n \n flau_emb =self.flau_model(input_id_flau, \n token_type_ids=None, ###### bs * 768 \n attention_mask=input_mask_flau)\n \n #alignment\n #flau_emb1 = flau_emb[0]\n \n #Projecting the image embedding to lower dimension\n img_emb=self.img_model(img)\n img_emb=img_emb.view(img_emb.shape[0],img_emb.shape[1],1) \n img_emb=self.reduce_dim(img_emb) \n img_emb=img_emb.view(img_emb.shape[0],img_emb.shape[1]) ###### bs * 768 \n \n # addition fusion: element-wise sum of the three 768-dim embeddings\n fuse= img_emb + cam_emb[0] + flau_emb[0]\n \n logits=self.out(fuse)\n return logits","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"model = vector_fusion()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"model.cuda()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"train_dataset=FusionDataset(train_df,tr_inputs_cam,tr_masks_cam,tr_inputs_flau,tr_masks_flau,\n transform=image_transforms['test'])\nval_dataset=FusionDataset(val_df,val_inputs_cam,val_masks_cam,val_inputs_flau,val_masks_flau,\n transform=image_transforms['test'])\ntest_dataset=FusionDataset(X_test,input_ids_test_cam,attention_masks_test_cam,input_ids_test_flau,attention_masks_test_flau\n ,transform=image_transforms['test'],mode = 'test')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"batch_size=64\ntrain_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)\nvalidation_dataloader=DataLoader(val_dataset,batch_size=batch_size,shuffle=False)\ntest_dataloader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False)","execution_count":null,"outputs":[]},
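{"metadata":{},"cell_type":"markdown","source":"All three encoders are frozen above (`requires_grad=False`), so only the 1x1 projection conv and the final linear head are trainable; the datasets also reuse the non-augmenting 'test' transform, presumably because the image backbone is frozen. The next cell is an optional, minimal sanity check: one batch through the fusion model should give logits of shape `(batch_size, 27)`."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Optional sanity check (minimal sketch): push one batch through the fusion model.\n# FusionDataset already moves tensors to the GPU and model.cuda() was called above.\nimgs, ids_cam, masks_cam, ids_flau, masks_flau, _ = next(iter(train_dataloader))\nwith torch.no_grad():\n out = model(imgs, ids_cam, masks_cam, ids_flau, masks_flau)\nprint(out.shape) # expected: torch.Size([64, 27])","execution_count":null,"outputs":[]},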
{"metadata":{"trusted":true},"cell_type":"code","source":"optimizer = AdamW(model.parameters(),\n                  lr = 2e-5, # default for AdamW is 5e-5; 2e-5 worked better in our notebooks\n                  eps = 1e-8 # small constant added for numerical stability\n                 )","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def count_parameters(model):\n    return sum(p.numel() for p in model.parameters() if p.requires_grad)\ncount_parameters(model)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from transformers import get_linear_schedule_with_warmup\n\n# Number of training epochs. The BERT authors recommend between 2 and 4 for\n# full fine-tuning; we run 6 here since only the small fusion head is trained.\nepochs = 6\n\n# Total number of training steps is [number of batches] x [number of epochs].
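\n# For illustration (assumed numbers, not from this run): with 1,000 batches\n# per epoch, total_steps = 1,000 * 6 = 6,000 scheduler steps.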
\n# (Note that this is not the same as the number of training samples.)\ntotal_steps = len(train_dataloader) * epochs\n\n# Create the learning rate scheduler.\nscheduler = get_linear_schedule_with_warmup(optimizer, \n                                            num_warmup_steps = 0, # Default value in run_glue.py\n                                            num_training_steps = total_steps)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"import torch.nn as nn\nloss_criterion = nn.CrossEntropyLoss()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def flat_accuracy(preds, labels):\n    pred_flat = np.argmax(preds, axis=1).flatten()\n    labels_flat = labels.flatten()\n    return np.sum(pred_flat == labels_flat) / len(labels_flat)","execution_count":null,"outputs":[]},
{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import f1_score\n\nseed_val = 42\n\nrandom.seed(seed_val)\nnp.random.seed(seed_val)\ntorch.manual_seed(seed_val)\ntorch.cuda.manual_seed_all(seed_val)\n\n# We'll store quantities such as training and validation loss,\n# validation accuracy, and timings.\ntraining_stats = []\n\n# Track the best validation macro-F1 across epochs; initialised once here\n# (not inside the epoch loop) so the checkpoint is only overwritten when\n# the score actually improves.\nbest_f1 = 0\n\n# Measure the total training time for the whole run.\ntotal_t0 = time.time()\n\n# For each epoch...\nfor epoch_i in range(0, epochs):\n    \n    # ========================================\n    #               Training\n    # ========================================\n    \n    # Perform one full pass over the training set.\n\n    print(\"\")\n    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n    print('Training...')\n\n    # Measure how long the training epoch takes.\n    t0 = time.time()\n\n    # Reset the total loss for this epoch.\n    total_train_loss = 0\n\n    # Put the model into training mode. Don't be misled--the call to\n    # `train` just changes the *mode*, it doesn't *perform* the training.\n    # `dropout` and `batchnorm` layers behave differently during training\n    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n    model.train()\n\n    # For each batch of training data...\n    for step, batch in tqdm(enumerate(train_dataloader)):\n        \n        # Unpack this training batch from our dataloader, copying each\n        # tensor to the GPU with the `to` method.\n        #\n        # `batch` contains six pytorch tensors:\n        #   [0]: images\n        #   [1]: camembert input ids   [2]: camembert attention masks\n        #   [3]: flaubert input ids    [4]: flaubert attention masks\n        #   [5]: labels\n        b_img = batch[0].to(device)\n        b_input_id_cam = batch[1].to(device)\n        b_input_mask_cam = batch[2].to(device)\n        b_input_id_flau = batch[3].to(device)\n        b_input_mask_flau = batch[4].to(device)\n        b_labels = batch[5].to(device)\n        \n        # Clear any previously accumulated gradients.\n        model.zero_grad()\n        \n        logits = model(b_img,b_input_id_cam,b_input_mask_cam,b_input_id_flau,b_input_mask_flau)\n        \n        # Compute the training loss.\n        loss = loss_criterion(logits, b_labels)
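\n        \n        # Note: nn.CrossEntropyLoss applies log-softmax internally, so the\n        # model returns raw logits and b_labels holds integer class ids (0..26).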
\n        \n        # Accumulate the training loss over all of the batches so that we can\n        # calculate the average loss at the end. `loss` is a Tensor containing a\n        # single value; `.item()` returns that value as a Python float.\n        total_train_loss += loss.item()\n\n        # Perform a backward pass to calculate the gradients.\n        loss.backward()\n\n        # Clip the norm of the gradients to 1.0 to help prevent the\n        # \"exploding gradients\" problem.\n        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n\n        # Update parameters using the computed gradients. The optimizer\n        # dictates the \"update rule\"--how the parameters are modified based\n        # on their gradients, the learning rate, etc.\n        optimizer.step()\n\n        # Update the learning rate.\n        scheduler.step()\n\n    # Calculate the average loss over all of the batches.\n    avg_train_loss = total_train_loss / len(train_dataloader)\n    \n    # Measure how long this epoch took.\n    training_time = format_time(time.time() - t0)\n\n    print(\"\")\n    print(\"  Average training loss: {0:.2f} \".format(avg_train_loss))\n    print(\"  Training epoch took: {:} \".format(training_time))\n    \n    # ========================================\n    #               Validation\n    # ========================================\n    # After the completion of each training epoch, measure our performance on\n    # the validation set.\n\n    print(\"\")\n    print(\"Running Validation...\")\n\n    t0 = time.time()\n\n    # Put the model in evaluation mode--the dropout layers behave differently\n    # during evaluation.\n    model.eval()\n\n    # Tracking variables\n    total_eval_accuracy = 0\n    total_eval_loss = 0\n    nb_eval_steps = 0\n    predictions=[]\n    true_labels=[]\n\n    # Evaluate data for one epoch\n    for batch in tqdm(validation_dataloader):\n        \n        # Unpack this validation batch from our dataloader and copy each\n        # tensor to the GPU (the same six tensors as in training).\n        b_img = batch[0].to(device)\n        b_input_id_cam = batch[1].to(device)\n        b_input_mask_cam = batch[2].to(device)\n        b_input_id_flau = batch[3].to(device)\n        b_input_mask_flau = batch[4].to(device)\n        b_labels = batch[5].to(device)\n        \n        # Tell pytorch not to construct the compute graph during the forward\n        # pass, since it is only needed for backprop (training).\n        with torch.no_grad():\n            \n            # Forward pass: the \"logits\" are raw scores prior to any\n            # activation such as softmax.
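\n            # (Even with the backbones frozen, no_grad also skips graph\n            # construction for the trainable fusion layers during evaluation.)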
The \"logits\" are the output\n # values prior to applying an activation function like the softmax.\n logits = model(b_img,b_input_id_cam ,b_input_mask_cam,b_input_id_flau,b_input_mask_flau)\n \n #new\n \n #defining the val loss\n loss = loss_criterion(logits, b_labels)\n \n \n # Accumulate the validation loss.\n total_eval_loss += loss.item()\n\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n\n # Move logits and labels to CPU\n predicted_labels=np.argmax(logits,axis=1)\n predictions.extend(predicted_labels)\n label_ids = b_labels.to('cpu').numpy()\n true_labels.extend(label_ids)\n \n #saving the features_tr\n# vec = vec.detach().cpu().numpy()\n# vec_output_val.extend(vec)\n \n\n # Calculate the accuracy for this batch of test sentences, and\n # accumulate it over all batches.\n total_eval_accuracy += flat_accuracy(logits, label_ids)\n \n\n # Report the final accuracy for this validation run.\n avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n\n # Calculate the average loss over all of the batches.\n avg_val_loss = total_eval_loss / len(validation_dataloader)\n \n # Measure how long the validation run took.\n validation_time = format_time(time.time() - t0)\n \n print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n print(\" Validation took: {:}\".format(validation_time))\n print(\"Validation F1-Score: {}\".format(f1_score(true_labels,predictions,average='macro')))\n curr_f1=f1_score(true_labels,predictions,average='macro')\n if curr_f1 > best_f1:\n best_f1=curr_f1\n torch.save(model.state_dict(), 'best_model.pt')\n# np.save('best_vec_train_model_train.npy',vec_output_tr)\n# np.save('best_vec_val.npy',vec_output_val)\n \n # Record all statistics from this epoch.\n# training_stats.append(\n# {\n# 'epoch': epoch_i + 1,\n# 'Training Loss': avg_train_loss,\n# 'Valid. Loss': avg_val_loss,\n# 'Valid. 
\n\nprint(\"\")\nprint(\"Training complete!\")\n\nprint(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"ckpt = '../input/vec-fusion-9093/best_model.pt'\nmodel.load_state_dict(torch.load(ckpt))","execution_count":null,"outputs":[]},
{"metadata":{"trusted":true},"cell_type":"code","source":"def predict_pyt(model, prediction_dataloader):\n    \"\"\"\n    model: pytorch model\n    prediction_dataloader: DataLoader for which predictions have to be made.\n    returns:\n        predictions: predicted label ids\n        softmax_logits: logits normalized with softmax\"\"\"\n    # Put model in evaluation mode\n    model.eval()\n    # Tracking variables\n    predictions = []\n    softmax_logits = []\n    softmax = nn.Softmax(dim=1)\n    \n    for batch in tqdm(prediction_dataloader):\n        \n        # Add batch to GPU\n        b_img = batch[0].to(device)\n        b_input_id_cam = batch[1].to(device)\n        b_input_mask_cam = batch[2].to(device)\n        b_input_id_flau = batch[3].to(device)\n        b_input_mask_flau = batch[4].to(device)\n        \n        # Telling the model not to compute or store gradients, saving memory\n        # and speeding up prediction\n        with torch.no_grad():\n            # Forward pass, calculate logit predictions\n            logits = model(b_img,b_input_id_cam,b_input_mask_cam,b_input_id_flau,b_input_mask_flau)\n        \n        # Normalize the logits with softmax to get class probabilities.\n        output = softmax(logits)\n        \n        # Move logits to CPU and take the argmax as the hard prediction.\n        logits = logits.detach().cpu().numpy()\n        predicted_labels = np.argmax(logits,axis=1)\n        predictions.extend(predicted_labels)\n        softmax_logits.extend(output)\n    print('DONE')\n    return predictions, softmax_logits\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# validation predictions\npredictions_val, softmax_logits_val = predict_pyt(model, validation_dataloader)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"softmax_logits_val = np.array([ten.detach().cpu().numpy() for ten in softmax_logits_val])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"np.save('predictions_val_add.npy',np.array(predictions_val))\nnp.save('softmax_logits_val_add.npy',softmax_logits_val)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# test predictions (uncomment to run on the test set)\n#predictions_test, softmax_logits_test = predict_pyt(model, test_dataloader)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#softmax_logits_test = np.array([ten.detach().cpu().numpy() for ten in softmax_logits_test])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# np.save('predictions_test_9093.npy',np.array(predictions_test))\n# np.save('softmax_logits_test_9093.npy',softmax_logits_test)","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} --------------------------------------------------------------------------------