├── README.md ├── experiments ├── bert.ipynb ├── data_analysis.ipynb ├── data_partition.ipynb ├── elmo.ipynb ├── glove.ipynb ├── traditional_ml.ipynb ├── traditional_ml_cross_entropy.ipynb ├── ulmfit.ipynb └── word2vec.ipynb └── label_extract ├── __init__.py ├── extract.py └── utils ├── __init__.py ├── check_duplicates.py ├── convert.py ├── document.py ├── extract_utils.py ├── file_utils.py ├── post_processing.py ├── preprocessing.py └── text.py /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Label Text Classification with Transfer Learning for Policy Documents: The Case of the Sustainable Development Goals 2 | 3 | Repository for my master's thesis in Language Technology at Uppsala University. 4 | [Link to thesis](http://uu.diva-portal.org/smash/record.jsf?pid=diva2%3A1360968&dswid=-5478) 5 | 6 | ## Abstract 7 | 8 | We created and analyzed a text classification dataset from freely available web documents related to the United Nations' Sustainable Development Goals. We then used it to train and compare different multi-label text classifiers, with the aim of exploring methods that facilitate the search for information in this type of document. 9 | 10 | We explored the effectiveness of deep learning and transfer learning in text classification by fine-tuning different pre-trained language representations — Word2Vec, GloVe, ELMo, ULMFiT and BERT. We also compared these approaches against a baseline of more traditional algorithms that do not use transfer learning: multinomial Naive Bayes, logistic regression, k-nearest neighbors and Support Vector Machines. 11 | 12 | We then analyzed the results of our experiments quantitatively and qualitatively. The best results in terms of micro-averaged F1 score and AUROC are obtained by BERT. However, it is also noteworthy that the second-best classifier in terms of micro-averaged F1 score is the Support Vector Machine, closely followed by the logistic regression classifier, both of which have the advantage of being less computationally expensive than BERT. The results also show a close relation between the size of our dataset and the effectiveness of the classifiers. 13 | 14 | ## Repository modules 15 | 16 | * **Label_extract** contains the code used to create and label the dataset from documents scraped with Scrapy (the scraping script is not publicly available). 
17 | * **Experiments** contains all the experimental Jupyter notebooks, which include: 18 | 19 | * Data analysis of the dataset 20 | * K-fold splitting of the data 21 | * Multi-label text classification experiments with Multinomial Naive Bayes, k-NN, SVM, Logistic Regression, Word2Vec, GloVe, ELMo, ULMFiT and BERT 22 | 23 | ## How to cite 24 | ``` 25 | @Misc{multi_label_sdgs, 26 | author = {Rodríguez Medina, Samuel}, 27 | title = {Multi-Label Text Classification with Transfer Learning for Policy Documents: The Case of the Sustainable Development Goals}, 28 | year = {2019}, 29 | url = {http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-395186}, 30 | } 31 | ``` 32 | -------------------------------------------------------------------------------- /experiments/bert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/", 9 | "height": 1000 10 | }, 11 | "colab_type": "code", 12 | "id": "JQU-evlhdMNu", 13 | "outputId": "506d03db-6dde-40b4-96be-f6cd7635f3ab" 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "#!pip install pytorch-pretrained-bert\n", 18 | "#!pip install fast-bert\n", 19 | "#!pip install tensorboardX\n", 20 | "#!pip freeze" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "colab": { 28 | "base_uri": "https://localhost:8080/", 29 | "height": 1000 30 | }, 31 | "colab_type": "code", 32 | "id": "C1Eqn1fGAxJ2", 33 | "outputId": "bd594c4f-5d79-44c7-b0c0-6bae64200537" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "#!git clone https://github.com/NVIDIA/apex\n", 38 | "#%cd apex\n", 39 | "#!ls\n", 40 | "#!pip install -v --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" ./\n", 41 | "#%cd .." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "colab": {}, 49 | "colab_type": "code", 50 | "id": "YSWjeDbakMWN" 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from pytorch_pretrained_bert.tokenization import BertTokenizer\n", 55 | "from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM, BertForSequenceClassification\n", 56 | "from pathlib import Path\n", 57 | "import torch\n", 58 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 59 | "from fast_bert.prediction import BertClassificationPredictor\n", 60 | "\n", 61 | "from fastai.text import Tokenizer, Vocab\n", 62 | "import pandas as pd\n", 63 | "import collections\n", 64 | "import os\n", 65 | "from tqdm import tqdm, trange\n", 66 | "import sys\n", 67 | "import random\n", 68 | "import numpy as np\n", 69 | "import apex\n", 70 | "import re\n", 71 | "\n", 72 | "import datetime\n", 73 | " \n", 74 | "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", 75 | "from torch.utils.data.distributed import DistributedSampler\n", 76 | "from pytorch_pretrained_bert.optimization import BertAdam\n", 77 | "\n", 78 | "from fast_bert.modeling import BertForMultiLabelSequenceClassification\n", 79 | "from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features\n", 80 | "from fast_bert.learner_cls import BertLearner\n", 81 | "from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc\n", 82 | "from sklearn.metrics import classification_report, hamming_loss, roc_auc_score\n", 83 | "\n", 84 | "import logging\n", 85 | "import os" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "colab": {}, 93 | "colab_type": "code", 94 | "id": "46-8UWXySPDQ" 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "torch.cuda.empty_cache()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "colab": {}, 106 | "colab_type": "code", 107 | "id": "oYQJi2j5SS41" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "pd.set_option('display.max_colwidth', -1)\n", 112 | "run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "colab": { 120 | "base_uri": "https://localhost:8080/", 121 | "height": 124 122 | }, 123 | "colab_type": "code", 124 | "id": "itVUcAwDCnCP", 125 | "outputId": "0ac265f0-9c06-473e-dd74-9436b6b0a981" 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "DATA_PATH = Path(\"../datasets\")\n", 130 | "CROSS_FOLDS = Path(\"../datasets/cross_validation/\")\n", 131 | "BERT_DATA_PATH = Path(\"data/\")\n", 132 | "BERT_PATH = Path(\".\")\n", 133 | "LABEL_PATH = Path(\".\")\n", 134 | "LOG_PATH = Path(\"logs/\")\n", 135 | "OUTPUT_PATH = Path(\"models/\")\n", 136 | "\n", 137 | "model_state_dict = None\n", 138 | "LOG_PATH.mkdir(exist_ok=True)\n", 139 | "OUTPUT_PATH.mkdir(exist_ok=True)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "colab_type": "text", 146 | "id": "bbP7LLn9uOgy" 147 | }, 148 | "source": [ 149 | "# Model parameters" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "colab": {}, 157 | "colab_type": "code", 158 | "id": "xxWS107suLk9" 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "args = {\n", 163 | " 
\"run_text\": \"multilabel sdgs with freezable layers - more epochs\",\n", 164 | " \"train_size\": -1,\n", 165 | " \"val_size\": -1,\n", 166 | " \"log_path\": BERT_PATH,\n", 167 | " \"full_data_dir\": DATA_PATH,\n", 168 | " \"data_dir\": DATA_PATH,\n", 169 | " \"task_name\": \"final-3epochs\",\n", 170 | " \"no_cuda\": False,\n", 171 | " \"bert_model\": 'bert-large-uncased', \n", 172 | " \"output_dir\": OUTPUT_PATH,\n", 173 | " \"max_seq_length\": 512, \n", 174 | " \"do_train\": True,\n", 175 | " \"do_eval\": True,\n", 176 | " \"do_lower_case\": True,\n", 177 | " \"train_batch_size\": 4,\n", 178 | " \"eval_batch_size\": 4,\n", 179 | " \"learning_rate\": 1e-3, #1e-3 with three epochs, 0.07 loss\n", 180 | " \"num_train_epochs\": 3,\n", 181 | " \"warmup_proportion\": 0.1,\n", 182 | " \"local_rank\": -1,\n", 183 | " \"seed\": 42,\n", 184 | " \"gradient_accumulation_steps\": 1,\n", 185 | " \"optimize_on_cpu\": False,\n", 186 | " \"fp16\": True,\n", 187 | " \"loss_scale\": 128\n", 188 | "}" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "colab": {}, 196 | "colab_type": "code", 197 | "id": "-3_va60WUFfu" 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "import logging\n", 202 | "\n", 203 | "logfile = str(BERT_PATH/'log-{}-{}.txt'.format(run_start_time, args[\"run_text\"]))\n", 204 | "\n", 205 | "logging.basicConfig(\n", 206 | " level=logging.INFO,\n", 207 | " format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',\n", 208 | " datefmt='%m/%d/%Y %H:%M:%S',\n", 209 | " handlers=[\n", 210 | " logging.FileHandler(logfile),\n", 211 | " logging.StreamHandler(sys.stdout)\n", 212 | " ])\n", 213 | "\n", 214 | "logger = logging.getLogger()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "colab": { 222 | "base_uri": "https://localhost:8080/", 223 | "height": 55 224 | }, 225 | "colab_type": "code", 226 | "id": "65_P6HwpUHXB", 227 | "outputId": "41f280bc-e872-4401-fd15-6fd86a85d5ff" 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "#logger.info(args)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "colab": {}, 239 | "colab_type": "code", 240 | "id": "N8S766_HU2fl" 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "device = torch.device('cuda')\n", 245 | "if torch.cuda.device_count() > 1:\n", 246 | " multi_gpu = True\n", 247 | "else:\n", 248 | " multi_gpu = False" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "# Create cross validation files" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "colab": {}, 263 | "colab_type": "code", 264 | "id": "PxE6HZCrghnp" 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "labels_index = [str(i) for i in range(1,18)]\n", 269 | "\n", 270 | "\"\"\"\n", 271 | "data_df = pd.read_csv(os.path.join(DATA_PATH, 'cleanup_labelled.csv'))\n", 272 | "data_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 273 | "\n", 274 | "mlb = MultiLabelBinarizer()\n", 275 | "\n", 276 | "pattern = r\"(indicator)(\\s+\\d+\\.[\\d+a-d]\\.\\d+)|(target)(\\s+\\d+\\.[\\d+a-d])|(sdgs|sdg|goals|goal)\\W*\\s+(,?\\s*\\b\\d{1,2}\\b[and\\s\\b\\d{1,2}\\b]*)\"\n", 277 | "masked_df = data_df.text.str.replace(pattern, ' SDGLABEL ', regex=True, flags=re.IGNORECASE)\n", 278 | "masked_df = pd.DataFrame(masked_df.str.replace(' ', ' ', regex=True, flags=re.IGNORECASE))\n", 
279 | "\n", 280 | "x = masked_df[['text']].values # text\n", 281 | "y = mlb.fit_transform(data_df.labels) # labels\n", 282 | "\n", 283 | "columns = ['text'] + labels_index\n", 284 | "\n", 285 | "for fold in os.listdir(CROSS_FOLDS):\n", 286 | " print(f\"Creating {fold}\")\n", 287 | " train_index = np.load(f\"{CROSS_FOLDS}/{fold}/train.npy\")\n", 288 | " val_index = np.load(f\"{CROSS_FOLDS}/{fold}/val.npy\")\n", 289 | " test_index = np.load(f\"{CROSS_FOLDS}/{fold}/test.npy\")\n", 290 | " \n", 291 | " x_train, x_val, x_test = x[train_index], x[val_index], x[test_index]\n", 292 | " y_train, y_val, y_test = y[train_index], y[val_index], y[test_index]\n", 293 | " \n", 294 | " train = pd.DataFrame(np.hstack((x_train, y_train)))\n", 295 | " val = pd.DataFrame(np.hstack((x_val, y_val)))\n", 296 | " test = pd.DataFrame(np.hstack((x_test, y_test)))\n", 297 | " \n", 298 | " fold_dir = Path(BERT_DATA_PATH/fold)\n", 299 | " fold_dir.mkdir(exist_ok=True)\n", 300 | " \n", 301 | " for split, name in [(train, \"train\"), (val, \"val\"), (test, \"test\")]:\n", 302 | " split.columns = columns\n", 303 | " split.to_csv(fold_dir/f'{name}_masked.csv')\n", 304 | " \n", 305 | "print('Finished creating all cross validation sets.')\n", 306 | "\"\"\"" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "colab": {}, 314 | "colab_type": "code", 315 | "id": "j8WOwi-27cNL" 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "metrics = []\n", 320 | "#metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})\n", 321 | "#metrics.append({'name': 'roc_auc', 'function': roc_auc})\n", 322 | "#metrics.append({'name': 'fbeta', 'function': fbeta})\n", 323 | "metrics.append({'name': 'accuracy_single', 'function': accuracy_multilabel})" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "colab": {}, 331 | "colab_type": "code", 332 | "id": "0eIR1w6QXaCb" 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "is_masked = \"\"\n", 337 | "output_dir = OUTPUT_PATH/args['task_name']\n", 338 | "output_dir.mkdir(exist_ok=True)\n", 339 | "\n", 340 | "for fold in sorted(os.listdir(BERT_DATA_PATH)):\n", 341 | " if fold.startswith(\"fold_5\"):\n", 342 | " print(f\"Processing {fold} {is_masked}\")\n", 343 | "\n", 344 | " fold_dir = output_dir/fold\n", 345 | " fold_dir.mkdir(exist_ok=True)\n", 346 | " \n", 347 | " databunch = BertDataBunch(data_dir=BERT_DATA_PATH/fold, \n", 348 | " label_dir=LABEL_PATH, \n", 349 | " tokenizer=args['bert_model'], \n", 350 | " train_file=f'train{is_masked}.csv', \n", 351 | " val_file=f'val{is_masked}.csv',\n", 352 | " test_data=None,\n", 353 | " text_col=\"text\", \n", 354 | " label_col=labels_index,\n", 355 | " batch_size_per_gpu=args['train_batch_size'], \n", 356 | " max_seq_length=args['max_seq_length'], \n", 357 | " multi_gpu=multi_gpu, \n", 358 | " multi_label=True, \n", 359 | " model_type='bert')\n", 360 | "\n", 361 | " learner = BertLearner.from_pretrained_model(databunch, \n", 362 | " pretrained_path=args['bert_model'], \n", 363 | " metrics=metrics, \n", 364 | " device=device, \n", 365 | " logger=logger, \n", 366 | " finetuned_wgts_path=None, \n", 367 | " warmup_steps=500,\n", 368 | " output_dir=fold_dir,\n", 369 | " is_fp16=args['fp16'],\n", 370 | " loss_scale=args['loss_scale'],\n", 371 | " multi_gpu=multi_gpu, \n", 372 | " multi_label=True,\n", 373 | " logging_steps=50)\n", 374 | " learner.fit(args['num_train_epochs'], lr=args['learning_rate'], 
schedule_type=\"warmup_linear\")\n", 375 | " learner.save_model()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "# Load and evaluate results" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "def metrics_avg(models_testx_testy, labels_, thres=0.3):\n", 392 | " def calc(model, test_x, test_y):\n", 393 | " texts = [x[0] for x in test_x]\n", 394 | " predictions = model.predict_batch(texts)\n", 395 | " \n", 396 | " converted_preds = []\n", 397 | " for row in predictions:\n", 398 | " row_scores = sorted(row, key=lambda i: (int(i[0])))\n", 399 | " final = [y for x,y in row_scores]\n", 400 | " converted_preds.append(final)\n", 401 | " \n", 402 | " preds = np.array(converted_preds)>thres\n", 403 | " metrics = classification_report(test_y, preds, target_names=labels_, output_dict=True)\n", 404 | " metrics_df = pd.DataFrame.from_dict(metrics)\n", 405 | " h = hamming_loss(test_y, preds)\n", 406 | " roc = roc_auc_score(test_y, preds, average='micro')\n", 407 | " return metrics_df, h, roc\n", 408 | "\n", 409 | " count = 0\n", 410 | " model_1, test_x_first, test_y_first = models_testx_testy[0]\n", 411 | " metrics_agg, ham, roc = calc(model_1, test_x_first, test_y_first)\n", 412 | " n = len(models_testx_testy)\n", 413 | "\n", 414 | " for model, test_x, test_y in models_testx_testy[1:]:\n", 415 | " metrics, h, r = calc(model, test_x, test_y)\n", 416 | " metrics_agg += metrics\n", 417 | " ham += h\n", 418 | " roc += r\n", 419 | " count +=1\n", 420 | " print(count)\n", 421 | "\n", 422 | " return metrics_agg/n, ham/n, roc/n" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "loaded_models = []\n", 432 | "data_df = pd.read_csv(os.path.join(DATA_PATH, 'cleanup_labelled.csv'))\n", 433 | "data_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 434 | "\n", 435 | "mlb = MultiLabelBinarizer()\n", 436 | "x = data_df[['text']].values # text\n", 437 | "y = mlb.fit_transform(data_df.labels) # labels\n", 438 | "\n", 439 | "\n", 440 | "for fold in sorted(os.listdir(OUTPUT_PATH/f\"{args['task_name']}\")):\n", 441 | " if fold.startswith(\"fold\"):\n", 442 | " print(f\"Processing {fold}\")\n", 443 | " \n", 444 | " # Load model\n", 445 | " fold_dir = OUTPUT_PATH/f\"{args['task_name']}/{fold}/model_out\"\n", 446 | " model = BertClassificationPredictor(model_path=fold_dir, \n", 447 | " label_path=LABEL_PATH, \n", 448 | " multi_label=True)\n", 449 | " \n", 450 | " # Load test data\n", 451 | " test_index = np.load(f\"{CROSS_FOLDS}/{fold}/test.npy\")\n", 452 | " x_test = x[test_index]\n", 453 | " y_test = y[test_index]\n", 454 | " \n", 455 | " loaded_models.append((model, x_test, y_test))\n", 456 | "print(f\"Finished loading the Bert models.\")" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "avg_results = metrics_avg(loaded_models, labels_index)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "avg_results[0].to_csv(f'results.csv', sep=';')" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "avg_results[0]" 484 | ] 485 | }, 486 | { 487 | 
"cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "hl = round(avg_results[1],4)\n", 493 | "roc_auc = round(avg_results[2],4)\n", 494 | "print(f\"hl;{hl}\")\n", 495 | "print(f\"roc-auc;{roc_auc}\")" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [] 504 | } 505 | ], 506 | "metadata": { 507 | "accelerator": "GPU", 508 | "colab": { 509 | "collapsed_sections": [], 510 | "name": "Copy of fastbert.ipynb", 511 | "provenance": [], 512 | "version": "0.3.2" 513 | }, 514 | "kernelspec": { 515 | "display_name": "Python 3", 516 | "language": "python", 517 | "name": "python3" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.7.4" 530 | } 531 | }, 532 | "nbformat": 4, 533 | "nbformat_minor": 4 534 | } 535 | -------------------------------------------------------------------------------- /experiments/data_partition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "data_partition.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "machine_shape": "hm", 11 | "include_colab_link": true 12 | }, 13 | "kernelspec": { 14 | "name": "python3", 15 | "display_name": "Python 3" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "mdUWMhzx6-Nh", 33 | "colab_type": "code", 34 | "colab": { 35 | "base_uri": "https://localhost:8080/", 36 | "height": 176 37 | }, 38 | "outputId": "41521190-5574-4a4e-8fa8-3ab319a1ffaf" 39 | }, 40 | "source": [ 41 | "!pip install iterative-stratification" 42 | ], 43 | "execution_count": 2, 44 | "outputs": [ 45 | { 46 | "output_type": "stream", 47 | "text": [ 48 | "Collecting iterative-stratification\n", 49 | " Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl\n", 50 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from iterative-stratification) (1.16.4)\n", 51 | "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from iterative-stratification) (0.21.3)\n", 52 | "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from iterative-stratification) (1.3.1)\n", 53 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->iterative-stratification) (0.13.2)\n", 54 | "Installing collected packages: iterative-stratification\n", 55 | "Successfully installed iterative-stratification-0.1.6\n" 56 | ], 57 | "name": "stdout" 58 | } 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "G8ti7ZNG0-wI", 65 | "colab_type": "code", 66 | "colab": {} 67 | }, 68 | "source": [ 69 | "import numpy as np\n", 70 | "import pandas as pd \n", 71 | "from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit\n", 72 | "from 
sklearn.model_selection import ShuffleSplit, train_test_split\n", 73 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 74 | "from pathlib import Path\n", 75 | "import os" 76 | ], 77 | "execution_count": 0, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "QvKq8Voj1n4Z", 84 | "colab_type": "code", 85 | "outputId": "4d9481f3-b50c-448d-9d04-115287e1cd12", 86 | "colab": { 87 | "base_uri": "https://localhost:8080/", 88 | "height": 34 89 | } 90 | }, 91 | "source": [ 92 | "from google.colab import drive\n", 93 | "drive.mount('/content/gdrive', force_remount=True)\n", 94 | "base_dir = \"gdrive/My Drive/fastai-v3/sdgs/dataset/\"\n", 95 | "labelled_dataset = base_dir + \"cleanup_labelled.csv\"\n", 96 | "output_dir = \"gdrive/My Drive/fastai-v3/sdgs/dataset/cross_validation/\"" 97 | ], 98 | "execution_count": 3, 99 | "outputs": [ 100 | { 101 | "output_type": "stream", 102 | "text": [ 103 | "Mounted at /content/gdrive\n" 104 | ], 105 | "name": "stdout" 106 | } 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "KJdPWdoa4JB6", 113 | "colab_type": "code", 114 | "colab": { 115 | "base_uri": "https://localhost:8080/", 116 | "height": 104 117 | }, 118 | "outputId": "be0697c2-5014-4289-ac0e-0dc94d0e96db" 119 | }, 120 | "source": [ 121 | "data_df = pd.read_csv(labelled_dataset)\n", 122 | "data_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 123 | "\n", 124 | "\n", 125 | "mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)\n", 126 | "mlb = MultiLabelBinarizer()\n", 127 | "count = 0\n", 128 | "\n", 129 | "x = data_df[['text']].values # text\n", 130 | "y = mlb.fit_transform(data_df.labels) # labels\n", 131 | "\n", 132 | "for train_index, test_index in mskf.split(x, y):\n", 133 | " count += 1\n", 134 | " print(f\"Fold no. {count}\")\n", 135 | " fold_dir = Path(f\"{output_dir}fold_{count}/\")\n", 136 | " fold_dir.mkdir(exist_ok=True)\n", 137 | " np.save(fold_dir/\"train\", train_index)\n", 138 | " np.save(fold_dir/\"test\", test_index)" 139 | ], 140 | "execution_count": 129, 141 | "outputs": [ 142 | { 143 | "output_type": "stream", 144 | "text": [ 145 | "Fold no. 1\n", 146 | "Fold no. 2\n", 147 | "Fold no. 3\n", 148 | "Fold no. 4\n", 149 | "Fold no. 
5\n" 150 | ], 151 | "name": "stdout" 152 | } 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "metadata": { 158 | "id": "sy21_h6Uco09", 159 | "colab_type": "code", 160 | "colab": {} 161 | }, 162 | "source": [ 163 | "def get_indices(original_arr, new_arr):\n", 164 | " results = []\n", 165 | " for text in new_arr:\n", 166 | " results.append(np.where(x == text)[0][0])\n", 167 | " return np.array(results)" 168 | ], 169 | "execution_count": 0, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "colab_type": "code", 176 | "outputId": "b21c8ac4-e859-45a1-c6a6-f1e3ff317155", 177 | "id": "5OCFEtB-SzCa", 178 | "colab": { 179 | "base_uri": "https://localhost:8080/", 180 | "height": 885 181 | } 182 | }, 183 | "source": [ 184 | "data_df = pd.read_csv(labelled_dataset)\n", 185 | "data_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 186 | "\n", 187 | "\n", 188 | "mskf = MultilabelStratifiedShuffleSplit(n_splits=5, random_state=0, test_size=0.1)\n", 189 | "mlb = MultiLabelBinarizer()\n", 190 | "count = 0\n", 191 | "\n", 192 | "x = data_df[['text']].values # text\n", 193 | "y = mlb.fit_transform(data_df.labels) # labels\n", 194 | "\n", 195 | "for original_train_index, test_index in mskf.split(x, y):\n", 196 | " count += 1\n", 197 | " print(f\"Fold no. {count}\")\n", 198 | " fold_dir = Path(f\"{output_dir}fold_{count}/\")\n", 199 | " fold_dir.mkdir(exist_ok=True)\n", 200 | " \n", 201 | " train_x = x[original_train_index]\n", 202 | " train, val = train_test_split(train_x, test_size=0.11)\n", 203 | " train_index = get_indices(x, train)\n", 204 | " val_index = get_indices(x, val)\n", 205 | "\n", 206 | " \n", 207 | " np.save(fold_dir/\"train\", train_index)\n", 208 | " np.save(fold_dir/\"val\", val_index)\n", 209 | " np.save(fold_dir/\"test\", test_index)\n", 210 | "\n", 211 | "\n", 212 | " train_val = len(set(train_index) & set(val_index))\n", 213 | " train_test = len(set(train_index) & set(test_index))\n", 214 | " val_test = len(set(val_index) & set(test_index))\n", 215 | " print(f\"Overlapping train & val: {train_val != 0}\")\n", 216 | " print(f\"Overlapping train & test: {train_test != 0}\")\n", 217 | " print(f\"Overlapping val & test: {val_test != 0}\")\n", 218 | " print()\n", 219 | " print(f\"Train size: {len(train_index)}\")\n", 220 | " print(f\"Val size: {len(val_index)}\")\n", 221 | " print(f\"Test size: {len(test_index)}\")\n", 222 | " print(f\"Total: {len(train_index)+len(val_index)+len(test_index)}\")\n", 223 | " print(\"______________\")" 224 | ], 225 | "execution_count": 184, 226 | "outputs": [ 227 | { 228 | "output_type": "stream", 229 | "text": [ 230 | "Fold no. 1\n", 231 | "Overlapping train & val: False\n", 232 | "Overlapping train & test: False\n", 233 | "Overlapping val & test: False\n", 234 | "\n", 235 | "Train size: 4173\n", 236 | "Val size: 516\n", 237 | "Test size: 493\n", 238 | "Total: 5182\n", 239 | "______________\n", 240 | "Fold no. 2\n", 241 | "Overlapping train & val: False\n", 242 | "Overlapping train & test: False\n", 243 | "Overlapping val & test: False\n", 244 | "\n", 245 | "Train size: 4151\n", 246 | "Val size: 514\n", 247 | "Test size: 517\n", 248 | "Total: 5182\n", 249 | "______________\n", 250 | "Fold no. 3\n", 251 | "Overlapping train & val: False\n", 252 | "Overlapping train & test: False\n", 253 | "Overlapping val & test: False\n", 254 | "\n", 255 | "Train size: 4142\n", 256 | "Val size: 513\n", 257 | "Test size: 527\n", 258 | "Total: 5182\n", 259 | "______________\n", 260 | "Fold no. 
4\n", 261 | "Overlapping train & val: False\n", 262 | "Overlapping train & test: False\n", 263 | "Overlapping val & test: False\n", 264 | "\n", 265 | "Train size: 4140\n", 266 | "Val size: 512\n", 267 | "Test size: 530\n", 268 | "Total: 5182\n", 269 | "______________\n", 270 | "Fold no. 5\n", 271 | "Overlapping train & val: False\n", 272 | "Overlapping train & test: False\n", 273 | "Overlapping val & test: False\n", 274 | "\n", 275 | "Train size: 4155\n", 276 | "Val size: 514\n", 277 | "Test size: 513\n", 278 | "Total: 5182\n", 279 | "______________\n" 280 | ], 281 | "name": "stdout" 282 | } 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "metadata": { 288 | "id": "OMO68tx8cWcY", 289 | "colab_type": "code", 290 | "colab": {} 291 | }, 292 | "source": [ 293 | "# Load data\n", 294 | "my_data = np.load(fold_dir/\"train.npy\")" 295 | ], 296 | "execution_count": 0, 297 | "outputs": [] 298 | } 299 | ] 300 | } -------------------------------------------------------------------------------- /experiments/elmo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/", 9 | "height": 279 10 | }, 11 | "colab_type": "code", 12 | "id": "JBt_T5zUMG68", 13 | "outputId": "bd5f1f72-a7d3-451c-b4fe-bd48887ad47c" 14 | }, 15 | "outputs": [ 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "Requirement already satisfied: tensorflow_hub==0.4.0 in /opt/anaconda3/lib/python3.7/site-packages (0.4.0)\n", 21 | "Requirement already satisfied: six>=1.10.0 in /opt/anaconda3/lib/python3.7/site-packages (from tensorflow_hub==0.4.0) (1.12.0)\n", 22 | "Requirement already satisfied: protobuf>=3.4.0 in /opt/anaconda3/lib/python3.7/site-packages (from tensorflow_hub==0.4.0) (3.9.0)\n", 23 | "Requirement already satisfied: numpy>=1.12.0 in /opt/anaconda3/lib/python3.7/site-packages (from tensorflow_hub==0.4.0) (1.16.2)\n", 24 | "Requirement already satisfied: setuptools in /opt/anaconda3/lib/python3.7/site-packages (from protobuf>=3.4.0->tensorflow_hub==0.4.0) (40.8.0)\n", 25 | "Collecting pip\n", 26 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/62/ca/94d32a6516ed197a491d17d46595ce58a83cbb2fca280414e57cd86b84dc/pip-19.2.1-py2.py3-none-any.whl (1.4MB)\n", 27 | "\u001b[K 100% |████████████████████████████████| 1.4MB 22.6MB/s ta 0:00:01\n", 28 | "\u001b[?25hCollecting setuptools\n", 29 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ec/51/f45cea425fd5cb0b0380f5b0f048ebc1da5b417e48d304838c02d6288a1e/setuptools-41.0.1-py2.py3-none-any.whl (575kB)\n", 30 | "\u001b[K 100% |████████████████████████████████| 583kB 34.8MB/s ta 0:00:01\n", 31 | "\u001b[?25hCollecting wheel\n", 32 | " Downloading https://files.pythonhosted.org/packages/bb/10/44230dd6bf3563b8f227dbf344c908d412ad2ff48066476672f3a72e174e/wheel-0.33.4-py2.py3-none-any.whl\n", 33 | "\u001b[31mfairing 0.5.3 has requirement tornado<6.0.0,>=5.1.1, but you'll have tornado 6.0.3 which is incompatible.\u001b[0m\n", 34 | "\u001b[31mfairing 0.5.3 has requirement urllib3==1.24.2, but you'll have urllib3 1.24.1 which is incompatible.\u001b[0m\n", 35 | "\u001b[31mdatalab 1.1.4 has requirement six==1.10.0, but you'll have six 1.12.0 which is incompatible.\u001b[0m\n", 36 | "Installing collected packages: pip, setuptools, wheel\n", 37 | " Found existing installation: pip 19.0.3\n", 38 | " Uninstalling pip-19.0.3:\n", 39 | " 
Successfully uninstalled pip-19.0.3\n", 40 | " Found existing installation: setuptools 40.8.0\n", 41 | " Uninstalling setuptools-40.8.0:\n", 42 | " Successfully uninstalled setuptools-40.8.0\n", 43 | " Found existing installation: wheel 0.33.1\n", 44 | " Uninstalling wheel-0.33.1:\n", 45 | " Successfully uninstalled wheel-0.33.1\n", 46 | "Successfully installed pip-19.2.1 setuptools-41.0.1 wheel-0.33.4\n", 47 | "Collecting tensorflow\n", 48 | " Using cached https://files.pythonhosted.org/packages/f4/28/96efba1a516cdacc2e2d6d081f699c001d414cc8ca3250e6d59ae657eb2b/tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl\n", 49 | "Collecting six>=1.10.0 (from tensorflow)\n", 50 | " Downloading https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl\n", 51 | "Collecting google-pasta>=0.1.6 (from tensorflow)\n", 52 | " Using cached https://files.pythonhosted.org/packages/d0/33/376510eb8d6246f3c30545f416b2263eee461e40940c2a4413c711bdf62d/google_pasta-0.1.7-py3-none-any.whl\n", 53 | "Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0 (from tensorflow)\n", 54 | " Using cached https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl\n", 55 | "Collecting astor>=0.6.0 (from tensorflow)\n", 56 | " Using cached https://files.pythonhosted.org/packages/d1/4f/950dfae467b384fc96bc6469de25d832534f6b4441033c39f914efd13418/astor-0.8.0-py2.py3-none-any.whl\n", 57 | "Collecting tensorboard<1.15.0,>=1.14.0 (from tensorflow)\n", 58 | " Using cached https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl\n", 59 | "Collecting keras-applications>=1.0.6 (from tensorflow)\n", 60 | " Using cached https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl\n", 61 | "Collecting wrapt>=1.11.1 (from tensorflow)\n", 62 | "Collecting keras-preprocessing>=1.0.5 (from tensorflow)\n", 63 | " Using cached https://files.pythonhosted.org/packages/28/6a/8c1f62c37212d9fc441a7e26736df51ce6f0e38455816445471f10da4f0a/Keras_Preprocessing-1.1.0-py2.py3-none-any.whl\n", 64 | "Collecting absl-py>=0.7.0 (from tensorflow)\n", 65 | "Collecting gast>=0.2.0 (from tensorflow)\n", 66 | "Collecting protobuf>=3.6.1 (from tensorflow)\n", 67 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/30/fd/60ce148d8e4205bdf6da4ffec31348fd33f710c20a882b44319d54fd51ae/protobuf-3.9.1-cp37-cp37m-manylinux1_x86_64.whl (1.2MB)\n", 68 | "\u001b[K |████████████████████████████████| 1.2MB 59.5MB/s eta 0:00:01\n", 69 | "\u001b[?25hCollecting grpcio>=1.8.6 (from tensorflow)\n", 70 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9d/b1/b80dea9e0bbbdd07bf7ba69c6df1aeb3e88b90b85ca326c40be9e29bc37c/grpcio-1.22.0-cp37-cp37m-manylinux1_x86_64.whl (2.2MB)\n", 71 | "\u001b[K |████████████████████████████████| 2.2MB 49.5MB/s eta 0:00:01\n", 72 | "\u001b[?25hCollecting wheel>=0.26 (from tensorflow)\n", 73 | " Using cached https://files.pythonhosted.org/packages/bb/10/44230dd6bf3563b8f227dbf344c908d412ad2ff48066476672f3a72e174e/wheel-0.33.4-py2.py3-none-any.whl\n", 74 | "Collecting termcolor>=1.1.0 (from tensorflow)\n", 75 | "Collecting numpy<2.0,>=1.14.5 (from tensorflow)\n", 76 | "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/05/4b/55cfbfd3e5e85016eeef9f21c0ec809d978706a0d60b62cc28aeec8c792f/numpy-1.17.0-cp37-cp37m-manylinux1_x86_64.whl (20.3MB)\n", 77 | "\u001b[K |████████████████████████████████| 20.3MB 51.8MB/s eta 0:00:01\n", 78 | "\u001b[?25hCollecting werkzeug>=0.11.15 (from tensorboard<1.15.0,>=1.14.0->tensorflow)\n", 79 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d1/ab/d3bed6b92042622d24decc7aadc8877badf18aeca1571045840ad4956d3f/Werkzeug-0.15.5-py2.py3-none-any.whl (328kB)\n", 80 | "\u001b[K |████████████████████████████████| 337kB 67.7MB/s eta 0:00:01\n", 81 | "\u001b[?25hCollecting setuptools>=41.0.0 (from tensorboard<1.15.0,>=1.14.0->tensorflow)\n", 82 | " Using cached https://files.pythonhosted.org/packages/ec/51/f45cea425fd5cb0b0380f5b0f048ebc1da5b417e48d304838c02d6288a1e/setuptools-41.0.1-py2.py3-none-any.whl\n", 83 | "Collecting markdown>=2.6.8 (from tensorboard<1.15.0,>=1.14.0->tensorflow)\n", 84 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c0/4e/fd492e91abdc2d2fcb70ef453064d980688762079397f779758e055f6575/Markdown-3.1.1-py2.py3-none-any.whl (87kB)\n", 85 | "\u001b[K |████████████████████████████████| 92kB 51.6MB/s eta 0:00:01\n", 86 | "\u001b[?25hCollecting h5py (from keras-applications>=1.0.6->tensorflow)\n", 87 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/8e/fd/2ca5c4f4ed33ac4178f9c4d551e3946ab480866e3cd67a65a67a4bb35367/h5py-2.9.0-cp37-cp37m-manylinux1_x86_64.whl (2.8MB)\n", 88 | "\u001b[K |████████████████████████████████| 2.8MB 71.9MB/s eta 0:00:01\n", 89 | "\u001b[31mERROR: astroid 2.2.5 requires typed-ast>=1.3.0; implementation_name == \"cpython\", which is not installed.\u001b[0m\n", 90 | "\u001b[31mERROR: thinc 6.12.1 has requirement wrapt<1.11.0,>=1.10.0, but you'll have wrapt 1.11.2 which is incompatible.\u001b[0m\n", 91 | "\u001b[31mERROR: fairing 0.5.3 has requirement tornado<6.0.0,>=5.1.1, but you'll have tornado 6.0.3 which is incompatible.\u001b[0m\n", 92 | "\u001b[31mERROR: fairing 0.5.3 has requirement urllib3==1.24.2, but you'll have urllib3 1.24.1 which is incompatible.\u001b[0m\n", 93 | "\u001b[31mERROR: datalab 1.1.4 has requirement six==1.10.0, but you'll have six 1.12.0 which is incompatible.\u001b[0m\n", 94 | "\u001b[?25hInstalling collected packages: six, google-pasta, tensorflow-estimator, astor, absl-py, werkzeug, setuptools, protobuf, numpy, markdown, grpcio, wheel, tensorboard, h5py, keras-applications, wrapt, keras-preprocessing, gast, termcolor, tensorflow\n", 95 | "Successfully installed absl-py-0.7.1 astor-0.8.0 gast-0.2.2 google-pasta-0.1.7 grpcio-1.22.0 h5py-2.9.0 keras-applications-1.0.8 keras-preprocessing-1.1.0 markdown-3.1.1 numpy-1.17.0 protobuf-3.9.1 setuptools-41.0.1 six-1.12.0 tensorboard-1.14.0 tensorflow-1.14.0 tensorflow-estimator-1.14.0 termcolor-1.1.0 werkzeug-0.15.5 wheel-0.33.4 wrapt-1.11.2\n", 96 | "Collecting keras\n", 97 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5e/10/aa32dad071ce52b5502266b5c659451cfd6ffcbf14e6c8c4f16c0ff5aaab/Keras-2.2.4-py2.py3-none-any.whl (312kB)\n", 98 | "\u001b[K |████████████████████████████████| 317kB 56.5MB/s eta 0:00:01\n", 99 | "\u001b[?25hCollecting pyyaml (from keras)\n", 100 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265kB)\n", 101 | "\u001b[K |████████████████████████████████| 266kB 68.0MB/s eta 0:00:01\n", 102 | "\u001b[?25hCollecting h5py (from keras)\n", 103 | " 
Using cached https://files.pythonhosted.org/packages/8e/fd/2ca5c4f4ed33ac4178f9c4d551e3946ab480866e3cd67a65a67a4bb35367/h5py-2.9.0-cp37-cp37m-manylinux1_x86_64.whl\n" 104 | ] 105 | }, 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Collecting keras-applications>=1.0.6 (from keras)\n", 111 | " Using cached https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl\n", 112 | "Collecting numpy>=1.9.1 (from keras)\n", 113 | " Using cached https://files.pythonhosted.org/packages/05/4b/55cfbfd3e5e85016eeef9f21c0ec809d978706a0d60b62cc28aeec8c792f/numpy-1.17.0-cp37-cp37m-manylinux1_x86_64.whl\n", 114 | "Collecting six>=1.9.0 (from keras)\n", 115 | " Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl\n", 116 | "Collecting keras-preprocessing>=1.0.5 (from keras)\n", 117 | " Using cached https://files.pythonhosted.org/packages/28/6a/8c1f62c37212d9fc441a7e26736df51ce6f0e38455816445471f10da4f0a/Keras_Preprocessing-1.1.0-py2.py3-none-any.whl\n", 118 | "Collecting scipy>=0.14 (from keras)\n", 119 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5d/bd/c0feba81fb60e231cf40fc8a322ed5873c90ef7711795508692b1481a4ae/scipy-1.3.0-cp37-cp37m-manylinux1_x86_64.whl (25.2MB)\n", 120 | "\u001b[K |████████████████████████████████| 25.2MB 47.7MB/s eta 0:00:01\n", 121 | "\u001b[?25hBuilding wheels for collected packages: pyyaml\n", 122 | " Building wheel for pyyaml (setup.py) ... \u001b[?25ldone\n", 123 | "\u001b[?25h Created wheel for pyyaml: filename=PyYAML-5.1.2-cp37-cp37m-linux_x86_64.whl size=44104 sha256=c3cb996f3a1849dbcce29cdc8ab26f3aa89e6111b37adbf89a62a65b54931766\n", 124 | " Stored in directory: /home/jupyter/.cache/pip/wheels/d9/45/dd/65f0b38450c47cf7e5312883deb97d065e030c5cca0a365030\n", 125 | "Successfully built pyyaml\n", 126 | "\u001b[31mERROR: astroid 2.2.5 requires typed-ast>=1.3.0; implementation_name == \"cpython\", which is not installed.\u001b[0m\n", 127 | "\u001b[31mERROR: thinc 6.12.1 has requirement wrapt<1.11.0,>=1.10.0, but you'll have wrapt 1.11.2 which is incompatible.\u001b[0m\n", 128 | "\u001b[31mERROR: fairing 0.5.3 has requirement tornado<6.0.0,>=5.1.1, but you'll have tornado 6.0.3 which is incompatible.\u001b[0m\n", 129 | "\u001b[31mERROR: fairing 0.5.3 has requirement urllib3==1.24.2, but you'll have urllib3 1.24.1 which is incompatible.\u001b[0m\n", 130 | "\u001b[31mERROR: datalab 1.1.4 has requirement six==1.10.0, but you'll have six 1.12.0 which is incompatible.\u001b[0m\n", 131 | "Installing collected packages: pyyaml, numpy, six, h5py, keras-applications, keras-preprocessing, scipy, keras\n", 132 | "Successfully installed h5py-2.9.0 keras-2.2.4 keras-applications-1.0.8 keras-preprocessing-1.1.0 numpy-1.17.0 pyyaml-5.1.2 scipy-1.3.0 six-1.12.0\n", 133 | "Requirement already satisfied: iterative-stratification in /opt/anaconda3/lib/python3.7/site-packages (0.1.6)\n", 134 | "Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.7/site-packages (from iterative-stratification) (1.17.0)\n", 135 | "Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.7/site-packages (from iterative-stratification) (0.20.3)\n", 136 | "Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.7/site-packages (from iterative-stratification) (1.3.0)\n", 137 | "Requirement already up-to-date: sacremoses in 
/opt/anaconda3/lib/python3.7/site-packages (0.0.31)\n", 138 | "Requirement already satisfied, skipping upgrade: joblib in /opt/anaconda3/lib/python3.7/site-packages (from sacremoses) (0.13.2)\n", 139 | "Requirement already satisfied, skipping upgrade: six in /opt/anaconda3/lib/python3.7/site-packages (from sacremoses) (1.12.0)\n", 140 | "Requirement already satisfied, skipping upgrade: click in /opt/anaconda3/lib/python3.7/site-packages (from sacremoses) (7.0)\n", 141 | "Requirement already satisfied, skipping upgrade: tqdm in /opt/anaconda3/lib/python3.7/site-packages (from sacremoses) (4.31.1)\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "#!pip install \"tensorflow_hub==0.4.0\"\n", 147 | "#!pip install --upgrade pip setuptools wheel\n", 148 | "#!pip install -I tensorflow\n", 149 | "#!pip install -I keras\n", 150 | "#!pip install iterative-stratification\n", 151 | "#!pip install -U sacremoses" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 4, 157 | "metadata": { 158 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 159 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 160 | "colab": {}, 161 | "colab_type": "code", 162 | "id": "yzkDHu1qhHIa" 163 | }, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 170 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 171 | "/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 172 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 173 | "/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 174 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 175 | "/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 176 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 177 | "/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 178 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 179 | "/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 180 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", 181 | "/opt/anaconda3/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 182 | " _np_qint8 = 
np.dtype([(\"qint8\", np.int8, 1)])\n", 183 | "/opt/anaconda3/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 184 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 185 | "/opt/anaconda3/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 186 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 187 | "/opt/anaconda3/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 188 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 189 | "/opt/anaconda3/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 190 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 191 | "/opt/anaconda3/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 192 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", 193 | "Using TensorFlow backend.\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "import tensorflow as tf\n", 199 | "import pandas as pd\n", 200 | "import numpy as np\n", 201 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 202 | "from sklearn.model_selection import train_test_split\n", 203 | "from sklearn.metrics import classification_report, hamming_loss, roc_auc_score\n", 204 | "from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n", 205 | "\n", 206 | "mlb = MultiLabelBinarizer()\n", 207 | "\n", 208 | "from keras.preprocessing.text import Tokenizer\n", 209 | "from keras.preprocessing.sequence import pad_sequences\n", 210 | "from keras.utils import to_categorical\n", 211 | "from keras.layers import Dense, Input, GlobalMaxPooling1D\n", 212 | "from keras.layers import Conv1D, MaxPooling1D, Embedding\n", 213 | "from keras.initializers import Constant\n", 214 | "\n", 215 | "import tensorflow_hub as hub\n", 216 | "import os\n", 217 | "import re\n", 218 | "from keras import backend as K\n", 219 | "import keras.layers as layers\n", 220 | "from keras.models import Model, load_model\n", 221 | "from keras.engine import Layer\n", 222 | "\n", 223 | "from sacremoses import MosesTokenizer\n", 224 | "\n", 225 | "# Initialize session\n", 226 | "#sess = tf.Session()\n", 227 | "#K.set_session(sess)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 9, 233 | "metadata": { 234 | "colab": {}, 235 | "colab_type": "code", 236 | "id": "Pi63MqzME0Ji" 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "TEXT_DATA_DIR = f\"../../datasets/cleanup_labelled.csv\"\n", 241 | "\n", 242 | "MAX_SEQUENCE_LENGTH = 1000\n", 243 | "MAX_NUM_WORDS = 20000\n", 244 | "EMBEDDING_DIM = 50\n", 245 | "N_EPOCHS = 10\n", 246 | "VALIDATION_SPLIT = 0.2\n", 247 | "\n", 248 | "labels_index = 
[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 6, 254 | "metadata": { 255 | "colab": {}, 256 | "colab_type": "code", 257 | "id": "p_nighR5UdrV" 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "class ElmoEmbeddingLayer(Layer):\n", 262 | " def __init__(self, **kwargs):\n", 263 | " self.dimensions = 1024\n", 264 | " self.trainable=True\n", 265 | " super(ElmoEmbeddingLayer, self).__init__(**kwargs)\n", 266 | "\n", 267 | " def build(self, input_shape):\n", 268 | " self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,\n", 269 | " name=\"{}_module\".format(self.name))\n", 270 | " \n", 271 | "\n", 272 | " self.trainable_weights += K.tf.trainable_variables(scope=\"^{}_module/.*\".format(self.name))\n", 273 | " super(ElmoEmbeddingLayer, self).build(input_shape)\n", 274 | "\n", 275 | " def call(self, x, mask=None):\n", 276 | " result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),\n", 277 | " as_dict=True,\n", 278 | " signature='default',\n", 279 | " )['default']\n", 280 | " return result\n", 281 | "\n", 282 | " def compute_mask(self, inputs, mask=None):\n", 283 | " return K.not_equal(inputs, '--PAD--')\n", 284 | "\n", 285 | " def compute_output_shape(self, input_shape):\n", 286 | " return (input_shape[0], self.dimensions)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 26, 292 | "metadata": { 293 | "colab": {}, 294 | "colab_type": "code", 295 | "id": "YJkKGmTooe0E" 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "# Function to build model\n", 300 | "def build_model(): \n", 301 | " input_text = layers.Input(shape=(1,), dtype=\"string\")\n", 302 | " embedding = ElmoEmbeddingLayer()(input_text)\n", 303 | " dense = layers.Dense(256, activation='relu')(embedding)\n", 304 | " pred = layers.Dense(17, activation='sigmoid')(dense)\n", 305 | "\n", 306 | " model = Model(inputs=[input_text], outputs=pred)\n", 307 | "\n", 308 | " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 309 | " model.summary()\n", 310 | "\n", 311 | " return model" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "metadata": { 318 | "colab": { 319 | "base_uri": "https://localhost:8080/", 320 | "height": 86 321 | }, 322 | "colab_type": "code", 323 | "id": "DYvfb0fCOX_a", 324 | "outputId": "f8da6174-991b-49eb-a620-a2748267890a" 325 | }, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Processing text dataset\n", 332 | "Tokenizing data\n", 333 | "Fold no. 1\n", 334 | "Fold no. 2\n", 335 | "Fold no. 3\n", 336 | "Fold no. 4\n", 337 | "Fold no. 5\n", 338 | "Fold no. 6\n", 339 | "Fold no. 7\n", 340 | "Fold no. 8\n", 341 | "Fold no. 9\n", 342 | "Fold no. 
10\n", 343 | "Finished\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "# second, prepare text samples and their labels\n", 349 | "print('Processing text dataset')\n", 350 | "mt = MosesTokenizer(lang='en')\n", 351 | "df = pd.read_csv(TEXT_DATA_DIR)\n", 352 | "df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 353 | "\n", 354 | "print('Tokenizing data')\n", 355 | "data = np.array([mt.tokenize(t, escape=False)[:150] for t in df.text])\n", 356 | "mlb = MultiLabelBinarizer()\n", 357 | "labels = mlb.fit_transform(df.labels)\n", 358 | "\n", 359 | "# Cross-validation: split the data into a training set and a test set\n", 360 | "cross_validation_sets = []\n", 361 | "count = 0\n", 362 | "mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)\n", 363 | "\n", 364 | "for train_index, test_index in mskf.split(data, labels):\n", 365 | " count += 1\n", 366 | " print(f\"Fold no. {count}\")\n", 367 | " train_text, test_text = data[train_index], data[test_index]\n", 368 | " \n", 369 | " # Look into adapting the script to accept list of tokens instead\n", 370 | " train_text = [' '.join(t) for t in train_text]\n", 371 | " train_text = np.array(train_text, dtype=object)[:, np.newaxis]\n", 372 | " \n", 373 | " test_text = [' '.join(t) for t in test_text]\n", 374 | " test_text = np.array(test_text, dtype=object)[:, np.newaxis]\n", 375 | " \n", 376 | " train_label, test_label = labels[train_index], labels[test_index]\n", 377 | " cross_validation_sets.append((train_text, train_label, test_text, test_label))\n", 378 | "print('Finished')" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "colab": { 386 | "base_uri": "https://localhost:8080/", 387 | "height": 363 388 | }, 389 | "colab_type": "code", 390 | "id": "zHtvFIu0Zsyx", 391 | "outputId": "c7fc1dc7-4ed3-42fe-bfa4-a8a911bfc317" 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "count = 0\n", 396 | "for train_text, train_label, test_text, test_label in cross_validation_sets:\n", 397 | " count += 1\n", 398 | " print(f\"Fold {count}\")\n", 399 | " \n", 400 | " model = build_model()\n", 401 | " model.fit(train_text, \n", 402 | " train_label,\n", 403 | " epochs=N_EPOCHS,\n", 404 | " batch_size=32)\n", 405 | " model.save(f\"elmo_{N_EPOCHS}_epochs_crossvalidated_fold_{count}.h5\")" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 52, 411 | "metadata": { 412 | "colab": {}, 413 | "colab_type": "code", 414 | "id": "Ki-1denqGAUj" 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "def metrics_avg(models_testx_testy, labels_, thres=0.3):\n", 419 | " def calc(model, test_x, test_y):\n", 420 | " predictions = model.predict(test_x)>thres\n", 421 | " metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)\n", 422 | " metrics_df = pd.DataFrame.from_dict(metrics)\n", 423 | " h = hamming_loss(test_y, predictions)\n", 424 | " roc = roc_auc_score(test_y, predictions, average='micro')\n", 425 | " return metrics_df, h, roc\n", 426 | "\n", 427 | " model_1, test_x_1, test_y_1 = models_testx_testy[0]\n", 428 | " metrics_agg, ham, roc = calc(model_1, test_x_1, test_y_1)\n", 429 | " n = len(models_testx_testy)\n", 430 | "\n", 431 | " for model, test_x, test_y_1 in models_testx_testy[1:]:\n", 432 | " metrics, h, r = calc(model, test_x, test_y_1)\n", 433 | " metrics_agg += metrics\n", 434 | " ham += h\n", 435 | " roc += r\n", 436 | "\n", 437 | " return metrics_agg/n, ham/n, roc/n" 438 | ] 439 | }, 440 | { 441 | "cell_type": 
"code", 442 | "execution_count": 59, 443 | "metadata": { 444 | "colab": { 445 | "base_uri": "https://localhost:8080/", 446 | "height": 327 447 | }, 448 | "colab_type": "code", 449 | "id": "yT92tW4qGCsD", 450 | "outputId": "66aba9a4-5b73-471b-b220-76c75aa49e96" 451 | }, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "_________________________________________________________________\n", 458 | "Layer (type) Output Shape Param # \n", 459 | "=================================================================\n", 460 | "input_19 (InputLayer) (None, 1) 0 \n", 461 | "_________________________________________________________________\n", 462 | "elmo_embedding_layer_19 (Elm (None, 1024) 4 \n", 463 | "_________________________________________________________________\n", 464 | "dense_35 (Dense) (None, 256) 262400 \n", 465 | "_________________________________________________________________\n", 466 | "dense_36 (Dense) (None, 17) 4369 \n", 467 | "=================================================================\n", 468 | "Total params: 266,773\n", 469 | "Trainable params: 266,773\n", 470 | "Non-trainable params: 0\n", 471 | "_________________________________________________________________\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "labels = [str(i) for i in range(1,18)]\n", 477 | "model = None\n", 478 | "model = build_model()\n", 479 | "model.load_weights('elmo_15epochs_crossvalidated_fold_1.h5')\n", 480 | "#averaged_results = metrics_avg([(model, test_text, test_label)], labels)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 60, 486 | "metadata": { 487 | "colab": {}, 488 | "colab_type": "code", 489 | "id": "vSKFkaq0HeCD" 490 | }, 491 | "outputs": [ 492 | { 493 | "name": "stderr", 494 | "output_type": "stream", 495 | "text": [ 496 | "/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.\n", 497 | " 'precision', 'predicted', average, warn_for)\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "train_text, train_label, test_text, test_label = cross_validation_sets[0]\n", 503 | "avg_results = metrics_avg([(model, test_text, test_label)], labels)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 63, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "0.7789815538905387" 515 | ] 516 | }, 517 | "execution_count": 63, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "avg_results[2]" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 65, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "name": "stdout", 533 | "output_type": "stream", 534 | "text": [ 535 | "Epoch 1/5\n", 536 | "4711/4711 [==============================] - 948s 201ms/step - loss: 0.0859 - acc: 0.9699\n", 537 | "Epoch 2/5\n", 538 | "4711/4711 [==============================] - 936s 199ms/step - loss: 0.0771 - acc: 0.9727\n", 539 | "Epoch 3/5\n", 540 | "4711/4711 [==============================] - 930s 197ms/step - loss: 0.0710 - acc: 0.9753\n", 541 | "Epoch 4/5\n", 542 | "4711/4711 [==============================] - 934s 198ms/step - loss: 0.0659 - acc: 0.9766\n", 543 | "Epoch 5/5\n", 544 | "4711/4711 [==============================] - 936s 199ms/step - loss: 0.0582 - acc: 0.9806\n" 545 | ] 546 | } 547 | ], 548 | "source": [ 549 | "model.fit(train_text, 
\n", 550 | " train_label,\n", 551 | " epochs=5,\n", 552 | " batch_size=32)\n", 553 | "model.save(f\"elmo_20epochs_crossvalidated_fold_1.h5\")" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 66, 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stderr", 563 | "output_type": "stream", 564 | "text": [ 565 | "/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.\n", 566 | " 'precision', 'predicted', average, warn_for)\n" 567 | ] 568 | } 569 | ], 570 | "source": [ 571 | "avg_results = metrics_avg([(model, test_text, test_label)], labels)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 69, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "data": { 581 | "text/plain": [ 582 | "0.7735558335056892" 583 | ] 584 | }, 585 | "execution_count": 69, 586 | "metadata": {}, 587 | "output_type": "execute_result" 588 | } 589 | ], 590 | "source": [ 591 | "avg_results[2]" 592 | ] 593 | } 594 | ], 595 | "metadata": { 596 | "accelerator": "GPU", 597 | "colab": { 598 | "collapsed_sections": [], 599 | "name": "word-embeddings-elmo.ipynb", 600 | "provenance": [], 601 | "version": "0.3.2" 602 | }, 603 | "kernelspec": { 604 | "display_name": "Python 3", 605 | "language": "python", 606 | "name": "python3" 607 | }, 608 | "language_info": { 609 | "codemirror_mode": { 610 | "name": "ipython", 611 | "version": 3 612 | }, 613 | "file_extension": ".py", 614 | "mimetype": "text/x-python", 615 | "name": "python", 616 | "nbconvert_exporter": "python", 617 | "pygments_lexer": "ipython3", 618 | "version": "3.7.3" 619 | } 620 | }, 621 | "nbformat": 4, 622 | "nbformat_minor": 1 623 | } 624 | -------------------------------------------------------------------------------- /experiments/glove.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "name": "python3", 7 | "display_name": "Python 3" 8 | }, 9 | "language_info": { 10 | "codemirror_mode": { 11 | "name": "ipython", 12 | "version": 3 13 | }, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "nbconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": "3.6.6" 20 | }, 21 | "colab": { 22 | "name": "glove.ipynb", 23 | "version": "0.3.2", 24 | "provenance": [], 25 | "collapsed_sections": [], 26 | "include_colab_link": true 27 | }, 28 | "accelerator": "GPU" 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 45 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 46 | "id": "yzkDHu1qhHIa", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 | "source": [ 51 | "import tensorflow as tf\n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 55 | "from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score\n", 56 | "from keras import optimizers\n", 57 | "import os\n", 58 | "import re\n", 59 | "\n", 60 | "from keras.preprocessing.text import Tokenizer\n", 61 | "from keras.preprocessing.sequence import pad_sequences\n", 62 | 
"from keras.layers import Dense, Input, GlobalMaxPooling1D\n", 63 | "from keras.layers import Conv1D, MaxPooling1D, Embedding\n", 64 | "from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Bidirectional\n", 65 | "from keras.models import Model, Sequential\n", 66 | "from keras.initializers import Constant\n", 67 | "from keras.optimizers import Adam, RMSprop\n", 68 | "\n", 69 | "from keras.preprocessing import sequence\n", 70 | "from keras.models import Sequential\n", 71 | "from keras.layers import Dense, Dropout, Activation, LSTM, Conv1D, GlobalMaxPooling1D, CuDNNLSTM\n", 72 | "from keras.layers import Embedding\n", 73 | "from keras.models import load_model\n", 74 | "\n", 75 | "\n", 76 | "#from tensorflow.keras.backend import set_session\n", 77 | "#sess = tf.Session()\n", 78 | "#set_session(sess)\n", 79 | "#sess.run(tf.global_variables_initializer())" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "G6ld3ptbEqsA", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "from google.colab import drive\n", 93 | "drive.mount('/content/gdrive', force_remount=True)\n", 94 | "base_dir = \"gdrive/My Drive/fastai-v3/sdgs/\"" 95 | ], 96 | "execution_count": 0, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "Pi63MqzME0Ji", 103 | "colab_type": "code", 104 | "colab": {} 105 | }, 106 | "source": [ 107 | "TEXT_DATA_DIR = f\"{base_dir}dataset/cleanup_labelled.csv\"\n", 108 | "CROSS_FOLDS = f\"{base_dir}dataset/cross_validation/\"\n", 109 | "GLOVE_DIR = f\"{base_dir}embeddings/glove/glove.6B/\"\n", 110 | "EMBEDDINGS_DIR = f\"{base_dir}embeddings/glove/\"\n", 111 | "\n", 112 | "MAX_SEQUENCE_LENGTH = 500\n", 113 | "MAX_NUM_WORDS = 20000\n", 114 | "EMBEDDING_DIM = 300\n", 115 | "NUM_EPOCHS = 20\n", 116 | "BATCH_SIZE = 128\n", 117 | "labels_index = [str(i) for i in range(1,18)]" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "k-DVSj0NhHIj", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "# Load pretrained embeddings in an index mapping words in the embeddings set\n", 131 | "# to their embeddings vector\n", 132 | "print('Indexing word vectors.')\n", 133 | "\n", 134 | "embeddings_index = {}\n", 135 | "with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')) as f:\n", 136 | " for line in f:\n", 137 | " values = line.split()\n", 138 | " word = values[0]\n", 139 | " coefs = np.asarray(values[1:], dtype='float32')\n", 140 | " embeddings_index[word] = coefs\n", 141 | "print(f\"Found {len(embeddings_index)} word vectors.\")" 142 | ], 143 | "execution_count": 0, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "ilVT-WLTmmH-", 150 | "colab_type": "code", 151 | "colab": {} 152 | }, 153 | "source": [ 154 | "is_mask = \"\"\n", 155 | "df = pd.read_csv(TEXT_DATA_DIR)\n", 156 | "\n", 157 | "###### MASK LABELS\n", 158 | "pattern = r\"(indicator)(\\s+\\d+\\.[\\d+a-d]\\.\\d+)|(target)(\\s+\\d+\\.[\\d+a-d])|(sdgs|sdg|goals|goal)\\W*\\s+(,?\\s*\\b\\d{1,2}\\b[and\\s\\b\\d{1,2}\\b]*)\"\n", 159 | "masked_df = df.text.str.replace(pattern, ' SDGLABEL ', regex=True, flags=re.IGNORECASE)\n", 160 | "masked_df = pd.DataFrame(masked_df.str.replace(' ', ' ', regex=True, flags=re.IGNORECASE))\n", 161 | "\n", 162 | "\n", 163 | "# Masked sequences for training, word index \n", 164 | "tokenizer = 
Tokenizer(num_words=MAX_NUM_WORDS)\n", 165 | "tokenizer.fit_on_texts(masked_df.text)\n", 166 | "word_index = tokenizer.word_index\n", 167 | "masked_sequences = tokenizer.texts_to_sequences(masked_df.text)\n", 168 | "masked_data = pad_sequences(masked_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 169 | "\n", 170 | "# Non masked sequences for testing\n", 171 | "non_masked_sequences = tokenizer.texts_to_sequences(df.text)\n", 172 | "non_masked_data = pad_sequences(non_masked_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 173 | "\n", 174 | "# Labels\n", 175 | "mlb = MultiLabelBinarizer()\n", 176 | "df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 177 | "labels = mlb.fit_transform(df.labels)" 178 | ], 179 | "execution_count": 0, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "id": "kLEjjQVeSvVn", 186 | "colab_type": "code", 187 | "colab": {} 188 | }, 189 | "source": [ 190 | "models = []\n", 191 | "arch = 'Conv1D_glorot_uniform'\n", 192 | "\n", 193 | "# Cross-validation: split the data into a training set and a test set\n", 194 | "for fold in os.listdir(CROSS_FOLDS):\n", 195 | " train_index = np.load(f\"{CROSS_FOLDS}{fold}/train.npy\")\n", 196 | " val_index = np.load(f\"{CROSS_FOLDS}{fold}/val.npy\")\n", 197 | " test_index = np.load(f\"{CROSS_FOLDS}{fold}/test.npy\")\n", 198 | "\n", 199 | " # Masked for training, and non_masked for testing\n", 200 | " x_train, x_val, x_test = masked_data[train_index], masked_data[val_index], non_masked_data[test_index]\n", 201 | " y_train, y_val, y_test = labels[train_index], labels[val_index], labels[test_index]\n", 202 | " \n", 203 | " print(fold)\n", 204 | " print('Preparing embedding matrix.')\n", 205 | " \n", 206 | " # prepare embedding matrix\n", 207 | " num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n", 208 | " embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", 209 | " for word, i in word_index.items():\n", 210 | " # Ignore word if not in the n most common words\n", 211 | " if i > MAX_NUM_WORDS:\n", 212 | " continue\n", 213 | " embedding_vector = embeddings_index.get(word)\n", 214 | " if embedding_vector is not None:\n", 215 | " # words not found in embedding index will be all-zeros.\n", 216 | " embedding_matrix[i] = embedding_vector\n", 217 | "\n", 218 | " # load pre-trained word embeddings into an Embedding layer\n", 219 | " # note that we set trainable = False so as to keep the embeddings fixed\n", 220 | " embedding_layer = Embedding(num_words,\n", 221 | " EMBEDDING_DIM,\n", 222 | " embeddings_initializer=Constant(embedding_matrix),\n", 223 | " input_length=MAX_SEQUENCE_LENGTH,\n", 224 | " trainable=False)\n", 225 | " sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", 226 | " embedded_sequences = embedding_layer(sequence_input)\n", 227 | " \n", 228 | " \n", 229 | " # \n", 230 | " if arch == \"Bidirectional_LSTM\":\n", 231 | " x = Bidirectional(LSTM(25, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedded_sequences)\n", 232 | " x = GlobalMaxPooling1D()(x)\n", 233 | " x = Dense(50, activation=\"relu\")(x)\n", 234 | " x = Dropout(0.1)(x)\n", 235 | " x = Dense(17, activation=\"sigmoid\")(x)\n", 236 | " model = Model(inputs=sequence_input, outputs=x)\n", 237 | " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 238 | "\n", 239 | " \n", 240 | " # 0.179 with 10 epochs, 300 dimensions\n", 241 | " if arch == \"convnet\":\n", 242 | " # 1D convnet with global maxpooling\n", 243 | " x = Conv1D(128, 5, 
activation='relu')(embedded_sequences)\n", 244 | " x = MaxPooling1D(5)(x)\n", 245 | " x = Conv1D(128, 5, activation='relu')(x)\n", 246 | " x = MaxPooling1D(5)(x)\n", 247 | " x = Conv1D(128, 5, activation='relu')(x)\n", 248 | " x = GlobalMaxPooling1D()(x)\n", 249 | " x = Dense(128, activation='relu')(x)\n", 250 | " preds = Dense(len(labels_index), activation='sigmoid')(x)\n", 251 | " model = Model(sequence_input, preds)\n", 252 | " model.compile(loss='binary_crossentropy', \n", 253 | " optimizer='rmsprop',\n", 254 | " metrics=['accuracy'])\n", 255 | " \n", 256 | " \n", 257 | " # 0.131 with 20 epochs, 300 dimensions\n", 258 | " if arch == \"Conv1D_glorot_uniform\":\n", 259 | " x = Conv1D(64, kernel_size=3, padding=\"valid\", kernel_initializer=\"glorot_uniform\")(embedded_sequences)\n", 260 | " avg_pool = GlobalAveragePooling1D()(x)\n", 261 | " max_pool = GlobalMaxPooling1D()(x)\n", 262 | " x = concatenate([avg_pool, max_pool])\n", 263 | " preds = Dense(len(labels_index), activation='sigmoid')(x)\n", 264 | " model = Model(sequence_input, preds)\n", 265 | " model.compile(loss='binary_crossentropy', \n", 266 | " optimizer=RMSprop(lr=0.001),\n", 267 | " metrics=['accuracy'])\n", 268 | " \n", 269 | "\n", 270 | " \n", 271 | " if arch == \"convolution1d\":\n", 272 | " #https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py\n", 273 | " model = Sequential()\n", 274 | "\n", 275 | " # we start off with an efficient embedding layer which maps\n", 276 | " # our vocab indices into embedding_dims dimensions\n", 277 | " #model.add(embedded_sequences)\n", 278 | " model.add(Embedding(num_words,\n", 279 | " EMBEDDING_DIM,\n", 280 | " input_length=MAX_SEQUENCE_LENGTH))\n", 281 | " model.add(Dropout(0.2))\n", 282 | "\n", 283 | " # we add a Convolution1D, which will learn filters\n", 284 | " # word group filters of size filter_length:\n", 285 | " model.add(Conv1D(filters,\n", 286 | " kernel_size,\n", 287 | " padding='valid',\n", 288 | " activation='relu',\n", 289 | " strides=1))\n", 290 | " # we use max pooling:\n", 291 | " model.add(GlobalMaxPooling1D())\n", 292 | "\n", 293 | " # We add a vanilla hidden layer:\n", 294 | " model.add(Dense(hidden_dims))\n", 295 | " model.add(Dropout(0.2))\n", 296 | " model.add(Activation('relu'))\n", 297 | "\n", 298 | " # We project onto a single unit output layer, and squash it with a sigmoid:\n", 299 | " model.add(len(labels_index))\n", 300 | " model.add(Activation('sigmoid'))\n", 301 | "\n", 302 | "\n", 303 | " model.fit(x_train, y_train,\n", 304 | " batch_size=128,\n", 305 | " epochs=NUM_EPOCHS,\n", 306 | " validation_data=(x_val, y_val))\n", 307 | "\n", 308 | " models.append((model, x_test, y_test))\n", 309 | " #model.save(EMBEDDINGS_DIR + f\"{is_mask}{arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5\")" 310 | ], 311 | "execution_count": 0, 312 | "outputs": [] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "id": "1dnhmTNH7KeM", 318 | "colab_type": "text" 319 | }, 320 | "source": [ 321 | "# Load and evaluate folds on test" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "metadata": { 327 | "id": "Ip3GMV8X03J_", 328 | "colab_type": "code", 329 | "colab": {} 330 | }, 331 | "source": [ 332 | "def metrics_avg(models_testx_testy, labels_, thres=0.3):\n", 333 | " def calc(model, test_x, test_y):\n", 334 | " predictions = model.predict(test_x)>thres\n", 335 | " metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)\n", 336 | " metrics_df = 
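The `convolution1d` branch above is left unfinished in the notebook: it refers to `filters`, `kernel_size` and `hidden_dims` without defining them, adds an integer instead of a final `Dense` layer, and is never compiled. A runnable sketch of that variant, reusing `num_words`, `EMBEDDING_DIM`, `MAX_SEQUENCE_LENGTH` and `labels_index` from the cells above; the hyper-parameter values (128 filters, kernel size 3, 250 hidden units) are assumptions, not taken from the thesis:

```python
from keras.models import Sequential
from keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Activation

filters, kernel_size, hidden_dims = 128, 3, 250  # illustrative values

model = Sequential()
# Trainable embedding layer; unlike the other branches, this one does not reuse the GloVe matrix.
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# One sigmoid unit per label for the multi-label setting.
model.add(Dense(len(labels_index)))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
```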
pd.DataFrame.from_dict(metrics)\n", 337 | " h = hamming_loss(test_y, predictions)\n", 338 | " roc = roc_auc_score(test_y, predictions, average='micro')\n", 339 | " return metrics_df, h, roc\n", 340 | "\n", 341 | " model_1, test_x_first, test_y_first = models_testx_testy[0]\n", 342 | " metrics_agg, ham, roc = calc(model_1, test_x_first, test_y_first)\n", 343 | " n = len(models_testx_testy)\n", 344 | "\n", 345 | " for model, test_x, test_y in models_testx_testy[1:]:\n", 346 | " metrics, h, r = calc(model, test_x, test_y)\n", 347 | " metrics_agg += metrics\n", 348 | " ham += h\n", 349 | " roc += r\n", 350 | "\n", 351 | " return metrics_agg/n, ham/n, roc/n" 352 | ], 353 | "execution_count": 0, 354 | "outputs": [] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "metadata": { 359 | "id": "PVBmo_HUH7O5", 360 | "colab_type": "code", 361 | "colab": {} 362 | }, 363 | "source": [ 364 | "loaded_arch = 'Conv1D_glorot_uniform'\n", 365 | "loaded_models = []\n", 366 | "for fold in os.listdir(CROSS_FOLDS):\n", 367 | " print(f\"Loading {fold}...\")\n", 368 | " test_index = np.load(f\"{CROSS_FOLDS}{fold}/test.npy\")\n", 369 | "\n", 370 | " x_test = data[test_index]\n", 371 | " y_test = labels[test_index]\n", 372 | " \n", 373 | " load_dir = EMBEDDINGS_DIR + f\"{loaded_arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5\"\n", 374 | " loaded_model = load_model(load_dir)\n", 375 | " \n", 376 | " loaded_models.append((loaded_model, x_test, y_test))\n", 377 | "print(f\"Finished loading the {loaded_arch} models.\")" 378 | ], 379 | "execution_count": 0, 380 | "outputs": [] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "metadata": { 385 | "id": "MYVLRfpK7QiZ", 386 | "colab_type": "code", 387 | "colab": {} 388 | }, 389 | "source": [ 390 | "avg_results = metrics_avg(models, labels_index); avg_results[0]" 391 | ], 392 | "execution_count": 0, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "metadata": { 398 | "id": "TLr3rO4g8mDL", 399 | "colab_type": "code", 400 | "colab": {} 401 | }, 402 | "source": [ 403 | "#avg_results[0].to_csv(EMBEDDINGS_DIR + f'masked_results_{arch}.csv', sep=';')" 404 | ], 405 | "execution_count": 0, 406 | "outputs": [] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "metadata": { 411 | "id": "UiZRbVe9o6b9", 412 | "colab_type": "code", 413 | "colab": {} 414 | }, 415 | "source": [ 416 | "hl = round(avg_results[1],4)\n", 417 | "roc_auc = round(avg_results[2],4)\n", 418 | "print(f\"hl;{hl};;roc-auc;{roc_auc}\")" 419 | ], 420 | "execution_count": 0, 421 | "outputs": [] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "metadata": { 426 | "id": "swnzksFiU6x5", 427 | "colab_type": "code", 428 | "colab": {} 429 | }, 430 | "source": [ 431 | "" 432 | ], 433 | "execution_count": 0, 434 | "outputs": [] 435 | } 436 | ] 437 | } -------------------------------------------------------------------------------- /experiments/traditional_ml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "traditional_ml.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | 
}, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "h6p8vLIwha9i", 33 | "colab_type": "code", 34 | "outputId": "a94edd48-4995-415e-dc06-1253155788d4", 35 | "colab": { 36 | "base_uri": "https://localhost:8080/", 37 | "height": 69 38 | } 39 | }, 40 | "source": [ 41 | "import os\n", 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "import nltk\n", 45 | "from nltk.corpus import stopwords\n", 46 | "from sklearn.pipeline import Pipeline\n", 47 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 48 | "from sklearn.multiclass import OneVsRestClassifier\n", 49 | "from sklearn.naive_bayes import MultinomialNB\n", 50 | "from sklearn.tree import DecisionTreeClassifier\n", 51 | "from sklearn.svm import LinearSVC\n", 52 | "from sklearn.neighbors import KNeighborsClassifier\n", 53 | "from sklearn.linear_model import LogisticRegression\n", 54 | "from sklearn.model_selection import GridSearchCV\n", 55 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 56 | "\n", 57 | "nltk.download('stopwords')" 58 | ], 59 | "execution_count": 2, 60 | "outputs": [ 61 | { 62 | "output_type": "stream", 63 | "text": [ 64 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 65 | "[nltk_data] Package stopwords is already up-to-date!\n" 66 | ], 67 | "name": "stdout" 68 | }, 69 | { 70 | "output_type": "execute_result", 71 | "data": { 72 | "text/plain": [ 73 | "True" 74 | ] 75 | }, 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "execution_count": 2 80 | } 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "XbXFaXGGhr21", 87 | "colab_type": "code", 88 | "outputId": "056f12e4-5e38-4ee4-d3db-d865264e4a3d", 89 | "colab": { 90 | "base_uri": "https://localhost:8080/", 91 | "height": 34 92 | } 93 | }, 94 | "source": [ 95 | "from google.colab import drive\n", 96 | "drive.mount('/content/gdrive', force_remount=True)\n", 97 | "base_dir = \"gdrive/My Drive/fastai-v3/sdgs/\"\n", 98 | "labelled_dataset = base_dir + \"dataset/cleanup_labelled.csv\"\n", 99 | "CROSS_FOLDS = f\"{base_dir}dataset/cross_validation/\"" 100 | ], 101 | "execution_count": 3, 102 | "outputs": [ 103 | { 104 | "output_type": "stream", 105 | "text": [ 106 | "Mounted at /content/gdrive\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "metadata": { 115 | "id": "TEVes-M5h8r8", 116 | "colab_type": "code", 117 | "colab": {} 118 | }, 119 | "source": [ 120 | "labelled = pd.read_csv(labelled_dataset)\n", 121 | "labelled.labels = labelled.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 122 | "mlb = MultiLabelBinarizer()\n", 123 | "\n", 124 | "data_x = labelled[['text']].values\n", 125 | "x = np.array([x[0] for x in data_x.tolist()])\n", 126 | "y = mlb.fit_transform(labelled.labels)\n", 127 | "\n", 128 | "stop_words = set(stopwords.words('english'))\n", 129 | "labels = [str(i) for i in range(1,18)]" 130 | ], 131 | "execution_count": 0, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "metadata": { 137 | "id": "fBklIRcbhscE", 138 | "colab_type": "code", 139 | "colab": {} 140 | }, 141 | "source": [ 142 | "splits = []\n", 143 | "for fold in os.listdir(CROSS_FOLDS):\n", 144 | " train_index = np.load(f\"{CROSS_FOLDS}{fold}/train.npy\")\n", 145 | " val_index = np.load(f\"{CROSS_FOLDS}{fold}/val.npy\")\n", 146 | " splits.append((train_index, val_index))" 147 | ], 148 | "execution_count": 0, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "hUf7LEcxiS31", 155 
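The labels column in `cleanup_labelled.csv` stores each document's SDGs as a pipe-separated string (e.g. `"3|5"`), which the cell above converts to lists of integers and then to binary indicator vectors (one column per goal) with `MultiLabelBinarizer`. A small self-contained illustration with invented toy rows; passing `classes` explicitly is optional — the notebook lets `fit_transform` infer them from the data:

```python
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Toy frame with the same pipe-separated label format as cleanup_labelled.csv.
toy = pd.DataFrame({"text": ["doc about health and gender", "doc about climate"],
                    "labels": ["3|5", "13"]})
toy.labels = toy.labels.str.split('|').apply(lambda xs: [int(i) for i in xs])

mlb = MultiLabelBinarizer(classes=list(range(1, 18)))  # fix the column order to SDGs 1-17
y = mlb.fit_transform(toy.labels)
print(mlb.classes_)  # array([ 1,  2,  3, ..., 17])
print(y[0])          # 1s in the columns for goals 3 and 5, 0s elsewhere
```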
| "colab_type": "code", 156 | "colab": {} 157 | }, 158 | "source": [ 159 | "def grid_search(x, y, parameters, pipeline, splits):\n", 160 | " '''Train pipeline, test and print results'''\n", 161 | " gs = GridSearchCV(pipeline, \n", 162 | " parameters, \n", 163 | " cv=splits, \n", 164 | " n_jobs=5, \n", 165 | " verbose=10, \n", 166 | " return_train_score=True, \n", 167 | " scoring='f1_micro')\n", 168 | " gs.fit(x, y)\n", 169 | " print()\n", 170 | " print(\"Best parameters set:\")\n", 171 | " print(gs.best_estimator_.steps)\n", 172 | " print()\n", 173 | " results = gs.cv_results_\n", 174 | " print(f\"Mean train scores: {results['mean_train_score']}\")\n", 175 | " print(f\"Mean validation scores: {results['mean_test_score']}\")" 176 | ], 177 | "execution_count": 0, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "id": "TfjWBDTr-NdK", 184 | "colab_type": "text" 185 | }, 186 | "source": [ 187 | "# Naive Bayes" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "metadata": { 193 | "id": "TXmcCcXm9y8f", 194 | "colab_type": "code", 195 | "outputId": "7ed5acb2-daee-463d-89d5-c6270147fefd", 196 | "colab": { 197 | "base_uri": "https://localhost:8080/", 198 | "height": 697 199 | } 200 | }, 201 | "source": [ 202 | "pipeline = Pipeline([\n", 203 | " ('tfidf', TfidfVectorizer(stop_words=stop_words)),\n", 204 | " ('clf', OneVsRestClassifier(MultinomialNB(\n", 205 | " fit_prior=True, class_prior=None))),\n", 206 | " ])\n", 207 | "parameters = {\n", 208 | " 'tfidf__max_df': (0.25, 0.5, 0.75),\n", 209 | " 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],\n", 210 | " 'clf__estimator__alpha': (1e-2, 1e-3)\n", 211 | " }\n", 212 | "grid_search(x, y, parameters, pipeline, splits)" 213 | ], 214 | "execution_count": 69, 215 | "outputs": [ 216 | { 217 | "output_type": "stream", 218 | "text": [ 219 | "Fitting 5 folds for each of 18 candidates, totalling 90 fits\n" 220 | ], 221 | "name": "stdout" 222 | }, 223 | { 224 | "output_type": "stream", 225 | "text": [ 226 | "[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.\n", 227 | "[Parallel(n_jobs=5)]: Done 3 tasks | elapsed: 7.7s\n", 228 | "[Parallel(n_jobs=5)]: Done 8 tasks | elapsed: 16.5s\n", 229 | "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. 
This can be caused by a too short worker timeout or by a memory leak.\n", 230 | " \"timeout or by a memory leak.\", UserWarning\n", 231 | "[Parallel(n_jobs=5)]: Done 15 tasks | elapsed: 32.6s\n", 232 | "[Parallel(n_jobs=5)]: Done 22 tasks | elapsed: 44.8s\n", 233 | "[Parallel(n_jobs=5)]: Done 31 tasks | elapsed: 1.0min\n", 234 | "[Parallel(n_jobs=5)]: Done 40 tasks | elapsed: 1.3min\n", 235 | "[Parallel(n_jobs=5)]: Done 51 tasks | elapsed: 1.7min\n", 236 | "[Parallel(n_jobs=5)]: Done 62 tasks | elapsed: 2.1min\n", 237 | "[Parallel(n_jobs=5)]: Done 75 tasks | elapsed: 2.5min\n", 238 | "[Parallel(n_jobs=5)]: Done 90 out of 90 | elapsed: 2.9min finished\n" 239 | ], 240 | "name": "stderr" 241 | }, 242 | { 243 | "output_type": "stream", 244 | "text": [ 245 | "\n", 246 | "Best parameters set:\n", 247 | "[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 248 | " dtype=, encoding='utf-8',\n", 249 | " input='content', lowercase=True, max_df=0.25, max_features=None,\n", 250 | " min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,\n", 251 | " smooth_idf=True,\n", 252 | " stop_words={'a', 'about', 'above', 'after', 'again', 'against',\n", 253 | " 'ain', 'all', 'am', 'an', 'and', 'any', 'are',\n", 254 | " 'aren', \"aren't\", 'as', 'at', 'be', 'because',\n", 255 | " 'been', 'before', 'being', 'below', 'between',\n", 256 | " 'both', 'but', 'by', 'can', 'couldn', \"couldn't\", ...},\n", 257 | " strip_accents=None, sublinear_tf=False,\n", 258 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", 259 | " vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,\n", 260 | " fit_prior=True),\n", 261 | " n_jobs=None))]\n", 262 | "\n", 263 | "Mean train score: [0.88382074 0.98138123 0.97579812 0.88124053 0.98106887 0.97567946\n", 264 | " 0.88124053 0.98106887 0.97567946 0.90170611 0.98557881 0.98512222\n", 265 | " 0.89956406 0.98557758 0.98505287 0.89956406 0.98557758 0.98505287]\n", 266 | "Mean validation score: [0.54239689 0.65258213 0.66209054 0.54033704 0.65196753 0.65998952\n", 267 | " 0.54033704 0.65196753 0.65998952 0.51788825 0.63263591 0.65253621\n", 268 | " 0.51603845 0.63320208 0.65183251 0.51603845 0.63320208 0.65183251]\n" 269 | ], 270 | "name": "stdout" 271 | } 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "id": "6np7GSQXErBx", 278 | "colab_type": "text" 279 | }, 280 | "source": [ 281 | "# Support Vector Machine" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "metadata": { 287 | "id": "1pMWIkVZEf2m", 288 | "colab_type": "code", 289 | "outputId": "48bf17b0-fdab-44ec-86d4-e742a77b68d8", 290 | "colab": { 291 | "base_uri": "https://localhost:8080/", 292 | "height": 1000 293 | } 294 | }, 295 | "source": [ 296 | "pipeline = Pipeline([\n", 297 | " ('tfidf', TfidfVectorizer(stop_words=stop_words)),\n", 298 | " ('clf', OneVsRestClassifier(LinearSVC())),\n", 299 | "])\n", 300 | "parameters = {\n", 301 | " 'tfidf__max_df': (0.25, 0.5, 0.75),\n", 302 | " 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],\n", 303 | " \"clf__estimator__C\": [0.01, 0.1, 1],\n", 304 | " \"clf__estimator__class_weight\": ['balanced', None],\n", 305 | "}\n", 306 | "grid_search(x, y, parameters, pipeline, splits)" 307 | ], 308 | "execution_count": 72, 309 | "outputs": [ 310 | { 311 | "output_type": "stream", 312 | "text": [ 313 | "Fitting 5 folds for each of 54 candidates, totalling 270 fits\n" 314 | ], 315 | "name": "stdout" 316 | }, 317 | { 318 | "output_type": "stream", 319 | 
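The `grid_search` helper above passes the list of `(train_index, val_index)` pairs loaded from `cross_validation/` directly as the `cv` argument, so every hyper-parameter candidate is scored on the same pre-computed multilabel-stratified folds rather than on a fresh random split. A toy self-contained example of the same mechanism — the data, folds and parameter grid here are invented:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

x = np.array(["clean water", "quality education", "climate action", "gender equality"] * 5)
y = np.tile(np.array([[1, 0], [0, 1], [1, 1], [0, 1]]), (5, 1))  # toy multi-label targets

# Two hand-made folds: (train indices, validation indices), analogous to the .npy files above.
splits = [(np.arange(0, 16), np.arange(16, 20)),
          (np.arange(4, 20), np.arange(0, 4))]

pipeline = Pipeline([("tfidf", TfidfVectorizer()),
                     ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000)))])
gs = GridSearchCV(pipeline, {"clf__estimator__C": [0.1, 1]},
                  cv=splits, scoring="f1_micro")
gs.fit(x, y)
print(gs.best_params_)
```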
"text": [ 320 | "[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.\n", 321 | "[Parallel(n_jobs=5)]: Done 3 tasks | elapsed: 3.8s\n", 322 | "[Parallel(n_jobs=5)]: Done 8 tasks | elapsed: 13.6s\n", 323 | "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", 324 | " \"timeout or by a memory leak.\", UserWarning\n", 325 | "[Parallel(n_jobs=5)]: Done 15 tasks | elapsed: 31.6s\n", 326 | "[Parallel(n_jobs=5)]: Done 22 tasks | elapsed: 43.4s\n", 327 | "[Parallel(n_jobs=5)]: Done 31 tasks | elapsed: 1.1min\n", 328 | "[Parallel(n_jobs=5)]: Done 40 tasks | elapsed: 1.3min\n", 329 | "[Parallel(n_jobs=5)]: Done 51 tasks | elapsed: 1.7min\n", 330 | "[Parallel(n_jobs=5)]: Done 62 tasks | elapsed: 2.0min\n", 331 | "[Parallel(n_jobs=5)]: Done 75 tasks | elapsed: 2.5min\n", 332 | "[Parallel(n_jobs=5)]: Done 88 tasks | elapsed: 3.0min\n", 333 | "[Parallel(n_jobs=5)]: Done 103 tasks | elapsed: 3.6min\n", 334 | "[Parallel(n_jobs=5)]: Done 118 tasks | elapsed: 4.3min\n", 335 | "[Parallel(n_jobs=5)]: Done 135 tasks | elapsed: 5.0min\n", 336 | "[Parallel(n_jobs=5)]: Done 152 tasks | elapsed: 5.5min\n", 337 | "[Parallel(n_jobs=5)]: Done 171 tasks | elapsed: 6.2min\n", 338 | "[Parallel(n_jobs=5)]: Done 190 tasks | elapsed: 7.2min\n", 339 | "[Parallel(n_jobs=5)]: Done 211 tasks | elapsed: 9.4min\n", 340 | "[Parallel(n_jobs=5)]: Done 232 tasks | elapsed: 11.0min\n", 341 | "[Parallel(n_jobs=5)]: Done 255 tasks | elapsed: 12.1min\n", 342 | "[Parallel(n_jobs=5)]: Done 270 out of 270 | elapsed: 12.7min finished\n" 343 | ], 344 | "name": "stderr" 345 | }, 346 | { 347 | "output_type": "stream", 348 | "text": [ 349 | "\n", 350 | "Best parameters set:\n", 351 | "[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 352 | " dtype=, encoding='utf-8',\n", 353 | " input='content', lowercase=True, max_df=0.5, max_features=None,\n", 354 | " min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,\n", 355 | " smooth_idf=True,\n", 356 | " stop_words={'a', 'about', 'above', 'after', 'again', 'against',\n", 357 | " 'ain', 'all', 'am', 'an', 'and', 'any', 'are',\n", 358 | " 'aren', \"aren't\", 'as', 'at', 'be', 'because',\n", 359 | " 'been', 'before', 'being', 'below', 'between',\n", 360 | " 'both', 'but', 'by', 'can', 'couldn', \"couldn't\", ...},\n", 361 | " strip_accents=None, sublinear_tf=False,\n", 362 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", 363 | " vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True,\n", 364 | " fit_intercept=True, intercept_scaling=1,\n", 365 | " loss='squared_hinge', max_iter=1000,\n", 366 | " multi_class='ovr', penalty='l2',\n", 367 | " random_state=None, tol=0.0001,\n", 368 | " verbose=0),\n", 369 | " n_jobs=None))]\n", 370 | "\n", 371 | "Mean train scores: [0.77294833 0.85204538 0.89323386 0.77030056 0.84885368 0.89099056\n", 372 | " 0.77030056 0.84885368 0.89099056 0.07203024 0.00618181 0.00309553\n", 373 | " 0.06303203 0.00570288 0.00309553 0.06303203 0.00570288 0.00309553\n", 374 | " 0.88884474 0.95874936 0.97233492 0.88732735 0.95805672 0.97177693\n", 375 | " 0.88732735 0.95805672 0.97177693 0.57575521 0.48758409 0.39508222\n", 376 | " 0.57483189 0.48729448 0.39538995 0.57483189 0.48729448 0.39538995\n", 377 | " 0.96398181 0.98785065 0.98840078 0.96328752 0.98745277 
0.9882041\n", 378 | " 0.96328752 0.98745277 0.9882041 0.97136943 0.99406589 0.99534927\n", 379 | " 0.96973065 0.99390942 0.99527989 0.96973065 0.99390942 0.99527989]\n", 380 | "Mean validation scores: [0.69123257 0.72278014 0.72724794 0.68812894 0.71628516 0.72197627\n", 381 | " 0.68812894 0.71628516 0.72197627 0.06990639 0.00645839 0.00412055\n", 382 | " 0.0564234 0.00645839 0.00350896 0.0564234 0.00645839 0.00350896\n", 383 | " 0.71706419 0.75515093 0.75660924 0.71644835 0.75365843 0.75331609\n", 384 | " 0.71644835 0.75365843 0.75331609 0.5246548 0.47820268 0.4310089\n", 385 | " 0.52357481 0.4796591 0.43348646 0.52357481 0.4796591 0.43348646\n", 386 | " 0.71537598 0.76219688 0.76223206 0.7142166 0.76373858 0.76172959\n", 387 | " 0.7142166 0.76373858 0.76172959 0.69841614 0.72884177 0.73114328\n", 388 | " 0.70037614 0.72852174 0.73070509 0.70037614 0.72866054 0.73070509]\n" 389 | ], 390 | "name": "stdout" 391 | } 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "id": "FlXiPkDuEytV", 398 | "colab_type": "text" 399 | }, 400 | "source": [ 401 | "# Logistic Regression" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "metadata": { 407 | "id": "H_Lao8-DEnN_", 408 | "colab_type": "code", 409 | "outputId": "e590a8f4-e6ac-42fb-e380-4cc02ff7cc9d", 410 | "colab": { 411 | "base_uri": "https://localhost:8080/", 412 | "height": 1000 413 | } 414 | }, 415 | "source": [ 416 | "pipeline = Pipeline([\n", 417 | " ('tfidf', TfidfVectorizer(stop_words=stop_words)),\n", 418 | " ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),\n", 419 | "])\n", 420 | "parameters = {\n", 421 | " 'tfidf__max_df': (0.25, 0.5, 0.75),\n", 422 | " 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],\n", 423 | " \"clf__estimator__C\": [0.01, 0.1, 1],\n", 424 | " \"clf__estimator__class_weight\": ['balanced', None],\n", 425 | " \"clf__estimator__multi_class\": ['ovr', 'multinomial']\n", 426 | "}\n", 427 | "grid_search(x, y, parameters, pipeline, splits)" 428 | ], 429 | "execution_count": 73, 430 | "outputs": [ 431 | { 432 | "output_type": "stream", 433 | "text": [ 434 | "Fitting 5 folds for each of 108 candidates, totalling 540 fits\n" 435 | ], 436 | "name": "stdout" 437 | }, 438 | { 439 | "output_type": "stream", 440 | "text": [ 441 | "[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.\n", 442 | "[Parallel(n_jobs=5)]: Done 3 tasks | elapsed: 8.6s\n", 443 | "[Parallel(n_jobs=5)]: Done 8 tasks | elapsed: 38.3s\n", 444 | "[Parallel(n_jobs=5)]: Done 15 tasks | elapsed: 1.6min\n", 445 | "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. 
This can be caused by a too short worker timeout or by a memory leak.\n", 446 | " \"timeout or by a memory leak.\", UserWarning\n", 447 | "[Parallel(n_jobs=5)]: Done 22 tasks | elapsed: 2.2min\n", 448 | "[Parallel(n_jobs=5)]: Done 31 tasks | elapsed: 3.2min\n", 449 | "[Parallel(n_jobs=5)]: Done 40 tasks | elapsed: 3.8min\n", 450 | "[Parallel(n_jobs=5)]: Done 51 tasks | elapsed: 5.4min\n", 451 | "[Parallel(n_jobs=5)]: Done 62 tasks | elapsed: 6.9min\n", 452 | "[Parallel(n_jobs=5)]: Done 75 tasks | elapsed: 8.9min\n", 453 | "[Parallel(n_jobs=5)]: Done 88 tasks | elapsed: 10.9min\n", 454 | "[Parallel(n_jobs=5)]: Done 103 tasks | elapsed: 12.4min\n", 455 | "[Parallel(n_jobs=5)]: Done 118 tasks | elapsed: 14.0min\n", 456 | "[Parallel(n_jobs=5)]: Done 135 tasks | elapsed: 15.6min\n", 457 | "[Parallel(n_jobs=5)]: Done 152 tasks | elapsed: 17.6min\n", 458 | "[Parallel(n_jobs=5)]: Done 171 tasks | elapsed: 20.3min\n", 459 | "[Parallel(n_jobs=5)]: Done 190 tasks | elapsed: 22.0min\n", 460 | "[Parallel(n_jobs=5)]: Done 211 tasks | elapsed: 23.5min\n", 461 | "[Parallel(n_jobs=5)]: Done 232 tasks | elapsed: 24.8min\n", 462 | "[Parallel(n_jobs=5)]: Done 255 tasks | elapsed: 26.9min\n", 463 | "[Parallel(n_jobs=5)]: Done 278 tasks | elapsed: 28.6min\n", 464 | "[Parallel(n_jobs=5)]: Done 303 tasks | elapsed: 30.0min\n", 465 | "[Parallel(n_jobs=5)]: Done 328 tasks | elapsed: 31.7min\n", 466 | "[Parallel(n_jobs=5)]: Done 355 tasks | elapsed: 33.3min\n", 467 | "[Parallel(n_jobs=5)]: Done 382 tasks | elapsed: 37.6min\n", 468 | "[Parallel(n_jobs=5)]: Done 411 tasks | elapsed: 45.4min\n", 469 | "[Parallel(n_jobs=5)]: Done 440 tasks | elapsed: 57.9min\n", 470 | "[Parallel(n_jobs=5)]: Done 471 tasks | elapsed: 66.3min\n", 471 | "[Parallel(n_jobs=5)]: Done 502 tasks | elapsed: 69.1min\n", 472 | "[Parallel(n_jobs=5)]: Done 540 out of 540 | elapsed: 73.9min finished\n", 473 | "/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", 474 | " \"the coef_ did not converge\", ConvergenceWarning)\n" 475 | ], 476 | "name": "stderr" 477 | }, 478 | { 479 | "output_type": "stream", 480 | "text": [ 481 | "\n", 482 | "Best parameters set:\n", 483 | "[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 484 | " dtype=, encoding='utf-8',\n", 485 | " input='content', lowercase=True, max_df=0.25, max_features=None,\n", 486 | " min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,\n", 487 | " smooth_idf=True,\n", 488 | " stop_words={'a', 'about', 'above', 'after', 'again', 'against',\n", 489 | " 'ain', 'all', 'am', 'an', 'and', 'any', 'are',\n", 490 | " 'aren', \"aren't\", 'as', 'at', 'be', 'because',\n", 491 | " 'been', 'before', 'being', 'below', 'between',\n", 492 | " 'both', 'but', 'by', 'can', 'couldn', \"couldn't\", ...},\n", 493 | " strip_accents=None, sublinear_tf=False,\n", 494 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", 495 | " vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanced',\n", 496 | " dual=False, fit_intercept=True,\n", 497 | " intercept_scaling=1,\n", 498 | " l1_ratio=None, max_iter=100,\n", 499 | " multi_class='ovr', n_jobs=None,\n", 500 | " penalty='l2',\n", 501 | " random_state=None,\n", 502 | " solver='sag', tol=0.0001,\n", 503 | " verbose=0, warm_start=False),\n", 504 | " n_jobs=None))]\n", 505 | "\n", 506 | "Mean train scores: [0.71173682 0.77697963 0.81386591 0.70391098 0.77194347 
0.81022469\n", 507 | " 0.70411356 0.77200972 0.80994233 0.72460563 0.79198746 0.82880731\n", 508 | " 0.71845348 0.78788701 0.825587 0.71858288 0.78798999 0.8256514\n", 509 | " 0. 0. 0. 0. 0. 0.\n", 510 | " 0. 0. 0. 0. 0. 0.\n", 511 | " 0. 0. 0. 0. 0. 0.\n", 512 | " 0.77576207 0.86146837 0.90319218 0.77304261 0.85816527 0.901181\n", 513 | " 0.77300832 0.85818952 0.90111916 0.80818784 0.90414044 0.94077658\n", 514 | " 0.80679987 0.90053967 0.93789457 0.80678715 0.90058401 0.93792812\n", 515 | " 0.05460194 0.00405803 0.00254609 0.04589755 0.00385192 0.00268333\n", 516 | " 0.04589755 0.00385192 0.00268333 0.18710012 0.0753853 0.03891922\n", 517 | " 0.18578649 0.07582503 0.03878756 0.18578649 0.07582503 0.03878756\n", 518 | " 0.88344878 0.95519249 0.96973288 0.88289275 0.95449698 0.96956937\n", 519 | " 0.88232043 0.95467973 0.96950898 0.83863305 0.94319369 0.95763089\n", 520 | " 0.83529162 0.9401308 0.95490003 0.83435054 0.9373421 0.95732457\n", 521 | " 0.53842762 0.42436727 0.33370738 0.53770059 0.42505249 0.33375524\n", 522 | " 0.53766435 0.42500999 0.33375524 0.67019333 0.62387455 0.55917563\n", 523 | " 0.67173422 0.62479204 0.56007461 0.67179429 0.62492126 0.56007484]\n", 524 | "Mean validation scores: [0.67564074 0.7059686 0.71051405 0.66844108 0.69976085 0.70607837\n", 525 | " 0.66854569 0.70033487 0.70609599 0.67786271 0.70882909 0.71378035\n", 526 | " 0.67411611 0.70257507 0.70987338 0.6743223 0.70231985 0.70992551\n", 527 | " 0. 0. 0. 0. 0. 0.\n", 528 | " 0. 0. 0. 0. 0. 0.\n", 529 | " 0. 0. 0. 0. 0. 0.\n", 530 | " 0.69390518 0.72545524 0.72908992 0.6912414 0.72201224 0.72603783\n", 531 | " 0.6912414 0.72201224 0.7259348 0.70055958 0.73332417 0.73577836\n", 532 | " 0.69745261 0.73270567 0.73559387 0.69754992 0.73270567 0.73559387\n", 533 | " 0.04813402 0.00585286 0.00289548 0.0393736 0.00585286 0.00289548\n", 534 | " 0.0393736 0.00585286 0.00289548 0.1916123 0.08721381 0.04272109\n", 535 | " 0.18680926 0.0881392 0.04384319 0.18680926 0.0881392 0.04384319\n", 536 | " 0.71585408 0.75441297 0.75458996 0.71456135 0.75280116 0.75325123\n", 537 | " 0.71517983 0.75304333 0.75324417 0.67255892 0.74003016 0.73756923\n", 538 | " 0.66421345 0.7295356 0.73545831 0.66498539 0.73062041 0.7375956\n", 539 | " 0.50029206 0.43935878 0.38902827 0.49851857 0.43939077 0.39161177\n", 540 | " 0.49851857 0.43939077 0.39161177 0.57913382 0.56414074 0.53900445\n", 541 | " 0.58190362 0.56419852 0.54045572 0.58190362 0.56419852 0.54045572]\n" 542 | ], 543 | "name": "stdout" 544 | } 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": { 550 | "id": "kvVCq_YGFH8R", 551 | "colab_type": "text" 552 | }, 553 | "source": [ 554 | "# Trees" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "metadata": { 560 | "id": "S7CSwel9FJfd", 561 | "colab_type": "code", 562 | "outputId": "4636048e-79bf-4ec9-dc2e-a0744b363b3b", 563 | "colab": { 564 | "base_uri": "https://localhost:8080/", 565 | "height": 607 566 | } 567 | }, 568 | "source": [ 569 | "pipeline = Pipeline([\n", 570 | " ('tfidf', TfidfVectorizer(stop_words=stop_words)),\n", 571 | " ('clf', DecisionTreeClassifier()),\n", 572 | "])\n", 573 | "parameters = {\n", 574 | " 'tfidf__max_df': (0.25, 0.5, 0.75),\n", 575 | " 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]\n", 576 | "}\n", 577 | "grid_search(x, y, parameters, pipeline, splits)" 578 | ], 579 | "execution_count": 74, 580 | "outputs": [ 581 | { 582 | "output_type": "stream", 583 | "text": [ 584 | "Fitting 5 folds for each of 9 candidates, totalling 45 fits\n" 585 | ], 586 | "name": "stdout" 
587 | }, 588 | { 589 | "output_type": "stream", 590 | "text": [ 591 | "[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.\n", 592 | "[Parallel(n_jobs=5)]: Done 3 tasks | elapsed: 19.6s\n", 593 | "[Parallel(n_jobs=5)]: Done 8 tasks | elapsed: 1.4min\n", 594 | "[Parallel(n_jobs=5)]: Done 15 tasks | elapsed: 3.4min\n", 595 | "[Parallel(n_jobs=5)]: Done 22 tasks | elapsed: 4.6min\n", 596 | "[Parallel(n_jobs=5)]: Done 31 tasks | elapsed: 6.9min\n", 597 | "[Parallel(n_jobs=5)]: Done 41 out of 45 | elapsed: 9.9min remaining: 58.0s\n", 598 | "[Parallel(n_jobs=5)]: Done 45 out of 45 | elapsed: 10.0min finished\n" 599 | ], 600 | "name": "stderr" 601 | }, 602 | { 603 | "output_type": "stream", 604 | "text": [ 605 | "\n", 606 | "Best parameters set:\n", 607 | "[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 608 | " dtype=, encoding='utf-8',\n", 609 | " input='content', lowercase=True, max_df=0.25, max_features=None,\n", 610 | " min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,\n", 611 | " smooth_idf=True,\n", 612 | " stop_words={'a', 'about', 'above', 'after', 'again', 'against',\n", 613 | " 'ain', 'all', 'am', 'an', 'and', 'any', 'are',\n", 614 | " 'aren', \"aren't\", 'as', 'at', 'be', 'because',\n", 615 | " 'been', 'before', 'being', 'below', 'between',\n", 616 | " 'both', 'but', 'by', 'can', 'couldn', \"couldn't\", ...},\n", 617 | " strip_accents=None, sublinear_tf=False,\n", 618 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", 619 | " vocabulary=None)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", 620 | " max_features=None, max_leaf_nodes=None,\n", 621 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 622 | " min_samples_leaf=1, min_samples_split=2,\n", 623 | " min_weight_fraction_leaf=0.0, presort=False,\n", 624 | " random_state=None, splitter='best'))]\n", 625 | "\n", 626 | "Mean train scores: [0.99723591 0.99770228 0.99770228 0.99756458 0.99784069 0.99784069\n", 627 | " 0.99756458 0.99784069 0.99784069]\n", 628 | "Mean validation scores: [0.61193436 0.61485835 0.60765938 0.60602461 0.6039503 0.60307723\n", 629 | " 0.60399592 0.60201962 0.6012122 ]\n" 630 | ], 631 | "name": "stdout" 632 | } 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": { 638 | "id": "Eqw-DqlqIV8C", 639 | "colab_type": "text" 640 | }, 641 | "source": [ 642 | "# KNN" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "metadata": { 648 | "id": "tE_pyE6ZF0jM", 649 | "colab_type": "code", 650 | "outputId": "1fe76d87-224e-43ea-83f0-086f6e829e1e", 651 | "colab": { 652 | "base_uri": "https://localhost:8080/", 653 | "height": 1000 654 | } 655 | }, 656 | "source": [ 657 | "pipeline = Pipeline([\n", 658 | " ('tfidf', TfidfVectorizer(stop_words=stop_words)),\n", 659 | " ('clf', KNeighborsClassifier()),\n", 660 | "])\n", 661 | "parameters = {\n", 662 | " 'tfidf__max_df': (0.25, 0.5, 0.75),\n", 663 | " 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],\n", 664 | " 'clf__n_neighbors': (2,3,4,5),\n", 665 | " 'clf__weights': ('uniform', 'distance'),\n", 666 | " 'clf__metric': ['minkowski'],\n", 667 | " 'clf__algorithm': ('ball_tree', 'kd_tree', 'brute')\n", 668 | "}\n", 669 | "grid_search(x, y, parameters, pipeline, splits)" 670 | ], 671 | "execution_count": 75, 672 | "outputs": [ 673 | { 674 | "output_type": "stream", 675 | "text": [ 676 | "Fitting 5 folds for each of 216 candidates, totalling 1080 fits\n" 677 | ], 678 | "name": "stdout" 679 | }, 680 | { 681 | 
"output_type": "stream", 682 | "text": [ 683 | "[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.\n", 684 | "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", 685 | " \"timeout or by a memory leak.\", UserWarning\n", 686 | "[Parallel(n_jobs=5)]: Done 3 tasks | elapsed: 23.9s\n", 687 | "[Parallel(n_jobs=5)]: Done 8 tasks | elapsed: 59.0s\n", 688 | "[Parallel(n_jobs=5)]: Done 15 tasks | elapsed: 1.6min\n", 689 | "[Parallel(n_jobs=5)]: Done 22 tasks | elapsed: 2.4min\n", 690 | "[Parallel(n_jobs=5)]: Done 31 tasks | elapsed: 3.3min\n", 691 | "[Parallel(n_jobs=5)]: Done 40 tasks | elapsed: 4.0min\n", 692 | "[Parallel(n_jobs=5)]: Done 51 tasks | elapsed: 4.8min\n", 693 | "[Parallel(n_jobs=5)]: Done 62 tasks | elapsed: 5.3min\n", 694 | "[Parallel(n_jobs=5)]: Done 75 tasks | elapsed: 6.0min\n", 695 | "[Parallel(n_jobs=5)]: Done 88 tasks | elapsed: 6.7min\n", 696 | "[Parallel(n_jobs=5)]: Done 103 tasks | elapsed: 8.1min\n", 697 | "[Parallel(n_jobs=5)]: Done 118 tasks | elapsed: 9.5min\n", 698 | "[Parallel(n_jobs=5)]: Done 135 tasks | elapsed: 11.0min\n", 699 | "[Parallel(n_jobs=5)]: Done 152 tasks | elapsed: 11.8min\n", 700 | "[Parallel(n_jobs=5)]: Done 171 tasks | elapsed: 12.7min\n", 701 | "[Parallel(n_jobs=5)]: Done 190 tasks | elapsed: 14.1min\n", 702 | "[Parallel(n_jobs=5)]: Done 211 tasks | elapsed: 16.4min\n", 703 | "[Parallel(n_jobs=5)]: Done 232 tasks | elapsed: 18.0min\n", 704 | "[Parallel(n_jobs=5)]: Done 255 tasks | elapsed: 19.1min\n", 705 | "[Parallel(n_jobs=5)]: Done 278 tasks | elapsed: 20.7min\n", 706 | "[Parallel(n_jobs=5)]: Done 303 tasks | elapsed: 23.2min\n", 707 | "[Parallel(n_jobs=5)]: Done 328 tasks | elapsed: 24.9min\n", 708 | "[Parallel(n_jobs=5)]: Done 355 tasks | elapsed: 25.8min\n", 709 | "[Parallel(n_jobs=5)]: Done 382 tasks | elapsed: 27.7min\n", 710 | "[Parallel(n_jobs=5)]: Done 411 tasks | elapsed: 29.6min\n", 711 | "[Parallel(n_jobs=5)]: Done 440 tasks | elapsed: 30.8min\n", 712 | "[Parallel(n_jobs=5)]: Done 471 tasks | elapsed: 33.2min\n", 713 | "[Parallel(n_jobs=5)]: Done 502 tasks | elapsed: 35.2min\n", 714 | "[Parallel(n_jobs=5)]: Done 535 tasks | elapsed: 36.7min\n", 715 | "[Parallel(n_jobs=5)]: Done 568 tasks | elapsed: 39.8min\n", 716 | "[Parallel(n_jobs=5)]: Done 603 tasks | elapsed: 41.7min\n", 717 | "[Parallel(n_jobs=5)]: Done 638 tasks | elapsed: 43.6min\n", 718 | "[Parallel(n_jobs=5)]: Done 675 tasks | elapsed: 46.9min\n", 719 | "[Parallel(n_jobs=5)]: Done 712 tasks | elapsed: 48.4min\n", 720 | "[Parallel(n_jobs=5)]: Done 751 tasks | elapsed: 51.6min\n", 721 | "[Parallel(n_jobs=5)]: Done 790 tasks | elapsed: 53.9min\n", 722 | "[Parallel(n_jobs=5)]: Done 831 tasks | elapsed: 56.8min\n", 723 | "[Parallel(n_jobs=5)]: Done 872 tasks | elapsed: 58.9min\n", 724 | "[Parallel(n_jobs=5)]: Done 915 tasks | elapsed: 60.9min\n", 725 | "[Parallel(n_jobs=5)]: Done 958 tasks | elapsed: 63.5min\n", 726 | "[Parallel(n_jobs=5)]: Done 1003 tasks | elapsed: 65.9min\n", 727 | "[Parallel(n_jobs=5)]: Done 1048 tasks | elapsed: 69.4min\n", 728 | "[Parallel(n_jobs=5)]: Done 1080 out of 1080 | elapsed: 71.0min finished\n" 729 | ], 730 | "name": "stderr" 731 | }, 732 | { 733 | "output_type": "stream", 734 | "text": [ 735 | "\n", 736 | "Best parameters set:\n", 737 | "[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 738 | " dtype=, 
encoding='utf-8',\n", 739 | " input='content', lowercase=True, max_df=0.5, max_features=None,\n", 740 | " min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,\n", 741 | " smooth_idf=True,\n", 742 | " stop_words={'a', 'about', 'above', 'after', 'again', 'against',\n", 743 | " 'ain', 'all', 'am', 'an', 'and', 'any', 'are',\n", 744 | " 'aren', \"aren't\", 'as', 'at', 'be', 'because',\n", 745 | " 'been', 'before', 'being', 'below', 'between',\n", 746 | " 'both', 'but', 'by', 'can', 'couldn', \"couldn't\", ...},\n", 747 | " strip_accents=None, sublinear_tf=False,\n", 748 | " token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n", 749 | " vocabulary=None)), ('clf', KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',\n", 750 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 751 | " weights='distance'))]\n", 752 | "\n", 753 | "Mean train scores: [0.49281024 0.40116321 0.38039959 0.58334489 0.5275787 0.51279092\n", 754 | " 0.58334489 0.5275787 0.51279092 0.99723591 0.99770228 0.99770228\n", 755 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 756 | " 0.55530321 0.47592492 0.4580474 0.76823683 0.7784613 0.77446107\n", 757 | " 0.76823683 0.7784613 0.77446107 0.99723591 0.99770228 0.99770228\n", 758 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 759 | " 0.30843316 0.22113406 0.20106167 0.63842024 0.65954501 0.65764994\n", 760 | " 0.63842024 0.65954501 0.65764994 0.99723591 0.99770228 0.99770228\n", 761 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 762 | " 0.32362291 0.29484388 0.27835 0.68953662 0.71320175 0.71269051\n", 763 | " 0.68953662 0.71320175 0.71269051 0.99723591 0.99770228 0.99770228\n", 764 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 765 | " 0.49281024 0.40116321 0.38039959 0.58334489 0.5275787 0.51279092\n", 766 | " 0.58334489 0.5275787 0.51279092 0.99723591 0.99770228 0.99770228\n", 767 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 768 | " 0.55530321 0.47592492 0.4580474 0.76823683 0.7784613 0.77446107\n", 769 | " 0.76823683 0.7784613 0.77446107 0.99723591 0.99770228 0.99770228\n", 770 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 771 | " 0.30843316 0.22113406 0.20106167 0.63842024 0.65954501 0.65764994\n", 772 | " 0.63842024 0.65954501 0.65764994 0.99723591 0.99770228 0.99770228\n", 773 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 774 | " 0.32362291 0.29484388 0.27835 0.68953662 0.71320175 0.71269051\n", 775 | " 0.68953662 0.71320175 0.71269051 0.99723591 0.99770228 0.99770228\n", 776 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 777 | " 0.49281024 0.40116321 0.38039959 0.58334489 0.5275787 0.51279092\n", 778 | " 0.58334489 0.5275787 0.51279092 0.99723591 0.99770228 0.99770228\n", 779 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 780 | " 0.55530321 0.47592492 0.4580474 0.76823683 0.7784613 0.77446107\n", 781 | " 0.76823683 0.7784613 0.77446107 0.99723591 0.99770228 0.99770228\n", 782 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 783 | " 0.30843316 0.22113406 0.20106167 0.63842024 0.65954501 0.65764994\n", 784 | " 0.63842024 0.65954501 0.65764994 0.99723591 0.99770228 0.99770228\n", 785 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069\n", 786 | " 0.32362291 0.29484388 0.27835 0.68953662 0.71320175 0.71269051\n", 787 | " 0.68953662 
0.71320175 0.71269051 0.99723591 0.99770228 0.99770228\n", 788 | " 0.99756458 0.99784069 0.99784069 0.99756458 0.99784069 0.99784069]\n", 789 | "Mean validation scores: [0.30406416 0.23381372 0.22288764 0.3967972 0.36270829 0.36003685\n", 790 | " 0.3967972 0.36270829 0.36003685 0.45169094 0.38798096 0.37647288\n", 791 | " 0.40040163 0.35716065 0.34756693 0.40040163 0.35716065 0.34756693\n", 792 | " 0.32624954 0.25174417 0.23854894 0.59356057 0.61609346 0.61690853\n", 793 | " 0.59356057 0.61609346 0.61690853 0.37312724 0.30953969 0.30109056\n", 794 | " 0.61052402 0.62934277 0.62837834 0.61052402 0.62934277 0.62837834\n", 795 | " 0.23244425 0.1650914 0.15395218 0.54394996 0.57129111 0.57013033\n", 796 | " 0.54394996 0.57129111 0.57013033 0.36275886 0.29559062 0.28828532\n", 797 | " 0.58793125 0.6031649 0.59880192 0.58793125 0.6031649 0.59880192\n", 798 | " 0.23735064 0.22100986 0.21540526 0.59720931 0.62622858 0.62665629\n", 799 | " 0.59720931 0.62622858 0.62665629 0.30530561 0.30178386 0.29927714\n", 800 | " 0.61758008 0.63926302 0.64036329 0.61758008 0.63926302 0.64036329\n", 801 | " 0.30406416 0.23381372 0.22288764 0.3967972 0.36270829 0.36003685\n", 802 | " 0.3967972 0.36270829 0.36003685 0.45169094 0.38798096 0.37647288\n", 803 | " 0.40040163 0.35716065 0.34756693 0.40040163 0.35716065 0.34756693\n", 804 | " 0.32624954 0.25174417 0.23854894 0.59356057 0.61609346 0.61690853\n", 805 | " 0.59356057 0.61609346 0.61690853 0.37312724 0.30953969 0.30109056\n", 806 | " 0.61052402 0.62934277 0.62837834 0.61052402 0.62934277 0.62837834\n", 807 | " 0.23244425 0.1650914 0.15395218 0.54394996 0.57129111 0.57013033\n", 808 | " 0.54394996 0.57129111 0.57013033 0.36275886 0.29559062 0.28828532\n", 809 | " 0.58793125 0.6031649 0.59880192 0.58793125 0.6031649 0.59880192\n", 810 | " 0.23735064 0.22100986 0.21540526 0.59720931 0.62622858 0.62665629\n", 811 | " 0.59720931 0.62622858 0.62665629 0.30530561 0.30178386 0.29927714\n", 812 | " 0.61758008 0.63926302 0.64036329 0.61758008 0.63926302 0.64036329\n", 813 | " 0.30406416 0.23381372 0.22288764 0.3967972 0.36270829 0.36003685\n", 814 | " 0.3967972 0.36270829 0.36003685 0.45169094 0.38798096 0.37647288\n", 815 | " 0.40040163 0.35716065 0.34756693 0.40040163 0.35716065 0.34756693\n", 816 | " 0.32624954 0.25174417 0.23854894 0.59356057 0.61609346 0.61690853\n", 817 | " 0.59356057 0.61609346 0.61690853 0.37312724 0.30953969 0.30109056\n", 818 | " 0.61052402 0.62934277 0.62837834 0.61052402 0.62934277 0.62837834\n", 819 | " 0.23244425 0.1650914 0.15395218 0.54394996 0.57129111 0.57013033\n", 820 | " 0.54394996 0.57129111 0.57013033 0.36275886 0.29559062 0.28828532\n", 821 | " 0.58793125 0.6031649 0.59880192 0.58793125 0.6031649 0.59880192\n", 822 | " 0.23735064 0.22100986 0.21540526 0.59720931 0.62622858 0.62665629\n", 823 | " 0.59720931 0.62622858 0.62665629 0.30530561 0.30178386 0.29927714\n", 824 | " 0.61758008 0.63926302 0.64036329 0.61758008 0.63926302 0.64036329]\n" 825 | ], 826 | "name": "stdout" 827 | }, 828 | { 829 | "output_type": "stream", 830 | "text": [ 831 | "/usr/local/lib/python3.6/dist-packages/sklearn/neighbors/base.py:216: UserWarning: cannot use tree with sparse input: using brute force\n", 832 | " warnings.warn(\"cannot use tree with sparse input: \"\n" 833 | ], 834 | "name": "stderr" 835 | } 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "metadata": { 841 | "id": "Xe1XdVYjItmO", 842 | "colab_type": "code", 843 | "colab": {} 844 | }, 845 | "source": [ 846 | "" 847 | ], 848 | "execution_count": 0, 849 | "outputs": [] 850 | } 851 
| ] 852 | } -------------------------------------------------------------------------------- /experiments/traditional_ml_cross_entropy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "traditional_ml_cross_entropy.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "cB6aDzb72Lz3", 32 | "colab_type": "code", 33 | "colab": {} 34 | }, 35 | "source": [ 36 | "#!python -m spacy download en_core_web_lg" 37 | ], 38 | "execution_count": 0, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "h6p8vLIwha9i", 45 | "colab_type": "code", 46 | "outputId": "193e032e-a5bc-4f2f-da8c-ddc40500b78b", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/", 49 | "height": 173 50 | } 51 | }, 52 | "source": [ 53 | "import numpy as np\n", 54 | "import pandas as pd \n", 55 | "import string\n", 56 | "import os\n", 57 | "import re\n", 58 | "\n", 59 | "### SKLEARN ###\n", 60 | "from sklearn.pipeline import Pipeline\n", 61 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 62 | "from sklearn.multiclass import OneVsRestClassifier\n", 63 | "from sklearn.naive_bayes import MultinomialNB\n", 64 | "from sklearn.svm import LinearSVC\n", 65 | "from sklearn.linear_model import LogisticRegression\n", 66 | "from sklearn.neighbors import KNeighborsClassifier\n", 67 | "from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score\n", 68 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 69 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 70 | "\n", 71 | "### NLTK ###\n", 72 | "import nltk\n", 73 | "nltk.download('stopwords')\n", 74 | "nltk.download('punkt')\n", 75 | "nltk.download('averaged_perceptron_tagger')\n", 76 | "nltk.download('wordnet')\n", 77 | "from nltk.corpus import stopwords as sw\n", 78 | "from nltk.corpus import wordnet as wn\n", 79 | "from nltk.tokenize import word_tokenize\n", 80 | "from nltk import WordNetLemmatizer\n", 81 | "from nltk import pos_tag\n", 82 | "\n", 83 | "\n", 84 | "### SPACY ###\n", 85 | "#import spacy\n", 86 | "#spacy_stop_words = spacy.lang.en.stop_words.STOP_WORDS\n", 87 | "#nlp = spacy.load('en_core_web_lg', disable=['ner', 'parser'])\n", 88 | "\n", 89 | "from joblib import dump, load\n", 90 | "from pathlib import Path" 91 | ], 92 | "execution_count": 1, 93 | "outputs": [ 94 | { 95 | "output_type": "stream", 96 | "text": [ 97 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 98 | "[nltk_data] Unzipping corpora/stopwords.zip.\n", 99 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 100 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 101 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n", 102 | "[nltk_data] /root/nltk_data...\n", 103 | "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n", 104 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 105 | "[nltk_data] Unzipping corpora/wordnet.zip.\n" 106 | ], 107 | "name": "stdout" 108 | } 109 | ] 110 | }, 111 | { 112 
| "cell_type": "code", 113 | "metadata": { 114 | "id": "XbXFaXGGhr21", 115 | "colab_type": "code", 116 | "outputId": "9459dbeb-0e65-48a8-d47f-f5148176f74c", 117 | "colab": { 118 | "base_uri": "https://localhost:8080/", 119 | "height": 124 120 | } 121 | }, 122 | "source": [ 123 | "from google.colab import drive\n", 124 | "drive.mount('/content/gdrive', force_remount=True)\n", 125 | "base_dir = \"gdrive/My Drive/fastai-v3/sdgs/\"\n", 126 | "\n", 127 | "CROSS_FOLDS = f\"{base_dir}dataset/cross_validation/\"\n", 128 | "OUTPUT_DIR = f\"{base_dir}traditional_ml/\"" 129 | ], 130 | "execution_count": 2, 131 | "outputs": [ 132 | { 133 | "output_type": "stream", 134 | "text": [ 135 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n", 136 | "\n", 137 | "Enter your authorization code:\n", 138 | "··········\n", 139 | "Mounted at /content/gdrive\n" 140 | ], 141 | "name": "stdout" 142 | } 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "id": "lq-Npyjn70jj", 149 | "colab_type": "text" 150 | }, 151 | "source": [ 152 | "# Preprocessors" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "metadata": { 158 | "id": "gwDFmhd99uVz", 159 | "colab_type": "code", 160 | "colab": {} 161 | }, 162 | "source": [ 163 | "class SpacyPreprocessor(BaseEstimator, TransformerMixin):\n", 164 | "\n", 165 | " def __init__(self):\n", 166 | " self.stopwords = spacy_stop_words\n", 167 | " self.punct = set(string.punctuation)\n", 168 | "\n", 169 | " def fit(self, X, y=None):\n", 170 | " return self\n", 171 | "\n", 172 | " def inverse_transform(self, X):\n", 173 | " return [\" \".join(doc) for doc in X]\n", 174 | "\n", 175 | " def transform(self, X):\n", 176 | " return [\n", 177 | " list(self.tokenize(doc)) for doc in X\n", 178 | " ]\n", 179 | "\n", 180 | " def tokenize(self, document):\n", 181 | " for token in nlp(document):\n", 182 | "\n", 183 | " # Disregard stopwords\n", 184 | " if token in self.stopwords:\n", 185 | " continue\n", 186 | "\n", 187 | " # Disregard punctuation\n", 188 | " if all(char in self.punct for char in token.text):\n", 189 | " continue\n", 190 | "\n", 191 | " # yield lemmatized tokens\n", 192 | " yield token.lemma_" 193 | ], 194 | "execution_count": 0, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "id": "xoc981iI6psQ", 201 | "colab_type": "code", 202 | "colab": {} 203 | }, 204 | "source": [ 205 | "class NLTKPreprocessor(BaseEstimator, TransformerMixin):\n", 206 | "\n", 207 | " def __init__(self, stopwords=None, punct=None,\n", 208 | " lower=True, strip=True):\n", 209 | " self.stopwords = set(sw.words('english'))\n", 210 | " self.punct = set(string.punctuation)\n", 211 | " self.lemmatizer = WordNetLemmatizer()\n", 212 | "\n", 213 | " def fit(self, X, y=None):\n", 214 | " return self\n", 215 | "\n", 216 | " def inverse_transform(self, X):\n", 217 | " return [\" \".join(doc) for doc in X]\n", 218 | "\n", 219 | " def transform(self, X):\n", 220 | " return [\n", 221 | " list(self.tokenize(doc)) for doc in X\n", 222 | " ]\n", 223 | "\n", 224 | " def tokenize(self, document):\n", 225 | " for token, tag in 
pos_tag(word_tokenize(document)):\n", 226 | " token = token.lower()\n", 227 | " token = token.strip()\n", 228 | " token = token.strip('_')\n", 229 | " token = token.strip('*')\n", 230 | "\n", 231 | " # Disregard stopwords\n", 232 | " if token in self.stopwords:\n", 233 | " continue\n", 234 | "\n", 235 | " # Disregard punctuation\n", 236 | " if all(char in self.punct for char in token):\n", 237 | " continue\n", 238 | "\n", 239 | " # yield lemmatized tokens\n", 240 | " lemma = self.lemmatize(token, tag)\n", 241 | " yield lemma\n", 242 | "\n", 243 | " def lemmatize(self, token, tag):\n", 244 | " tag = {\n", 245 | " 'N': wn.NOUN,\n", 246 | " 'V': wn.VERB,\n", 247 | " 'R': wn.ADV,\n", 248 | " 'J': wn.ADJ\n", 249 | " }.get(tag[0], wn.NOUN)\n", 250 | " return self.lemmatizer.lemmatize(token, tag)" 251 | ], 252 | "execution_count": 0, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "metadata": { 258 | "id": "FGWKkATC9Wav", 259 | "colab_type": "code", 260 | "colab": {} 261 | }, 262 | "source": [ 263 | "def identity(arg):\n", 264 | " \"\"\"\n", 265 | " Simple identity function works as a passthrough.\n", 266 | " \"\"\"\n", 267 | " return arg" 268 | ], 269 | "execution_count": 0, 270 | "outputs": [] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "id": "sAWjeu2n7rNU", 276 | "colab_type": "text" 277 | }, 278 | "source": [ 279 | "# Pipeline with preprocessor, vectorizer and model" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "hDT5rTaHV9jU", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "def run_classifier(train_x, train_y, arch, preprocessor=NLTKPreprocessor()):\n", 291 | " if arch == 'svm':\n", 292 | " clf = OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True,\n", 293 | " fit_intercept=True, intercept_scaling=1,\n", 294 | " loss='squared_hinge', max_iter=1000,\n", 295 | " multi_class='ovr', penalty='l2',\n", 296 | " random_state=None, tol=0.0001,\n", 297 | " verbose=0))\n", 298 | "\n", 299 | " word_vectorizer = TfidfVectorizer(binary=False, decode_error='strict',\n", 300 | " encoding='utf-8', dtype=np.float64,\n", 301 | " input='content', lowercase=False, max_df=0.25, max_features=None,\n", 302 | " min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,\n", 303 | " smooth_idf=True,\n", 304 | " stop_words=None,\n", 305 | " strip_accents=None, sublinear_tf=False,\n", 306 | " tokenizer=identity, use_idf=True,\n", 307 | " vocabulary=None) \n", 308 | " \n", 309 | "\n", 310 | " elif arch == 'nb':\n", 311 | " clf = OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,\n", 312 | " fit_prior=True))\n", 313 | "\n", 314 | " word_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 315 | " dtype=np.float64, encoding='utf-8',\n", 316 | " input='content', lowercase=False, max_df=0.25, max_features=None,\n", 317 | " min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,\n", 318 | " smooth_idf=True,\n", 319 | " stop_words=None,\n", 320 | " strip_accents=None, sublinear_tf=False,\n", 321 | " tokenizer=identity, use_idf=True,\n", 322 | " vocabulary=None)\n", 323 | " \n", 324 | " \n", 325 | " elif arch == 'lg':\n", 326 | " clf = OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanced',\n", 327 | " dual=False, fit_intercept=True,\n", 328 | " intercept_scaling=1,\n", 329 | " l1_ratio=None, max_iter=4000,\n", 330 | " multi_class='ovr',\n", 331 | " n_jobs=None, penalty='l2',\n", 332 | " 
random_state=None,\n", 333 | " solver='sag', tol=0.0001,\n", 334 | " verbose=0, warm_start=False),\n", 335 | " n_jobs=None)\n", 336 | "\n", 337 | " word_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 338 | " dtype=np.float64, encoding='utf-8',\n", 339 | " input='content', lowercase=False, max_df=0.25, max_features=None,\n", 340 | " min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,\n", 341 | " smooth_idf=True,\n", 342 | " stop_words=None,\n", 343 | " strip_accents=None, sublinear_tf=False,\n", 344 | " tokenizer=identity, use_idf=True,\n", 345 | " vocabulary=None)\n", 346 | " \n", 347 | " \n", 348 | " elif arch == 'knn':\n", 349 | " clf = KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',\n", 350 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 351 | " weights='distance')\n", 352 | " word_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 353 | " dtype=np.float64, encoding='utf-8',\n", 354 | " input='content', lowercase=False, max_df=0.5, max_features=None,\n", 355 | " min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,\n", 356 | " stop_words=None,\n", 357 | " strip_accents=None, sublinear_tf=False,\n", 358 | " tokenizer=identity, use_idf=True,\n", 359 | " vocabulary=None) \n", 360 | " \n", 361 | "\n", 362 | " pipe = Pipeline([('preprocessor', preprocessor), ('tfidf', word_vectorizer), ('multilabel', clf)])\n", 363 | " pipe.fit(train_x, train_y)\n", 364 | " return pipe" 365 | ], 366 | "execution_count": 0, 367 | "outputs": [] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "EmwDCosdpw6Q", 373 | "colab_type": "text" 374 | }, 375 | "source": [ 376 | "# Train the models" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "metadata": { 382 | "id": "fMeghBn3zxg4", 383 | "colab_type": "code", 384 | "colab": {} 385 | }, 386 | "source": [ 387 | "df = pd.read_csv(base_dir + \"dataset/cleanup_labelled.csv\")\n", 388 | "df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 389 | "\n", 390 | "### Mask labels in text: replace all labels by SDGLABEL\n", 391 | "pattern = r\"(indicator)(\\s+\\d+\\.[\\d+a-d]\\.\\d+)|(target)(\\s+\\d+\\.[\\d+a-d])|(sdgs|sdg|goals|goal)\\W*\\s+(,?\\s*\\b\\d{1,2}\\b[and\\s\\b\\d{1,2}\\b]*)\"\n", 392 | "masked_df = df.text.str.replace(pattern, ' SDGLABEL ', regex=True, flags=re.IGNORECASE)\n", 393 | "# Remove double spaces\n", 394 | "masked_df = pd.DataFrame(masked_df.str.replace(' ', ' ', regex=True, flags=re.IGNORECASE))" 395 | ], 396 | "execution_count": 0, 397 | "outputs": [] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "metadata": { 402 | "colab_type": "code", 403 | "id": "KlAa0AW0kz-q", 404 | "colab": {} 405 | }, 406 | "source": [ 407 | "mlb = MultiLabelBinarizer()\n", 408 | "\n", 409 | "non_masked_x = df[['text']].values \n", 410 | "masked_x = masked_df[['text']].values\n", 411 | "y = mlb.fit_transform(df.labels)\n", 412 | "\n", 413 | "labels = [str(i) for i in range(1,18)]" 414 | ], 415 | "execution_count": 0, 416 | "outputs": [] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "metadata": { 421 | "colab_type": "code", 422 | "id": "EIY5pZzCMNLt", 423 | "colab": {} 424 | }, 425 | "source": [ 426 | "\"\"\"\n", 427 | "archs = ['svm', 'lg', 'knn', 'nb']\n", 428 | "\n", 429 | "for arch in archs:\n", 430 | " print(f\"Processing {arch}...\")\n", 431 | " for fold in os.listdir(CROSS_FOLDS):\n", 432 | " print(fold)\n", 433 | "\n", 434 | " # Load predefined indices for train, val 
and test\n", 435 | " train_index = np.load(f\"{CROSS_FOLDS}{fold}/train.npy\")\n", 436 | "\n", 437 | " \n", 438 | " # Load train text\n", 439 | " x_train = [t[0] for t in non_masked_x[train_index].tolist()]\n", 440 | "\n", 441 | " # Load train labels\n", 442 | " y_train = y[train_index]\n", 443 | "\n", 444 | " # Fit model on fold data\n", 445 | " model = run_classifier(x_train, y_train, arch=arch)\n", 446 | "\n", 447 | " # Save model\n", 448 | " save_dir = Path(f\"{OUTPUT_DIR}{arch}{mask}/\")\n", 449 | " save_dir.mkdir(exist_ok=True)\n", 450 | " file_dir = save_dir/f\"{arch}_{fold}.joblib\"\n", 451 | " dump(model, file_dir)\n", 452 | " print(f\"Finished training {arch}.\")\n", 453 | "\"\"\"" 454 | ], 455 | "execution_count": 0, 456 | "outputs": [] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": { 461 | "id": "3dshrJceYb1O", 462 | "colab_type": "text" 463 | }, 464 | "source": [ 465 | "# Evaluate on test" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "metadata": { 471 | "id": "bwmb4EOCI-Gt", 472 | "colab_type": "code", 473 | "colab": {} 474 | }, 475 | "source": [ 476 | "def metrics_avg(models_testx_testy, labels_):\n", 477 | " def calc(model, test_x, test_y):\n", 478 | " predictions = model.predict(test_x)\n", 479 | " metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)\n", 480 | " metrics_df = pd.DataFrame.from_dict(metrics)\n", 481 | " h = hamming_loss(test_y, predictions)\n", 482 | " roc = roc_auc_score(test_y, predictions, average='micro')\n", 483 | " return metrics_df, h, roc\n", 484 | " \n", 485 | " model_first, test_x_first, test_y_first = models_testx_testy[0]\n", 486 | " metrics_agg, ham, roc = calc(model_first, test_x_first, test_y_first)\n", 487 | " n = len(models_testx_testy)\n", 488 | " \n", 489 | " for model, test_x, test_y in models_testx_testy[1:]:\n", 490 | " metrics, h, r = calc(model, test_x, test_y)\n", 491 | " metrics_agg += metrics\n", 492 | " ham += h\n", 493 | " roc += r\n", 494 | "\n", 495 | " return metrics_agg/n, ham/n, roc/n" 496 | ], 497 | "execution_count": 0, 498 | "outputs": [] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "id": "jlm6H2vmYEWD", 504 | "colab_type": "text" 505 | }, 506 | "source": [ 507 | "# Load and evaluate saved models on test data" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "ywBE6Hdu4bQ4", 514 | "colab_type": "code", 515 | "colab": {} 516 | }, 517 | "source": [ 518 | "archs = [\"nb\", \"lg\", \"svm\", \"knn\"]\n", 519 | "\n", 520 | "#mask = \"\"\n", 521 | "mask = \"_masked/\"\n", 522 | "hr_restuls = {}\n", 523 | "\n", 524 | "for arch in archs:\n", 525 | " results_dir = Path(f\"{OUTPUT_DIR}{arch}{mask}/\")\n", 526 | " loaded_models = []\n", 527 | " for fold in os.listdir(CROSS_FOLDS):\n", 528 | " print(f\"Loading {fold} of {arch}\")\n", 529 | " test_index = np.load(f\"{CROSS_FOLDS}{fold}/test.npy\")\n", 530 | "\n", 531 | " # We used the untouched test for both masked and unmasked\n", 532 | " x_test = [t[0] for t in non_masked_x[test_index].tolist()]\n", 533 | "\n", 534 | "\n", 535 | " y_test = y[test_index]\n", 536 | "\n", 537 | " load_dir = Path(f\"{OUTPUT_DIR}{arch}{mask}\")\n", 538 | " load_dir = load_dir/f\"{arch}_{fold}.joblib\"\n", 539 | "\n", 540 | " loaded_model = load(load_dir)\n", 541 | " loaded_models.append((loaded_model, x_test, y_test))\n", 542 | " print(f\"Finished loading the {mask} {arch} models.\")\n", 543 | " \n", 544 | " print(f\"Evaluating {mask} {arch} models.\")\n", 545 | " 
loaded_results = metrics_avg(loaded_models, labels)\n", 546 | " loaded_results[0].to_csv(results_dir/f'results_{arch}.csv', sep=';')\n", 547 | " \n", 548 | " hlos = round(loaded_results[1],4)\n", 549 | " roc_auc = round(loaded_results[2],4)\n", 550 | " hr_restuls[arch] = {\n", 551 | " \"hl\": hlos,\n", 552 | " \"roc_auc\": roc_auc\n", 553 | " }\n", 554 | " \n", 555 | " print(f\"Finished evaluation of {mask} {arch} models.\")\n", 556 | " print()" 557 | ], 558 | "execution_count": 0, 559 | "outputs": [] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "metadata": { 564 | "id": "xjv6SHRBR5TE", 565 | "colab_type": "code", 566 | "colab": { 567 | "base_uri": "https://localhost:8080/", 568 | "height": 34 569 | }, 570 | "outputId": "75d32f39-634d-4ebe-a9aa-e06d90ed4418" 571 | }, 572 | "source": [ 573 | "hr_restuls['knn']" 574 | ], 575 | "execution_count": 24, 576 | "outputs": [ 577 | { 578 | "output_type": "execute_result", 579 | "data": { 580 | "text/plain": [ 581 | "{'hl': 0.0739, 'roc_auc': 0.5535}" 582 | ] 583 | }, 584 | "metadata": { 585 | "tags": [] 586 | }, 587 | "execution_count": 24 588 | } 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "metadata": { 594 | "id": "dBmK1Oi8BkZW", 595 | "colab_type": "code", 596 | "colab": {} 597 | }, 598 | "source": [ 599 | "" 600 | ], 601 | "execution_count": 0, 602 | "outputs": [] 603 | } 604 | ] 605 | } -------------------------------------------------------------------------------- /experiments/ulmfit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "ulmfit.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "language_info": { 13 | "codemirror_mode": { 14 | "name": "ipython", 15 | "version": 3 16 | }, 17 | "file_extension": ".py", 18 | "mimetype": "text/x-python", 19 | "name": "python", 20 | "nbconvert_exporter": "python", 21 | "pygments_lexer": "ipython3", 22 | "version": "3.7.3" 23 | }, 24 | "kernelspec": { 25 | "name": "python3", 26 | "display_name": "Python 3" 27 | }, 28 | "accelerator": "GPU" 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "JQU-evlhdMNu", 45 | "colab_type": "code", 46 | "colab": {} 47 | }, 48 | "source": [ 49 | "!pip install iterative-stratification" 50 | ], 51 | "execution_count": 0, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "colab_type": "code", 58 | "id": "GFXrTP1PCs0Y", 59 | "colab": {} 60 | }, 61 | "source": [ 62 | "from fastai.text import *\n", 63 | "from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n", 64 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 65 | "from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, hamming_loss, roc_auc_score\n", 66 | "from sklearn.metrics import classification_report\n", 67 | "import numpy as np" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "colab_type": "code", 76 | "id": "itVUcAwDCnCP", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "from google.colab import drive\n", 81 | "drive.mount('/content/gdrive', force_remount=True)\n", 82 | "base_dir = \"gdrive/My Drive/fastai-v3/sdgs/\"\n", 83 | "dataset_dir = 
base_dir + \"lstm/\"" 84 | ], 85 | "execution_count": 0, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "colab_type": "text", 92 | "id": "KiIJJ01XCwle" 93 | }, 94 | "source": [ 95 | "## Uploading the data" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "wS_rW9djL5nQ", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "source": [ 106 | "source_path = Path(dataset_dir + 'cleanup_labelled.csv')\n", 107 | "df = pd.read_csv(source_path)\n", 108 | "df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 109 | "df.head()" 110 | ], 111 | "execution_count": 0, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "id": "TM5R8MrpD9ja", 118 | "colab_type": "text" 119 | }, 120 | "source": [ 121 | "We first create a general language model with the unlabelled data, which we will later fit into the WikiText 103 language model, so that it has more information about SDGs." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "eeaPUkUiRla6", 128 | "colab_type": "code", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "#data_lm = TextLMDataBunch.from_csv(dataset_dir, 'cleanup_unlabelled.csv')" 133 | ], 134 | "execution_count": 0, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "mD9GZMqpRLNY", 141 | "colab_type": "code", 142 | "colab": {} 143 | }, 144 | "source": [ 145 | "#data_lm.show_batch()\n", 146 | "#data_lm.save('data_lm_export.pkl')\n", 147 | "path = Path(dataset_dir)\n", 148 | "data_lm = load_data(path, 'data_lm_export.pkl')" 149 | ], 150 | "execution_count": 0, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "F64WFx3bKiZq", 157 | "colab_type": "text" 158 | }, 159 | "source": [ 160 | "# Training the general language model" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "id": "yLwASThLN65j", 167 | "colab_type": "text" 168 | }, 169 | "source": [ 170 | "We train on the first layer of the language model with all the data and a pretrained language model from the WikiText 103. This is included in the architecture of AWD_LSTM." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "metadata": { 176 | "id": "aRQHabOwpmwy", 177 | "colab_type": "code", 178 | "colab": {} 179 | }, 180 | "source": [ 181 | "learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)" 182 | ], 183 | "execution_count": 0, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "metadata": { 189 | "id": "Ak-OSKYmGfbu", 190 | "colab_type": "code", 191 | "colab": {} 192 | }, 193 | "source": [ 194 | "learn.lr_find()\n", 195 | "learn.recorder.plot(skip_end=15)" 196 | ], 197 | "execution_count": 0, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "Jiyv5xGFAEYQ", 204 | "colab_type": "code", 205 | "colab": {} 206 | }, 207 | "source": [ 208 | "learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))" 209 | ], 210 | "execution_count": 0, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "Vvx2C2TlRVGz", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "source": [ 221 | "#learn.save('fit_head')" 222 | ], 223 | "execution_count": 0, 224 | "outputs": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "GLROGMRsRaK1", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "learn.load('fit_head')" 235 | ], 236 | "execution_count": 0, 237 | "outputs": [] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "metadata": { 242 | "id": "LcKIOKCERK1S", 243 | "colab_type": "code", 244 | "colab": {} 245 | }, 246 | "source": [ 247 | "learn.unfreeze()\n", 248 | "learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))" 249 | ], 250 | "execution_count": 0, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "HyHBz2yMPvOR", 257 | "colab_type": "code", 258 | "colab": {} 259 | }, 260 | "source": [ 261 | "#learn.save_encoder('fine_tuned_encoder')" 262 | ], 263 | "execution_count": 0, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "p5upJ_jFSEXi", 270 | "colab_type": "code", 271 | "colab": {} 272 | }, 273 | "source": [ 274 | "learn.load_encoder('fine_tuned_encoder')" 275 | ], 276 | "execution_count": 0, 277 | "outputs": [] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "metadata": { 282 | "id": "v8DNZHLecaMC", 283 | "colab_type": "code", 284 | "colab": {} 285 | }, 286 | "source": [ 287 | "learn.predict(\"African countries like\", n_words=20)" 288 | ], 289 | "execution_count": 0, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": { 295 | "id": "Ry9vK4WdPIi_", 296 | "colab_type": "text" 297 | }, 298 | "source": [ 299 | "# Training the classifier" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": { 305 | "id": "c_iAtHuKEP8P", 306 | "colab_type": "text" 307 | }, 308 | "source": [ 309 | "Then we load the language model for the classifier" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "metadata": { 315 | "id": "eC455fwCdxlA", 316 | "colab_type": "code", 317 | "colab": {} 318 | }, 319 | "source": [ 320 | "def train_classifier(path_, train, test, lm, kfold, process='train'):\n", 321 | " data_clas = TextClasDataBunch.from_df(path_, train_df=train, valid_df=test, vocab=lm.train_ds.vocab, text_cols='text', label_cols='labels', label_delim='|', bs=32)\n", 322 | " acc_02 = partial(accuracy_thresh, thresh=0.2)\n", 323 | " f_score = partial(fbeta, thresh=0.2)\n", 324 | "\n", 325 | " F1macro = partial(MultiLabelFbeta, average=\"macro\")\n", 326 | " F1micro = 
partial(MultiLabelFbeta, average=\"micro\")\n", 327 | " F1weighted = partial(MultiLabelFbeta, average=\"weighted\")\n", 328 | "\n", 329 | " learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[acc_02, f_score], callback_fns=[F1macro, F1micro, F1weighted])\n", 330 | " learn.load_encoder('ft_enc')\n", 331 | " learn.freeze()\n", 332 | " current_file = f'general_model_{kfold}'\n", 333 | " \n", 334 | " if process == 'train':\n", 335 | " # First\n", 336 | " learn.fit_one_cycle(1, 3e-2, moms=(0.8,0.7))\n", 337 | "\n", 338 | " # Second\n", 339 | " learn.freeze_to(-2)\n", 340 | " learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8, 0.7))\n", 341 | "\n", 342 | " # Third\n", 343 | " learn.freeze_to(-3)\n", 344 | " learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8, 0.7))\n", 345 | "\n", 346 | " # Fourth\n", 347 | " learn.unfreeze()\n", 348 | " learn.fit_one_cycle(10, slice(1e-3/(2.6**4),1e-3), moms=(0.8, 0.7))\n", 349 | "\n", 350 | " # Fifth\n", 351 | " learn.unfreeze()\n", 352 | " learn.fit_one_cycle(6, slice(1e-3/(2.6**4),1e-3), moms=(0.8, 0.7))\n", 353 | " learn.save(current_file)\n", 354 | " else:\n", 355 | " learn.load(current_file)\n", 356 | " return learn" 357 | ], 358 | "execution_count": 0, 359 | "outputs": [] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "metadata": { 364 | "id": "kdT8Tt6LiD5h", 365 | "colab_type": "code", 366 | "colab": {} 367 | }, 368 | "source": [ 369 | "def merge_dataset(x_train, x_test, y_train, y_test):\n", 370 | " # Merge text and unbinarized labels for train\n", 371 | " train_label = np.array([[\"|\".join(map(str, tr))] for tr in mlb.inverse_transform(y_train)])\n", 372 | " train = pd.DataFrame(np.hstack((x_train,train_label)))\n", 373 | " train.columns = ['text', 'labels']\n", 374 | " \n", 375 | " # Merge text and unbinarized labels for test\n", 376 | " test_label = np.array([[\"|\".join(map(str, tr))] for tr in mlb.inverse_transform(y_test)])\n", 377 | " test = pd.DataFrame(np.hstack((x_test,test_label)))\n", 378 | " test.columns = ['text', 'labels']\n", 379 | " \n", 380 | " return train, test" 381 | ], 382 | "execution_count": 0, 383 | "outputs": [] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "metadata": { 388 | "id": "B2bWOQjaEPMq", 389 | "colab_type": "code", 390 | "colab": {} 391 | }, 392 | "source": [ 393 | "mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)\n", 394 | "mlb = MultiLabelBinarizer()\n", 395 | "models = []\n", 396 | "x = df[['text']].values # text\n", 397 | "y = mlb.fit_transform(df.labels) # labels\n", 398 | "path = Path(dataset_dir)\n", 399 | "count = 0\n", 400 | "\n", 401 | "for train_index, test_index in mskf.split(x, y):\n", 402 | " count += 1\n", 403 | " print(f\"Fold no. 
{count}\")\n", 404 | " x_train, x_test = x[train_index], x[test_index]\n", 405 | " y_train, y_test = y[train_index], y[test_index]\n", 406 | " train_df, test_df = merge_dataset(x_train, x_test, y_train, y_test)\n", 407 | " model = train_classifier(path, train_df, test_df, data_lm, kfold=count, process='load')\n", 408 | " models.append(model)" 409 | ], 410 | "execution_count": 0, 411 | "outputs": [] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "metadata": { 416 | "id": "r6y_lxRqWwlS", 417 | "colab_type": "code", 418 | "colab": {} 419 | }, 420 | "source": [ 421 | "def metrics_avg(models, thres=0.3):\n", 422 | " labels_ = list(range(1,18))\n", 423 | " \n", 424 | " def calc(model):\n", 425 | " y_pred, y_true = model.get_preds()\n", 426 | " y_true = y_true.numpy()\n", 427 | " y_pred = y_pred.numpy()\n", 428 | " metrics = classification_report(y_true, y_pred>thres, target_names=labels_, output_dict=True)\n", 429 | " metrics_df = pd.DataFrame.from_dict(metrics)\n", 430 | " h = hamming_loss(y_true, y_pred>thres, labels=labels_)\n", 431 | " roc = roc_auc_score(y_true, y_pred>thres, average='micro')\n", 432 | " return metrics_df, h, roc\n", 433 | " \n", 434 | " metrics_agg, ham, roc = calc(models[0])\n", 435 | " n = len(models)\n", 436 | " for model in models[1:]:\n", 437 | " metrics, h, r = calc(models[0])\n", 438 | " metrics_agg += metrics\n", 439 | " ham += h\n", 440 | " roc += r\n", 441 | " \n", 442 | " return metrics_agg/n, ham/n, roc/n" 443 | ], 444 | "execution_count": 0, 445 | "outputs": [] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "metadata": { 450 | "id": "kbRkbekDq2R-", 451 | "colab_type": "code", 452 | "colab": {} 453 | }, 454 | "source": [ 455 | "averaged_results = metrics_avg(models)" 456 | ], 457 | "execution_count": 0, 458 | "outputs": [] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "metadata": { 463 | "id": "bUtzo8kLBC8B", 464 | "colab_type": "code", 465 | "colab": {} 466 | }, 467 | "source": [ 468 | "averaged_results[2]" 469 | ], 470 | "execution_count": 0, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "XliDW9AsAml8", 477 | "colab_type": "code", 478 | "colab": {} 479 | }, 480 | "source": [ 481 | "" 482 | ], 483 | "execution_count": 0, 484 | "outputs": [] 485 | } 486 | ] 487 | } -------------------------------------------------------------------------------- /experiments/word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "name": "python3", 7 | "display_name": "Python 3" 8 | }, 9 | "language_info": { 10 | "codemirror_mode": { 11 | "name": "ipython", 12 | "version": 3 13 | }, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "nbconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": "3.6.6" 20 | }, 21 | "colab": { 22 | "name": "word2vec.ipynb", 23 | "version": "0.3.2", 24 | "provenance": [], 25 | "collapsed_sections": [], 26 | "include_colab_link": true 27 | }, 28 | "accelerator": "GPU" 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 45 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 46 | "id": "yzkDHu1qhHIa", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 
| "source": [ 51 | "import tensorflow as tf\n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 55 | "from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score\n", 56 | "from keras import optimizers\n", 57 | "import os\n", 58 | "\n", 59 | "from keras.preprocessing.text import Tokenizer\n", 60 | "from keras.preprocessing.sequence import pad_sequences\n", 61 | "from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Flatten\n", 62 | "from keras.models import Model, Sequential\n", 63 | "from keras.initializers import Constant\n", 64 | "# Conv\n", 65 | "from keras.layers import Conv1D, MaxPooling1D, Embedding\n", 66 | "# LSTM\n", 67 | "from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, SpatialDropout1D, Bidirectional, GRU, LSTM\n", 68 | "from keras.layers.normalization import BatchNormalization\n", 69 | "from keras.optimizers import Adam\n", 70 | "\n", 71 | "import gensim\n", 72 | "from gensim.models import Word2Vec\n", 73 | "\n", 74 | "import nltk\n", 75 | "nltk.download('punkt')\n", 76 | "from nltk.tokenize import word_tokenize\n", 77 | "from collections import Counter\n", 78 | "from keras.models import load_model\n", 79 | "from keras.optimizers import Adam, RMSprop\n", 80 | "\n", 81 | "\n", 82 | "import re" 83 | ], 84 | "execution_count": 0, 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "metadata": { 90 | "id": "G6ld3ptbEqsA", 91 | "colab_type": "code", 92 | "colab": {} 93 | }, 94 | "source": [ 95 | "from google.colab import drive\n", 96 | "drive.mount('/content/gdrive', force_remount=True)\n", 97 | "base_dir = \"gdrive/My Drive/fastai-v3/sdgs/\"" 98 | ], 99 | "execution_count": 0, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "Pi63MqzME0Ji", 106 | "colab_type": "code", 107 | "colab": {} 108 | }, 109 | "source": [ 110 | "TEXT_DATA_DIR = f\"{base_dir}dataset/cleanup_labelled.csv\"\n", 111 | "EMBEDDINGS_DIR = f\"{base_dir}embeddings/word2vec/\"\n", 112 | "CROSS_FOLDS = f\"{base_dir}dataset/cross_validation/\"\n", 113 | "\n", 114 | "MAX_SEQUENCE_LENGTH = 500\n", 115 | "MAX_NUM_WORDS = 20000\n", 116 | "EMBEDDING_DIM = 300\n", 117 | "NUM_EPOCHS = 20\n", 118 | "BATCH_SIZE = 128\n", 119 | "labels_index = [str(i) for i in range(1,18)]" 120 | ], 121 | "execution_count": 0, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "k-DVSj0NhHIj", 128 | "colab_type": "code", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "df = pd.read_csv(TEXT_DATA_DIR)\n", 133 | "labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n", 134 | "\n", 135 | "### MASK\n", 136 | "pattern = r\"(indicator)(\\s+\\d+\\.[\\d+a-d]\\.\\d+)|(target)(\\s+\\d+\\.[\\d+a-d])|(sdgs|sdg|goals|goal)\\W*\\s+(,?\\s*\\b\\d{1,2}\\b[and\\s\\b\\d{1,2}\\b]*)\"\n", 137 | "masked_df = df.text.str.replace(pattern, ' SDGLABEL ', regex=True, flags=re.IGNORECASE)\n", 138 | "masked_df = pd.DataFrame(masked_df.str.replace(' ', ' ', regex=True, flags=re.IGNORECASE))\n", 139 | "\n", 140 | "\n", 141 | "vocab = Counter()\n", 142 | "\n", 143 | "\n", 144 | "\n", 145 | "# Masked for training and valid. 
This will be part of the vocab and index\n", 146 | "masked_texts = [word_tokenize(t.lower()) for t in masked_df.text]\n", 147 | "\n", 148 | "\n", 149 | "# Non masked for testing\n", 150 | "non_masked_texts = [word_tokenize(t.lower()) for t in df.text]\n", 151 | "\n", 152 | "\n", 153 | "# Same masked vocab, embeddings and index\n", 154 | "for text in texts:\n", 155 | " vocab.update(text) \n", 156 | "model = Word2Vec(masked_texts, size=EMBEDDING_DIM, window=5, min_count=5, workers=16, sg=0, negative=5)\n", 157 | "word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NUM_WORDS))}" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "GRbWnT5MAcfd", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "word_vectors = model.wv" 171 | ], 172 | "execution_count": 0, 173 | "outputs": [] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "metadata": { 178 | "id": "N7KLmdDHOvuz", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "source": [ 183 | "# Masked padded sequences for training\n", 184 | "masked_sequences = np.array([[word_index.get(t, 0) for t in text]\n", 185 | " for text in masked_texts])\n", 186 | "masked_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 187 | "\n", 188 | "\n", 189 | "# Non masked padded sequences for training\n", 190 | "non_masked_sequences = np.array([[word_index.get(t, 0) for t in text]\n", 191 | " for text in non_masked_texts])\n", 192 | "non_masked_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 193 | "\n", 194 | "\n", 195 | "mlb = MultiLabelBinarizer()\n", 196 | "labels = np.array(mlb.fit_transform(labels))" 197 | ], 198 | "execution_count": 0, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "metadata": { 204 | "id": "kLEjjQVeSvVn", 205 | "colab_type": "code", 206 | "colab": {} 207 | }, 208 | "source": [ 209 | "models = []\n", 210 | "arch = 'Conv1D_glorot_uniform'\n", 211 | "is_mask = \"masked\"\n", 212 | "\n", 213 | "for fold in os.listdir(CROSS_FOLDS):\n", 214 | " train_index = np.load(f\"{CROSS_FOLDS}{fold}/train.npy\")\n", 215 | " val_index = np.load(f\"{CROSS_FOLDS}{fold}/val.npy\")\n", 216 | " test_index = np.load(f\"{CROSS_FOLDS}{fold}/test.npy\")\n", 217 | "\n", 218 | " x_train, x_val, x_test = masked_data[train_index], masked_data[val_index], non_masked_data[test_index]\n", 219 | " y_train, y_val, y_test = labels[train_index], labels[val_index], labels[test_index]\n", 220 | " \n", 221 | " \n", 222 | " print(F\"Training {fold}\")\n", 223 | "\n", 224 | " print('Preparing embedding matrix.')\n", 225 | " # prepare embedding matrix\n", 226 | " num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n", 227 | " embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", 228 | " \n", 229 | " for word, i in word_index.items():\n", 230 | " if i > MAX_NUM_WORDS:\n", 231 | " continue\n", 232 | " try:\n", 233 | " embedding_vector = word_vectors[word]\n", 234 | " # words not found in embedding index will be all-zeros.\n", 235 | " embedding_matrix[i] = embedding_vector\n", 236 | " except:\n", 237 | " pass \n", 238 | " \n", 239 | " # load pre-trained word embeddings into an Embedding layer\n", 240 | " # note that we set trainable = False so as to keep the embeddings fixed\n", 241 | " embedding_layer = Embedding(num_words,\n", 242 | " EMBEDDING_DIM,\n", 243 | " embeddings_initializer=Constant(embedding_matrix),\n", 244 | " input_length=MAX_SEQUENCE_LENGTH,\n", 245 | " trainable=False)\n", 246 | 
"\n", 247 | " print('Training model.')\n", 248 | " sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", 249 | " embedded_sequences = embedding_layer(sequence_input)\n", 250 | "\n", 251 | " # 0.22\n", 252 | " if arch == 'conv': \n", 253 | " # 1D convnet with global maxpooling\n", 254 | " x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n", 255 | " x = MaxPooling1D(5)(x)\n", 256 | " x = Conv1D(128, 5, activation='relu')(x)\n", 257 | " x = MaxPooling1D(5)(x)\n", 258 | " x = Conv1D(128, 5, activation='relu')(x)\n", 259 | " x = GlobalMaxPooling1D()(x)\n", 260 | " x = Dense(128, activation='relu')(x)\n", 261 | " preds = Dense(len(labels_index), activation='sigmoid')(x)\n", 262 | " model = Model(sequence_input, preds)\n", 263 | " model.compile(loss='binary_crossentropy', \n", 264 | " optimizer=Adam(lr=0.01), \n", 265 | " metrics=['accuracy'])\n", 266 | " \n", 267 | " \n", 268 | " # 0.16, 8 epochs without Bidirectional\n", 269 | " # 0.15, 8 epochs with Bidirectional\n", 270 | " # 0.13, 10 epochs with Bidirectional\n", 271 | " if arch == \"bidirectionalGRU\":\n", 272 | " x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1,recurrent_dropout=0.1))(embedded_sequences)\n", 273 | " x = Conv1D(64, kernel_size=3, padding=\"valid\", kernel_initializer=\"glorot_uniform\")(x)\n", 274 | " avg_pool = GlobalAveragePooling1D()(x)\n", 275 | " max_pool = GlobalMaxPooling1D()(x)\n", 276 | " x = concatenate([avg_pool, max_pool])\n", 277 | " preds = Dense(17, activation=\"sigmoid\")(x)\n", 278 | " model = Model(sequence_input, preds)\n", 279 | " model.summary() \n", 280 | " model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])\n", 281 | " \n", 282 | " # around .21, 10 epochs with Bidirectional\n", 283 | " if arch == \"Bidirectional_LSTM\":\n", 284 | " x = Bidirectional(LSTM(25, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedded_sequences)\n", 285 | " x = GlobalMaxPooling1D()(x)\n", 286 | " x = Dense(50, activation=\"relu\")(x)\n", 287 | " x = Dropout(0.1)(x)\n", 288 | " x = Dense(17, activation=\"sigmoid\")(x)\n", 289 | " model = Model(inputs=sequence_input, outputs=x)\n", 290 | " model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])\n", 291 | " \n", 292 | " \n", 293 | " if arch == \"Conv1D_glorot_uniform\":\n", 294 | " x = Conv1D(64, kernel_size=3, padding=\"valid\", kernel_initializer=\"glorot_uniform\")(embedded_sequences)\n", 295 | " avg_pool = GlobalAveragePooling1D()(x)\n", 296 | " max_pool = GlobalMaxPooling1D()(x)\n", 297 | " x = concatenate([avg_pool, max_pool])\n", 298 | " preds = Dense(len(labels_index), activation='sigmoid')(x)\n", 299 | " model = Model(sequence_input, preds)\n", 300 | " model.compile(loss='binary_crossentropy', \n", 301 | " #optimizer=Adam(lr=0.001),\n", 302 | " optimizer='rmsprop',\n", 303 | " metrics=['accuracy'])\n", 304 | " \n", 305 | " model.fit(x_train, y_train,\n", 306 | " batch_size=BATCH_SIZE,\n", 307 | " epochs=NUM_EPOCHS,\n", 308 | " validation_data=(x_val, y_val))\n", 309 | "\n", 310 | " models.append([model, x_test, y_test])\n", 311 | " #model.save(EMBEDDINGS_DIR + f\"{is_mask}{arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5\")" 312 | ], 313 | "execution_count": 0, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": { 319 | "id": "pPuk9pBg4yys", 320 | "colab_type": "text" 321 | }, 322 | "source": [ 323 | "# Load and evaluate folds on test" 324 | ] 325 | }, 326 | { 327 | "cell_type": 
"code", 328 | "metadata": { 329 | "id": "Ip3GMV8X03J_", 330 | "colab_type": "code", 331 | "colab": {} 332 | }, 333 | "source": [ 334 | "def metrics_avg(models_testx_testy, labels_, thres=0.3):\n", 335 | " def calc(model, test_x, test_y):\n", 336 | " predictions = model.predict(test_x)>thres\n", 337 | " metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)\n", 338 | " metrics_df = pd.DataFrame.from_dict(metrics)\n", 339 | " h = hamming_loss(test_y, predictions)\n", 340 | " roc = roc_auc_score(test_y, predictions, average='micro')\n", 341 | " return metrics_df, h, roc\n", 342 | "\n", 343 | " model_1, test_x_first, test_y_first = models_testx_testy[0]\n", 344 | " metrics_agg, ham, roc = calc(model_1, test_x_first, test_y_first)\n", 345 | " n = len(models_testx_testy)\n", 346 | "\n", 347 | " for model, test_x, test_y in models_testx_testy[1:]:\n", 348 | " metrics, h, r = calc(model, test_x, test_y)\n", 349 | " metrics_agg += metrics\n", 350 | " ham += h\n", 351 | " roc += r\n", 352 | "\n", 353 | " return metrics_agg/n, ham/n, roc/n" 354 | ], 355 | "execution_count": 0, 356 | "outputs": [] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "metadata": { 361 | "colab_type": "code", 362 | "id": "tjF_N8wBHPCR", 363 | "colab": {} 364 | }, 365 | "source": [ 366 | "loaded_arch = 'maskedConv1D_glorot_uniform'\n", 367 | "loaded_models = []\n", 368 | "\n", 369 | "for i, fold in enumerate(os.listdir(CROSS_FOLDS)):\n", 370 | " print(f\"Loading {fold}...\")\n", 371 | " test_index = np.load(f\"{CROSS_FOLDS}{fold}/test.npy\")\n", 372 | "\n", 373 | " x_test = data[test_index]\n", 374 | " y_test = labels[test_index]\n", 375 | " \n", 376 | " load_dir = EMBEDDINGS_DIR + f\"{loaded_arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5\"\n", 377 | " \n", 378 | " final_models.append((loaded_models[i], x_test, y_test))\n", 379 | "print(f\"Finished loading the {loaded_arch} models.\")" 380 | ], 381 | "execution_count": 0, 382 | "outputs": [] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "metadata": { 387 | "id": "k3_cIkni687i", 388 | "colab_type": "code", 389 | "colab": {} 390 | }, 391 | "source": [ 392 | "avg_results = metrics_avg(models, labels_index, thres=0.3)" 393 | ], 394 | "execution_count": 0, 395 | "outputs": [] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "metadata": { 400 | "id": "t37qTPuujw6P", 401 | "colab_type": "code", 402 | "colab": {} 403 | }, 404 | "source": [ 405 | "avg_results[0]" 406 | ], 407 | "execution_count": 0, 408 | "outputs": [] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "metadata": { 413 | "id": "bOSBRX5Iig7y", 414 | "colab_type": "code", 415 | "colab": {} 416 | }, 417 | "source": [ 418 | "avg_results[0].to_csv(EMBEDDINGS_DIR + f'{is_mask}results_{arch}.csv', sep=';')" 419 | ], 420 | "execution_count": 0, 421 | "outputs": [] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "metadata": { 426 | "id": "0cHrYWaEhHIv", 427 | "colab_type": "code", 428 | "colab": {} 429 | }, 430 | "source": [ 431 | "hl = round(avg_results[1],4)\n", 432 | "roc_auc = round(avg_results[2],4)\n", 433 | "print(f\"hl;{hl}\")\n", 434 | "print(f\"roc-auc;{roc_auc}\")" 435 | ], 436 | "execution_count": 0, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "zHCwDRXIkA93", 443 | "colab_type": "code", 444 | "colab": {} 445 | }, 446 | "source": [ 447 | "" 448 | ], 449 | "execution_count": 0, 450 | "outputs": [] 451 | } 452 | ] 453 | } 
-------------------------------------------------------------------------------- /label_extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vondersam/sdgs_text_classifier/f79111a5d261615e9a8a2fad445ec4f39ca916a3/label_extract/__init__.py -------------------------------------------------------------------------------- /label_extract/extract.py: -------------------------------------------------------------------------------- 1 | import json 2 | from utils.document import Document, extract_labels, merge_dicts 3 | from utils.file_utils import get_files, save_files 4 | import time 5 | from tqdm import tqdm 6 | from multiprocessing import Process, Queue 7 | 8 | 9 | 10 | if __name__ == '__main__': 11 | start = time.time() 12 | # Indicate the path to the files from where the texts 13 | # and labels need to be extracted. 14 | main_dir = '/Users/samuelrodriguezmedina/Documents/ir4sdgs/crawl_sdgs/' 15 | folders = [ 16 | 'word', 17 | 'other_html', 18 | 'pdf', 19 | 'extra_pdf', 20 | 'extra_word', 21 | 'downloads', 22 | 'downloadable_pdfs' 23 | ] 24 | files = get_files(main_dir, folders) 25 | final_labelled = {} 26 | final_unlabelled = {} 27 | q = Queue() 28 | 29 | for file in tqdm(files): 30 | p = Process(target=extract_labels, args=(doc, q,)) 31 | p.start() 32 | labelled, unlabelled = q.get() 33 | if labelled: 34 | final_labelled = merge_dicts(final_labelled, labelled) 35 | if unlabelled: 36 | final_unlabelled = {**final_unlabelled, **unlabelled} 37 | p.join() 38 | 39 | save_files(final_labelled, final_unlabelled) 40 | print(time.time()-start) 41 | -------------------------------------------------------------------------------- /label_extract/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vondersam/sdgs_text_classifier/f79111a5d261615e9a8a2fad445ec4f39ca916a3/label_extract/utils/__init__.py -------------------------------------------------------------------------------- /label_extract/utils/check_duplicates.py: -------------------------------------------------------------------------------- 1 | # dupFinder.py 2 | import os, sys 3 | import hashlib 4 | 5 | 6 | def findDup(parentFolder): 7 | # Dups in format {hash:[names]} 8 | dups = {} 9 | for dirName, subdirs, fileList in os.walk(parentFolder): 10 | print('Scanning %s...' % dirName) 11 | for filename in fileList: 12 | # Get the path to the file 13 | path = os.path.join(dirName, filename) 14 | # Calculate hash 15 | file_hash = hashfile(path) 16 | # Add or append the file path 17 | if file_hash in dups: 18 | dups[file_hash].append(path) 19 | else: 20 | dups[file_hash] = [path] 21 | return dups 22 | 23 | 24 | # Joins two dictionaries 25 | def joinDicts(dict1, dict2): 26 | for key in dict2.keys(): 27 | if key in dict1: 28 | dict1[key] = dict1[key] + dict2[key] 29 | else: 30 | dict1[key] = dict2[key] 31 | 32 | 33 | def hashfile(path, blocksize=65536): 34 | afile = open(path, 'rb') 35 | hasher = hashlib.md5() 36 | buf = afile.read(blocksize) 37 | while len(buf) > 0: 38 | hasher.update(buf) 39 | buf = afile.read(blocksize) 40 | afile.close() 41 | return hasher.hexdigest() 42 | 43 | 44 | def printResults(dict1): 45 | results = list(filter(lambda x: len(x) > 1, dict1.values())) 46 | if len(results) > 0: 47 | print('Duplicates Found:') 48 | print('The following files are identical. 
The name could differ, but the content is identical') 49 | print('___________________') 50 | for result in results: 51 | for subresult in result: 52 | os.remove(subresult) 53 | print('\t\t%s' % subresult) 54 | print('___________________') 55 | 56 | else: 57 | print('No duplicate files found.') 58 | 59 | 60 | if __name__ == '__main__': 61 | if len(sys.argv) > 1: 62 | dups = {} 63 | folders = sys.argv[1:] 64 | for i in folders: 65 | # Iterate the folders given 66 | if os.path.exists(i): 67 | # Find the duplicated files and append them to the dups 68 | joinDicts(dups, findDup(i)) 69 | else: 70 | print('%s is not a valid path, please verify' % i) 71 | sys.exit() 72 | printResults(dups) 73 | else: 74 | print('Usage: python dupFinder.py folder or python dupFinder.py folder1 folder2 folder3') -------------------------------------------------------------------------------- /label_extract/utils/convert.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | 4 | def get_dict(text, tags): 5 | result = { 6 | 'g_1': 0, 7 | 'g_2': 0, 8 | 'g_3': 0, 9 | 'g_4': 0, 10 | 'g_5': 0, 11 | 'g_6': 0, 12 | 'g_7': 0, 13 | 'g_8': 0, 14 | 'g_9': 0, 15 | 'g_10': 0, 16 | 'g_11': 0, 17 | 'g_12': 0, 18 | 'g_13': 0, 19 | 'g_14': 0, 20 | 'g_15': 0, 21 | 'g_16': 0, 22 | 'g_17': 0 23 | } 24 | result['text'] = text 25 | for tag in tags: 26 | try: 27 | result[tag] = 1 28 | except: 29 | pass 30 | return result 31 | 32 | 33 | def to_binary(file): 34 | with open(file, 'r') as fi: 35 | data = json.load(fi) 36 | fieldnames = ['text', 37 | 'g_1', 38 | 'g_2', 39 | 'g_3', 40 | 'g_4', 41 | 'g_5', 42 | 'g_6', 43 | 'g_7', 44 | 'g_8', 45 | 'g_9', 46 | 'g_10', 47 | 'g_11', 48 | 'g_12', 49 | 'g_13', 50 | 'g_14', 51 | 'g_15', 52 | 'g_16', 53 | 'g_17'] 54 | 55 | with open('final_final.csv', 'w') as fo: 56 | writer = csv.DictWriter(fo, fieldnames=fieldnames) 57 | writer.writeheader() 58 | 59 | for text, tags in data.items(): 60 | d = get_dict(text, tags['cats']) 61 | writer.writerow(d) 62 | 63 | 64 | path = '../label_extract/final_final.json' 65 | to_binary(path) -------------------------------------------------------------------------------- /label_extract/utils/document.py: -------------------------------------------------------------------------------- 1 | import os 2 | import PyPDF2 3 | from bs4 import BeautifulSoup 4 | import subprocess 5 | from docx import Document as Docx 6 | from utils.extract_utils import MAPPINGS, format_labels 7 | from utils.text import Text 8 | from langdetect import detect 9 | import slate3k 10 | import re 11 | 12 | 13 | 14 | class Document: 15 | """ 16 | Doc containing all paragraphs from .doc, .docx, .pdf 17 | """ 18 | def __init__(self, filepath, filename): 19 | self.paragraphs = [] 20 | self.name = filename 21 | self.filename_label = [] 22 | self.from_any(filepath) 23 | self.doc_label_from(filename) 24 | 25 | def get_filename_label(self): 26 | return self.filename_label 27 | 28 | def doc_label_from(self, filename): 29 | # If 'goal|SDG no.' 
found in filename, process all doc with that label 30 | pattern = r'(sdg|goal)\W?((?:[1-9]\b|1[0-7]?\b))' 31 | match = re.search(pattern, filename, flags=re.IGNORECASE) 32 | if match: 33 | self.filename_label = [int(match.group(2))] 34 | 35 | def from_any(self, filepath): 36 | base = os.path.basename(filepath) 37 | extension = os.path.splitext(base)[1].lower() 38 | if extension == '.doc': 39 | self.from_word(filepath) 40 | elif extension == '.pdf': 41 | self.from_pdf(filepath) 42 | elif extension == '.html': 43 | self.from_html(filepath) 44 | elif extension == '.txt': 45 | self.from_txt(filepath) 46 | else: 47 | print(f"File with extension {extension} not read.") 48 | 49 | def from_word(self, file): 50 | try: 51 | # Docx 52 | paragraphs = Docx(file).paragraphs 53 | for paragraphs in paragraphs: 54 | if detect(paragraph) == 'en': 55 | self.paragraphs.append(Text(paragraphs)) 56 | except: 57 | try: 58 | # Doc 59 | string = subprocess.check_output(['antiword', '-t', file]) 60 | # Decode and split by paragraphs 61 | extracted_list = string.decode('utf-8').split('\n\n') 62 | for paragraph in extracted_list: 63 | if detect(paragraph) == 'en': 64 | self.paragraphs.append(Text(paragraph)) 65 | except: 66 | # If Antiword does not work, convert to txt 67 | subprocess.run(['textutil', '-convert', 'txt', file]) 68 | file = file.replace('.doc', '.txt') 69 | with open(file, 'r') as f: 70 | data = f.readlines() 71 | for paragraph in data: 72 | if detect(paragraph) == 'en': 73 | self.paragraphs.append(Text(paragraph)) 74 | 75 | def from_pdf(self, file): 76 | try: 77 | with open(file, 'rb') as fi: 78 | doc = slate3k.PDF(fi, word_margin=0) 79 | for i in range(len(doc)): 80 | string = doc[i] 81 | extracted_list = string.split('. \n') 82 | for line in extracted_list: 83 | if detect(line) == 'en': 84 | self.paragraphs.append(Text(line)) 85 | except: 86 | pass 87 | 88 | def from_html(self, file): 89 | with open(file, 'rb') as f: 90 | soup = BeautifulSoup(f, 'html.parser') 91 | # Strip out any code from the text 92 | for script in soup(["script", "style"]): 93 | script.decompose() 94 | for paragraph in soup.stripped_strings: 95 | try: 96 | if detect(paragraph) == 'en': 97 | self.paragraphs.append(Text(paragraph)) 98 | except: 99 | pass 100 | 101 | def from_txt(self, file): 102 | with open(file, 'r') as f: 103 | data = f.readlines() 104 | for paragraph in data: 105 | if detect(paragraph) == 'en': 106 | self.paragraphs.append(Text(paragraph)) 107 | 108 | 109 | def extract_labels(doc, q): 110 | labelled = {} 111 | unlabelled = {} 112 | patterns = [ 113 | MAPPINGS['g'] + r"\W*\s+(,?\s*\b\d{1,2}\b[and\s\b\d{1,2}\b]*)", 114 | MAPPINGS['t'] + r"(\s+\d+\.[\d+a-d])", 115 | MAPPINGS['i'] + r"(\s+\d+\.[\d+a-d]\.\d+)" 116 | ] 117 | 118 | for paragraph in doc.paragraphs: 119 | labels = [] 120 | goals = [] 121 | text = paragraph.text 122 | 123 | # To avoid extracting Millennium Goals 124 | if 'millennium' in text.lower(): 125 | pass 126 | elif ' mdg' in text.lower(): 127 | pass 128 | else: 129 | labelled_text = False 130 | for pattern in patterns: 131 | goals_extracted = re.findall(pattern, text, re.I) 132 | goals.extend(goals_extracted) 133 | text_labels = format_labels(goals) 134 | 135 | # Use labels from text if available 136 | if text_labels: 137 | if text not in labelled: 138 | labelled[text] = { 139 | 'cats': text_labels, 140 | 'doc_id': doc.name 141 | } 142 | else: 143 | labelled[text]['cats'].extend(text_labels) 144 | labelled[text]['cats'] = list(set(labelled[text]['cats'])) 145 | labelled_text = True 146 | 147 | # 
Use filename label is available and no labels found in text 148 | elif doc.filename_label: 149 | if text not in labelled: 150 | labelled[text] = { 151 | 'cats': doc.filename_label, 152 | 'doc_id': doc.name 153 | } 154 | else: 155 | labelled[text]['cats'].extend(text_labels) 156 | labelled[text]['cats'] = list(set(labelled[text]['cats'])) 157 | labelled_text = True 158 | 159 | if labelled_text == False: 160 | unlabelled[text] = None 161 | 162 | q.put((labelled, unlabelled)) 163 | 164 | 165 | def merge_dicts(old_dict, new_dict): 166 | for new_text, new_values in new_dict.items(): 167 | if new_text in old_dict: 168 | new_cats = list(set(old_dict[new_text]['cats'] + new_values['cats'])) 169 | old_dict[new_text]['cats'] = new_cats 170 | else: 171 | old_dict[new_text] = new_values 172 | return old_dict 173 | 174 | 175 | -------------------------------------------------------------------------------- /label_extract/utils/extract_utils.py: -------------------------------------------------------------------------------- 1 | from string import punctuation as punct 2 | 3 | MAPPINGS = { 4 | 'g': r'(sdgs|sdg|goals|goal)', 5 | 't': r'(target)', 6 | 'i': r'(indicator)' 7 | } 8 | 9 | 10 | def extract_type(type_): 11 | """ 12 | Extract the type of label 13 | :param type_: 14 | :return: 15 | """ 16 | for key, pattern in MAPPINGS.items(): 17 | if type_.lower() in pattern: 18 | return key 19 | 20 | def is_number(s): 21 | try: 22 | float(s) 23 | return True 24 | except ValueError: 25 | return False 26 | 27 | def extract_num(numbers): 28 | results = [] 29 | for i in numbers.replace(',', ' ').split(): 30 | if i.lower() not in 'and': 31 | i = i.strip(punct).split('.')[0] 32 | if is_number(i) and int(i) in range(1, 18): 33 | results.append(i) 34 | return list(set(results)) 35 | 36 | 37 | def format_labels(extracted_goals): 38 | """ 39 | type_, numbers 40 | :param extracted_goals: regex-extracted list with type_ and number of labels 41 | :param numbers: 42 | :return: list of labels 43 | """ 44 | labels = [] 45 | for type_, numbers in extracted_goals: 46 | labels.extend(extract_num(numbers)) 47 | return list(set(labels)) 48 | 49 | 50 | def trans_labels(labels): 51 | results = dict() 52 | for i in range(1,18): 53 | label = f'g_{i}' 54 | if label in labels: 55 | results[label] = True 56 | else: 57 | results[label] = False 58 | return results 59 | 60 | 61 | -------------------------------------------------------------------------------- /label_extract/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from bs4 import BeautifulSoup 4 | import json 5 | 6 | 7 | def download_pdf(file): 8 | try: 9 | with open(file, 'rb') as f: 10 | soup = BeautifulSoup(f, 'html.parser') 11 | link = soup.find(class_='btn btn-primary library-link') 12 | if link and 'sdgfund.org_un-promotes-major-regional-agreement-water' not in link: 13 | link = link['href'] 14 | subprocess.call(['wget', link]) 15 | except: 16 | print('Did not download') 17 | print(file) 18 | 19 | 20 | def get_files(filepath, folders): 21 | """ 22 | Get all the files from the folders listed given a filepath 23 | :param filepath: 24 | :param folders: 25 | :return:list of files 26 | """ 27 | files_list = [] 28 | for folder in folders: 29 | if folder in os.listdir(filepath): 30 | folder_files = os.listdir(os.path.join(filepath, folder)) 31 | files_list.extend([os.path.join(folder, f) for f in folder_files]) 32 | else: 33 | print(f'Cannot find {folder} folder in this path.') 34 | return 
files_list 35 | 36 | def save_files(labelled, unlabelled): 37 | """ 38 | Save the labelled and unlabelled texts, and their stats, in .csv and .json formats 39 | :param labelled: dict mapping each labelled text to its 'cats' and 'doc_id' 40 | :param unlabelled: dict of texts for which no label was found 41 | :return: None; the .csv and .json files are written to disk 42 | """ 43 | pd.DataFrame.from_dict(labelled, orient='index').to_csv( 44 | 'labelled_filenamelabel.csv') 45 | pd.DataFrame.from_dict(unlabelled, orient='index').to_csv( 46 | 'unlabelled_filenamelabel.csv') 47 | 48 | with open('labelled_filenamelabel.json', 'w') as fo: 49 | json.dump(labelled, fo) 50 | with open('unlabelled_filenamelabel.json', 'w') as fo: 51 | json.dump(unlabelled, fo) 52 | 53 | stats = { 54 | 'labelled': len(labelled), 55 | 'unlabelled': len(unlabelled) 56 | } 57 | with open('stats.json', 'w') as fo: 58 | json.dump(stats, fo) 59 | -------------------------------------------------------------------------------- /label_extract/utils/post_processing.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | import json 4 | import pandas as pd 5 | from langdetect import detect 6 | from tqdm import tqdm 7 | from multiprocessing import Process, Queue 8 | import sys 9 | 10 | maxInt = sys.maxsize 11 | 12 | while True: 13 | # Decrease the maxInt value by a factor of 10 14 | # for as long as the OverflowError occurs. 15 | 16 | try: 17 | csv.field_size_limit(maxInt) 18 | break 19 | except OverflowError: 20 | maxInt = int(maxInt/10) 21 | 22 | 23 | """ 24 | with open('new_labelled.csv', 'r') as one, open('labelled_filenamelabel.csv', 'r') as two: 25 | new_labelled = csv.DictReader(one, strict=False) 26 | latest = csv.DictReader(two, strict=False) 27 | latest_dict = {row['text']: {'cats': row['cats'], 'doc_id': row['doc_id']} for row in latest} 28 | 29 | 30 | with open("final.csv", "w") as fo: 31 | fieldnames = ['text', 'cats', 'doc_id'] 32 | writer = csv.DictWriter(fo, fieldnames=fieldnames) 33 | 34 | writer.writeheader() 35 | for row in new_labelled: 36 | if row['text'] in latest_dict: 37 | writer.writerow({ 38 | 'text': row['text'], 39 | 'cats': latest_dict[row['text']]['cats'], 40 | 'doc_id': row['doc_id'] 41 | }) 42 | 43 | with open('previous.csv', 'r') as one: 44 | previous = csv.DictReader(one, strict=False) 45 | pattern = r'^(\w{1} )([A-Z])' 46 | 47 | with open("final.csv", "w") as fo: 48 | fieldnames = ['text', 'cats', 'doc_id'] 49 | writer = csv.DictWriter(fo, fieldnames=fieldnames) 50 | 51 | for row in previous: 52 | text = row['text'] 53 | if re.search(pattern, row['text']): 54 | text = re.sub(pattern, "\g<2>", text) 55 | writer.writerow({ 56 | 'text': text, 57 | 'cats': row['cats'], 58 | 'doc_id': row['doc_id'] 59 | }) 60 | 61 | 62 | 63 | with open('html_tags.json', 'r') as f: 64 | data = json.load(f) 65 | final = {} 66 | for text, values in data.items(): 67 | final[text] = {'cats': [values['cats'][0].strip('g_')]} 68 | 69 | df = pd.DataFrame.from_dict(final, columns=["text", "cats"], orient="index") 70 | df.to_csv("html.csv") 71 | """ 72 | 73 | def detect_lang(text, q=None): 74 | if q: 75 | try: 76 | q.put((detect(text[:150]), text)) 77 | except: 78 | q.put(('de', ""))  # non-English sentinel so the caller skips this row 79 | else: 80 | try: 81 | return detect(text), text 82 | except: 83 | return 'de', "" 84 | 85 | if __name__ == '__main__': 86 | 87 | with open('from_pandas.csv', 'r') as f: 88 | data = csv.DictReader(f, strict=False) 89 | 90 | with open("cleanup_unlabelled_final.csv", "w") as fo: 91 | fieldnames = ['text'] 92 | writer = csv.DictWriter(fo, fieldnames=fieldnames) 93 | writer.writeheader() 94 | #q = Queue() 95 | 96 |
for row in tqdm(data): 97 | try: 98 | #p = Process(target=detect_lang, args=(row['text'],q)) 99 | #p.start() 100 | #language, text = q.get() 101 | language, text = detect_lang(row['text']) 102 | if language == "en": 103 | writer.writerow({'text': text}) 104 | #p.join() 105 | except: 106 | pass 107 | 108 | -------------------------------------------------------------------------------- /label_extract/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | from urllib import request 5 | import shutil 6 | 7 | 8 | directory = "/Users/samuelrodriguezmedina/Documents/ir4sdgs/crawl_sdgs/downloads" 9 | 10 | for filename in os.listdir(directory): 11 | new_name = os.path.join(directory, filename.replace('.', '_') + '.html') 12 | ''' 13 | os.rename(os.path.join(directory, filename), new_name) 14 | if re.search('=(.*)&Lang', filename): 15 | new_filename = re.search('=(.*)&Lang', filename).group(1).replace('%2F', '_') + '.doc' 16 | print(new_filename) 17 | os.rename(filename, new_filename) 18 | 19 | # Files with + in their filenames 20 | 21 | if "+" in filename: 22 | #new_filename = filename.replace('+', '').replace('_', '/').replace('.doc', '') 23 | #download_link = f"https://daccess-ods.un.org/access.nsf/GetFile?Open&DS={new_filename}&Lang=E&Type=DOC" 24 | #print(download_link) 25 | #download_file(download_link, new_filename) 26 | #subprocess.call(['wget', download_link, '-P images/']) 27 | # move files 28 | #shutil.move(path, target) 29 | ''' 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /label_extract/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class Text: 4 | def __init__(self, text): 5 | self.text = None 6 | self.clean(text) 7 | 8 | def clean(self, text): 9 | patterns = [ 10 | (r'^\s?\d+\.[\s\t]?', ''), # Numbered bullet points at start of paragraph 11 | (r' ?( ?\W)\1+', r' \1 '), # Reduce repeated non-alphanumeric characters, e.g. ------- 12 | (r'\s\s+', ' '), # Two or more spaces 13 | (r'\t', ''), # Tabulations 14 | (r'\xa0', ' '), # Non-breaking space \xa0 15 | (r'(\r\n|\r|\n)', ' '), # Line breaks 16 | (r'\•', ''), # Bullet points 17 | (r'^\s?\(\w+\)\s*', ''), # Lettered bullets such as '(a) ' at the start of paragraph 18 | (r'(\|?\s\|)+', ' '), # Vertical bars, such as '| |goals | |' 19 | (r'^\W*', ''), # Any non-alphanumerical character at the start of string 20 | (r'Œ', '-') # Replace corrupted dash 21 | ] 22 | # Run the regexes twice so that the result is not affected by their order 23 | for _ in range(2): 24 | for pattern, substitute in patterns: 25 | text = re.sub(pattern, substitute, text) 26 | self.text = text.strip() 27 | --------------------------------------------------------------------------------
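
The driver that ties these modules together is not included above, so the following is only a minimal sketch of how the pieces could be wired up, assuming the class with the `from_*` methods is importable as `Document` from `label_extract/utils/document.py`, that its constructor takes the file path, and that `extract_labels`/`merge_dicts` live in `extract.py`; the function `run_extraction` and the folder list are hypothetical.

```
# Minimal sketch, not part of the repository. The import paths, the Document
# class name, its constructor signature and the folder names are assumptions.
import os
from multiprocessing import Process, Queue

from label_extract.utils.document import Document              # assumed location of the class above
from label_extract.utils.file_utils import get_files, save_files
from label_extract.extract import extract_labels, merge_dicts


def run_extraction(filepath, folders):                          # hypothetical driver
    labelled, unlabelled = {}, {}
    q = Queue()
    for rel_path in get_files(filepath, folders):
        full_path = os.path.join(filepath, rel_path)
        doc = Document(full_path)                               # assumed constructor; sets name and filename_label
        doc.from_any(full_path)                                 # fills doc.paragraphs from .doc/.pdf/.html/.txt
        # extract_labels() puts a (labelled, unlabelled) tuple on the queue
        p = Process(target=extract_labels, args=(doc, q))
        p.start()
        new_labelled, new_unlabelled = q.get()                  # read before join() to avoid blocking on a full queue
        p.join()
        labelled = merge_dicts(labelled, new_labelled)          # union of 'cats' per text
        unlabelled.update(new_unlabelled)
    save_files(labelled, unlabelled)                            # writes the .csv/.json outputs shown above


if __name__ == '__main__':
    run_extraction('downloads', ['pdf', 'html', 'doc', 'txt'])  # hypothetical folder layout
```

Running `extract_labels` in a worker process mirrors the `Process`/`Queue` pattern imported in `post_processing.py`; for a small corpus, calling `extract_labels` directly with a `queue.Queue` would work just as well.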