├── Assessment test for Lilly E&C Data Scientist.pdf ├── README.md └── documnet_classifer_using_NLP.ipynb /Assessment test for Lilly E&C Data Scientist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NitinBhore/document_classifier_using_NLP/HEAD/Assessment test for Lilly E&C Data Scientist.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # document_classifier_using_NLP 2 | Document classification or document categorization is a process to assign different classes or categories to documents as required, eventually helping with storage, management, and analysis of the documents. It has become an important part of the computer sciences and the daily functioning of many companies today. 3 | -------------------------------------------------------------------------------- /documnet_classifer_using_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "oVV_s2mebADO" 7 | }, 8 | "source": [ 9 | "#Exercise 1 \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "id": "o3XE4488pQ9P" 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "# unzip the file\n", 22 | "# !unzip train.zip\n", 23 | "# !unzip test.zip" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "-fBVn4MxUPKi" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Import library\n", 35 | "import pandas as pd\n", 36 | "import numpy as np\n", 37 | "import matplotlib.pyplot as plt\n", 38 | "from nltk.corpus import stopwords\n", 39 | "plt.style.use('ggplot')\n", 40 | "import os, json\n", 41 | "import nltk\n", 42 | "nltk.download('stopwords')\n", 43 | "nltk.download('punkt')\n", 44 | "import re\n", 45 | "import matplotlib\n", 46 | "from nltk.tokenize import word_tokenize\n", 47 | "import tensorflow as tf\n", 48 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 49 | "from keras.preprocessing.text import Tokenizer\n", 50 | "from tensorflow.keras.regularizers import l1\n", 51 | "from tqdm import tqdm\n", 52 | "from keras.models import Sequential\n", 53 | "from keras.layers import Embedding,RNN, SimpleRNN,Dense,SpatialDropout1D\n", 54 | "from keras.initializers import Constant\n", 55 | "from sklearn.model_selection import train_test_split\n", 56 | "from tensorflow.keras.optimizers import Adam\n", 57 | "from sklearn.metrics import confusion_matrix, classification_report\n", 58 | "stop=set(stopwords.words('english'))\n", 59 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 60 | "from sklearn.naive_bayes import MultinomialNB\n", 61 | "from sklearn.pipeline import make_pipeline\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "id": "6FqybFcHMIYM" 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "# Function used to create the dataframe\n", 73 | "def create_dataframe(dir_name):\n", 74 | " path_to_json = dir_name+'/'\n", 75 | " json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]\n", 76 | " # print(json_files)\n", 77 | "\n", 78 | " # here I define pandas Dataframe with the columns name from the json\n", 79 | " jsons_data = pd.DataFrame(columns=['abstract', 'categories'])\n", 80 | "\n", 81 | " # we need both the json and an index number so use enumerate()\n", 82 | " for index, js in enumerate(json_files):\n", 83 | " try:\n", 84 | " with open(os.path.join(path_to_json, js)) as json_file:\n", 85 | " json_text = json.load(json_file)\n", 86 | "\n", 87 | " abstract = json_text['abstract']\n", 88 | " categories = json_text['categories']\n", 89 | " \n", 90 | " # here I push a list of data into a pandas DataFrame at row given by 'index'\n", 91 | " jsons_data.loc[index] = [abstract, categories]\n", 92 | "\n", 93 | " except Exception as e:\n", 94 | " print('Exception :', str(e)) \n", 95 | "\n", 96 | " return jsons_data" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "id": "AwQy_xbgMId0" 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "## Create the train and test dataframe\n", 108 | "train_df = create_dataframe(\"train\")\n", 109 | "train_df\n", 110 | "\n", 111 | "test_df = create_dataframe(\"test\")\n", 112 | "test_df\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "8ls6gRkNMIgc" 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "# Export the dataframe into CSV\n", 124 | "train_df.to_csv(\"train.csv\", index=False)\n", 125 | "test_df.to_csv(\"test.csv\", index=False)\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "id": "BZvwqC-CKdAO" 132 | }, 133 | "source": [ 134 | "#Exercise 2" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "id": "TkHIVGRIKe3S" 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# Read Dataframe\n", 146 | "train_df = pd.read_csv(\"train.csv\", usecols= ['abstract', 'categories'])\n", 147 | "test_df = pd.read_csv(\"test.csv\", usecols= ['abstract', 'categories'])\n", 148 | "train_df.head()\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "id": "rX3qOeoRKe4k" 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "# info\n", 160 | "train_df.info()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "id": "4ckwpM-zKfDZ" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Check the null values\n", 172 | "train_df.isnull().sum()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "id": "lU-vtEUoWrCs" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# Check the null values\n", 184 | "test_df.isnull().sum()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "id": "Cq9CE9rtbL6a" 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# drop the null values\n", 196 | "train_df.dropna(axis = 0, how = 'any', inplace=True)\n", 197 | "test_df.dropna(axis = 0, how = 'any', inplace=True)\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "id": "qY2OQJTydFag" 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "train_df['categories'].value_counts()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "id": "lYpOe8IjbL7w" 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "# plot the values count \n", 220 | "train_df['categories'].value_counts().plot(kind='bar', figsize =(12,6))\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "id": "M0_PvaZQbMAK" 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# plot the values count \n", 232 | "test_df['categories'].value_counts().plot(kind='bar', figsize =(12,6))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "id": "dJR7-dRKzXTr" 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "# Load the dataset\n", 244 | "data = train_df\n", 245 | "# Get the text categories\n", 246 | "text_categories = data.categories.unique()\n", 247 | "# define the training set\n", 248 | "train_data = train_df\n", 249 | "# define the test set\n", 250 | "test_data = test_df\n", 251 | "\n", 252 | "print(\"We have {} unique classes\".format(len(text_categories)))\n", 253 | "print(\"We have {} training samples\".format(len(train_data.abstract)))\n", 254 | "print(\"We have {} test samples\".format(len(test_data.abstract)))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "id": "ndCbWwgE8tQc" 261 | }, 262 | "source": [ 263 | "What are the top 3 insights generated while doing the data analysis on train set (df_train)?\n", 264 | "\n", 265 | "1. We have 26 unique classes\n", 266 | "\n", 267 | "2. We have 43916 training samples\n", 268 | "\n", 269 | "3. We have 10862 test samples" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "id": "OYwsRgu25OUp" 276 | }, 277 | "source": [ 278 | "**What are the top 3 data challenges you observe on train set?**\n", 279 | "1. Requirement of a large amount of training data\n", 280 | "\n", 281 | "2. Requirement of powerful computing resources to perform analysis and advanced analytics\n", 282 | "\n", 283 | "3. Required more training samples of minor categories like Skin and Connective Tissue Diseases, Disorders of Environmental Origin, Immune System Diseases and Musculoskeletal Diseases etc\n" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "id": "y0JxB-5YbZib" 290 | }, 291 | "source": [ 292 | "#Exercise 3" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "smZ-ukTZ-WQS" 299 | }, 300 | "source": [ 301 | "a. Pick your favourite machine learning algorithm to train a multi-class text classifier using the train set (df_train). The classifier should be able to consider the abstract as input and predict any one of the 26 disease categories \n", 302 | "\n", 303 | "**Naive Bayes**" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "id": "AYx3kPagbe99" 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "# Build the model\n", 315 | "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n", 316 | "# Train the model using the training data\n", 317 | "model.fit(train_data.abstract, train_data.categories)\n", 318 | "# Predict the categories of the test data\n", 319 | "predicted_categories = model.predict(test_data.abstract)\n", 320 | "\n", 321 | "# classification report\n", 322 | "print(classification_report(test_data.categories, predicted_categories))\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "id": "6CpJT1OAXWVh" 329 | }, 330 | "source": [ 331 | "**b. Report key metrics on your test set (df_test) and explain your observations**\n", 332 | "\n", 333 | "Naïve Bayes Model is showing a performance with 48% accuracy. For the categories like Male Urogenital Diseases showing a good precision- 0.86 and recall- 0.63 whereas the categories like 'Pathological Conditions and Signs and Symptoms', 'Respiratory Tract Diseases' , 'Skin and Connective Tissue Diseases' etc have the precision- 0.00 and recall- 0.00 due to availability of the skewed dataset." 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "id": "YfoLQgMjZMR_" 340 | }, 341 | "source": [ 342 | "**c. Explain the rationale behind choosing the algorithm in 3.a**\n", 343 | "\n", 344 | "Naive Bayes classifiers are a collection of classification algorithms based on Bayes’ Theorem.\n", 345 | "\n", 346 | "Naive Bayes assumes that each feature/variable of the same class makes an: independent and equal.\n", 347 | "\n", 348 | "Bayes’ Theorem finds the probability of an event occurring given the probability of another event that has already occurred." 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "id": "6DhY-nj2bfLQ" 356 | }, 357 | "outputs": [], 358 | "source": [] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": { 363 | "id": "zOFbmoaLbRT0" 364 | }, 365 | "source": [ 366 | "#Exercise 4" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "9wA2fm9-qw5_" 373 | }, 374 | "source": [ 375 | "**a. Build a text classifier which classifies the abstracts into one of the 26 disease categories using any RNN based architecture and report key metrics on test set. Explain your observations**" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "id": "PBbRzsc6a0W3" 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "############## Recurrent Neural Network #######################\n", 387 | "\n", 388 | "class RNN():\n", 389 | " \"\"\"\n", 390 | " A Recurrent Neural Network (RNN) is a type of artificial neural network \n", 391 | " which uses sequential data.\n", 392 | " \n", 393 | " Args:\n", 394 | " embedding_dict : Embedding dictionary\n", 395 | " train_df : Train Dataframe name\n", 396 | " test_df : Test Dataframe name\n", 397 | " text_col : Text column name\n", 398 | "\n", 399 | " Returns: \n", 400 | " model : GloVe-Contextualized Vectors with SimpleRNN model\n", 401 | " \n", 402 | " \"\"\"\n", 403 | " def __init__(self, embedding_dict, train_df, test_df, text_col):\n", 404 | " \"\"\" Inits the Preprocessing \"\"\"\n", 405 | " self.embedding_dict = embedding_dict\n", 406 | " self.df = train_df\n", 407 | " self.test_df = test_df\n", 408 | " self.text_col = text_col\n", 409 | "\n", 410 | " # clean text\n", 411 | " def clean_text(self, text):\n", 412 | " \"\"\"Clean the text\"\"\"\n", 413 | " text = re.sub('[^a-zA-Z]', ' ', text) \n", 414 | " text = text.lower() \n", 415 | " text = text.split(' ') \n", 416 | " text = [w for w in text if not w in set(stopwords.words('english'))] \n", 417 | " text = ' '.join(text) \n", 418 | " return text\n", 419 | "\n", 420 | " # create the corpus GloVe \n", 421 | " def create_corpus(self, df,col_name):\n", 422 | " \"\"\" create the corpus GloVe \"\"\"\n", 423 | " corpus=[]\n", 424 | " for abstract in tqdm(df[col_name]):\n", 425 | " words=[word.lower() for word in word_tokenize(abstract) if((word.isalpha()==1) & (word not in stop))]\n", 426 | " corpus.append(words)\n", 427 | " return corpus\n", 428 | " \n", 429 | " def run_all(self):\n", 430 | " \"\"\" Run all the methods as per the requirements \"\"\"\n", 431 | " self.df['clean_abstract'] = self.df[self.text_col].apply(lambda x : self.clean_text(x))\n", 432 | "\n", 433 | " self.test_df['clean_abstract'] = self.test_df[self.text_col].apply(lambda x : self.clean_text(x))\n", 434 | "\n", 435 | " # padding\n", 436 | " MAX_LEN=10\n", 437 | " tokenizer_obj=Tokenizer()\n", 438 | "\n", 439 | " corpus=self.create_corpus(self.df, 'clean_abstract')\n", 440 | " tokenizer_obj.fit_on_texts(corpus)\n", 441 | " sequences=tokenizer_obj.texts_to_sequences(corpus)\n", 442 | "\n", 443 | " abstract_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')\n", 444 | "\n", 445 | " word_index=tokenizer_obj.word_index\n", 446 | "\n", 447 | " # Embedding\n", 448 | " num_words=len(word_index)+1\n", 449 | " embedding_matrix=np.zeros((num_words,50))\n", 450 | "\n", 451 | " for word, i in tqdm(word_index.items()):\n", 452 | " if i > num_words:\n", 453 | " continue\n", 454 | " \n", 455 | " emb_vec=embedding_dict.get(word)\n", 456 | " if emb_vec is not None:\n", 457 | " embedding_matrix[i]=emb_vec\n", 458 | "\n", 459 | " # Dataset split\n", 460 | " X_train,X_val, y_train, y_val = train_test_split(abstract_pad,self.df.categories, test_size=.2, random_state=2)\n", 461 | "\n", 462 | " # Create Model\n", 463 | " model=Sequential()\n", 464 | "\n", 465 | " embedding_layer=Embedding(num_words,50,embeddings_initializer=Constant(embedding_matrix),\n", 466 | " input_length=MAX_LEN,trainable=False)\n", 467 | "\n", 468 | " model.add(embedding_layer)\n", 469 | " model.add(SpatialDropout1D(0.2))\n", 470 | " model.add(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2,return_sequences=True))\n", 471 | " model.add(tf.keras.layers.SimpleRNN(32,return_sequences=True))\n", 472 | " model.add(tf.keras.layers.SimpleRNN(16))\n", 473 | " model.add(tf.keras.layers.Dense(16, activation='relu'))\n", 474 | "\n", 475 | " model.add(Dense(1, activation='softmax'))\n", 476 | "\n", 477 | " optimzer=Adam(learning_rate=0.0001)\n", 478 | " model.compile(loss='categorical_crossentropy',optimizer=optimzer,metrics=['acc'])\n", 479 | " model.summary()\n", 480 | "\n", 481 | " #Fitting The Model\n", 482 | " history=model.fit(X_train,y_train,batch_size=32,epochs=10,validation_data=(X_val,y_val),verbose=1)\n", 483 | " \n", 484 | " # padding\n", 485 | " MAX_LEN=10\n", 486 | " tokenizer_obj=Tokenizer()\n", 487 | "\n", 488 | " corpus=self.create_corpus(self.test_df, 'clean_abstract')\n", 489 | " tokenizer_obj.fit_on_texts(corpus)\n", 490 | " sequences=tokenizer_obj.texts_to_sequences(corpus)\n", 491 | "\n", 492 | " abstract_pad_test = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')\n", 493 | "\n", 494 | " word_index=tokenizer_obj.word_index\n", 495 | "\n", 496 | " y_predicted = model.predict(abstract_pad_test)\n", 497 | "\n", 498 | " print(classification_report(self.test_df.categories, y_predicted))\n", 499 | " \n", 500 | " return model\n", 501 | "\n" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "id": "sgULQJmra0rd" 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "train_df = pd.read_csv(\"train.csv\", usecols= ['abstract', 'categories'])\n", 513 | "test_df = pd.read_csv(\"test.csv\", usecols= ['abstract', 'categories'])\n", 514 | "\n", 515 | "train_df.dropna(axis = 0, how = 'any', inplace=True)\n", 516 | "test_df.dropna(axis = 0, how = 'any', inplace=True)\n", 517 | "\n", 518 | "# Create the label collumn\n", 519 | "train_df['categories'] = train_df['categories'].replace({\n", 520 | " 'Digestive System Diseases':0,\n", 521 | " 'Bacterial Infections and Mycoses':1,\n", 522 | " 'Wounds and Injuries':2,\n", 523 | " 'Animal Diseases':3,\n", 524 | " 'Male Urogenital Diseases':4,\n", 525 | " 'Endocrine System Diseases':5,\n", 526 | " 'Congenital Hereditary and Neonatal Diseases and Abnormalities':6,\n", 527 | " 'Hemic and Lymphatic Diseases':7,\n", 528 | " 'Neoplasms':8,\n", 529 | " 'Chemically-Induced Disorders':9,\n", 530 | " \"Stomatognathic Diseases\":10,\n", 531 | " \"Respiratory Tract Diseases\": 11,\n", 532 | " \"Parasitic Diseases\": 12,\n", 533 | " \"Eye Diseases\": 13,\n", 534 | " \"Pathological Conditions and Signs and Symptoms\": 14,\n", 535 | " \"Otorhinolaryngologic Diseases\": 15,\n", 536 | " \"Nutritional and Metabolic Diseases\": 16,\n", 537 | " \"Cardiovascular Diseases\": 17,\n", 538 | " \"Female Urogenital Diseases and Pregnancy Complications\": 18,\n", 539 | " \"Nervous System Diseases\": 19,\n", 540 | " \"Virus Diseases\": 20,\n", 541 | " \"Occupational Diseases\": 21,\n", 542 | " \"Musculoskeletal Diseases\": 22,\n", 543 | " \"Immune System Diseases\": 23,\n", 544 | " \"Skin and Connective Tissue Diseases\": 24,\n", 545 | " \"Disorders of Environmental Origin\": 25\n", 546 | " \n", 547 | " })\n", 548 | "\n", 549 | "\n", 550 | "test_df['categories'] = test_df['categories'].replace({\n", 551 | " 'Digestive System Diseases':0,\n", 552 | " 'Bacterial Infections and Mycoses':1,\n", 553 | " 'Wounds and Injuries':2,\n", 554 | " 'Animal Diseases':3,\n", 555 | " 'Male Urogenital Diseases':4,\n", 556 | " 'Endocrine System Diseases':5,\n", 557 | " 'Congenital Hereditary and Neonatal Diseases and Abnormalities':6,\n", 558 | " 'Hemic and Lymphatic Diseases':7,\n", 559 | " 'Neoplasms':8,\n", 560 | " 'Chemically-Induced Disorders':9,\n", 561 | " \"Stomatognathic Diseases\":10,\n", 562 | " \"Respiratory Tract Diseases\": 11,\n", 563 | " \"Parasitic Diseases\": 12,\n", 564 | " \"Eye Diseases\": 13,\n", 565 | " \"Pathological Conditions and Signs and Symptoms\": 14,\n", 566 | " \"Otorhinolaryngologic Diseases\": 15,\n", 567 | " \"Nutritional and Metabolic Diseases\": 16,\n", 568 | " \"Cardiovascular Diseases\": 17,\n", 569 | " \"Female Urogenital Diseases and Pregnancy Complications\": 18,\n", 570 | " \"Nervous System Diseases\": 19,\n", 571 | " \"Virus Diseases\": 20,\n", 572 | " \"Occupational Diseases\": 21,\n", 573 | " \"Musculoskeletal Diseases\": 22,\n", 574 | " \"Immune System Diseases\": 23,\n", 575 | " \"Skin and Connective Tissue Diseases\": 24,\n", 576 | " \"Disorders of Environmental Origin\": 25 \n", 577 | " })\n", 578 | "\n", 579 | "\n", 580 | "embedding_dict={}\n", 581 | "with open('glove.6B.50d.txt','r') as f:\n", 582 | " for line in f:\n", 583 | " values=line.split()\n", 584 | " word=values[0]\n", 585 | " vectors=np.asarray(values[1:],'float32')\n", 586 | " embedding_dict[word]=vectors\n", 587 | "f.close()\n", 588 | "\n", 589 | "train_df = train_df.head(100)\n", 590 | "\n", 591 | "RNNExe = RNN(embedding_dict, train_df, test_df, 'abstract')\n", 592 | "model = RNNExe.run_all()" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": { 598 | "id": "_yO4XO-Uq5HV" 599 | }, 600 | "source": [ 601 | "**Observations**\n", 602 | "\n", 603 | "Above confusion matrix shows the RNN Model's performance due to the unavailability of powerful computing resources to perform advanced analytics its quite challenging. So selected the GloVe with 6B tokens, 400K vocab, uncased, 50d vectors \n", 604 | "\n", 605 | "Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip will helps here to increase the accuracy with precision and recall.\n" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": { 612 | "id": "L84UjeA-a0uM" 613 | }, 614 | "outputs": [], 615 | "source": [] 616 | }, 617 | { 618 | "cell_type": "markdown", 619 | "metadata": { 620 | "id": "WIDtOf79tyPk" 621 | }, 622 | "source": [ 623 | "**b. Build a text classifier which classifies the abstracts into one of the 26 disease categories using any Transformer architecture and report key metrics on test set. Explain your observations**" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "metadata": { 629 | "id": "MAMX5YevUFsL" 630 | }, 631 | "source": [ 632 | "**BERT** - Bidirectional Encoder Representations from Transformers\n", 633 | "The main breakthrough of the BERT model is that it scans text in a bidirectional way like left-to-right and right-to-left sequences when looking at texts during training.\n", 634 | "There are two general types of BERT: BERT (base) and BERT (large). The difference is in configurable parameters: base-110 million parameter, large-345 million.\n", 635 | "\n", 636 | "\n", 637 | "Note: BERT Base model is selected due to the restriction on the uses of computing resources (Colab)" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "id": "xOnoeZ5Fb9cM" 645 | }, 646 | "outputs": [], 647 | "source": [ 648 | "### Install the required library\n", 649 | "# ! pip install tensorflow_text" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": { 656 | "id": "xVQ1gmp4pRv7" 657 | }, 658 | "outputs": [], 659 | "source": [ 660 | "# import BERT model\n", 661 | "import numpy as np\n", 662 | "import pandas as pd\n", 663 | "from sklearn.model_selection import train_test_split\n", 664 | "import tensorflow as tf\n", 665 | "import tensorflow_hub as hub\n", 666 | "import tensorflow_text as text\n", 667 | "from sklearn.metrics import confusion_matrix, classification_report\n", 668 | "from matplotlib import pyplot as plt\n", 669 | "import seaborn as sn\n", 670 | "\n", 671 | "# import BERT model\n", 672 | "bert_preprocess = hub.KerasLayer(\"https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3\")\n", 673 | "bert_encoder = hub.KerasLayer(\"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4\")\n", 674 | "\n", 675 | "#import ALBERT - A Lite BERT model\n", 676 | "albert_preprocess = hub.KerasLayer(\"http://tfhub.dev/tensorflow/albert_en_preprocess/3\")\n", 677 | "albert_encoder = hub.KerasLayer(\"https://tfhub.dev/tensorflow/albert_en_base/3\")\n", 678 | "\n", 679 | "#import RoBERTa - Robustly Optimized BERT Pretraining Approach\n", 680 | "roberta_preprocess = hub.KerasLayer(\"https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_preprocess/1\")\n", 681 | "roberta_encoder = hub.KerasLayer(\"https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_L-12_H-768_A-12/1\")\n" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": { 688 | "id": "1sJ8g9QBpSIO" 689 | }, 690 | "outputs": [], 691 | "source": [ 692 | "train_df = pd.read_csv(\"train.csv\", usecols= ['abstract', 'categories'])\n", 693 | "test_df = pd.read_csv(\"test.csv\", usecols= ['abstract', 'categories'])\n", 694 | "\n", 695 | "# drop the null values\n", 696 | "train_df.dropna(axis = 0, how = 'any', inplace=True)\n", 697 | "test_df.dropna(axis = 0, how = 'any', inplace=True)" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": { 704 | "id": "msvTgo20fN9j" 705 | }, 706 | "outputs": [], 707 | "source": [ 708 | "# Create the label collumn\n", 709 | "train_df['categories'] = train_df['categories'].replace({\n", 710 | " 'Digestive System Diseases':0,\n", 711 | " 'Bacterial Infections and Mycoses':1,\n", 712 | " 'Wounds and Injuries':2,\n", 713 | " 'Animal Diseases':3,\n", 714 | " 'Male Urogenital Diseases':4,\n", 715 | " 'Endocrine System Diseases':5,\n", 716 | " 'Congenital Hereditary and Neonatal Diseases and Abnormalities':6,\n", 717 | " 'Hemic and Lymphatic Diseases':7,\n", 718 | " 'Neoplasms':8,\n", 719 | " 'Chemically-Induced Disorders':9,\n", 720 | " \"Stomatognathic Diseases\":10,\n", 721 | " \"Respiratory Tract Diseases\": 11,\n", 722 | " \"Parasitic Diseases\": 12,\n", 723 | " \"Eye Diseases\": 13,\n", 724 | " \"Pathological Conditions and Signs and Symptoms\": 14,\n", 725 | " \"Otorhinolaryngologic Diseases\": 15,\n", 726 | " \"Nutritional and Metabolic Diseases\": 16,\n", 727 | " \"Cardiovascular Diseases\": 17,\n", 728 | " \"Female Urogenital Diseases and Pregnancy Complications\": 18,\n", 729 | " \"Nervous System Diseases\": 19,\n", 730 | " \"Virus Diseases\": 20,\n", 731 | " \"Occupational Diseases\": 21,\n", 732 | " \"Musculoskeletal Diseases\": 22,\n", 733 | " \"Immune System Diseases\": 23,\n", 734 | " \"Skin and Connective Tissue Diseases\": 24,\n", 735 | " \"Disorders of Environmental Origin\": 25\n", 736 | " \n", 737 | " })\n", 738 | "\n", 739 | "\n", 740 | "test_df['categories'] = test_df['categories'].replace({\n", 741 | " 'Digestive System Diseases':0,\n", 742 | " 'Bacterial Infections and Mycoses':1,\n", 743 | " 'Wounds and Injuries':2,\n", 744 | " 'Animal Diseases':3,\n", 745 | " 'Male Urogenital Diseases':4,\n", 746 | " 'Endocrine System Diseases':5,\n", 747 | " 'Congenital Hereditary and Neonatal Diseases and Abnormalities':6,\n", 748 | " 'Hemic and Lymphatic Diseases':7,\n", 749 | " 'Neoplasms':8,\n", 750 | " 'Chemically-Induced Disorders':9,\n", 751 | " \"Stomatognathic Diseases\":10,\n", 752 | " \"Respiratory Tract Diseases\": 11,\n", 753 | " \"Parasitic Diseases\": 12,\n", 754 | " \"Eye Diseases\": 13,\n", 755 | " \"Pathological Conditions and Signs and Symptoms\": 14,\n", 756 | " \"Otorhinolaryngologic Diseases\": 15,\n", 757 | " \"Nutritional and Metabolic Diseases\": 16,\n", 758 | " \"Cardiovascular Diseases\": 17,\n", 759 | " \"Female Urogenital Diseases and Pregnancy Complications\": 18,\n", 760 | " \"Nervous System Diseases\": 19,\n", 761 | " \"Virus Diseases\": 20,\n", 762 | " \"Occupational Diseases\": 21,\n", 763 | " \"Musculoskeletal Diseases\": 22,\n", 764 | " \"Immune System Diseases\": 23,\n", 765 | " \"Skin and Connective Tissue Diseases\": 24,\n", 766 | " \"Disorders of Environmental Origin\": 25\n", 767 | " \n", 768 | " })" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "metadata": { 775 | "id": "uMJneac4fOCD" 776 | }, 777 | "outputs": [], 778 | "source": [ 779 | "# Split it into training and test data set\n", 780 | "# X_train, X_test, y_train, y_test = train_test_split(df_balanced[text_column], df_balanced[label_column], stratify=df_balanced[label_column]) # pylint: disable=invalid-name, disable=line-too-long\n", 781 | "\n", 782 | "X_train = train_df['abstract']\n", 783 | "y_train = train_df['categories']\n", 784 | "\n", 785 | "X_test = test_df['abstract']\n", 786 | "y_test = test_df['categories']\n", 787 | "\n", 788 | "def create_model_func(X_train, y_train, X_test, y_test, bert_preprocess, bert_encoder, epochs):\n", 789 | "\n", 790 | " # Bert layers\n", 791 | " text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')\n", 792 | " preprocessed_text = bert_preprocess(text_input)\n", 793 | " outputs = bert_encoder(preprocessed_text)\n", 794 | "\n", 795 | " # Neural network layers\n", 796 | " lay = tf.keras.layers.Dense(64, activation='relu', name=\"dense1\")(outputs['pooled_output'])\n", 797 | " lay = tf.keras.layers.Dropout(0.2, name=\"dropout1\")(lay)\n", 798 | " lay = tf.keras.layers.Dense(32, activation='relu', name=\"dense2\")(lay)\n", 799 | " lay = tf.keras.layers.Dropout(0.2, name=\"dropout\")(lay)\n", 800 | " lay = tf.keras.layers.Dense(1, activation='softmax', name=\"output2\")(lay)\n", 801 | "\n", 802 | " # Use inputs and outputs to construct a final model\n", 803 | " model = tf.keras.Model(inputs=[text_input], outputs=[lay])\n", 804 | "\n", 805 | " # print summary\n", 806 | " model.summary()\n", 807 | " model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 808 | " # train the model\n", 809 | " model.fit(X_train, y_train, epochs=epochs)\n", 810 | "\n", 811 | " y_predicted = model.predict(X_test)\n", 812 | "\n", 813 | " print(classification_report(y_test, y_predicted))\n", 814 | " \n" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": { 821 | "id": "FqBIIczaH71P" 822 | }, 823 | "outputs": [], 824 | "source": [ 825 | "## BERT Base model\n", 826 | "create_model_func(X_train, y_train, X_test, y_test, bert_preprocess, bert_encoder, 3)\n" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": { 832 | "id": "NOHxb0F6XN7r" 833 | }, 834 | "source": [ 835 | "**Observations**\n", 836 | "The accueacy of the BERT model is 0.0952. Need more data to train the BERT model.\n" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": { 842 | "id": "G2yXEl0Exw2G" 843 | }, 844 | "source": [ 845 | "**c. You are free to experiment various Transformer architectures for 4.a and 4.b, however, only report the model which you consider the best. What is your rationale for this model selection?**\n", 846 | "\n", 847 | "**ALBERT - A Lite BERT**\n", 848 | "ALBERT is developed based on the BERT model. Its major breakthrough is that it brings a significant parameter reduction but maintains the same level of performance compared to BERT.\n", 849 | "In ALBERT, parameters are shared across 12 layers of transformer encoders, while in the original BERT, each layer of encoders have a unique set of parameters.\n", 850 | "\n", 851 | "\n", 852 | "**RoBERTa - Robustly Optimized BERT Pretraining Approach**\n", 853 | "RoBERTa makes changes to the architecture and training procedures of BERT. Specifically, RoBERTa removes the Next Sentence Prediction (NSP) objective, uses a much larger dataset than BERT, and replaces static masking with dynamic masking.\n", 854 | "\n" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": { 861 | "id": "xATfnT3twYjy" 862 | }, 863 | "outputs": [], 864 | "source": [ 865 | "#import ALBERT - A Lite BERT model \n", 866 | "create_model_func(X_train, y_train, X_test, y_test, albert_preprocess, albert_encoder, 3)\n" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": null, 872 | "metadata": { 873 | "id": "4GViJUm9B6tJ" 874 | }, 875 | "outputs": [], 876 | "source": [ 877 | "#import RoBERTa - Robustly Optimized BERT Pretraining Approach \n", 878 | "create_model_func(X_train, y_train, X_test, y_test, roberta_preprocess, roberta_encoder, 3)\n" 879 | ] 880 | }, 881 | { 882 | "cell_type": "markdown", 883 | "metadata": { 884 | "id": "2I66xYPB0bAo" 885 | }, 886 | "source": [ 887 | "d. Consider the test table/dataframe (df_test). Run an inference through the best model determined in 4.c." 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": null, 893 | "metadata": { 894 | "id": "mFJ8yvBeCVHm" 895 | }, 896 | "outputs": [], 897 | "source": [] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": null, 902 | "metadata": { 903 | "id": "g9f3KvnVCVLT" 904 | }, 905 | "outputs": [], 906 | "source": [] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": { 912 | "id": "8YbCN2jJCVNM" 913 | }, 914 | "outputs": [], 915 | "source": [] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": null, 920 | "metadata": { 921 | "id": "zzfb_p4VCVRt" 922 | }, 923 | "outputs": [], 924 | "source": [] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": null, 929 | "metadata": { 930 | "id": "5ledv28mCVXq" 931 | }, 932 | "outputs": [], 933 | "source": [] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": null, 938 | "metadata": { 939 | "id": "yAONSKr_nASx" 940 | }, 941 | "outputs": [], 942 | "source": [] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": null, 947 | "metadata": { 948 | "id": "209wa8IhnAXJ" 949 | }, 950 | "outputs": [], 951 | "source": [] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": null, 956 | "metadata": { 957 | "id": "S1FsXIcJnAzZ" 958 | }, 959 | "outputs": [], 960 | "source": [] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": null, 965 | "metadata": { 966 | "id": "S-JaDyvjnA0n" 967 | }, 968 | "outputs": [], 969 | "source": [] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": { 975 | "id": "B661proAnA48" 976 | }, 977 | "outputs": [], 978 | "source": [] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": null, 983 | "metadata": { 984 | "id": "FBrT5oG-nA6s" 985 | }, 986 | "outputs": [], 987 | "source": [] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": null, 992 | "metadata": { 993 | "id": "1aS5k74RfODc" 994 | }, 995 | "outputs": [], 996 | "source": [] 997 | } 998 | ], 999 | "metadata": { 1000 | "accelerator": "GPU", 1001 | "colab": { 1002 | "collapsed_sections": [ 1003 | "y0JxB-5YbZib" 1004 | ], 1005 | "private_outputs": true, 1006 | "provenance": [] 1007 | }, 1008 | "gpuClass": "standard", 1009 | "kernelspec": { 1010 | "display_name": "Python 3 (ipykernel)", 1011 | "language": "python", 1012 | "name": "python3" 1013 | }, 1014 | "language_info": { 1015 | "codemirror_mode": { 1016 | "name": "ipython", 1017 | "version": 3 1018 | }, 1019 | "file_extension": ".py", 1020 | "mimetype": "text/x-python", 1021 | "name": "python", 1022 | "nbconvert_exporter": "python", 1023 | "pygments_lexer": "ipython3", 1024 | "version": "3.11.5" 1025 | } 1026 | }, 1027 | "nbformat": 4, 1028 | "nbformat_minor": 1 1029 | } 1030 | --------------------------------------------------------------------------------