├── README.md
└── Text_Preprocessing.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # Data-Pre-processing
--------------------------------------------------------------------------------

/Text_Preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text Preprocessing"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Import Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# Importing Libraries\n",
24 | "import unidecode\n",
25 | "import pandas as pd\n",
26 | "import re\n",
27 | "import time\n",
28 | "import nltk\n",
29 | "from nltk.corpus import stopwords\n",
30 | "nltk.download('stopwords')\n",
31 | "from nltk.tokenize import word_tokenize\n",
32 | "from nltk.stem import WordNetLemmatizer\n",
33 | "from autocorrect import Speller\n",
34 | "from bs4 import BeautifulSoup\n",
35 | "nltk.download('punkt')\n",
36 | "nltk.download('wordnet')\n",
37 | "import string\n",
38 | "import timeit"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# Read Dataset\n",
48 | "Df = pd.read_csv('New Task.csv', encoding = 'latin-1')\n",
49 | "print('Number of Data points : ', Df.shape[0])\n",
50 | "print('Number of features :', Df.shape[1])\n",
51 | "print('features :', Df.columns.values)\n",
52 | "# Show Dataset\n",
53 | "Df.head()"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "# This command gives information about the attributes (columns) of the dataset.\n",
63 | "Df.info()"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Shows statistics for every numerical column in our dataset.\n",
73 | "Df.describe()"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "### Check the type of the DataFrame attributes that have to be processed"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Type of attribute \"Content\"\n",
90 | "print(type(Df['Content']))\n",
91 | "\n",
92 | "# Type of attribute \"Title\"\n",
93 | "print(type(Df['Title']))\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### Remove newlines & tabs "
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "scrolled": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "def remove_newlines_tabs(text):\n",
112 | " \"\"\"\n",
113 | " This function will remove all the occurrences of newlines, tabs, and combinations like: \\\\n, \\\\.\n",
114 | " \n",
115 | " arguments:\n",
116 | " input_text: \"text\" of type \"String\". \n",
117 | " \n",
118 | " return:\n",
119 | " value: \"text\" after removal of newlines, tabs, \\\\n, \\\\ characters.\n",
120 | " \n",
121 | " Example:\n",
122 | " Input : This is her \\\\ first day at this place.\\n Please,\\t Be nice to her.\\\\n\n",
123 | " Output : This is her first day at this place. Please, Be nice to her. \n",
124 | " \n",
125 | " \"\"\"\n",
126 | " \n",
127 | " # Replacing all the occurrences of \n,\\n,\t,\\ with a space.\n",
128 | " Formatted_text = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')\n",
129 | " return Formatted_text\n",
130 | "# len of data :- 1618647 lac words"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "### Strip Html Tags"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "def strip_html_tags(text):\n",
147 | " \"\"\" \n",
148 | " This function will remove all the occurrences of html tags from the text.\n",
149 | " \n",
150 | " arguments:\n",
151 | " input_text: \"text\" of type \"String\". \n",
152 | " \n",
153 | " return:\n",
154 | " value: \"text\" after removal of html tags.\n",
155 | " \n",
156 | " Example:\n",
157 | " Input : This is a nice place to live. \n",
158 | " Output : This is a nice place to live. \n",
159 | " \"\"\"\n",
160 | " # Initiating BeautifulSoup object soup.\n",
161 | " soup = BeautifulSoup(text, \"html.parser\")\n",
162 | " # Get all the text other than html tags.\n",
163 | " stripped_text = soup.get_text(separator=\" \")\n",
164 | " return stripped_text\n",
165 | "# len of string:- 1616053 lac words"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### Remove Links "
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "scrolled": true
180 | },
181 | "outputs": [],
182 | "source": [
183 | "def remove_links(text):\n",
184 | " \"\"\"\n",
185 | " This function will remove all the occurrences of links.\n",
186 | " \n",
187 | " arguments:\n",
188 | " input_text: \"text\" of type \"String\". \n",
189 | " \n",
190 | " return:\n",
191 | " value: \"text\" after removal of all types of links.\n",
192 | " \n",
193 | " Example:\n",
194 | " Input : To know more about cats and food & website: catster.com visit: https://catster.com//how-to-feed-cats\n",
195 | " Output : To know more about cats and food & website: visit: \n",
196 | " \n",
197 | " \"\"\"\n",
198 | " \n",
199 | " # Removing all the occurrences of links that start with https\n",
200 | " remove_https = re.sub(r'http\S+', '', text)\n",
201 | " # Remove all the occurrences of text that ends with .com\n",
202 | " remove_com = re.sub(r\"\\ [A-Za-z]*\\.com\", \" \", remove_https)\n",
203 | " return remove_com\n",
204 | "# len of words:- 1616053"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "### Remove WhiteSpaces"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "def remove_whitespace(text):\n",
221 | " \"\"\" This function will remove \n",
222 | " extra whitespaces from the text\n",
223 | " arguments:\n",
224 | " input_text: \"text\" of type \"String\". \n",
225 | " \n",
226 | " return:\n",
227 | " value: \"text\" after extra whitespaces are removed.\n",
228 | " \n",
229 | " Example:\n",
230 | " Input : How are you doing ?\n",
231 | " Output : How are you doing ? \n",
232 | " \n",
233 | " \"\"\"\n",
234 | " pattern = re.compile(r'\s+') \n",
235 | " Without_whitespace = re.sub(pattern, ' ', text)\n",
236 | " # There are some instances where there is no space after '?' & ')', \n",
237 | " # So I am replacing these with one space so that it will not consider two words as one token.\n",
238 | " text = Without_whitespace.replace('?', ' ? ').replace(')', ') ')\n",
239 | " return text \n",
240 | "# len of words:- 1596248 lac words"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "### Step1: Remove Accented Characters\n"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# Code for accented characters removal\n",
257 | "def accented_characters_removal(text):\n",
258 | " # this is a docstring\n",
259 | " \"\"\"\n",
260 | " The function will remove accented characters from the \n",
261 | " text contained within the Dataset.\n",
262 | " \n",
263 | " arguments:\n",
264 | " input_text: \"text\" of type \"String\". \n",
265 | " \n",
266 | " return:\n",
267 | " value: \"text\" with removed accented characters.\n",
268 | " \n",
269 | " Example:\n",
270 | " Input : Málaga, àéêöhello\n",
271 | " Output : Malaga, aeeohello \n",
272 | " \n",
273 | " \"\"\"\n",
274 | " # Remove accented characters from text using unidecode.\n",
275 | " # Unidecode() - It takes unicode data & tries to represent it in ASCII characters. \n",
276 | " text = unidecode.unidecode(text)\n",
277 | " return text\n",
278 | "# Len of data:- 1593952 lac of words"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "### Step2: Case Conversion"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "# Code for text lowercasing\n",
295 | "def lower_casing_text(text):\n",
296 | " \n",
297 | " \"\"\"\n",
298 | " The function will convert text into lower case.\n",
299 | " \n",
300 | " arguments:\n",
301 | " input_text: \"text\" of type \"String\".\n",
302 | " \n",
303 | " return:\n",
304 | " value: text in lowercase\n",
305 | " \n",
306 | " Example:\n",
307 | " Input : The World is Full of Surprises!\n",
308 | " Output : the world is full of surprises!\n",
309 | " \n",
310 | " \"\"\"\n",
311 | " # Convert text to lower case\n",
312 | " # lower() - It converts all uppercase letters of the given string to lowercase.\n",
313 | " text = text.lower()\n",
314 | " return text\n"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "### Step3: Reduce repeated characters and punctuations"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "# Code for removing repeated characters and punctuations\n",
331 | "\n",
332 | "def reducing_incorrect_character_repeatation(text):\n",
333 | " \"\"\"\n",
334 | " This function will reduce repetition to two characters \n",
335 | " for alphabets and to one character for punctuations.\n",
336 | " \n",
337 | " arguments:\n",
338 | " input_text: \"text\" of type \"String\".\n",
339 | " \n",
340 | " return:\n",
341 | " value: Finally formatted text with alphabets repeating to \n",
342 | " two characters & punctuations limited to one repetition \n",
343 | " \n",
344 | " Example:\n",
345 | " Input : Realllllllllyyyyy, Greeeeaaaatttt !!!!?....;;;;:)\n",
346 | " Output : Reallyy, Greeaatt !?.;:)\n",
347 | " \n",
348 | " \"\"\"\n",
349 | " # Pattern matching for all case alphabets\n",
350 | " Pattern_alpha = re.compile(r\"([A-Za-z])\\1{1,}\", re.DOTALL)\n",
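" # The pattern captures one alphabetic character and then matches that same character one or more further times,\n",
" # so it only fires on letters repeated at least twice in a row (e.g. the runs of 'l' and 'y' in 'Realllllllllyyyyy').\n",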
351 | " \n",
352 | " # Limiting all the repetition to two characters.\n",
353 | " Formatted_text = Pattern_alpha.sub(r\"\\1\\1\", text) \n",
354 | " \n",
355 | " # Pattern matching for all the punctuations that can occur\n",
356 | " Pattern_Punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\\1{1,}')\n",
357 | " \n",
358 | " # Limiting punctuations in previously formatted string to only one.\n",
359 | " Combined_Formatted = Pattern_Punct.sub(r'\\1', Formatted_text)\n",
360 | " \n",
361 | " # The below statement replaces repetitions of spaces that occur two or more times with a single occurrence.\n",
362 | " Final_Formatted = re.sub(' {2,}',' ', Combined_Formatted)\n",
363 | " return Final_Formatted\n"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "### Explanation for using some symbols in above regex expression\n",
371 | "**\\1** --> It refers to the first capturing group (equivalent to re.search(...).group(1)); \\1 matches the exact same text that was matched by the first capturing group.\n",
372 | "\n",
373 | "**{1,}** --> It matches one or more further repetitions of the captured character, i.e. the character occurs at least twice in a row. \n",
374 | "\n",
375 | "**DOTALL** --> It makes the dot match the newline character as well; by default the dot matches everything in the given text except the newline character. \n",
376 | "\n",
377 | "**sub()** --> This function is used to replace occurrences of a particular sub-string with another sub-string. It takes the following inputs: the pattern to replace and the sub-string to replace it with.\n",
378 | "\n",
379 | "**r'\1\1'** --> It limits all the repetition to two characters.\n",
380 | "\n",
381 | "**r'\1'** --> Limits all the repetition to only one character.\n",
382 | "\n",
383 | "**{2,}** --> It means to match repetition that occurs two or more times"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "## Step4: Expand contraction words"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "CONTRACTION_MAP = {\n",
400 | "\"ain't\": \"is not\",\n",
401 | "\"aren't\": \"are not\",\n",
402 | "\"can't\": \"cannot\",\n",
403 | "\"can't've\": \"cannot have\",\n",
404 | "\"'cause\": \"because\",\n",
405 | "\"could've\": \"could have\",\n",
406 | "\"couldn't\": \"could not\",\n",
407 | "\"couldn't've\": \"could not have\",\n",
408 | "\"didn't\": \"did not\",\n",
409 | "\"doesn't\": \"does not\",\n",
410 | "\"don't\": \"do not\",\n",
411 | "\"hadn't\": \"had not\",\n",
412 | "\"hadn't've\": \"had not have\",\n",
413 | "\"hasn't\": \"has not\",\n",
414 | "\"haven't\": \"have not\",\n",
415 | "\"he'd\": \"he would\",\n",
416 | "\"he'd've\": \"he would have\",\n",
417 | "\"he'll\": \"he will\",\n",
418 | "\"he'll've\": \"he will have\",\n",
419 | "\"he's\": \"he is\",\n",
420 | "\"how'd\": \"how did\",\n",
421 | "\"how'd'y\": \"how do you\",\n",
422 | "\"how'll\": \"how will\",\n",
423 | "\"how's\": \"how is\",\n",
424 | "\"i'd\": \"i would\",\n",
425 | "\"i'd've\": \"i would have\",\n",
426 | "\"i'll\": \"i will\",\n",
427 | "\"i'll've\": \"i will have\",\n",
428 | "\"i'm\": \"i am\",\n",
429 | "\"i've\": \"i have\",\n",
430 | "\"isn't\": \"is not\",\n",
431 | "\"it'd\": \"it would\",\n",
432 | "\"it'd've\": \"it would have\",\n",
433 | "\"it'll\": \"it will\",\n",
434 | "\"it'll've\": \"it will have\",\n",
435 | "\"it's\": \"it is\",\n",
436 | "\"let's\": \"let us\",\n",
437 | "\"ma'am\": \"madam\",\n",
438 | "\"mayn't\": \"may not\",\n",
439 | "\"might've\": \"might have\",\n",
440 | "\"mightn't\": \"might not\",\n",
441 | "\"mightn't've\": \"might not have\",\n",
442 | "\"must've\": \"must have\",\n",
443 | "\"mustn't\": \"must not\",\n",
444 | "\"mustn't've\": \"must not have\",\n",
445 | "\"needn't\": \"need not\",\n",
446 | "\"needn't've\": \"need not have\",\n",
447 | "\"o'clock\": \"of the clock\",\n",
448 | "\"oughtn't\": \"ought not\",\n",
449 | "\"oughtn't've\": \"ought not have\",\n",
450 | "\"shan't\": \"shall not\",\n",
451 | "\"sha'n't\": \"shall not\",\n",
452 | "\"shan't've\": \"shall not have\",\n",
453 | "\"she'd\": \"she would\",\n",
454 | "\"she'd've\": \"she would have\",\n",
455 | "\"she'll\": \"she will\",\n",
456 | "\"she'll've\": \"she will have\",\n",
457 | "\"she's\": \"she is\",\n",
458 | "\"should've\": \"should have\",\n",
459 | "\"shouldn't\": \"should not\",\n",
460 | "\"shouldn't've\": \"should not have\",\n",
461 | "\"so've\": \"so have\",\n",
462 | "\"so's\": \"so as\",\n",
463 | "\"that'd\": \"that would\",\n",
464 | "\"that'd've\": \"that would have\",\n",
465 | "\"that's\": \"that is\",\n",
466 | "\"there'd\": \"there would\",\n",
467 | "\"there'd've\": \"there would have\",\n",
468 | "\"there's\": \"there is\",\n",
469 | "\"they'd\": \"they would\",\n",
470 | "\"they'd've\": \"they would have\",\n",
471 | "\"they'll\": \"they will\",\n",
472 | "\"they'll've\": \"they will have\",\n",
473 | "\"they're\": \"they are\",\n",
474 | "\"they've\": \"they have\",\n",
475 | "\"to've\": \"to have\",\n",
476 | "\"wasn't\": \"was not\",\n",
477 | "\"we'd\": \"we would\",\n",
478 | "\"we'd've\": \"we would have\",\n",
479 | "\"we'll\": \"we will\",\n",
480 | "\"we'll've\": \"we will have\",\n",
481 | "\"we're\": \"we are\",\n",
482 | "\"we've\": \"we have\",\n",
483 | "\"weren't\": \"were not\",\n",
484 | "\"what'll\": \"what will\",\n",
485 | "\"what'll've\": \"what will have\",\n",
486 | "\"what're\": \"what are\",\n",
487 | "\"what's\": \"what is\",\n",
488 | "\"what've\": \"what have\",\n",
489 | "\"when's\": \"when is\",\n",
490 | "\"when've\": \"when have\",\n",
491 | "\"where'd\": \"where did\",\n",
492 | "\"where's\": \"where is\",\n",
493 | "\"where've\": \"where have\",\n",
494 | "\"who'll\": \"who will\",\n",
495 | "\"who'll've\": \"who will have\",\n",
496 | "\"who's\": \"who is\",\n",
497 | "\"who've\": \"who have\",\n",
498 | "\"why's\": \"why is\",\n",
499 | "\"why've\": \"why have\",\n",
500 | "\"will've\": \"will have\",\n",
501 | "\"won't\": \"will not\",\n",
502 | "\"won't've\": \"will not have\",\n",
503 | "\"would've\": \"would have\",\n",
504 | "\"wouldn't\": \"would not\",\n",
505 | "\"wouldn't've\": \"would not have\",\n",
506 | "\"y'all\": \"you all\",\n",
507 | "\"y'all'd\": \"you all would\",\n",
508 | "\"y'all'd've\": \"you all would have\",\n",
509 | "\"y'all're\": \"you all are\",\n",
510 | "\"y'all've\": \"you all have\",\n",
511 | "\"you'd\": \"you would\",\n",
512 | "\"you'd've\": \"you would have\",\n",
513 | "\"you'll\": \"you will\",\n",
514 | "\"you'll've\": \"you will have\",\n",
515 | "\"you're\": \"you are\",\n",
516 | "\"you've\": \"you have\",\n",
517 | "}\n",
518 | "# The code for expanding contraction words\n",
519 | "def expand_contractions(text, contraction_mapping = CONTRACTION_MAP):\n",
520 | " \"\"\"expand shortened words to the actual form.\n",
521 | " e.g. don't to do not\n",
522 | " \n",
523 | " arguments:\n",
524 | " input_text: \"text\" of type \"String\".\n",
525 | " \n",
526 | " return:\n",
527 | " value: Text with expanded form of shortened words.\n",
528 | " \n",
529 | " Example: \n",
530 | " Input : ain't, aren't, can't, cause, can't've\n",
531 | " Output : is not, are not, cannot, because, cannot have \n",
532 | " \n",
533 | " \"\"\"\n",
534 | " # Tokenizing text into tokens.\n",
535 | " list_Of_tokens = text.split(' ')\n",
536 | "\n",
537 | " # Checking for whether the given token matches with the Key & replacing word with key's value.\n",
538 | " \n",
539 | " # Check whether Word is in list_Of_tokens or not.\n",
540 | " for Word in list_Of_tokens: \n",
541 | " # Check whether found word is in dictionary \"Contraction Map\" or not as a key. \n",
542 | " if Word in CONTRACTION_MAP: \n",
543 | " # If Word is present in both dictionary & list_Of_tokens, replace that word with the key value.\n",
544 | " list_Of_tokens = [item.replace(Word, CONTRACTION_MAP[Word]) for item in list_Of_tokens]\n",
545 | " \n",
546 | " # Converting list of tokens to String.\n",
547 | " String_Of_tokens = ' '.join(str(e) for e in list_Of_tokens) \n",
548 | " return String_Of_tokens \n",
549 | " "
550 | ]
551 | },
552 | {
553 | "cell_type": "markdown",
554 | "metadata": {},
555 | "source": [
556 | "## Step5: Remove special characters"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": null,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "# The code for removing special characters\n",
566 | "def removing_special_characters(text):\n",
567 | " \"\"\"Removing all the special characters except the ones that are passed within \n",
568 | " the regex to match, as they have important meaning in the text provided.\n",
569 | " \n",
570 | " \n",
571 | " arguments:\n",
572 | " input_text: \"text\" of type \"String\".\n",
573 | " \n",
574 | " return:\n",
575 | " value: Text with the special characters that are not required removed.\n",
576 | " \n",
577 | " Example: \n",
578 | " Input : Hello, K-a-j-a-l. Thi*s is $100.05 : the payment that you will recieve! (Is this okay?) \n",
579 | " Output : Hello, Kajal. This is $100.05 : the payment that you will recieve! Is this okay?\n",
580 | " \n",
581 | " \"\"\"\n",
582 | " # The formatted text after removing unnecessary punctuations.\n",
583 | " Formatted_Text = re.sub(r\"[^a-zA-Z0-9:$,%.?!]+\", ' ', text) \n",
584 | " # In the above regex expression, I am providing the necessary set of punctuations that are frequent in this particular dataset.\n",
585 | " return Formatted_Text\n"
586 | ]
587 | },
588 | {
589 | "cell_type": "markdown",
590 | "metadata": {},
591 | "source": [
592 | "### Punctuations that I am considering Important as per my Dataset.\n",
593 | "**,.?!** --> These are some frequent punctuations that occur a lot and need to be preserved in order to understand the context of the text.\n",
594 | "\n",
595 | "__:__ --> This one is also frequent as per the Dataset. It is important to keep because it carries meaning whenever there is an occurrence of a time like: **9:05 p.m.**\n",
596 | "\n",
597 | "**%** --> This one is also frequently used in many articles and tells more precisely about the data, facts & figures.\n",
598 | "\n",
599 | "**$** --> This one is used in many articles where prices are considered. So, omitting this symbol would leave those prices as just numbers, without much sense about them."
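,
"\n",
"As a small illustrative sketch, the snippet below shows how `removing_special_characters` defined above treats a made-up sentence containing these punctuation marks:\n",
"\n",
"```python\n",
"sample = 'Offer: save 20% on orders over $50 - today only! (terms apply)'\n",
"print(removing_special_characters(sample))\n",
"# The hyphen and the parentheses are stripped, while ':', '%', '$' and '!' survive.\n",
"```"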
600 | ]
601 | },
602 | {
603 | "cell_type": "markdown",
604 | "metadata": {},
605 | "source": [
606 | "## Step6: Remove stopwords"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "metadata": {},
613 | "outputs": [],
614 | "source": [
615 | "# The code for removing stopwords\n",
616 | "stoplist = stopwords.words('english') \n",
617 | "stoplist = set(stoplist)\n",
618 | "def removing_stopwords(text):\n",
619 | " \"\"\"This function will remove stopwords which don't add much meaning to a sentence \n",
620 | " & they can be removed safely without compromising the meaning of the sentence.\n",
621 | " \n",
622 | " arguments:\n",
623 | " input_text: \"text\" of type \"String\".\n",
624 | " \n",
625 | " return:\n",
626 | " value: Text after all stopwords are omitted.\n",
627 | " \n",
628 | " Example: \n",
629 | " Input : This is Kajal from delhi who came here to study.\n",
630 | " Output : [\"'This\", 'Kajal', 'delhi', 'came', 'study', '.', \"'\"] \n",
631 | " \n",
632 | " \"\"\"\n",
633 | " # repr() function actually gives the precise information about the string\n",
634 | " text = repr(text)\n",
635 | " # Text without stopwords\n",
636 | " No_StopWords = [word for word in word_tokenize(text) if word.lower() not in stoplist ]\n",
637 | " # Convert list of tokens_without_stopwords to String type.\n",
638 | " words_string = ' '.join(No_StopWords) \n",
639 | " return words_string\n"
640 | ]
641 | },
642 | {
643 | "cell_type": "markdown",
644 | "metadata": {},
645 | "source": [
646 | "### Checking spellings for all the words "
647 | ]
648 | },
649 | {
650 | "cell_type": "markdown",
651 | "metadata": {},
652 | "source": [
653 | "## Step7: Correct mis-spelled words in text"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": null,
659 | "metadata": {},
660 | "outputs": [],
661 | "source": [
662 | "# The code for spelling corrections\n",
663 | "def spelling_correction(text):\n",
664 | " ''' \n",
665 | " This function will correct spellings.\n",
666 | " \n",
667 | " arguments:\n",
668 | " input_text: \"text\" of type \"String\".\n",
669 | " \n",
670 | " return:\n",
671 | " value: Text after spellings are corrected.\n",
672 | " \n",
673 | " Example: \n",
674 | " Input : This is Oberois from Dlhi who came heree to studdy.\n",
675 | " Output : This is Oberoi from Delhi who came here to study.\n",
676 | " \n",
677 | " \n",
678 | " '''\n",
679 | " # Check for spellings in English language\n",
680 | " spell = Speller(lang='en')\n",
681 | " Corrected_text = spell(text)\n",
682 | " return Corrected_text\n"
683 | ]
684 | },
685 | {
686 | "cell_type": "markdown",
687 | "metadata": {},
688 | "source": [
689 | "## Step8: Lemmatization"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": null,
695 | "metadata": {},
696 | "outputs": [],
697 | "source": [
698 | "# The code for lemmatization\n",
699 | "w_tokenizer = nltk.tokenize.WhitespaceTokenizer()\n",
700 | "lemmatizer = nltk.stem.WordNetLemmatizer()\n",
701 | "def lemmatization(text):\n",
702 | " \"\"\"This function converts words to their root forms \n",
703 | " without explicitly cutting them down as done in stemming.\n",
704 | " \n",
705 | " arguments:\n",
706 | " input_text: \"text\" of type \"String\".\n",
707 | " \n",
708 | " return:\n",
709 | " value: Text having root words only, no tense forms, no plural forms\n",
710 | " \n",
711 | " Example: \n",
712 | " Input : text reduced \n",
713 | " Output : text reduce\n",
714 | " \n",
715 | " \"\"\"\n",
716 | " # Converting words to their root forms\n",
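" # Note: the 'v' argument below asks WordNetLemmatizer to treat every token as a verb\n",
" # (e.g. 'reduced' -> 'reduce', as in the docstring example); tokens that are not verbs may come back unchanged.\n",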
717 | " lemma = [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(text)]\n",
718 | " return lemma\n"
719 | ]
720 | },
721 | {
722 | "cell_type": "markdown",
723 | "metadata": {},
724 | "source": [
725 | "### Step9: Putting it all in a single function"
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": null,
731 | "metadata": {},
732 | "outputs": [],
733 | "source": [
734 | "# Writing main function to merge all the preprocessing steps.\n",
735 | "def text_preprocessing(text, accented_chars=True, contractions=True, lemmatize=True,\n",
736 | " extra_whitespace=True, newlines_tabs=True, repetition=True, \n",
737 | " lowercase=True, punctuations=True, mis_spell=True,\n",
738 | " remove_html=True, links=True, special_chars=True,\n",
739 | " stop_words=True):\n",
740 | " \"\"\"\n",
741 | " This function will preprocess input text and return\n",
742 | " the clean text.\n",
743 | " \"\"\"\n",
744 | " Data = text\n",
745 | " if newlines_tabs == True: #remove newlines & tabs.\n",
746 | " Data = remove_newlines_tabs(Data)\n",
747 | " \n",
748 | " if remove_html == True: #remove html tags\n",
749 | " Data = strip_html_tags(Data)\n",
750 | " \n",
751 | " if links == True: #remove links\n",
752 | " Data = remove_links(Data)\n",
753 | " \n",
754 | " if extra_whitespace == True: #remove extra whitespaces\n",
755 | " Data = remove_whitespace(Data)\n",
756 | " \n",
757 | " if accented_chars == True: #remove accented characters\n",
758 | " Data = accented_characters_removal(Data)\n",
759 | " \n",
760 | " if lowercase == True: #convert all characters to lowercase\n",
761 | " Data = lower_casing_text(Data)\n",
762 | " \n",
763 | " if repetition == True: #Reduce repetitions \n",
764 | " Data = reducing_incorrect_character_repeatation(Data)\n",
765 | " \n",
766 | " if contractions == True: #expand contractions\n",
767 | " Data = expand_contractions(Data)\n",
768 | " \n",
769 | " if punctuations == True: #remove punctuations\n",
770 | " Data = removing_special_characters(Data)\n",
771 | " \n",
772 | " stoplist = stopwords.words('english') \n",
773 | " stoplist = set(stoplist)\n",
774 | " \n",
775 | " if stop_words == True: #Remove stopwords\n",
776 | " Data = removing_stopwords(Data)\n",
777 | " \n",
778 | " spell = Speller(lang='en')\n",
779 | " \n",
780 | " if mis_spell == True: #Check for mis-spelled words & correct them.\n",
781 | " Data = spelling_correction(Data)\n",
782 | " \n",
783 | " w_tokenizer = nltk.tokenize.WhitespaceTokenizer()\n",
784 | " lemmatizer = nltk.stem.WordNetLemmatizer()\n",
785 | " \n",
786 | " if lemmatize == True: #Converts words to lemma form.\n",
787 | " Data = lemmatization(Data)\n",
788 | " \n",
789 | " \n",
790 | " return Data"
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": null,
796 | "metadata": {},
797 | "outputs": [],
798 | "source": [
799 | "# Pre-processing for Content\n",
800 | "List_Content = Df['Content'].to_list()\n",
801 | "Final_Article = []\n",
802 | "Complete_Content = []\n",
803 | "for article in List_Content:\n",
804 | " Processed_Content = text_preprocessing(article) #Cleaned text of Content attribute after pre-processing\n",
805 | " Final_Article.append(Processed_Content)\n",
806 | "Complete_Content.extend(Final_Article)\n",
807 | "Df['Processed_Content'] = Complete_Content\n",
808 | "\n",
809 | "# Pre-processing for Title\n",
810 | "List_Title = Df['Title'].to_list()\n",
811 | "\n",
812 | "Final_Title = []\n",
813 | "Complete_Title = []\n",
814 | "for title in List_Title:\n",
815 | " Processed_Title = text_preprocessing(title) #Cleaned text of Title attribute after pre-processing\n",
816 | " Final_Title.append(Processed_Title)\n",
817 | "Complete_Title.extend(Final_Title)\n",
818 | "Df['Processed_Title'] = Complete_Title "
819 | ]
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": null,
824 | "metadata": {
825 | "scrolled": true
826 | },
827 | "outputs": [],
828 | "source": [
829 | "Df.head()"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": null,
835 | "metadata": {},
836 | "outputs": [],
837 | "source": [
838 | "Cleaned_Data = Df.to_csv('Cleaned_Data.csv', index = False)"
839 | ]
840 | }
841 | ],
842 | "metadata": {
843 | "kernelspec": {
844 | "display_name": "Python 3",
845 | "language": "python",
846 | "name": "python3"
847 | },
848 | "language_info": {
849 | "codemirror_mode": {
850 | "name": "ipython",
851 | "version": 3
852 | },
853 | "file_extension": ".py",
854 | "mimetype": "text/x-python",
855 | "name": "python",
856 | "nbconvert_exporter": "python",
857 | "pygments_lexer": "ipython3",
858 | "version": "3.7.3"
859 | }
860 | },
861 | "nbformat": 4,
862 | "nbformat_minor": 2
863 | }
864 | 
--------------------------------------------------------------------------------