├── .gitattributes ├── 9781484242667.jpg ├── Chapter 1.ipynb ├── Chapter 2.ipynb ├── Chapter 3.ipynb ├── Chapter 4.ipynb ├── Chapter 5.ipynb ├── Chapter 6.ipynb ├── Contributing.md ├── LICENSE.txt ├── README.md └── errata.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /9781484242667.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/natural-language-processing-recipes/347c7704865a83ad41786f96234fbe8bb7a5ec4b/9781484242667.jpg -------------------------------------------------------------------------------- /Chapter 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extracting the data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Recipe 1-1. Collecting Data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "ename": "ImportError", 26 | "evalue": "No module named 'tweepy'", 27 | "output_type": "error", 28 | "traceback": [ 29 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 30 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 31 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[1;32mimport\u001b[0m \u001b[0mtweepy\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 32 | "\u001b[0;31mImportError\u001b[0m: No module named 'tweepy'" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Install tweepy\n", 38 | "!pip install tweepy\n", 39 | "\n", 40 | "# Import the libraries\n", 41 | "\n", 42 | "import numpy as np\n", 43 | "import tweepy\n", 44 | "import json\n", 45 | "import pandas as pd\n", 46 | "from tweepy import OAuthHandler\n", 47 | "\n", 48 | "# credentials\n", 49 | "\n", 50 | "consumer_key = \"adjbiejfaaoeh\"\n", 51 | "consumer_secret = \"had73haf78af\"\n", 52 | "access_token = \"jnsfby5u4yuawhafjeh\"\n", 53 | "access_token_secret = \"jhdfgay768476r\"\n", 54 | "\n", 55 | "# calling API\n", 56 | "\n", 57 | "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", 58 | "auth.set_access_token(access_token, access_token_secret)\n", 59 | "api = tweepy.API(auth)\n", 60 | "\n", 61 | "# Provide the query you want to pull the data. For example, pulling data for the mobile phone ABC\n", 62 | "\n", 63 | "query =\"ABC\"\n", 64 | "\n", 65 | "# Fetching tweets\n", 66 | "\n", 67 | "Tweets = api.search(query, count = 10,lang='en',exclude='retweets',tweet_mode='extended')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "raw", 72 | "metadata": {}, 73 | "source": [ 74 | "Recipe 1-2. 
Collecting Data from PDFs" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "!pip install PyPDF2\n", 86 | "import PyPDF2\n", 87 | "from PyPDF2 import PdfFileReader\n", 88 | "\n", 89 | "#Creating a pdf file object\n", 90 | "\n", 91 | "pdf = open(\"file.pdf\",\"rb\")\n", 92 | "\n", 93 | "#creating pdf reader object\n", 94 | "\n", 95 | "pdf_reader = PyPDF2.PdfFileReader(pdf)\n", 96 | "\n", 97 | "#checking number of pages in a pdf file\n", 98 | "\n", 99 | "print(pdf_reader.numPages)\n", 100 | "\n", 101 | "#creating a page object\n", 102 | "\n", 103 | "page = pdf_reader.getPage(0)\n", 104 | "\n", 105 | "#finally extracting text from the page\n", 106 | "\n", 107 | "print(page.extractText())\n", 108 | "\n", 109 | "#closing the pdf file\n", 110 | "\n", 111 | "pdf.close()\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "raw", 116 | "metadata": {}, 117 | "source": [ 118 | "Recipe 1-3.Collecting Data from Word Files" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "#Install docx \n", 130 | "!pip install docx\n", 131 | "\n", 132 | "#Import library\n", 133 | "from docx import Document\n", 134 | "\n", 135 | "#Creating a word file object\n", 136 | "\n", 137 | "doc = open(\"file.docx\",\"rb\")\n", 138 | "\n", 139 | "#creating word reader object\n", 140 | "\n", 141 | "document = docx.Document(doc)\n", 142 | "\n", 143 | "# create an empty string and call this document. This document variable store each paragraph in the Word document.We then create a for loop that goes through each paragraph in the Word document and appends the paragraph.\n", 144 | "\n", 145 | "docu=\"\"\n", 146 | "for para in document.paragraphs:\n", 147 | " docu += para.text \n", 148 | "\n", 149 | "#to see the output call docu\n", 150 | "print(docu)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "raw", 155 | "metadata": {}, 156 | "source": [ 157 | "Recipe 1-4.Collecting Data from JSON " 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "import requests\n", 169 | "import json\n", 170 | "\n", 171 | "#json from \"https://quotes.rest/qod.json\"\n", 172 | "r = requests.get(\"https://quotes.rest/qod.json\")\n", 173 | "res = r.json()\n", 174 | "print(json.dumps(res, indent = 4)) \n", 175 | "\n", 176 | "#output\n", 177 | "{\n", 178 | " \"success\": {\n", 179 | " \"total\": 1\n", 180 | " },\n", 181 | " \"contents\": {\n", 182 | " \"quotes\": [\n", 183 | " {\n", 184 | " \"quote\": \"Where there is ruin, there is hope for a treasure.\",\n", 185 | " \"length\": \"50\",\n", 186 | " \"author\": \"Rumi\",\n", 187 | " \"tags\": [\n", 188 | " \"failure\",\n", 189 | " \"inspire\",\n", 190 | " \"learning-from-failure\"\n", 191 | " ],\n", 192 | " \"category\": \"inspire\",\n", 193 | " \"date\": \"2018-09-29\",\n", 194 | " \"permalink\": \"https://theysaidso.com/quote/dPKsui4sQnQqgMnXHLKtfweF/rumi-where-there-is-ruin-there-is-hope-for-a-treasure\",\n", 195 | " \"title\": \"Inspiring Quote of the day\",\n", 196 | " \"background\": \"https://theysaidso.com/img/bgs/man_on_the_mountain.jpg\",\n", 197 | " \"id\": \"dPKsui4sQnQqgMnXHLKtfweF\"\n", 198 | " }\n", 199 | " ],\n", 200 | " \"copyright\": \"2017-19 theysaidso.com\"\n", 201 | " }\n", 202 | "}\n", 203 | "\n", 204 | "#extract 
contents\n", 205 | "q = res['contents']['quotes'][0] \n", 206 | "q\n", 207 | "\n", 208 | "#output\n", 209 | "\n", 210 | "{'author': 'Rumi',\n", 211 | " 'background': 'https://theysaidso.com/img/bgs/man_on_the_mountain.jpg',\n", 212 | " 'category': 'inspire',\n", 213 | " 'date': '2018-09-29',\n", 214 | " 'id': 'dPKsui4sQnQqgMnXHLKtfweF',\n", 215 | " 'length': '50',\n", 216 | " 'permalink': 'https://theysaidso.com/quote/dPKsui4sQnQqgMnXHLKtfweF/rumi-where-there-is-ruin-there-is-hope-for-a-treasure',\n", 217 | " 'quote': 'Where there is ruin, there is hope for a treasure.',\n", 218 | " 'tags': ['failure', 'inspire', 'learning-from-failure'],\n", 219 | " 'title': 'Inspiring Quote of the day'}\n", 220 | "\n", 221 | "#extract only quote\n", 222 | "print(q['quote'], '\\n--', q['author'])\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "raw", 227 | "metadata": {}, 228 | "source": [ 229 | "Recipe 1-5. Collecting Data from HTML" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": true 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "!pip install bs4\n", 241 | "import urllib.request as urllib2 \n", 242 | "from bs4 import BeautifulSoup\n", 243 | "\n", 244 | "response = urllib2.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing')\n", 245 | "html_doc = response.read()\n", 246 | "\n", 247 | "#Parsing\n", 248 | "soup = BeautifulSoup(html_doc, 'html.parser')\n", 249 | "# Formating the parsed html file\n", 250 | "strhtm = soup.prettify()\n", 251 | "\n", 252 | "# Print few lines\n", 253 | "print (strhtm[:1000])\n", 254 | "\n", 255 | "print(soup.title)\n", 256 | "print(soup.title.string)\n", 257 | "print(soup.a.string)\n", 258 | "print(soup.b.string)\n", 259 | "\n", 260 | "for x in soup.find_all('a'): print(x.string)\n", 261 | " \n", 262 | "for x in soup.find_all('p'): print(x.text)\n", 263 | " \n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "raw", 268 | "metadata": {}, 269 | "source": [ 270 | "Recipe 1-6. 
Parsing Text using Regular Expressions" 271 | ] 272 | }, 273 | { 274 | "cell_type": "raw", 275 | "metadata": {}, 276 | "source": [ 277 | "6.A Problem" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# Import library\n", 289 | "\n", 290 | "import re\n", 291 | "\n", 292 | "#run the split query\n", 293 | "\n", 294 | "re.split('\\s+','I like this book.')\n" 295 | ] 296 | }, 297 | { 298 | "cell_type": "raw", 299 | "metadata": {}, 300 | "source": [ 301 | "6.B Problem" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "doc = \"For more details please mail us at: xyz@abc.com, pqr@mno.com\"\n", 313 | "\n", 314 | "addresses = re.findall(r'[\\w\\.-]+@[\\w\\.-]+', doc)\n", 315 | "for address in addresses: \n", 316 | " print(address)\n" 317 | ] 318 | }, 319 | { 320 | "cell_type": "raw", 321 | "metadata": {}, 322 | "source": [ 323 | "6.C Problem" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "doc = \"For more details please mail us at xyz@abc.com\"\n", 335 | "\n", 336 | "new_email_address = re.sub(r'([\\w\\.-]+)@([\\w\\.-]+)', r'pqr@mno.com', doc)\n", 337 | "print(new_email_address)\n" 338 | ] 339 | }, 340 | { 341 | "cell_type": "raw", 342 | "metadata": {}, 343 | "source": [ 344 | "6.D Problem" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "# Import library\n", 356 | "\n", 357 | "import re\n", 358 | "import requests\n", 359 | "\n", 360 | "#url you want to extract\n", 361 | "url = 'https://www.gutenberg.org/files/2638/2638-0.txt'\n", 362 | "\n", 363 | "#function to extract\n", 364 | "def get_book(url):\n", 365 | " # Sends a http request to get the text from project Gutenberg\n", 366 | " raw = requests.get(url).text\n", 367 | " # Discards the metadata from the beginning of the book\n", 368 | " start = re.search(r\"\\*\\*\\* START OF THIS PROJECT GUTENBERG EBOOK .* \\*\\*\\*\",raw ).end()\n", 369 | " # Discards the metadata from the end of the book\n", 370 | " stop = re.search(r\"II\", raw).start()\n", 371 | " # Keeps the relevant text\n", 372 | " text = raw[start:stop]\n", 373 | " return text\n", 374 | "\n", 375 | "\n", 376 | "# processing\n", 377 | "def preprocess(sentence): \n", 378 | " return re.sub('[^A-Za-z0-9.]+' , ' ', sentence).lower()\n", 379 | "\n", 380 | "\n", 381 | "#calling the above function\n", 382 | "\n", 383 | "book = get_book(url)\n", 384 | "processed_book = preprocess(book)\n", 385 | "print(processed_book)\n", 386 | "\n", 387 | "# Count number of times \"the\" is appeared in the book\n", 388 | "len(re.findall(r'the', processed_book))\n", 389 | "\n", 390 | "#Replace \"i\" with \"I\"\n", 391 | "processed_book = re.sub(r'\\si\\s', \" I \", processed_book)\n", 392 | "print(processed_book)\n", 393 | "\n", 394 | "#find all occurance of text in the format \"abc--xyz\"\n", 395 | "re.findall(r'[a-zA-Z0-9]*--[a-zA-Z0-9]*', book)\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "raw", 400 | "metadata": {}, 401 | "source": [ 402 | "Recipe 1-7. 
Handling Strings" 403 | ] 404 | }, 405 | { 406 | "cell_type": "raw", 407 | "metadata": {}, 408 | "source": [ 409 | "7.A Problem" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "String_v1 = \"I am exploring NLP\"\n", 421 | "\n", 422 | "#To extract particular character or range of characters from string\n", 423 | "\n", 424 | "print(String_v1[0])\n", 425 | "\n", 426 | "#output\n", 427 | "“I”\n", 428 | "\n", 429 | "#To extract exploring\n", 430 | "\n", 431 | "print(String_v1[5:14])\n", 432 | "\n", 433 | "String_v2 = String_v1.replace(\"exploring\", \"learning\")\n", 434 | "print(String_v2)\n", 435 | "\n" 436 | ] 437 | }, 438 | { 439 | "cell_type": "raw", 440 | "metadata": {}, 441 | "source": [ 442 | "7.B Problem" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "collapsed": true 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "s1 = \"nlp\"\n", 454 | "s2 = \"machine learning\"\n", 455 | "s3 = s1+s2\n", 456 | "print(s3)\n" 457 | ] 458 | }, 459 | { 460 | "cell_type": "raw", 461 | "metadata": {}, 462 | "source": [ 463 | "7.C Problem" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "#use find function to fetch the starting index value of the sub string in whole string.\n", 475 | "\n", 476 | "var=\"I am learning NLP\"\n", 477 | "f= \"learn\"\n", 478 | "var.find(f)\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "raw", 483 | "metadata": {}, 484 | "source": [ 485 | "Recipe 1-8" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 5, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "Collecting bs4\n" 500 | ] 501 | }, 502 | { 503 | "name": "stderr", 504 | "output_type": "stream", 505 | "text": [ 506 | " Retrying (Retry(total=4, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno 11001] getaddrinfo failed',)': /simple/bs4/\n", 507 | " Retrying (Retry(total=3, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno 11001] getaddrinfo failed',)': /simple/bs4/\n", 508 | " Retrying (Retry(total=2, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno 11001] getaddrinfo failed',)': /simple/bs4/\n", 509 | " Retrying (Retry(total=1, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno 11001] getaddrinfo failed',)': /simple/bs4/\n", 510 | " Retrying (Retry(total=0, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError(': Failed to establish a new connection: [Errno 11001] getaddrinfo failed',)': /simple/bs4/\n", 511 | " Could not find a version that satisfies the requirement bs4 (from versions: )\n", 512 | "No matching distribution found for bs4\n" 513 | ] 514 | }, 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Requirement already satisfied (use --upgrade to upgrade): requests in 
c:\\users\\adarsha.shivananda\\appdata\\local\\continuum\\anaconda3\\lib\\site-packages\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "!pip install bs4\n", 525 | "!pip install requests\n", 526 | "\n", 527 | "from bs4 import BeautifulSoup\n", 528 | "import requests\n", 529 | "import pandas as pd\n", 530 | "from pandas import Series, DataFrame\n", 531 | "from ipywidgets import FloatProgress\n", 532 | "from time import sleep\n", 533 | "from IPython.display import display\n", 534 | "import re\n", 535 | "import pickle\n", 536 | "\n", 537 | "url = 'http://www.imdb.com/chart/top?ref_=nv_mv_250_6'\n", 538 | "\n", 539 | "result = requests.get(url)\n", 540 | "c = result.content\n", 541 | "soup = BeautifulSoup(c,\"lxml\")\n", 542 | "\n", 543 | "summary = soup.find('div',{'class':'article'})\n", 544 | "\n", 545 | "# Create empty lists to append the extracted data.\n", 546 | "\n", 547 | "moviename = []\n", 548 | "cast = []\n", 549 | "description = []\n", 550 | "rating = []\n", 551 | "ratingoutof = []\n", 552 | "year = []\n", 553 | "genre = []\n", 554 | "movielength = []\n", 555 | "rot_audscore = []\n", 556 | "rot_avgrating = []\n", 557 | "rot_users = []\n", 558 | "\n", 559 | "# Extracting the required data from the html soup.\n", 560 | "\n", 561 | "rgx = re.compile('[%s]' % '()')\n", 562 | "f = FloatProgress(min=0, max=250)\n", 563 | "display(f)\n", 564 | "for row,i in zip(summary.find('table').findAll('tr'),range(len(summary.find('table').findAll('tr')))):\n", 565 | " for sitem in row.findAll('span',{'class':'secondaryInfo'}):\n", 566 | " s = sitem.find(text=True)\n", 567 | " year.append(rgx.sub('', s))\n", 568 | " for ritem in row.findAll('td',{'class':'ratingColumn imdbRating'}):\n", 569 | " for iget in ritem.findAll('strong'):\n", 570 | " rating.append(iget.find(text=True))\n", 571 | " ratingoutof.append(iget.get('title').split(' ', 4)[3])\n", 572 | " for item in row.findAll('td',{'class':'titleColumn'}):\n", 573 | " for href in item.findAll('a',href=True):\n", 574 | " moviename.append(href.find(text=True))\n", 575 | " rurl = 'https://www.rottentomatoes.com/m/'+ href.find(text=True)\n", 576 | " try:\n", 577 | " rresult = requests.get(rurl)\n", 578 | " except requests.exceptions.ConnectionError:\n", 579 | " status_code = \"Connection refused\"\n", 580 | " rc = rresult.content\n", 581 | " rsoup = BeautifulSoup(rc)\n", 582 | " try:\n", 583 | " rot_audscore.append(rsoup.find('div',{'class':'meter-value'}).find('span',{'class':'superPageFontColor'}).text)\n", 584 | " rot_avgrating.append(rsoup.find('div',{'class':'audience-info hidden-xs superPageFontColor'}).find('div').contents[2].strip())\n", 585 | " rot_users.append(rsoup.find('div',{'class':'audience-info hidden-xs superPageFontColor'}).contents[3].contents[2].strip())\n", 586 | " except AttributeError:\n", 587 | " rot_audscore.append(\"\")\n", 588 | " rot_avgrating.append(\"\")\n", 589 | " rot_users.append(\"\")\n", 590 | " cast.append(href.get('title'))\n", 591 | " imdb = \"http://www.imdb.com\" + href.get('href')\n", 592 | " try:\n", 593 | " iresult = requests.get(imdb)\n", 594 | " ic = iresult.content\n", 595 | " isoup = BeautifulSoup(ic)\n", 596 | " description.append(isoup.find('div',{'class':'summary_text'}).find(text=True).strip())\n", 597 | " genre.append(isoup.find('span',{'class':'itemprop'}).find(text=True))\n", 598 | " movielength.append(isoup.find('time',{'itemprop':'duration'}).find(text=True).strip())\n", 599 | " except requests.exceptions.ConnectionError:\n", 600 | " description.append(\"\")\n", 601 | " genre.append(\"\")\n", 
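"                        # Editorial note: this handler only catches ConnectionError. If the IMDb page loads\n",
"                        # but its markup has changed, isoup.find(...) returns None and the chained\n",
"                        # .find()/.strip() calls in the try block raise an AttributeError that is not caught\n",
"                        # here (the Rotten Tomatoes block above does catch AttributeError for that reason).\n",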
602 | " movielength.append(\"\")\n", 603 | " sleep(.1)\n", 604 | " f.value = i\n", 605 | "\n", 606 | " # List to pandas series\n", 607 | "\n", 608 | "moviename = Series(moviename)\n", 609 | "cast = Series(cast)\n", 610 | "description = Series(description)\n", 611 | "rating = Series(rating)\n", 612 | "ratingoutof = Series(ratingoutof)\n", 613 | "year = Series(year)\n", 614 | "genre = Series(genre)\n", 615 | "movielength = Series(movielength) \n", 616 | "rot_audscore = Series(rot_audscore)\n", 617 | "rot_avgrating = Series(rot_avgrating)\n", 618 | "rot_users = Series(rot_users)\n", 619 | "\n", 620 | "# creating dataframe and doing analysis\n", 621 | "\n", 622 | "imdb_df = pd.concat([moviename,year,description,genre,movielength,cast,rating,ratingoutof,rot_audscore,rot_avgrating,rot_users],axis=1)\n", 623 | "imdb_df.columns = ['moviename','year','description','genre','movielength','cast','imdb_rating','imdb_ratingbasedon','tomatoes_audscore','tomatoes_rating','tomatoes_ratingbasedon']\n", 624 | "imdb_df['rank'] = imdb_df.index + 1\n", 625 | "imdb_df.head(1)\n", 626 | "\n", 627 | "# Saving the file as CSV.\n", 628 | "\n", 629 | "imdb_df.to_csv(\"imdbdataexport.csv\")\n" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": { 636 | "collapsed": true 637 | }, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": { 654 | "collapsed": true 655 | }, 656 | "outputs": [], 657 | "source": [] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": { 663 | "collapsed": true 664 | }, 665 | "outputs": [], 666 | "source": [] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": { 672 | "collapsed": true 673 | }, 674 | "outputs": [], 675 | "source": [] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": { 681 | "collapsed": true 682 | }, 683 | "outputs": [], 684 | "source": [] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "collapsed": true 691 | }, 692 | "outputs": [], 693 | "source": [] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": { 699 | "collapsed": true 700 | }, 701 | "outputs": [], 702 | "source": [] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": { 708 | "collapsed": true 709 | }, 710 | "outputs": [], 711 | "source": [] 712 | } 713 | ], 714 | "metadata": { 715 | "anaconda-cloud": {}, 716 | "kernelspec": { 717 | "display_name": "Python [conda root]", 718 | "language": "python", 719 | "name": "conda-root-py" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.5.2" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 1 736 | } 737 | -------------------------------------------------------------------------------- /Chapter 2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 
Exploring and processing text data " 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Recipe 2-1. Converting Text Data to Lowercase" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "text=['This is introduction to NLP','It is likely to be useful, to people ','Machine learning is the new electrcity','There would be less hype around AI and more action going forward','python is the best tool!','R is good langauage','I like this book','I want more books like this']\n", 26 | "\n", 27 | "#convert list to data frame\n", 28 | "import pandas as pd\n", 29 | "df = pd.DataFrame({'tweet':text})\n", 30 | "print(df)\n", 31 | "\n", 32 | "x = 'Testing'\n", 33 | "x2 = x.lower()\n", 34 | "print(x2)\n", 35 | "\n", 36 | "\n", 37 | "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(x.lower() for x in x.split()))\n", 38 | "df['tweet']\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "raw", 43 | "metadata": {}, 44 | "source": [ 45 | "Recipe 2-2. Removing Punctuation" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "text=['This is introduction to NLP','It is likely to be useful, to people ','Machine learning is the new electrcity','There would be less hype around AI and more action going forward','python is the best tool!','R is good langauage','I like this book','I want more books like this']\n", 57 | "#convert list to dataframe\n", 58 | "import pandas as pd\n", 59 | "df = pd.DataFrame({'tweet':text})\n", 60 | "print(df)\n", 61 | "\n", 62 | "import re\n", 63 | "\n", 64 | "s = \"I. like. This book!\"\n", 65 | "s1 = re.sub(r'[^\\w\\s]','',s)\n", 66 | "s1\n", 67 | "\n", 68 | "df['tweet'] = df['tweet'].str.replace('[^\\w\\s]','')\n", 69 | "df['tweet']\n", 70 | "\n", 71 | "import string\n", 72 | "\n", 73 | "s = \"I. like. This book!\"\n", 74 | "\n", 75 | "for c in string.punctuation:\n", 76 | "s= s.replace(c,\"\")\n", 77 | "s\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "raw", 82 | "metadata": {}, 83 | "source": [ 84 | "Recipe 2-3. Removing Stopwords" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "text=['This is introduction to NLP','It is likely to be useful, to people ','Machine learning is the new electrcity','There would be less hype around AI and more action going forward','python is the best tool!','R is good langauage','I like this book','I want more books like this']\n", 96 | "\n", 97 | "#convert list to data frame\n", 98 | "import pandas as pd\n", 99 | "df = pd.DataFrame({'tweet':text})\n", 100 | "print(df)\n", 101 | "\n", 102 | "\n", 103 | "#install and import libraries\n", 104 | "\n", 105 | "!pip install nltk\n", 106 | "import nltk\n", 107 | "nltk.download()\n", 108 | "from nltk.corpus import stopwords\n", 109 | "\n", 110 | "#remove stop words\n", 111 | "\n", 112 | "stop = stopwords.words('english')\n", 113 | "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(x for x in x.split() if x not in stop))\n", 114 | "df['tweet']\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "raw", 119 | "metadata": {}, 120 | "source": [ 121 | "Recipe 2-4. 
Standardizing Text" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "lookup_dict = {'nlp':'natural language processing', 'ur':'your', \"wbu\" : \"what about you\"}\n", 133 | "\n", 134 | "import re\n", 135 | "\n", 136 | "def text_std(input_text):\n", 137 | " words = input_text.split()\n", 138 | " new_words = []\n", 139 | " for word in words:\n", 140 | " word = re.sub(r'[^\\w\\s]','',word)\n", 141 | " if word.lower() in lookup_dict:\n", 142 | " word = lookup_dict[word.lower()]\n", 143 | " new_words.append(word)\n", 144 | " new_text = \" \".join(new_words)\n", 145 | " return new_text\n", 146 | "\n", 147 | "text_std(\"I like nlp it's ur choice\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "raw", 152 | "metadata": {}, 153 | "source": [ 154 | "Recipe 2-5. Correcting Spelling" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "text=['Introduction to NLP','It is likely to be useful, to people ','Machine learning is the new electrcity','R is good langauage','I like this book','I want more books like this']\n", 166 | "\n", 167 | "\n", 168 | "#convert list to dataframe\n", 169 | "import pandas as pd\n", 170 | "df = pd.DataFrame({'tweet':text})\n", 171 | "print(df)\n", 172 | "\n", 173 | "#Install textblob library\n", 174 | "!pip install textblob\n", 175 | "\n", 176 | "#import libraries and use 'correct' function \n", 177 | "\n", 178 | "from textblob import TextBlob\n", 179 | "\n", 180 | "df['tweet'].apply(lambda x: str(TextBlob(x).correct()))\n", 181 | "\n", 182 | "#You can also use autocorrect library as shown below\n", 183 | "\n", 184 | "#install autocorrect\n", 185 | "\n", 186 | "!pip install autocorrect\n", 187 | "\n", 188 | "from autocorrect import spell\n", 189 | "print(spell(u'mussage'))\n", 190 | "print(spell(u'sirvice'))\n", 191 | "\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "raw", 196 | "metadata": {}, 197 | "source": [ 198 | "Recipe 2-6. Tokenizing Text" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "#Let's create a list of strings and assign it to a variable. 
\n", 210 | "text=['This is introduction to NLP','It is likely to be useful, to people ','Machine learning is the new electrcity','There would be less hype around AI and more action going forward','python is the best tool!','R is good langauage','I like this book','I want more books like this']\n", 211 | "\n", 212 | "#convert list to dataframe\n", 213 | "import pandas as pd\n", 214 | "df = pd.DataFrame({'tweet':text})\n", 215 | "print(df)\n", 216 | "\n", 217 | "#Using textblob\n", 218 | "from textblob import TextBlob\n", 219 | "TextBlob(df['tweet'][3]).words\n", 220 | "\n", 221 | "#output\n", 222 | "WordList(['would', 'less', 'hype', 'around', 'ai', 'action', 'going', 'forward'])\n", 223 | "\n", 224 | "#using NLTK\n", 225 | "import nltk\n", 226 | "\n", 227 | "#create data\n", 228 | "mystring = \"My favorite animal is cat\"\n", 229 | "\n", 230 | "nltk.word_tokenize(mystring)\n", 231 | "\n", 232 | "#output\n", 233 | "['My', 'favorite', 'animal', 'is', 'cat']\n", 234 | "\n", 235 | "#using split function from python\n", 236 | "mystring.split()\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "raw", 241 | "metadata": {}, 242 | "source": [ 243 | "Recipe 2-7. Stemming" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "text=['I like fishing','I eat fish','There are many fishes in pound']\n", 255 | "\n", 256 | "#convert list to dataframe\n", 257 | "import pandas as pd\n", 258 | "df = pd.DataFrame({'tweet':text})\n", 259 | "print(df)\n", 260 | "\n", 261 | "#Import library\n", 262 | "from nltk.stem import PorterStemmer\n", 263 | "\n", 264 | "st = PorterStemmer()\n", 265 | "\n", 266 | "df['tweet'][:5].apply(lambda x: \" \".join([st.stem(word) for word in x.split()]))\n" 267 | ] 268 | }, 269 | { 270 | "cell_type": "raw", 271 | "metadata": {}, 272 | "source": [ 273 | "Recipe 2-8. Lemmatizing" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "text=['I like fishing','I eat fish','There are many fishes in pound', 'leaves and leaf']\n", 285 | "\n", 286 | "#convert list to dataframe\n", 287 | "import pandas as pd\n", 288 | "df = pd.DataFrame({'tweet':text})\n", 289 | "\n", 290 | "print(df)\n", 291 | "\n", 292 | "#Import library\n", 293 | "from textblob import Word\n", 294 | "\n", 295 | "#Code for lemmatize\n", 296 | "df['tweet'] = df['tweet'].apply(lambda x: \" \".join([Word(word).lemmatize() for word in x.split()]))\n", 297 | "\n", 298 | "df['tweet']\n" 299 | ] 300 | }, 301 | { 302 | "cell_type": "raw", 303 | "metadata": {}, 304 | "source": [ 305 | "Recipe 2-9. 
Exploring Text Data" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "#Execute below code to download dataset, if you haven’t already nltk.download().\n", 317 | "\n", 318 | "#Importing data\n", 319 | "import nltk\n", 320 | "from nltk.corpus import webtext\n", 321 | "nltk.download('webtext')\n", 322 | "wt_sentences = webtext.sents('firefox.txt')\n", 323 | "wt_words = webtext.words('firefox.txt')\n", 324 | "\n", 325 | "# Import Library for computing frequency \n", 326 | "from nltk.probability import FreqDist\n", 327 | "from nltk.corpus import stopwords\n", 328 | "import string\n", 329 | "\n", 330 | "\n", 331 | "# Count the number of words \n", 332 | "\n", 333 | "len(wt_sentences)\n", 334 | "len(wt_words)\n", 335 | "\n", 336 | "frequency_dist = nltk.FreqDist(wt_words) \n", 337 | "frequency_dist\n", 338 | "\n", 339 | "sorted_frequency_dist =sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)\n", 340 | "sorted_frequency_dist\n", 341 | "\n", 342 | "large_words = dict([(k,v) for k,v in frequency_dist.items() if len(k)>3])\n", 343 | "\n", 344 | "frequency_dist = nltk.FreqDist(large_words)\n", 345 | "frequency_dist.plot(50,cumulative=False)\n", 346 | "\n", 347 | "#install library\n", 348 | "!pip install wordcloud\n", 349 | "\n", 350 | "#build wordcloud\n", 351 | "\n", 352 | "from wordcloud import WordCloud\n", 353 | "wcloud = WordCloud().generate_from_frequencies(frequency_dist)\n", 354 | "\n", 355 | "#plotting the wordcloud\n", 356 | "\n", 357 | "import matplotlib.pyplot as plt\n", 358 | "plt.imshow(wcloud, interpolation='bilinear')\n", 359 | "plt.axis(\"off\")\n", 360 | "(-0.5, 399.5, 199.5, -0.5)\n", 361 | "plt.show()\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "raw", 367 | "metadata": {}, 368 | "source": [ 369 | "Recipe 2-10" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "tweet_sample= \"How to take control of your #debt https://personal.vanguard.com/us/insights/saving-investing/debt-management.#Best advice for #family #financial #success (@PrepareToWin)\"\n", 381 | "\n", 382 | "def processRow(row):\n", 383 | " \n", 384 | " import re\n", 385 | " import nltk\n", 386 | " from textblob import TextBlob\n", 387 | " from nltk.corpus import stopwords\n", 388 | " from nltk.stem import PorterStemmer\n", 389 | " from textblob import Word\n", 390 | " from nltk.util import ngrams\n", 391 | " import re\n", 392 | " from wordcloud import WordCloud, STOPWORDS\n", 393 | " from nltk.tokenize import word_tokenize\n", 394 | " \n", 395 | " tweet = row\n", 396 | " #Lower case\n", 397 | " tweet.lower()\n", 398 | " #Removes unicode strings like \"\\u002c\" and \"x96\" \n", 399 | " tweet = re.sub(r'(\\\\u[0-9A-Fa-f]+)',r'', tweet) \n", 400 | " tweet = re.sub(r'[^\\x00-\\x7f]',r'',tweet)\n", 401 | " #convert any url to URL\n", 402 | " tweet = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))','URL',tweet)\n", 403 | " #Convert any @Username to \"AT_USER\"\n", 404 | " tweet = re.sub('@[^\\s]+','AT_USER',tweet)\n", 405 | " #Remove additional white spaces\n", 406 | " tweet = re.sub('[\\s]+', ' ', tweet)\n", 407 | " tweet = re.sub('[\\n]+', ' ', tweet)\n", 408 | " #Remove not alphanumeric symbols white spaces\n", 409 | " tweet = re.sub(r'[^\\w]', ' ', tweet)\n", 410 | " #Removes hastag in front of a word \"\"\"\n", 411 | " tweet = 
re.sub(r'#([^\\s]+)', r'\\1', tweet)\n", 412 | " #Replace #word with word\n", 413 | " tweet = re.sub(r'#([^\\s]+)', r'\\1', tweet)\n", 414 | " #Remove :( or :)\n", 415 | " tweet = tweet.replace(':)','')\n", 416 | " tweet = tweet.replace(':(','')\n", 417 | " #remove numbers\n", 418 | " tweet = ''.join([i for i in tweet if not i.isdigit()]) \n", 419 | " #remove multiple exclamation\n", 420 | " tweet = re.sub(r\"(\\!)\\1+\", ' ', tweet)\n", 421 | " #remove multiple question marks\n", 422 | " tweet = re.sub(r\"(\\?)\\1+\", ' ', tweet)\n", 423 | " #remove multistop\n", 424 | " tweet = re.sub(r\"(\\.)\\1+\", ' ', tweet)\n", 425 | " #lemma\n", 426 | " from textblob import Word\n", 427 | " tweet =\" \".join([Word(word).lemmatize() for word in tweet.split()])\n", 428 | " #stemmer\n", 429 | " #st = PorterStemmer()\n", 430 | " #tweet=\" \".join([st.stem(word) for word in tweet.split()])\n", 431 | " #Removes emoticons from text \n", 432 | " tweet = re.sub(':\\)|;\\)|:-\\)|\\(-:|:-D|=D|:P|xD|X-p|\\^\\^|:-*|\\^\\.\\^|\\^\\-\\^|\\^\\_\\^|\\,-\\)|\\)-:|:\\'\\(|:\\(|:-\\(|:\\S|T\\.T|\\.\\_\\.|:<|:-\\S|:-<|\\*\\-\\*|:O|=O|=\\-O|O\\.o|XO|O\\_O|:-\\@|=/|:/|X\\-\\(|>\\.<|>=\\(|D:', '', tweet)\n", 433 | " #trim\n", 434 | " tweet = tweet.strip('\\'\"')\n", 435 | "\n", 436 | " row = tweet\n", 437 | "\n", 438 | " return row\n", 439 | "\n", 440 | "#call the function with your data\n", 441 | "processRow(tweet_sample)\n" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "collapsed": true 449 | }, 450 | "outputs": [], 451 | "source": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": true 458 | }, 459 | "outputs": [], 460 | "source": [] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python [conda root]", 466 | "language": "python", 467 | "name": "conda-root-py" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.5.2" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 1 484 | } 485 | -------------------------------------------------------------------------------- /Chapter 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Converting text to features" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Recipe 3-1. Converting Text to Features Using One Hot Encoding" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# Importing the library \n", 26 | "\n", 27 | "import pandas as pd\n", 28 | "\n", 29 | "# Generating the features\n", 30 | "\n", 31 | "pd.get_dummies(Text.split())\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "raw", 37 | "metadata": {}, 38 | "source": [ 39 | "Recipe 3-2. 
Converting Text to Features Using Count Vectorizing" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "#importing the function \n", 51 | "\n", 52 | "from sklearn.feature_extraction.text import CountVectorizer\n", 53 | "\n", 54 | "# Text\n", 55 | "\n", 56 | "text = [\"I love NLP and I will learn NLP in 2month \"]\n", 57 | "\n", 58 | "# create the transform\n", 59 | "\n", 60 | "vectorizer = CountVectorizer()\n", 61 | "\n", 62 | "# tokenizing\n", 63 | "\n", 64 | "vectorizer.fit(text)\n", 65 | "\n", 66 | "\n", 67 | "# encode document\n", 68 | "\n", 69 | "vector = vectorizer.transform(text)\n", 70 | "\n", 71 | "# summarize & generating output\n", 72 | "\n", 73 | "print(vectorizer.vocabulary_)\n", 74 | "print(vector.toarray())\n", 75 | "\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "raw", 80 | "metadata": {}, 81 | "source": [ 82 | "Recipe 3-3. Generating N-grams" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "Text = \"I am learning NLP\"\n", 94 | "\n", 95 | "#Import textblob\n", 96 | "from textblob import TextBlob\n", 97 | "\n", 98 | "#For unigram : Use n = 1\n", 99 | "\n", 100 | "TextBlob(Text).ngrams(1)\n", 101 | "\n", 102 | "#For Bigram : For bigrams, use n = 2 \n", 103 | "\n", 104 | "TextBlob(Text).ngrams(2)\n", 105 | "\n", 106 | "#importing the function \n", 107 | "\n", 108 | "from sklearn.feature_extraction.text import CountVectorizer\n", 109 | "\n", 110 | "# Text\n", 111 | "\n", 112 | "text = [\"I love NLP and I will learn NLP in 2month \"]\n", 113 | "\n", 114 | "# create the transform\n", 115 | "\n", 116 | "vectorizer = CountVectorizer(ngram_range=(2,2))\n", 117 | "\n", 118 | "# tokenizing\n", 119 | "\n", 120 | "vectorizer.fit(text)\n", 121 | "\n", 122 | "# encode document\n", 123 | "\n", 124 | "\n", 125 | "vector = vectorizer.transform(text)\n", 126 | "\n", 127 | "# summarize & generating output\n", 128 | "print(vectorizer.vocabulary_)\n", 129 | "print(vector.toarray())\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "raw", 134 | "metadata": {}, 135 | "source": [ 136 | "Recipe 3-4. 
" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "import numpy as np \n", 148 | "import nltk\n", 149 | "from nltk import bigrams \n", 150 | "import itertools\n", 151 | "\n", 152 | "def co_occurrence_matrix(corpus):\n", 153 | " vocab = set(corpus)\n", 154 | " vocab = list(vocab)\n", 155 | " vocab_to_index = { word:i for i, word in enumerate(vocab) }\n", 156 | " # Create bigrams from all words in corpus\n", 157 | " bi_grams = list(bigrams(corpus))\n", 158 | " # Frequency distribution of bigrams ((word1, word2), num_occurrences)\n", 159 | " bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))\n", 160 | " # Initialise co-occurrence matrix\n", 161 | " # co_occurrence_matrix[current][previous]\n", 162 | " co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))\n", 163 | "\n", 164 | " # Loop through the bigrams taking the current and previous word,\n", 165 | " # and the number of occurrences of the bigram.\n", 166 | " for bigram in bigram_freq:\n", 167 | " current = bigram[0][1]\n", 168 | " previous = bigram[0][0]\n", 169 | " count = bigram[1]\n", 170 | " pos_current = vocab_to_index[current]\n", 171 | " pos_previous = vocab_to_index[previous]\n", 172 | " co_occurrence_matrix[pos_current][pos_previous] = count \n", 173 | " co_occurrence_matrix = np.matrix(co_occurrence_matrix)\n", 174 | " # return the matrix and the index\n", 175 | " return co_occurrence_matrix,vocab_to_index\n", 176 | "\n", 177 | "# sentences for testing\n", 178 | "\n", 179 | "sentences = [['I', 'love', 'nlp'],\n", 180 | "\t\t\t['I', 'love','to' 'learn'],\n", 181 | "\t\t\t['nlp', 'is', 'future'],\n", 182 | "\t\t\t['nlp', 'is', 'cool']]\n", 183 | "\n", 184 | "# create one list using many lists\n", 185 | "\n", 186 | "merged = list(itertools.chain.from_iterable(sentences))\n", 187 | "matrix = co_occurrence_matrix(merged)\n", 188 | "\n", 189 | "# generate the matrix\n", 190 | "\n", 191 | "CoMatrixFinal = pd.DataFrame(matrix[0], index=vocab_to_index, columns=vocab_to_index)\n", 192 | "print(CoMatrixFinal)\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "raw", 197 | "metadata": {}, 198 | "source": [ 199 | "Recipe 3-5. " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "# list of text documents\n", 211 | "text = [\"The quick brown fox jumped over the lazy dog.\"]\n", 212 | "\n", 213 | "# Transform\n", 214 | "vectorizer = HashingVectorizer(n_features=10)\n", 215 | "\n", 216 | "# create the hashing vector\n", 217 | "vector = vectorizer.transform(text)\n", 218 | "\n", 219 | "# summarize the vector\n", 220 | "print(vector.shape)\n", 221 | "print(vector.toarray()) \n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "raw", 226 | "metadata": {}, 227 | "source": [ 228 | "Recipe 3-6. Converting Text to Features Using TF-IDF" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 1, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "{'quick': 6, 'jumped': 3, 'the': 7, 'over': 5, 'brown': 0, 'fox': 2, 'dog': 1, 'lazy': 4}\n", 243 | "[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718\n", 244 | " 1.69314718 1. 
]\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "Text = [\"The quick brown fox jumped over the lazy dog.\",\n", 250 | "\"The dog.\",\n", 251 | "\"The fox\"]\n", 252 | "\n", 253 | "#Import TfidfVectorizer\n", 254 | "\n", 255 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 256 | "\n", 257 | "#Create the transform\n", 258 | "\n", 259 | "vectorizer = TfidfVectorizer()\n", 260 | "\n", 261 | "#Tokenize and build vocab\n", 262 | "\n", 263 | "vectorizer.fit(Text)\n", 264 | "\n", 265 | "#Summarize\n", 266 | "\n", 267 | "print(vectorizer.vocabulary_)\n", 268 | "print(vectorizer.idf_)\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "raw", 273 | "metadata": {}, 274 | "source": [ 275 | "Recipe 3-7. Implementing Word Embeddings" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "sentences = [['I', 'love', 'nlp'],\n", 287 | "\t\t\t['I', 'will', 'learn', 'nlp', 'in', '2','months'],\n", 288 | "\t\t\t['nlp', 'is', 'future'],\n", 289 | "\t\t\t['nlp', 'saves', 'time', 'and', 'solves', 'lot', 'of', 'industry', 'problems'],\n", 290 | "\t\t\t['nlp', 'uses', 'machine', 'learning']]\n", 291 | "\n", 292 | "\n", 293 | "#import library \n", 294 | "\n", 295 | "!pip install gensim\n", 296 | "\n", 297 | "import gensim\n", 298 | "from gensim.models import Word2Vec\n", 299 | "from sklearn.decomposition import PCA\n", 300 | "from matplotlib import pyplot\n", 301 | "\n", 302 | "# training the model\n", 303 | "\n", 304 | "skipgram = Word2Vec(sentences, size =50, window = 3, min_count=1,sg = 1)\n", 305 | "print(skipgram)\n", 306 | "\n", 307 | "# access vector for one word\n", 308 | "\n", 309 | "print(skipgram['nlp'])\n" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# access vector for another one word\n", 321 | "\n", 322 | "print(fast['deep'])\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "# save model\n", 334 | "\n", 335 | "skipgram.save('skipgram.bin')\n", 336 | "\n", 337 | "# load model\n", 338 | "\n", 339 | "skipgram = Word2Vec.load('skipgram.bin')\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "# T – SNE plot \n", 351 | "\n", 352 | "X = skipgram[skipgram.wv.vocab]\n", 353 | "pca = PCA(n_components=2)\n", 354 | "result = pca.fit_transform(X)\n", 355 | "\n", 356 | "# create a scatter plot of the projection\n", 357 | "\n", 358 | "pyplot.scatter(result[:, 0], result[:, 1])\n", 359 | "words = list(skipgram.wv.vocab)\n", 360 | "for i, word in enumerate(words):\n", 361 | "\tpyplot.annotate(word, xy=(result[i, 0], result[i, 1]))\n", 362 | "pyplot.show() \n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": true 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "## Continuous Bag of Words (CBOW)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "#import library \n", 385 | "\n", 386 | "from gensim.models import Word2Vec\n", 387 | "from sklearn.decomposition import PCA\n", 
388 | "from matplotlib import pyplot\n", 389 | "\n", 390 | "#Example sentences\n", 391 | "\n", 392 | "sentences = [['I', 'love', 'nlp'],\n", 393 | "\t\t\t['I', 'will', 'learn', 'nlp', 'in', '2','months'],\n", 394 | "\t\t\t['nlp', 'is', 'future'],\n", 395 | "\t\t\t['nlp', 'saves', 'time', 'and', 'solves', 'lot', 'of', 'industry', 'problems'],\n", 396 | "\t\t\t['nlp', 'uses', 'machine', 'learning']]\n", 397 | "\n", 398 | "\n", 399 | "# training the model\n", 400 | "\n", 401 | "skipgram = Word2Vec(sentences, size =50, window = 3, min_count=1,sg = 1)\n", 402 | "print(skipgram)\n", 403 | "\n", 404 | "# access vector for one word\n", 405 | "\n", 406 | "print(skipgram['nlp'])\n", 407 | "\n", 408 | "# save model\n", 409 | "\n", 410 | "skipgram.save('skipgram.bin')\n", 411 | "\n", 412 | "# load model\n", 413 | "\n", 414 | "skipgram = Word2Vec.load('skipgram.bin')\n" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": true 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "# T – SNE plot \n", 426 | "\n", 427 | "X = skipgram[skipgram.wv.vocab]\n", 428 | "pca = PCA(n_components=2)\n", 429 | "result = pca.fit_transform(X)\n", 430 | "\n", 431 | "# create a scatter plot of the projection\n", 432 | "\n", 433 | "pyplot.scatter(result[:, 0], result[:, 1])\n", 434 | "words = list(skipgram.wv.vocab)\n", 435 | "for i, word in enumerate(words):\n", 436 | "\tpyplot.annotate(word, xy=(result[i, 0], result[i, 1]))\n", 437 | "pyplot.show() \n", 438 | "\n" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "# import gensim package\n", 450 | "\n", 451 | "import gensim\n", 452 | "\n", 453 | "# load the saved model \n", 454 | " \n", 455 | "model = gensim.models.Word2Vec.load_word2vec_format('C:\\\\Users\\\\GoogleNews-vectors-negative300.bin', binary=True) \n", 456 | " \n", 457 | "\n", 458 | "#Checking how similarity works. \n", 459 | "\n", 460 | "print (model.similarity('this', 'is'))\n", 461 | "\n", 462 | "#Lets check one more.\n", 463 | "print (model.similarity('post', 'book'))\n", 464 | "\n", 465 | "# Finding the odd one out.\n", 466 | "\n", 467 | "model.doesnt_match('breakfast cereal dinner lunch';.split())\n", 468 | "\n", 469 | "# It is also finding the relations between words. 
\n", 470 | "\n", 471 | "word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])\n" 472 | ] 473 | }, 474 | { 475 | "cell_type": "raw", 476 | "metadata": {}, 477 | "source": [ 478 | "Recipe 3-8 Implementing FastText" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "collapsed": true 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "# Import FastText\n", 490 | "\n", 491 | "from gensim.models import FastText\n", 492 | "from sklearn.decomposition import PCA\n", 493 | "from matplotlib import pyplot\n", 494 | "\n", 495 | "#Example sentences\n", 496 | "\n", 497 | "sentences = [['I', 'love', 'nlp'],\n", 498 | "\t\t\t['I', 'will', 'learn', 'nlp', 'in', '2','months'],\n", 499 | "\t\t\t['nlp', 'is', 'future'],\n", 500 | "\t\t\t['nlp', 'saves', 'time', 'and', 'solves', 'lot', 'of', 'industry', 'problems'],\n", 501 | "\t\t\t['nlp', 'uses', 'machine', 'learning']]\n", 502 | "\n", 503 | "\n", 504 | "fast = FastText(sentences,size=20, window=1, min_count=1, workers=5, min_n=1, max_n=2)\n", 505 | "\n", 506 | "# vector for word nlp\n", 507 | "\n", 508 | "print(fast['nlp'])\n", 509 | "print(fast['deep'])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": { 516 | "collapsed": true 517 | }, 518 | "outputs": [], 519 | "source": [ 520 | "# load model\n", 521 | "\n", 522 | "fast = Word2Vec.load('fast.bin')\n", 523 | "\n", 524 | "# visualize \n", 525 | "\n", 526 | "X = skipgram[skipgram.wv.vocab]\n", 527 | "pca = PCA(n_components=2)\n", 528 | "result = pca.fit_transform(X)\n", 529 | "\n", 530 | "\n", 531 | "# create a scatter plot of the projection\n", 532 | "\n", 533 | "pyplot.scatter(result[:, 0], result[:, 1])\n", 534 | "words = list(skipgram.wv.vocab)\n", 535 | "for i, word in enumerate(words):\n", 536 | "\tpyplot.annotate(word, xy=(result[i, 0], result[i, 1]))\n", 537 | "pyplot.show()\n" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "outputs": [], 547 | "source": [] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "collapsed": true 554 | }, 555 | "outputs": [], 556 | "source": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "collapsed": true 572 | }, 573 | "outputs": [], 574 | "source": [] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "collapsed": true 581 | }, 582 | "outputs": [], 583 | "source": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": { 589 | "collapsed": true 590 | }, 591 | "outputs": [], 592 | "source": [] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": { 607 | "collapsed": true 608 | }, 609 | "outputs": [], 610 | "source": [] 611 | } 612 | ], 613 | "metadata": { 614 | "kernelspec": { 615 | "display_name": "Python [conda root]", 616 | "language": "python", 617 | "name": "conda-root-py" 618 | }, 619 | "language_info": { 620 | "codemirror_mode": { 621 | "name": "ipython", 622 | "version": 3 623 | 
}, 624 | "file_extension": ".py", 625 | "mimetype": "text/x-python", 626 | "name": "python", 627 | "nbconvert_exporter": "python", 628 | "pygments_lexer": "ipython3", 629 | "version": "3.5.2" 630 | } 631 | }, 632 | "nbformat": 4, 633 | "nbformat_minor": 1 634 | } 635 | -------------------------------------------------------------------------------- /Chapter 4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Advanced Natural Language Processing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Recipe 4-1. Extracting Noun Phrases" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "#Import libraries\n", 26 | "import nltk\n", 27 | "from textblob import TextBlob\n", 28 | "\n", 29 | "#Extract noun\n", 30 | "blob = TextBlob(\"John is learning natural language processing\")\n", 31 | "for np in blob.noun_phrases:\n", 32 | " print(np)\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "raw", 37 | "metadata": {}, 38 | "source": [ 39 | "Recipe 4-2. Finding Similarity Between Texts" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "documents = (\n", 51 | "\"I like NLP\",\n", 52 | "\"I am exploring NLP\",\n", 53 | "\"I am a beginner in NLP\",\n", 54 | "\"I want to learn NLP\",\n", 55 | "\"I like advanced NLP\"\n", 56 | ")\n", 57 | "\n", 58 | "#Import libraries\n", 59 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 60 | "from sklearn.metrics.pairwise import cosine_similarity\n", 61 | "\n", 62 | "#Compute tfidf : feature engineering(refer previous chapter – Recipe 3-4)\n", 63 | "\n", 64 | "tfidf_vectorizer = TfidfVectorizer()\n", 65 | "tfidf_matrix = tfidf_vectorizer.fit_transform(documents)\n", 66 | "\n", 67 | "tfidf_matrix.shape\n", 68 | "\n", 69 | "#output\n", 70 | "(5, 10)\n", 71 | "\n", 72 | "#compute similarity for first sentence with rest of the sentences\n", 73 | "cosine_similarity(tfidf_matrix[0:1],tfidf_matrix)\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "!pip install fuzzy\n", 85 | "import fuzzy\n", 86 | "\n", 87 | "soundex = fuzzy.Soundex(4) \n", 88 | "\n", 89 | "soundex('natural')\n", 90 | "soundex('natuaral')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "raw", 95 | "metadata": {}, 96 | "source": [ 97 | "Recipe 4-3. Tagging Part of Speech " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "Text = \"I love NLP and I will learn NLP in 2 month\"\n", 109 | "\n", 110 | "# Importing necessary packages and stopwords\n", 111 | "import nltk \n", 112 | "from nltk.corpus import stopwords \n", 113 | "from nltk.tokenize import word_tokenize, sent_tokenize \n", 114 | "stop_words = set(stopwords.words('english')) \n", 115 | "\n", 116 | "# Tokenize the text\n", 117 | "tokens = sent_tokenize(text) \n", 118 | "\n", 119 | "#Generate tagging for all the tokens using loop\n", 120 | "for i in tokens: \n", 121 | " words = nltk.word_tokenize(i) \n", 122 | " words = [w for w in words if not w in stop_words] \n", 123 | " # POS-tagger. 
\n", 124 | " tags = nltk.pos_tag(words) \n", 125 | "\n", 126 | "tags\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "raw", 131 | "metadata": {}, 132 | "source": [ 133 | "Recipe 4-4. Extract Entities From Text" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "sent = \"John is studying at Stanford University in California\"\n", 145 | "\n", 146 | "#import libraries\n", 147 | "import nltk\n", 148 | "from nltk import ne_chunk\n", 149 | "from nltk import word_tokenize\n", 150 | "\n", 151 | "#NER\n", 152 | "ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False)\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "import spacy\n", 164 | "nlp = spacy.load('en')\n", 165 | "\n", 166 | "# Read/create a sentence\n", 167 | "doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square ')\n", 168 | "\n", 169 | "for ent in doc.ents:\n", 170 | " print(ent.text, ent.start_char, ent.end_char, ent.label_)\n" 171 | ] 172 | }, 173 | { 174 | "cell_type": "raw", 175 | "metadata": {}, 176 | "source": [ 177 | "Recipe 4-5. Extracting Topics From Text" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "doc1 = \"I am learning NLP, it is very interesting and exciting. it includes machine learning and deep learning\" \n", 189 | "doc2 = \"My father is a data scientist and he is nlp expert\"\n", 190 | "doc3 = \"My sister has good exposure into android development\"\n", 191 | "\n", 192 | "doc_complete = [doc1, doc2, doc3] \n", 193 | "doc_complete\n", 194 | "\n", 195 | "# Install and import libraries\n", 196 | "\n", 197 | "!pip install gensim\n", 198 | "from nltk.corpus import stopwords\n", 199 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 200 | "import string\n", 201 | "\n", 202 | "# Text preprocessing as discussed in chapter 2\n", 203 | "\n", 204 | "stop = set(stopwords.words('english'))\n", 205 | "exclude = set(string.punctuation) \n", 206 | "lemma = WordNetLemmatizer()\n", 207 | "def clean(doc):\n", 208 | " stop_free = \" \".join([i for i in doc.lower().split() if i not in stop])\n", 209 | " punc_free = ''.join(ch for ch in stop_free if ch not in exclude)\n", 210 | " normalized = \" \".join(lemma.lemmatize(word) for word in punc_free.split())\n", 211 | " return normalized\n", 212 | "\n", 213 | "doc_clean = [clean(doc).split() for doc in doc_complete] \n", 214 | "\n", 215 | "doc_clean\n", 216 | "\n", 217 | "# Importing gensim\n", 218 | "\n", 219 | "import gensim\n", 220 | "from gensim import corpora\n", 221 | "\n", 222 | "# Creating the term dictionary of our corpus, where every unique term is #assigned an index. 
\n", 223 | "\n", 224 | "dictionary = corpora.Dictionary(doc_clean)\n", 225 | "\n", 226 | "# Converting a list of documents (corpus) into Document-Term Matrix using #dictionary prepared above.\n", 227 | "\n", 228 | "doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]\n", 229 | "\n", 230 | "doc_term_matrix\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "# Creating the object for LDA model using gensim library\n", 242 | "Lda = gensim.models.ldamodel.LdaModel\n", 243 | "\n", 244 | "# Running and Training LDA model on the document term matrix for 3 topics.\n", 245 | "ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)\n", 246 | "\n", 247 | "# Results\n", 248 | "print(ldamodel.print_topics())\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "raw", 253 | "metadata": {}, 254 | "source": [ 255 | "Recipe 4-6. Classifying Text" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "#Read the data\n", 267 | "Email_Data = pd.read_csv(\"spam.csv\",encoding ='latin1')\n", 268 | "\n", 269 | "#Data undestanding\n", 270 | "Email_Data.columns\n", 271 | "\n", 272 | "Email_Data = Email_Data[['v1', 'v2']]\n", 273 | "Email_Data = Email_Data.rename(columns={\"v1\":\"Target\", \"v2\":\"Email\"})\n", 274 | "\n", 275 | "Email_Data.head()\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "#import\n", 287 | "import numpy as np\n", 288 | "import pandas as pd\n", 289 | "import matplotlib.pyplot as plt\n", 290 | "import string\n", 291 | "from nltk.stem import SnowballStemmer\n", 292 | "from nltk.corpus import stopwords\n", 293 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 294 | "from sklearn.model_selection import train_test_split\n", 295 | "import os\n", 296 | "from textblob import TextBlob\n", 297 | "from nltk.stem import PorterStemmer\n", 298 | "from textblob import Word\n", 299 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n", 300 | "import sklearn.feature_extraction.text as text\n", 301 | "from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm\n", 302 | "\n", 303 | "#pre processing steps like lower case, stemming and lemmatization \n", 304 | "\n", 305 | "Email_Data['Email'] = Email_Data['Email'].apply(lambda x: \" \".join(x.lower() for x in x.split()))\n", 306 | "stop = stopwords.words('english')\n", 307 | "Email_Data['Email'] = Email_Data['Email'].apply(lambda x: \" \".join(x for x in x.split() if x not in stop))\n", 308 | "st = PorterStemmer()\n", 309 | "Email_Data['Email'] = Email_Data['Email'].apply(lambda x: \" \".join([st.stem(word) for word in x.split()]))\n", 310 | "Email_Data['Email'] =Email_Data['Email'].apply(lambda x: \" \".join([Word(word).lemmatize() for word in x.split()]))\n", 311 | "\n", 312 | "Email_Data.head()\n" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "#Splitting data into train and validation\n", 324 | "\n", 325 | "train_x, valid_x, train_y, valid_y = model_selection.train_test_split(Email_Data['Email'], Email_Data['Target'])\n", 326 | 
"\n", 327 | "# TFIDF feature generation for a maximum of 5000 features\n", 328 | "\n", 329 | "encoder = preprocessing.LabelEncoder()\n", 330 | "train_y = encoder.fit_transform(train_y)\n", 331 | "valid_y = encoder.fit_transform(valid_y)\n", 332 | "\n", 333 | "tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', max_features=5000)\n", 334 | "tfidf_vect.fit(Email_Data['Email'])\n", 335 | "xtrain_tfidf = tfidf_vect.transform(train_x)\n", 336 | "xvalid_tfidf = tfidf_vect.transform(valid_x)\n", 337 | "\n", 338 | "xtrain_tfidf.data\n" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):\n", 350 | " # fit the training dataset on the classifier\n", 351 | " classifier.fit(feature_vector_train, label)\n", 352 | " # predict the labels on validation dataset\n", 353 | " predictions = classifier.predict(feature_vector_valid)\n", 354 | " return metrics.accuracy_score(predictions, valid_y)\n", 355 | "\n", 356 | "# Naive Bayes trainig\n", 357 | "accuracy = train_model(naive_bayes.MultinomialNB(alpha=0.2), xtrain_tfidf, train_y, xvalid_tfidf)\n", 358 | "print (\"Accuracy: \", accuracy)\n" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "# Linear Classifier on Word Level TF IDF Vectors\n", 370 | "accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)\n", 371 | "print (\"Accuracy: \", accuracy)\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "raw", 376 | "metadata": {}, 377 | "source": [ 378 | "Recipe 4-7. Carrying Out Sentiment Analysis" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "review = \"I like this phone. screen quality and camera clarity is really good.\"\n", 390 | "review2 = \"This tv is not good. Bad quality, no clarity, worst experience\"\n", 391 | "\n", 392 | "#import libraries\n", 393 | "from textblob import TextBlob\n", 394 | "\n", 395 | "#TextBlob has a pre trained sentiment prediction model\n", 396 | "blob = TextBlob(review)\n", 397 | "blob.sentiment\n", 398 | "\n", 399 | "#now lets look at the sentiment of review2\n", 400 | "blob = TextBlob(review2)\n", 401 | "blob.sentiment\n" 402 | ] 403 | }, 404 | { 405 | "cell_type": "raw", 406 | "metadata": {}, 407 | "source": [ 408 | "Recipe 4-8. 
Disambiguating Text" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 1, 414 | "metadata": { 415 | "collapsed": true 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "Text1 = 'I went to the bank to deposit my money'\n", 420 | "Text2 = 'The river bank was full of dead fishes'\n", 421 | "\n", 422 | "#Install pywsd\n", 423 | "\n", 424 | "!pip install pywsd\n", 425 | "\n", 426 | "#Import functions\n", 427 | "\n", 428 | "from nltk.corpus import wordnet as wn\n", 429 | "from nltk.stem import PorterStemmer\n", 430 | "from itertools import chain\n", 431 | "from pywsd.lesk import simple_lesk\n", 432 | "\n", 433 | "# Sentences\n", 434 | "\n", 435 | "bank_sents = ['I went to the bank to deposit my money',\n", 436 | "'The river bank was full of dead fishes']\n", 437 | "\n", 438 | "# calling the lesk function and printing results for both the sentences\n", 439 | "\n", 440 | "print (\"Context-1:\", bank_sents[0])\n", 441 | "answer = simple_lesk(bank_sents[0],'bank')\n", 442 | "print (\"Sense:\", answer)\n", 443 | "print (\"Definition : \", answer.definition())\n", 444 | "\n", 445 | "\n", 446 | "print (\"Context-2:\", bank_sents[1])\n", 447 | "answer = simple_lesk(bank_sents[1],'bank','n')\n", 448 | "print (\"Sense:\", answer)\n", 449 | "print (\"Definition : \", answer.definition())\n" 450 | ] 451 | }, 452 | { 453 | "cell_type": "raw", 454 | "metadata": {}, 455 | "source": [ 456 | "Recipe 4-9. Converting speech to text." 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "collapsed": true 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "!pip install SpeechRecognition\n", 468 | "!pip install PyAudio\n", 469 | "\n", 470 | "import speech_recognition as sr\n", 471 | "\n", 472 | "r=sr.Recognizer()\n", 473 | "\n", 474 | "with sr.Microphone() as source:\n", 475 | " print(\"Please say something\")\n", 476 | " audio = r.listen(source)\n", 477 | " print(\"Time over, thanks\")\n", 478 | " \n", 479 | "try:\n", 480 | " print(\"I think you said: \"+r.recognize_google(audio));\n", 481 | "except:\n", 482 | " pass;\n" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": { 489 | "collapsed": true 490 | }, 491 | "outputs": [], 492 | "source": [ 493 | "#code snippet\n", 494 | "r=sr.Recognizer()\n", 495 | "\n", 496 | "with sr.Microphone() as source:\n", 497 | " print(\"Please say something\")\n", 498 | " audio = r.listen(source)\n", 499 | " print(\"Time over, thanks\")\n", 500 | " \n", 501 | "try:\n", 502 | " print(\"I think you said: \"+r.recognize_google(audio));\n", 503 | "except:\n", 504 | " pass;\n" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": { 511 | "collapsed": true 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "#code snippet\n", 516 | "r=sr.Recognizer()\n", 517 | "\n", 518 | "with sr.Microphone() as source:\n", 519 | " print(\"Please say something\")\n", 520 | " audio = r.listen(source)\n", 521 | " print(\"Time over, thanks\")\n", 522 | " \n", 523 | "try:\n", 524 | " print(\"I think you said: \"+r.recognize_google(audio, language ='hi-IN'));\n", 525 | "except sr.UnknownValueError:\n", 526 | " print(\"Google Speech Recognition could not understand audio\")\n", 527 | "except sr.RequestError as e:\n", 528 | " print(\"Could not request results from Google Speech Recognition service; {0}\".format(e))\n", 529 | "except:\n", 530 | " pass;\n" 531 | ] 532 | }, 533 | { 534 | "cell_type": "raw", 535 | "metadata": {}, 536 
| "source": [ 537 | "Recipe 4-10. Converting Text to Speech" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "!pip install gTTS\n", 549 | "\n", 550 | "from gtts import gTTS\n", 551 | "\n", 552 | "#chooses the language, English(‘en’)\n", 553 | "\n", 554 | "convert = gTTS(text='I like this NLP book', lang='en', slow=False) \n", 555 | " \n", 556 | "# Saving the converted audio in a mp3 file named \n", 557 | "myobj.save(\"audio.mp3\")\n" 558 | ] 559 | }, 560 | { 561 | "cell_type": "raw", 562 | "metadata": {}, 563 | "source": [ 564 | "Recipe 4-11. Translating Speech" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "collapsed": true 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "!pip install goslate\n", 576 | "import goslate\n", 577 | "\n", 578 | "text = \"Bonjour le monde\" \n", 579 | "gs = goslate.Goslate()\n", 580 | "translatedText = gs.translate(text,'en')\n", 581 | "\n", 582 | "print(translatedText)" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": { 589 | "collapsed": true 590 | }, 591 | "outputs": [], 592 | "source": [] 593 | } 594 | ], 595 | "metadata": { 596 | "kernelspec": { 597 | "display_name": "Python [conda root]", 598 | "language": "python", 599 | "name": "conda-root-py" 600 | }, 601 | "language_info": { 602 | "codemirror_mode": { 603 | "name": "ipython", 604 | "version": 3 605 | }, 606 | "file_extension": ".py", 607 | "mimetype": "text/x-python", 608 | "name": "python", 609 | "nbconvert_exporter": "python", 610 | "pygments_lexer": "ipython3", 611 | "version": "3.5.2" 612 | } 613 | }, 614 | "nbformat": 4, 615 | "nbformat_minor": 1 616 | } 617 | -------------------------------------------------------------------------------- /Chapter 5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Implementing Industry Applications" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Recipe 5-1. 
Implementing Multiclass Classification" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np \n", 26 | "import pandas as pd \n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import string\n", 29 | "from nltk.stem import SnowballStemmer\n", 30 | "from nltk.corpus import stopwords\n", 31 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "import os\n", 34 | "from textblob import TextBlob\n", 35 | "from nltk.stem import PorterStemmer\n", 36 | "from textblob import Word\n", 37 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\n", 38 | "import sklearn.feature_extraction.text as text\n", 39 | "from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm\n", 40 | "from sklearn.naive_bayes import MultinomialNB\n", 41 | "from sklearn.linear_model import LogisticRegression\n", 42 | "from sklearn.ensemble import RandomForestClassifier\n", 43 | "from sklearn.svm import LinearSVC\n", 44 | "from sklearn.model_selection import cross_val_score\n", 45 | "from io import StringIO\n", 46 | "import seaborn as sns\n", 47 | "\n", 48 | "#Importing the data which was downloaded in the last step\n", 49 | "\n", 50 | "Data = pd.read_csv(\"/Consumer_Complaints.csv\",encoding='latin-1')\n", 51 | "\n", 52 | "#Understanding the columns\n", 53 | "\n", 54 | "Data.dtypes\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Selecting required columns and rows\n", 66 | "Data = Data[['product', 'consumer_complaint_narrative']]\n", 67 | "Data = Data[pd.notnull(Data['consumer_complaint_narrative'])]\n", 68 | "\n", 69 | "# See top 5 rows\n", 70 | "Data.head()\n", 71 | "\n", 72 | "# Factorizing the category column\n", 73 | "Data['category_id'] = Data['product'].factorize()[0]\n", 74 | "\n", 75 | "Data.head()\n", 76 | "\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "# Check the distriution of complaints by category\n", 88 | "Data.groupby('product').consumer_complaint_narrative.count()\n", 89 | "\n", 90 | "\n", 91 | "# Lets plot it and see\n", 92 | "fig = plt.figure(figsize=(8,6))\n", 93 | "Data.groupby('product').consumer_complaint_narrative.count().plot.bar(ylim=0)\n", 94 | "plt.show()\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Split the data into train and validation\n", 106 | "\n", 107 | "train_x, valid_x, train_y, valid_y = model_selection.train_test_split(Data['consumer_complaint_narrative'], Data['product'])\n", 108 | "\n", 109 | "encoder = preprocessing.LabelEncoder()\n", 110 | "train_y = encoder.fit_transform(train_y)\n", 111 | "valid_y = encoder.fit_transform(valid_y)\n", 112 | "\n", 113 | "tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', max_features=5000)\n", 114 | "\n", 115 | "tfidf_vect.fit(Data['consumer_complaint_narrative'])\n", 116 | "xtrain_tfidf = tfidf_vect.transform(train_x)\n", 117 | "xvalid_tfidf = tfidf_vect.transform(valid_x)\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | 
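# The imports above already bring in LinearSVC and cross_val_score; a short sketch of
# benchmarking an alternative linear model on the same TF-IDF features created above
# (cv=5 is an arbitrary choice, not taken from the recipe):
svc_scores = cross_val_score(LinearSVC(), xtrain_tfidf, train_y, cv=5)
print('LinearSVC mean CV accuracy:', svc_scores.mean())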
"metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "model = linear_model.LogisticRegression().fit(xtrain_tfidf, train_y)\n", 129 | "\n", 130 | "\n", 131 | "# Model summary\n", 132 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 133 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 134 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 135 | " verbose=0, warm_start=False)\n", 136 | "\n", 137 | "# Checking accuracy\n", 138 | "\n", 139 | "accuracy = metrics.accuracy_score(model.predict(xvalid_tfidf), valid_y)\n", 140 | "print (\"Accuracy: \", accuracy)\n", 141 | "\n", 142 | "# Classification report\n", 143 | "\n", 144 | "print(metrics.classification_report(valid_y, model.predict(xvalid_tfidf),target_names=Data['product'].unique()))\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "#confusion matrix\n", 156 | "\n", 157 | "conf_mat = confusion_matrix(valid_y, model.predict(xvalid_tfidf))\n", 158 | "\n", 159 | "# Vizualizing confusion matrix\n", 160 | "\n", 161 | "category_id_df = Data[['product', 'category_id']].drop_duplicates().sort_values('category_id')\n", 162 | "category_to_id = dict(category_id_df.values)\n", 163 | "id_to_category = dict(category_id_df[['category_id', 'product']].values)\n", 164 | "\n", 165 | "\n", 166 | "fig, ax = plt.subplots(figsize=(8,6))\n", 167 | "sns.heatmap(conf_mat, annot=True, fmt='d', cmap=\"BuPu\",\n", 168 | " xticklabels=category_id_df[['product']].values, yticklabels=category_id_df[['product']].values)\n", 169 | "plt.ylabel('Actual')\n", 170 | "plt.xlabel('Predicted')\n", 171 | "plt.show()\n" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": true 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "# Prediction example\n", 183 | "\n", 184 | "texts = [\"This company refuses to provide me verification and validation of debt\"+ \n", 185 | " \"per my right under the FDCPA. I do not believe this debt is mine.\"]\n", 186 | "text_features = tfidf_vect.transform(texts)\n", 187 | "predictions = model.predict(text_features)\n", 188 | "print(texts)\n", 189 | "print(\" - Predicted as: '{}'\".format(id_to_category[predictions[0]]))\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "raw", 194 | "metadata": {}, 195 | "source": [ 196 | "Recipe 5-2. 
Implementing Sentiment Analysis" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "# Import necessary libraries\n", 208 | "\n", 209 | "import numpy as np\n", 210 | "import pandas as pd\n", 211 | "import matplotlib.pyplot as plt\n", 212 | "%matplotlib inline \n", 213 | "\n", 214 | "#Read the data\n", 215 | "df = pd.read_csv('Reviews.csv')\n", 216 | "\n", 217 | "# Look at the top 5 rows of the data\n", 218 | "df.head(5)\n", 219 | "\n", 220 | "# Understand the data types of the columns\n", 221 | "df.info()\n", 222 | "\n", 223 | "# Looking at the summary of the reviews.\n", 224 | "df.Summary.head(5)\n", 225 | "\n", 226 | "# Looking at the description of the reviews\n", 227 | "df.Text.head(5)\n", 228 | "\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "# Import libraries\n", 240 | "from nltk.corpus import stopwords\n", 241 | "from textblob import TextBlob\n", 242 | "from textblob import Word\n", 243 | "\n", 244 | "# Lower casing and removing punctuations\n", 245 | "df['Text'] = df['Text'].apply(lambda x: \" \".join(x.lower() for x in x.split()))\n", 246 | "df['Text'] = df['Text'].str.replace('[^\\w\\s]','')\n", 247 | "\n", 248 | "# Removal of stop words\n", 249 | "stop = stopwords.words('english')\n", 250 | "df['Text'] = df['Text'].apply(lambda x: \" \".join(x for x in x.split() if x not in stop))\n", 251 | "\n", 252 | "# Spelling correction\n", 253 | "df['Text'] = df['Text'].apply(lambda x: str(TextBlob(x).correct()))\n", 254 | "\n", 255 | "# Lemmatization\n", 256 | "df['Text'] = df['Text'].apply(lambda x: \" \".join([Word(word).lemmatize() for word in x.split()]))\n", 257 | "\n", 258 | "df.Text.head(5)\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "# Create a new data frame “reviews” to perform exploratory data analysis upon that\n", 270 | "\n", 271 | "reviews = df\n", 272 | "\n", 273 | "# Dropping null values\n", 274 | "reviews.dropna(inplace=True) \n", 275 | "\n", 276 | "# The histogram reveals this dataset is highly unbalanced towards high rating. \n", 277 | "\n", 278 | "reviews.Score.hist(bins=5,grid=False)\n", 279 | "plt.show()\n", 280 | "print(reviews.groupby('Score').count().Id)\n", 281 | "\n", 282 | "# To make it balanced data, we sampled each score by the lowest n-count from above. (i.e. 
29743 reviews scored as '2')\n", 283 | "\n", 284 | "score_1 = reviews[reviews['Score'] == 1].sample(n=29743)\n", 285 | "score_2 = reviews[reviews['Score'] == 2].sample(n=29743)\n", 286 | "score_3 = reviews[reviews['Score'] == 3].sample(n=29743)\n", 287 | "score_4 = reviews[reviews['Score'] == 4].sample(n=29743)\n", 288 | "score_5 = reviews[reviews['Score'] == 5].sample(n=29743)\n", 289 | "\n", 290 | "# Here we recreate a 'balanced' dataset.\n", 291 | "reviews_sample = pd.concat([score_1,score_2,score_3,score_4,score_5],axis=0)\n", 292 | "reviews_sample.reset_index(drop=True,inplace=True)\n", 293 | "\n", 294 | "# Printing count by 'Score' to check dataset is now balanced.\n", 295 | "print(reviews_sample.groupby('Score').count().Id)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "# Let's build a word cloud looking at the 'Summary' text\n", 307 | "\n", 308 | "from wordcloud import WordCloud\n", 309 | "from wordcloud import STOPWORDS\n", 310 | "\n", 311 | "# Wordcloud function's input needs to be a single string of text.\n", 312 | "# Here I'm concatenating all Summaries into a single string.\n", 313 | "# similarly you can build for Text column\n", 314 | "\n", 315 | "reviews_str = reviews_sample.Summary.str.cat()\n", 316 | "\n", 317 | "wordcloud = WordCloud(background_color='white').generate(reviews_str)\n", 318 | "plt.figure(figsize=(10,10))\n", 319 | "plt.imshow(wordcloud,interpolation='bilinear')\n", 320 | "plt.axis(\"off\")\n", 321 | "plt.show()\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "# Now let's split the data into Negative (Score is 1 or 2) and Positive (4 or #5) Reviews.\n", 333 | "negative_reviews = reviews_sample[reviews_sample['Score'].isin([1,2]) ]\n", 334 | "positive_reviews = reviews_sample[reviews_sample['Score'].isin([4,5]) ]\n", 335 | "\n", 336 | "# Transform to single string\n", 337 | "negative_reviews_str = negative_reviews.Summary.str.cat()\n", 338 | "positive_reviews_str = positive_reviews.Summary.str.cat()\n", 339 | "\n", 340 | "# Create wordclouds\n", 341 | "wordcloud_negative = WordCloud(background_color='white').generate(negative_reviews_str)\n", 342 | "wordcloud_positive = WordCloud(background_color='white').generate(positive_reviews_str)\n", 343 | "\n", 344 | "# Plot\n", 345 | "fig = plt.figure(figsize=(10,10))\n", 346 | "ax1 = fig.add_subplot(211)\n", 347 | "ax1.imshow(wordcloud_negative,interpolation='bilinear')\n", 348 | "ax1.axis(\"off\")\n", 349 | "ax1.set_title('Reviews with Negative Scores',fontsize=20)\n", 350 | "ax2 = fig.add_subplot(212)\n", 351 | "ax2.imshow(wordcloud_positive,interpolation='bilinear')\n", 352 | "ax2.axis(\"off\")\n", 353 | "ax2.set_title('Reviews with Positive Scores',fontsize=20)\n", 354 | "\n", 355 | "plt.show()\n" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "#Importing required libraries\n", 367 | "import pandas as pd\n", 368 | "import numpy as np\n", 369 | "import matplotlib.pyplot as plt\n", 370 | "%matplotlib inline\n", 371 | "import seaborn as sns\n", 372 | "import re\n", 373 | "import os\n", 374 | "import sys\n", 375 | "import ast\n", 376 | "plt.style.use('fivethirtyeight')\n", 377 | "# Function for getting the 
sentiment\n", 378 | "cp = sns.color_palette()\n", 379 | "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n", 380 | "analyzer = SentimentIntensityAnalyzer()\n", 381 | "\n", 382 | "# Generating sentiment for all the sentence present in the dataset\n", 383 | "emptyline=[]\n", 384 | "for row in df['Text']:\n", 385 | " vs=analyzer.polarity_scores(row)\n", 386 | " emptyline.append(vs)\n", 387 | "\n", 388 | "# Creating new dataframe with sentiments \n", 389 | "df_sentiments=pd.DataFrame(emptyline)\n", 390 | "df_sentiments.head(5)\n", 391 | "\n", 392 | "# Merging the sentiments back to reviews dataframe\n", 393 | "df_c = pd.concat([df.reset_index(drop=True), d], axis=1)\n", 394 | "df_c.head(3)\n", 395 | "\n", 396 | "# Convert scores into positive and negetive sentiments using some threshold\n", 397 | "df_c['Sentiment'] = np.where(df_c['compound'] >= 0 , 'Positive', 'Negative')\n", 398 | "df_c.head(5)\n" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "collapsed": true 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "result=df_c['Sentiment'].value_counts()\n", 410 | "result.plot(kind='bar', rot=0,color='br');\n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "raw", 415 | "metadata": {}, 416 | "source": [ 417 | "Recipe 5-3. Applying text similarity functions" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "# Import package\n", 429 | "!pip install recordlinkage\n", 430 | "import recordlinkage\n", 431 | "\n", 432 | "#For this demo let us use the inbuilt dataset from recordlinkage library\n", 433 | "\n", 434 | "#import data set\n", 435 | "from recordlinkage.datasets import load_febrl1 \n", 436 | "\n", 437 | "#create a dataframe - dfa\n", 438 | "dfA = load_febrl1()\n", 439 | "dfA.head()\n", 440 | "\n", 441 | "indexer = recordlinkage.BlockIndex(on='given_name')\n", 442 | "pairs = indexer.index(dfA)\n", 443 | "\n", 444 | "print (len(pairs))\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "# This cell can take some time to compute.\n", 456 | "compare_cl = recordlinkage.Compare()\n", 457 | "\n", 458 | "compare_cl.string('given_name', 'given_name',method='jarowinkler', label='given_name')\n", 459 | "compare_cl.string('surname', 'surname', method='jarowinkler', label='surname')\n", 460 | "compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')\n", 461 | "compare_cl.exact('suburb', 'suburb', label='suburb')\n", 462 | "compare_cl.exact('state', 'state', label='state')\n", 463 | "compare_cl.string('address_1', 'address_1',method='jarowinkler', label='address_1')\n", 464 | "\n", 465 | "features = compare_cl.compute(pairs, dfA)\n", 466 | "features.sample(5)\n" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "# select all the features except for given_name since its our blocking key\n", 478 | "features1 = features[['suburb','state','surname','date_of_birth','address_1']]\n", 479 | "\n", 480 | "# Unsupervised learning – probabilistic\n", 481 | "\n", 482 | "ecm = recordlinkage.ECMClassifier()\n", 483 | "result_ecm = ecm.learn((features1).astype(int),return_type = 'series')\n", 484 | "\n", 485 | "result_ecm\n" 486 | ] 487 
| }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": { 492 | "collapsed": true 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "# Let us use the inbuilt dataset from recordlinkage library\n", 497 | "\n", 498 | "from recordlinkage.datasets import load_febrl4\n", 499 | "\n", 500 | "dfA, dfB = load_febrl4()\n", 501 | "dfA.head()\n", 502 | "\n", 503 | "# Same as explained previously, considering given_name as blocking index\n", 504 | "\n", 505 | "indexer = recordlinkage.BlockIndex(on='given_name')\n", 506 | "pairs = indexer.index(dfA, dfB)\n" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": { 513 | "collapsed": true 514 | }, 515 | "outputs": [], 516 | "source": [ 517 | "# Explanation remains same\n", 518 | "compare_cl = recordlinkage.Compare()\n", 519 | "\n", 520 | "compare_cl.string('given_name', 'given_name',method='jarowinkler', label='given_name')\n", 521 | "compare_cl.string('surname', 'surname', method='jarowinkler', label='surname')\n", 522 | "compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')\n", 523 | "compare_cl.exact('suburb', 'suburb', label='suburb')\n", 524 | "compare_cl.exact('state', 'state', label='state')\n", 525 | "compare_cl.string('address_1', 'address_1',method='jarowinkler', label='address_1')\n", 526 | "\n", 527 | "features = compare_cl.compute(pairs, dfA, dfB)\n", 528 | "\n", 529 | "features.head(10)\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": { 536 | "collapsed": true 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "# select all the features except for given_name since its our blocking key\n", 541 | "features1 = features[['suburb','state','surname','date_of_birth','address_1']]\n", 542 | "\n", 543 | "# unsupervised learning - probablistic\n", 544 | "ecm = recordlinkage.ECMClassifier()\n", 545 | "result_ecm = ecm.learn((features1).astype(int),return_type = 'series')\n", 546 | "\n", 547 | "result_ecm\n" 548 | ] 549 | }, 550 | { 551 | "cell_type": "raw", 552 | "metadata": {}, 553 | "source": [ 554 | "Recipe 5-4. 
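# A brief note on the blocking step above: BlockIndex(on='given_name') only generates
# candidate pairs whose given_name values agree exactly, which keeps the pairwise
# comparison between dfA and dfB tractable. The number of candidates can be checked with:
print(len(pairs))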
Summarizing Text Data" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": { 561 | "collapsed": true 562 | }, 563 | "outputs": [], 564 | "source": [ 565 | "# Import BeautifulSoup and urllib libraries to fetch data from Wikipedia.\n", 566 | "from bs4 import BeautifulSoup\n", 567 | "from urllib.request import urlopen\n", 568 | "\n", 569 | "\n", 570 | "# Function to get data from Wikipedia\n", 571 | "\n", 572 | "def get_only_text(url):\n", 573 | " page = urlopen(url)\n", 574 | " soup = BeautifulSoup(page)\n", 575 | " text = ' '.join(map(lambda p: p.text, soup.find_all('p')))\n", 576 | " print (text) \n", 577 | " return soup.title.text, text \n", 578 | "\n", 579 | "# Mention the Wikipedia url\n", 580 | "url=”https://en.wikipedia.org/wiki/Natural_language_processing”\n", 581 | "\n", 582 | "# Call the function created above\n", 583 | "text = get_only_text(url) \n", 584 | "\n", 585 | "# Count the number of letters\n", 586 | "len(''.join(text))\n", 587 | "# Lets see first 1000 letters from the text\n", 588 | "text[:1000]\n" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": { 595 | "collapsed": true 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "# Import summarize from gensim\n", 600 | "from gensim.summarization.summarizer import summarize\n", 601 | "from gensim.summarization import keywords\n", 602 | "\n", 603 | "# Convert text to string format\n", 604 | "text = str(text)\n", 605 | "\n", 606 | "#Summarize the text with ratio 0.1 (10% of the total words.)\n", 607 | "summarize(text, ratio=0.1)\n", 608 | "\n", 609 | "#keywords\n", 610 | "print(keywords(text, ratio=0.1))\n" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "collapsed": true 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "# Install sumy\n", 622 | "\n", 623 | "!pip install sumy\n", 624 | "\n", 625 | "# Import the packages\n", 626 | "\n", 627 | "from sumy.parsers.html import HtmlParser\n", 628 | "from sumy.parsers.plaintext import PlaintextParser\n", 629 | "from sumy.nlp.tokenizers import Tokenizer\n", 630 | "from sumy.summarizers.lsa import LsaSummarizer\n", 631 | "from sumy.nlp.stemmers import Stemmer\n", 632 | "from sumy.utils import get_stop_words\n", 633 | "from sumy.summarizers.luhn import LuhnSummarizer \n", 634 | "\n", 635 | "# Extracting and summarizing\n", 636 | "LANGUAGE = \"english\"\n", 637 | "SENTENCES_COUNT = 10\n", 638 | " \n", 639 | "url=\"https://en.wikipedia.org/wiki/Natural_language_processing\"\n", 640 | " \n", 641 | "parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))\n", 642 | "summarizer = LsaSummarizer()\n", 643 | "summarizer = LsaSummarizer(Stemmer(LANGUAGE))\n", 644 | "summarizer.stop_words = get_stop_words(LANGUAGE)\n", 645 | "for sentence in summarizer(parser.document, SENTENCES_COUNT):\n", 646 | " print(sentence)\n" 647 | ] 648 | }, 649 | { 650 | "cell_type": "raw", 651 | "metadata": {}, 652 | "source": [ 653 | "Recipe 5-5. 
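# Two small notes on the gensim summarization cell above: the Wikipedia URL must use
# plain ASCII quotes to run, e.g.
# url = "https://en.wikipedia.org/wiki/Natural_language_processing",
# and get_only_text() returns a (title, text) tuple, so the article body itself is text[1].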
Clustering Documents" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": { 660 | "collapsed": true 661 | }, 662 | "outputs": [], 663 | "source": [ 664 | "!pip install mpld3\n", 665 | "\n", 666 | "import numpy as np\n", 667 | "import pandas as pd\n", 668 | "import nltk\n", 669 | "from nltk.stem.snowball import SnowballStemmer\n", 670 | "from bs4 import BeautifulSoup\n", 671 | "import re\n", 672 | "import os\n", 673 | "import codecs\n", 674 | "from sklearn import feature_extraction\n", 675 | "import mpld3\n", 676 | "from sklearn.metrics.pairwise import cosine_similarity \n", 677 | "import os \n", 678 | "import matplotlib.pyplot as plt\n", 679 | "import matplotlib as mpl\n", 680 | "from sklearn.manifold import MDS\n", 681 | "\n", 682 | "\n", 683 | "#Lets use the same complaint dataset we use for classification\n", 684 | "Data = pd.read_csv(\"/Consumer_Complaints.csv\",encoding='latin-1')\n", 685 | "\n", 686 | "#selecting required columns and rows\n", 687 | "Data = Data[['consumer_complaint_narrative']]\n", 688 | "Data = Data[pd.notnull(Data['consumer_complaint_narrative'])]\n", 689 | "\n", 690 | "# lets do the clustering for just 200 documents. Its easier to interpret.\n", 691 | "Data_sample=Data.sample(200)\n" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": { 698 | "collapsed": true 699 | }, 700 | "outputs": [], 701 | "source": [ 702 | "# Remove unwanted symbol\n", 703 | "\n", 704 | "Data_sample['consumer_complaint_narrative'] = Data_sample['consumer_complaint_narrative'].str.replace('XXXX','')\n", 705 | "\n", 706 | "# Convert dataframe to list\n", 707 | "complaints = Data_sample['consumer_complaint_narrative'].tolist()\n", 708 | "\n", 709 | "# create the rank of documents – we will use it later\n", 710 | "\n", 711 | "ranks = []\n", 712 | "for i in range(1, len(complaints)+1):\n", 713 | " ranks.append(i)\n", 714 | "\n", 715 | "# Stop Words\n", 716 | "stopwords = nltk.corpus.stopwords.words('english')\n", 717 | "\n", 718 | "# Load 'stemmer'\n", 719 | "stemmer = SnowballStemmer(\"english\")\n", 720 | "\n", 721 | "# Functions for sentence tokenizer, to remove numeric tokens and raw #punctuation\n", 722 | "\n", 723 | "def tokenize_and_stem(text):\n", 724 | " tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n", 725 | " filtered_tokens = []\n", 726 | " for token in tokens:\n", 727 | " if re.search('[a-zA-Z]', token):\n", 728 | " filtered_tokens.append(token)\n", 729 | " stems = [stemmer.stem(t) for t in filtered_tokens]\n", 730 | " return stems\n", 731 | "\n", 732 | "def tokenize_only(text):\n", 733 | " tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n", 734 | " filtered_tokens = []\n", 735 | " for token in tokens:\n", 736 | " if re.search('[a-zA-Z]', token):\n", 737 | " filtered_tokens.append(token)\n", 738 | " return filtered_tokens\n", 739 | "\n", 740 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 741 | "\n", 742 | "# tfidf vectorizer \n", 743 | "\n", 744 | "tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,\n", 745 | " min_df=0.2, stop_words='english',\n", 746 | " use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))\n", 747 | "\n", 748 | "#fit the vectorizer to data\n", 749 | "\n", 750 | "tfidf_matrix = tfidf_vectorizer.fit_transform(complaints) \n", 751 | "terms = tfidf_vectorizer.get_feature_names()\n", 752 | "print(tfidf_matrix.shape)\n" 753 | ] 754 | }, 
755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": { 759 | "collapsed": true 760 | }, 761 | "outputs": [], 762 | "source": [ 763 | "#Import Kmeans\n", 764 | "from sklearn.cluster import KMeans\n", 765 | "\n", 766 | "# Define number of clusters\n", 767 | "num_clusters = 6\n", 768 | "\n", 769 | "#Running clustering algorithm\n", 770 | "km = KMeans(n_clusters=num_clusters)\n", 771 | "km.fit(tfidf_matrix)\n", 772 | "\n", 773 | "#final clusters\n", 774 | "clusters = km.labels_.tolist()\n", 775 | "complaints_data = { 'rank': ranks, 'complaints': complaints, 'cluster': clusters }\n", 776 | "frame = pd.DataFrame(complaints_data, index = [clusters] , columns = ['rank', 'cluster'])\n", 777 | "\n", 778 | "#number of docs per cluster \n", 779 | "frame['cluster'].value_counts()\n" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": { 786 | "collapsed": true 787 | }, 788 | "outputs": [], 789 | "source": [ 790 | "totalvocab_stemmed = []\n", 791 | "totalvocab_tokenized = []\n", 792 | "for i in complaints:\n", 793 | " allwords_stemmed = tokenize_and_stem(i) \n", 794 | " totalvocab_stemmed.extend(allwords_stemmed) \n", 795 | " \n", 796 | " allwords_tokenized = tokenize_only(i)\n", 797 | " totalvocab_tokenized.extend(allwords_tokenized)\n", 798 | "\n", 799 | "vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)\n", 800 | "\n", 801 | "#sort cluster centers by proximity to centroid\n", 802 | "order_centroids = km.cluster_centers_.argsort()[:, ::-1] \n", 803 | "\n", 804 | "for i in range(num_clusters):\n", 805 | " print(\"Cluster %d words:\" % i, end='')\n", 806 | " \n", 807 | " for ind in order_centroids[i, :6]: \n", 808 | " print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')\n", 809 | " print()\n", 810 | "\n" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "metadata": { 817 | "collapsed": true 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "#Similarity\n", 822 | "\n", 823 | "similarity_distance = 1 - cosine_similarity(tfidf_matrix)\n", 824 | "\n", 825 | "\n", 826 | "# Convert two components as we're plotting points in a two-dimensional plane\n", 827 | "mds = MDS(n_components=2, dissimilarity=\"precomputed\", random_state=1)\n", 828 | "pos = mds.fit_transform(similarity_distance) # shape (n_components, n_samples)\n", 829 | "xs, ys = pos[:, 0], pos[:, 1]\n", 830 | "\n", 831 | "#Set up colors per clusters using a dict\n", 832 | "cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e',5: '#D2691E'}\n", 833 | "\n", 834 | "#set up cluster names using a dict\n", 835 | "cluster_names = {0: 'property, based, assist', \n", 836 | " 1: 'business, card', \n", 837 | " 2: 'authorized, approved, believe', \n", 838 | " 3: 'agreement, application,business', \n", 839 | " 4: 'closed, applied, additional',\n", 840 | " 5: 'applied, card'}\n", 841 | "\n", 842 | "# Finally plot it\n", 843 | "%matplotlib inline \n", 844 | "\n", 845 | "#Create data frame that has the result of the MDS and the cluster \n", 846 | "df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) \n", 847 | "groups = df.groupby('label')\n", 848 | "\n", 849 | "# Set up plot\n", 850 | "fig, ax = plt.subplots(figsize=(17, 9)) # set size\n", 851 | "\n", 852 | "for name, group in groups:\n", 853 | " ax.plot(group.x, group.y, marker='o', linestyle='', ms=20, \n", 854 | " label=cluster_names[name], color=cluster_colors[name], 
\n", 855 | " mec='none')\n", 856 | " ax.set_aspect('auto')\n", 857 | " ax.tick_params(\\\n", 858 | " axis= 'x', \n", 859 | " which='both', \n", 860 | " bottom='off', \n", 861 | " top='off', \n", 862 | " labelbottom='off')\n", 863 | " ax.tick_params(\\\n", 864 | " axis= 'y', \n", 865 | " which='both', \n", 866 | " left='off', \n", 867 | " top='off', \n", 868 | " labelleft='off')\n", 869 | " \n", 870 | "ax.legend(numpoints=1) \n", 871 | "plt.show()\n", 872 | "\n" 873 | ] 874 | } 875 | ], 876 | "metadata": { 877 | "kernelspec": { 878 | "display_name": "Python [conda root]", 879 | "language": "python", 880 | "name": "conda-root-py" 881 | }, 882 | "language_info": { 883 | "codemirror_mode": { 884 | "name": "ipython", 885 | "version": 3 886 | }, 887 | "file_extension": ".py", 888 | "mimetype": "text/x-python", 889 | "name": "python", 890 | "nbconvert_exporter": "python", 891 | "pygments_lexer": "ipython3", 892 | "version": "3.5.2" 893 | } 894 | }, 895 | "nbformat": 4, 896 | "nbformat_minor": 1 897 | } 898 | -------------------------------------------------------------------------------- /Chapter 6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Deep Learning for NLP" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "Recipe 6-1. Retrieving Information" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import gensim\n", 26 | "from gensim.models import Word2Vec\n", 27 | "import numpy as np \n", 28 | "import nltk\n", 29 | "import itertools\n", 30 | "from nltk.corpus import stopwords\n", 31 | "from nltk.tokenize import sent_tokenize, word_tokenize\n", 32 | "import scipy \n", 33 | "from scipy import spatial\n", 34 | "from nltk.tokenize.toktok import ToktokTokenizer\n", 35 | "import re\n", 36 | "tokenizer = ToktokTokenizer()\n", 37 | "stopword_list = nltk.corpus.stopwords.words('english') \n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# Randomly taking sentences from internet \n", 49 | "\n", 50 | "Doc1 = [\"With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.\" ] \n", 51 | " \n", 52 | "Doc2 = [\"Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.\"]\n", 53 | "\n", 54 | "Doc3 = [\"He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems.\"]\n", 55 | "\n", 56 | "Doc4 = [\"But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. 
Sure enough, the ball would have clipped the top of middle and leg.\"]\n", 57 | "\n", 58 | "# Put all the documents in one list\n", 59 | "\n", 60 | "fin= Doc1+Doc2+Doc3+Doc4\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "#https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit\n", 72 | "\n", 73 | "#load the model\n", 74 | "\n", 75 | "model = gensim.models.KeyedVectors.load_word2vec_format('/GoogleNews-vectors-negative300.bin', binary=True)\n", 76 | "\n", 77 | "#Preprocessing \n", 78 | "\n", 79 | "def remove_stopwords(text, is_lower_case=False):\n", 80 | " pattern = r'[^a-zA-z0-9\\s]' \n", 81 | " text = re.sub(pattern, '', ''.join(text))\n", 82 | " tokens = tokenizer.tokenize(text)\n", 83 | " tokens = [token.strip() for token in tokens]\n", 84 | " if is_lower_case:\n", 85 | " filtered_tokens = [token for token in tokens if token not in stopword_list]\n", 86 | " else:\n", 87 | " filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]\n", 88 | " filtered_text = ' '.join(filtered_tokens) \n", 89 | " return filtered_text\n", 90 | "\n", 91 | "# Function to get the embedding vector for n dimension, we have used \"300\"\n", 92 | "\n", 93 | "def get_embedding(word):\n", 94 | " if word in model.wv.vocab:\n", 95 | " return model[x]\n", 96 | " else:\n", 97 | " return np.zeros(300)\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# Getting average vector for each document \n", 109 | "out_dict = {}\n", 110 | "for sen in fin:\n", 111 | " average_vector = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(remove_stopwords(sen))]), axis=0))\n", 112 | " dict = { sen : (average_vector) }\n", 113 | " out_dict.update(dict)\n", 114 | "\n", 115 | "# Function to calculate the similarity between the query vector and document vector\n", 116 | "\n", 117 | "def get_sim(query_embedding, average_vector_doc):\n", 118 | " sim = [(1 - scipy.spatial.distance.cosine(query_embedding, average_vector_doc))]\n", 119 | " return sim\n", 120 | "\n", 121 | "# Rank all the documents based on the similarity to get relevant docs\n", 122 | "\n", 123 | "def Ranked_documents(query):\n", 124 | " query_words = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(query.lower())],dtype=float), axis=0))\n", 125 | " rank = []\n", 126 | " for k,v in out_dict.items():\n", 127 | " rank.append((k, get_sim(query_words, v)))\n", 128 | " rank = sorted(rank,key=lambda t: t[1], reverse=True)\n", 129 | " print('Ranked Documents :')\n", 130 | " return rank\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# Call the IR function with a query\n", 142 | "\n", 143 | "Ranked_documents(\"cricket\")\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "#Let’s take one more example as may be driving. \n", 155 | "\n", 156 | "Ranked_documents(\"driving\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "raw", 161 | "metadata": {}, 162 | "source": [ 163 | "Recipe 6-2. 
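# get_embedding above should look up the word it was passed; a corrected sketch of the
# same helper (still returning a 300-dimensional zero vector for out-of-vocabulary words):
def get_embedding(word):
    if word in model.wv.vocab:
        return model[word]
    else:
        return np.zeros(300)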
Classifying Text with Deep Learning" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "#read file\n", 175 | "file_content = pd.read_csv('spam.csv', encoding = \"ISO-8859-1\")\n", 176 | "\n", 177 | "#check sample content in the email\n", 178 | "file_content['v2'][1]\n", 179 | "\n", 180 | "#Import library\n", 181 | "from nltk.corpus import stopwords\n", 182 | "from nltk import *\n", 183 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 184 | "from nltk.stem import WordNetLemmatizer\n", 185 | "import matplotlib.pyplot as plt\n", 186 | "from sklearn.model_selection import train_test_split\n", 187 | "\n", 188 | "# Remove stop words\n", 189 | "stop = stopwords.words('english')\n", 190 | "file_content['v2'] = file_content['v2'].apply(lambda x: \" \".join(x for x in x.split() if x not in stop))\n", 191 | "\n", 192 | "# Delete unwanted columns\n", 193 | "Email_Data = file_content[['v1', 'v2']]\n", 194 | "\n", 195 | "# Rename column names\n", 196 | "Email_Data = Email_Data.rename(columns={\"v1\":\"Target\", \"v2\":\"Email\"})\n", 197 | "Email_Data.head()\n", 198 | "\n", 199 | "#Delete punctuations, convert text in lower case and delete the double space \n", 200 | "\n", 201 | "Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', x.lower()))\n", 202 | "Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub(' ', ' ', x))\n", 203 | "Email_Data['Email'].head(5)\n" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "#Separating text(input) and target classes\n", 215 | "\n", 216 | "list_sentences_rawdata = Email_Data[\"Email\"].fillna(\"_na_\").values\n", 217 | "list_classes = [\"Target\"]\n", 218 | "target = Email_Data[list_classes].values\n", 219 | "\n", 220 | "\n", 221 | "To_Process=Email_Data[['Email', 'Target']]\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "#Train and test split with 80:20 ratio\n", 233 | "train, test = train_test_split(To_Process, test_size=0.2) \n", 234 | "\n", 235 | "# Define the sequence lengths, max number of words and embedding dimensions\n", 236 | "# Sequence length of each sentence. If more, truncate. If less, pad with zeros\n", 237 | "\n", 238 | "MAX_SEQUENCE_LENGTH = 300 \n", 239 | "\n", 240 | "# Top 20000 frequently occurring words\n", 241 | "MAX_NB_WORDS = 20000 \n", 242 | " \n", 243 | "# Get the frequently occurring words\n", 244 | " tokenizer = Tokenizer(num_words=MAX_NB_WORDS) \n", 245 | "tokenizer.fit_on_texts(train.Email) \n", 246 | "train_sequences = tokenizer.texts_to_sequences(train.Email)\n", 247 | "test_sequences = tokenizer.texts_to_sequences(test.Email)\n", 248 | "\n", 249 | "# dictionary containing words and their index\n", 250 | "word_index = tokenizer.word_index \n", 251 | "# print(tokenizer.word_index) \n", 252 | "# total words in the corpus\n", 253 | "print('Found %s unique tokens.' 
% len(word_index)) \n", 254 | "\n", 255 | "# get only the top frequent words on train\n", 256 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH) \n", 257 | "\n", 258 | "# get only the top frequent words on test\n", 259 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH) \n", 260 | "\n", 261 | "print(train_data.shape)\n", 262 | "print(test_data.shape)\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "train_labels = train['Target']\n", 274 | "test_labels = test['Target']\n", 275 | "\n", 276 | "#import library\n", 277 | "\n", 278 | "from sklearn.preprocessing import LabelEncoder\n", 279 | "# converts the character array to numeric array. Assigns levels to unique labels.\n", 280 | "\n", 281 | "le = LabelEncoder() \n", 282 | "le.fit(train_labels)\n", 283 | "train_labels = le.transform(train_labels)\n", 284 | "test_labels = le.transform(test_labels)\n", 285 | "\n", 286 | "print(le.classes_)\n", 287 | "print(np.unique(train_labels, return_counts=True))\n", 288 | "print(np.unique(test_labels, return_counts=True))\n" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "# changing data types\n", 300 | "labels_train = to_categorical(np.asarray(train_labels))\n", 301 | "labels_test = to_categorical(np.asarray(test_labels))\n", 302 | "print('Shape of data tensor:', train_data.shape)\n", 303 | "print('Shape of label tensor:', labels_train.shape)\n", 304 | "print('Shape of label tensor:', labels_test.shape)\n", 305 | "\n", 306 | "EMBEDDING_DIM = 100\n", 307 | "print(MAX_SEQUENCE_LENGTH)\n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "# Import Libraries \n", 319 | "import sys, os, re, csv, codecs, numpy as np, pandas as pd\n", 320 | "\n", 321 | "from keras.preprocessing.text import Tokenizer\n", 322 | "from keras.preprocessing.sequence import pad_sequences\n", 323 | "from keras.utils import to_categorical\n", 324 | "from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation\n", 325 | "from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN\n", 326 | "from keras.models import Model\n", 327 | "from keras.models import Sequential\n", 328 | "from keras import initializers, regularizers, constraints, optimizers, layers\n", 329 | "from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization\n", 330 | "from keras.layers import Conv1D, MaxPooling1D, Embedding\n", 331 | "from keras.models import Sequential\n", 332 | "\n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "print('Training CNN 1D model.')\n", 344 | "\n", 345 | "model = Sequential()\n", 346 | "model.add(Embedding(MAX_NB_WORDS,\n", 347 | " EMBEDDING_DIM,\n", 348 | " input_length=MAX_SEQUENCE_LENGTH\n", 349 | " ))\n", 350 | "model.add(Dropout(0.5))\n", 351 | "model.add(Conv1D(128, 5, activation='relu'))\n", 352 | "model.add(MaxPooling1D(5))\n", 353 | "model.add(Dropout(0.5))\n", 354 | "model.add(BatchNormalization())\n", 355 | "model.add(Conv1D(128, 5, activation='relu'))\n", 356 | "model.add(MaxPooling1D(5))\n", 357 | 
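# (added comment) Each Conv1D(128, 5) layer slides 128 filters of width 5 over the
# embedded sequence, and each MaxPooling1D(5) downsamples the sequence length by a
# factor of 5 before the flattened features reach the dense softmax classifier.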
"model.add(Dropout(0.5))\n", 358 | "model.add(BatchNormalization())\n", 359 | "model.add(Flatten())\n", 360 | "model.add(Dense(128, activation='relu'))\n", 361 | "model.add(Dense(2, activation='softmax'))\n", 362 | "\n", 363 | "\n", 364 | "model.compile(loss='categorical_crossentropy',\n", 365 | " optimizer='rmsprop',\n", 366 | " metrics=['acc'])\n", 367 | "\n", 368 | "model.fit(train_data, labels_train,\n", 369 | " batch_size=64,\n", 370 | " epochs=5,\n", 371 | " validation_data=(test_data, labels_test))\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": { 378 | "collapsed": true 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "#predictions on test data\n", 383 | "\n", 384 | "predicted=model.predict(test_data)\n", 385 | "predicted\n", 386 | "\n", 387 | "#model evaluation\n", 388 | "\n", 389 | "import sklearn\n", 390 | "from sklearn.metrics import precision_recall_fscore_support as score\n", 391 | "\n", 392 | "precision, recall, fscore, support = score(labels_test, predicted.round())\n", 393 | "\n", 394 | "print('precision: {}'.format(precision))\n", 395 | "print('recall: {}'.format(recall))\n", 396 | "print('fscore: {}'.format(fscore))\n", 397 | "print('support: {}'.format(support))\n", 398 | "\n", 399 | "print(\"############################\")\n", 400 | "\n", 401 | "print(sklearn.metrics.classification_report(labels_test, predicted.round()))\n" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": true 409 | }, 410 | "outputs": [], 411 | "source": [ 412 | "#import library\n", 413 | "from keras.layers.recurrent import SimpleRNN\n", 414 | "\n", 415 | "#model training\n", 416 | "\n", 417 | "print('Training SIMPLERNN model.')\n", 418 | "\n", 419 | "model = Sequential()\n", 420 | "model.add(Embedding(MAX_NB_WORDS,\n", 421 | " EMBEDDING_DIM,\n", 422 | " input_length=MAX_SEQUENCE_LENGTH\n", 423 | " ))\n", 424 | "model.add(SimpleRNN(2, input_shape=(None,1)))\n", 425 | "\n", 426 | "model.add(Dense(2,activation='softmax'))\n", 427 | "\n", 428 | "model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])\n", 429 | "\n", 430 | "model.fit(train_data, labels_train,\n", 431 | " batch_size=16,\n", 432 | " epochs=5,\n", 433 | " validation_data=(test_data, labels_test))\n" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "# prediction on test data\n", 445 | "predicted_Srnn=model.predict(test_data)\n", 446 | "predicted_Srnn\n", 447 | "\n", 448 | "#model evaluation\n", 449 | "\n", 450 | "from sklearn.metrics import precision_recall_fscore_support as score\n", 451 | "\n", 452 | "precision, recall, fscore, support = score(labels_test, predicted_Srnn.round())\n", 453 | "\n", 454 | "print('precision: {}'.format(precision))\n", 455 | "print('recall: {}'.format(recall))\n", 456 | "print('fscore: {}'.format(fscore))\n", 457 | "print('support: {}'.format(support))\n", 458 | "\n", 459 | "print(\"############################\")\n", 460 | "\n", 461 | "print(sklearn.metrics.classification_report(labels_test, predicted_Srnn.round()))\n" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "#model training\n", 473 | "\n", 474 | "print('Training LSTM model.')\n", 475 | "\n", 476 | "model = Sequential()\n", 477 | 
"model.add(Embedding(MAX_NB_WORDS,\n", 478 | " EMBEDDING_DIM,\n", 479 | " input_length=MAX_SEQUENCE_LENGTH\n", 480 | " ))\n", 481 | "model.add(LSTM(output_dim=16, activation='relu', inner_activation='hard_sigmoid',return_sequences=True))\n", 482 | "model.add(Dropout(0.2))\n", 483 | "model.add(BatchNormalization())\n", 484 | "\n", 485 | "model.add(Flatten())\n", 486 | "\n", 487 | "model.add(Dense(2,activation='softmax'))\n", 488 | "\n", 489 | "model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])\n", 490 | "\n", 491 | "model.fit(train_data, labels_train,\n", 492 | " batch_size=16,\n", 493 | " epochs=5,\n", 494 | " validation_data=(test_data, labels_test))\n" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "#prediction on text data\n", 506 | "predicted_lstm=model.predict(test_data)\n", 507 | "predicted_lstm\n", 508 | "\n", 509 | "#model evaluation \n", 510 | "\n", 511 | "from sklearn.metrics import precision_recall_fscore_support as score\n", 512 | "\n", 513 | "precision, recall, fscore, support = score(labels_test, predicted_lstm.round())\n", 514 | "\n", 515 | "print('precision: {}'.format(precision))\n", 516 | "print('recall: {}'.format(recall))\n", 517 | "print('fscore: {}'.format(fscore))\n", 518 | "print('support: {}'.format(support))\n", 519 | "\n", 520 | "print(\"############################\")\n", 521 | "\n", 522 | "print(sklearn.metrics.classification_report(labels_test, predicted_lstm.round()))\n" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "collapsed": true 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "#model training\n", 534 | "\n", 535 | "print('Training Bidirectional LSTM model.')\n", 536 | "\n", 537 | "model = Sequential()\n", 538 | "model.add(Embedding(MAX_NB_WORDS,\n", 539 | " EMBEDDING_DIM,\n", 540 | " input_length=MAX_SEQUENCE_LENGTH\n", 541 | " ))\n", 542 | "model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))\n", 543 | "model.add(Conv1D(16, kernel_size = 3, padding = \"valid\", kernel_initializer = \"glorot_uniform\"))\n", 544 | "model.add(GlobalMaxPool1D())\n", 545 | "model.add(Dense(50, activation=\"relu\"))\n", 546 | "model.add(Dropout(0.1))\n", 547 | "\n", 548 | "model.add(Dense(2,activation='softmax'))\n", 549 | "\n", 550 | "model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])\n", 551 | "\n", 552 | "model.fit(train_data, labels_train,\n", 553 | " batch_size=16,\n", 554 | " epochs=3,\n", 555 | " validation_data=(test_data, labels_test))\n" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "# prediction on test data\n", 567 | "\n", 568 | "predicted_blstm=model.predict(test_data)\n", 569 | "predicted_blstm\n", 570 | "\n", 571 | "#model evaluation\n", 572 | "\n", 573 | "from sklearn.metrics import precision_recall_fscore_support as score\n", 574 | "\n", 575 | "precision, recall, fscore, support = score(labels_test, predicted_blstm.round())\n", 576 | "\n", 577 | "print('precision: {}'.format(precision))\n", 578 | "print('recall: {}'.format(recall))\n", 579 | "print('fscore: {}'.format(fscore))\n", 580 | "print('support: {}'.format(support))\n", 581 | "\n", 582 | "print(\"############################\")\n", 583 | "\n", 584 | 
"print(sklearn.metrics.classification_report(labels_test, predicted_blstm.round()))\n" 585 | ] 586 | }, 587 | { 588 | "cell_type": "raw", 589 | "metadata": {}, 590 | "source": [ 591 | "Recipe 6-3. Next word/sequence of words suggestion – Next word prediction" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "file_content = pd.read_csv('spam.csv', encoding = \"ISO-8859-1\")\n", 603 | "\n", 604 | "# Just selecting emails and connverting it into list\n", 605 | "Email_Data = file_content[[ 'v2']]\n", 606 | "\n", 607 | "list_data = Email_Data.values.tolist()\n", 608 | "list_data \n" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": { 615 | "collapsed": true 616 | }, 617 | "outputs": [], 618 | "source": [ 619 | "import numpy as np\n", 620 | "import random\n", 621 | "import pandas as pd\n", 622 | "import sys\n", 623 | "import os\n", 624 | "import time\n", 625 | "import codecs\n", 626 | "import collections\n", 627 | "import numpy\n", 628 | "from keras.models import Sequential\n", 629 | "from keras.layers import Dense\n", 630 | "from keras.layers import Dropout\n", 631 | "from keras.layers import LSTM\n", 632 | "from keras.callbacks import ModelCheckpoint\n", 633 | "from keras.utils import np_utils\n", 634 | "from nltk.tokenize import sent_tokenize, word_tokenize\n", 635 | "import scipy \n", 636 | "from scipy import spatial\n", 637 | "from nltk.tokenize.toktok import ToktokTokenizer\n", 638 | "import re\n", 639 | "tokenizer = ToktokTokenizer()\n" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": { 646 | "collapsed": true 647 | }, 648 | "outputs": [], 649 | "source": [ 650 | "#Converting list to string\n", 651 | "from collections import Iterable\n", 652 | "\n", 653 | "\n", 654 | "def flatten(items):\n", 655 | " \"\"\"Yield items from any nested iterable\"\"\"\n", 656 | " for x in items:\n", 657 | " if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):\n", 658 | " for sub_x in flatten(x):\n", 659 | " yield sub_x\n", 660 | " else:\n", 661 | " yield x\n", 662 | "\n", 663 | "\n", 664 | "TextData=list(flatten(list_data)) \n", 665 | "TextData = ''.join(TextData) \n", 666 | "\n", 667 | "# Remove unwanted lines and converting into lower case\n", 668 | "TextData = TextData.replace('\\n','')\n", 669 | "TextData = TextData.lower() \n", 670 | "\n", 671 | "pattern = r'[^a-zA-z0-9\\s]' \n", 672 | "TextData = re.sub(pattern, '', ''.join(TextData)) \n", 673 | "\n", 674 | "# Tokenizing\n", 675 | "\n", 676 | "tokens = tokenizer.tokenize(TextData)\n", 677 | "tokens = [token.strip() for token in tokens] \n", 678 | "\n", 679 | "# get the distinct words and sort it\n", 680 | "\n", 681 | "word_counts = collections.Counter(tokens)\n", 682 | "word_c = len(word_counts)\n", 683 | "print(word_c)\n", 684 | "\n", 685 | "distinct_words = [x[0] for x in word_counts.most_common()]\n", 686 | "distinct_words_sorted = list(sorted(distinct_words)) \n", 687 | "\n", 688 | "\n", 689 | "# Generate indexing for all words\n", 690 | "\n", 691 | "word_index = {x: i for i, x in enumerate(distinct_words_sorted)} \n", 692 | "\n", 693 | "\n", 694 | "# decide on sentence lenght\n", 695 | "\n", 696 | "sentence_length = 25\n" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": { 703 | "collapsed": true 704 | }, 705 | "outputs": [], 706 | "source": [ 707 | 
"#prepare the dataset of input to output pairs encoded as integers\n", 708 | "# Generate the data for the model\n", 709 | "\n", 710 | "#input = the input sentence to the model with index \n", 711 | "#output = output of the model with index\n", 712 | "\n", 713 | "InputData = []\n", 714 | "OutputData = []\n", 715 | "\n", 716 | "for i in range(0, word_c - sentence_length, 1):\n", 717 | " X = tokens[i:i + sentence_length]\n", 718 | " Y = tokens[i + sentence_length]\n", 719 | " InputData.append([word_index[char] for char in X])\n", 720 | " OutputData.append(word_index[Y])\n", 721 | "\n", 722 | "print (InputData[:1])\n", 723 | "print (\"\\n\")\n", 724 | "print(OutputData[:1]) \n" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": { 731 | "collapsed": true 732 | }, 733 | "outputs": [], 734 | "source": [ 735 | "# Generate X \n", 736 | "X = numpy.reshape(InputData, (len(InputData), sentence_length, 1))\n", 737 | "\n", 738 | "\n", 739 | "# One hot encode the output variable\n", 740 | "Y = np_utils.to_categorical(OutputData) \n", 741 | "\n", 742 | "Y\n" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": { 749 | "collapsed": true 750 | }, 751 | "outputs": [], 752 | "source": [ 753 | "# define the LSTM model\n", 754 | "model = Sequential()\n", 755 | "model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))\n", 756 | "model.add(Dropout(0.2))\n", 757 | "model.add(Dense(Y.shape[1], activation='softmax'))\n", 758 | "model.compile(loss='categorical_crossentropy', optimizer='adam')\n", 759 | "\n", 760 | " \n", 761 | "#define the checkpoint\n", 762 | "file_name_path=\"weights-improvement-{epoch:02d}-{loss:.4f}.hdf5\"\n", 763 | "checkpoint = ModelCheckpoint(file_name_path, monitor='loss', verbose=1, save_best_only=True, mode='min')\n", 764 | "callbacks = [checkpoint] \n", 765 | "\n", 766 | "#fit the model\n", 767 | "model.fit(X, Y, epochs=5, batch_size=128, callbacks=callbacks) \n" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": { 774 | "collapsed": true 775 | }, 776 | "outputs": [], 777 | "source": [ 778 | "# load the network weights\n", 779 | "file_name = \"weights-improvement-05-6.8213.hdf5\"\n", 780 | "model.load_weights(file_name)\n", 781 | "model.compile(loss='categorical_crossentropy', optimizer='adam') \n" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": { 788 | "collapsed": true 789 | }, 790 | "outputs": [], 791 | "source": [ 792 | "# Generating random sequence\n", 793 | "start = numpy.random.randint(0, len(InputData))\n", 794 | "input_sent = InputData[start]\n", 795 | "\n", 796 | "# Generate index of the next word of the email \n", 797 | "\n", 798 | "X = numpy.reshape(input_sent, (1, len(input_sent), 1))\n", 799 | "predict_word = model.predict(X, verbose=0)\n", 800 | "index = numpy.argmax(predict_word)\n", 801 | "\n", 802 | "print(input_sent)\n", 803 | "print (\"\\n\")\n", 804 | "print(index)\n" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": { 811 | "collapsed": true 812 | }, 813 | "outputs": [], 814 | "source": [ 815 | "# Convert these indexes back to words\n", 816 | "\n", 817 | "word_index_rev = dict((i, c) for i, c in enumerate(tokens))\n", 818 | "result = word_index_rev[index]\n", 819 | "sent_in = [word_index_rev[value] for value in input_sent]\n", 820 | "\n", 821 | "print(sent_in)\n", 822 | "print (\"\\n\")\n", 823 | "print(result)\n" 824 
| ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": { 830 | "collapsed": true 831 | }, 832 | "outputs": [], 833 | "source": [] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": { 839 | "collapsed": true 840 | }, 841 | "outputs": [], 842 | "source": [] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "metadata": { 848 | "collapsed": true 849 | }, 850 | "outputs": [], 851 | "source": [] 852 | } 853 | ], 854 | "metadata": { 855 | "kernelspec": { 856 | "display_name": "Python [conda root]", 857 | "language": "python", 858 | "name": "conda-root-py" 859 | }, 860 | "language_info": { 861 | "codemirror_mode": { 862 | "name": "ipython", 863 | "version": 3 864 | }, 865 | "file_extension": ".py", 866 | "mimetype": "text/x-python", 867 | "name": "python", 868 | "nbconvert_exporter": "python", 869 | "pygments_lexer": "ipython3", 870 | "version": "3.5.2" 871 | } 872 | }, 873 | "nbformat": 4, 874 | "nbformat_minor": 1 875 | } 876 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Akshay Kulkarni and Adarsha Shivananda 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Natural Language Processing Recipes*](https://www.apress.com/9781484242667) by Akshay Kulkarni and Adarsha Shivananda (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484242667.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. -------------------------------------------------------------------------------- /errata.md: -------------------------------------------------------------------------------- 1 | # Errata for *Book Title* 2 | 3 | On **page xx** [Summary of error]: 4 | 5 | Details of error here. Highlight key pieces in **bold**. 6 | 7 | *** 8 | 9 | On **page xx** [Summary of error]: 10 | 11 | Details of error here. Highlight key pieces in **bold**. 12 | 13 | *** --------------------------------------------------------------------------------