├── .gitignore ├── Chapter02 ├── 02_01_nlp_libraries_intro.ipynb ├── 02_02_web_scraping_intro.ipynb ├── Dataset │ └── flight_data.zip └── Output Files │ └── ecommerce.csv ├── Chapter03 ├── All the basic preprocessing in one place.ipynb ├── Dataset │ └── zomato_reviews.csv ├── Stemming, Lemmatization, Stopword Removal, Case-Folding, N-grams and HTML tags.ipynb └── Understanding Tokenization.ipynb ├── Chapter04 ├── Bag of Words in Action.ipynb ├── Chatbot.ipynb ├── Cosine Similarity.ipynb ├── CountVectorizer for Bag of Words model.ipynb ├── Dataset │ └── qa_Electronics.zip ├── Matrix Representation.ipynb ├── One Hot Vectors.ipynb └── TfIdf Vectorizer for text representation.ipynb ├── Chapter05 ├── Building a Word2Vec model.ipynb ├── Understanding the Pre-trained Word2Vec.ipynb └── Word Mover's Distance.ipynb ├── Chapter06 ├── Doc2Vec in Action.ipynb ├── comments.zip ├── fastText based Applications (Spell Correction_Auto Suggestion_Document Distances).ipynb └── fastText from scratch.ipynb ├── Chapter07 ├── Data_Preprocessing.ipynb ├── Sentiment Analyzer Model Reload.ipynb ├── Sentiment Analyzer.ipynb ├── amazon_cells_labelled.txt ├── nb_sa └── vectorizer_sa ├── Chapter08 ├── Dataset │ ├── test_dataset.txt │ └── training_data.txt ├── Output Files │ ├── question_classification_model.json │ └── question_classification_weights.h5 └── Question Classification using Neural Networks (Multi-class).ipynb ├── Chapter09 ├── Dataset │ └── Sarcasm_Headlines_Dataset_v2.zip ├── Output Files │ ├── sarcasm_detection_model_cnn.h5 │ └── sarcasm_detection_model_cnn.json └── Sarcasm Detection using CNNs.ipynb ├── Chapter10 ├── Dataset │ └── hotel_data.zip ├── Output Files │ ├── text_generation_using_LSTM.h5 │ └── text_generation_using_LSTM.json └── Text Generation using LSTM.ipynb ├── Chapter11 ├── French To English Translation.ipynb └── dataset │ └── bilingual_pairs.zip ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | "Word Embeddings and Distance Measurements for Text/GoogleNews-vectors-negative300.bin" 4 | "Applying Convolutions To Text/GoogleNews-vectors-negative300.bin" 5 | -------------------------------------------------------------------------------- /Chapter02/02_01_nlp_libraries_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'PPG'" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "#!pip install pandas\n", 21 | "import pandas as pd\n", 22 | "data = pd.read_csv(\"flight_data.csv\")\n", 23 | "data.groupby(\"ORIGIN\").mean()[\"DEP_DELAY\"].idxmax()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "matrix([[1, 1, 1, 2]], dtype=int64)" 35 | ] 36 | }, 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "#!pip install scikit-learn\n", 44 | "from sklearn.feature_extraction.text import CountVectorizer\n", 45 | "sentence = [\"How to change payment method and payment frequency\"]\n", 46 | "vectorizer = CountVectorizer(stop_words='english')\n", 47 | "vectorizer.fit_transform(sentence).todense()" 48 | ] 49 | }, 50 | { 51 | "cell_type": 
"code", 52 | "execution_count": 6, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "['Who', 'would', 'have', 'thought', 'that', 'computer', 'programs', 'would', 'be', 'analyzing', 'human', 'sentiments']\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "#!pip install nltk\n", 65 | "import nltk\n", 66 | "text = \"Who would have thought that computer programs would be analyzing human sentiments\"\n", 67 | "from nltk.tokenize import word_tokenize\n", 68 | "tokens = word_tokenize(text)\n", 69 | "print(tokens)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 7, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "stopwords = nltk.corpus.stopwords.words('english')\n", 87 | "print(stopwords)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 8, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "['Who',\n", 99 | " 'would',\n", 100 | " 'thought',\n", 101 | " 'computer',\n", 102 | " 'programs',\n", 103 | " 'would',\n", 104 | " 'analyzing',\n", 105 | " 'human',\n", 106 | " 'sentiments']" 107 | ] 108 | }, 109 | "execution_count": 8, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "[word for word in tokens if word not in stopwords]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 9, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "['Who', 'would', 'have', 'thought', 'that', 'computer', 'program', 'would', 'be', 'analyzing', 'human', 'sentiment']\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "from nltk.stem import WordNetLemmatizer\n", 133 | "text = \"Who would have thought that computer programs would be analyzing human sentiments\"\n", 134 | "tokens = word_tokenize(text)\n", 135 | 
"lemmatizer = WordNetLemmatizer()\n", 136 | "tokens=[lemmatizer.lemmatize(word) for word in tokens]\n", 137 | "print(tokens)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 10, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "['who', 'would', 'have', 'thought', 'that', 'comput', 'program', 'would', 'be', 'analyz', 'human', 'sentiment']\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "from nltk.stem import PorterStemmer\n", 155 | "text = \"Who would have thought that computer programs would be analyzing human sentiments\"\n", 156 | "tokens=word_tokenize(text.lower())\n", 157 | "ps = PorterStemmer()\n", 158 | "tokens=[ps.stem(word) for word in tokens]\n", 159 | "print(tokens)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 11, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "[('eat', 'NN')]" 171 | ] 172 | }, 173 | "execution_count": 11, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "nltk.pos_tag([\"your\"])\n", 180 | "nltk.pos_tag([\"beautiful\"])\n", 181 | "nltk.pos_tag([\"eat\"])" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 12, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "[[('Usain', 'NN')],\n", 193 | " [('Bolt', 'NN')],\n", 194 | " [('is', 'VBZ')],\n", 195 | " [('the', 'DT')],\n", 196 | " [('fastest', 'JJS')],\n", 197 | " [('runner', 'NN')],\n", 198 | " [('in', 'IN')],\n", 199 | " [('the', 'DT')],\n", 200 | " [('world', 'NN')]]" 201 | ] 202 | }, 203 | "execution_count": 12, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "text = \"Usain Bolt is the fastest runner in the world\"\n", 210 | "tokens = word_tokenize(text)\n", 211 | "[nltk.pos_tag([word]) for word in tokens]" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 13, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "$: dollar\n", 224 | " $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$\n", 225 | "'': closing quotation mark\n", 226 | " ' ''\n", 227 | "(: opening parenthesis\n", 228 | " ( [ {\n", 229 | "): closing parenthesis\n", 230 | " ) ] }\n", 231 | ",: comma\n", 232 | " ,\n", 233 | "--: dash\n", 234 | " --\n", 235 | ".: sentence terminator\n", 236 | " . ! ?\n", 237 | ":: colon or ellipsis\n", 238 | " : ; ...\n", 239 | "CC: conjunction, coordinating\n", 240 | " & 'n and both but either et for less minus neither nor or plus so\n", 241 | " therefore times v. versus vs. 
whether yet\n", 242 | "CD: numeral, cardinal\n", 243 | " mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-\n", 244 | " seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025\n", 245 | " fifteen 271,124 dozen quintillion DM2,000 ...\n", 246 | "DT: determiner\n", 247 | " all an another any both del each either every half la many much nary\n", 248 | " neither no some such that the them these this those\n", 249 | "EX: existential there\n", 250 | " there\n", 251 | "FW: foreign word\n", 252 | " gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous\n", 253 | " lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte\n", 254 | " terram fiche oui corporis ...\n", 255 | "IN: preposition or conjunction, subordinating\n", 256 | " astride among uppon whether out inside pro despite on by throughout\n", 257 | " below within for towards near behind atop around if like until below\n", 258 | " next into if beside ...\n", 259 | "JJ: adjective or numeral, ordinal\n", 260 | " third ill-mannered pre-war regrettable oiled calamitous first separable\n", 261 | " ectoplasmic battery-powered participatory fourth still-to-be-named\n", 262 | " multilingual multi-disciplinary ...\n", 263 | "JJR: adjective, comparative\n", 264 | " bleaker braver breezier briefer brighter brisker broader bumper busier\n", 265 | " calmer cheaper choosier cleaner clearer closer colder commoner costlier\n", 266 | " cozier creamier crunchier cuter ...\n", 267 | "JJS: adjective, superlative\n", 268 | " calmest cheapest choicest classiest cleanest clearest closest commonest\n", 269 | " corniest costliest crassest creepiest crudest cutest darkest deadliest\n", 270 | " dearest deepest densest dinkiest ...\n", 271 | "LS: list item marker\n", 272 | " A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005\n", 273 | " SP-44007 Second Third Three Two * a b c d first five four one six three\n", 274 | " two\n", 275 | "MD: modal auxiliary\n", 276 | " can cannot could couldn't dare may might must need ought shall should\n", 277 | " shouldn't will would\n", 278 | "NN: noun, common, singular or mass\n", 279 | " common-carrier cabbage knuckle-duster Casino afghan shed thermostat\n", 280 | " investment slide humour falloff slick wind hyena override subhumanity\n", 281 | " machinist ...\n", 282 | "NNP: noun, proper, singular\n", 283 | " Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos\n", 284 | " Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA\n", 285 | " Shannon A.K.C. 
Meltex Liverpool ...\n", 286 | "NNPS: noun, proper, plural\n", 287 | " Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists\n", 288 | " Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques\n", 289 | " Apache Apaches Apocrypha ...\n", 290 | "NNS: noun, common, plural\n", 291 | " undergraduates scotches bric-a-brac products bodyguards facets coasts\n", 292 | " divestitures storehouses designs clubs fragrances averages\n", 293 | " subjectivists apprehensions muses factory-jobs ...\n", 294 | "PDT: pre-determiner\n", 295 | " all both half many quite such sure this\n", 296 | "POS: genitive marker\n", 297 | " ' 's\n", 298 | "PRP: pronoun, personal\n", 299 | " hers herself him himself hisself it itself me myself one oneself ours\n", 300 | " ourselves ownself self she thee theirs them themselves they thou thy us\n", 301 | "PRP$: pronoun, possessive\n", 302 | " her his mine my our ours their thy your\n", 303 | "RB: adverb\n", 304 | " occasionally unabatingly maddeningly adventurously professedly\n", 305 | " stirringly prominently technologically magisterially predominately\n", 306 | " swiftly fiscally pitilessly ...\n", 307 | "RBR: adverb, comparative\n", 308 | " further gloomier grander graver greater grimmer harder harsher\n", 309 | " healthier heavier higher however larger later leaner lengthier less-\n", 310 | " perfectly lesser lonelier longer louder lower more ...\n", 311 | "RBS: adverb, superlative\n", 312 | " best biggest bluntest earliest farthest first furthest hardest\n", 313 | " heartiest highest largest least less most nearest second tightest worst\n", 314 | "RP: particle\n", 315 | " aboard about across along apart around aside at away back before behind\n", 316 | " by crop down ever fast for forth from go high i.e. in into just later\n", 317 | " low more off on open out over per pie raising start teeth that through\n", 318 | " under unto up up-pp upon whole with you\n", 319 | "SYM: symbol\n", 320 | " % & ' '' ''. ) ). * + ,. 
< = > @ A[fj] U.S U.S.S.R * ** ***\n", 321 | "TO: \"to\" as preposition or infinitive marker\n", 322 | " to\n", 323 | "UH: interjection\n", 324 | " Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen\n", 325 | " huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly\n", 326 | " man baby diddle hush sonuvabitch ...\n", 327 | "VB: verb, base form\n", 328 | " ask assemble assess assign assume atone attention avoid bake balkanize\n", 329 | " bank begin behold believe bend benefit bevel beware bless boil bomb\n", 330 | " boost brace break bring broil brush build ...\n", 331 | "VBD: verb, past tense\n", 332 | " dipped pleaded swiped regummed soaked tidied convened halted registered\n", 333 | " cushioned exacted snubbed strode aimed adopted belied figgered\n", 334 | " speculated wore appreciated contemplated ...\n", 335 | "VBG: verb, present participle or gerund\n", 336 | " telegraphing stirring focusing angering judging stalling lactating\n", 337 | " hankerin' alleging veering capping approaching traveling besieging\n", 338 | " encrypting interrupting erasing wincing ...\n", 339 | "VBN: verb, past participle\n", 340 | " multihulled dilapidated aerosolized chaired languished panelized used\n", 341 | " experimented flourished imitated reunifed factored condensed sheared\n", 342 | " unsettled primed dubbed desired ...\n", 343 | "VBP: verb, present tense, not 3rd person singular\n", 344 | " predominate wrap resort sue twist spill cure lengthen brush terminate\n", 345 | " appear tend stray glisten obtain comprise detest tease attract\n", 346 | " emphasize mold postpone sever return wag ...\n", 347 | "VBZ: verb, present tense, 3rd person singular\n", 348 | " bases reconstructs marks mixes displeases seals carps weaves snatches\n", 349 | " slumps stretches authorizes smolders pictures emerges stockpiles\n", 350 | " seduces fizzes uses bolsters slaps speaks pleads ...\n", 351 | "WDT: WH-determiner\n", 352 | " that what whatever which whichever\n", 353 | "WP: WH-pronoun\n", 354 | " that what whatever whatsoever which who whom whosoever\n", 355 | "WP$: WH-pronoun, possessive\n", 356 | " whose\n", 357 | "WRB: Wh-adverb\n", 358 | " how however whence whenever where whereby whereever wherein whereof why\n", 359 | "``: opening quotation mark\n", 360 | " ` ``\n" 361 | ] 362 | }, 363 | { 364 | "name": "stderr", 365 | "output_type": "stream", 366 | "text": [ 367 | "[nltk_data] Downloading package tagsets to\n", 368 | "[nltk_data] C:\\Users\\User\\AppData\\Roaming\\nltk_data...\n", 369 | "[nltk_data] Package tagsets is already up-to-date!\n" 370 | ] 371 | } 372 | ], 373 | "source": [ 374 | "nltk.download('tagsets') # need to download first time\n", 375 | "nltk.help.upenn_tagset()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 15, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/plain": [ 386 | "Sentiment(polarity=-1.0, subjectivity=1.0)" 387 | ] 388 | }, 389 | "execution_count": 15, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "#!pip install textblob\n", 396 | "from textblob import TextBlob\n", 397 | "TextBlob(\"I love pizza\").sentiment\n", 398 | "TextBlob(\"The weather is excellent\").sentiment\n", 399 | "TextBlob(\"What a terrible thing to say\").sentiment" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 16, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "Qui 
savait que la traduction pouvait être amusante\n", 412 | "谁知道翻译会很有趣\n", 413 | "कौन जानता था कि अनुवाद मज़ेदार हो सकता है\n" 414 | ] 415 | }, 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "[('The', 'DT'),\n", 420 | " ('global', 'JJ'),\n", 421 | " ('economy', 'NN'),\n", 422 | " ('is', 'VBZ'),\n", 423 | " ('expected', 'VBN'),\n", 424 | " ('to', 'TO'),\n", 425 | " ('grow', 'VB'),\n", 426 | " ('this', 'DT'),\n", 427 | " ('year', 'NN')]" 428 | ] 429 | }, 430 | "execution_count": 16, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "languages = ['fr','zh-CN','hi']\n", 437 | "for language in languages:\n", 438 | " print(TextBlob(\"Who knew translation could be fun\").translate(to=language))\n", 439 | " \n", 440 | "TextBlob(\"The global economy is expected to grow this year\").tags" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 18, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/plain": [ 451 | "{'neg': 0.0, 'neu': 0.604, 'pos': 0.396, 'compound': 0.5079}" 452 | ] 453 | }, 454 | "execution_count": 18, 455 | "metadata": {}, 456 | "output_type": "execute_result" 457 | } 458 | ], 459 | "source": [ 460 | "#!pip install vaderSentiment\n", 461 | "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n", 462 | "analyser = SentimentIntensityAnalyzer()\n", 463 | "\n", 464 | "analyser.polarity_scores(\"This book is very good\")\n", 465 | "analyser.polarity_scores(\"OMG! The book is so cool\")" 466 | ] 467 | } 468 | ], 469 | "metadata": { 470 | "kernelspec": { 471 | "display_name": "Python 3", 472 | "language": "python", 473 | "name": "python3" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 3 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython3", 485 | "version": "3.7.7" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 4 490 | } 491 | -------------------------------------------------------------------------------- /Chapter02/02_02_web_scraping_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
[HTML rendering of product_df.head() omitted: the table markup was lost in extraction; the same Titles/Prices/Ratings rows appear in the text/plain output below]\n
" 68 | ], 69 | "text/plain": [ 70 | " Titles Prices Ratings\n", 71 | "0 Asus VivoBook X441NA-GA190 $295.99 3\n", 72 | "1 Prestigio SmartBook 133S Dark Grey $299.00 2\n", 73 | "2 Prestigio SmartBook 133S Gold $299.00 4\n", 74 | "3 Aspire E1-510 $306.99 3\n", 75 | "4 Lenovo V110-15IAP $321.94 3" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "#!pip install requests\n", 85 | "#!pip install beautifulsoup4\n", 86 | "#!pip install pandas\n", 87 | "\n", 88 | "import requests\n", 89 | "from bs4 import BeautifulSoup\n", 90 | "import pandas as pd\n", 91 | "\n", 92 | "titles = []\n", 93 | "prices = []\n", 94 | "ratings = []\n", 95 | "url = 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'\n", 96 | "request = requests.get(url)\n", 97 | "soup = BeautifulSoup(request.text, \"html.parser\")\n", 98 | "for product in soup.find_all('div', {'class': 'col-sm-4 col-lg-4 col-md-4'}):\n", 99 | " for pr in product.find_all('div', {'class': 'caption'}):\n", 100 | " for p in pr.find_all('h4', {'class': 'pull-right price'}):\n", 101 | " prices.append(p.text)\n", 102 | " for title in pr.find_all('a' , {'title'}):\n", 103 | " titles.append(title.get('title'))\n", 104 | " for rt in product.find_all('div', {'class': 'ratings'}):\n", 105 | " ratings.append(len(rt.find_all('span', {'class': 'glyphicon glyphicon-star'})))\n", 106 | "\n", 107 | "\n", 108 | "#build dataframe and export to csv \n", 109 | "product_df = pd.DataFrame(zip(titles,prices,ratings), columns =['Titles', 'Prices', 'Ratings']) \n", 110 | "product_df.head()\n", 111 | "product_df.to_csv(\"ecommerce.csv\",index=False)\n" 112 | ] 113 | } 114 | ], 115 | "metadata": { 116 | "kernelspec": { 117 | "display_name": "Python 3", 118 | "language": "python", 119 | "name": "python3" 120 | }, 121 | "language_info": { 122 | "codemirror_mode": { 123 | "name": "ipython", 124 | "version": 3 125 | }, 126 | "file_extension": ".py", 127 | "mimetype": "text/x-python", 128 | "name": "python", 129 | "nbconvert_exporter": "python", 130 | "pygments_lexer": "ipython3", 131 | "version": "3.7.7" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 4 136 | } 137 | -------------------------------------------------------------------------------- /Chapter02/Dataset/flight_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter02/Dataset/flight_data.zip -------------------------------------------------------------------------------- /Chapter02/Output Files/ecommerce.csv: -------------------------------------------------------------------------------- 1 | Titles,Prices,Ratings 2 | Asus VivoBook X441NA-GA190,$295.99,3 3 | Prestigio SmartBook 133S Dark Grey,$299.00,2 4 | Prestigio SmartBook 133S Gold,$299.00,4 5 | Aspire E1-510,$306.99,3 6 | Lenovo V110-15IAP,$321.94,3 7 | Lenovo V110-15IAP,$356.49,2 8 | Hewlett Packard 250 G6 Dark Ash Silver,$364.46,1 9 | Acer Aspire 3 A315-31 Black,$372.70,2 10 | Acer Aspire A315-31-C33J,$379.94,2 11 | Acer Aspire ES1-572 Black,$379.95,4 12 | Acer Aspire 3 A315-31 Black,$391.48,4 13 | Acer Aspire 3 A315-21,$393.88,3 14 | Asus VivoBook Max,$399.00,1 15 | Asus VivoBook E502NA-GO022T Dark Blue,$399.99,4 16 | Lenovo ThinkPad E31-80,$404.23,1 17 | Acer Aspire 3 A315-31 Black,$408.98,4 18 | Lenovo V110-15ISK,$409.63,3 19 | Acer Aspire ES1-732 Black,$410.46,4 20 | Asus 
VivoBook 15 X540NA-GQ026T,$410.66,4 21 | Packard 255 G2,$416.99,2 22 | Asus EeeBook R416NA-FA014T,$433.30,1 23 | Acer Aspire 3 A315-51,$436.29,1 24 | Acer Aspire ES1-572 Black,$436.29,1 25 | Acer Extensa 15 (2540) Black,$439.73,4 26 | Acer Aspire ES1-572 Black,$454.62,1 27 | Lenovo V110-15ISK,$454.73,2 28 | Acer Aspire A315-51-33TG,$457.38,3 29 | Lenovo V110-15IKB,$465.95,1 30 | Asus VivoBook 15 X540UA-DM260 Chocolate Black,$468.56,3 31 | Acer Aspire ES1-572 Black,$469.10,3 32 | Lenovo V510 Black,$484.23,3 33 | Acer Aspire ES1-572 Black,$485.90,3 34 | Lenovo V510 Black,$487.80,2 35 | Acer Swift 1 SF113-31 Silver,$488.64,3 36 | Dell Vostro 15,$488.78,4 37 | Acer Aspire 3 A315-51 Black,$494.71,4 38 | Dell Vostro 15 (3568) Red,$497.17,1 39 | Lenovo V510 Black,$498.23,3 40 | HP 250 G3,$520.99,2 41 | Acer Spin 5,$564.98,2 42 | HP 350 G1,$577.99,2 43 | Aspire E1-572G,$581.99,1 44 | Pavilion,$609.99,1 45 | Acer Aspire A515-51-5654,$679.00,2 46 | Dell Inspiron 15,$679.00,2 47 | Asus VivoBook S14,$729.00,1 48 | ProBook,$739.99,4 49 | Inspiron 15,$745.99,3 50 | Asus ROG STRIX GL553VD-DM256,$799.00,2 51 | Acer Nitro 5 AN515-51,$809.00,1 52 | Asus ROG STRIX GL553VD-DM256,$899.00,1 53 | Lenovo ThinkPad L570,$999.00,3 54 | ThinkPad Yoga,$1033.99,2 55 | Lenovo ThinkPad L460,$1096.02,4 56 | Dell Inspiron 15 (7567) Black,$1098.42,1 57 | MSI GL72M 7RDX,$1099.00,4 58 | MSI GL72M 7RDX,$1099.00,1 59 | Asus ROG Strix GL553VD-DM535T,$1101.83,2 60 | Dell Latitude 5280,$1102.66,1 61 | Dell Latitude 5480,$1110.14,3 62 | Lenovo Legion Y520-15IKBM,$1112.91,4 63 | Toshiba Portege Z30-C-16J Grey,$1114.55,1 64 | Acer Predator Helios 300 (PH317-51),$1123.87,2 65 | Acer Aspire 7 A715-71G,$1123.87,2 66 | Dell Inspiron 17 2in1 (7779) Silver,$1124.20,3 67 | Dell Latitude 5480,$1133.82,4 68 | Lenovo Legion Y520,$1133.91,1 69 | Asus AsusPro Advanced BU401LA-FA271G Dark Grey,$1139.54,3 70 | Acer Nitro 5 AN515-51,$1140.62,3 71 | Dell Latitude 5480,$1143.40,4 72 | Dell Inspiron 15 (7567) Black,$1144.20,1 73 | Dell Latitude 5580,$1144.40,3 74 | Lenovo Legion Y520-15IKBM,$1149.00,3 75 | MSI GP62M 7RDX Leopard,$1149.00,1 76 | Lenovo Yoga 720 Grey,$1149.73,2 77 | Toshiba Portege Z30-C-16L Grey,$1154.04,1 78 | Acer TravelMate P645-S-511A Black,$1170.10,1 79 | Dell Latitude 5580,$1178.19,3 80 | ThinkPad T540p,$1178.99,1 81 | MSI GS63 7RD Stealth,$1179.00,1 82 | Dell Latitude 5480,$1187.88,3 83 | Acer Predator Helios 300 (PH317-51),$1187.98,3 84 | MSI GL62M 7REX,$1199.00,1 85 | MSI GL62M 7REX2,$1199.00,2 86 | Lenovo Yoga 910 Grey,$1199.73,3 87 | Toshiba Portege X30-D-10J Black/Blue,$1203.41,4 88 | Lenovo IdeaPad Miix 510 Platinum Silver,$1212.16,4 89 | Acer Predator Helios 300 (PH317-51),$1221.58,1 90 | ThinkPad Yoga,$1223.99,3 91 | Asus VivoBook Pro 15 N580VN-FI006T Gold Metal,$1235.49,2 92 | Dell Latitude 5480,$1238.37,2 93 | Asus ZenBook UX530UX-FY040T Blue,$1239.20,1 94 | ThinkPad X230,$1244.99,3 95 | Asus ROG Strix GL753VE-GC096T,$1259.00,4 96 | "Apple MacBook Air 13""",$1260.13,4 97 | Dell Latitude 5480,$1271.06,1 98 | Hewlett Packard Spectre 13-v106na Dark Ash Silver,$1273.11,3 99 | Dell XPS 13,$1281.99,3 100 | Toshiba Portege Z30-C-16K Grey,$1294.74,1 101 | MSI GL62VR 7RFX,$1299.00,3 102 | Dell Latitude 5480,$1310.39,3 103 | ThinkPad X240,$1311.99,3 104 | Hewlett Packard ProBook 640 G3,$1326.83,1 105 | "Apple MacBook Pro 13"" Space Gray",$1333.00,1 106 | Dell Latitude 5580,$1337.28,1 107 | Dell Latitude 5480,$1338.37,2 108 | Dell Latitude 5580,$1341.22,3 109 | "Apple MacBook Air 13""",$1347.78,2 110 | Lenovo ThinkPad 
T470,$1349.23,1 111 | Lenovo ThinkPad Yoga 370 Black,$1362.24,2 112 | Toshiba Portege X20W-D-10V Black/Blue,$1366.32,1 113 | Asus ASUSPRO B9440UA-GV0279R Gray,$1381.13,1 114 | Lenovo Legion Y720,$1399.00,3 115 | Asus ROG Strix GL702VM-GC146T,$1399.00,3 116 | Asus ROG Strix GL702ZC-GC154T,$1769.00,4 117 | Asus ROG Strix GL702ZC-GC209T,$1769.00,1 118 | Asus ROG Strix SCAR Edition GL503VM-ED115T,$1799.00,3 119 | -------------------------------------------------------------------------------- /Chapter03/All the basic preprocessing in one place.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# All the basic preprocessing in one place\n", 8 | "\n", 9 | "#### Let's apply all the preprocessing methods we have discussed so far on our Zomato dataset and see how everything works together\n", 10 | "\n", 11 | "@author: Aman Kedia" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "[nltk_data] Downloading package stopwords to\n", 24 | "[nltk_data] /Users/amankedia/nltk_data...\n", 25 | "[nltk_data] Package stopwords is already up-to-date!\n", 26 | "[nltk_data] Downloading package wordnet to\n", 27 | "[nltk_data] /Users/amankedia/nltk_data...\n", 28 | "[nltk_data] Package wordnet is already up-to-date!\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import nltk\n", 34 | "nltk.download('stopwords')\n", 35 | "nltk.download('wordnet')\n", 36 | "from nltk.corpus import stopwords\n", 37 | "from nltk.stem.porter import PorterStemmer \n", 38 | "from nltk.stem.snowball import SnowballStemmer\n", 39 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 40 | "import pandas as pd\n", 41 | "import re" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
[HTML rendering of df.head(3) omitted: the table markup was lost in extraction; the same Review/sentiment rows appear in the text/plain output below]\n
" 93 | ], 94 | "text/plain": [ 95 | " Review sentiment\n", 96 | "0 Virat Kohli did a great thing to open his rest... positive\n", 97 | "1 This place have some really heathy options to ... positive\n", 98 | "2 Aerocity is the most finest place in Delhi for... positive" 99 | ] 100 | }, 101 | "execution_count": 2, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "df = pd.read_csv(\"Dataset/zomato_reviews.csv\")\n", 108 | "df.head(3)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 3, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "corpus = pd.Series(df.Review.tolist()).astype(str)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "0 Virat Kohli did a great thing to open his rest...\n", 129 | "1 This place have some really heathy options to ...\n", 130 | "2 Aerocity is the most finest place in Delhi for...\n", 131 | "3 Yesterday evening there was small team lunch ,...\n", 132 | "4 I find aerocity to be the best place in delhi ...\n", 133 | " ... \n", 134 | "1591 || DESI LANE || So we were at alipore's most h...\n", 135 | "1592 \"Desi Lane\" is one of the most trending place ...\n", 136 | "1593 One of the cool and pocket pinch restaurant at...\n", 137 | "1594 \"DESI LANE\" one of the best places in town and...\n", 138 | "1595 Looking for good place for lunch but dont wann...\n", 139 | "Length: 1596, dtype: object" 140 | ] 141 | }, 142 | "execution_count": 4, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "corpus" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Text Cleaning (Removal of special characters/punctuations & case folding)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def text_clean(corpus, keep_list):\n", 165 | " '''\n", 166 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. 
removed)\n", 167 | " \n", 168 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 169 | " even after the cleaning process\n", 170 | " \n", 171 | " Output : Returns the cleaned text corpus\n", 172 | " \n", 173 | " '''\n", 174 | " cleaned_corpus = pd.Series()\n", 175 | " for row in corpus:\n", 176 | " qs = []\n", 177 | " for word in row.split():\n", 178 | " if word not in keep_list:\n", 179 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 180 | " p1 = p1.lower()\n", 181 | " qs.append(p1)\n", 182 | " else : qs.append(word)\n", 183 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 184 | " return cleaned_corpus" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Stopwords Removal" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 6, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "def stopwords_removal(corpus):\n", 201 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 202 | " stop = set(stopwords.words('english'))\n", 203 | " for word in wh_words:\n", 204 | " stop.remove(word)\n", 205 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 206 | " return corpus" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### Lemmatization" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 7, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "def lemmatize(corpus):\n", 223 | " lem = WordNetLemmatizer()\n", 224 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 225 | " return corpus" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "### Stemming" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "def stem(corpus, stem_type = None):\n", 242 | " if stem_type == 'snowball':\n", 243 | " stemmer = SnowballStemmer(language = 'english')\n", 244 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 245 | " else :\n", 246 | " stemmer = PorterStemmer()\n", 247 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 248 | " return corpus" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 9, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 258 | " '''\n", 259 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 260 | " \n", 261 | " Input : \n", 262 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 263 | " 'keep_list' - List of words to be retained during cleaning process\n", 264 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 265 | " be performed or not\n", 266 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 267 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 268 | " \n", 269 | " Note : Either stemming or lemmatization should be used. 
There's no benefit of using both of them together\n", 270 | " \n", 271 | " Output : Returns the processed text corpus\n", 272 | " \n", 273 | " '''\n", 274 | " \n", 275 | " if cleaning == True:\n", 276 | " corpus = text_clean(corpus, keep_list)\n", 277 | " \n", 278 | " if remove_stopwords == True:\n", 279 | " corpus = stopwords_removal(corpus)\n", 280 | " else :\n", 281 | " corpus = [[x for x in x.split()] for x in corpus]\n", 282 | " \n", 283 | " if lemmatization == True:\n", 284 | " corpus = lemmatize(corpus)\n", 285 | " \n", 286 | " \n", 287 | " if stemming == True:\n", 288 | " corpus = stem(corpus, stem_type)\n", 289 | " \n", 290 | " corpus = [' '.join(x) for x in corpus] \n", 291 | "\n", 292 | " return corpus" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "common_dot_words = ['U.S.A', 'Mr.', 'Mrs.', 'D.C.']" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 11, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# Preprocessing with Lemmatization here\n", 311 | "corpus_with_lemmatization = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 12, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# Preprocessing with Stemming here here\n", 321 | "corpus_with_stemming = preprocess(corpus, keep_list = common_dot_words, stemming = True, stem_type = \"snowball\", lemmatization = False, remove_stopwords = True)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "# Let's see the results on applying\n", 329 | "\n", 330 | "### 1. Lemmatization\n", 331 | "### 2. Stemming\n", 332 | "\n", 333 | "Note: Stopwords removal and text cleaning have been applied on both the occassions." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 13, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "Original string: Virat Kohli did a great thing to open his restaurant in an exquisite place of Delhi. Wide range of food with lots and lots of options on drinks. 
Courteous staff with a quick response on anything.\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "print(\"Original string: \", corpus[0])" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 14, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "String after lemmatiization: virat kohli great thing open restaurant exquisite place delhi wide range food lot lot options drink courteous staff quick response anything\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "print(\"String after lemmatiization: \", corpus_with_lemmatization[0])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 15, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "String after stemming: virat koh great thing open restaur exquisit place delhi wide rang food lot lot option drink courteous staff quick respons anyth\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "print(\"String after stemming: \", corpus_with_stemming[0])" 385 | ] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.6.0" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 2 409 | } 410 | -------------------------------------------------------------------------------- /Chapter03/Stemming, Lemmatization, Stopword Removal, Case-Folding, N-grams and HTML tags.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exploring Tokenization\n", 8 | "\n", 9 | "@author: Aman Kedia" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import nltk" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',\n", 28 | " 'siezing', 'itemization', 'traditional', 'reference', 'colonizer', 'plotted', 'having', 'generously']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Porter Stemmer" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have gener\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "from nltk.stem.porter import PorterStemmer \n", 53 | "stemmer = PorterStemmer()\n", 54 | "singles = [stemmer.stem(plural) for plural in plurals]\n", 55 | "print(' '.join(singles))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Snowball Stemmer" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | 
"('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from nltk.stem.snowball import SnowballStemmer\n", 80 | "print(SnowballStemmer.languages)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "stemmer2 = SnowballStemmer(language='english')\n", 98 | "singles = [stemmer2.stem(plural) for plural in plurals]\n", 99 | "print(' '.join(singles))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Wordnet Lemmatizer" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "[nltk_data] Downloading package wordnet to\n", 119 | "[nltk_data] /Users/amankedia/nltk_data...\n", 120 | "[nltk_data] Package wordnet is already up-to-date!\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "nltk.download('wordnet')\n", 126 | "from nltk.stem import WordNetLemmatizer " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 7, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "The tokens are: ['We', 'are', 'putting', 'in', 'efforts', 'to', 'enhance', 'our', 'understanding', 'of', 'Lemmatization']\n", 139 | "The lemmatized output is: We are putting in effort to enhance our understanding of Lemmatization\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "lemmatizer = WordNetLemmatizer()\n", 145 | "s = \"We are putting in efforts to enhance our understanding of Lemmatization\"\n", 146 | "token_list = s.split()\n", 147 | "print(\"The tokens are: \", token_list)\n", 148 | "lemmatized_output = ' '.join([lemmatizer.lemmatize(token) for token in token_list])\n", 149 | "print(\"The lemmatized output is: \", lemmatized_output)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## POS Tagging" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 8, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stderr", 166 | "output_type": "stream", 167 | "text": [ 168 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n", 169 | "[nltk_data] /Users/amankedia/nltk_data...\n", 170 | "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", 171 | "[nltk_data] date!\n" 172 | ] 173 | }, 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "[('We', 'PRP'),\n", 178 | " ('are', 'VBP'),\n", 179 | " ('putting', 'VBG'),\n", 180 | " ('in', 'IN'),\n", 181 | " ('efforts', 'NNS'),\n", 182 | " ('to', 'TO'),\n", 183 | " ('enhance', 'VB'),\n", 184 | " ('our', 'PRP$'),\n", 185 | " ('understanding', 'NN'),\n", 186 | " ('of', 'IN'),\n", 187 | " ('Lemmatization', 'NN')]" 188 | ] 189 | }, 190 | "execution_count": 8, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "nltk.download('averaged_perceptron_tagger')\n", 197 | "pos_tags = nltk.pos_tag(token_list)\n", 198 | "pos_tags" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 
203 | "metadata": {}, 204 | "source": [ 205 | "## POS tag Mapping" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 9, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "from nltk.corpus import wordnet\n", 215 | "\n", 216 | "##This is a common method which is widely used across the NLP community of practitioners and readers\n", 217 | "\n", 218 | "def get_part_of_speech_tags(token):\n", 219 | " \n", 220 | " \"\"\"Maps POS tags to first character lemmatize() accepts.\n", 221 | " We are focussing on Verbs, Nouns, Adjectives and Adverbs here.\"\"\"\n", 222 | "\n", 223 | " tag_dict = {\"J\": wordnet.ADJ,\n", 224 | " \"N\": wordnet.NOUN,\n", 225 | " \"V\": wordnet.VERB,\n", 226 | " \"R\": wordnet.ADV}\n", 227 | " \n", 228 | " tag = nltk.pos_tag([token])[0][1][0].upper()\n", 229 | " \n", 230 | " return tag_dict.get(tag, wordnet.NOUN)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## Wordnet Lemmatizer with POS Tag Information" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 10, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "We be put in effort to enhance our understand of Lemmatization\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "lemmatized_output_with_POS_information = [lemmatizer.lemmatize(token, get_part_of_speech_tags(token)) for token in token_list]\n", 255 | "print(' '.join(lemmatized_output_with_POS_information))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "## Lemmatization vs Stemming" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 11, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "we are put in effort to enhanc our understand of lemmat\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "stemmer2 = SnowballStemmer(language='english')\n", 280 | "stemmed_sentence = [stemmer2.stem(token) for token in token_list]\n", 281 | "print(' '.join(stemmed_sentence))" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "# spaCy Lemmatizer" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 12, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/plain": [ 299 | "'-PRON- be put in effort to enhance -PRON- understanding of lemmatization'" 300 | ] 301 | }, 302 | "execution_count": 12, 303 | "metadata": {}, 304 | "output_type": "execute_result" 305 | } 306 | ], 307 | "source": [ 308 | "import spacy\n", 309 | "nlp = spacy.load('en')\n", 310 | "doc = nlp(\"We are putting in efforts to enhance our understanding of Lemmatization\")\n", 311 | "\" \".join([token.lemma_ for token in doc])" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "# Stopwords" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stderr", 328 | "output_type": "stream", 329 | "text": [ 330 | "[nltk_data] Downloading package stopwords to\n", 331 | "[nltk_data] /Users/amankedia/nltk_data...\n", 332 | "[nltk_data] Package stopwords is already up-to-date!\n" 333 | ] 334 | }, 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "\"it's, yours, an, doing, any, mightn't, you, having, wasn't, themselves, just, over, below, 
needn't, a, this, shan't, them, isn't, was, wouldn't, as, only, his, or, shan, wouldn, don, where, own, were, he, out, do, it, am, won, isn, there, hers, to, ll, most, for, weren, have, by, while, the, re, that, down, haven, has, is, here, itself, all, didn, herself, shouldn, him, ve, who, doesn, m, hadn't, after, further, weren't, at, hadn, should've, too, because, can, now, same, more, she's, wasn, these, yourself, himself, being, very, until, myself, few, so, which, ourselves, they, t, you'd, did, o, aren, but, that'll, such, whom, of, s, you'll, those, doesn't, my, what, aren't, during, hasn, through, will, couldn, i, mustn, needn, mustn't, d, had, me, under, won't, haven't, its, with, when, their, between, if, once, against, before, on, not, you're, each, yourselves, in, and, are, shouldn't, some, nor, her, does, she, off, how, both, our, then, why, again, we, no, y, be, other, ma, from, up, theirs, couldn't, should, into, didn't, ours, about, ain, you've, don't, above, been, than, your, hasn't, mightn\"" 339 | ] 340 | }, 341 | "execution_count": 13, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "nltk.download('stopwords')\n", 348 | "from nltk.corpus import stopwords\n", 349 | "stop = set(stopwords.words('english'))\n", 350 | "\", \".join(stop)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 14, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "'how putting efforts enhance understanding Lemmatization'" 362 | ] 363 | }, 364 | "execution_count": 14, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 371 | "\n", 372 | "stop = set(stopwords.words('english'))\n", 373 | "\n", 374 | "sentence = \"how are we putting in efforts to enhance our understanding of Lemmatization\"\n", 375 | "\n", 376 | "for word in wh_words:\n", 377 | " stop.remove(word)\n", 378 | "\n", 379 | "sentence_after_stopword_removal = [token for token in sentence.split() if token not in stop]\n", 380 | "\" \".join(sentence_after_stopword_removal)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "# Case Folding" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 15, 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "data": { 397 | "text/plain": [ 398 | "'we are putting in efforts to enhance our understanding of lemmatization'" 399 | ] 400 | }, 401 | "execution_count": 15, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "s = \"We are putting in efforts to enhance our understanding of Lemmatization\"\n", 408 | "s = s.lower()\n", 409 | "s" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "# N-grams" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 16, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "['Natural Language',\n", 428 | " 'Language Processing',\n", 429 | " 'Processing is',\n", 430 | " 'is the',\n", 431 | " 'the way',\n", 432 | " 'way to',\n", 433 | " 'to go']" 434 | ] 435 | }, 436 | "execution_count": 16, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "from nltk.util import ngrams\n", 443 | "s = \"Natural Language Processing is the way to go\"\n", 444 
| "tokens = s.split()\n", 445 | "bigrams = list(ngrams(tokens, 2))\n", 446 | "[\" \".join(token) for token in bigrams]" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 17, 452 | "metadata": {}, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "['Natural Language Processing',\n", 458 | " 'Language Processing is',\n", 459 | " 'Processing is the',\n", 460 | " 'is the way',\n", 461 | " 'the way to',\n", 462 | " 'way to go']" 463 | ] 464 | }, 465 | "execution_count": 17, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "s = \"Natural Language Processing is the way to go\"\n", 472 | "tokens = s.split()\n", 473 | "trigrams = list(ngrams(tokens, 3))\n", 474 | "[\" \".join(token) for token in trigrams]" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "# Building a basic vocabulary" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 18, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "text/plain": [ 492 | "['Language', 'Natural', 'Processing', 'go', 'is', 'the', 'to', 'way']" 493 | ] 494 | }, 495 | "execution_count": 18, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "s = \"Natural Language Processing is the way to go\"\n", 502 | "tokens = set(s.split())\n", 503 | "vocabulary = sorted(tokens)\n", 504 | "vocabulary" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "# Removing HTML Tags" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 19, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stdout", 521 | "output_type": "stream", 522 | "text": [ 523 | "My First HeadingMy first paragraph.\n" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "html = \"

<h1>My First Heading</h1><p>My first paragraph.</p>
\"\n", 529 | "from bs4 import BeautifulSoup\n", 530 | "\n", 531 | "soup = BeautifulSoup(html)\n", 532 | "text = soup.get_text()\n", 533 | "print(text)" 534 | ] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "Python 3", 540 | "language": "python", 541 | "name": "python3" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | "nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.6.0" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 2 558 | } 559 | -------------------------------------------------------------------------------- /Chapter03/Understanding Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exploring Tokenization\n", 8 | "\n", 9 | "@author: Aman Kedia\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "['The', 'capital', 'of', 'China', 'is', 'Beijing']" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "sentence = \"The capital of China is Beijing\"\n", 30 | "sentence.split()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "[\"China's\", 'capital', 'is', 'Beijing']" 42 | ] 43 | }, 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "sentence = \"China's capital is Beijing\"\n", 51 | "sentence.split()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "['Beijing', 'is', 'where', \"we'll\", 'go']" 63 | ] 64 | }, 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "sentence = \"Beijing is where we'll go\"\n", 72 | "sentence.split()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[\"I'm\", 'going', 'to', 'travel', 'to', 'Beijing']" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "sentence = \"I'm going to travel to Beijing\"\n", 93 | "sentence.split()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "['Most', 'of', 'the', 'times', 'umm', 'I', 'travel']" 105 | ] 106 | }, 107 | "execution_count": 5, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "sentence = \"Most of the times umm I travel\"\n", 114 | "sentence.split()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "[\"Let's\", 'travel', 'to', 'Hong', 'Kong', 'from', 'Beijing']" 126 | ] 127 | }, 128 | "execution_count": 6, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | 
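A minimal illustrative sketch (not part of the original notebooks): the Understanding Tokenization notebook above shows where plain str.split() struggles, e.g. on contractions such as "Let's" and on trailing punctuation. Assuming only that NLTK is installed and its 'punkt' models have been downloaded, the same sentence can be compared across str.split(), word_tokenize and WordPunctTokenizer; the token splits described in the comments are indicative rather than authoritative.

# Comparison sketch: whitespace splitting vs. NLTK tokenizers.
# Assumes nltk is installed; run nltk.download('punkt') once before using word_tokenize.
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer

sentence = "Let's travel to Hong Kong from Beijing"

# Plain split keeps "Let's" glued together, as seen in the notebook above.
print(sentence.split())

# word_tokenize (Treebank-style) typically separates the contraction into "Let" and "'s".
print(word_tokenize(sentence))

# WordPunctTokenizer splits on every punctuation run, typically giving "Let", "'", "s".
print(WordPunctTokenizer().tokenize(sentence))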
"source": [ 134 | "sentence = \"Let's travel to Hong Kong from Beijing\"\n", 135 | "sentence.split()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "['A', 'friend', 'is', 'pursuing', 'his', 'M.S', 'from', 'Beijing']" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "sentence = \"A friend is pursuing his M.S from Beijing\"\n", 156 | "sentence.split()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 8, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "['Beijing', 'is', 'a', 'cool', 'place!!!', ':-P', '<3', '#Awesome']" 168 | ] 169 | }, 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "sentence = \"Beijing is a cool place!!! :-P <3 #Awesome\"\n", 177 | "sentence.split()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "# Regexp Tokenizer" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "['A',\n", 196 | " 'Rolex',\n", 197 | " 'watch',\n", 198 | " 'costs',\n", 199 | " 'in',\n", 200 | " 'the',\n", 201 | " 'range',\n", 202 | " 'of',\n", 203 | " '$3000.0',\n", 204 | " '-',\n", 205 | " '$8000.0',\n", 206 | " 'in',\n", 207 | " 'USA',\n", 208 | " '.']" 209 | ] 210 | }, 211 | "execution_count": 9, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "from nltk.tokenize import RegexpTokenizer\n", 218 | "s = \"A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.\"\n", 219 | "tokenizer = RegexpTokenizer('\\w+|\\$[\\d\\.]+|\\S+')\n", 220 | "tokenizer.tokenize(s)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "# Blankline Tokenizer" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 10, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "['A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.',\n", 239 | " 'I want a book as well']" 240 | ] 241 | }, 242 | "execution_count": 10, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "from nltk.tokenize import BlanklineTokenizer\n", 249 | "s = \"A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.\\n\\n I want a book as well\"\n", 250 | "tokenizer = BlanklineTokenizer()\n", 251 | "tokenizer.tokenize(s)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "# WordPunct Tokenizer" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 11, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "['A',\n", 270 | " 'Rolex',\n", 271 | " 'watch',\n", 272 | " 'costs',\n", 273 | " 'in',\n", 274 | " 'the',\n", 275 | " 'range',\n", 276 | " 'of',\n", 277 | " '$',\n", 278 | " '3000',\n", 279 | " '.',\n", 280 | " '0',\n", 281 | " '-',\n", 282 | " '$',\n", 283 | " '8000',\n", 284 | " '.',\n", 285 | " '0',\n", 286 | " 'in',\n", 287 | " 'USA',\n", 288 | " '.',\n", 289 | " 'I',\n", 290 | " 'want',\n", 291 | " 'a',\n", 292 | " 'book',\n", 293 | " 'as',\n", 294 | " 'well']" 295 | ] 296 | 
}, 297 | "execution_count": 11, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "from nltk.tokenize import WordPunctTokenizer\n", 304 | "s = \"A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.\\n I want a book as well\"\n", 305 | "tokenizer = WordPunctTokenizer()\n", 306 | "tokenizer.tokenize(s)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "# TreebankWord Tokenizer" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 12, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "['I',\n", 325 | " \"'m\",\n", 326 | " 'going',\n", 327 | " 'to',\n", 328 | " 'buy',\n", 329 | " 'a',\n", 330 | " 'Rolex',\n", 331 | " 'watch',\n", 332 | " 'which',\n", 333 | " 'does',\n", 334 | " \"n't\",\n", 335 | " 'cost',\n", 336 | " 'more',\n", 337 | " 'than',\n", 338 | " '$',\n", 339 | " '3000.0']" 340 | ] 341 | }, 342 | "execution_count": 12, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "from nltk.tokenize import TreebankWordTokenizer\n", 349 | "s = \"I'm going to buy a Rolex watch which doesn't cost more than $3000.0\"\n", 350 | "tokenizer = TreebankWordTokenizer()\n", 351 | "tokenizer.tokenize(s)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "# Tweet Tokenizer" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 13, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "['@amankedia',\n", 370 | " \"I'm\",\n", 371 | " 'going',\n", 372 | " 'to',\n", 373 | " 'buy',\n", 374 | " 'a',\n", 375 | " 'Rolexxxxxxxx',\n", 376 | " 'watch',\n", 377 | " '!',\n", 378 | " '!',\n", 379 | " '!',\n", 380 | " ':-D',\n", 381 | " '#happiness',\n", 382 | " '#rolex',\n", 383 | " '<3']" 384 | ] 385 | }, 386 | "execution_count": 13, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "from nltk.tokenize import TweetTokenizer\n", 393 | "s = \"@amankedia I'm going to buy a Rolexxxxxxxx watch!!! :-D #happiness #rolex <3\"\n", 394 | "tokenizer = TweetTokenizer()\n", 395 | "tokenizer.tokenize(s)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 14, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "[\"I'm\",\n", 407 | " 'going',\n", 408 | " 'to',\n", 409 | " 'buy',\n", 410 | " 'a',\n", 411 | " 'Rolexxx',\n", 412 | " 'watch',\n", 413 | " '!',\n", 414 | " '!',\n", 415 | " '!',\n", 416 | " ':-D',\n", 417 | " '#happiness',\n", 418 | " '#rolex',\n", 419 | " '<3']" 420 | ] 421 | }, 422 | "execution_count": 14, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "from nltk.tokenize import TweetTokenizer\n", 429 | "s = \"@amankedia I'm going to buy a Rolexxxxxxxx watch!!! 
:-D #happiness #rolex <3\"\n", 430 | "tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)\n", 431 | "tokenizer.tokenize(s)" 432 | ] 433 | } 434 | ], 435 | "metadata": { 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.6.0" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 2 456 | } 457 | -------------------------------------------------------------------------------- /Chapter04/Bag of Words in Action.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Bag of Words in Action\n", 8 | "\n", 9 | "@author: Aman Kedia" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] /Users/amankedia/nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading package wordnet to\n", 25 | "[nltk_data] /Users/amankedia/nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import nltk\n", 32 | "nltk.download('stopwords')\n", 33 | "nltk.download('wordnet')\n", 34 | "from nltk.corpus import stopwords\n", 35 | "from nltk.stem.porter import PorterStemmer \n", 36 | "from nltk.stem.snowball import SnowballStemmer\n", 37 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 38 | "import pandas as pd\n", 39 | "import re\n", 40 | "import numpy as np" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Take in a list of sentences" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "sentences = [\"We are reading about Natural Language Processing Here\",\n", 57 | " \"Natural Language Processing making computers comprehend language data\",\n", 58 | " \"The field of Natural Language Processing is evolving everyday\"]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Create a Pandas Series of the object" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "0 We are reading about Natural Language Processi...\n", 77 | "1 Natural Language Processing making computers c...\n", 78 | "2 The field of Natural Language Processing is ev...\n", 79 | "dtype: object" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "corpus = pd.Series(sentences)\n", 89 | "corpus" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Data preprocessing" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "def text_clean(corpus, keep_list):\n", 106 | " '''\n", 107 | " Purpose : Function to keep only alphabets, digits and 
certain words (punctuations, qmarks, tabs etc. removed)\n", 108 | " \n", 109 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 110 | " even after the cleaning process\n", 111 | " \n", 112 | " Output : Returns the cleaned text corpus\n", 113 | " \n", 114 | " '''\n", 115 | " cleaned_corpus = pd.Series()\n", 116 | " for row in corpus:\n", 117 | " qs = []\n", 118 | " for word in row.split():\n", 119 | " if word not in keep_list:\n", 120 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 121 | " p1 = p1.lower()\n", 122 | " qs.append(p1)\n", 123 | " else : qs.append(word)\n", 124 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 125 | " return cleaned_corpus" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def stopwords_removal(corpus):\n", 135 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 136 | " stop = set(stopwords.words('english'))\n", 137 | " for word in wh_words:\n", 138 | " stop.remove(word)\n", 139 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 140 | " return corpus" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "def lemmatize(corpus):\n", 150 | " lem = WordNetLemmatizer()\n", 151 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 152 | " return corpus" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def stem(corpus, stem_type = None):\n", 162 | " if stem_type == 'snowball':\n", 163 | " stemmer = SnowballStemmer(language = 'english')\n", 164 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 165 | " else :\n", 166 | " stemmer = PorterStemmer()\n", 167 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 168 | " return corpus" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 178 | " '''\n", 179 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 180 | " \n", 181 | " Input : \n", 182 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 183 | " 'keep_list' - List of words to be retained during cleaning process\n", 184 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 185 | " be performed or not\n", 186 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 187 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 188 | " \n", 189 | " Note : Either stemming or lemmatization should be used. 
There's no benefit of using both of them together\n", 190 | " \n", 191 | " Output : Returns the processed text corpus\n", 192 | " \n", 193 | " '''\n", 194 | " \n", 195 | " if cleaning == True:\n", 196 | " corpus = text_clean(corpus, keep_list)\n", 197 | " \n", 198 | " if remove_stopwords == True:\n", 199 | " corpus = stopwords_removal(corpus)\n", 200 | " else :\n", 201 | " corpus = [[x for x in x.split()] for x in corpus]\n", 202 | " \n", 203 | " if lemmatization == True:\n", 204 | " corpus = lemmatize(corpus)\n", 205 | " \n", 206 | " \n", 207 | " if stemming == True:\n", 208 | " corpus = stem(corpus, stem_type)\n", 209 | " \n", 210 | " corpus = [' '.join(x) for x in corpus] \n", 211 | "\n", 212 | " return corpus" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 9, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "common_dot_words = ['U.S.', 'Mr.', 'Mrs.', 'D.C.']" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "['read natural language process',\n", 233 | " 'natural language process make computers comprehend language data',\n", 234 | " 'field natural language process evolve everyday']" 235 | ] 236 | }, 237 | "execution_count": 10, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "# Preprocessing with Lemmatization here\n", 244 | "preprocessed_corpus = preprocess(corpus, keep_list = common_dot_words, stemming = False, stem_type = None,\n", 245 | " lemmatization = True, remove_stopwords = True)\n", 246 | "preprocessed_corpus" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "## Building the vocabulary" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 11, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "['computers', 'make', 'read', 'everyday', 'data', 'natural', 'field', 'evolve', 'language', 'process', 'comprehend']\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "set_of_words = set()\n", 271 | "for sentence in preprocessed_corpus:\n", 272 | " for word in sentence.split():\n", 273 | " set_of_words.add(word)\n", 274 | "vocab = list(set_of_words)\n", 275 | "print(vocab)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "## Fetching the position of each word in the vocabulary" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 12, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "{'computers': 0, 'make': 1, 'read': 2, 'everyday': 3, 'data': 4, 'natural': 5, 'field': 6, 'evolve': 7, 'language': 8, 'process': 9, 'comprehend': 10}\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "position = {}\n", 300 | "for i, token in enumerate(vocab):\n", 301 | " position[token] = i\n", 302 | "print(position)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## Creating a matrix to hold the Bag of Words representation" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 13, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "bow_matrix = np.zeros((len(preprocessed_corpus), len(vocab)))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 14, 324 | 
"metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "for i, preprocessed_sentence in enumerate(preprocessed_corpus):\n", 328 | " for token in preprocessed_sentence.split(): \n", 329 | " bow_matrix[i][position[token]] = bow_matrix[i][position[token]] + 1" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## Let's look at our Bag of Words representation" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 15, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "array([[0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0.],\n", 348 | " [1., 1., 0., 0., 1., 1., 0., 0., 2., 1., 1.],\n", 349 | " [0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0.]])" 350 | ] 351 | }, 352 | "execution_count": 15, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "bow_matrix" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## Inference\n", 366 | "\n", 367 | "Taking example of column 2 in the bow_matrix, the values are 1, 2 and 1 respectively.\n", 368 | "\n", 369 | "Column 2 caters to index 2 corresponding to the word *language*.\n", 370 | "\n", 371 | "*language* occurs **once, twice and again once** in the the sentences 1, 2 and 3 respectively.\n", 372 | "\n", 373 | "Hope that provides you insights into how the Bag of Words model works." 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.6.0" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 2 398 | } 399 | -------------------------------------------------------------------------------- /Chapter04/Chatbot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#!pip install scikit-learn\n", 10 | "import numpy as np\n", 11 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 12 | "from sklearn.metrics.pairwise import cosine_similarity\n", 13 | "from sklearn.feature_extraction.text import CountVectorizer\n", 14 | "\n", 15 | "\n", 16 | "#loading questions and answers in separate lists\n", 17 | "import ast \n", 18 | "questions = []\n", 19 | "answers = [] \n", 20 | "with open('qa_Electronics.json','r') as f:\n", 21 | " for line in f:\n", 22 | " data = ast.literal_eval(line)\n", 23 | " questions.append(data['question'].lower())\n", 24 | " answers.append(data['answer'].lower())" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# tokenize the text and convert data in matrix format\n", 34 | "from sklearn.feature_extraction.text import CountVectorizer\n", 35 | "vectorizer = CountVectorizer(stop_words='english')\n", 36 | "X_vec = vectorizer.fit_transform(questions)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# Transform data by applying term frequency inverse document frequency (TF-IDF) \n", 46 
| "tfidf = TfidfTransformer() #by default applies \"l2\" normalization\n", 47 | "X_tfidf = tfidf.fit_transform(X_vec)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def conversation(im):\n", 57 | " global tfidf, answers, X_tfidf\n", 58 | " Y_vec = vectorizer.transform(im)\n", 59 | " Y_tfidf = tfidf.fit_transform(Y_vec)\n", 60 | " cos_sim = np.rad2deg(np.arccos(max(cosine_similarity(Y_tfidf, X_tfidf)[0])))\n", 61 | " if cos_sim > 60 :\n", 62 | " return \"sorry, I did not quite understand that\"\n", 63 | " else:\n", 64 | " return answers[np.argmax(cosine_similarity(Y_tfidf, X_tfidf)[0])]\n", 65 | "\n", 66 | "def main():\n", 67 | " usr = input(\"Please enter your username: \")\n", 68 | " print(\"support: Hi, welcome to Q&A support. How can I help you?\")\n", 69 | " while True:\n", 70 | " im = input(\"{}: \".format(usr))\n", 71 | " if im.lower() == 'bye':\n", 72 | " print(\"Q&A support: bye!\")\n", 73 | " break\n", 74 | " else:\n", 75 | " print(\"Q&A support: \"+conversation([im]))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Please enter your username: techgeek\n", 88 | "support: Hi, welcome to Q&A support. How can I help you?\n", 89 | "techgeek: My laptop would not start\n", 90 | "Q&A support: hi, you may get you laptop in 3 to 5 business day depending on you location. thanks for you interest. tech mark.\n", 91 | "techgeek: Wow! does that mean it is covered under replacement warranty?\n", 92 | "Q&A support: sorry i forgot if it has, but it works well in my laptop, luckily, i can't find any dead point in this screen.\n", 93 | "techgeek: That does not make any sense\n", 94 | "Q&A support: it is a power supply and does nothing to make it a tv, just sends power to it.\n", 95 | "techgeek: You are a moron\n", 96 | "Q&A support: sorry, I did not quite understand that\n", 97 | "techgeek: bye\n", 98 | "Q&A support: bye!\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "main()" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.7.7" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 4 128 | } 129 | -------------------------------------------------------------------------------- /Chapter04/Cosine Similarity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Measuring Cosine Similarity between Document Vectors" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "[nltk_data] Downloading package stopwords to\n", 20 | "[nltk_data] /Users/amankedia/nltk_data...\n", 21 | "[nltk_data] Package stopwords is already up-to-date!\n", 22 | "[nltk_data] Downloading package wordnet to\n", 23 | "[nltk_data] /Users/amankedia/nltk_data...\n", 24 | "[nltk_data] Package wordnet is already up-to-date!\n" 25 
| ] 26 | } 27 | ], 28 | "source": [ 29 | "import nltk\n", 30 | "nltk.download('stopwords')\n", 31 | "nltk.download('wordnet')\n", 32 | "from nltk.corpus import stopwords\n", 33 | "from nltk.stem.porter import PorterStemmer \n", 34 | "from nltk.stem.snowball import SnowballStemmer\n", 35 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 36 | "import pandas as pd\n", 37 | "import re\n", 38 | "import numpy as np\n", 39 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Building a corpus of sentences" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "sentences = [\"We are reading about Natural Language Processing Here\",\n", 56 | " \"Natural Language Processing making computers comprehend language data\",\n", 57 | " \"The field of Natural Language Processing is evolving everyday\"]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0 We are reading about Natural Language Processi...\n", 69 | "1 Natural Language Processing making computers c...\n", 70 | "2 The field of Natural Language Processing is ev...\n", 71 | "dtype: object" 72 | ] 73 | }, 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "corpus = pd.Series(sentences)\n", 81 | "corpus" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Data preprocessing pipeline" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "def text_clean(corpus, keep_list):\n", 98 | " '''\n", 99 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. 
removed)\n", 100 | " \n", 101 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 102 | " even after the cleaning process\n", 103 | " \n", 104 | " Output : Returns the cleaned text corpus\n", 105 | " \n", 106 | " '''\n", 107 | " cleaned_corpus = pd.Series()\n", 108 | " for row in corpus:\n", 109 | " qs = []\n", 110 | " for word in row.split():\n", 111 | " if word not in keep_list:\n", 112 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 113 | " p1 = p1.lower()\n", 114 | " qs.append(p1)\n", 115 | " else : qs.append(word)\n", 116 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 117 | " return cleaned_corpus" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "def lemmatize(corpus):\n", 127 | " lem = WordNetLemmatizer()\n", 128 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 129 | " return corpus" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "def stem(corpus, stem_type = None):\n", 139 | " if stem_type == 'snowball':\n", 140 | " stemmer = SnowballStemmer(language = 'english')\n", 141 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 142 | " else :\n", 143 | " stemmer = PorterStemmer()\n", 144 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 145 | " return corpus" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def stopwords_removal(corpus):\n", 155 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 156 | " stop = set(stopwords.words('english'))\n", 157 | " for word in wh_words:\n", 158 | " stop.remove(word)\n", 159 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 160 | " return corpus" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 170 | " '''\n", 171 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 172 | " \n", 173 | " Input : \n", 174 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 175 | " 'keep_list' - List of words to be retained during cleaning process\n", 176 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 177 | " be performed or not\n", 178 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 179 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 180 | " \n", 181 | " Note : Either stemming or lemmatization should be used. 
There's no benefit of using both of them together\n", 182 | " \n", 183 | " Output : Returns the processed text corpus\n", 184 | " \n", 185 | " '''\n", 186 | " \n", 187 | " if cleaning == True:\n", 188 | " corpus = text_clean(corpus, keep_list)\n", 189 | " \n", 190 | " if remove_stopwords == True:\n", 191 | " corpus = stopwords_removal(corpus)\n", 192 | " else :\n", 193 | " corpus = [[x for x in x.split()] for x in corpus]\n", 194 | " \n", 195 | " if lemmatization == True:\n", 196 | " corpus = lemmatize(corpus)\n", 197 | " \n", 198 | " \n", 199 | " if stemming == True:\n", 200 | " corpus = stem(corpus, stem_type)\n", 201 | " \n", 202 | " corpus = [' '.join(x) for x in corpus] \n", 203 | "\n", 204 | " return corpus" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "['read natural language process',\n", 216 | " 'natural language process make computers comprehend language data',\n", 217 | " 'field natural language process evolve everyday']" 218 | ] 219 | }, 220 | "execution_count": 9, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "# Preprocessing with Lemmatization here\n", 227 | "preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,\n", 228 | " lemmatization = True, remove_stopwords = True)\n", 229 | "preprocessed_corpus" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Cosine Similarity Calculation" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "def cosine_similarity(vector1, vector2):\n", 246 | " vector1 = np.array(vector1)\n", 247 | " vector2 = np.array(vector2)\n", 248 | " return np.dot(vector1, vector2) / (np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2)))" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## CountVectorizer" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 11, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "vectorizer = CountVectorizer()\n", 265 | "bow_matrix = vectorizer.fit_transform(preprocessed_corpus)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 12, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']\n", 278 | "[[0 0 0 0 0 0 1 0 1 1 1]\n", 279 | " [1 1 1 0 0 0 2 1 1 1 0]\n", 280 | " [0 0 0 1 1 1 1 0 1 1 0]]\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "print(vectorizer.get_feature_names())\n", 286 | "print(bow_matrix.toarray())" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## Cosine similarity between the document vectors built using CountVectorizer" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 13, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "The cosine similarity between the documents 0 and 1 is: 0.6324555320336759\n", 306 | "The cosine similarity between the documents 0 and 2 is: 0.6123724356957946\n", 307 | "The cosine similarity between the documents 1 and 2 is: 0.5163977794943223\n" 308 
| ] 309 | } 310 | ], 311 | "source": [ 312 | "for i in range(bow_matrix.shape[0]):\n", 313 | " for j in range(i + 1, bow_matrix.shape[0]):\n", 314 | " print(\"The cosine similarity between the documents \", i, \"and\", j, \"is: \",\n", 315 | " cosine_similarity(bow_matrix.toarray()[i], bow_matrix.toarray()[j]))" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "## TfidfVectorizer" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 14, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "vectorizer = TfidfVectorizer()\n", 332 | "tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 15, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']\n", 345 | "[[0. 0. 0. 0. 0. 0.\n", 346 | " 0.41285857 0. 0.41285857 0.41285857 0.69903033]\n", 347 | " [0.40512186 0.40512186 0.40512186 0. 0. 0.\n", 348 | " 0.478543 0.40512186 0.2392715 0.2392715 0. ]\n", 349 | " [0. 0. 0. 0.49711994 0.49711994 0.49711994\n", 350 | " 0.29360705 0. 0.29360705 0.29360705 0. ]]\n", 351 | "\n", 352 | "The shape of the TF-IDF matrix is: (3, 11)\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "print(vectorizer.get_feature_names())\n", 358 | "print(tf_idf_matrix.toarray())\n", 359 | "print(\"\\nThe shape of the TF-IDF matrix is: \", tf_idf_matrix.shape)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## Cosine similarity between the document vectors built using TfidfVectorizer" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 16, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "The cosine similarity between the documents 0 and 1 is: 0.39514115766749125\n", 379 | "The cosine similarity between the documents 0 and 2 is: 0.36365455673761865\n", 380 | "The cosine similarity between the documents 1 and 2 is: 0.2810071916500233\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "for i in range(tf_idf_matrix.shape[0]):\n", 386 | " for j in range(i + 1, tf_idf_matrix.shape[0]):\n", 387 | " print(\"The cosine similarity between the documents \", i, \"and\", j, \"is: \",\n", 388 | " cosine_similarity(tf_idf_matrix.toarray()[i], tf_idf_matrix.toarray()[j]))" 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.6.0" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /Chapter04/CountVectorizer for Bag of Words model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## CountVectorizer for Bag of Words Model\n", 8 | "\n", 9 | "**Documentation**: 
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] /Users/amankedia/nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading package wordnet to\n", 25 | "[nltk_data] /Users/amankedia/nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import nltk\n", 32 | "nltk.download('stopwords')\n", 33 | "nltk.download('wordnet')\n", 34 | "from nltk.corpus import stopwords\n", 35 | "from nltk.stem.porter import PorterStemmer \n", 36 | "from nltk.stem.snowball import SnowballStemmer\n", 37 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 38 | "import pandas as pd\n", 39 | "import re\n", 40 | "import numpy as np\n", 41 | "from sklearn.feature_extraction.text import CountVectorizer" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Building a corpus of sentences" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "sentences = [\"We are reading about Natural Language Processing Here\",\n", 58 | " \"Natural Language Processing making computers comprehend language data\",\n", 59 | " \"The field of Natural Language Processing is evolving everyday\"]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "0 We are reading about Natural Language Processi...\n", 71 | "1 Natural Language Processing making computers c...\n", 72 | "2 The field of Natural Language Processing is ev...\n", 73 | "dtype: object" 74 | ] 75 | }, 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "corpus = pd.Series(sentences)\n", 83 | "corpus" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Data preprocessing pipeline" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "def text_clean(corpus, keep_list):\n", 100 | " '''\n", 101 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. 
removed)\n", 102 | " \n", 103 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 104 | " even after the cleaning process\n", 105 | " \n", 106 | " Output : Returns the cleaned text corpus\n", 107 | " \n", 108 | " '''\n", 109 | " cleaned_corpus = pd.Series()\n", 110 | " for row in corpus:\n", 111 | " qs = []\n", 112 | " for word in row.split():\n", 113 | " if word not in keep_list:\n", 114 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 115 | " p1 = p1.lower()\n", 116 | " qs.append(p1)\n", 117 | " else : qs.append(word)\n", 118 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 119 | " return cleaned_corpus" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "def lemmatize(corpus):\n", 129 | " lem = WordNetLemmatizer()\n", 130 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 131 | " return corpus" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "def stem(corpus, stem_type = None):\n", 141 | " if stem_type == 'snowball':\n", 142 | " stemmer = SnowballStemmer(language = 'english')\n", 143 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 144 | " else :\n", 145 | " stemmer = PorterStemmer()\n", 146 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 147 | " return corpus" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def stopwords_removal(corpus):\n", 157 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 158 | " stop = set(stopwords.words('english'))\n", 159 | " for word in wh_words:\n", 160 | " stop.remove(word)\n", 161 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 162 | " return corpus" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 8, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 172 | " '''\n", 173 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 174 | " \n", 175 | " Input : \n", 176 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 177 | " 'keep_list' - List of words to be retained during cleaning process\n", 178 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 179 | " be performed or not\n", 180 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 181 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 182 | " \n", 183 | " Note : Either stemming or lemmatization should be used. 
There's no benefit of using both of them together\n", 184 | " \n", 185 | " Output : Returns the processed text corpus\n", 186 | " \n", 187 | " '''\n", 188 | " \n", 189 | " if cleaning == True:\n", 190 | " corpus = text_clean(corpus, keep_list)\n", 191 | " \n", 192 | " if remove_stopwords == True:\n", 193 | " corpus = stopwords_removal(corpus)\n", 194 | " else :\n", 195 | " corpus = [[x for x in x.split()] for x in corpus]\n", 196 | " \n", 197 | " if lemmatization == True:\n", 198 | " corpus = lemmatize(corpus)\n", 199 | " \n", 200 | " \n", 201 | " if stemming == True:\n", 202 | " corpus = stem(corpus, stem_type)\n", 203 | " \n", 204 | " corpus = [' '.join(x) for x in corpus] \n", 205 | "\n", 206 | " return corpus" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 9, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "['read natural language process',\n", 218 | " 'natural language process make computers comprehend language data',\n", 219 | " 'field natural language process evolve everyday']" 220 | ] 221 | }, 222 | "execution_count": 9, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# Preprocessing with Lemmatization here\n", 229 | "preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,\n", 230 | " lemmatization = True, remove_stopwords = True)\n", 231 | "preprocessed_corpus" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## CountVectorizer" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 10, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "vectorizer = CountVectorizer()\n", 248 | "bow_matrix = vectorizer.fit_transform(preprocessed_corpus)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Let's what features were obtained and the corresponding bag of words matrix" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 11, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']\n", 268 | "[[0 0 0 0 0 0 1 0 1 1 1]\n", 269 | " [1 1 1 0 0 0 2 1 1 1 0]\n", 270 | " [0 0 0 1 1 1 1 0 1 1 0]]\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "print(vectorizer.get_feature_names())\n", 276 | "print(bow_matrix.toarray())" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 12, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "(3, 11)\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "print(bow_matrix.toarray().shape)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### The matrix is the same as what was obtained after all the hard work in the previous exercise.\n", 301 | "\n", 302 | "Now you know, what to use when a basic Bag of Words Model is needed." 
303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## Let's see how can bigrams and trigrams can be included here" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 13, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "vectorizer_ngram_range = CountVectorizer(analyzer='word', ngram_range=(1,3))\n", 319 | "bow_matrix_ngram = vectorizer_ngram_range.fit_transform(preprocessed_corpus)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 14, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "['comprehend', 'comprehend language', 'comprehend language data', 'computers', 'computers comprehend', 'computers comprehend language', 'data', 'everyday', 'evolve', 'evolve everyday', 'field', 'field natural', 'field natural language', 'language', 'language data', 'language process', 'language process evolve', 'language process make', 'make', 'make computers', 'make computers comprehend', 'natural', 'natural language', 'natural language process', 'process', 'process evolve', 'process evolve everyday', 'process make', 'process make computers', 'read', 'read natural', 'read natural language']\n", 332 | "[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1]\n", 333 | " [1 1 1 1 1 1 1 0 0 0 0 0 0 2 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0]\n", 334 | " [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]]\n" 335 | ] 336 | } 337 | ], 338 | "source": [ 339 | "print(vectorizer_ngram_range.get_feature_names())\n", 340 | "print(bow_matrix_ngram.toarray())" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### Inference\n", 348 | "As can be seen, the 9th phrase from the end *natural language process* occurs once in every sentence.\n", 349 | "\n", 350 | "The column corresponding to it has the entries **1, 1 and 1**." 
351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Understanding Max Features " 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 15, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3), max_features = 6)\n", 367 | "bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 16, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']\n", 380 | "[[1 1 1 1 1 1]\n", 381 | " [2 1 1 1 1 1]\n", 382 | " [1 1 1 1 1 1]]\n" 383 | ] 384 | } 385 | ], 386 | "source": [ 387 | "print(vectorizer_max_features.get_feature_names())\n", 388 | "print(bow_matrix_max_features.toarray())" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "### Inference\n", 396 | "\n", 397 | "The Vocabulary and Bag of Words Model got limited to 6 features since *max_features = 6* was provided as input to the CountVectorizer" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "## Thresholding using Max_df and Min_df" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 17, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "vectorizer_max_features = CountVectorizer(analyzer='word', ngram_range=(1,3), max_df = 3, min_df = 2)\n", 414 | "bow_matrix_max_features = vectorizer_max_features.fit_transform(preprocessed_corpus)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 18, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "name": "stdout", 424 | "output_type": "stream", 425 | "text": [ 426 | "['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']\n", 427 | "[[1 1 1 1 1 1]\n", 428 | " [2 1 1 1 1 1]\n", 429 | " [1 1 1 1 1 1]]\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "print(vectorizer_max_features.get_feature_names())\n", 435 | "print(bow_matrix_max_features.toarray())" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "## Inference\n", 443 | "\n", 444 | "Only terms appearing in at least min_df = 2 and at most max_df = 3 documents were retained in the vocabulary.\n", 445 | "\n", 446 | "**Note**: *max_features* was not used here" 447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.6.0" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 2 471 | } 472 | -------------------------------------------------------------------------------- /Chapter04/Dataset/qa_Electronics.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter04/Dataset/qa_Electronics.zip 
-------------------------------------------------------------------------------- /Chapter04/Matrix Representation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}\n", 13 | "[[0 1 1 0 0 0 0 1 0 0]\n", 14 | " [0 0 0 0 0 1 0 0 1 1]\n", 15 | " [1 0 1 1 1 0 1 1 0 0]]\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "from sklearn.feature_extraction.text import CountVectorizer\n", 21 | "\n", 22 | "X = (\"Computers can analyze text\",\n", 23 | " \"They do it using vectors and matrices\",\n", 24 | " \"Computers can process massive amounts of text data\")\n", 25 | "\n", 26 | "vectorizer = CountVectorizer(stop_words='english')\n", 27 | "X_vec = vectorizer.fit_transform(X)\n", 28 | "print(vectorizer.vocabulary_)\n", 29 | "print(X_vec.todense()) " 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.7.7" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 4 54 | } 55 | -------------------------------------------------------------------------------- /Chapter04/One Hot Vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# One Hot Vectors" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "[nltk_data] Downloading package stopwords to\n", 20 | "[nltk_data] /Users/amankedia/nltk_data...\n", 21 | "[nltk_data] Package stopwords is already up-to-date!\n", 22 | "[nltk_data] Downloading package wordnet to\n", 23 | "[nltk_data] /Users/amankedia/nltk_data...\n", 24 | "[nltk_data] Package wordnet is already up-to-date!\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import nltk\n", 30 | "nltk.download('stopwords')\n", 31 | "nltk.download('wordnet')\n", 32 | "from nltk.corpus import stopwords\n", 33 | "from nltk.stem.porter import PorterStemmer \n", 34 | "from nltk.stem.snowball import SnowballStemmer\n", 35 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 36 | "import pandas as pd\n", 37 | "import re\n", 38 | "import numpy as np" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## We take only 1 sentence as input here" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "sentence = [\"We are reading about Natural Language Processing Here\"]" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "0 We are reading about Natural Language Processi...\n", 66 | "dtype: object" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": 
"execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "corpus = pd.Series(sentence)\n", 76 | "corpus" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Data preprocessing" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def text_clean(corpus, keep_list):\n", 93 | " '''\n", 94 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)\n", 95 | " \n", 96 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 97 | " even after the cleaning process\n", 98 | " \n", 99 | " Output : Returns the cleaned text corpus\n", 100 | " \n", 101 | " '''\n", 102 | " cleaned_corpus = pd.Series()\n", 103 | " for row in corpus:\n", 104 | " qs = []\n", 105 | " for word in row.split():\n", 106 | " if word not in keep_list:\n", 107 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 108 | " p1 = p1.lower()\n", 109 | " qs.append(p1)\n", 110 | " else : qs.append(word)\n", 111 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 112 | " return cleaned_corpus" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "def lemmatize(corpus):\n", 122 | " lem = WordNetLemmatizer()\n", 123 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 124 | " return corpus" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "def stem(corpus, stem_type = None):\n", 134 | " if stem_type == 'snowball':\n", 135 | " stemmer = SnowballStemmer(language = 'english')\n", 136 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 137 | " else :\n", 138 | " stemmer = PorterStemmer()\n", 139 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 140 | " return corpus" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "def stopwords_removal(corpus):\n", 150 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 151 | " stop = set(stopwords.words('english'))\n", 152 | " for word in wh_words:\n", 153 | " stop.remove(word)\n", 154 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 155 | " return corpus" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 165 | " '''\n", 166 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 167 | " \n", 168 | " Input : \n", 169 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 170 | " 'keep_list' - List of words to be retained during cleaning process\n", 171 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 172 | " be performed or not\n", 173 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 174 | " Stemmer. 
'snowball' corresponds to Snowball Stemmer\n", 175 | " \n", 176 | " Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together\n", 177 | " \n", 178 | " Output : Returns the processed text corpus\n", 179 | " \n", 180 | " '''\n", 181 | " \n", 182 | " if cleaning == True:\n", 183 | " corpus = text_clean(corpus, keep_list)\n", 184 | " \n", 185 | " if remove_stopwords == True:\n", 186 | " corpus = stopwords_removal(corpus)\n", 187 | " else :\n", 188 | " corpus = [[x for x in x.split()] for x in corpus]\n", 189 | " \n", 190 | " if lemmatization == True:\n", 191 | " corpus = lemmatize(corpus)\n", 192 | " \n", 193 | " \n", 194 | " if stemming == True:\n", 195 | " corpus = stem(corpus, stem_type)\n", 196 | " \n", 197 | " corpus = [' '.join(x) for x in corpus] \n", 198 | "\n", 199 | " return corpus" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "['read natural language process']" 211 | ] 212 | }, 213 | "execution_count": 9, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "# Preprocessing with Lemmatization here\n", 220 | "preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,\n", 221 | " lemmatization = True, remove_stopwords = True)\n", 222 | "preprocessed_corpus" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Building the vocabulary" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 10, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "['natural', 'process', 'language', 'read']\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "set_of_words = set()\n", 247 | "for word in preprocessed_corpus[0].split():\n", 248 | " set_of_words.add(word)\n", 249 | "vocab = list(set_of_words)\n", 250 | "print(vocab)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## Fetching the position of each word in the vocabulary" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 11, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "{'natural': 0, 'process': 1, 'language': 2, 'read': 3}\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "position = {}\n", 275 | "for i, token in enumerate(vocab):\n", 276 | " position[token] = i\n", 277 | "print(position)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## Instantiating the one hot matrix\n", 285 | "\n", 286 | "### Note here every row in the matrix corresponds to the One Hot vector for an individual term" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 12, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "(4, 4)" 298 | ] 299 | }, 300 | "execution_count": 12, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()), len(vocab)))\n", 307 | "one_hot_matrix.shape" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "## Building One Hot Vectors" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 13, 
320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "for i, token in enumerate(preprocessed_corpus[0].split()):\n", 324 | " one_hot_matrix[i][position[token]] = 1" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Visualizing the One Hot Vectors" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 14, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "array([[0., 0., 0., 1.],\n", 343 | " [1., 0., 0., 0.],\n", 344 | " [0., 0., 1., 0.],\n", 345 | " [0., 1., 0., 0.]])" 346 | ] 347 | }, 348 | "execution_count": 14, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "one_hot_matrix" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "## Inference\n", 362 | "\n", 363 | "The first row corresponds to the One Hot vector of *read*,\n", 364 | "\n", 365 | "second for *natural*,\n", 366 | "\n", 367 | "third for *language* and\n", 368 | "\n", 369 | "the final one for *process*\n", 370 | "\n", 371 | "based on their respective indices in the vocabulary" 372 | ] 373 | } 374 | ], 375 | "metadata": { 376 | "kernelspec": { 377 | "display_name": "Python 3", 378 | "language": "python", 379 | "name": "python3" 380 | }, 381 | "language_info": { 382 | "codemirror_mode": { 383 | "name": "ipython", 384 | "version": 3 385 | }, 386 | "file_extension": ".py", 387 | "mimetype": "text/x-python", 388 | "name": "python", 389 | "nbconvert_exporter": "python", 390 | "pygments_lexer": "ipython3", 391 | "version": "3.6.0" 392 | } 393 | }, 394 | "nbformat": 4, 395 | "nbformat_minor": 2 396 | } 397 | -------------------------------------------------------------------------------- /Chapter04/TfIdf Vectorizer for text representation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Term Frequency-Inverse Document Frequency based Vectorizer\n", 8 | "\n", 9 | "**Documentation**: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] /Users/amankedia/nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading package wordnet to\n", 25 | "[nltk_data] /Users/amankedia/nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import nltk\n", 32 | "nltk.download('stopwords')\n", 33 | "nltk.download('wordnet')\n", 34 | "from nltk.corpus import stopwords\n", 35 | "from nltk.stem.porter import PorterStemmer \n", 36 | "from nltk.stem.snowball import SnowballStemmer\n", 37 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 38 | "import pandas as pd\n", 39 | "import re\n", 40 | "import numpy as np\n", 41 | "from sklearn.feature_extraction.text import TfidfVectorizer" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Building a corpus of sentences" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "sentences = [\"We 
are reading about Natural Language Processing Here\",\n", 58 | " \"Natural Language Processing making computers comprehend language data\",\n", 59 | " \"The field of Natural Language Processing is evolving everyday\"]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "0 We are reading about Natural Language Processi...\n", 71 | "1 Natural Language Processing making computers c...\n", 72 | "2 The field of Natural Language Processing is ev...\n", 73 | "dtype: object" 74 | ] 75 | }, 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "corpus = pd.Series(sentences)\n", 83 | "corpus" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Data preprocessing pipeline" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "def text_clean(corpus, keep_list):\n", 100 | " '''\n", 101 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)\n", 102 | " \n", 103 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 104 | " even after the cleaning process\n", 105 | " \n", 106 | " Output : Returns the cleaned text corpus\n", 107 | " \n", 108 | " '''\n", 109 | " cleaned_corpus = pd.Series()\n", 110 | " for row in corpus:\n", 111 | " qs = []\n", 112 | " for word in row.split():\n", 113 | " if word not in keep_list:\n", 114 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 115 | " p1 = p1.lower()\n", 116 | " qs.append(p1)\n", 117 | " else : qs.append(word)\n", 118 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 119 | " return cleaned_corpus" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "def lemmatize(corpus):\n", 129 | " lem = WordNetLemmatizer()\n", 130 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 131 | " return corpus" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "def stem(corpus, stem_type = None):\n", 141 | " if stem_type == 'snowball':\n", 142 | " stemmer = SnowballStemmer(language = 'english')\n", 143 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 144 | " else :\n", 145 | " stemmer = PorterStemmer()\n", 146 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 147 | " return corpus" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def stopwords_removal(corpus):\n", 157 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 158 | " stop = set(stopwords.words('english'))\n", 159 | " for word in wh_words:\n", 160 | " stop.remove(word)\n", 161 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 162 | " return corpus" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 8, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 172 | " '''\n", 173 | " Purpose 
: Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 174 | " \n", 175 | " Input : \n", 176 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 177 | " 'keep_list' - List of words to be retained during cleaning process\n", 178 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 179 | " be performed or not\n", 180 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 181 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 182 | " \n", 183 | " Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together\n", 184 | " \n", 185 | " Output : Returns the processed text corpus\n", 186 | " \n", 187 | " '''\n", 188 | " \n", 189 | " if cleaning == True:\n", 190 | " corpus = text_clean(corpus, keep_list)\n", 191 | " \n", 192 | " if remove_stopwords == True:\n", 193 | " corpus = stopwords_removal(corpus)\n", 194 | " else :\n", 195 | " corpus = [[x for x in x.split()] for x in corpus]\n", 196 | " \n", 197 | " if lemmatization == True:\n", 198 | " corpus = lemmatize(corpus)\n", 199 | " \n", 200 | " \n", 201 | " if stemming == True:\n", 202 | " corpus = stem(corpus, stem_type)\n", 203 | " \n", 204 | " corpus = [' '.join(x) for x in corpus] \n", 205 | "\n", 206 | " return corpus" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 9, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "['read natural language process',\n", 218 | " 'natural language process make computers comprehend language data',\n", 219 | " 'field natural language process evolve everyday']" 220 | ] 221 | }, 222 | "execution_count": 9, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "# Preprocessing with Lemmatization here\n", 229 | "preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,\n", 230 | " lemmatization = True, remove_stopwords = True)\n", 231 | "preprocessed_corpus" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## TfIdfVectorizer" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 10, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "vectorizer = TfidfVectorizer()\n", 248 | "tf_idf_matrix = vectorizer.fit_transform(preprocessed_corpus)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Let's what features were obtained and the corresponding TF-IDF matrix" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 11, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']\n", 268 | "[[0. 0. 0. 0. 0. 0.\n", 269 | " 0.41285857 0. 0.41285857 0.41285857 0.69903033]\n", 270 | " [0.40512186 0.40512186 0.40512186 0. 0. 0.\n", 271 | " 0.478543 0.40512186 0.2392715 0.2392715 0. ]\n", 272 | " [0. 0. 0. 0.49711994 0.49711994 0.49711994\n", 273 | " 0.29360705 0. 0.29360705 0.29360705 0. 
]]\n", 274 | "\n", 275 | "The shape of the TF-IDF matrix is: (3, 11)\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "print(vectorizer.get_feature_names())\n", 281 | "print(tf_idf_matrix.toarray())\n", 282 | "print(\"\\nThe shape of the TF-IDF matrix is: \", tf_idf_matrix.shape)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "## Changing the norm to l1, default option is l2 which was used above\n", 290 | "\n", 291 | "Each output row will have unit norm, which can be one of\n", 292 | "\n", 293 | "**l2**: Sum of squares of vector elements is 1.\n", 294 | "\n", 295 | "**l1**: Sum of absolute values of vector elements is 1." 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 12, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "vectorizer_l1_norm = TfidfVectorizer(norm=\"l1\")\n", 305 | "tf_idf_matrix_l1_norm = vectorizer_l1_norm.fit_transform(preprocessed_corpus)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 13, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "['comprehend', 'computers', 'data', 'everyday', 'evolve', 'field', 'language', 'make', 'natural', 'process', 'read']\n", 318 | "[[0. 0. 0. 0. 0. 0.\n", 319 | " 0.21307663 0. 0.21307663 0.21307663 0.3607701 ]\n", 320 | " [0.1571718 0.1571718 0.1571718 0. 0. 0.\n", 321 | " 0.1856564 0.1571718 0.0928282 0.0928282 0. ]\n", 322 | " [0. 0. 0. 0.2095624 0.2095624 0.2095624\n", 323 | " 0.12377093 0. 0.12377093 0.12377093 0. ]]\n", 324 | "\n", 325 | "The shape of the TF-IDF matrix is: (3, 11)\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "print(vectorizer_l1_norm.get_feature_names())\n", 331 | "print(tf_idf_matrix_l1_norm.toarray())\n", 332 | "print(\"\\nThe shape of the TF-IDF matrix is: \", tf_idf_matrix_l1_norm.shape)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## N-grams and Max features with TfidfVectorizer" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 14, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "vectorizer_n_gram_max_features = TfidfVectorizer(norm=\"l2\", analyzer='word', ngram_range=(1,3), max_features = 6)\n", 349 | "tf_idf_matrix_n_gram_max_features = vectorizer_n_gram_max_features.fit_transform(preprocessed_corpus)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 15, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "['language', 'language process', 'natural', 'natural language', 'natural language process', 'process']\n", 362 | "[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]\n", 363 | " [0.66666667 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333]\n", 364 | " [0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]\n", 365 | "\n", 366 | "The shape of the TF-IDF matrix is: (3, 6)\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "print(vectorizer_n_gram_max_features.get_feature_names())\n", 372 | "print(tf_idf_matrix_n_gram_max_features.toarray())\n", 373 | "print(\"\\nThe shape of the TF-IDF matrix is: \", tf_idf_matrix_n_gram_max_features.shape)" 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 
385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.6.0" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 2 398 | } 399 | -------------------------------------------------------------------------------- /Chapter05/Word Mover's Distance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Word Mover's Distance as a Dissimilarity/Similarity Measure" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Import the libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import gensim\n", 24 | "from gensim.models import KeyedVectors" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Load the pre-trained Word2Vec model" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "model = KeyedVectors.load_word2vec_format('/Users/amankedia/Desktop/Sunday/nlp-book/Chapter 5/Code/GoogleNews-vectors-negative300.bin', binary=True)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Define the sentences" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "sentence_1 = \"Obama speaks to the media in Illinois\"\n", 57 | "sentence_2 = \"President greets the press in Chicago\"\n", 58 | "sentence_3 = \"Apple is my favorite company\"" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Compute the Word Mover's Distance between the sentences" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "1.1642040735998236" 77 | ] 78 | }, 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "word_mover_distance = model.wmdistance(sentence_1, sentence_2)\n", 86 | "word_mover_distance" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "1.365806580758697" 98 | ] 99 | }, 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "word_mover_distance = model.wmdistance(sentence_1, sentence_3)\n", 107 | "word_mover_distance" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### Normalizing the word embeddings to get a best measure of distance" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "model.init_sims(replace = True)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "### Recomputing the Word Mover's Distance between the sentences based on normalized embeddings" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | 
"data": { 140 | "text/plain": [ 141 | "0.4277553083600646" 142 | ] 143 | }, 144 | "execution_count": 7, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "word_mover_distance = model.wmdistance(sentence_1, sentence_2)\n", 151 | "word_mover_distance" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "0.47793400675650705" 163 | ] 164 | }, 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "word_mover_distance = model.wmdistance(sentence_1, sentence_3)\n", 172 | "word_mover_distance" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.6.0" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /Chapter06/comments.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter06/comments.zip -------------------------------------------------------------------------------- /Chapter06/fastText based Applications (Spell Correction_Auto Suggestion_Document Distances).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building a Spell Corrector/Text Suggestor using fastText" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Importing the libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "[nltk_data] Downloading package stopwords to\n", 27 | "[nltk_data] /Users/amankedia/nltk_data...\n", 28 | "[nltk_data] Package stopwords is already up-to-date!\n", 29 | "[nltk_data] Downloading package wordnet to\n", 30 | "[nltk_data] /Users/amankedia/nltk_data...\n", 31 | "[nltk_data] Package wordnet is already up-to-date!\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import nltk\n", 37 | "import re\n", 38 | "nltk.download('stopwords')\n", 39 | "nltk.download('wordnet')\n", 40 | "from nltk.corpus import stopwords\n", 41 | "from nltk.stem.porter import PorterStemmer \n", 42 | "from nltk.stem.snowball import SnowballStemmer\n", 43 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 44 | "from gensim.models import FastText\n", 45 | "import io\n", 46 | "import collections" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Reading the data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "words = []\n", 63 | "data = []\n", 64 | "with io.open('comments.txt', 'r') as file:\n", 65 | " for entry in file:\n", 66 | " entry = entry.strip()\n", 67 | " 
data.append(entry)\n", 68 | " words.extend(entry.split())" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Checking for common terms in the data" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "[('the', 445892),\n", 87 | " ('to', 288753),\n", 88 | " ('of', 219279),\n", 89 | " ('and', 207335),\n", 90 | " ('a', 201765),\n", 91 | " ('I', 182618),\n", 92 | " ('is', 164602),\n", 93 | " ('you', 157025),\n", 94 | " ('that', 140495),\n", 95 | " ('in', 130244)]" 96 | ] 97 | }, 98 | "execution_count": 3, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "unique_words = []\n", 105 | "unique_words = collections.Counter(words)\n", 106 | "unique_words.most_common(10)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "['\"Explanation',\n", 118 | " 'Why the edits made under my username Hardcore Metallica Fan were reverted? They weren\\'t vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don\\'t remove the template from the talk page since I\\'m retired now.89.205.38.27\"',\n", 119 | " \"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)\"]" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "data[:3]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Data Preprocessing" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "def text_clean(corpus):\n", 145 | " '''\n", 146 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. 
removed)\n", 147 | " \n", 148 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 149 | " even after the cleaning process\n", 150 | " \n", 151 | " Output : Returns the cleaned text corpus\n", 152 | " \n", 153 | " '''\n", 154 | " cleaned_corpus = []\n", 155 | " for row in corpus:\n", 156 | " qs = []\n", 157 | " for word in row.split():\n", 158 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 159 | " p1 = p1.lower()\n", 160 | " qs.append(p1)\n", 161 | " cleaned_corpus.append(' '.join(qs))\n", 162 | " return cleaned_corpus" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def stopwords_removal(corpus):\n", 172 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 173 | " stop = set(stopwords.words('english'))\n", 174 | " for word in wh_words:\n", 175 | " stop.remove(word)\n", 176 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 177 | " return corpus" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 7, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "def lemmatize(corpus):\n", 187 | " lem = WordNetLemmatizer()\n", 188 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 189 | " return corpus" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 8, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def stem(corpus, stem_type = None):\n", 199 | " if stem_type == 'snowball':\n", 200 | " stemmer = SnowballStemmer(language = 'english')\n", 201 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 202 | " else :\n", 203 | " stemmer = PorterStemmer()\n", 204 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 205 | " return corpus" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 9, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 215 | " '''\n", 216 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 217 | " \n", 218 | " Input : \n", 219 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 220 | " 'keep_list' - List of words to be retained during cleaning process\n", 221 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 222 | " be performed or not\n", 223 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 224 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 225 | " \n", 226 | " Note : Either stemming or lemmatization should be used. 
There's no benefit of using both of them together\n", 227 | " \n", 228 | " Output : Returns the processed text corpus\n", 229 | " \n", 230 | " '''\n", 231 | " \n", 232 | " if cleaning == True:\n", 233 | " corpus = text_clean(corpus)\n", 234 | " \n", 235 | " if remove_stopwords == True:\n", 236 | " corpus = stopwords_removal(corpus)\n", 237 | " else :\n", 238 | " corpus = [[x for x in x.split()] for x in corpus]\n", 239 | " \n", 240 | " if lemmatization == True:\n", 241 | " corpus = lemmatize(corpus)\n", 242 | " \n", 243 | " \n", 244 | " if stemming == True:\n", 245 | " corpus = stem(corpus, stem_type)\n", 246 | " \n", 247 | " corpus = [' '.join(x) for x in corpus] \n", 248 | "\n", 249 | " return corpus" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 10, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "data = preprocess(data)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### Data conversion into formation expected by fastText" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 11, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "preprocessed_data = []\n", 275 | "for line in data:\n", 276 | " if line != \"\":\n", 277 | " preprocessed_data.append(line.split())" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "### Building the fastText model" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 12, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "model = FastText(size=300, window=3, min_count=1, min_n=1, max_n=5)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 13, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "model.build_vocab(sentences=preprocessed_data)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 14, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "182228" 314 | ] 315 | }, 316 | "execution_count": 14, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "len(model.wv.vocab)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 15, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "model.train(sentences=preprocessed_data, total_examples=len(preprocessed_data), epochs=10)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "### Checking for top 5 similar terms returned by the model for specific words (Can be spell corrections and suggestions)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 16, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "[('xplain', 0.8708899021148682),\n", 350 | " ('eexplain', 0.8258755207061768),\n", 351 | " ('explain', 0.8239225149154663),\n", 352 | " ('plain', 0.8152675628662109),\n", 353 | " ('reexplain', 0.8069050312042236)]" 354 | ] 355 | }, 356 | "execution_count": 16, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "model.wv.most_similar('eplain', topn=5)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 17, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "[('remainder', 0.9160404205322266),\n", 374 | " ('rejoinder', 0.914063572883606),\n", 375 | " 
('minderbinder', 0.9082638025283813),\n", 376 | " ('reminde', 0.9064908623695374),\n", 377 | " ('reminders', 0.9056416153907776)]" 378 | ] 379 | }, 380 | "execution_count": 17, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "model.wv.most_similar('reminder', topn=5)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 18, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "[('relevant', 0.8082616925239563),\n", 398 | " ('relev', 0.8032398223876953),\n", 399 | " ('releant', 0.7884056568145752),\n", 400 | " ('releavant', 0.7859309315681458),\n", 401 | " ('relevanmt', 0.782160222530365)]" 402 | ] 403 | }, 404 | "execution_count": 18, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "model.wv.most_similar('relevnt', topn=5)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 19, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "[('purpse', 0.9277137517929077),\n", 422 | " ('cpurse', 0.9121625423431396),\n", 423 | " ('pure', 0.8952039480209351),\n", 424 | " ('pursue', 0.8886381387710571),\n", 425 | " ('pursuit', 0.8727647066116333)]" 426 | ] 427 | }, 428 | "execution_count": 19, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "model.wv.most_similar('purse', topn=5)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "## fastText and Word Mover's Distance" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 20, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "sentence_1 = \"Obama speaks to the media in Illinois\"\n", 451 | "sentence_2 = \"President greets the press in Chicago\"\n", 452 | "sentence_3 = \"Apple is my favorite company\"" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 21, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stderr", 462 | "output_type": "stream", 463 | "text": [ 464 | "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `wmdistance` (Method will be removed in 4.0.0, use self.wv.wmdistance() instead).\n", 465 | " \"\"\"Entry point for launching an IPython kernel.\n" 466 | ] 467 | }, 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "16.1709604954656" 472 | ] 473 | }, 474 | "execution_count": 21, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "word_mover_distance = model.wmdistance(sentence_1, sentence_2)\n", 481 | "word_mover_distance" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 22, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stderr", 491 | "output_type": "stream", 492 | "text": [ 493 | "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `wmdistance` (Method will be removed in 4.0.0, use self.wv.wmdistance() instead).\n", 494 | " \"\"\"Entry point for launching an IPython kernel.\n" 495 | ] 496 | }, 497 | { 498 | "data": { 499 | "text/plain": [ 500 | "21.046283089663497" 501 | ] 502 | }, 503 | "execution_count": 22, 504 | "metadata": {}, 505 | "output_type": "execute_result" 506 | } 507 | ], 508 | "source": [ 509 | "word_mover_distance = 
model.wmdistance(sentence_2, sentence_3)\n", 510 | "word_mover_distance" 511 | ] 512 | } 513 | ], 514 | "metadata": { 515 | "kernelspec": { 516 | "display_name": "Python 3", 517 | "language": "python", 518 | "name": "python3" 519 | }, 520 | "language_info": { 521 | "codemirror_mode": { 522 | "name": "ipython", 523 | "version": 3 524 | }, 525 | "file_extension": ".py", 526 | "mimetype": "text/x-python", 527 | "name": "python", 528 | "nbconvert_exporter": "python", 529 | "pygments_lexer": "ipython3", 530 | "version": "3.6.0" 531 | } 532 | }, 533 | "nbformat": 4, 534 | "nbformat_minor": 2 535 | } 536 | -------------------------------------------------------------------------------- /Chapter06/fastText from scratch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building fastText based models" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Importing the libraries and data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from gensim.models import FastText\n", 24 | "from gensim.test.utils import common_texts" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "[['human', 'interface', 'computer'],\n", 36 | " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", 37 | " ['eps', 'user', 'interface', 'system'],\n", 38 | " ['system', 'human', 'system', 'eps'],\n", 39 | " ['user', 'response', 'time'],\n", 40 | " ['trees'],\n", 41 | " ['graph', 'trees'],\n", 42 | " ['graph', 'minors', 'trees'],\n", 43 | " ['graph', 'minors', 'survey']]" 44 | ] 45 | }, 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "common_texts" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "### Building a basic model" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "model = FastText(size=5, window=3, min_count=1)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "model.build_vocab(sentences=common_texts)\n", 78 | "model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Check the vocabulary" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "{'human': ,\n", 97 | " 'interface': ,\n", 98 | " 'computer': ,\n", 99 | " 'survey': ,\n", 100 | " 'user': ,\n", 101 | " 'system': ,\n", 102 | " 'response': ,\n", 103 | " 'time': ,\n", 104 | " 'eps': ,\n", 105 | " 'trees': ,\n", 106 | " 'graph': ,\n", 107 | " 'minors': }" 108 | ] 109 | }, 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "model.wv.vocab" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "array([ 0.03953331, -0.02951075, 0.02039873, 0.00304991, 
-0.00968183],\n", 128 | " dtype=float32)" 129 | ] 130 | }, 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "model.wv['human']" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Checkout the most similar feature" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "[('system', 0.908109724521637),\n", 156 | " ('eps', 0.886881947517395),\n", 157 | " ('response', 0.6286922097206116),\n", 158 | " ('user', 0.38861846923828125),\n", 159 | " ('minors', 0.24753454327583313),\n", 160 | " ('time', 0.06086184084415436),\n", 161 | " ('survey', -0.0791618824005127),\n", 162 | " ('trees', -0.40337082743644714),\n", 163 | " ('graph', -0.46148836612701416)]" 164 | ] 165 | }, 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "model.wv.most_similar(positive=['computer', 'interface'], negative=['human'])" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### min_n and max_n parameters" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "model = FastText(size=5, window=3, min_count=1, min_n=1, max_n=5)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "model.build_vocab(sentences=common_texts)\n", 198 | "model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Let's try and fetch a representation for an out of vocabulary word" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 10, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "array([-0.01671136, -0.01868909, -0.03945312, -0.01389101, -0.0250267 ],\n", 217 | " dtype=float32)" 218 | ] 219 | }, 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "model.wv['rubber']" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "### Checkout the most similar feature using an Out of Vocab term" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "[('time', 0.5615436434745789),\n", 245 | " ('system', 0.4772699475288391),\n", 246 | " ('minors', 0.3850055932998657),\n", 247 | " ('eps', 0.15983597934246063),\n", 248 | " ('user', -0.2565014064311981),\n", 249 | " ('graph', -0.411243200302124),\n", 250 | " ('response', -0.4405473470687866),\n", 251 | " ('trees', -0.6079868078231812),\n", 252 | " ('interface', -0.6381739377975464),\n", 253 | " ('survey', -0.8393087387084961)]" 254 | ] 255 | }, 256 | "execution_count": 11, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "model.wv.most_similar(positive=['computer', 'human'], negative=['rubber'])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "### Extending the built model to incorporate words from new 
sentences" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "sentences_to_be_added = [[\"I\", \"am\", \"learning\", \"Natural\", \"Language\", \"Processing\"],\n", 279 | " [\"Natural\", \"Language\", \"Processing\", \"is\", \"cool\"]]" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 13, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "model.build_vocab(sentences_to_be_added, update=True)\n", 289 | "model.train(sentences=common_texts, total_examples=len(sentences_to_be_added), epochs=10)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 14, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "{'human': ,\n", 301 | " 'interface': ,\n", 302 | " 'computer': ,\n", 303 | " 'survey': ,\n", 304 | " 'user': ,\n", 305 | " 'system': ,\n", 306 | " 'response': ,\n", 307 | " 'time': ,\n", 308 | " 'eps': ,\n", 309 | " 'trees': ,\n", 310 | " 'graph': ,\n", 311 | " 'minors': ,\n", 312 | " 'I': ,\n", 313 | " 'am': ,\n", 314 | " 'learning': ,\n", 315 | " 'Natural': ,\n", 316 | " 'Language': ,\n", 317 | " 'Processing': ,\n", 318 | " 'is': ,\n", 319 | " 'cool': }" 320 | ] 321 | }, 322 | "execution_count": 14, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "model.wv.vocab" 329 | ] 330 | } 331 | ], 332 | "metadata": { 333 | "kernelspec": { 334 | "display_name": "Python 3", 335 | "language": "python", 336 | "name": "python3" 337 | }, 338 | "language_info": { 339 | "codemirror_mode": { 340 | "name": "ipython", 341 | "version": 3 342 | }, 343 | "file_extension": ".py", 344 | "mimetype": "text/x-python", 345 | "name": "python", 346 | "nbconvert_exporter": "python", 347 | "pygments_lexer": "ipython3", 348 | "version": "3.6.0" 349 | } 350 | }, 351 | "nbformat": 4, 352 | "nbformat_minor": 2 353 | } 354 | -------------------------------------------------------------------------------- /Chapter07/Data_Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.7" 21 | }, 22 | "colab": { 23 | "name": "Data Preprocessing.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "bkVrfzQjZQM4", 32 | "colab_type": "code", 33 | "colab": { 34 | "base_uri": "https://localhost:8080/", 35 | "height": 229 36 | }, 37 | "outputId": "238ca024-d33f-4044-bff4-e1819ed4cc2d" 38 | }, 39 | "source": [ 40 | "#!pip install seaborn\n", 41 | "import seaborn as sns\n", 42 | "\n", 43 | "tips_df = sns.load_dataset('tips')\n", 44 | "tips_df.head()" 45 | ], 46 | "execution_count": 1, 47 | "outputs": [ 48 | { 49 | "output_type": "stream", 50 | "text": [ 51 | "/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. 
Use the functions in the public API at pandas.testing instead.\n", 52 | " import pandas.util.testing as tm\n" 53 | ], 54 | "name": "stderr" 55 | }, 56 | { 57 | "output_type": "execute_result", 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", 140 | "
" 141 | ], 142 | "text/plain": [ 143 | " total_bill tip sex smoker day time size\n", 144 | "0 16.99 1.01 Female No Sun Dinner 2\n", 145 | "1 10.34 1.66 Male No Sun Dinner 3\n", 146 | "2 21.01 3.50 Male No Sun Dinner 3\n", 147 | "3 23.68 3.31 Male No Sun Dinner 2\n", 148 | "4 24.59 3.61 Female No Sun Dinner 4" 149 | ] 150 | }, 151 | "metadata": { 152 | "tags": [] 153 | }, 154 | "execution_count": 1 155 | } 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "HCd7xfMzZQM8", 162 | "colab_type": "code", 163 | "colab": { 164 | "base_uri": "https://localhost:8080/", 165 | "height": 230 166 | }, 167 | "outputId": "35a70c76-9ab6-4d64-d0b3-238baad1b58f" 168 | }, 169 | "source": [ 170 | "tips_df.isnull().values.any() #Check if NaN value in the dataset\n", 171 | "tips_df.isnull().any() #Check if NaN value in the columns\n", 172 | "tips_df.isnull().any(axis=1) #Check if NaN value in the rows" 173 | ], 174 | "execution_count": 2, 175 | "outputs": [ 176 | { 177 | "output_type": "execute_result", 178 | "data": { 179 | "text/plain": [ 180 | "0 False\n", 181 | "1 False\n", 182 | "2 False\n", 183 | "3 False\n", 184 | "4 False\n", 185 | " ... \n", 186 | "239 False\n", 187 | "240 False\n", 188 | "241 False\n", 189 | "242 False\n", 190 | "243 False\n", 191 | "Length: 244, dtype: bool" 192 | ] 193 | }, 194 | "metadata": { 195 | "tags": [] 196 | }, 197 | "execution_count": 2 198 | } 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "metadata": { 204 | "id": "4YFPIhWvZQM_", 205 | "colab_type": "code", 206 | "colab": { 207 | "base_uri": "https://localhost:8080/", 208 | "height": 194 209 | }, 210 | "outputId": "02871610-2aba-4e74-bc07-f1a03c7b6200" 211 | }, 212 | "source": [ 213 | "#Label encoding\n", 214 | "from sklearn.preprocessing import LabelEncoder\n", 215 | "label_encoding = LabelEncoder()\n", 216 | "tips_df.iloc[:,[2,3,4,5]] = tips_df.iloc[:,[2,3,4,5]].apply(label_encoding.fit_transform)\n", 217 | "tips_df.head()" 218 | ], 219 | "execution_count": 3, 220 | "outputs": [ 221 | { 222 | "output_type": "execute_result", 223 | "data": { 224 | "text/html": [ 225 | "
\n", 226 | "\n", 239 | "\n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | "
total_billtipsexsmokerdaytimesize
016.991.0100202
110.341.6610203
221.013.5010203
323.683.3110202
424.593.6100204
\n", 305 | "
" 306 | ], 307 | "text/plain": [ 308 | " total_bill tip sex smoker day time size\n", 309 | "0 16.99 1.01 0 0 2 0 2\n", 310 | "1 10.34 1.66 1 0 2 0 3\n", 311 | "2 21.01 3.50 1 0 2 0 3\n", 312 | "3 23.68 3.31 1 0 2 0 2\n", 313 | "4 24.59 3.61 0 0 2 0 4" 314 | ] 315 | }, 316 | "metadata": { 317 | "tags": [] 318 | }, 319 | "execution_count": 3 320 | } 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "metadata": { 326 | "id": "5N3tXwFfZQNB", 327 | "colab_type": "code", 328 | "colab": { 329 | "base_uri": "https://localhost:8080/", 330 | "height": 35 331 | }, 332 | "outputId": "6800be7b-1ac9-42cd-8b97-e4163ee2b813" 333 | }, 334 | "source": [ 335 | "#visualizing the encodings in label encoding\n", 336 | "label_encoding = LabelEncoder()\n", 337 | "col_fit = label_encoding.fit(tips_df[\"day\"])\n", 338 | "dict(zip(col_fit.classes_, col_fit.transform(col_fit.classes_)))" 339 | ], 340 | "execution_count": 4, 341 | "outputs": [ 342 | { 343 | "output_type": "execute_result", 344 | "data": { 345 | "text/plain": [ 346 | "{0: 0, 1: 1, 2: 2, 3: 3}" 347 | ] 348 | }, 349 | "metadata": { 350 | "tags": [] 351 | }, 352 | "execution_count": 4 353 | } 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "metadata": { 359 | "id": "qenoI3z4ZQNG", 360 | "colab_type": "code", 361 | "colab": { 362 | "base_uri": "https://localhost:8080/", 363 | "height": 141 364 | }, 365 | "outputId": "a3590200-04aa-43f9-d3db-5933125775d2" 366 | }, 367 | "source": [ 368 | "#One Hot encoding\n", 369 | "from sklearn.preprocessing import OneHotEncoder\n", 370 | "from sklearn.compose import ColumnTransformer\n", 371 | "oh_encoding = ColumnTransformer([('OneHotEncoding', OneHotEncoder(), [2,3,4,5])],remainder='passthrough')\n", 372 | "tips_df_ohe = oh_encoding.fit_transform(tips_df)\n", 373 | "tips_df_ohe" 374 | ], 375 | "execution_count": 5, 376 | "outputs": [ 377 | { 378 | "output_type": "execute_result", 379 | "data": { 380 | "text/plain": [ 381 | "array([[ 1. , 0. , 1. , ..., 16.99, 1.01, 2. ],\n", 382 | " [ 0. , 1. , 1. , ..., 10.34, 1.66, 3. ],\n", 383 | " [ 0. , 1. , 1. , ..., 21.01, 3.5 , 3. ],\n", 384 | " ...,\n", 385 | " [ 0. , 1. , 0. , ..., 22.67, 2. , 2. ],\n", 386 | " [ 0. , 1. , 1. , ..., 17.82, 1.75, 2. ],\n", 387 | " [ 1. , 0. , 1. , ..., 18.78, 3. , 2. 
]])" 388 | ] 389 | }, 390 | "metadata": { 391 | "tags": [] 392 | }, 393 | "execution_count": 5 394 | } 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "metadata": { 400 | "id": "Wo34nIqiZQNJ", 401 | "colab_type": "code", 402 | "colab": { 403 | "base_uri": "https://localhost:8080/", 404 | "height": 248 405 | }, 406 | "outputId": "e025ce0b-e339-45a8-a079-4e0310d0c873" 407 | }, 408 | "source": [ 409 | "#Standardization\n", 410 | "from sklearn.preprocessing import StandardScaler\n", 411 | "zs = StandardScaler()\n", 412 | "tips_df_std = zs.fit_transform(tips_df_ohe)\n", 413 | "tips_df_std" 414 | ], 415 | "execution_count": 6, 416 | "outputs": [ 417 | { 418 | "output_type": "execute_result", 419 | "data": { 420 | "text/plain": [ 421 | "array([[ 1.34335316e+00, -1.34335316e+00, 7.84789169e-01, ...,\n", 422 | " -3.14711305e-01, -1.43994695e+00, -6.00192629e-01],\n", 423 | " [-7.44405889e-01, 7.44405889e-01, 7.84789169e-01, ...,\n", 424 | " -1.06323531e+00, -9.69205340e-01, 4.53382921e-01],\n", 425 | " [-7.44405889e-01, 7.44405889e-01, 7.84789169e-01, ...,\n", 426 | " 1.37779900e-01, 3.63355539e-01, 4.53382921e-01],\n", 427 | " ...,\n", 428 | " [-7.44405889e-01, 7.44405889e-01, -1.27422758e+00, ...,\n", 429 | " 3.24629502e-01, -7.22971264e-01, -6.00192629e-01],\n", 430 | " [-7.44405889e-01, 7.44405889e-01, 7.84789169e-01, ...,\n", 431 | " -2.21286504e-01, -9.04025732e-01, -6.00192629e-01],\n", 432 | " [ 1.34335316e+00, -1.34335316e+00, 7.84789169e-01, ...,\n", 433 | " -1.13228903e-01, 1.24660453e-03, -6.00192629e-01]])" 434 | ] 435 | }, 436 | "metadata": { 437 | "tags": [] 438 | }, 439 | "execution_count": 6 440 | } 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "metadata": { 446 | "id": "7oKcjpc-ZQNM", 447 | "colab_type": "code", 448 | "colab": { 449 | "base_uri": "https://localhost:8080/", 450 | "height": 248 451 | }, 452 | "outputId": "eab38064-5a25-4915-b2e9-a2ce4e47dd72" 453 | }, 454 | "source": [ 455 | "from sklearn.preprocessing import MinMaxScaler\n", 456 | "minmax = MinMaxScaler()\n", 457 | "tips_df_std = minmax.fit_transform(tips_df_ohe)\n", 458 | "tips_df_std" 459 | ], 460 | "execution_count": 7, 461 | "outputs": [ 462 | { 463 | "output_type": "execute_result", 464 | "data": { 465 | "text/plain": [ 466 | "array([[1. , 0. , 1. , ..., 0.29157939, 0.00111111,\n", 467 | " 0.2 ],\n", 468 | " [0. , 1. , 1. , ..., 0.1522832 , 0.07333333,\n", 469 | " 0.4 ],\n", 470 | " [0. , 1. , 1. , ..., 0.3757855 , 0.27777778,\n", 471 | " 0.4 ],\n", 472 | " ...,\n", 473 | " [0. , 1. , 0. , ..., 0.41055718, 0.11111111,\n", 474 | " 0.2 ],\n", 475 | " [0. , 1. , 1. , ..., 0.30896523, 0.08333333,\n", 476 | " 0.2 ],\n", 477 | " [1. , 0. , 1. 
, ..., 0.32907415, 0.22222222,\n", 478 | " 0.2 ]])" 479 | ] 480 | }, 481 | "metadata": { 482 | "tags": [] 483 | }, 484 | "execution_count": 7 485 | } 486 | ] 487 | } 488 | ] 489 | } -------------------------------------------------------------------------------- /Chapter07/Sentiment Analyzer Model Reload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook explains how to reuse the trained model which we pickled in the notebook title \"Sentiment Analyzer\"" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "def sentiment_pred(classifier, training_matrix, doc):\n", 17 | " \"\"\"function to predict the sentiment of a product review\n", 18 | " \n", 19 | " classifier : pre trained model\n", 20 | " training_matrix : matrix of features associated with the trained model\n", 21 | " doc = product review whose sentiment needs to be identified\"\"\"\n", 22 | " \n", 23 | " X_new = training_matrix.transform(pd.Series(doc)) #don't use fit_transform here because the model is already fitted\n", 24 | " X_new = X_new.todense() #convert sparse matrix to dense\n", 25 | "\n", 26 | " from sklearn.feature_extraction.text import TfidfTransformer\n", 27 | " tfidf = TfidfTransformer() #by default applies \"l2\" normalization\n", 28 | " X_tfidf_new = tfidf.fit_transform(X_new)\n", 29 | " X_tfidf_new = X_tfidf_new.todense()\n", 30 | "\n", 31 | " y_new = classifier.predict(X_tfidf_new)\n", 32 | " if y_new[0] == 0:\n", 33 | " return \"negative sentiment\"\n", 34 | " elif y_new[0] == 1:\n", 35 | " return \"positive sentiment\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "'negative sentiment'" 47 | ] 48 | }, 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "import pandas as pd\n", 56 | "import pickle\n", 57 | "nb_clf = pickle.load(open(\"nb_sa\", 'rb'))\n", 58 | "vectorizer = pickle.load(open(\"vectorizer_sa\", 'rb'))\n", 59 | "new_doc = \"Not even close to the quality one would expect\"\n", 60 | "sentiment_pred(nb_clf, vectorizer, new_doc)" 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.7.7" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 4 85 | } 86 | -------------------------------------------------------------------------------- /Chapter07/Sentiment Analyzer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | "
01
0So there is no way for me to plug it in here i...0
1Good case, Excellent value.1
2Great for the jawbone.1
3Tied to charger for conversations lasting more...0
4The mic is great.1
\n", 61 | "
" 62 | ], 63 | "text/plain": [ 64 | " 0 1\n", 65 | "0 So there is no way for me to plug it in here i... 0\n", 66 | "1 Good case, Excellent value. 1\n", 67 | "2 Great for the jawbone. 1\n", 68 | "3 Tied to charger for conversations lasting more... 0\n", 69 | "4 The mic is great. 1" 70 | ] 71 | }, 72 | "execution_count": 1, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "import pandas as pd\n", 79 | "data = pd.read_csv(\"amazon_cells_labelled.txt\", sep='\\t', header=None)\n", 80 | "data.head()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "X = data.iloc[:,0] # extract column with review\n", 90 | "y = data.iloc[:,-1] # extract column with sentiment" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "matrix([[0, 0, 0, ..., 0, 0, 0],\n", 102 | " [0, 0, 0, ..., 0, 0, 0],\n", 103 | " [0, 0, 0, ..., 0, 0, 0],\n", 104 | " ...,\n", 105 | " [0, 0, 0, ..., 0, 0, 0],\n", 106 | " [0, 0, 0, ..., 0, 0, 0],\n", 107 | " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" 108 | ] 109 | }, 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "# tokenize the news text and convert data in matrix format\n", 117 | "from sklearn.feature_extraction.text import CountVectorizer\n", 118 | "vectorizer = CountVectorizer(stop_words='english')\n", 119 | "X_vec = vectorizer.fit_transform(X)\n", 120 | "X_vec = X_vec.todense() # convert sparse matrix into dense matrix\n", 121 | "X_vec" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "matrix([[0., 0., 0., ..., 0., 0., 0.],\n", 133 | " [0., 0., 0., ..., 0., 0., 0.],\n", 134 | " [0., 0., 0., ..., 0., 0., 0.],\n", 135 | " ...,\n", 136 | " [0., 0., 0., ..., 0., 0., 0.],\n", 137 | " [0., 0., 0., ..., 0., 0., 0.],\n", 138 | " [0., 0., 0., ..., 0., 0., 0.]])" 139 | ] 140 | }, 141 | "execution_count": 5, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "# Transform data by applying term frequency inverse document frequency (TF-IDF) \n", 148 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 149 | "tfidf = TfidfTransformer() #by default applies \"l2\" normalization\n", 150 | "X_tfidf = tfidf.fit_transform(X_vec)\n", 151 | "X_tfidf = X_tfidf.todense()\n", 152 | "X_tfidf" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 6, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "##################Apply Naive Bayes algorithm to train data####################\n", 162 | "\n", 163 | "# Extract the news body and labels for training the classifier\n", 164 | "from sklearn.model_selection import train_test_split\n", 165 | "X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.25, random_state = 0)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "MultinomialNB()" 177 | ] 178 | }, 179 | "execution_count": 7, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# Train the NB classifier\n", 186 | "from sklearn.naive_bayes import MultinomialNB\n", 187 | "clf = 
MultinomialNB()\n", 188 | "clf.fit(X_train, y_train)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Predicting the Test set results\n", 198 | "y_pred = clf.predict(X_test)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array([[ 87, 33],\n", 210 | " [ 20, 110]], dtype=int64)" 211 | ] 212 | }, 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "# Confusion Matrix\n", 220 | "from sklearn.metrics import confusion_matrix\n", 221 | "cm = confusion_matrix(y_test, y_pred)\n", 222 | "cm" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 11, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "SVC(kernel='linear')" 234 | ] 235 | }, 236 | "execution_count": 11, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "###################################Fitting SVM to the Training set#############################################\n", 243 | "from sklearn.svm import SVC\n", 244 | "classifier = SVC(kernel='linear')\n", 245 | "classifier.fit(X_train, y_train)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 12, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "y_pred = classifier.predict(X_test)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 13, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "array([[102, 18],\n", 266 | " [ 33, 97]], dtype=int64)" 267 | ] 268 | }, 269 | "execution_count": 13, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "# Making the Confusion Matrix\n", 276 | "from sklearn.metrics import confusion_matrix\n", 277 | "cm = confusion_matrix(y_test, y_pred)\n", 278 | "cm" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 14, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "#pickling the model for reuse\n", 288 | "import pickle\n", 289 | "pickle.dump(vectorizer, open(\"vectorizer_sa\", 'wb')) # Save vectorizer for reuse\n", 290 | "pickle.dump(classifier, open(\"nb_sa\", 'wb')) # Save classifier for reuse" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.7.7" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 4 315 | } 316 | -------------------------------------------------------------------------------- /Chapter07/nb_sa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter07/nb_sa -------------------------------------------------------------------------------- /Chapter07/vectorizer_sa: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter07/vectorizer_sa -------------------------------------------------------------------------------- /Chapter08/Output Files/question_classification_model.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "batch_input_shape": [null, 8030], "dtype": "float32", "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "dtype": "float32", "rate": 0.3, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "dtype": "float32", "units": 6, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.2.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /Chapter08/Output Files/question_classification_weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter08/Output Files/question_classification_weights.h5 -------------------------------------------------------------------------------- /Chapter09/Dataset/Sarcasm_Headlines_Dataset_v2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter09/Dataset/Sarcasm_Headlines_Dataset_v2.zip -------------------------------------------------------------------------------- /Chapter09/Output Files/sarcasm_detection_model_cnn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter09/Output Files/sarcasm_detection_model_cnn.h5 -------------------------------------------------------------------------------- /Chapter09/Output Files/sarcasm_detection_model_cnn.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 10, 300], "dtype": "float32", "filters": 8, "kernel_size": [3], "strides": [1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": 
{"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "GlobalMaxPooling1D", "config": {"name": "global_max_pooling1d_1", "trainable": true, "dtype": "float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": "float32", "units": 10, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "dtype": "float32", "rate": 0.35, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "dtype": "float32", "units": 5, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "dtype": "float32", "rate": 0.35, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_3", "trainable": true, "dtype": "float32", "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.2.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /Chapter10/Dataset/hotel_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter10/Dataset/hotel_data.zip -------------------------------------------------------------------------------- /Chapter10/Output Files/text_generation_using_LSTM.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter10/Output Files/text_generation_using_LSTM.h5 -------------------------------------------------------------------------------- /Chapter10/Output Files/text_generation_using_LSTM.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "sequential_1", "layers": [{"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": true, "batch_input_shape": [null, 307], "dtype": "float32", "input_dim": 3395, "output_dim": 10, "embeddings_initializer": {"class_name": "RandomUniform", "config": 
{"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 307}}, {"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 128, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "dtype": "float32", "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": "float32", "units": 3395, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.2.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /Chapter11/dataset/bilingual_pairs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Python-Natural-Language-Processing/a73a7644c21aaf83a257ea63f11692eeef579aec/Chapter11/dataset/bilingual_pairs.zip -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-On Python Natural Language Processing 5 | 6 | Hands-On Python Natural Language Processing 7 | 8 | This is the code repository for [Hands-On Python Natural Language Processing](https://www.packtpub.com/data/hands-on-python-natural-language-processing?utm_source=github&utm_medium=repository&utm_campaign=9781838989590), published by Packt. 9 | 10 | **Explore tools and techniques to analyze and process text with a view to building real-world NLP applications** 11 | 12 | ## What is this book about? 13 | This book provides a blend of both the theoretical and practical aspects of Natural Language Processing (NLP). It covers the concepts essential to develop a thorough understanding of NLP and also delves into a detailed discussion on NLP based use-cases such as language translation, sentiment analysis, etc. Every module covers real-world examples. 14 | 15 | This book covers the following exciting features: 16 | * Understand how NLP powers modern applications 17 | * Explore key NLP techniques to build your natural language vocabulary 18 | * Transform text data into mathematical data structures and learn how to improve text mining models 19 | * Discover how various neural network architectures work with natural language data 20 | * Get the hang of building sophisticated text processing models using machine learning and deep learning 21 | * Check out state-of-the-art architectures that have revolutionized research in the NLP domain 22 | 23 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1838989595) today! 24 | 25 | https://www.packtpub.com/ 26 | 27 | ## Instructions and Navigations 28 | All of the code is organized into folders. 29 | 30 | The code will look like the following: 31 | ``` 32 | set_of_words = set() 33 | for word in preprocessed_corpus[0].split(): 34 | set_of_words.add(word) 35 | vocab = list(set_of_words) 36 | print(vocab) 37 | 38 | ``` 39 | 40 | **Following is what you need for this book:** 41 | This NLP Python book is for anyone looking to learn NLP’s theoretical and practical aspects alike. It starts with the basics and gradually covers advanced concepts to make it easy to follow for readers with varying levels of NLP proficiency. This comprehensive guide will help you develop a thorough understanding of the NLP methodologies for building linguistic applications; however, working knowledge of Python programming language and high school level mathematics is expected. 42 | 43 | With the following software and hardware list you can run all code files present in the book (Chapter 2-11). 44 | 45 | ### Software and Hardware List 46 | 47 | | Chapter | Software required | OS required | 48 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------| 49 | | 2 - 11 | Jupyter Notebook/Google Colab, Python 3.x, GPU (preferred) | Windows, macOS X, and Linux (Any) | 50 | 51 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781838989590_ColorImages.pdf). 
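The vocabulary-building snippet shown above under *Instructions and Navigations* is the manual counterpart of what the Chapter 4 and Chapter 7 notebooks do with scikit-learn's `CountVectorizer`. As a minimal sketch (the two-document `preprocessed_corpus` below is a made-up placeholder, not data from the book), the same idea looks like this:

```python
# Minimal sketch: build a vocabulary and bag-of-words counts for a toy corpus.
# `preprocessed_corpus` is a hypothetical stand-in, not data from the book.
from sklearn.feature_extraction.text import CountVectorizer

preprocessed_corpus = [
    "good case excellent value",
    "tied to charger for conversations lasting more than 45 minutes",
]

vectorizer = CountVectorizer()                 # tokenizes and learns the vocabulary
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

print(sorted(vectorizer.vocabulary_))          # learned vocabulary terms
print(bow_matrix.toarray())                    # one count vector per document
```

Each row of the printed matrix counts how often every vocabulary term appears in the corresponding document.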
52 | 53 | ## Errata 54 | 55 | * Page 167: 56 | ``` 57 | from sklearn.naive_bayes import MultinomialNaive Bayes 58 | clf = MultinomialNaive Bayes() 59 | ``` 60 | _should be_ 61 | 62 | ``` 63 | from sklearn.naive_bayes import MultinomialNB 64 | clf = MultinomialNB() 65 | ``` 66 | 67 | 68 | ### Related products 69 | * Natural Language Processing with Python Quick Start Guide [[Packt]](https://www.packtpub.com/in/big-data-and-business-intelligence/natural-language-processing-python-quick-start-guide?utm_source=github&utm_medium=repository&utm_campaign=9781789130386) [[Amazon]](https://www.amazon.com/Natural-Language-Processing-Python-Quick/dp/1789130387) 70 | 71 | * Deep Learning for Natural Language Processing [[Packt]](https://www.packtpub.com/in/big-data-and-business-intelligence/deep-learning-natural-language-processing?utm_source=github&utm_medium=repository&utm_campaign=9781838550295) [[Amazon]](https://www.amazon.com/Deep-Learning-Natural-Language-Processing-ebook/dp/B07MZ3Q921) 72 | 73 | ## Get to Know the Authors 74 | **Aman Kedia** 75 | is a data enthusiast and lifelong learner. He is an avid believer in Artificial Intelligence (AI) and the algorithms supporting it. He has worked on state-of-the-art problems in Natural Language Processing (NLP), encompassing resume matching and digital assistants, among others. He has worked at Oracle and SAP, trying to solve problems leveraging advancements in AI. He has four published research papers in the domain of AI. 76 | 77 | **Mayank Rasu** 78 | has more than 12 years of global experience as a data scientist and quantitative analyst in the investment banking industry. He has worked at the intersection of finance and technology and has developed and deployed AI-based applications within the finance domain. His experience includes building sentiment analyzers, robotics, and deep learning-based document review, among many other areas. 79 | 80 | ### Suggestions and Feedback 81 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 82 | ### Download a free PDF 83 | 84 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
85 | https://packt.link/free-ebook/9781838989590
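### Reloading the saved Keras models (illustrative)

The `Output Files` folders under Chapter08-Chapter10 store each trained network as a pair of artifacts: the architecture as JSON and the learned weights as HDF5. The sketch below shows one common way to restore such a pair with Keras; the file names are the Chapter 8 artifacts from this repository, while the compile settings are assumptions rather than code taken from the book.

```python
# Illustrative sketch (not code from the book): restore a saved Keras model from
# its architecture JSON plus HDF5 weights, as shipped in the Chapter08-Chapter10
# "Output Files" folders. File names below are the Chapter 8 artifacts; the
# compile settings are assumptions.
from keras.models import model_from_json

with open("question_classification_model.json") as f:
    model = model_from_json(f.read())                        # rebuild the architecture

model.load_weights("question_classification_weights.h5")     # restore trained weights

model.compile(loss="categorical_crossentropy",               # assumed loss for the 6-class softmax output
              optimizer="adam",
              metrics=["accuracy"])
model.summary()
```

The same pattern applies to the Chapter 9 sarcasm-detection CNN and the Chapter 10 text-generation LSTM; only the file names (and an appropriate loss) change.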

--------------------------------------------------------------------------------