├── 10_nltk.chat.ipynb
├── 1_NLP Intro.ipynb
├── 2_NLTK.ipynb
├── 4_Word Embedding_Part_1.ipynb
├── 4_Word Embedding_Part_2.ipynb
├── 5_Gensim.ipynb
├── 6_Stock Price webscrape.ipynb
├── 6_Web scraper_2.ipynb
├── 6_Webscraper_1.ipynb
├── 7_Spam Classification.ipynb
├── Audio_to_Text.ipynb
├── IMDB-Dataset 1.zip
├── IMDB-Dataset2.zip
├── README.md
├── Sentiment Analysis 1.ipynb
├── Speech_to_text.ipynb
├── Textblob_.ipynb
├── email_messages.csv.zip
├── emotions.txt
└── spam.csv
/10_nltk.chat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from nltk.chat.util import Chat,reflections"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/plain": [
20 | "{'i am': 'you are',\n",
21 | " 'i was': 'you were',\n",
22 | " 'i': 'you',\n",
23 | " \"i'm\": 'you are',\n",
24 | " \"i'd\": 'you would',\n",
25 | " \"i've\": 'you have',\n",
26 | " \"i'll\": 'you will',\n",
27 | " 'my': 'your',\n",
28 | " 'you are': 'I am',\n",
29 | " 'you were': 'I was',\n",
30 | " \"you've\": 'I have',\n",
31 | " \"you'll\": 'I will',\n",
32 | " 'your': 'my',\n",
33 | " 'yours': 'mine',\n",
34 | " 'you': 'me',\n",
35 | " 'me': 'you'}"
36 | ]
37 | },
38 | "execution_count": 2,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "# Default reflections. You can create your own reflections\n",
45 | "reflections"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "nltk.chat.util.Chat"
57 | ]
58 | },
59 | "execution_count": 3,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "Chat"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 24,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "#Pairs is a list of patterns and responses.\n",
75 | "pairs = [\n",
76 | " [\n",
77 | " r\"(.*)my name is (.*)\",\n",
78 | " [\"Hello %2, How are you today ?\",]\n",
79 | " ],\n",
80 | " [\n",
 81 | " r\"(.*)help(.*)\",\n", 
82 | " [\"I can help you \",]\n",
83 | " ],\n",
84 | " [\n",
85 | " r\"(.*) your name ?\",\n",
86 | " [\"My name is Shankar and I'm a chatbot .\",]\n",
87 | " ],\n",
88 | " [\n",
89 | " r\"how are you (.*) ?\",\n",
90 | " [\"I'm doing very well\", \"i am great !\",\"I am very cool!\"]\n",
91 | " ],\n",
92 | " [\n",
93 | " r\"sorry (.*)\",\n",
 94 | " [\"It's alright\",\"It's OK, never mind that\",]\n", 
95 | " ],\n",
96 | " [\n",
97 | " r\"i am (good|well|okay|ok)\",\n",
98 | " [\"Nice to hear that\",\"Alright, great !\",]\n",
99 | " ],\n",
100 | " [\n",
101 | " r\"(hi|hey|hello|hola|holla)(.*)\",\n",
102 | " [\"Hello\", \"Hey there\",]\n",
103 | " ],\n",
104 | " [\n",
105 | " r\"what (.*) want ?\",\n",
106 | " [\"I want to learn Natural Language Processing\",]\n",
107 | " \n",
108 | " ],\n",
109 | " [\n",
110 | " r\"(.*)created(.*)\",\n",
111 | " [\"Shankar created me using Python's NLTK library \",\"top secret ;)\",]\n",
112 | " ],\n",
113 | " [\n",
114 | " r\"(.*) (location|city) ?\",\n",
115 | " ['Bengaluru, India',]\n",
116 | " ],\n",
117 | " [\n",
118 | " r\"(.*) raining in (.*)\",\n",
119 | " [\"No rain in the past 4 days here in %2\",\"In %2 there is a 50% chance of rain\",]\n",
120 | " ],\n",
121 | " [\n",
122 | " r\"how (.*) health (.*)\",\n",
123 | " [\"Health is very important, but I am a computer, so I don't need to worry about my health \",]\n",
124 | " ],\n",
125 | " [\n",
126 | " r\"(.*)(sports|game|sport)(.*)\",\n",
127 | " [\"I'm a very big fan of Football\",]\n",
128 | " ],\n",
129 | " [\n",
130 | " r\"who (.*) (player|striker|forward)?\",\n",
131 | " [\"Harry Kane\"]\n",
132 | " ],\n",
133 | " [\n",
134 | " r\"quit\",\n",
135 | " [\"Bye for now. See you soon :) \",\"It was nice talking to you. See you soon :)\"]\n",
136 | " ],\n",
137 | " [\n",
138 | " r\"(.*)\",\n",
 139 | " ['I did not find an answer, please visit http://google.com']\n", 
140 | " ],\n",
141 | "]"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 25,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "Hi, I'm Shankar and I like to chat\n",
154 | "Please type lowercase English language to start a conversation. Type quit to leave \n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "# default message at the start of chat\n",
160 | "print(\"Hi, I'm Shankar and I like to chat\\nPlease type lowercase English language to start a conversation. Type quit to leave \")\n",
161 | "# Create Chat Bot\n",
162 | "chat = Chat(pairs, reflections)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 27,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "name": "stdout",
172 | "output_type": "stream",
173 | "text": [
174 | ">who created this?\n",
175 | "top secret ;)\n",
176 | ">quit\n",
177 | "Bye for now. See you soon :) \n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "chat.converse()"
183 | ]
184 | },
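 { "cell_type": "markdown", "metadata": {}, "source": [ "Besides the interactive `converse()` loop above, the `Chat` object also has a `respond()` method that returns a single reply as a string, which is handy for trying out patterns without typing into the prompt. A minimal sketch, assuming the `chat` object created above; the input strings are only illustrative:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get single replies programmatically instead of using the interactive converse() loop\n", "print(chat.respond(\"my name is ram\")) # matched by the '(.*)my name is (.*)' pattern\n", "print(chat.respond(\"what do you want ?\")) # matched by the 'what (.*) want ?' pattern" ] }, 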
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": []
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": []
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": []
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": []
212 | }
213 | ],
214 | "metadata": {
215 | "hide_input": false,
216 | "kernelspec": {
217 | "display_name": "Python 3",
218 | "language": "python",
219 | "name": "python3"
220 | },
221 | "language_info": {
222 | "codemirror_mode": {
223 | "name": "ipython",
224 | "version": 3
225 | },
226 | "file_extension": ".py",
227 | "mimetype": "text/x-python",
228 | "name": "python",
229 | "nbconvert_exporter": "python",
230 | "pygments_lexer": "ipython3",
231 | "version": "3.6.5"
232 | }
233 | },
234 | "nbformat": 4,
235 | "nbformat_minor": 2
236 | }
237 |
--------------------------------------------------------------------------------
/1_NLP Intro.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# What is NLP?\n",
8 | "\n",
9 | "NLP is a part of Artificial Intelligence, developed for the machine to understand human language. The ultimate goal of NLP is to read, understand and make valuable conclusion of human language. It is a very tough job to do as human language has a lot of variation in terms of language, pronunciation etc. Although, in recent times there has been a major breakthrough in the field of NLP. \n",
10 | "\n",
11 | "Siri and Alexa are one such example of uses of NLP.\n",
12 | "\n",
13 | "\n",
14 | "We will use NLP for text analytics.\n",
15 | "\n",
16 | "\n",
17 | "There many libraries available for NLP in python. we will focus on NLTK in the later part :\n",
18 | "\n",
19 | "* Natural Languange Tool Kit (NLTK)\n",
20 | "* Spacy"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Five main Component of Natural Language processing are:\n",
28 | "\n",
29 | "**Lexical Analysis** − It involves identifying and analyzing the structure of words. Lexicon of a language means the collection of words and phrases in a language. Lexical analysis is dividing the whole chunk of txt into paragraphs, sentences, and words.\n",
30 | "\n",
31 | "**Syntactic Analysis (Parsing)** − It involves analysis of words in the sentence for grammar and arranging words in a manner that shows the relationship among the words. The sentence such as “The movie went to see a family” is rejected by English syntactic analyzer.\n",
32 | "\n",
33 | "**Semantic Analysis** − It draws the exact meaning or the dictionary meaning from the text. The text is checked for meaningfulness. It is done by mapping syntactic structures and objects in the task domain. The semantic analyzer disregards sentence such as “Hot Ice Cream”.\n",
34 | "\n",
35 | "**Discourse Integration** − The meaning of any sentence depends upon the meaning of the sentence just before it. In addition, it also brings about the meaning of immediately succeeding sentence.\n",
36 | "\n",
37 | "**Pragmatic Analysis** − During this, what was said is re-interpreted on what it actually meant. It involves deriving those aspects of language which require real world knowledge"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Advantages of NLP\n",
45 | "- Users can ask questions about any subject and get a direct response within seconds.\n",
46 | "- NLP system provides answers to the questions in natural language\n",
47 | "- NLP system offers exact answers to the questions, no unnecessary or unwanted information\n",
48 | "- The accuracy of the answers increases with the amount of relevant information provided in the question.\n",
49 | "- NLP process helps computers communicate with humans in their language and scales other language-related tasks\n",
50 | "- Allows you to perform more language-based data compares to a human being without fatigue and in an unbiased and consistent way.\n",
51 | "- Structuring a highly unstructured data source\n",
52 | "\n",
53 | "## Disadvantages of NLP\n",
54 | "- Complex Query Language- the system may not be able to provide the correct answer it the question that is poorly worded or ambiguous.\n",
55 | "- The system is built for a single and specific task only; it is unable to adapt to new domains and problems because of limited functions.\n",
56 | "- NLP system doesn't have a user interface which lacks features that allow users to further interact with the system"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": []
65 | }
66 | ],
67 | "metadata": {
68 | "kernelspec": {
69 | "display_name": "Python 3",
70 | "language": "python",
71 | "name": "python3"
72 | },
73 | "language_info": {
74 | "codemirror_mode": {
75 | "name": "ipython",
76 | "version": 3
77 | },
78 | "file_extension": ".py",
79 | "mimetype": "text/x-python",
80 | "name": "python",
81 | "nbconvert_exporter": "python",
82 | "pygments_lexer": "ipython3",
83 | "version": "3.6.5"
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 2
88 | }
89 |
--------------------------------------------------------------------------------
/2_NLTK.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Natural Language Tool Kit (NLTK)"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "#!pip install nltk"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "### Tokenization\n",
24 | "Tokenization is a process of breaking down a given paragraph of text into a list of sentence or words. When paragraph is broken down into list of sentences, it is called sentence tokenization.\n",
25 | "Similarly, if the sentences are further broken down into list of words, it is known as Word tokenization.\n",
26 | "\n",
27 | "Let's understand this with an example. Below is a given paragraph, let's see how tokenization works on it:\n",
28 | "\n",
29 | "\"India (Hindi: Bhārat), officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.\"\n",
30 | "\n",
31 | "* Sentence Tokenize:\n",
32 | "\n",
33 | " ['India (Hindi: Bhārat), officially the Republic of India, is a country in South Asia.',\n",
34 | " \n",
35 | " 'It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world.',\n",
36 | " \n",
37 | " 'Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.',\n",
38 | " \n",
39 | " 'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.']\n",
40 | "\n",
41 | "\n",
42 | "* Word tokenize:\n",
43 | "\n",
44 | "['India', '(', 'Hindi', ':', 'Bhārat', ')', ',', 'officially', 'the', 'Republic', 'of', 'India', ',', 'is', 'a', 'country', 'in', 'South',\n",
45 | " 'Asia', '.', 'It', 'is', 'the', 'seventh-largest', 'country', 'by', 'area', ',', 'the', 'second-most', 'populous', 'country', ',', 'and',\n",
46 | " 'the', 'most', 'populous', 'democracy', 'in', 'the', 'world', '.', 'Bounded', 'by', 'the', 'Indian',\n",
47 | " 'Ocean',\n",
48 | " 'on',\n",
49 | " 'the',\n",
50 | " 'south',\n",
51 | " ',',\n",
52 | " 'the',\n",
53 | " 'Arabian',\n",
54 | " 'Sea',\n",
55 | " 'on',\n",
56 | " 'the',\n",
57 | " 'southwest',\n",
58 | " ',',\n",
59 | " 'and',\n",
60 | " 'the',\n",
61 | " 'Bay',\n",
62 | " 'of',\n",
63 | " 'Bengal',\n",
64 | " 'on',\n",
65 | " 'the',\n",
66 | " 'southeast',\n",
67 | " ',',\n",
68 | " 'it',\n",
69 | " 'shares',\n",
70 | " 'land',\n",
71 | " 'borders',\n",
72 | " 'with',\n",
73 | " 'Pakistan',\n",
74 | " 'to',\n",
75 | " 'the',\n",
76 | " 'west',\n",
77 | " ';',\n",
78 | " 'China',\n",
79 | " ',',\n",
80 | " 'Nepal',\n",
81 | " ',',\n",
82 | " 'and',\n",
83 | " 'Bhutan',\n",
84 | " 'to',\n",
85 | " 'the',\n",
86 | " 'north',\n",
87 | " ';',\n",
88 | " 'and',\n",
89 | " 'Bangladesh',\n",
90 | " 'and',\n",
91 | " 'Myanmar',\n",
92 | " 'to',\n",
93 | " 'the',\n",
94 | " 'east',\n",
95 | " '.',\n",
96 | " 'In',\n",
97 | " 'the',\n",
98 | " 'Indian',\n",
99 | " 'Ocean',\n",
100 | " ',',\n",
101 | " 'India',\n",
102 | " 'is',\n",
103 | " 'in',\n",
104 | " 'the',\n",
105 | " 'vicinity',\n",
106 | " 'of',\n",
107 | " 'Sri',\n",
108 | " 'Lanka',\n",
109 | " 'and',\n",
110 | " 'the',\n",
111 | " 'Maldives',\n",
112 | " ';',\n",
113 | " 'its',\n",
114 | " 'Andaman',\n",
115 | " 'and',\n",
116 | " 'Nicobar',\n",
117 | " 'Islands',\n",
118 | " 'share',\n",
119 | " 'a',\n",
120 | " 'maritime',\n",
121 | " 'border',\n",
122 | " 'with',\n",
123 | " 'Thailand',\n",
124 | " 'and',\n",
125 | " 'Indonesia',\n",
126 | " '.']\n",
127 | "\n",
128 | "\n",
129 | "Hope this example clears up the concept of tokenization. We will understand why it is done when we will dive into text analysis.\n",
130 | "\n",
131 | "\n",
132 | "\n",
133 | "\n",
134 | "#### Word Tokenization \n",
135 | "\n",
136 | "- Example \n",
137 | "- 'I am learning Natural Language processing' is being converted into \n",
138 | "['I', 'am', 'learning', 'Natural', 'Language', 'processing']\n",
139 | "\n",
140 | "#### Sentence Tokenization\n",
141 | "\n",
142 | "- Example \n",
143 | "- \"God is Great! I won a lottery.\" is bening converted into [\"God is Great!\", \"I won a lottery\"]"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "##### Word Tokenization"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "import nltk\n",
160 | "from nltk.tokenize import word_tokenize\n"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "# Define your text or import from other source\n",
170 | "text = 'I am learning Natural Language processing'"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "scrolled": true
178 | },
179 | "outputs": [],
180 | "source": [
181 | "# tokenizing\n",
182 | "print (word_tokenize(text))"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "##### Sentence Tokenization"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "from nltk.tokenize import sent_tokenize\n",
199 | "#text = \"Good. Morning! How are you?.\"\n",
200 | "#text = \"Good Morning! How are you\"\n",
201 | "text = \" Our Company annual growth rate is 25.50%. Good job Mr.Bajaj\""
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "scrolled": true
209 | },
210 | "outputs": [],
211 | "source": [
212 | "print(sent_tokenize(text))"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "# Regular Expressions"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "from nltk.tokenize import regexp_tokenize"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "# Sample text\n",
238 | "text = \"NLP is fun and Can deal with texts and sounds, but can't deal with images. We have session at 11AM!.We can earn lot of $\""
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "# Print word by word that contains all small case and starts from samll a to z\n",
248 | "regexp_tokenize(text,\"[a-z]+\")"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "# extra quote ' get's you word like can't, don't\n",
258 | "regexp_tokenize(text,\"[a-z']+\")"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "# # Print word by word that contains all caps and from caps A to Z\n",
268 | "regexp_tokenize(text,\"[A-Z]+\")"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {},
275 | "outputs": [],
276 | "source": [
277 | "# Everything in one line\n",
278 | "regexp_tokenize(text,\"[\\a-z']+\")"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "# Anything starts with caret is not equal. \n",
288 | "regexp_tokenize(text,\"[^a-z']+\")"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "# Only numbers\n",
298 | "regexp_tokenize(text,\"[0-9]+\")"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "# Without numbers\n",
308 | "regexp_tokenize(text,\"[^0-9]+\")"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "regexp_tokenize(text,\"[$]\")"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "## Stop Words\n",
325 | "Stop words are such words which are very common in occurrence such as ‘a’,’an’,’the’, ‘at’ etc. We ignore such words during the preprocessing part since they do not give any important information and would just take additional space. We can make our custom list of stop words as well if we want. Different libraries have different stop words list. Let’s see the stop words list for NLTK:"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "# import stopwords\n",
335 | "from nltk.corpus import stopwords\n",
336 | "\n",
337 | "#If you get error download stopwords as below\n",
338 | "#nltk.download('stopwords')"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "stop_words = stopwords.words('english')"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": null,
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "print (stop_words)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "len(stop_words)"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "import nltk\n",
375 | "# Another way\n",
376 | "stopset = set(nltk.corpus.stopwords.words('english'))"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "len(stopset)"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "# Adding custome stopwords\n",
395 | "stopset.update(('new','old'))\n",
396 | "len(stopset)"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "stopset"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {},
411 | "source": [
412 | "#### Similar to the stopwords, we can also ignore punctuations in our sentences."
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "# import string\n",
422 | "import string"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "string.punctuation"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "# Remove stopwords and punctuations from the above set os texts\n",
441 | "\n",
442 | "import nltk\n",
443 | "import string\n",
444 | "from nltk.corpus import stopwords\n",
445 | "\n",
446 | "stop_words = stopwords.words('english')\n",
447 | "punct =string.punctuation"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "stop_words"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": null,
462 | "metadata": {},
463 | "outputs": [],
464 | "source": [
465 | "# Lets check those punctuations\n",
466 | "punct"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {},
473 | "outputs": [],
474 | "source": [
475 | "#our text\n",
476 | "text = \"India (Hindi: Bhārat), officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.\"\n",
477 | "\n",
478 | "# Empty list to load clean data\n",
479 | "cleaned_text = []\n",
480 | "\n",
481 | "for word in nltk.word_tokenize(text):\n",
482 | " if word not in punct:\n",
483 | " if word not in stop_words:\n",
484 | " cleaned_text.append(word)\n",
485 | " \n",
486 | "print ('Original Length == >', len(text))\n",
487 | "print ('length of cleaned text ==>', len(cleaned_text))\n",
488 | "print ('\\n',cleaned_text )\n"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {},
494 | "source": [
495 | "## Cases"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {},
502 | "outputs": [],
503 | "source": [
504 | "# Convert into Lower case\n",
505 | "print (text.lower())\n",
506 | "\n",
507 | "# Convert into Upper case\n",
508 | "print ('\\n',text.upper())"
509 | ]
510 | },
511 | {
512 | "cell_type": "markdown",
513 | "metadata": {},
514 | "source": [
515 | "## Stemming\n",
516 | "\n",
517 | "- Stemming means mapping a group of words to the same stem by removing prefixes or suffixes without giving any value to the “grammatical meaning” of the stem formed after the process.\n",
518 | "\n",
519 | "e.g.\n",
520 | "\n",
521 | "computation --> comput\n",
522 | "\n",
523 | "computer --> comput \n",
524 | "\n",
525 | "hobbies --> hobbi\n",
526 | "\n",
527 | "We can see that stemming tries to bring the word back to their base word but the base word may or may not have correct grammatical meanings.\n",
528 | "\n",
529 | "There are few types of stemmers available in NLTK package. We will talk about popular below two\n",
530 | "- 1)\tPorter Stemmer \n",
531 | "- 2)\tLancaster Stemmer\n",
532 | "\n",
533 | "Let’s see how to use both of them: \n"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 1,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": [
542 | "import nltk\n",
543 | "\n",
544 | "from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 2,
550 | "metadata": {},
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "Porter stemmer\n",
557 | "hobbi\n",
558 | "hobbi\n",
559 | "comput\n",
560 | "comput\n",
561 | "**************************\n",
562 | "lancaster stemmer\n",
563 | "hobby\n",
564 | "hobby\n",
565 | "comput\n",
566 | "comput\n",
567 | "**************************\n"
568 | ]
569 | }
570 | ],
571 | "source": [
572 | "lancaster = LancasterStemmer()\n",
573 | "\n",
574 | "porter = PorterStemmer()\n",
575 | "\n",
576 | "Snowball = SnowballStemmer('english')\n",
577 | "\n",
578 | "\n",
579 | "print('Porter stemmer')\n",
580 | "print(porter.stem(\"hobby\"))\n",
581 | "print(porter.stem(\"hobbies\"))\n",
582 | "print(porter.stem(\"computer\"))\n",
583 | "print(porter.stem(\"computation\"))\n",
584 | "print(\"**************************\") \n",
585 | "\n",
586 | "print('lancaster stemmer')\n",
587 | "print(lancaster.stem(\"hobby\"))\n",
588 | "print(lancaster.stem(\"hobbies\"))\n",
589 | "print(lancaster.stem(\"computer\"))\n",
590 | "print(lancaster.stem(\"computation\"))\n",
591 | "print(\"**************************\") "
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 3,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "data": {
601 | "text/plain": [
602 | "['I',\n",
603 | " 'was',\n",
604 | " 'going',\n",
605 | " 'to',\n",
606 | " 'the',\n",
607 | " 'office',\n",
608 | " 'on',\n",
609 | " 'my',\n",
610 | " 'bike',\n",
611 | " 'when',\n",
612 | " 'i',\n",
613 | " 'saw',\n",
614 | " 'a',\n",
615 | " 'car',\n",
616 | " 'passing',\n",
617 | " 'by',\n",
618 | " 'hit',\n",
619 | " 'the',\n",
620 | " 'tree',\n",
621 | " '.']"
622 | ]
623 | },
624 | "execution_count": 3,
625 | "metadata": {},
626 | "output_type": "execute_result"
627 | }
628 | ],
629 | "source": [
630 | "# Lets see with a new sentence\n",
631 | "\n",
632 | "sentence = \"I was going to the office on my bike when i saw a car passing by hit the tree.\"\n",
633 | "\n",
634 | "token = list(nltk.word_tokenize(sentence))\n",
635 | "\n",
636 | "token"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 4,
642 | "metadata": {},
643 | "outputs": [
644 | {
645 | "data": {
646 | "text/plain": [
647 | "'I was going to the office on my bike when i saw a car passing by hit the tree.'"
648 | ]
649 | },
650 | "execution_count": 4,
651 | "metadata": {},
652 | "output_type": "execute_result"
653 | }
654 | ],
655 | "source": [
656 | "sentence "
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 5,
662 | "metadata": {
663 | "scrolled": true
664 | },
665 | "outputs": [
666 | {
667 | "name": "stdout",
668 | "output_type": "stream",
669 | "text": [
670 | "i was go to the offic on my bike when i saw a car pass by hit the tree .\n",
671 | "i was going to the off on my bik when i saw a car pass by hit the tre .\n",
672 | "I wa go to the offic on my bike when i saw a car pass by hit the tree .\n"
673 | ]
674 | }
675 | ],
676 | "source": [
677 | "for stemmer in (Snowball, lancaster, porter):\n",
678 | " stemm = [stemmer.stem(t) for t in token]\n",
679 | " print(\" \".join(stemm))"
680 | ]
681 | },
682 | {
683 | "cell_type": "markdown",
684 | "metadata": {},
685 | "source": [
686 | "lancaster algorithm is faster than porter but it is more complex.\n",
687 | "Porter stemmer is the oldest algorithm present and was the most popular to use.\n",
688 | "\n",
689 | "Snowball stemmer, also known as porter2, is the updated version of the Porter stemmer and is currently the most popular stemming algorithm.\n",
690 | "\n",
691 | "Snowball stemmer is available for multiple languages as well."
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": 6,
697 | "metadata": {},
698 | "outputs": [
699 | {
700 | "name": "stdout",
701 | "output_type": "stream",
702 | "text": [
703 | "run\n",
704 | "run\n",
705 | "ran\n"
706 | ]
707 | }
708 | ],
709 | "source": [
710 | "# one more simple example of porter\n",
711 | "print(porter.stem(\"running\"))\n",
712 | "print(porter.stem(\"runs\"))\n",
713 | "print(porter.stem(\"ran\"))"
714 | ]
715 | },
716 | {
717 | "cell_type": "markdown",
718 | "metadata": {},
719 | "source": [
720 | "### Lemmatization\n",
721 | "\n",
722 | "\n",
723 | "Lemmatization also does the same thing as stemming and try to bring a word to its base form, but unlike stemming it do keep in account the actual meaning of the base word i.e. the base word belongs to any specific language. The ‘base word’ is known as ‘Lemma’.\n",
724 | "\n",
725 | "We use WordNet Lemmatizer for Lemmatization in nltk."
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": 7,
731 | "metadata": {},
732 | "outputs": [],
733 | "source": [
734 | "from nltk.stem import WordNetLemmatizer"
735 | ]
736 | },
737 | {
738 | "cell_type": "code",
739 | "execution_count": 8,
740 | "metadata": {},
741 | "outputs": [
742 | {
743 | "name": "stdout",
744 | "output_type": "stream",
745 | "text": [
746 | "running\n",
747 | "run\n",
748 | "ran\n"
749 | ]
750 | }
751 | ],
752 | "source": [
753 | "lemma = WordNetLemmatizer()\n",
754 | "\n",
755 | "print(lemma.lemmatize('running'))\n",
756 | "print(lemma.lemmatize('runs'))\n",
757 | "print(lemma.lemmatize('ran'))"
758 | ]
759 | },
760 | {
761 | "cell_type": "markdown",
762 | "metadata": {},
763 | "source": [
764 | "Here, we can see the lemma has changed for the words with same base. \n",
765 | "\n",
766 | "This is because, we haven’t given any context to the Lemmatizer.\n",
767 | "\n",
768 | "Generally, it is given by passing the POS tags for the words in a sentence.\n",
769 | "e.g.\n"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": 9,
775 | "metadata": {},
776 | "outputs": [
777 | {
778 | "name": "stdout",
779 | "output_type": "stream",
780 | "text": [
781 | "run\n",
782 | "run\n",
783 | "run\n"
784 | ]
785 | }
786 | ],
787 | "source": [
788 | "print(lemma.lemmatize('running',pos='v'))\n",
789 | "print(lemma.lemmatize('runs',pos='v'))\n",
790 | "print(lemma.lemmatize('ran',pos='v'))"
791 | ]
792 | },
793 | {
794 | "cell_type": "markdown",
795 | "metadata": {},
796 | "source": [
797 | "Lemmatizer is very complex and takes a lot of time to calculate.\n",
798 | "\n",
799 | "So, it should only when the real meaning of words or the context is necessary for processing, else stemming should be preferred.\n",
800 | "\n",
801 | "It completely depends on the type of problem you are trying to solve."
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": 10,
807 | "metadata": {},
808 | "outputs": [
809 | {
810 | "name": "stdout",
811 | "output_type": "stream",
812 | "text": [
813 | "Stemming for Bring is bring\n",
814 | "Stemming for King is king\n",
815 | "Stemming for Going is go\n",
816 | "Stemming for Anything is anyth\n",
817 | "Stemming for Sing is sing\n",
818 | "Stemming for Ring is ring\n",
819 | "Stemming for Nothing is noth\n",
820 | "Stemming for Thing is thing\n"
821 | ]
822 | }
823 | ],
824 | "source": [
825 | "# One more example using both stemming and lemma\n",
826 | "#text = \"studies studying cries cry\"\n",
827 | "text = \"Bring King Going Anything Sing Ring Nothing Thing\"\n",
828 | "\n",
829 | "# Stemming\n",
830 | "import nltk\n",
831 | "from nltk.stem.porter import PorterStemmer\n",
832 | "porter_stemmer = PorterStemmer()\n",
833 | "\n",
834 | "tokenization = nltk.word_tokenize(text)\n",
835 | "\n",
836 | "for w in tokenization:\n",
837 | " print (\"Stemming for {} is {}\".format(w, porter_stemmer.stem(w))) "
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": 11,
843 | "metadata": {},
844 | "outputs": [
845 | {
846 | "name": "stdout",
847 | "output_type": "stream",
848 | "text": [
849 | "Lemma for Bring is Bring\n",
850 | "Lemma for King is King\n",
851 | "Lemma for Going is Going\n",
852 | "Lemma for Anything is Anything\n",
853 | "Lemma for Sing is Sing\n",
854 | "Lemma for Ring is Ring\n",
855 | "Lemma for Nothing is Nothing\n",
856 | "Lemma for Thing is Thing\n"
857 | ]
858 | }
859 | ],
860 | "source": [
861 | "# Lemma \n",
862 | "\n",
863 | "from nltk.stem import WordNetLemmatizer\n",
864 | "wordnet_lemmatizer = WordNetLemmatizer()\n",
865 | "\n",
866 | "tokenization = nltk.word_tokenize(text)\n",
867 | "\n",
868 | "for w in tokenization:\n",
869 | " print(\"Lemma for {} is {}\".format(w, wordnet_lemmatizer.lemmatize(w))) "
870 | ]
871 | },
872 | {
873 | "cell_type": "code",
874 | "execution_count": null,
875 | "metadata": {},
876 | "outputs": [],
877 | "source": [
878 | "# Lemmatization takes more time to give the response"
879 | ]
880 | },
881 | {
882 | "cell_type": "markdown",
883 | "metadata": {},
884 | "source": [
885 | "# Wordnet\n",
886 | "\n",
887 | "- Wordnet is an NLTK corpus reader, a lexical database for English. It can be used to find the meaning of words, synonym or antonym. One can define it as a semantically oriented dictionary of English."
888 | ]
889 | },
890 | {
891 | "cell_type": "code",
892 | "execution_count": 12,
893 | "metadata": {},
894 | "outputs": [],
895 | "source": [
896 | "from nltk.corpus import wordnet"
897 | ]
898 | },
899 | {
900 | "cell_type": "code",
901 | "execution_count": 13,
902 | "metadata": {},
903 | "outputs": [
904 | {
905 | "name": "stdout",
906 | "output_type": "stream",
907 | "text": [
908 | "Synonyms => {'combat-ready', 'active_voice', 'active', 'alive', 'fighting', 'dynamic', 'active_agent', 'participating'}\n",
909 | "Antonyms => {'passive_voice', 'passive', 'dormant', 'stative', 'inactive', 'quiet', 'extinct'}\n"
910 | ]
911 | }
912 | ],
913 | "source": [
914 | "# Lets find sysnonms and antonyms using python code\n",
915 | "from nltk.corpus import wordnet\n",
916 | "\n",
917 | "synonyms = []\n",
918 | "antonyms = []\n",
919 | "\n",
920 | "for syn in wordnet.synsets(\"active\"):\n",
921 | " for l in syn.lemmas():\n",
922 | " synonyms.append(l.name())\n",
923 | " if l.antonyms():\n",
924 | " antonyms.append(l.antonyms()[0].name())\n",
925 | "\n",
926 | "print('Synonyms =>',set(synonyms))\n",
927 | "print('Antonyms =>',set(antonyms))"
928 | ]
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": null,
933 | "metadata": {},
934 | "outputs": [],
935 | "source": []
936 | }
937 | ],
938 | "metadata": {
939 | "hide_input": false,
940 | "kernelspec": {
941 | "display_name": "Python 3",
942 | "language": "python",
943 | "name": "python3"
944 | },
945 | "language_info": {
946 | "codemirror_mode": {
947 | "name": "ipython",
948 | "version": 3
949 | },
950 | "file_extension": ".py",
951 | "mimetype": "text/x-python",
952 | "name": "python",
953 | "nbconvert_exporter": "python",
954 | "pygments_lexer": "ipython3",
955 | "version": "3.6.5"
956 | }
957 | },
958 | "nbformat": 4,
959 | "nbformat_minor": 2
960 | }
961 |
--------------------------------------------------------------------------------
/4_Word Embedding_Part_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Word Embedding\n",
8 | "\n",
9 | "- Word vectorization is the process of mapping words to a set of real numbers or vectors. This is done to process the given words using machine learning techniques and extract relevant information from them such that it can be used in further predicting words. Vectorization is done by comparing a given word to the corpus(collection) of the available words. \n",
10 | "\n",
11 | "- It is language modeling and feature learning technique. Word embedding is a way to perform mapping using a neural network. \n",
12 | "- There are various word embedding models available such as word2vec (Google), Glove (Stanford) and fastest (Facebook).\n",
13 | "- We are going to discuss about word2vec in this tutorial\n",
14 | "\n",
15 | "## Where it is being used\n",
16 | "- `Compute similar words: `Word embedding is used to suggest similar words to the word being subjected to the prediction model. Along with that it also suggests dissimilar words, as well as most common words.\n",
17 | "\n",
18 | "- `Create a group of related words:` It is used for semantic grouping which will group things of similar characteristic together and dissimilar far away.\n",
19 | "\n",
20 | "- `Feature for text classification: `Text is mapped into arrays of vectors which is fed to the model for training as well as prediction. Text-based classifier models cannot be trained on the string, so this will convert the text into machine trainable form. Further its features of building semantic help in text-based classification.\n",
21 | "\n",
22 | "- `Document clustering` is another application where word embedding is widely used\n",
23 | "\n",
24 | "- `Natural language processing:` There are many applications where word embedding is useful and wins over feature extraction phases such as parts of speech tagging, sentimental analysis, and syntactic analysis.\n",
25 | "Now we have got some knowledge of word embedding. Some light is also thrown on different models to implement word embedding. "
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Count Vectorizer\n",
33 | "\n",
34 | "Count vectorizer uses two of the following models as the base to vectorize the given words on the basis of frequency of words.\n",
35 | "\n",
36 | "#### Bag of Words Model\n",
37 | "BOW model is used in NLP to represent the given text/sentence/document as a collection (bag) of words without giving any importance to grammar or the occurrence order of the words. It keeps the account of frequency of the words in the text document, which can be used as features in many models.\n",
38 | "\n",
39 | "Let’s understand this with an example:\n",
40 | "\n",
41 | "Text1 = “I went to have a cup of coffee but I ended up having lunch with her.”\n",
42 | "\n",
43 | "Text2 = “I don’t understand, what is the problem here?”\n",
44 | "\n",
45 | "BOW1 = {I :2, went : 1, to : 1,have : 1, a : 1, cup: 1, of :1, coffee : 1, but :1, ended : 1, up :1,having : 1, with :1, her :1}\n",
46 | "\n",
47 | "BOW2 = {I : 1, don’t : 1, understand:1, what : 1 , is :1, the : 1, problem : 1, here : 1}\n",
48 | "\n",
49 | "BOW is mainly used for feature selection. The above dictionary is converted as a list with only the frequency terms there and on that basis, weights are given to the most occurring terms. But the “stop words” are the most frequent words that appears in raw document. Thus, having a word with high frequency count doesn’t mean that the word is as important. To resolve this problem, “Tf-idf” was introduced. We will discuss about it later.\n",
50 | "\n",
51 | "#### n-gram model\n",
52 | "\n",
53 | "As discussed in bag of words model, BOW model doesn’t keep the sequence of words in a given text, only the frequency of words matters. It doesn’t take into account the context of the given sentence, or care for grammatical rules such as verb is following a proper noun in the given text.n-gram model is used in such cases to keep the context of the given text intact. N-gram is the sequence of n words from a given text/document.\n",
54 | "\n",
55 | "When, n= 1, we call it a “unigram”.\n",
56 | "\n",
57 | " n=2, it is called a “bigram”. \n",
58 | " \n",
59 | " n=3, it is called a “trigram”.\n",
60 | "And so on.\n",
61 | "\n",
62 | "Let’s understand this with an example:\n",
63 | "\n",
64 | "Text1 = “I went to have a cup of coffee but I ended up having lunch with her.”\n",
65 | "\n",
66 | "* Unigram \n",
67 | "\n",
68 | "[I, went, to, have, a, cup, of, coffee, but, I, ended, up, having, lunch, with, her]\n",
69 | "\n",
70 | "* Bi-gram\n",
71 | "\n",
72 | "[I went], [went to],[to have],[have a],[a cup],[cup of],[of coffee],[coffee but],[but I],[I ended],[ended up],\n",
73 | "[up having],[having lunch],[lunch with],[with her]\n",
74 | "\n",
75 | "* Tri-gram\n",
76 | "\n",
77 | "[I went to], [went to have], [to have a], [have a cup],[ a cup of], [cup of coffee],[ of coffee but],[ coffee but I],[but I ended],[I ended up],[ended up having],[up having lunch],[having lunch with],[lunch with her].\n",
78 | "\n",
79 | "Note: We can clearly see that BOW model is nothing but n-gram model when n=1.\n",
80 | "\n",
81 | "Skip-grams\n",
82 | "\n",
83 | "Skip grams are type of n-grams where the words are not necessarily in the same order as are in the given text i.e. some words can be skipped. \n",
84 | "Example:\n",
85 | "\n",
86 | "Text2 = “I don’t understand, what is the problem here?”\n",
87 | "\n",
88 | "1-skip 2-grams (we have to make 2-gram while skipping 1 word)\n",
89 | "\n",
90 | "[I understand, don’t what, understand is, what the, is problem, the here].\n",
91 | "\n",
92 | "\n",
93 | "Let's see the implementation of Count vectorizer in python:"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "##### Bag Of Words"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 1,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "bag of words : ['an', 'bag', 'example', 'is', 'of', 'this', 'words']\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "# Example of single document\n",
118 | "# Without stopwords\n",
119 | "\n",
120 | "from sklearn.feature_extraction.text import CountVectorizer \n",
121 | "\n",
122 | "from nltk.corpus import stopwords\n",
123 | "import pandas as pd\n",
124 | "\n",
125 | "# Single document (',' seperates each document)\n",
126 | "string = [\"This is an example of bag of words!\"]\n",
127 | "\n",
128 | "# This step will convert text into tokens \n",
129 | "vect1 = CountVectorizer()\n",
130 | "\n",
131 | "vect1.fit_transform(string)\n",
132 | "print(\"bag of words :\",vect1.get_feature_names())"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 2,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "data": {
142 | "text/plain": [
143 | "{'this': 5, 'is': 3, 'an': 0, 'example': 2, 'of': 4, 'bag': 1, 'words': 6}"
144 | ]
145 | },
146 | "execution_count": 2,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "vect1.vocabulary_"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "###### Fit and transform and predict if the word is present or not\n",
160 | "- This is widely used for document or subject classification"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 3,
166 | "metadata": {
167 | "scrolled": true
168 | },
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/plain": [
173 | "CountVectorizer()"
174 | ]
175 | },
176 | "execution_count": 3,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "\n",
183 | "c_vect = CountVectorizer()\n",
184 | "\n",
185 | "c_vect.fit(string)"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 7,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "Text Present at [[0 0 0 1 1 0 1]]\n",
198 | "original indexes ['an', 'bag', 'example', 'is', 'of', 'this', 'words']\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "string2 = ['Lets understand is of words']\n",
204 | "\n",
205 | "c_new_vect = c_vect.transform(string2)\n",
206 | "\n",
207 | "print (\"Text Present at \",c_new_vect.toarray())\n",
208 | "\n",
209 | "# Compare with the indexes\n",
210 | "print (\"original indexes\", vect1.get_feature_names() )"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 10,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',\n",
223 | " 'ourselves', 'you', \"you're\", \"you've\", \"you'll\",\n",
224 | " \"you'd\", 'your', 'yours', 'yourself', 'yourselves',\n",
225 | " 'he', 'him', 'his', 'himself', 'she', \"she's\",\n",
226 | " 'her', 'hers', 'herself', 'it', \"it's\", 'its',\n",
227 | " 'itself', ...])\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "## Bag Of Words using stopwords (you can avoid writing extra steps to remove stopwords)\n",
233 | "\n",
234 | "stpwords = stopwords.words('english')\n",
235 | "\n",
236 | "string = [\"This is an example of bag of words!\"]\n",
237 | "vect1 = CountVectorizer(stop_words=stpwords)\n",
238 | "print (vect1)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 9,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "name": "stdout",
248 | "output_type": "stream",
249 | "text": [
250 | "bag of words : ['bag', 'example', 'words']\n",
251 | "vocab : {'example': 1, 'bag': 0, 'words': 2}\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "vect1.fit_transform(string)\n",
257 | "print(\"bag of words :\",vect1.get_feature_names())\n",
258 | "print(\"vocab :\",vect1.vocabulary_)"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 11,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "# Using function\n",
268 | "def text_matrix(message, countvect):\n",
269 | " terms_doc = countvect.fit_transform(message)\n",
270 | " return pd.DataFrame(terms_doc.toarray(),columns=countvect.get_feature_names())"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 17,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "name": "stdout",
280 | "output_type": "stream",
281 | "text": [
282 | "Below metrix is the Bag of Words approach\n"
283 | ]
284 | },
285 | {
286 | "data": {
287 | "text/html": [
288 | "
\n",
289 | "\n",
302 | "
\n",
303 | " \n",
304 | " \n",
305 | " | \n",
306 | " are | \n",
307 | " but | \n",
308 | " for | \n",
309 | " get | \n",
310 | " in | \n",
311 | " is | \n",
312 | " language | \n",
313 | " making | \n",
314 | " mantra | \n",
315 | " natural | \n",
316 | " only | \n",
317 | " practice | \n",
318 | " processing | \n",
319 | " progress | \n",
320 | " slowly | \n",
321 | " success | \n",
322 | " the | \n",
323 | " there | \n",
324 | " we | \n",
325 | " will | \n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " \n",
330 | " 0 | \n",
331 | " 1 | \n",
332 | " 0 | \n",
333 | " 0 | \n",
334 | " 0 | \n",
335 | " 1 | \n",
336 | " 0 | \n",
337 | " 1 | \n",
338 | " 1 | \n",
339 | " 0 | \n",
340 | " 1 | \n",
341 | " 0 | \n",
342 | " 0 | \n",
343 | " 1 | \n",
344 | " 1 | \n",
345 | " 1 | \n",
346 | " 0 | \n",
347 | " 0 | \n",
348 | " 0 | \n",
349 | " 1 | \n",
350 | " 0 | \n",
351 | "
\n",
352 | " \n",
353 | " 1 | \n",
354 | " 0 | \n",
355 | " 0 | \n",
356 | " 0 | \n",
357 | " 1 | \n",
358 | " 0 | \n",
359 | " 0 | \n",
360 | " 0 | \n",
361 | " 0 | \n",
362 | " 0 | \n",
363 | " 0 | \n",
364 | " 0 | \n",
365 | " 0 | \n",
366 | " 0 | \n",
367 | " 0 | \n",
368 | " 0 | \n",
369 | " 0 | \n",
370 | " 0 | \n",
371 | " 1 | \n",
372 | " 1 | \n",
373 | " 1 | \n",
374 | "
\n",
375 | " \n",
376 | " 2 | \n",
377 | " 0 | \n",
378 | " 1 | \n",
379 | " 1 | \n",
380 | " 0 | \n",
381 | " 0 | \n",
382 | " 1 | \n",
383 | " 0 | \n",
384 | " 0 | \n",
385 | " 1 | \n",
386 | " 0 | \n",
387 | " 1 | \n",
388 | " 1 | \n",
389 | " 0 | \n",
390 | " 0 | \n",
391 | " 0 | \n",
392 | " 1 | \n",
393 | " 1 | \n",
394 | " 0 | \n",
395 | " 0 | \n",
396 | " 0 | \n",
397 | "
\n",
398 | " \n",
399 | "
\n",
400 | "
"
401 | ],
402 | "text/plain": [
403 | " are but for get in is language making mantra natural only \\\n",
404 | "0 1 0 0 0 1 0 1 1 0 1 0 \n",
405 | "1 0 0 0 1 0 0 0 0 0 0 0 \n",
406 | "2 0 1 1 0 0 1 0 0 1 0 1 \n",
407 | "\n",
408 | " practice processing progress slowly success the there we will \n",
409 | "0 0 1 1 1 0 0 0 1 0 \n",
410 | "1 0 0 0 0 0 0 1 1 1 \n",
411 | "2 1 0 0 0 1 1 0 0 0 "
412 | ]
413 | },
414 | "execution_count": 17,
415 | "metadata": {},
416 | "output_type": "execute_result"
417 | }
418 | ],
419 | "source": [
420 | "message = ['We are slowly making progress in Natural Language Processing',\n",
421 | " \"We will get there\", \"But practice is the only mantra for success\" ]\n",
422 | "\n",
423 | "c_vect = CountVectorizer()\n",
424 | "print (\"Below metrix is the Bag of Words approach\")\n",
425 | "text_matrix(message, c_vect)"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "##### n-grams"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 26,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "name": "stdout",
442 | "output_type": "stream",
443 | "text": [
444 | "1-gram : ['an', 'example', 'gram', 'is', 'of', 'this']\n",
445 | "2-gram : ['an example', 'an example of', 'example of', 'example of gram', 'is an', 'is an example', 'of gram', 'this is', 'this is an']\n",
446 | "3-gram : ['an example of', 'example of gram', 'is an example', 'this is an']\n",
447 | "4-gram : ['an example of gram', 'is an example of', 'this is an example']\n"
448 | ]
449 | }
450 | ],
451 | "source": [
452 | "\n",
453 | "from sklearn.feature_extraction.text import CountVectorizer \n",
454 | "\n",
455 | "from nltk.tokenize import word_tokenize\n",
456 | "\n",
457 | "string = [\"This is an example of gram!\"]\n",
458 | "\n",
459 | "vect1 = CountVectorizer(ngram_range=(1,1))\n",
460 | "\n",
461 | "vect1.fit_transform(string)\n",
462 | "\n",
463 | "vect2 = CountVectorizer(ngram_range=(2,3))\n",
464 | "vect2.fit_transform(string)\n",
465 | "\n",
466 | "vect3 = CountVectorizer(ngram_range=(3,3))\n",
467 | "vect3.fit_transform(string)\n",
468 | "\n",
469 | "vect4 = CountVectorizer(ngram_range=(4,4))\n",
470 | "vect4.fit_transform(string)\n",
471 | "\n",
472 | "\n",
473 | "print(\"1-gram :\",vect1.get_feature_names())\n",
474 | "\n",
475 | "print(\"2-gram :\",vect2.get_feature_names())\n",
476 | "print(\"3-gram :\",vect3.get_feature_names())\n",
477 | "print(\"4-gram :\",vect4.get_feature_names())"
478 | ]
479 | },
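 { "cell_type": "markdown", "metadata": {}, "source": [ "The skip-grams described earlier can be generated with NLTK's `skipgrams` helper from `nltk.util`. Note that `skipgrams(tokens, 2, 1)` allows *up to* one skip, so the ordinary adjacent bigrams are included as well; this is a small sketch on Text2, and the exact pairs depend on how the text is tokenized:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.util import skipgrams\n", "from nltk.tokenize import word_tokenize\n", "\n", "text2 = \"I don't understand, what is the problem here?\"\n", "tokens = word_tokenize(text2)\n", "\n", "# 1-skip bigrams: word pairs that may skip at most one word in between\n", "print(list(skipgrams(tokens, 2, 1)))" ] }, 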
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {},
483 | "source": [
484 | "## Tf-Idf (Term frequency–Inverse document frequency)\n",
485 | "\n",
486 | "Wikipedia definition: ” Tf-Idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The Tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. Tf–idf is one of the most popular term-weighting schemes today.”\n",
487 | "\n",
488 | "\n",
489 | "### Term Frequency\n",
490 | "It is simply the frequency in which a word appears in a document in comparison to the total number words in the document. Mathematically given as:\n",
491 | "\n",
492 | "Term frequency = (Number of times a word appears in the document) / (Total number of words in the document)\n",
493 | "\n",
494 | "### Inverse Document Frequency\n",
495 | "\n",
496 | "Term frequency has a disadvantage that it tends to give higher weights to words with higher frequency. In such cases words like ‘a’, ‘the’, ‘in’, ’of’ etc. appears more in the documents than other regular words. Thus, more important words are wrongly given lower weights as their frequency is less.\n",
497 | " To tackle this problem IDF was introduced. IDF decreases the weights of such high frequency terms and increases the weight of terms with rare occurrence. Mathematically it is given as:\n",
498 | " \n",
499 | "Inverse Document Frequency = log [(Number of documents)/(Number of documents the word appears in)] \n",
500 | "\n",
501 | "**note: [log has base 2]**\n",
502 | "\n",
503 | "\n",
504 | "*Tf-Idf Score = Term frequency * Inverse Document Frequency*"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 27,
510 | "metadata": {},
511 | "outputs": [
512 | {
513 | "data": {
514 | "text/plain": [
515 | "1.584962500721156"
516 | ]
517 | },
518 | "execution_count": 27,
519 | "metadata": {},
520 | "output_type": "execute_result"
521 | }
522 | ],
523 | "source": [
524 | "import numpy as np \n",
525 | "np.log2(3)"
526 | ]
527 | },
528 | {
529 | "cell_type": "markdown",
530 | "metadata": {},
531 | "source": [
532 | "Let's understand more with an example:\n",
533 | "\n",
534 | "Doc 1: This is an example.\n",
535 | "\n",
536 | "Doc 2: We will see how it works.\n",
537 | "\n",
538 | "Doc 3: IDF can be confusing.\n",
539 | "\n",
540 | "\n",
541 | "\n",
542 | "
\n",
543 | "\n",
544 | "In the above table, we have calculated the term frequency as well as inverse document frequency of each of the words present in the 3 documents given. \n",
545 | "\n",
546 | "Now, let's calculate the tf-idf score for each term. Since, words of one document is not present in another document, we will have tf-idf value 0 for them e.g. words of doc1 will have 0 tf-idf for doc2 and doc3.\n",
547 | "\n",
548 | "
\n",
549 | "\n",
550 | "Great, hope this example must have cleared how Tf-Idf works. \n",
551 | "\n",
552 | "let's see the python implementation for it:"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 28,
558 | "metadata": {},
559 | "outputs": [
560 | {
561 | "data": {
562 | "text/html": [
563 | "\n",
564 | "\n",
577 | "
\n",
578 | " \n",
579 | " \n",
580 | " | \n",
581 | " an | \n",
582 | " be | \n",
583 | " can | \n",
584 | " confusing | \n",
585 | " example | \n",
586 | " how | \n",
587 | " idf | \n",
588 | " is | \n",
589 | " it | \n",
590 | " see | \n",
591 | " this | \n",
592 | " we | \n",
593 | " will | \n",
594 | " works | \n",
595 | "
\n",
596 | " \n",
597 | " \n",
598 | " \n",
599 | " 0 | \n",
600 | " 0.5 | \n",
601 | " 0.0 | \n",
602 | " 0.0 | \n",
603 | " 0.0 | \n",
604 | " 0.5 | \n",
605 | " 0.000000 | \n",
606 | " 0.0 | \n",
607 | " 0.5 | \n",
608 | " 0.000000 | \n",
609 | " 0.000000 | \n",
610 | " 0.5 | \n",
611 | " 0.000000 | \n",
612 | " 0.000000 | \n",
613 | " 0.000000 | \n",
614 | "
\n",
615 | " \n",
616 | " 1 | \n",
617 | " 0.0 | \n",
618 | " 0.0 | \n",
619 | " 0.0 | \n",
620 | " 0.0 | \n",
621 | " 0.0 | \n",
622 | " 0.408248 | \n",
623 | " 0.0 | \n",
624 | " 0.0 | \n",
625 | " 0.408248 | \n",
626 | " 0.408248 | \n",
627 | " 0.0 | \n",
628 | " 0.408248 | \n",
629 | " 0.408248 | \n",
630 | " 0.408248 | \n",
631 | "
\n",
632 | " \n",
633 | " 2 | \n",
634 | " 0.0 | \n",
635 | " 0.5 | \n",
636 | " 0.5 | \n",
637 | " 0.5 | \n",
638 | " 0.0 | \n",
639 | " 0.000000 | \n",
640 | " 0.5 | \n",
641 | " 0.0 | \n",
642 | " 0.000000 | \n",
643 | " 0.000000 | \n",
644 | " 0.0 | \n",
645 | " 0.000000 | \n",
646 | " 0.000000 | \n",
647 | " 0.000000 | \n",
648 | "
\n",
649 | " \n",
650 | "
\n",
651 | "
"
652 | ],
653 | "text/plain": [
654 | " an be can confusing example how idf is it see \\\n",
655 | "0 0.5 0.0 0.0 0.0 0.5 0.000000 0.0 0.5 0.000000 0.000000 \n",
656 | "1 0.0 0.0 0.0 0.0 0.0 0.408248 0.0 0.0 0.408248 0.408248 \n",
657 | "2 0.0 0.5 0.5 0.5 0.0 0.000000 0.5 0.0 0.000000 0.000000 \n",
658 | "\n",
659 | " this we will works \n",
660 | "0 0.5 0.000000 0.000000 0.000000 \n",
661 | "1 0.0 0.408248 0.408248 0.408248 \n",
662 | "2 0.0 0.000000 0.000000 0.000000 "
663 | ]
664 | },
665 | "execution_count": 28,
666 | "metadata": {},
667 | "output_type": "execute_result"
668 | }
669 | ],
670 | "source": [
671 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
672 | "\n",
673 | "import pandas as pd\n",
674 | "\n",
675 | "tfid = TfidfVectorizer(smooth_idf=False)\n",
676 | "\n",
677 | "doc= [\"This is an example.\",\"We will see how it works.\",\"IDF can be confusing\"]\n",
678 | "\n",
679 | "doc_vector = tfid.fit_transform(doc)\n",
680 | "\n",
681 | "#print(tfid.get_feature_names())\n",
682 | "\n",
683 | "df= pd.DataFrame(doc_vector.todense(),columns=tfid.get_feature_names())\n",
684 | "df\n",
685 | "#print(doc_vector)"
686 | ]
687 | },
688 | {
689 | "cell_type": "markdown",
690 | "metadata": {},
691 | "source": [
692 | "Although we are using the same data set as we used while doing manual calculation, the results are different than what we got.\n",
693 | "\n",
694 | "This is because sklearn package have some modifications done to the formula to avoid complete avoidance of terms as well as to counter dividing by zero. \n",
695 | "\n",
696 | "You can know more by going through the official doumentation of sklearn as below:\n",
697 | "\n",
698 | "\"\n",
699 | " *The formula that is used to compute the tf-idf for a term t of a document d\n",
700 | " in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is\n",
701 | " computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where\n",
702 | " n is the total number of documents in the document set and df(t) is the\n",
703 | " document frequency of t; the document frequency is the number of documents\n",
704 | " in the document set that contain the term t. The effect of adding \"1\" to\n",
705 | " the idf in the equation above is that terms with zero idf, i.e., terms\n",
706 | " that occur in all documents in a training set, will not be entirely\n",
707 | " ignored.\n",
708 | " (Note that the idf formula above differs from the standard textbook\n",
709 | " notation that defines the idf as\n",
710 | " idf(t) = log [ n / (df(t) + 1) ]).\n",
711 | " If ``smooth_idf=True`` (the default), the constant \"1\" is added to the\n",
712 | " numerator and denominator of the idf as if an extra document was seen\n",
713 | " containing every term in the collection exactly once, which prevents\n",
714 | " zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.\"*"
715 | ]
716 | },
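
As a quick sanity check of the two formulas quoted above, the sketch below (added for illustration, not an original cell) fits TfidfVectorizer on the same three documents with and without smooth_idf and compares the learned idf_ values to the documented expressions; in this toy corpus every word occurs in exactly one document, so df(t) = 1:

```python
# Compare sklearn's learned idf_ values with the documented formulas (illustrative sketch)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

doc = ["This is an example.", "We will see how it works.", "IDF can be confusing"]
n, df_t = len(doc), 1  # 3 documents; every word occurs in exactly one of them

for smooth in (False, True):
    vec = TfidfVectorizer(smooth_idf=smooth).fit(doc)
    if smooth:
        expected = np.log((1 + n) / (1 + df_t)) + 1  # idf(t) = ln[(1+n)/(1+df)] + 1
    else:
        expected = np.log(n / df_t) + 1              # idf(t) = ln[n/df] + 1
    print(f"smooth_idf={smooth}: learned idf_ = {vec.idf_[0]:.4f}, formula = {expected:.4f}")
```
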
717 | {
718 | "cell_type": "code",
719 | "execution_count": 30,
720 | "metadata": {},
721 | "outputs": [],
722 | "source": [
723 | "# Using function\n",
724 | "def text_matrix(message, countvect):\n",
725 | " terms_doc = countvect.fit_transform(message)\n",
726 | " return pd.DataFrame(terms_doc.toarray(),columns=countvect.get_feature_names())"
727 | ]
728 | },
729 | {
730 | "cell_type": "code",
731 | "execution_count": 33,
732 | "metadata": {},
733 | "outputs": [
734 | {
735 | "data": {
736 | "text/html": [
737 | "\n",
738 | "\n",
751 | "
\n",
752 | " \n",
753 | " \n",
754 | " | \n",
755 | " are | \n",
756 | " cases | \n",
757 | " covid | \n",
758 | " dropping | \n",
759 | " is | \n",
760 | " nothing | \n",
761 | " that | \n",
762 | " what | \n",
763 | "
\n",
764 | " \n",
765 | " \n",
766 | " \n",
767 | " 0 | \n",
768 | " 0.000000 | \n",
769 | " 0.000000 | \n",
770 | " 0.592567 | \n",
771 | " 0.000000 | \n",
772 | " 0.381519 | \n",
773 | " 0.000000 | \n",
774 | " 0.501651 | \n",
775 | " 0.501651 | \n",
776 | "
\n",
777 | " \n",
778 | " 1 | \n",
779 | " 0.000000 | \n",
780 | " 0.000000 | \n",
781 | " 0.425441 | \n",
782 | " 0.000000 | \n",
783 | " 0.547832 | \n",
784 | " 0.720333 | \n",
785 | " 0.000000 | \n",
786 | " 0.000000 | \n",
787 | "
\n",
788 | " \n",
789 | " 2 | \n",
790 | " 0.546454 | \n",
791 | " 0.546454 | \n",
792 | " 0.322745 | \n",
793 | " 0.546454 | \n",
794 | " 0.000000 | \n",
795 | " 0.000000 | \n",
796 | " 0.000000 | \n",
797 | " 0.000000 | \n",
798 | "
\n",
799 | " \n",
800 | "
\n",
801 | "
"
802 | ],
803 | "text/plain": [
804 | " are cases covid dropping is nothing that \\\n",
805 | "0 0.000000 0.000000 0.592567 0.000000 0.381519 0.000000 0.501651 \n",
806 | "1 0.000000 0.000000 0.425441 0.000000 0.547832 0.720333 0.000000 \n",
807 | "2 0.546454 0.546454 0.322745 0.546454 0.000000 0.000000 0.000000 \n",
808 | "\n",
809 | " what \n",
810 | "0 0.501651 \n",
811 | "1 0.000000 \n",
812 | "2 0.000000 "
813 | ]
814 | },
815 | "execution_count": 33,
816 | "metadata": {},
817 | "output_type": "execute_result"
818 | }
819 | ],
820 | "source": [
821 | "# We will call the function created earlier\n",
822 | "feb_message = [\"What is that covid covid\",\n",
823 | " \"covid is nothing\",\n",
824 | " \"covid cases are dropping\"]\n",
825 | "\n",
826 | "\n",
827 | "tf = TfidfVectorizer()\n",
828 | "\n",
829 | "#Passing same message with TF-IDF\n",
830 | "\n",
831 | "text_matrix(feb_message,tf)"
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "execution_count": 34,
837 | "metadata": {},
838 | "outputs": [
839 | {
840 | "data": {
841 | "text/html": [
842 | "\n",
843 | "\n",
856 | "
\n",
857 | " \n",
858 | " \n",
859 | " | \n",
860 | " bad | \n",
861 | " covid | \n",
862 | " is | \n",
863 | " that | \n",
864 | " what | \n",
865 | "
\n",
866 | " \n",
867 | " \n",
868 | " \n",
869 | " 0 | \n",
870 | " 0.000000 | \n",
871 | " 0.668501 | \n",
872 | " 0.334251 | \n",
873 | " 0.469778 | \n",
874 | " 0.469778 | \n",
875 | "
\n",
876 | " \n",
877 | " 1 | \n",
878 | " 0.704909 | \n",
879 | " 0.501549 | \n",
880 | " 0.501549 | \n",
881 | " 0.000000 | \n",
882 | " 0.000000 | \n",
883 | "
\n",
884 | " \n",
885 | "
\n",
886 | "
"
887 | ],
888 | "text/plain": [
889 | " bad covid is that what\n",
890 | "0 0.000000 0.668501 0.334251 0.469778 0.469778\n",
891 | "1 0.704909 0.501549 0.501549 0.000000 0.000000"
892 | ]
893 | },
894 | "execution_count": 34,
895 | "metadata": {},
896 | "output_type": "execute_result"
897 | }
898 | ],
899 | "source": [
900 | "# Importance of Covid increased based on the occurance and total document\n",
901 | "jul_message = [\"What is that covid covid\",\n",
902 | " \"covid is bad\"]\n",
903 | "\n",
904 | "text_matrix(jul_message,tf)"
905 | ]
906 | },
907 | {
908 | "cell_type": "markdown",
909 | "metadata": {},
910 | "source": [
911 | "#### Countvectorizer,TF-IDF,n-grams"
912 | ]
913 | },
914 | {
915 | "cell_type": "code",
916 | "execution_count": 42,
917 | "metadata": {},
918 | "outputs": [],
919 | "source": [
920 | "\n",
921 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
922 | "\n",
923 | "\n",
924 | "arr = [\"Car was cleaned by Jack \",\n",
925 | " \"Jack was cleaned by Car.\"]\n"
926 | ]
927 | },
928 | {
929 | "cell_type": "code",
930 | "execution_count": 43,
931 | "metadata": {},
932 | "outputs": [
933 | {
934 | "name": "stdout",
935 | "output_type": "stream",
936 | "text": [
937 | "Feature Names \n",
938 | " ['by car', 'by jack', 'car was', 'cleaned by', 'jack was', 'was cleaned']\n",
939 | "Array \n",
940 | " [[0 1 1 1 0 1]\n",
941 | " [1 0 0 1 1 1]]\n"
942 | ]
943 | }
944 | ],
945 | "source": [
946 | "# If you want to take into account just term frequencies:\n",
947 | "vectorizer = CountVectorizer(ngram_range=(2,2))\n",
948 | "\n",
949 | "# The ngram range specifies your ngram configuration.\n",
950 | "\n",
951 | "X = vectorizer.fit_transform(arr)\n",
952 | "\n",
953 | "# Testing the ngram generation:\n",
954 | "print(\"Feature Names \\n\",vectorizer.get_feature_names())\n",
955 | "\n",
956 | "\n",
957 | "print('Array \\n',X.toarray())\n",
958 | "\n"
959 | ]
960 | },
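
For comparison, here is a short sketch (added for illustration; it reuses CountVectorizer and arr from the cell above) showing that ngram_range=(1, 2) keeps both unigrams and bigrams instead of bigrams only:

```python
# ngram_range=(1, 2) extracts unigrams AND bigrams (illustrative sketch)
vectorizer_uni_bi = CountVectorizer(ngram_range=(1, 2))
X_uni_bi = vectorizer_uni_bi.fit_transform(arr)

print(vectorizer_uni_bi.get_feature_names())  # unigrams such as 'car', 'jack' plus bigrams such as 'car was'
print(X_uni_bi.toarray())
```
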
961 | {
962 | "cell_type": "code",
963 | "execution_count": 44,
964 | "metadata": {},
965 | "outputs": [
966 | {
967 | "name": "stdout",
968 | "output_type": "stream",
969 | "text": [
970 | "[[0. 0.57615236 0.57615236 0.40993715 0. 0.40993715]\n",
971 | " [0.57615236 0. 0. 0.40993715 0.57615236 0.40993715]]\n"
972 | ]
973 | }
974 | ],
975 | "source": [
976 | "# And now testing TFIDF vectorizer:\n",
977 | "# You can still specify n-grams here.\n",
978 | "\n",
979 | "vectorizer = TfidfVectorizer(ngram_range=(2, 2))\n",
980 | "X = vectorizer.fit_transform(arr)\n",
981 | "\n",
982 | "\n",
983 | "# Testing the TFIDF value + ngrams:\n",
984 | "print(X.toarray())"
985 | ]
986 | },
987 | {
988 | "cell_type": "code",
989 | "execution_count": 46,
990 | "metadata": {},
991 | "outputs": [
992 | {
993 | "name": "stdout",
994 | "output_type": "stream",
995 | "text": [
996 | "[[0. 0.57615236 0.57615236 0.40993715 0. 0.40993715]\n",
997 | " [0.57615236 0. 0. 0.40993715 0.57615236 0.40993715]]\n"
998 | ]
999 | }
1000 | ],
1001 | "source": [
1002 | "# Testing TFIDF vectorizer without normalization:\n",
1003 | "# You can still specify n-grams here.\n",
1004 | "\n",
1005 | "vectorizer = TfidfVectorizer(ngram_range=(2, 2), norm=None)\n",
1006 | "\n",
1007 | "X = vectorizer.fit_transform(arr)\n",
1008 | "\n",
1009 | "# Testing TFIDF value before normalization:\n",
1010 | "print(X.toarray())"
1011 | ]
1012 | },
1013 | {
1014 | "cell_type": "code",
1015 | "execution_count": null,
1016 | "metadata": {},
1017 | "outputs": [],
1018 | "source": []
1019 | }
1020 | ],
1021 | "metadata": {
1022 | "hide_input": false,
1023 | "kernelspec": {
1024 | "display_name": "Python 3",
1025 | "language": "python",
1026 | "name": "python3"
1027 | },
1028 | "language_info": {
1029 | "codemirror_mode": {
1030 | "name": "ipython",
1031 | "version": 3
1032 | },
1033 | "file_extension": ".py",
1034 | "mimetype": "text/x-python",
1035 | "name": "python",
1036 | "nbconvert_exporter": "python",
1037 | "pygments_lexer": "ipython3",
1038 | "version": "3.6.5"
1039 | }
1040 | },
1041 | "nbformat": 4,
1042 | "nbformat_minor": 2
1043 | }
1044 |
--------------------------------------------------------------------------------
/5_Gensim.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Gensim"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### What is Gensim?\n",
15 | "- `Gensim = “Generate Similar”` is a popular open source natural language processing (NLP) library used for unsupervised topic modeling.\n",
16 | "\n",
17 | "\n",
18 | "- Let's understand important terms and its meaning.\n",
19 | "\n",
20 | "\n",
21 | " Document: some text.\n",
22 | "\n",
23 | " Corpus: a collection of documents.\n",
24 | "\n",
25 | " Vector: a mathematically convenient representation of a document.\n",
26 | "\n",
27 | " Model: an algorithm for transforming vectors from one representation to another."
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "##### Document: some text."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 13,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "document = \"Human machine interface for lab abc computer applications\""
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "##### Corpus: a collection of documents."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 14,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "text_corpus = [\n",
60 | " \"Human machine interface for lab abc computer applications\",\n",
61 | " \"A survey of user opinion of computer system response time\",\n",
62 | " \"The EPS user interface management system\",\n",
63 | " \"System and human system engineering testing of EPS\",\n",
64 | " \"Relation of user perceived response time to error measurement\",\n",
65 | " \"The generation of random binary unordered trees\",\n",
66 | " \"The intersection graph of paths in trees\",\n",
67 | " \"Graph minors IV Widths of trees and well quasi ordering\",\n",
68 | " \"Graph minors A survey\",\n",
69 | "]"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "##### Vector: a mathematically convenient representation of a document"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 15,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "import pprint"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 20,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "[['human', 'interface', 'computer'],\n",
98 | " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
99 | " ['eps', 'user', 'interface', 'system'],\n",
100 | " ['system', 'human', 'system', 'eps'],\n",
101 | " ['user', 'response', 'time'],\n",
102 | " ['trees'],\n",
103 | " ['graph', 'trees'],\n",
104 | " ['graph', 'minors', 'trees'],\n",
105 | " ['graph', 'minors', 'survey']]\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "# Create a set of frequent words\n",
111 | "\n",
112 | "stoplist = set('for a of the and to in'.split(' '))\n",
113 | "\n",
114 | "# Lowercase each document, split it by white space and filter out stopwords\n",
115 | "texts = [[word for word in document.lower().split() if word not in stoplist]\n",
116 | " for document in text_corpus]\n",
117 | "\n",
118 | "# Count word frequencies\n",
119 | "from collections import defaultdict\n",
120 | "\n",
121 | "frequency = defaultdict(int)\n",
122 | "\n",
123 | "for text in texts:\n",
124 | " for token in text:\n",
125 | " frequency[token] += 1\n",
126 | "\n",
127 | "# Only keep words that appear more than once\n",
128 | "processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]\n",
129 | "\n",
130 | "pprint.pprint(processed_corpus)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 21,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)\n"
143 | ]
144 | }
145 | ],
146 | "source": [
147 | "# Creating dictionaries, which helps during Topic modelling\n",
148 | "from gensim import corpora\n",
149 | "\n",
150 | "dictionary = corpora.Dictionary(processed_corpus)\n",
151 | "print(dictionary)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 22,
157 | "metadata": {
158 | "scrolled": true
159 | },
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "{'computer': 0,\n",
166 | " 'eps': 8,\n",
167 | " 'graph': 10,\n",
168 | " 'human': 1,\n",
169 | " 'interface': 2,\n",
170 | " 'minors': 11,\n",
171 | " 'response': 3,\n",
172 | " 'survey': 4,\n",
173 | " 'system': 5,\n",
174 | " 'time': 6,\n",
175 | " 'trees': 9,\n",
176 | " 'user': 7}\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "# Vectore representation \n",
182 | "pprint.pprint(dictionary.token2id)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "##### Model: an algorithm for transforming vectors from one representation to another."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 26,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "[(0, 1), (1, 1)]\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "# Always make a practice of testing small texts when trying something new\n",
207 | "# in each tuple below 1st occurance is ID and 2nd occurance is count\n",
208 | "\n",
209 | "new_doc = \"Human computer interaction\"\n",
210 | "new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
211 | "print(new_vec)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 27,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "[[(0, 1), (1, 1), (2, 1)],\n",
224 | " [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n",
225 | " [(2, 1), (5, 1), (7, 1), (8, 1)],\n",
226 | " [(1, 1), (5, 2), (8, 1)],\n",
227 | " [(3, 1), (6, 1), (7, 1)],\n",
228 | " [(9, 1)],\n",
229 | " [(9, 1), (10, 1)],\n",
230 | " [(9, 1), (10, 1), (11, 1)],\n",
231 | " [(4, 1), (10, 1), (11, 1)]]\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n",
237 | "pprint.pprint(bow_corpus)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 28,
243 | "metadata": {
244 | "scrolled": true
245 | },
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "[(5, 0.5898341626740045), (11, 0.8075244024440723)]\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "from gensim import models\n",
257 | "\n",
258 | "# train the model\n",
259 | "tfidf = models.TfidfModel(bow_corpus)\n",
260 | "\n",
261 | "# transform the \"system minors\" string\n",
262 | "words = \"system minors\".lower().split()\n",
263 | "print(tfidf[dictionary.doc2bow(words)])"
264 | ]
265 | },
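
To read that output more easily, the token ids can be mapped back to words through the dictionary. A small sketch added for illustration, assuming the cells above have been run:

```python
# Map the token ids in the tf-idf output back to words (illustrative sketch)
for token_id, weight in tfidf[dictionary.doc2bow(words)]:
    print(dictionary[token_id], round(weight, 3))
# 'system' appears in more documents than 'minors', so it receives the lower tf-idf weight
```
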
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "#### Open text file"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 31,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "name": "stdout",
280 | "output_type": "stream",
281 | "text": [
282 | "{'details': 0, 'files': 1, 'sample': 2, 'text': 3, 'how': 4, 'open': 5, 'to': 6, 'txt': 7, 'basic': 8, 'informations': 9, 'create': 10, 'download': 11, 'file': 12, 'of': 13, 'url': 14, 'mb': 15}\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "import gensim\n",
288 | "from gensim import corpora\n",
289 | "from pprint import pprint\n",
290 | "from gensim.utils import simple_preprocess,\n",
291 | "\n",
292 | "import os\n",
293 | "\n",
294 | "dict_STF = corpora.Dictionary(simple_preprocess(line, deacc =True) for line in open(r\"sample.txt\"))\n",
295 | "\n",
296 | "print(dict_STF.token2id)"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "!pip install gensim"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "### LDA (Latent Dirichlet allocation )\n",
313 | "\n",
314 | "It is one of the important module when it comes to Topic Modelling. We will work on this later, but lets understand what it does.\n",
315 | "\n",
316 | "\n",
317 | "Automatically extracting information about topics from large volume of texts in one of the primary applications of NLP (natural language processing). Large volume of texts could be feeds from hotel reviews, tweets, Facebook posts, feeds from any other social media channel, movie reviews, news stories, user feedbacks, e-mails etc.\n",
318 | "\n",
319 | "In this digital era, to know what people/customers are talking about, to understand their opinions, and their problems, can be highly valuable for businesses, political campaigns and administrators. But, is it possible to manually read through such large volumes of text and then extracting the information from topics?\n",
320 | "\n",
321 | "No, it’s not. It requires an automatic algorithm that can read through these large volume of text documents and automatically extract the required information/topics discussed from it.\n",
322 | "\n",
323 | "Role of LDA\n",
324 | "\n",
325 | "LDA’s approach to topic modeling is to classify text in a document to a particular topic. Modeled as Dirichlet distributions, LDA builds −\n",
326 | "\n",
327 | "- A topic per document model and\n",
328 | "- Words per topic model\n",
329 | "\n",
330 | "After providing the LDA topic model algorithm, in order to obtain a good composition of topic-keyword distribution, it re-arrange −\n",
331 | "\n",
332 | "- The topics distributions within the document and\n",
333 | "- Keywords distribution within the topics\n",
334 | "\n",
335 | "\n",
336 | "While processing, some of the assumptions made by LDA are −\n",
337 | "\n",
338 | "- Every document is modeled as multi-nominal distributions of topics.\n",
339 | "- Every topic is modeled as multi-nominal distributions of words.\n",
340 | "- We should have to choose the right corpus of data because LDA assumes that each chunk of text contains the related words.\n",
341 | "- LDA also assumes that the documents are produced from a mixture of topics.\n",
342 | "\n",
343 | "\n",
344 | "Note- **Will discuss more about this during topic modelling**"
345 | ]
346 | },
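
Even though the full topic-modelling walkthrough comes later, a minimal LDA run on the tiny corpus built earlier in this notebook looks roughly like the sketch below (added for illustration; num_topics=2, random_state=42 and passes=10 are arbitrary choices here, not values from the original material):

```python
# A minimal gensim LDA example on the bow_corpus and dictionary built above (illustrative sketch)
from gensim.models import LdaModel

lda = LdaModel(corpus=bow_corpus, id2word=dictionary,
               num_topics=2, random_state=42, passes=10)

# Each topic is a distribution over words ...
for topic_id, topic in lda.print_topics(num_words=4):
    print(topic_id, topic)

# ... and each document is a distribution over topics.
print(lda.get_document_topics(bow_corpus[0]))
```
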
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": []
353 | }
354 | ],
355 | "metadata": {
356 | "kernelspec": {
357 | "display_name": "Python 3",
358 | "language": "python",
359 | "name": "python3"
360 | },
361 | "language_info": {
362 | "codemirror_mode": {
363 | "name": "ipython",
364 | "version": 3
365 | },
366 | "file_extension": ".py",
367 | "mimetype": "text/x-python",
368 | "name": "python",
369 | "nbconvert_exporter": "python",
370 | "pygments_lexer": "ipython3",
371 | "version": "3.6.5"
372 | }
373 | },
374 | "nbformat": 4,
375 | "nbformat_minor": 2
376 | }
377 |
--------------------------------------------------------------------------------
/6_Stock Price webscrape.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Import libraries\n",
10 | "\n",
11 | "# requests needed to request urls\n",
12 | "import requests\n",
13 | "\n",
14 | "# bs4 needs to parse the data from url\n",
15 | "from bs4 import BeautifulSoup"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# define stock\n",
25 | "stock = 'AMZN'\n",
26 | "\n",
27 | "url = 'https://finance.yahoo.com/quote/'+str(stock)+'?p='+str(stock)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "'https://finance.yahoo.com/quote/AMZN?p=AMZN'"
39 | ]
40 | },
41 | "execution_count": 3,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "url"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 4,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/plain": [
58 | ""
59 | ]
60 | },
61 | "execution_count": 4,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "res = requests.get(url)\n",
68 | "res"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 5,
74 | "metadata": {
75 | "scrolled": true
76 | },
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "3,352.15\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "# Data parsing using soup\n",
88 | "soup = BeautifulSoup(res.text, \"html\")\n",
89 | "\n",
90 | "# Identify the div by inspect on webpage\n",
91 | "\n",
92 | "price = soup.find_all('div',{'class':\"My(6px) Pos(r) smartphone_Mt(6px)\"})[0].find('span').text\n",
93 | "\n",
94 | "print (price)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 32,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "# To capture live price\n",
104 | "def live_price():\n",
105 | " res = requests.get(url)\n",
106 | " soup = BeautifulSoup(res.text, \"html\")\n",
107 | " price = soup.find_all('div',{'class':\"My(6px) Pos(r) smartphone_Mt(6px)\"})[0].find('span').text\n",
108 | " return price"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 33,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "Current price of AMZN is => 3,367.88\n",
121 | "Current price of AMZN is => 3,367.88\n",
122 | "Current price of AMZN is => 3,367.88\n",
123 | "Current price of AMZN is => 3,367.54\n",
124 | "Current price of AMZN is => 3,367.54\n",
125 | "Current price of AMZN is => 3,367.54\n",
126 | "Current price of AMZN is => 3,367.54\n",
127 | "Current price of AMZN is => 3,367.54\n",
128 | "Current price of AMZN is => 3,367.54\n",
129 | "Current price of AMZN is => 3,367.54\n",
130 | "Current price of AMZN is => 3,367.54\n",
131 | "Current price of AMZN is => 3,367.44\n",
132 | "Current price of AMZN is => 3,367.80\n",
133 | "Current price of AMZN is => 3,367.80\n",
134 | "Current price of AMZN is => 3,367.80\n",
135 | "Current price of AMZN is => 3,367.80\n",
136 | "Current price of AMZN is => 3,367.80\n",
137 | "Current price of AMZN is => 3,367.80\n",
138 | "Current price of AMZN is => 3,367.74\n",
139 | "Current price of AMZN is => 3,367.74\n",
140 | "Current price of AMZN is => 3,367.74\n",
141 | "Current price of AMZN is => 3,367.74\n",
142 | "Current price of AMZN is => 3,367.74\n",
143 | "Current price of AMZN is => 3,367.74\n",
144 | "Current price of AMZN is => 3,367.74\n",
145 | "Current price of AMZN is => 3,367.74\n",
146 | "Current price of AMZN is => 3,365.54\n",
147 | "Current price of AMZN is => 3,365.54\n",
148 | "Current price of AMZN is => 3,363.90\n",
149 | "Current price of AMZN is => 3,363.26\n",
150 | "Current price of AMZN is => 3,363.26\n",
151 | "Current price of AMZN is => 3,364.62\n",
152 | "Current price of AMZN is => 3,364.62\n",
153 | "Current price of AMZN is => 3,364.62\n",
154 | "Current price of AMZN is => 3,364.62\n",
155 | "Current price of AMZN is => 3,364.62\n",
156 | "Current price of AMZN is => 3,364.74\n",
157 | "Current price of AMZN is => 3,364.74\n",
158 | "Current price of AMZN is => 3,364.74\n",
159 | "Current price of AMZN is => 3,364.74\n",
160 | "Current price of AMZN is => 3,364.74\n",
161 | "Current price of AMZN is => 3,364.74\n",
162 | "Current price of AMZN is => 3,364.74\n",
163 | "Current price of AMZN is => 3,364.74\n",
164 | "Current price of AMZN is => 3,365.58\n",
165 | "Current price of AMZN is => 3,365.58\n",
166 | "Current price of AMZN is => 3,365.51\n",
167 | "Current price of AMZN is => 3,365.51\n",
168 | "Current price of AMZN is => 3,365.51\n",
169 | "Current price of AMZN is => 3,365.51\n",
170 | "Current price of AMZN is => 3,365.51\n",
171 | "Current price of AMZN is => 3,365.51\n",
172 | "Current price of AMZN is => 3,365.32\n",
173 | "Current price of AMZN is => 3,366.04\n",
174 | "Current price of AMZN is => 3,366.04\n",
175 | "Current price of AMZN is => 3,366.04\n",
176 | "Current price of AMZN is => 3,366.04\n",
177 | "Current price of AMZN is => 3,366.04\n",
178 | "Current price of AMZN is => 3,366.04\n",
179 | "Current price of AMZN is => 3,366.04\n",
180 | "Current price of AMZN is => 3,366.04\n",
181 | "Current price of AMZN is => 3,366.04\n"
182 | ]
183 | },
184 | {
185 | "ename": "KeyboardInterrupt",
186 | "evalue": "",
187 | "output_type": "error",
188 | "traceback": [
189 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
190 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
191 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 379\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# Python 2.7, use buffering of HTTP responses\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 380\u001b[1;33m \u001b[0mhttplib_response\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 381\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# Python 2.6 and older, Python 3\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
192 | "\u001b[1;31mTypeError\u001b[0m: getresponse() got an unexpected keyword argument 'buffering'",
193 | "\nDuring handling of the above exception, another exception occurred:\n",
194 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
195 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m'Current price of '\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstock\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m+\u001b[0m \u001b[1;34m' is => '\u001b[0m\u001b[1;33m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlive_price\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mstock_list\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlive_price\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
196 | "\u001b[1;32m\u001b[0m in \u001b[0;36mlive_price\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# To capture live price\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mlive_price\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0msoup\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBeautifulSoup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"html\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mprice\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msoup\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'div'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'class'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;34m\"My(6px) Pos(r) smartphone_Mt(6px)\"\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'span'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
197 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\api.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(url, params, **kwargs)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'allow_redirects'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 76\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'get'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 77\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 78\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
198 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\api.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(method, url, **kwargs)\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[1;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 61\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 62\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
199 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[0;32m 528\u001b[0m }\n\u001b[0;32m 529\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 530\u001b[1;33m \u001b[0mresp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 531\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 532\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
200 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\sessions.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m 641\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 642\u001b[0m \u001b[1;31m# Send the request\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 643\u001b[1;33m \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 644\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 645\u001b[0m \u001b[1;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
201 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\adapters.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 448\u001b[0m \u001b[0mretries\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmax_retries\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 449\u001b[1;33m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 450\u001b[0m )\n\u001b[0;32m 451\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
202 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 599\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtimeout_obj\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 600\u001b[0m \u001b[0mbody\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 601\u001b[1;33m chunked=chunked)\n\u001b[0m\u001b[0;32m 602\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 603\u001b[0m \u001b[1;31m# If we're going to release the connection in ``finally:``, then\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
203 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 381\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# Python 2.6 and older, Python 3\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 382\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 383\u001b[1;33m \u001b[0mhttplib_response\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 384\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 385\u001b[0m \u001b[1;31m# Remove the TypeError from the exception chain in Python 3;\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
204 | "\u001b[1;32m~\\Anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1329\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1330\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1331\u001b[1;33m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1332\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1333\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
205 | "\u001b[1;32m~\\Anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36mbegin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 295\u001b[0m \u001b[1;31m# read until we get a non-100 response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 296\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 297\u001b[1;33m \u001b[0mversion\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 298\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 299\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
206 | "\u001b[1;32m~\\Anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36m_read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 257\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 258\u001b[1;33m \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"iso-8859-1\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 259\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 260\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"status line\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
207 | "\u001b[1;32m~\\Anaconda3\\lib\\socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 584\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 585\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 586\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 587\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 588\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
208 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\contrib\\pyopenssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 279\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 280\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 281\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mOpenSSL\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSSL\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSysCallError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 282\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msuppress_ragged_eofs\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0margs\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'Unexpected EOF'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
209 | "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\OpenSSL\\SSL.py\u001b[0m in \u001b[0;36mrecv_into\u001b[1;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[0;32m 1811\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSSL_peek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1812\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1813\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_lib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSSL_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1814\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_raise_ssl_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_ssl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1815\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
210 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
211 | ]
212 | }
213 | ],
214 | "source": [
215 | "stock_list = []\n",
216 | "\n",
217 | "while True:\n",
218 | " print ('Current price of '+str(stock)+ ' is => '+ str(live_price()))\n",
219 | " stock_list.append(str(live_price()))\n",
220 | "\n"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 34,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "#Store in dataset\n",
230 | "import pandas as pd"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 35,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "df = pd.DataFrame({'Stock': stock, 'Price' : stock_list})"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 38,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/html": [
250 | "\n",
251 | "\n",
264 | "
\n",
265 | " \n",
266 | " \n",
267 | " | \n",
268 | " Stock | \n",
269 | " Price | \n",
270 | "
\n",
271 | " \n",
272 | " \n",
273 | " \n",
274 | " 0 | \n",
275 | " AMZN | \n",
276 | " 3,367.88 | \n",
277 | "
\n",
278 | " \n",
279 | " 1 | \n",
280 | " AMZN | \n",
281 | " 3,367.88 | \n",
282 | "
\n",
283 | " \n",
284 | " 2 | \n",
285 | " AMZN | \n",
286 | " 3,367.88 | \n",
287 | "
\n",
288 | " \n",
289 | " 3 | \n",
290 | " AMZN | \n",
291 | " 3,367.54 | \n",
292 | "
\n",
293 | " \n",
294 | " 4 | \n",
295 | " AMZN | \n",
296 | " 3,367.54 | \n",
297 | "
\n",
298 | " \n",
299 | "
\n",
300 | "
"
301 | ],
302 | "text/plain": [
303 | " Stock Price\n",
304 | "0 AMZN 3,367.88\n",
305 | "1 AMZN 3,367.88\n",
306 | "2 AMZN 3,367.88\n",
307 | "3 AMZN 3,367.54\n",
308 | "4 AMZN 3,367.54"
309 | ]
310 | },
311 | "execution_count": 38,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "df.head()"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": []
326 | }
327 | ],
328 | "metadata": {
329 | "hide_input": false,
330 | "kernelspec": {
331 | "display_name": "Python 3",
332 | "language": "python",
333 | "name": "python3"
334 | },
335 | "language_info": {
336 | "codemirror_mode": {
337 | "name": "ipython",
338 | "version": 3
339 | },
340 | "file_extension": ".py",
341 | "mimetype": "text/x-python",
342 | "name": "python",
343 | "nbconvert_exporter": "python",
344 | "pygments_lexer": "ipython3",
345 | "version": "3.6.5"
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 2
350 | }
351 |
--------------------------------------------------------------------------------
/Audio_to_Text.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#import library\n",
10 | "import speech_recognition as sr"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 3,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Converting audio transcripts into text ... Do not close the session\n",
23 | "today going to talk about natural language processing and will try to understand how Natural Language Processing is being used in any industry and what are the advantages and disadvantages of a natural language processing\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "# Initialize recognizer class (for recognizing the speech)\n",
29 | "r = sr.Recognizer()\n",
30 | "\n",
31 | "# Reading Audio file as source\n",
32 | "# listening the audio file and store in audio_text variable\n",
33 | "\n",
34 | "with sr.AudioFile('Recording.wav') as source:\n",
35 | " \n",
36 | " audio_text = r.listen(source)\n",
37 | " \n",
38 | "# recoginize_() method will throw a request error if the API is unreachable, hence using exception handling\n",
39 | " try:\n",
40 | " \n",
41 | " # using google speech recognition\n",
42 | " text = r.recognize_google(audio_text)\n",
43 | " print('Converting audio transcripts into text ... Do not close the session')\n",
44 | " print(text)\n",
45 | " \n",
46 | " except:\n",
47 | " print('Sorry.. run again...')"
48 | ]
49 | },
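
A useful variation (a sketch added for illustration, not part of the original notebook): record() can transcribe just a slice of the file, and the two failure modes (unintelligible audio versus an unreachable API) can be reported separately:

```python
# Transcribe only the first 30 seconds and distinguish the two failure modes (illustrative sketch)
r = sr.Recognizer()

with sr.AudioFile('Recording.wav') as source:
    audio_chunk = r.record(source, duration=30)  # read at most the first 30 seconds

try:
    print(r.recognize_google(audio_chunk))
except sr.UnknownValueError:
    print('Could not understand the audio')
except sr.RequestError as e:
    print('API unavailable or quota exceeded:', e)
```
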
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": []
56 | }
57 | ],
58 | "metadata": {
59 | "hide_input": false,
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.6.5"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 2
80 | }
81 |
--------------------------------------------------------------------------------
/IMDB-Dataset 1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/training-ml/nlp/217d467c4ed67f8609751530a514833480632039/IMDB-Dataset 1.zip
--------------------------------------------------------------------------------
/IMDB-Dataset2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/training-ml/nlp/217d467c4ed67f8609751530a514833480632039/IMDB-Dataset2.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # nlp
2 | NLP documents
3 |
--------------------------------------------------------------------------------
/Sentiment Analysis 1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "If any packages missing do. \n",
8 | "- !pip install (missing package)\n"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import string\n",
18 | "\n",
19 | "import nltk\n",
20 | "\n",
21 | "from collections import Counter\n",
22 | "\n",
23 | "import matplotlib.pyplot as plt\n",
24 | "\n",
25 | "from nltk.corpus import stopwords\n",
26 | "\n",
27 | "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
28 | "\n",
29 | "from nltk.stem import WordNetLemmatizer\n",
30 | "\n",
31 | "from nltk.tokenize import word_tokenize\n",
32 | "\n",
33 | "\n",
34 | "#nltk.download('vader_lexicon')\n",
35 | "import warnings\n",
36 | "warnings.filterwarnings('ignore')"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "#### How to be Happier in 2021: Toss Out Your Usual List of New Year’s Resolutions.\n",
44 | "` Source: University of Rochester`"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 17,
50 | "metadata": {
51 | "scrolled": true
52 | },
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/plain": [
57 | "'so you want to look trimmer, be smarter, and successful next year? you strive to exercise and call your friends more, and spend less?\\n\\nyou are not alone. new year’s resolutions are as ubiquitous as they are difficult to keep. does it even make sense to set such lofty goals for the new year, hoping anew each january first that this time really is the charm?\\n\\nany motivational researcher would have “ambivalent feelings” about new year’s resolutions, says richard ryan, an international expert on motivational research and professor emeritus of psychology at the university of rochester. “the evidence shows that most of the time people aren’t successful at them.”\\n\\nbut don’t throw in the towel quite yet. ryan, who is also a clinical psychologist, says that any occasion that gives us an opportunity to reflect on our lives is ultimately a good thing. it doesn’t have to be on new year’s. “whenever that happens, if it’s really a reflective change—something that you put your heart behind—that can be good for people.”\\n\\nand he has another tip: what proves most satisfying, and may also be what’s most needed as the covid-19 pandemic rages on, are goals that involve giving to others.\\n\\n“think of how you can help,” says ryan. “there’s a lot of distress out there: if we can set goals that aim to help others, those kinds of goals will, in turn, also add to our own well-being.”\\n\\nhis advice is grounded in decades of research. together with edward deci (also a university of rochester professor emeritus of psychology) ryan is the cofounder of self-determination theory (sdt), a broad framework for the study of human motivation and personality. developed by the duo over nearly 40 years, the theory has become one of the most widely accepted frameworks of human motivation in contemporary behavioral science. its starting point is the idea that all humans have the natural—or intrinsic—tendency to behave in effective and healthful ways.\\n\\ncheck out: still feeling like a new year’s resolution? study suggests living ‘fast,’ living longer—with intermittent fasting\\n\\naccording to ryan, who is also a professor at the institute for positive psychology and education at australian catholic university, acts of willingly helping others satisfy all three of the basic psychological needs identified in sdt research: the needs for autonomy, competence, and relatedness.\\n\\nautonomy in this context means that you can engage in activities in which you feel true volition and find personal value. competence means feeling effective and having a sense of accomplishment. finally, relatedness means working with and feeling connected to others.\\n\\nrelated: this inspiring couple made it their new year’s resolution to go on 52 dates – and they are about to succeed\\n\\n“if you want to make a new year’s resolution that really makes you happy, think about the ways in which you can contribute to the world,” says ryan. “all three of these basic needs are fulfilled. the research shows it’s not just good for the world but also really good for you.”'"
58 | ]
59 | },
60 | "execution_count": 17,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "text = open('story.txt',encoding=\"utf-8\").read()\n",
67 | "\n",
68 | "lower_case = text.lower()\n",
69 | "\n",
70 | "lower_case"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 18,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
82 | ]
83 | },
84 | "execution_count": 18,
85 | "metadata": {},
86 | "output_type": "execute_result"
87 | }
88 | ],
89 | "source": [
90 | "string.punctuation"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 19,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/plain": [
101 | "'so you want to look trimmer be smarter and successful next year you strive to exercise and call your friends more and spend less\\n\\nyou are not alone new year’s resolutions are as ubiquitous as they are difficult to keep does it even make sense to set such lofty goals for the new year hoping anew each january first that this time really is the charm\\n\\nany motivational researcher would have “ambivalent feelings” about new year’s resolutions says richard ryan an international expert on motivational research and professor emeritus of psychology at the university of rochester “the evidence shows that most of the time people aren’t successful at them”\\n\\nbut don’t throw in the towel quite yet ryan who is also a clinical psychologist says that any occasion that gives us an opportunity to reflect on our lives is ultimately a good thing it doesn’t have to be on new year’s “whenever that happens if it’s really a reflective change—something that you put your heart behind—that can be good for people”\\n\\nand he has another tip what proves most satisfying and may also be what’s most needed as the covid19 pandemic rages on are goals that involve giving to others\\n\\n“think of how you can help” says ryan “there’s a lot of distress out there if we can set goals that aim to help others those kinds of goals will in turn also add to our own wellbeing”\\n\\nhis advice is grounded in decades of research together with edward deci also a university of rochester professor emeritus of psychology ryan is the cofounder of selfdetermination theory sdt a broad framework for the study of human motivation and personality developed by the duo over nearly 40 years the theory has become one of the most widely accepted frameworks of human motivation in contemporary behavioral science its starting point is the idea that all humans have the natural—or intrinsic—tendency to behave in effective and healthful ways\\n\\ncheck out still feeling like a new year’s resolution study suggests living ‘fast’ living longer—with intermittent fasting\\n\\naccording to ryan who is also a professor at the institute for positive psychology and education at australian catholic university acts of willingly helping others satisfy all three of the basic psychological needs identified in sdt research the needs for autonomy competence and relatedness\\n\\nautonomy in this context means that you can engage in activities in which you feel true volition and find personal value competence means feeling effective and having a sense of accomplishment finally relatedness means working with and feeling connected to others\\n\\nrelated this inspiring couple made it their new year’s resolution to go on 52 dates – and they are about to succeed\\n\\n“if you want to make a new year’s resolution that really makes you happy think about the ways in which you can contribute to the world” says ryan “all three of these basic needs are fulfilled the research shows it’s not just good for the world but also really good for you”'"
102 | ]
103 | },
104 | "execution_count": 19,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "# str.maketrans removes any punctuations \n",
111 | "\n",
112 | "cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))\n",
113 | "\n",
114 | "\n",
115 | "cleaned_text"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 20,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Using word_tokenize to tokenize sentence into words\n",
125 | "\n",
126 | "tokenized_words = word_tokenize(cleaned_text, \"english\")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 21,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "['so',\n",
138 | " 'you',\n",
139 | " 'want',\n",
140 | " 'to',\n",
141 | " 'look',\n",
142 | " 'trimmer',\n",
143 | " 'be',\n",
144 | " 'smarter',\n",
145 | " 'and',\n",
146 | " 'successful',\n",
147 | " 'next',\n",
148 | " 'year',\n",
149 | " 'you',\n",
150 | " 'strive',\n",
151 | " 'to',\n",
152 | " 'exercise',\n",
153 | " 'and',\n",
154 | " 'call',\n",
155 | " 'your',\n",
156 | " 'friends',\n",
157 | " 'more',\n",
158 | " 'and',\n",
159 | " 'spend',\n",
160 | " 'less',\n",
161 | " 'you',\n",
162 | " 'are',\n",
163 | " 'not',\n",
164 | " 'alone',\n",
165 | " 'new',\n",
166 | " 'year',\n",
167 | " '’',\n",
168 | " 's',\n",
169 | " 'resolutions',\n",
170 | " 'are',\n",
171 | " 'as',\n",
172 | " 'ubiquitous',\n",
173 | " 'as',\n",
174 | " 'they',\n",
175 | " 'are',\n",
176 | " 'difficult',\n",
177 | " 'to',\n",
178 | " 'keep',\n",
179 | " 'does',\n",
180 | " 'it',\n",
181 | " 'even',\n",
182 | " 'make',\n",
183 | " 'sense',\n",
184 | " 'to',\n",
185 | " 'set',\n",
186 | " 'such',\n",
187 | " 'lofty',\n",
188 | " 'goals',\n",
189 | " 'for',\n",
190 | " 'the',\n",
191 | " 'new',\n",
192 | " 'year',\n",
193 | " 'hoping',\n",
194 | " 'anew',\n",
195 | " 'each',\n",
196 | " 'january',\n",
197 | " 'first',\n",
198 | " 'that',\n",
199 | " 'this',\n",
200 | " 'time',\n",
201 | " 'really',\n",
202 | " 'is',\n",
203 | " 'the',\n",
204 | " 'charm',\n",
205 | " 'any',\n",
206 | " 'motivational',\n",
207 | " 'researcher',\n",
208 | " 'would',\n",
209 | " 'have',\n",
210 | " '“',\n",
211 | " 'ambivalent',\n",
212 | " 'feelings',\n",
213 | " '”',\n",
214 | " 'about',\n",
215 | " 'new',\n",
216 | " 'year',\n",
217 | " '’',\n",
218 | " 's',\n",
219 | " 'resolutions',\n",
220 | " 'says',\n",
221 | " 'richard',\n",
222 | " 'ryan',\n",
223 | " 'an',\n",
224 | " 'international',\n",
225 | " 'expert',\n",
226 | " 'on',\n",
227 | " 'motivational',\n",
228 | " 'research',\n",
229 | " 'and',\n",
230 | " 'professor',\n",
231 | " 'emeritus',\n",
232 | " 'of',\n",
233 | " 'psychology',\n",
234 | " 'at',\n",
235 | " 'the',\n",
236 | " 'university',\n",
237 | " 'of',\n",
238 | " 'rochester',\n",
239 | " '“',\n",
240 | " 'the',\n",
241 | " 'evidence',\n",
242 | " 'shows',\n",
243 | " 'that',\n",
244 | " 'most',\n",
245 | " 'of',\n",
246 | " 'the',\n",
247 | " 'time',\n",
248 | " 'people',\n",
249 | " 'aren',\n",
250 | " '’',\n",
251 | " 't',\n",
252 | " 'successful',\n",
253 | " 'at',\n",
254 | " 'them',\n",
255 | " '”',\n",
256 | " 'but',\n",
257 | " 'don',\n",
258 | " '’',\n",
259 | " 't',\n",
260 | " 'throw',\n",
261 | " 'in',\n",
262 | " 'the',\n",
263 | " 'towel',\n",
264 | " 'quite',\n",
265 | " 'yet',\n",
266 | " 'ryan',\n",
267 | " 'who',\n",
268 | " 'is',\n",
269 | " 'also',\n",
270 | " 'a',\n",
271 | " 'clinical',\n",
272 | " 'psychologist',\n",
273 | " 'says',\n",
274 | " 'that',\n",
275 | " 'any',\n",
276 | " 'occasion',\n",
277 | " 'that',\n",
278 | " 'gives',\n",
279 | " 'us',\n",
280 | " 'an',\n",
281 | " 'opportunity',\n",
282 | " 'to',\n",
283 | " 'reflect',\n",
284 | " 'on',\n",
285 | " 'our',\n",
286 | " 'lives',\n",
287 | " 'is',\n",
288 | " 'ultimately',\n",
289 | " 'a',\n",
290 | " 'good',\n",
291 | " 'thing',\n",
292 | " 'it',\n",
293 | " 'doesn',\n",
294 | " '’',\n",
295 | " 't',\n",
296 | " 'have',\n",
297 | " 'to',\n",
298 | " 'be',\n",
299 | " 'on',\n",
300 | " 'new',\n",
301 | " 'year',\n",
302 | " '’',\n",
303 | " 's',\n",
304 | " '“',\n",
305 | " 'whenever',\n",
306 | " 'that',\n",
307 | " 'happens',\n",
308 | " 'if',\n",
309 | " 'it',\n",
310 | " '’',\n",
311 | " 's',\n",
312 | " 'really',\n",
313 | " 'a',\n",
314 | " 'reflective',\n",
315 | " 'change—something',\n",
316 | " 'that',\n",
317 | " 'you',\n",
318 | " 'put',\n",
319 | " 'your',\n",
320 | " 'heart',\n",
321 | " 'behind—that',\n",
322 | " 'can',\n",
323 | " 'be',\n",
324 | " 'good',\n",
325 | " 'for',\n",
326 | " 'people',\n",
327 | " '”',\n",
328 | " 'and',\n",
329 | " 'he',\n",
330 | " 'has',\n",
331 | " 'another',\n",
332 | " 'tip',\n",
333 | " 'what',\n",
334 | " 'proves',\n",
335 | " 'most',\n",
336 | " 'satisfying',\n",
337 | " 'and',\n",
338 | " 'may',\n",
339 | " 'also',\n",
340 | " 'be',\n",
341 | " 'what',\n",
342 | " '’',\n",
343 | " 's',\n",
344 | " 'most',\n",
345 | " 'needed',\n",
346 | " 'as',\n",
347 | " 'the',\n",
348 | " 'covid19',\n",
349 | " 'pandemic',\n",
350 | " 'rages',\n",
351 | " 'on',\n",
352 | " 'are',\n",
353 | " 'goals',\n",
354 | " 'that',\n",
355 | " 'involve',\n",
356 | " 'giving',\n",
357 | " 'to',\n",
358 | " 'others',\n",
359 | " '“',\n",
360 | " 'think',\n",
361 | " 'of',\n",
362 | " 'how',\n",
363 | " 'you',\n",
364 | " 'can',\n",
365 | " 'help',\n",
366 | " '”',\n",
367 | " 'says',\n",
368 | " 'ryan',\n",
369 | " '“',\n",
370 | " 'there',\n",
371 | " '’',\n",
372 | " 's',\n",
373 | " 'a',\n",
374 | " 'lot',\n",
375 | " 'of',\n",
376 | " 'distress',\n",
377 | " 'out',\n",
378 | " 'there',\n",
379 | " 'if',\n",
380 | " 'we',\n",
381 | " 'can',\n",
382 | " 'set',\n",
383 | " 'goals',\n",
384 | " 'that',\n",
385 | " 'aim',\n",
386 | " 'to',\n",
387 | " 'help',\n",
388 | " 'others',\n",
389 | " 'those',\n",
390 | " 'kinds',\n",
391 | " 'of',\n",
392 | " 'goals',\n",
393 | " 'will',\n",
394 | " 'in',\n",
395 | " 'turn',\n",
396 | " 'also',\n",
397 | " 'add',\n",
398 | " 'to',\n",
399 | " 'our',\n",
400 | " 'own',\n",
401 | " 'wellbeing',\n",
402 | " '”',\n",
403 | " 'his',\n",
404 | " 'advice',\n",
405 | " 'is',\n",
406 | " 'grounded',\n",
407 | " 'in',\n",
408 | " 'decades',\n",
409 | " 'of',\n",
410 | " 'research',\n",
411 | " 'together',\n",
412 | " 'with',\n",
413 | " 'edward',\n",
414 | " 'deci',\n",
415 | " 'also',\n",
416 | " 'a',\n",
417 | " 'university',\n",
418 | " 'of',\n",
419 | " 'rochester',\n",
420 | " 'professor',\n",
421 | " 'emeritus',\n",
422 | " 'of',\n",
423 | " 'psychology',\n",
424 | " 'ryan',\n",
425 | " 'is',\n",
426 | " 'the',\n",
427 | " 'cofounder',\n",
428 | " 'of',\n",
429 | " 'selfdetermination',\n",
430 | " 'theory',\n",
431 | " 'sdt',\n",
432 | " 'a',\n",
433 | " 'broad',\n",
434 | " 'framework',\n",
435 | " 'for',\n",
436 | " 'the',\n",
437 | " 'study',\n",
438 | " 'of',\n",
439 | " 'human',\n",
440 | " 'motivation',\n",
441 | " 'and',\n",
442 | " 'personality',\n",
443 | " 'developed',\n",
444 | " 'by',\n",
445 | " 'the',\n",
446 | " 'duo',\n",
447 | " 'over',\n",
448 | " 'nearly',\n",
449 | " '40',\n",
450 | " 'years',\n",
451 | " 'the',\n",
452 | " 'theory',\n",
453 | " 'has',\n",
454 | " 'become',\n",
455 | " 'one',\n",
456 | " 'of',\n",
457 | " 'the',\n",
458 | " 'most',\n",
459 | " 'widely',\n",
460 | " 'accepted',\n",
461 | " 'frameworks',\n",
462 | " 'of',\n",
463 | " 'human',\n",
464 | " 'motivation',\n",
465 | " 'in',\n",
466 | " 'contemporary',\n",
467 | " 'behavioral',\n",
468 | " 'science',\n",
469 | " 'its',\n",
470 | " 'starting',\n",
471 | " 'point',\n",
472 | " 'is',\n",
473 | " 'the',\n",
474 | " 'idea',\n",
475 | " 'that',\n",
476 | " 'all',\n",
477 | " 'humans',\n",
478 | " 'have',\n",
479 | " 'the',\n",
480 | " 'natural—or',\n",
481 | " 'intrinsic—tendency',\n",
482 | " 'to',\n",
483 | " 'behave',\n",
484 | " 'in',\n",
485 | " 'effective',\n",
486 | " 'and',\n",
487 | " 'healthful',\n",
488 | " 'ways',\n",
489 | " 'check',\n",
490 | " 'out',\n",
491 | " 'still',\n",
492 | " 'feeling',\n",
493 | " 'like',\n",
494 | " 'a',\n",
495 | " 'new',\n",
496 | " 'year',\n",
497 | " '’',\n",
498 | " 's',\n",
499 | " 'resolution',\n",
500 | " 'study',\n",
501 | " 'suggests',\n",
502 | " 'living',\n",
503 | " '‘',\n",
504 | " 'fast',\n",
505 | " '’',\n",
506 | " 'living',\n",
507 | " 'longer—with',\n",
508 | " 'intermittent',\n",
509 | " 'fasting',\n",
510 | " 'according',\n",
511 | " 'to',\n",
512 | " 'ryan',\n",
513 | " 'who',\n",
514 | " 'is',\n",
515 | " 'also',\n",
516 | " 'a',\n",
517 | " 'professor',\n",
518 | " 'at',\n",
519 | " 'the',\n",
520 | " 'institute',\n",
521 | " 'for',\n",
522 | " 'positive',\n",
523 | " 'psychology',\n",
524 | " 'and',\n",
525 | " 'education',\n",
526 | " 'at',\n",
527 | " 'australian',\n",
528 | " 'catholic',\n",
529 | " 'university',\n",
530 | " 'acts',\n",
531 | " 'of',\n",
532 | " 'willingly',\n",
533 | " 'helping',\n",
534 | " 'others',\n",
535 | " 'satisfy',\n",
536 | " 'all',\n",
537 | " 'three',\n",
538 | " 'of',\n",
539 | " 'the',\n",
540 | " 'basic',\n",
541 | " 'psychological',\n",
542 | " 'needs',\n",
543 | " 'identified',\n",
544 | " 'in',\n",
545 | " 'sdt',\n",
546 | " 'research',\n",
547 | " 'the',\n",
548 | " 'needs',\n",
549 | " 'for',\n",
550 | " 'autonomy',\n",
551 | " 'competence',\n",
552 | " 'and',\n",
553 | " 'relatedness',\n",
554 | " 'autonomy',\n",
555 | " 'in',\n",
556 | " 'this',\n",
557 | " 'context',\n",
558 | " 'means',\n",
559 | " 'that',\n",
560 | " 'you',\n",
561 | " 'can',\n",
562 | " 'engage',\n",
563 | " 'in',\n",
564 | " 'activities',\n",
565 | " 'in',\n",
566 | " 'which',\n",
567 | " 'you',\n",
568 | " 'feel',\n",
569 | " 'true',\n",
570 | " 'volition',\n",
571 | " 'and',\n",
572 | " 'find',\n",
573 | " 'personal',\n",
574 | " 'value',\n",
575 | " 'competence',\n",
576 | " 'means',\n",
577 | " 'feeling',\n",
578 | " 'effective',\n",
579 | " 'and',\n",
580 | " 'having',\n",
581 | " 'a',\n",
582 | " 'sense',\n",
583 | " 'of',\n",
584 | " 'accomplishment',\n",
585 | " 'finally',\n",
586 | " 'relatedness',\n",
587 | " 'means',\n",
588 | " 'working',\n",
589 | " 'with',\n",
590 | " 'and',\n",
591 | " 'feeling',\n",
592 | " 'connected',\n",
593 | " 'to',\n",
594 | " 'others',\n",
595 | " 'related',\n",
596 | " 'this',\n",
597 | " 'inspiring',\n",
598 | " 'couple',\n",
599 | " 'made',\n",
600 | " 'it',\n",
601 | " 'their',\n",
602 | " 'new',\n",
603 | " 'year',\n",
604 | " '’',\n",
605 | " 's',\n",
606 | " 'resolution',\n",
607 | " 'to',\n",
608 | " 'go',\n",
609 | " 'on',\n",
610 | " '52',\n",
611 | " 'dates',\n",
612 | " '–',\n",
613 | " 'and',\n",
614 | " 'they',\n",
615 | " 'are',\n",
616 | " 'about',\n",
617 | " 'to',\n",
618 | " 'succeed',\n",
619 | " '“',\n",
620 | " 'if',\n",
621 | " 'you',\n",
622 | " 'want',\n",
623 | " 'to',\n",
624 | " 'make',\n",
625 | " 'a',\n",
626 | " 'new',\n",
627 | " 'year',\n",
628 | " '’',\n",
629 | " 's',\n",
630 | " 'resolution',\n",
631 | " 'that',\n",
632 | " 'really',\n",
633 | " 'makes',\n",
634 | " 'you',\n",
635 | " 'happy',\n",
636 | " 'think',\n",
637 | " 'about',\n",
638 | " 'the',\n",
639 | " 'ways',\n",
640 | " 'in',\n",
641 | " 'which',\n",
642 | " 'you',\n",
643 | " 'can',\n",
644 | " 'contribute',\n",
645 | " 'to',\n",
646 | " 'the',\n",
647 | " 'world',\n",
648 | " '”',\n",
649 | " 'says',\n",
650 | " 'ryan',\n",
651 | " '“',\n",
652 | " 'all',\n",
653 | " 'three',\n",
654 | " 'of',\n",
655 | " 'these',\n",
656 | " 'basic',\n",
657 | " 'needs',\n",
658 | " 'are',\n",
659 | " 'fulfilled',\n",
660 | " 'the',\n",
661 | " 'research',\n",
662 | " 'shows',\n",
663 | " 'it',\n",
664 | " '’',\n",
665 | " 's',\n",
666 | " 'not',\n",
667 | " 'just',\n",
668 | " 'good',\n",
669 | " 'for',\n",
670 | " 'the',\n",
671 | " 'world',\n",
672 | " 'but',\n",
673 | " 'also',\n",
674 | " 'really',\n",
675 | " 'good',\n",
676 | " 'for',\n",
677 | " 'you',\n",
678 | " '”']"
679 | ]
680 | },
681 | "execution_count": 21,
682 | "metadata": {},
683 | "output_type": "execute_result"
684 | }
685 | ],
686 | "source": [
687 | "tokenized_words"
688 | ]
689 | },
690 | {
691 | "cell_type": "code",
692 | "execution_count": 22,
693 | "metadata": {
694 | "scrolled": true
695 | },
696 | "outputs": [
697 | {
698 | "data": {
699 | "text/plain": [
700 | "542"
701 | ]
702 | },
703 | "execution_count": 22,
704 | "metadata": {},
705 | "output_type": "execute_result"
706 | }
707 | ],
708 | "source": [
709 | "len(tokenized_words)"
710 | ]
711 | },
712 | {
713 | "cell_type": "code",
714 | "execution_count": 23,
715 | "metadata": {},
716 | "outputs": [],
717 | "source": [
718 | "# Removing Stop Words\n",
719 | "final_words = []\n",
720 | "\n",
721 | "for word in tokenized_words:\n",
722 | " if word not in stopwords.words('english'):\n",
723 | " final_words.append(word)"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": 24,
729 | "metadata": {},
730 | "outputs": [
731 | {
732 | "data": {
733 | "text/plain": [
734 | "303"
735 | ]
736 | },
737 | "execution_count": 24,
738 | "metadata": {},
739 | "output_type": "execute_result"
740 | }
741 | ],
742 | "source": [
743 | "len(final_words)"
744 | ]
745 | },
746 | {
747 | "cell_type": "code",
748 | "execution_count": 25,
749 | "metadata": {},
750 | "outputs": [],
751 | "source": [
752 | "# Lemmatization - From plural to single + Base form of a word (example better-> good)\n",
753 | "\n",
754 | "lemma_words = []\n",
755 | "\n",
756 | "for word in final_words:\n",
757 | " word = WordNetLemmatizer().lemmatize(word)\n",
758 | " lemma_words.append(word)"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": 26,
764 | "metadata": {},
765 | "outputs": [
766 | {
767 | "data": {
768 | "text/plain": [
769 | "['want',\n",
770 | " 'look',\n",
771 | " 'trimmer',\n",
772 | " 'smarter',\n",
773 | " 'successful',\n",
774 | " 'next',\n",
775 | " 'year',\n",
776 | " 'strive',\n",
777 | " 'exercise',\n",
778 | " 'call',\n",
779 | " 'friend',\n",
780 | " 'spend',\n",
781 | " 'le',\n",
782 | " 'alone',\n",
783 | " 'new',\n",
784 | " 'year',\n",
785 | " '’',\n",
786 | " 'resolution',\n",
787 | " 'ubiquitous',\n",
788 | " 'difficult',\n",
789 | " 'keep',\n",
790 | " 'even',\n",
791 | " 'make',\n",
792 | " 'sense',\n",
793 | " 'set',\n",
794 | " 'lofty',\n",
795 | " 'goal',\n",
796 | " 'new',\n",
797 | " 'year',\n",
798 | " 'hoping',\n",
799 | " 'anew',\n",
800 | " 'january',\n",
801 | " 'first',\n",
802 | " 'time',\n",
803 | " 'really',\n",
804 | " 'charm',\n",
805 | " 'motivational',\n",
806 | " 'researcher',\n",
807 | " 'would',\n",
808 | " '“',\n",
809 | " 'ambivalent',\n",
810 | " 'feeling',\n",
811 | " '”',\n",
812 | " 'new',\n",
813 | " 'year',\n",
814 | " '’',\n",
815 | " 'resolution',\n",
816 | " 'say',\n",
817 | " 'richard',\n",
818 | " 'ryan',\n",
819 | " 'international',\n",
820 | " 'expert',\n",
821 | " 'motivational',\n",
822 | " 'research',\n",
823 | " 'professor',\n",
824 | " 'emeritus',\n",
825 | " 'psychology',\n",
826 | " 'university',\n",
827 | " 'rochester',\n",
828 | " '“',\n",
829 | " 'evidence',\n",
830 | " 'show',\n",
831 | " 'time',\n",
832 | " 'people',\n",
833 | " '’',\n",
834 | " 'successful',\n",
835 | " '”',\n",
836 | " '’',\n",
837 | " 'throw',\n",
838 | " 'towel',\n",
839 | " 'quite',\n",
840 | " 'yet',\n",
841 | " 'ryan',\n",
842 | " 'also',\n",
843 | " 'clinical',\n",
844 | " 'psychologist',\n",
845 | " 'say',\n",
846 | " 'occasion',\n",
847 | " 'give',\n",
848 | " 'u',\n",
849 | " 'opportunity',\n",
850 | " 'reflect',\n",
851 | " 'life',\n",
852 | " 'ultimately',\n",
853 | " 'good',\n",
854 | " 'thing',\n",
855 | " '’',\n",
856 | " 'new',\n",
857 | " 'year',\n",
858 | " '’',\n",
859 | " '“',\n",
860 | " 'whenever',\n",
861 | " 'happens',\n",
862 | " '’',\n",
863 | " 'really',\n",
864 | " 'reflective',\n",
865 | " 'change—something',\n",
866 | " 'put',\n",
867 | " 'heart',\n",
868 | " 'behind—that',\n",
869 | " 'good',\n",
870 | " 'people',\n",
871 | " '”',\n",
872 | " 'another',\n",
873 | " 'tip',\n",
874 | " 'prof',\n",
875 | " 'satisfying',\n",
876 | " 'may',\n",
877 | " 'also',\n",
878 | " '’',\n",
879 | " 'needed',\n",
880 | " 'covid19',\n",
881 | " 'pandemic',\n",
882 | " 'rage',\n",
883 | " 'goal',\n",
884 | " 'involve',\n",
885 | " 'giving',\n",
886 | " 'others',\n",
887 | " '“',\n",
888 | " 'think',\n",
889 | " 'help',\n",
890 | " '”',\n",
891 | " 'say',\n",
892 | " 'ryan',\n",
893 | " '“',\n",
894 | " '’',\n",
895 | " 'lot',\n",
896 | " 'distress',\n",
897 | " 'set',\n",
898 | " 'goal',\n",
899 | " 'aim',\n",
900 | " 'help',\n",
901 | " 'others',\n",
902 | " 'kind',\n",
903 | " 'goal',\n",
904 | " 'turn',\n",
905 | " 'also',\n",
906 | " 'add',\n",
907 | " 'wellbeing',\n",
908 | " '”',\n",
909 | " 'advice',\n",
910 | " 'grounded',\n",
911 | " 'decade',\n",
912 | " 'research',\n",
913 | " 'together',\n",
914 | " 'edward',\n",
915 | " 'deci',\n",
916 | " 'also',\n",
917 | " 'university',\n",
918 | " 'rochester',\n",
919 | " 'professor',\n",
920 | " 'emeritus',\n",
921 | " 'psychology',\n",
922 | " 'ryan',\n",
923 | " 'cofounder',\n",
924 | " 'selfdetermination',\n",
925 | " 'theory',\n",
926 | " 'sdt',\n",
927 | " 'broad',\n",
928 | " 'framework',\n",
929 | " 'study',\n",
930 | " 'human',\n",
931 | " 'motivation',\n",
932 | " 'personality',\n",
933 | " 'developed',\n",
934 | " 'duo',\n",
935 | " 'nearly',\n",
936 | " '40',\n",
937 | " 'year',\n",
938 | " 'theory',\n",
939 | " 'become',\n",
940 | " 'one',\n",
941 | " 'widely',\n",
942 | " 'accepted',\n",
943 | " 'framework',\n",
944 | " 'human',\n",
945 | " 'motivation',\n",
946 | " 'contemporary',\n",
947 | " 'behavioral',\n",
948 | " 'science',\n",
949 | " 'starting',\n",
950 | " 'point',\n",
951 | " 'idea',\n",
952 | " 'human',\n",
953 | " 'natural—or',\n",
954 | " 'intrinsic—tendency',\n",
955 | " 'behave',\n",
956 | " 'effective',\n",
957 | " 'healthful',\n",
958 | " 'way',\n",
959 | " 'check',\n",
960 | " 'still',\n",
961 | " 'feeling',\n",
962 | " 'like',\n",
963 | " 'new',\n",
964 | " 'year',\n",
965 | " '’',\n",
966 | " 'resolution',\n",
967 | " 'study',\n",
968 | " 'suggests',\n",
969 | " 'living',\n",
970 | " '‘',\n",
971 | " 'fast',\n",
972 | " '’',\n",
973 | " 'living',\n",
974 | " 'longer—with',\n",
975 | " 'intermittent',\n",
976 | " 'fasting',\n",
977 | " 'according',\n",
978 | " 'ryan',\n",
979 | " 'also',\n",
980 | " 'professor',\n",
981 | " 'institute',\n",
982 | " 'positive',\n",
983 | " 'psychology',\n",
984 | " 'education',\n",
985 | " 'australian',\n",
986 | " 'catholic',\n",
987 | " 'university',\n",
988 | " 'act',\n",
989 | " 'willingly',\n",
990 | " 'helping',\n",
991 | " 'others',\n",
992 | " 'satisfy',\n",
993 | " 'three',\n",
994 | " 'basic',\n",
995 | " 'psychological',\n",
996 | " 'need',\n",
997 | " 'identified',\n",
998 | " 'sdt',\n",
999 | " 'research',\n",
1000 | " 'need',\n",
1001 | " 'autonomy',\n",
1002 | " 'competence',\n",
1003 | " 'relatedness',\n",
1004 | " 'autonomy',\n",
1005 | " 'context',\n",
1006 | " 'mean',\n",
1007 | " 'engage',\n",
1008 | " 'activity',\n",
1009 | " 'feel',\n",
1010 | " 'true',\n",
1011 | " 'volition',\n",
1012 | " 'find',\n",
1013 | " 'personal',\n",
1014 | " 'value',\n",
1015 | " 'competence',\n",
1016 | " 'mean',\n",
1017 | " 'feeling',\n",
1018 | " 'effective',\n",
1019 | " 'sense',\n",
1020 | " 'accomplishment',\n",
1021 | " 'finally',\n",
1022 | " 'relatedness',\n",
1023 | " 'mean',\n",
1024 | " 'working',\n",
1025 | " 'feeling',\n",
1026 | " 'connected',\n",
1027 | " 'others',\n",
1028 | " 'related',\n",
1029 | " 'inspiring',\n",
1030 | " 'couple',\n",
1031 | " 'made',\n",
1032 | " 'new',\n",
1033 | " 'year',\n",
1034 | " '’',\n",
1035 | " 'resolution',\n",
1036 | " 'go',\n",
1037 | " '52',\n",
1038 | " 'date',\n",
1039 | " '–',\n",
1040 | " 'succeed',\n",
1041 | " '“',\n",
1042 | " 'want',\n",
1043 | " 'make',\n",
1044 | " 'new',\n",
1045 | " 'year',\n",
1046 | " '’',\n",
1047 | " 'resolution',\n",
1048 | " 'really',\n",
1049 | " 'make',\n",
1050 | " 'happy',\n",
1051 | " 'think',\n",
1052 | " 'way',\n",
1053 | " 'contribute',\n",
1054 | " 'world',\n",
1055 | " '”',\n",
1056 | " 'say',\n",
1057 | " 'ryan',\n",
1058 | " '“',\n",
1059 | " 'three',\n",
1060 | " 'basic',\n",
1061 | " 'need',\n",
1062 | " 'fulfilled',\n",
1063 | " 'research',\n",
1064 | " 'show',\n",
1065 | " '’',\n",
1066 | " 'good',\n",
1067 | " 'world',\n",
1068 | " 'also',\n",
1069 | " 'really',\n",
1070 | " 'good',\n",
1071 | " '”']"
1072 | ]
1073 | },
1074 | "execution_count": 26,
1075 | "metadata": {},
1076 | "output_type": "execute_result"
1077 | }
1078 | ],
1079 | "source": [
1080 | "lemma_words"
1081 | ]
1082 | },
1083 | {
1084 | "cell_type": "code",
1085 | "execution_count": 27,
1086 | "metadata": {},
1087 | "outputs": [
1088 | {
1089 | "name": "stdout",
1090 | "output_type": "stream",
1091 | "text": [
1092 | "People emotions from the text \n",
1093 | " [' attached', ' happy', ' attracted', ' loved', ' alone', ' bored', ' attracted'] \n",
1094 | " \n",
1095 | "\n",
1096 | "Count of each emotion \n",
1097 | " Counter({' attracted': 2, ' attached': 1, ' happy': 1, ' loved': 1, ' alone': 1, ' bored': 1})\n"
1098 | ]
1099 | }
1100 | ],
1101 | "source": [
1102 | "emotion_list = []\n",
1103 | "\n",
1104 | "with open('emotions.txt', 'r') as file:\n",
1105 | " for line in file:\n",
1106 | " clear_line = line.replace(\"\\n\", '').replace(\",\", '').replace(\"'\", '').strip()\n",
1107 | " word, emotion = clear_line.split(':')\n",
1108 | "\n",
1109 | " if word in lemma_words:\n",
1110 | " emotion_list.append(emotion)\n",
1111 | " \n",
1112 | "print(\"People emotions from the text \\n\", emotion_list, '\\n \\n')\n",
1113 | "\n",
1114 | "\n",
1115 | "w = Counter(emotion_list)\n",
1116 | "print(\"Count of each emotion \\n\", w)"
1117 | ]
1118 | },
1119 | {
1120 | "cell_type": "markdown",
1121 | "metadata": {},
1122 | "source": [
1123 | "#### VADER\n",
1124 | "\n",
1125 | " VADER ( Valence Aware Dictionary for Sentiment Reasoning) is a model used for text sentiment analysis that is sensitive to both polarity (positive/negative) and intensity (strength) of emotion. It is available in the NLTK package and can be applied directly to unlabeled text data.\n",
1126 | "VADER sentimental analysis relies on a dictionary that maps lexical features to emotion intensities known as sentiment scores. The sentiment score of a text can be obtained by summing up the intensity of each word in the text.\n",
1127 | "For example- Words like ‘love’, ‘enjoy’, ‘happy’, ‘like’ all convey a positive sentiment. Also VADER is intelligent enough to understand the basic context of these words, such as “did not love” as a negative statement. It also understands the emphasis of capitalization and punctuation, such as “ENJOY”\n",
1128 | "\n",
1129 | "#### Polarity classification\n",
1130 | "We won’t try to determine if a sentence is objective or subjective, fact or opinion. Rather, we care only if the text expresses a positive, negative or neutral opinion."
1131 | ]
1132 | },
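{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is a minimal sketch of the behaviour described above (negation, and capitalization/punctuation emphasis, changing the scores). It assumes the `vader_lexicon` resource has already been downloaded with `nltk.download('vader_lexicon')`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal VADER demo (assumes nltk.download('vader_lexicon') has been run)\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"\n",
"demo_sia = SentimentIntensityAnalyzer()\n",
"for s in ['I love this course', 'I did not love this course', 'I LOVE this course!!']:\n",
"    # negation flips the compound score; caps and '!' increase its magnitude\n",
"    print(s, '->', demo_sia.polarity_scores(s))"
]
},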
1133 | {
1134 | "cell_type": "code",
1135 | "execution_count": null,
1136 | "metadata": {},
1137 | "outputs": [],
1138 | "source": [
1139 | "# Test how SentimentIntensityAnalyzer works\n",
1140 | "sia = SentimentIntensityAnalyzer()\n",
1141 | "\n",
1142 | "#test_sent = 'Data science is a good course and I am loving it'\n",
1143 | "sent = cleaned_text \n",
1144 | "\n",
1145 | "print (sia.polarity_scores(sent))"
1146 | ]
1147 | },
1148 | {
1149 | "cell_type": "code",
1150 | "execution_count": null,
1151 | "metadata": {},
1152 | "outputs": [],
1153 | "source": [
1154 | "def sentiment_analyse(sentiment_text):\n",
1155 | " \n",
1156 | " score = SentimentIntensityAnalyzer().polarity_scores(sentiment_text)\n",
1157 | " \n",
1158 | " if score['neg'] > score['pos']:\n",
1159 | " print(\"\\n ******Negative Sentiment*******\")\n",
1160 | " \n",
1161 | " elif score['neg'] < score['pos']:\n",
1162 | " print(\"\\n ******Positive Sentiment*******\")\n",
1163 | " \n",
1164 | " else:\n",
1165 | " print(\"Neutral Sentiment\")"
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": null,
1171 | "metadata": {
1172 | "scrolled": true
1173 | },
1174 | "outputs": [],
1175 | "source": [
1176 | "sentiment_analyse(cleaned_text)\n",
1177 | "\n",
1178 | "fig, ax1 = plt.subplots()\n",
1179 | "ax1.bar(w.keys(), w.values())\n",
1180 | "fig.autofmt_xdate()\n",
1181 | "plt.savefig('graph.png')\n",
1182 | "plt.show()"
1183 | ]
1184 | },
1185 | {
1186 | "cell_type": "code",
1187 | "execution_count": null,
1188 | "metadata": {},
1189 | "outputs": [],
1190 | "source": []
1191 | }
1192 | ],
1193 | "metadata": {
1194 | "hide_input": false,
1195 | "kernelspec": {
1196 | "display_name": "Python 3",
1197 | "language": "python",
1198 | "name": "python3"
1199 | },
1200 | "language_info": {
1201 | "codemirror_mode": {
1202 | "name": "ipython",
1203 | "version": 3
1204 | },
1205 | "file_extension": ".py",
1206 | "mimetype": "text/x-python",
1207 | "name": "python",
1208 | "nbconvert_exporter": "python",
1209 | "pygments_lexer": "ipython3",
1210 | "version": "3.6.5"
1211 | }
1212 | },
1213 | "nbformat": 4,
1214 | "nbformat_minor": 2
1215 | }
1216 |
--------------------------------------------------------------------------------
/Speech_to_text.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import speech_recognition as sr\n",
10 | "import pyaudio"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 4,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Keep Talking\n",
23 | "Here is your converted Text below....\n",
24 | " last perception of our natural language processing I think we have understood a lot of topics and all of good and you guys are amazing and the process awesome I hope I have cleared all your doubts\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "r = sr.Recognizer()\n",
30 | "\n",
31 | "# Reading Microphone as source\n",
32 | "# listening the speech and store in audio_text variable\n",
33 | "\n",
34 | "with sr.Microphone() as source:\n",
35 | " print(\"Keep Talking\")\n",
36 | " audio_text = r.listen(source)\n",
37 | " print(\"Here is your converted Text below....\")\n",
38 | "# recoginize_() method will throw a request error if the API is unreachable, hence using exception handling\n",
39 | " \n",
40 | " try:\n",
41 | " # using google speech recognition\n",
42 | " print(\" \"+r.recognize_google(audio_text))\n",
43 | " except:\n",
44 | " print(\"Sorry, I did not get that\")"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": []
53 | }
54 | ],
55 | "metadata": {
56 | "hide_input": false,
57 | "kernelspec": {
58 | "display_name": "Python 3",
59 | "language": "python",
60 | "name": "python3"
61 | },
62 | "language_info": {
63 | "codemirror_mode": {
64 | "name": "ipython",
65 | "version": 3
66 | },
67 | "file_extension": ".py",
68 | "mimetype": "text/x-python",
69 | "name": "python",
70 | "nbconvert_exporter": "python",
71 | "pygments_lexer": "ipython3",
72 | "version": "3.6.5"
73 | }
74 | },
75 | "nbformat": 4,
76 | "nbformat_minor": 2
77 | }
78 |
--------------------------------------------------------------------------------
/Textblob_.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "D14RFpIEQKPI"
8 | },
9 | "source": [
10 | "## Translation and Language Detection\n",
11 | "\n",
12 | "TextBlobs can be translated between languages."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "from textblob import TextBlob"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "colab": {},
29 | "colab_type": "code",
30 | "id": "4GKzcjSCQKPJ",
31 | "outputId": "2c09951a-0fc0-4a03-efba-8482b0a8859f"
32 | },
33 | "outputs": [],
34 | "source": [
35 | "blob = TextBlob(u'நீங்கள் அனைவருக்கும் நல்வாழ்த்துக்கள்')\n",
36 | "blob.translate(to='hi')"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "colab_type": "text",
43 | "id": "eJz6p36pQKPN"
44 | },
45 | "source": [
46 | "If no source language is specified, **TextBlob** will attempt to detect the language. You can specify the source language explicitly, like so. Raises **TranslatorError** if the TextBlob cannot be translated into the requested language or NotTranslated if the translated result is the same as the input string."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "blob = TextBlob(u'Something is better than nothing')\n",
56 | "blob.translate(to='zh-CN')"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "colab": {},
64 | "colab_type": "code",
65 | "id": "eQggiqfkQKPO",
66 | "outputId": "f2ee94b9-9080-4bd1-88e3-d67d6049543c",
67 | "scrolled": true
68 | },
69 | "outputs": [],
70 | "source": [
71 | "chinese_blob = TextBlob(u\"有总比没有好\")\n",
72 | "chinese_blob.translate(from_lang=\"zh-CN\", to='en')"
73 | ]
74 | },
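{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal error-handling sketch for the exceptions mentioned above. It assumes `NotTranslated` and `TranslatorError` can be imported from `textblob.exceptions`, and that the (since deprecated) `translate()` API is available in your TextBlob version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: catch the translation exceptions described above\n",
"from textblob.exceptions import NotTranslated, TranslatorError\n",
"\n",
"try:\n",
"    # translating English text to English returns the same string -> NotTranslated\n",
"    print(TextBlob(u'Something is better than nothing').translate(to='en'))\n",
"except NotTranslated:\n",
"    print('The translated result is the same as the input string')\n",
"except TranslatorError as e:\n",
"    print('Could not translate:', e)"
]
},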
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {
78 | "colab_type": "text",
79 | "id": "PI-2-OG7QKPT"
80 | },
81 | "source": [
82 | "You can also attempt to detect a TextBlob’s language using **TextBlob.detect_language().**"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "colab": {
90 | "base_uri": "https://localhost:8080/",
91 | "height": 180
92 | },
93 | "colab_type": "code",
94 | "executionInfo": {
95 | "elapsed": 2269,
96 | "status": "error",
97 | "timestamp": 1587293948458,
98 | "user": {
99 | "displayName": "Urvish Shah",
100 | "photoUrl": "",
101 | "userId": "00632444284786828642"
102 | },
103 | "user_tz": -330
104 | },
105 | "id": "IhePnEhbQKPU",
106 | "outputId": "f3ac00e2-92a0-4be4-9767-f92bb5e15de2"
107 | },
108 | "outputs": [],
109 | "source": [
110 | "d = TextBlob(\"what is you name\")\n",
111 | "d.detect_language()"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "### Convert english speech to your native language "
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "import speech_recognition as sr"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "r = sr.Recognizer()\n",
137 | "\n",
138 | "# Reading Microphone as source\n",
139 | "# listening the speech and store in audio_text variable\n",
140 | "\n",
141 | "with sr.Microphone() as source:\n",
142 | " print(\"Keep Talking\")\n",
143 | " audio_text = r.listen(source)\n",
144 | " print(\"Here is your converted Text below....\")\n",
145 | "# recoginize_() method will throw a request error if the API is unreachable, hence using exception handling\n",
146 | " \n",
147 | " try:\n",
148 | " # using google speech recognition\n",
149 | "# print(\" \"+r.recognize_google(audio_text))\n",
150 | " text = r.recognize_google(audio_text)\n",
151 | " blob = TextBlob(text)\n",
152 | " print (blob.translate(to='kn'))\n",
153 | " except:\n",
154 | " print(\"Sorry, I did not get that\")"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": []
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": []
170 | }
171 | ],
172 | "metadata": {
173 | "hide_input": false,
174 | "kernelspec": {
175 | "display_name": "Python 3",
176 | "language": "python",
177 | "name": "python3"
178 | },
179 | "language_info": {
180 | "codemirror_mode": {
181 | "name": "ipython",
182 | "version": 3
183 | },
184 | "file_extension": ".py",
185 | "mimetype": "text/x-python",
186 | "name": "python",
187 | "nbconvert_exporter": "python",
188 | "pygments_lexer": "ipython3",
189 | "version": "3.6.5"
190 | }
191 | },
192 | "nbformat": 4,
193 | "nbformat_minor": 2
194 | }
195 |
--------------------------------------------------------------------------------
/email_messages.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/training-ml/nlp/217d467c4ed67f8609751530a514833480632039/email_messages.csv.zip
--------------------------------------------------------------------------------
/emotions.txt:
--------------------------------------------------------------------------------
1 | 'victimized': 'cheated',
2 | 'accused': 'cheated',
3 | 'acquitted': 'singled out',
4 | 'adorable': 'loved',
5 | 'adored': 'loved',
6 | 'affected': 'attracted',
7 | 'afflicted': 'sad',
8 | 'aghast': 'fearful',
9 | 'agog': 'attracted',
10 | 'agonized': 'sad',
11 | 'alarmed': 'fearful',
12 | 'amused': 'happy',
13 | 'angry': 'angry',
14 | 'anguished': 'sad',
15 | 'animated': 'happy',
16 | 'annoyed': 'angry',
17 | 'anxious': 'attracted',
18 | 'apathetic': 'bored',
19 | 'appalled': 'angry',
20 | 'appeased': 'singled out',
21 | 'appreciated': 'esteemed',
22 | 'apprehensive': 'fearful',
23 | 'approved of': 'loved',
24 | 'ardent': 'lustful',
25 | 'aroused': 'lustful',
26 | 'attached': 'attached',
27 | 'attracted': 'attracted',
28 | 'autonomous': 'independent',
29 | 'awed': 'fearful',
30 | 'awkward': 'embarrassed',
31 | 'beaten down': 'powerless',
32 | 'beatific': 'happy',
33 | 'belonging': 'attached',
34 | 'bereaved': 'sad',
35 | 'betrayed': 'cheated',
36 | 'bewildered': 'surprise',
37 | 'bitter': 'angry',
38 | 'blissful': 'happy',
39 | 'blithe': 'happy',
40 | 'blocked': 'powerless',
41 | 'blue': 'sad',
42 | 'boiling': 'angry',
43 | 'bold': 'fearless',
44 | 'bored': 'bored',
45 | 'brave': 'fearless',
46 | 'bright': 'happy',
47 | 'brisk': 'happy',
48 | 'calm': 'safe',
49 | 'capable': 'adequate',
50 | 'captivated': 'attached',
51 | 'careless': 'powerless',
52 | 'categorized': 'singled out',
53 | 'cautious': 'fearful',
54 | 'certain': 'fearless',
55 | 'chagrined': 'belittled',
56 | 'challenged': 'attracted',
57 | 'chastised': 'hated',
58 | 'cheated': 'cheated',
59 | 'cheerful': 'happy',
60 | 'cheerless': 'sad',
61 | 'cheery': 'happy',
62 | 'cherished': 'attached',
63 | 'chicken': 'fearful',
64 | 'cocky': 'independent',
65 | 'codependent': 'codependent',
66 | 'coerced': 'cheated',
67 | 'comfortable': 'happy',
68 | 'common': 'average',
69 | 'competent': 'adequate',
70 | 'complacent': 'apathetic',
71 | 'composed': 'adequate',
72 | 'concerned': 'attracted',
73 | 'confident': 'adequate',
74 | 'confused': 'surprise',
75 | 'connected': 'attached',
76 | 'conned': 'cheated',
77 | 'consumed': 'obsessed',
78 | 'contented': 'happy',
79 | 'controlled': 'powerless',
80 | 'convivial': 'happy',
81 | 'cornered': 'entitled',
82 | 'courageous': 'fearless',
83 | 'cowardly': 'fearful',
84 | 'craving': 'attracted',
85 | 'crestfallen': 'sad',
86 | 'criticized': 'hated',
87 | 'cross': 'angry',
88 | 'cross-examined': 'singled out',
89 | 'crushed': 'sad',
90 | 'curious': 'attracted',
91 | 'cut off': 'alone',
92 | 'daring': 'fearless',
93 | 'dark': 'sad',
94 | 'dedicated': 'attracted',
95 | 'defeated': 'powerless',
96 | 'defenseless': 'fearful',
97 | 'degraded': 'belittled',
98 | 'dejected': 'sad',
99 | 'depressed': 'sad',
100 | 'deserted': 'hated',
101 | 'desirable': 'loved',
102 | 'despondent': 'sad',
103 | 'detached': 'alone',
104 | 'determined': 'focused',
105 | 'diminished': 'belittled',
106 | 'disappointed': 'demoralized',
107 | 'discarded': 'hated',
108 | 'disconsolate': 'sad',
109 | 'discontented': 'sad',
110 | 'discounted': 'belittled',
111 | 'discouraged': 'powerless',
112 | 'disgraced': 'belittled',
113 | 'disgusted': 'angry',
114 | 'disheartened': 'demoralized',
115 | 'disillusioned': 'demoralized',
116 | 'disjointed': 'derailed',
117 | 'dismal': 'sad',
118 | 'dismayed': 'fearful',
119 | 'disoriented': 'derailed',
120 | 'disparaged': 'cheated',
121 | 'displeased': 'sad',
122 | 'disrespected': 'belittled',
123 | 'distressed': 'sad',
124 | 'distrustful': 'anxious',
125 | 'dolorous': 'sad',
126 | 'doubtful': 'fearful',
127 | 'down': 'sad',
128 | 'downhearted': 'sad',
129 | 'dreadful': 'sad',
130 | 'dreary': 'sad',
131 | 'dubious': 'anxious',
132 | 'dull': 'sad',
133 | 'duped': 'cheated',
134 | 'eager': 'attracted',
135 | 'earnest': 'attracted',
136 | 'ecstatic': 'happy',
137 | 'elated': 'happy',
138 | 'embarrassed': 'embarrassed',
139 | 'empathetic': 'attached',
140 | 'enchanted': 'attracted',
141 | 'encouraged': 'adequate',
142 | 'engrossed': 'attracted',
143 | 'enraged': 'angry',
144 | 'enterprising': 'fearless',
145 | 'enthusiastic': 'happy',
146 | 'entrusted': 'loved',
147 | 'esteemed': 'esteemed',
148 | 'excited': 'happy',
149 | 'excluded': 'alone',
150 | 'exempt': 'entitled',
151 | 'exhausted hopeless': 'powerless',
152 | 'exhilarated': 'happy',
153 | 'exploited': 'cheated',
154 | 'exposed': 'fearful',
155 | 'fabulous': 'ecstatic',
156 | 'fainthearted': 'fearful',
157 | 'fantastic': 'ecstatic',
158 | 'fascinated': 'attracted',
159 | 'favored': 'entitled',
160 | 'fearful': 'fearful',
161 | 'fervent': 'attracted',
162 | 'fervid': 'attracted',
163 | 'festive': 'happy',
164 | 'flat': 'sad',
165 | 'focused': 'focused',
166 | 'forced': 'powerless',
167 | 'forsaken': 'hated',
168 | 'framed': 'cheated',
169 | 'free': 'free',
170 | 'free & easy': 'happy',
171 | 'frightened': 'fearful',
172 | 'frisky': 'happy',
173 | 'frustrated': 'angry',
174 | 'full of anticipation': 'attracted',
175 | 'full of ennui': 'apathetic',
176 | 'fuming': 'angry',
177 | 'funereal': 'sad',
178 | 'furious': 'angry',
179 | 'gallant': 'fearless',
180 | 'genial': 'happy',
181 | 'glad': 'happy',
182 | 'gleeful': 'happy',
183 | 'gloomy': 'sad',
184 | 'glum': 'sad',
185 | 'grief-stricken': 'sad',
186 | 'grieved': 'sad',
187 | 'guilt': 'sad',
188 | 'guilty': 'singled out',
189 | 'happy': 'happy',
190 | 'hardy': 'fearless',
191 | 'heartbroken': 'sad',
192 | 'heavyhearted': 'sad',
193 | 'hesitant': 'fearful',
194 | 'high-spirited': 'happy',
195 | 'hilarious': 'happy',
196 | 'hopeful': 'attracted',
197 | 'horny': 'lustful',
198 | 'horrified': 'fearful',
199 | 'hot and bothered': 'lustful',
200 | 'humiliated': 'sad',
201 | 'humorous': 'happy',
202 | 'hurt': 'sad',
203 | 'hysterical': 'fearful',
204 | 'ignored': 'hated',
205 | 'ill at ease': 'sad',
206 | 'immobilized': 'apathetic',
207 | 'immune': 'entitled',
208 | 'important': 'happy',
209 | 'impotent': 'powerless',
210 | 'imprisoned': 'entitled',
211 | 'in a huff': 'angry',
212 | 'in a stew': 'angry',
213 | 'in control': 'adequate',
214 | 'in fear': 'fearful',
215 | 'in pain': 'sad',
216 | 'in the dumps': 'sad',
217 | 'in the zone': 'focused',
218 | 'incensed': 'angry',
219 | 'included': 'attached',
220 | 'indecisive': 'anxious',
221 | 'independent': 'free',
222 | 'indignant': 'angry',
223 | 'infatuated': 'lustful',
224 | 'inflamed': 'angry',
225 | 'injured': 'sad',
226 | 'inquisitive': 'attracted',
227 | 'insecure': 'codependent',
228 | 'insignificant': 'belittled',
229 | 'intent': 'attracted',
230 | 'interested': 'attracted',
231 | 'interrogated': 'singled out',
232 | 'intrigued': 'attracted',
233 | 'irate': 'angry',
234 | 'irresolute': 'fearful',
235 | 'irresponsible': 'powerless',
236 | 'irritated': 'angry',
237 | 'isolated': 'alone',
238 | 'jaunty': 'happy',
239 | 'jocular': 'happy',
240 | 'jolly': 'happy',
241 | 'jovial': 'happy',
242 | 'joyful': 'happy',
243 | 'joyless': 'sad',
244 | 'joyous': 'happy',
245 | 'jubilant': 'happy',
246 | 'justified': 'singled out',
247 | 'keen': 'attracted',
248 | 'labeled': 'singled out',
249 | 'lackadaisical': 'bored',
250 | 'lazy': 'apathetic',
251 | 'left out': 'hated',
252 | 'let down': 'hated',
253 | 'lethargic': 'apathetic',
254 | 'lied to': 'cheated',
255 | 'lighthearted': 'happy',
256 | 'liked': 'attached',
257 | 'lively': 'happy',
258 | 'livid': 'angry',
259 | 'lonely': 'alone',
260 | 'lonesome': 'alone',
261 | 'lost': 'lost',
262 | 'loved': 'attached',
263 | 'low': 'sad',
264 | 'lucky': 'happy',
265 | 'lugubrious': 'sad',
266 | 'macho': 'independent',
267 | 'mad': 'angry',
268 | 'melancholy': 'sad',
269 | 'menaced': 'fearful',
270 | 'merry': 'happy',
271 | 'mirthful': 'happy',
272 | 'misgiving': 'fearful',
273 | 'misunderstood': 'alone',
274 | 'moody': 'sad',
275 | 'moping': 'sad',
276 | 'motivated': 'attracted',
277 | 'mournful': 'sad',
278 | 'needed': 'attracted',
279 | 'needy': 'codependent',
280 | 'nervous': 'fearful',
281 | 'obligated': 'powerless',
282 | 'obsessed': 'obsessed',
283 | 'offended': 'angry',
284 | 'oppressed': 'sad',
285 | 'optionless': 'entitled',
286 | 'ordinary': 'average',
287 | 'organized': 'adequate',
288 | 'out of control': 'powerless',
289 | 'out of sorts': 'sad',
290 | 'outmaneuvered': 'entitled',
291 | 'outraged': 'angry',
292 | 'overjoyed': 'happy',
293 | 'overlooked': 'hated',
294 | 'overwhelmed': 'powerless',
295 | 'panicked': 'fearful',
296 | 'passionate': 'lustful',
297 | 'passive': 'apathetic',
298 | 'pathetic': 'sad',
299 | 'peaceful': 'safe',
300 | 'pensive': 'anxious',
301 | 'perplexed': 'anxious',
302 | 'phobic': 'fearful',
303 | 'playful': 'happy',
304 | 'pleased': 'happy',
305 | 'powerless': 'powerless',
306 | 'pressured': 'burdened',
307 | 'privileged': 'entitled',
308 | 'proud': 'happy',
309 | 'provoked': 'angry',
310 | 'punished': 'hated',
311 | 'put upon': 'burdened',
312 | 'quaking': 'fearful',
313 | 'quiescent': 'apathetic',
314 | 'rageful': 'angry',
315 | 'rapturous': 'happy',
316 | 'rated': 'singled out',
317 | 'reassured': 'fearless',
318 | 'reckless': 'powerless',
319 | 'redeemed': 'singled out',
320 | 'regretful': 'sad',
321 | 'rejected': 'alone',
322 | 'released': 'free',
323 | 'remorse': 'sad',
324 | 'replaced': 'hated',
325 | 'repulsed': 'demoralized',
326 | 'resentful': 'angry',
327 | 'resolute': 'fearless',
328 | 'respected': 'esteemed',
329 | 'responsible': 'adequate',
330 | 'restful': 'fearful',
331 | 'revered': 'esteemed',
332 | 'rueful': 'sad',
333 | 'sad': 'sad',
334 | 'satisfied': 'happy',
335 | 'saucy': 'happy',
336 | 'scared': 'fearful',
337 | 'secure': 'fearless',
338 | 'self-reliant': 'fearless',
339 | 'serene': 'happy',
340 | 'shaky': 'fearful',
341 | 'shamed': 'sad',
342 | 'shocked': 'surprise',
343 | 'significant': 'esteemed',
344 | 'singled out': 'singled out',
345 | 'skeptical': 'anxious',
346 | 'snoopy': 'attracted',
347 | 'somber': 'sad',
348 | 'sparkling': 'happy',
349 | 'spirited': 'happy',
350 | 'spiritless': 'sad',
351 | 'sprightly': 'happy',
352 | 'startled': 'surprise',
353 | 'stereotyped': 'singled out',
354 | 'stifled': 'powerless',
355 | 'stout hearted': 'fearless',
356 | 'strong': 'independent',
357 | 'suffering': 'sad',
358 | 'sulky': 'sad',
359 | 'sullen': 'angry',
360 | 'sunny': 'happy',
361 | 'surprised': 'surprise',
362 | 'suspicious': 'anxious',
363 | 'sympathetic': 'codependent',
364 | 'tense': 'anxious',
365 | 'terrified': 'fearful',
366 | 'terrorized': 'fearful',
367 | 'thankful': 'happy',
368 | 'threatened': 'fearful',
369 | 'thwarted': 'powerless',
370 | 'timid': 'fearful',
371 | 'timorous': 'fearful',
372 | 'torn': 'derailed',
373 | 'tortured': 'sad',
374 | 'tragic': 'sad',
375 | 'tranquil': 'happy',
376 | 'transported': 'happy',
377 | 'trapped': 'entitled',
378 | 'tremulous': 'fearful',
379 | 'tricked': 'entitled',
380 | 'turned on': 'lustful',
381 | 'unapproved of': 'hated',
382 | 'unbelieving': 'anxious',
383 | 'uncertain': 'anxious',
384 | 'unconcerned': 'apathetic',
385 | 'understood': 'attached',
386 | 'unfocussed': 'lost',
387 | 'unlovable': 'hated',
388 | 'unloved': 'hated',
389 | 'unmotivated': 'apathetic',
390 | 'unshackled': 'free',
391 | 'unsupported': 'belittled',
392 | 'up in arms': 'angry',
393 | 'upset': 'fearful',
394 | 'validated': 'loved',
395 | 'valued': 'esteemed',
396 | 'victimized': 'sad',
397 | 'violated': 'cheated',
398 | 'virulent': 'angry',
399 | 'vivacious': 'happy',
400 | 'vulnerable': 'powerless',
401 | 'wavering': 'anxious',
402 | 'weak': 'powerless',
403 | 'welcomed': 'loved',
404 | 'woebegone': 'sad',
405 | 'woeful': 'sad',
406 | 'worn down': 'powerless',
407 | 'worn out': 'powerless',
408 | 'worried': 'fearful',
409 | 'worshiped': 'esteemed',
410 | 'wrathful': 'angry',
411 | 'wronged': 'singled out',
412 | 'wrought up': 'angry',
413 | 'yearning': 'lustful',
414 | 'yellow': 'fearful',
415 | 'zealous': 'attracted',
416 | 'abandoned': 'hated',
417 | 'absolved': 'singled out',
418 | 'absorbed': 'attracted',
419 | 'abused': 'powerless',
420 | 'accepted': 'loved',
421 | 'aching': 'sad',
422 | 'acrimonious': 'angry',
423 | 'addicted': 'codependent',
424 | 'adequate': 'adequate',
425 | 'admired': 'esteemed',
426 | 'affectionate': 'attached',
427 | 'affronted': 'singled out',
428 | 'afraid': 'fearful',
429 | 'airy': 'happy',
430 | 'alone': 'alone',
431 | 'ambivalent': 'bored',
432 | 'apathetic': 'apathetic',
433 | 'apprehensive': 'anxious',
434 | 'arrogant': 'independent',
435 | 'ashamed': 'embarrassed',
436 | 'astonished': 'surprise',
437 | 'at ease': 'safe',
438 | 'attacked': 'fearful',
439 | 'audacious': 'fearless',
440 | 'autonomous': 'free',
441 | 'average': 'average',
442 | 'avid': 'attracted',
443 | 'baffled': 'lost',
444 | 'bashful': 'powerless',
445 | 'belittled': 'belittled',
446 | 'buoyant': 'happy',
447 | 'burdened': 'burdened',
448 | 'clouded': 'sad',
449 | 'committed': 'focused',
450 | 'compassionate': 'attached',
451 | 'compelled': 'obsessed',
452 | 'dauntless': 'fearless',
453 | 'debonair': 'happy',
454 | 'deceived': 'entitled',
455 | 'delighted': 'ecstatic',
456 | 'demoralized': 'demoralized',
457 | 'derailed': 'derailed',
458 | 'desirous': 'attracted',
459 | 'despairing': 'sad',
460 | 'devastated': 'angry',
461 | 'diffident': 'fearful',
462 | 'discredited': 'belittled',
463 | 'disheartened': 'sad',
464 | 'disinclined': 'demoralized',
465 | 'disorganized': 'powerless',
466 | 'downcast': 'sad',
467 | 'entitled': 'entitled',
468 | 'excited': 'adequate',
469 | 'exultant': 'happy',
470 | 'fidgety': 'fearful',
471 | 'frowning': 'sad',
472 | 'full of misgiving': 'anxious',
473 | 'great': 'happy',
474 | 'hapless': 'sad',
475 | 'hated': 'hated',
476 | 'heroic': 'fearless',
477 | 'hostile': 'angry',
478 | 'in despair': 'sad',
479 | 'indifferent': 'bored',
480 | 'infuriated': 'angry',
481 | 'insecure': 'fearful',
482 | 'inspired': 'happy',
483 | 'inspiring': 'attracted',
484 | 'judged': 'singled out',
485 | 'justified': 'singled out',
486 | 'laughting': 'happy',
487 | 'loved': 'loved',
488 | 'loving': 'attached',
489 | 'low': 'sad',
490 | 'lustful': 'lustful',
491 | 'manipulated': 'cheated',
492 | 'mumpish': 'sad',
493 | 'nosey': 'attracted',
494 | 'numb': 'apathetic',
495 | 'obliterated': 'powerless',
496 | 'peaceful': 'happy',
497 | 'petrified': 'fearful',
498 | 'piqued': 'angry',
499 | 'piteous': 'sad',
500 | 'powerless': 'powerless',
501 | 'questioning': 'anxious',
502 | 'rejected': 'hated',
503 | 'self-satisfied': 'happy',
504 | 'set up': 'entitled',
505 | 'shut out': 'alone',
506 | 'sorrowful': 'sad',
507 | 'spirited': 'sad',
508 | 'supported': 'esteemed',
509 | 'suspicious': 'fearful',
510 | 'terrific': 'happy',
511 | 'trapped': 'entitled',
512 | 'trembling': 'fearful',
513 | 'uncomfortable': 'anxious',
514 | 'underestimated': 'belittled',
515 | 'unhappy': 'sad',
516 | 'vindicated': 'singled out',
517 | 'worked up': 'angry'
518 |
--------------------------------------------------------------------------------
/spam.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/training-ml/nlp/217d467c4ed67f8609751530a514833480632039/spam.csv
--------------------------------------------------------------------------------