├── .gitignore
├── README.md
├── assets
│   ├── author.png
│   ├── banner.png
│   ├── nlp_workflow.png
│   ├── qrcode_github.com.png
│   ├── sentiment.jpeg
│   ├── skipgram_arch.png
│   ├── text_repr.png
│   └── wordnet.png
├── notebooks
│   ├── 01_getting_started.ipynb
│   ├── 02_text_representation.ipynb
│   ├── 03_text_classification.ipynb
│   ├── 04_nlp_deeplearning.ipynb
│   ├── answers
│   │   ├── 01_getting_started_answers.ipynb
│   │   └── 02_text_representation_answers.ipynb
│   └── movie_reviews.csv.bz2
└── slides
    ├── text_classification_raghavbali.pdf
    └── text_classification_raghavbali.pptx
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | *.DS_Store
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![banner](assets/banner.png)
2 |
3 | ## Session Schedule
4 | - :date: **Date**: July 8th, 2022
5 | - :alarm_clock: **Time**: 1430 IST
6 | - **Registration**: [1729.world](https://1729.world/)
7 |
8 | ## Session Outline
9 | The workshop is designed to get you started with the world of Natural Language Processing (NLP), with a special focus on the __Text Classification__ task. The following is a list of topics we will cover in this workshop:
10 |
11 | - Introduction to NLP
12 | - Text Preprocessing :desktop_computer:
13 | - Text Representation (the basics) :desktop_computer:
14 | - Text Classification :desktop_computer:
15 | - Deep Learning and NLP :desktop_computer:
16 |
17 | ## About the Presenter
18 | ![author](assets/author.png)
--------------------------------------------------------------------------------
/assets/author.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/author.png
--------------------------------------------------------------------------------
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/banner.png
--------------------------------------------------------------------------------
/assets/nlp_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/nlp_workflow.png
--------------------------------------------------------------------------------
/assets/qrcode_github.com.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/qrcode_github.com.png
--------------------------------------------------------------------------------
/assets/sentiment.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/sentiment.jpeg
--------------------------------------------------------------------------------
/assets/skipgram_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/skipgram_arch.png
--------------------------------------------------------------------------------
/assets/text_repr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/text_repr.png
--------------------------------------------------------------------------------
/assets/wordnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/assets/wordnet.png
--------------------------------------------------------------------------------
/notebooks/01_getting_started.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "nu9PsVUFiLLS"
7 | },
8 | "source": [
9 | "# Getting Started with NLP\n",
10 | "\n",
11 | "[](https://colab.research.google.com/github/raghavbali/workshop_text_classification/blob/main/notebooks/01_getting_started.ipynb)\n",
12 | "\n",
13 | "In this notebook, we will get familiar with the world on NLP. \n",
14 | "Key takeaways from this notebook are:\n",
15 | "\n",
16 | "- Learn how to load a textual dataset\n",
17 | "- Understand the dataset using basic EDA\n",
18 | "- Learn how to perform basic preprocessing/cleanup to prepare the dataset\n",
19 | "\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "id": "F1072mTyi1VM"
27 | },
28 | "source": [
29 | "## Key NLP Libraries\n",
30 | "\n",
31 | "If you have been working in the Data Science/ML domain, you must have a set of _goto_ libraries and tools to do your magic. For instance, libraries like ``sklearn`` , ``xgboost``, etc. are a must have. \n",
32 | "\n",
33 | "Similarly, the NLP domain has its set of favorites. The following are some of the popular ones:\n",
34 | "- ``nltk`` : is a leading platform for building NLP applications. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing utilities.\n",
35 | "- ``gensim`` : is a library for unsupervised topic modeling, document indexing, retrieval by similarity, and other NLP functionalities, using modern statistical machine learning.\n",
36 | "- ``spacy`` : is a library which provides \"Industrial-Strength NLP\" capabilities which scale and are blazingly fast\n",
37 | "- ``fasttext`` : is a library for learning of word embeddings and text classification created by Facebook's AI Research lab.\n",
38 | "- ``huggingface`` 🤗 : is a community and data science platform that provides tools that enable users to build, train and deploy ML models based on open source (OS) code and technologies."
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "7hvjs76wkK72"
45 | },
46 | "source": [
47 | "## Let's Read Some Shakespeare\n",
48 | "\n",
49 | "The __Gutenberg Project__ is an amazing project aimed at providing free access to some of the world's most amazing classical works. This makes it a wonderful source of textual data for NLP practitionars to use and improve their understanding of textual data. Ofcourse you can improve your litrary skills too 😃\n",
50 | "\n",
51 | "``NLTK`` provides us with a nice interface for the _Gutenberg_ project. Apart from some key utilities, this nice and clean interface enables us to access a number of large textual datasets to play with. For this workshop, we will focus on Shakespeare's __Hamlet__."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {
58 | "id": "HLIleBHZh8o7"
59 | },
60 | "outputs": [],
61 | "source": [
62 | "import nltk\n",
63 | "import numpy as np\n",
64 | "import pandas as pd\n",
65 | "from nltk.corpus import gutenberg\n",
66 | "import seaborn as sns\n",
67 | "import re\n",
68 | "\n",
69 | "%matplotlib inline\n",
70 | "pd.options.display.max_columns=10000"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "id": "PMQdAYMElYkt"
78 | },
79 | "outputs": [],
80 | "source": [
81 | "# First things first, download the Gutenberg Project files\n",
82 | "nltk.download('gutenberg')"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "id": "jLmqfYlXlibu"
90 | },
91 | "outputs": [],
92 | "source": [
93 | "# get the text for hamlet\n",
94 | "hamlet_raw = gutenberg.open('shakespeare-hamlet.txt')\n",
95 | "hamlet_raw = hamlet_raw.readlines()"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "id": "iIBUARNOlwHW"
103 | },
104 | "outputs": [],
105 | "source": [
106 | "# Let us print some text\n",
107 | "print(hamlet_raw[:10])"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "id": "W6tB6Z_Gl-N3"
114 | },
115 | "source": [
116 | "## Quick Exploratory Analysis\n",
117 | "\n",
118 | "Just like any other data science problem, the first step is to understand the dataset itself. NLP is no different."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {
125 | "id": "BUL7iz4im_ys"
126 | },
127 | "outputs": [],
128 | "source": [
129 | "# View a Few raw lines of text\n",
130 | "\n",
131 | "# Add your code here"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "id": "28MfrMU-nJBG"
139 | },
140 | "outputs": [],
141 | "source": [
142 | "# Total Number of lines of text in Hamlet\n",
143 | "print(\"Total lines in the book/corpus={}\".format(len(hamlet_raw)))"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "id": "NG6WdhpNnXL9"
151 | },
152 | "outputs": [],
153 | "source": [
154 | "# Total Number of lines of text excluding blank lines\n",
155 | "hamlet_no_blanks = list(filter(None, [item.strip('\\n') \n",
156 | " for item in hamlet_raw]))\n",
157 | "hamlet_no_blanks[:5]"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {
164 | "id": "qJ2eFYw8ntff"
165 | },
166 | "outputs": [],
167 | "source": [
168 | "# Total Number of non-blank lines of text in Hamlet\n",
169 | "\n",
170 | "# Add your code here"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {
176 | "id": "tY_3HQe9wiuB"
177 | },
178 | "source": [
179 | "### How Long are the sentences?"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {
186 | "id": "NUeWqz3QxFAZ"
187 | },
188 | "outputs": [],
189 | "source": [
190 | "line_lengths = [len(sentence) for sentence in hamlet_no_blanks]\n",
191 | "p = sns.kdeplot(line_lengths, shade=True, color='yellow')"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {
197 | "id": "AE1-ElrPxOAd"
198 | },
199 | "source": [
200 | "## Tokenization\n",
201 | "\n",
202 | "Splitting sentences into usable terms/words is an important aspect of preprocessing textual data. Tokenization is thus the process of identifying the right word boundaries."
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": null,
208 | "metadata": {
209 | "id": "nAKYdZPbxg_r"
210 | },
211 | "outputs": [],
212 | "source": [
213 | "# simple tokenizer\n",
214 | "# splitting each sentence to get words\n",
215 | "tokens = [item.split() for item in hamlet_no_blanks]\n",
216 | "print(tokens[:5])"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {
223 | "id": "tbb8xbQKxp_7"
224 | },
225 | "outputs": [],
226 | "source": [
227 | "# Let us visualize the distribution of tokens per sentence\n",
228 | "\n",
229 | "## Add your code here"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {
235 | "id": "Wlwlj9RuyJgw"
236 | },
237 | "source": [
238 | "## A bit more clean-up\n",
239 | "There can be a number of clean-up steps depending upon the kind of dataset and the problem we are solving. \n",
240 | "\n",
241 | "In this case, let us cleanup/remove terms which contain any kind of special characters"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {
248 | "id": "cjH5YWmiybTB"
249 | },
250 | "outputs": [],
251 | "source": [
252 | "# only keeping words and removing special characters\n",
253 | "words = list(filter(None, [re.sub(r'[^A-Za-z]', '', word) for word in words]))\n",
254 | "print(words[:20])"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {
260 | "id": "ZOPPlL__yh1U"
261 | },
262 | "source": [
263 | "## Can you identify Top Occurring words?"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {
270 | "id": "lwVkovvCymBn"
271 | },
272 | "outputs": [],
273 | "source": [
274 | "# Add your code here"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {
280 | "id": "0nm6CS9KytxS"
281 | },
282 | "source": [
283 | "### Stopword Removal\n",
284 | "As you can see from the above output, the top occuring terms are not of much use in terms of understanding the context, etc. In the NLP space, such terms (punctuation marks, prepositions, etc) are termed as stopwords and are typically removed to handle dimensionality and other issues.\n",
285 | "\n",
286 | "Thankfully, ``nltk`` provides a clean utility along with an extensible list of stopwords that we can use straight-away"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "id": "Fo8Te_WfyqqR"
294 | },
295 | "outputs": [],
296 | "source": [
297 | "import nltk \n",
298 | "\n",
299 | "# print a few stop words\n",
300 | "stopwords = nltk.corpus.stopwords.words('english')\n",
301 | "stopwords[:10]"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {
308 | "id": "0UHXmtI4zfR1"
309 | },
310 | "outputs": [],
311 | "source": [
312 | "# Remove stopwords\n",
313 | "words = [word.lower() for word in words if word not in stopwords]"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {
320 | "id": "pJQozwulzlbh"
321 | },
322 | "outputs": [],
323 | "source": [
324 | "# Top Words by occurance after stopword removal\n",
325 | "\n",
326 | "# Add your code here"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {
332 | "id": "gslVjNc7zuWC"
333 | },
334 | "source": [
335 | "## Text Preprocessing\n",
336 | "\n",
337 | "We covered some basics of pre-processing so far, steps such as:\n",
338 | "- Lower-casing\n",
339 | "- Special character removal\n",
340 | "- Stopword removal\n",
341 | "- Removing blank lines and empty spaces\n",
342 | "\n",
343 | "are typically performed time and again. There are a number of other steps as well but those are mostly application dependent."
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {
350 | "id": "l04SS_0k0J5_"
351 | },
352 | "outputs": [],
353 | "source": [
354 | "# A utility function to perform basic cleanup\n",
355 | "def normalize_document(doc):\n",
356 | " # lower case and remove special characters\\whitespaces\n",
357 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n",
358 | " doc = doc.lower()\n",
359 | " doc = doc.strip()\n",
360 | " # tokenize document\n",
361 | " tokens = nltk.word_tokenize(doc)\n",
362 | " # filter stopwords out of document\n",
363 | " filtered_tokens = [token for token in tokens if token not in stopwords]\n",
364 | " # re-create document from filtered tokens\n",
365 | " doc = ' '.join(filtered_tokens)\n",
366 | " return doc"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "normalize_corpus = np.vectorize(normalize_document)\n",
376 | "\n",
377 | "norm_corpus = normalize_corpus(hamlet_raw)\n",
378 | "norm_corpus"
379 | ]
380 | }
381 | ],
382 | "metadata": {
383 | "colab": {
384 | "collapsed_sections": [],
385 | "name": "01_getting_started_answers.ipynb",
386 | "provenance": [],
387 | "toc_visible": true
388 | },
389 | "kernelspec": {
390 | "display_name": "Python 3",
391 | "name": "python3"
392 | },
393 | "language_info": {
394 | "name": "python"
395 | }
396 | },
397 | "nbformat": 4,
398 | "nbformat_minor": 0
399 | }
400 |
--------------------------------------------------------------------------------
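
A note on the "Add your code here" exercises in `01_getting_started.ipynb` above: counting top-occurring tokens is typically done with `collections.Counter` from the standard library. A minimal sketch, assuming the flattened `words` list built in the notebook is available:

```python
from collections import Counter

# `words` is assumed to be the flattened, cleaned token list from the notebook
word_counts = Counter(words)

# ten most frequent tokens (run before and after stopword removal to compare)
for token, count in word_counts.most_common(10):
    print(f"{token}: {count}")
```
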
/notebooks/02_text_representation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "5qqNXxRKU1B8"
7 | },
8 | "source": [
9 | "# Text Representation Techniques\n",
10 | "\n",
11 | "[](https://colab.research.google.com/github/raghavbali/workshop_text_classification/blob/main/notebooks/02_text_representation.ipynb)\n",
12 | "\n",
13 | "In this notebook, we will get familiar with some basic Text Representation Techniques \n",
14 | "Key takeaways from this notebook are:\n",
15 | "\n",
16 | "- Learn how to transform text into usable format using Bag of Words techniques such as:\n",
17 | " - Count Vectorizer\n",
18 | " - TF-IDF\n",
19 | " - Similarity Features\n",
20 | "\n",
21 | ""
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "id": "jxkAwOUwRrF7"
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import nltk\n",
33 | "import numpy as np\n",
34 | "import pandas as pd\n",
35 | "from nltk.corpus import gutenberg\n",
36 | "import seaborn as sns\n",
37 | "import re\n",
38 | "\n",
39 | "%matplotlib inline\n",
40 | "pd.options.display.max_columns=10000"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "id": "IVSjSzvxY4tM"
48 | },
49 | "outputs": [],
50 | "source": [
51 | "# First things first, download the Gutenberg Project files\n",
52 | "nltk.download('gutenberg')\n",
53 | "nltk.download('stopwords')\n",
54 | "nltk.download('punkt')"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "id": "i5egExbqY6q4"
62 | },
63 | "outputs": [],
64 | "source": [
65 | "# get the text for hamlet\n",
66 | "hamlet_raw = gutenberg.open('shakespeare-hamlet.txt')\n",
67 | "hamlet_raw = hamlet_raw.readlines()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "# stop words\n",
77 | "# Add your code here"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {
84 | "id": "g50oL62aY6l5"
85 | },
86 | "outputs": [],
87 | "source": [
88 | "# A utility function to perform basic cleanup\n",
89 | "def normalize_document(doc):\n",
90 | " # lower case and remove special characters\\whitespaces\n",
91 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n",
92 | " \n",
93 | " # Add your code here to lower case\n",
94 | "\n",
95 | " doc = doc.strip()\n",
96 | " # tokenize document\n",
97 | " tokens = nltk.word_tokenize(doc)\n",
98 | " # filter stopwords out of document\n",
99 | " filtered_tokens = [token for token in tokens if token not in stopwords]\n",
100 | " # re-create document from filtered tokens\n",
101 | " doc = ' '.join(filtered_tokens)\n",
102 | " return doc"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {
109 | "id": "A4-Kmr0KZUAl"
110 | },
111 | "outputs": [],
112 | "source": [
113 | "normalize_corpus = np.vectorize(normalize_document)\n",
114 | "\n",
115 | "norm_corpus = normalize_corpus(hamlet_raw)\n",
116 | "norm_corpus"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {
122 | "id": "Irxq6wRwZJc1"
123 | },
124 | "source": [
125 | "## Bag of Words : Term Frequency\n",
126 | "A simple vector space representational model for text data. A vector space model is simply a mathematical model for transforming text as numeric vectors, such that each dimension of the vector is a specific feature\\attribute. The bag of words model represents each text document as a numeric vector where each dimension(column) is a specific word from the vocabulary and the value could be its frequency in the document. The model’s name is such because each document is represented literally as a ‘bag’ of its own words, disregarding word orders, sequences and grammar."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "id": "Kn4N1UgpZLUz"
134 | },
135 | "outputs": [],
136 | "source": [
137 | "from sklearn.feature_extraction.text import CountVectorizer"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "id": "6LktkOWdZNlq"
145 | },
146 | "outputs": [],
147 | "source": [
148 | "cv = CountVectorizer(min_df=0., max_df=1.)\n",
149 | "cv_matrix = cv.fit_transform(norm_corpus)\n",
150 | "cv_matrix = cv_matrix.toarray()\n",
151 | "cv_matrix"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "id": "hPVvuidsZNjD"
159 | },
160 | "outputs": [],
161 | "source": [
162 | "cv_matrix.shape"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "id": "DZDQ8_zkZNgG"
170 | },
171 | "outputs": [],
172 | "source": [
173 | "vocab = cv.get_feature_names()"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "id": "Wd68S8B3Zbhw"
181 | },
182 | "outputs": [],
183 | "source": [
184 | "# show document feature vectors\n",
185 | "pd.DataFrame(cv_matrix, columns=vocab).head()"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "id": "HbE9z4Moa8dy"
192 | },
193 | "source": [
194 | "## TF-IDF\n",
195 | "Using absolute frequency counts as a measure of importance has its shortcomings. One potential issue is that there might be some terms which occur frequently across all documents and these may tend to overshadow other terms in the feature set. The TF-IDF model tries to combat this issue by using a normalizing factor. TF-IDF or Term Frequency-Inverse Document Frequency, uses a combination of two metrics in its computation, namely: __term frequency (tf)__ and __inverse document frequency (idf)__.\n",
196 | "\n",
197 | "Mathematically, we can define TF-IDF as\n",
198 | "\n",
199 | "``TF-IDF = tf x idf``\n",
200 | "\n",
201 | "Where, each element in the TF-IDF matrix is the score for word w in document D.\n",
202 | "\n",
203 | "The term **tf(w, D)** represents the term frequency of the word **w** in document **D**, which can be obtained from the Bag of Words model.\n",
204 | "The term idf(w, D) is the inverse document frequency for the term w, which can be computed as the log transform of the total number of documents in the corpus C divided by the document frequency of the word w, in other words it is the frequency of documents in the corpus where the word w occurs."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "id": "8rBwfbPubTv5"
212 | },
213 | "outputs": [],
214 | "source": [
215 | "from sklearn.feature_extraction.text import TfidfVectorizer"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "id": "Vs9uk2o8bZwh"
223 | },
224 | "outputs": [],
225 | "source": [
226 | "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n",
227 | "tv_matrix = tv.fit_transform(norm_corpus)\n",
228 | "tv_matrix = tv_matrix.toarray()\n",
229 | "tv_matrix.shape"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {
236 | "id": "yFHSglOybZnX"
237 | },
238 | "outputs": [],
239 | "source": [
240 | "# Add your code here to ger feature names\n",
241 | "\n",
242 | "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab).head()"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {
248 | "id": "6L2dbC3acLSt"
249 | },
250 | "source": [
251 | "## Bag of N-Grams Model\n",
252 | "A word is just a single token, often known as a **unigram** or 1-gram. We already know that the Bag of Words model doesn’t consider order of words. But what if we also wanted to take into account phrases or collection of words which occur in a sequence? **N-grams** help us achieve that. An N-gram is basically a collection of word tokens from a text document such that these tokens are contiguous and occur in a sequence. Bi-grams indicate n-grams of order 2 (two words), Tri-grams indicate n-grams of order 3 (three words), and so on. The Bag of N-Grams model is hence just an extension of the Bag of Words model so we can also leverage N-gram based features. The following example depicts bi-gram based features in each document feature vector."
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {
259 | "id": "UkAd3s4LcThI"
260 | },
261 | "outputs": [],
262 | "source": [
263 | "# you can set the n-gram range to 1,2 to get unigrams as well as bigrams\n",
264 | "bv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True,ngram_range=(2,2))\n",
265 | "bv_matrix = bv.fit_transform(norm_corpus)\n",
266 | "\n",
267 | "bv_matrix = bv_matrix.toarray()\n",
268 | "bv_matrix.shape"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "id": "gGGohxNycgrG"
276 | },
277 | "outputs": [],
278 | "source": [
279 | "vocab = bv.get_feature_names()\n",
280 | "pd.DataFrame(bv_matrix, columns=vocab).head()"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {
286 | "id": "ikWNCAPHcqIU"
287 | },
288 | "source": [
289 | "## Similarity Based Features (Bonus)\n",
290 | "Now that we have a method to transform text into vector form, we can now build on top of such features we engineered to generate new features which can be useful in domains like search engines, document clustering and information retrieval by leveraging these similarity based features.\n",
291 | "\n",
292 | "Pairwise document/sentence/term similarity in a corpus involves computing similarity for each pair of entities in a corpus. Thus if we have N entities in a corpus, we would end up with a N x N matrix such that each row and column represents the similarity score for a given pair. \n",
293 | "\n",
294 | "There are several similarity and distance metrics that are used to compute similarity. These include :\n",
295 | "- cosine distance/similarity, \n",
296 | "- euclidean distance, \n",
297 | "- manhattan distance, \n",
298 | "- BM25 similarity, \n",
299 | "- jaccard distance and so on. "
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "id": "kmpC-Z0sdt_I"
307 | },
308 | "outputs": [],
309 | "source": [
310 | "from sklearn.metrics.pairwise import cosine_similarity"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {
317 | "id": "fMjIhdzhdt8D"
318 | },
319 | "outputs": [],
320 | "source": [
321 | "similarity_matrix = cosine_similarity(tv_matrix)\n",
322 | "similarity_matrix"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {
329 | "id": "i2qwGs6ndt37"
330 | },
331 | "outputs": [],
332 | "source": [
333 | "similarity_df = pd.DataFrame(similarity_matrix)\n",
334 | "similarity_df.head()"
335 | ]
336 | }
337 | ],
338 | "metadata": {
339 | "colab": {
340 | "collapsed_sections": [],
341 | "name": "02_text_representation.ipynb",
342 | "provenance": []
343 | },
344 | "kernelspec": {
345 | "display_name": "Python 3",
346 | "name": "python3"
347 | },
348 | "language_info": {
349 | "name": "python"
350 | }
351 | },
352 | "nbformat": 4,
353 | "nbformat_minor": 0
354 | }
355 |
--------------------------------------------------------------------------------
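
To make the ``tfidf(w, D) = tf(w, D) x idf(w)`` idea from `02_text_representation.ipynb` concrete, here is a small self-contained sketch on a made-up three-sentence corpus. It contrasts raw term counts with TF-IDF weights; `get_feature_names_out()` assumes scikit-learn >= 1.0, whereas the notebook itself uses the older `get_feature_names()`.

```python
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# toy corpus purely for illustration
toy_corpus = [
    'the sky is blue',
    'the sun is bright',
    'the sun in the blue sky is bright',
]

# raw term frequencies (Bag of Words)
cv = CountVectorizer()
tf = pd.DataFrame(cv.fit_transform(toy_corpus).toarray(),
                  columns=cv.get_feature_names_out())

# TF-IDF: words appearing in every document (e.g. 'the', 'is') get downweighted
tv = TfidfVectorizer()
tfidf = pd.DataFrame(tv.fit_transform(toy_corpus).toarray().round(2),
                     columns=tv.get_feature_names_out())

print(tf)
print(tfidf)
```
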
/notebooks/03_text_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "UqL0UBCHg7hg"
7 | },
8 | "source": [
9 | "# Text Classification\n",
10 | "\n",
11 | "[](https://colab.research.google.com/github/raghavbali/workshop_text_classification/blob/main/notebooks/03_text_classification.ipynb)\n",
12 | "\n",
13 | "\n",
14 | "Photo by Count Chris on Unsplash\n",
15 | " \n",
16 | "\n",
17 | "In this notebook, we will leverage the preprocessing and representation techniques and apply them for a text classification use-case. In this notebook, we will cover:\n",
18 | "\n",
19 | "- Apply cleanup and transform text data into a vector form\n",
20 | "- Work through a text classification use-case\n",
21 | "\n",
22 | "\n",
23 | "### Text classification can have a number of applications, such as:\n",
24 | "- Document categorization\n",
25 | "- Spam vs Ham\n",
26 | "- Review Classification\n",
27 | "- Fake Vs Actual News\n",
28 | "- Sentiment Classification and so on...\n",
29 | "\n",
30 | "\n",
31 | "Dataset for this hands-on :\n",
32 | "- [IMDB Movie Review Dataset](https://huggingface.co/datasets/imdb)\n",
33 | "\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "id": "gXJy38mdjZuT"
40 | },
41 | "source": [
42 | "## Install Dependencies"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {
49 | "id": "nWy_qyhngCGP"
50 | },
51 | "outputs": [],
52 | "source": [
53 | "!pip install contractions\n",
54 | "!pip install tqdm"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "id": "CHcgchhEjeET"
61 | },
62 | "source": [
63 | "## Import Libraries"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {
70 | "id": "bHGTr-vljgko"
71 | },
72 | "outputs": [],
73 | "source": [
74 | "import nltk\n",
75 | "import contractions\n",
76 | "from bs4 import BeautifulSoup\n",
77 | "import numpy as np\n",
78 | "import re\n",
79 | "from tqdm.notebook import tqdm\n",
80 | "import unicodedata\n",
81 | "import pandas as pd"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {
88 | "id": "KYx0yihzjgh_"
89 | },
90 | "outputs": [],
91 | "source": [
92 | "nltk.download('punkt')"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {
98 | "id": "kpdYVBNgjzLk"
99 | },
100 | "source": [
101 | "## Get Data\n",
102 | "We will make use of the movie review dataset for this tutorial"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {
109 | "id": "g2QAXU-jjgfX"
110 | },
111 | "outputs": [],
112 | "source": [
113 | "dataset = pd.read_csv(r'movie_reviews.csv.bz2')\n",
114 | "dataset.info()"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "id": "Xqy9dJHWj-XT"
122 | },
123 | "outputs": [],
124 | "source": [
125 | "dataset.head()"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {
131 | "id": "8Z0dHnjmkAbq"
132 | },
133 | "source": [
134 | "### Prepare Train-Test Splits"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "id": "Wi4P8D5okE6b"
142 | },
143 | "outputs": [],
144 | "source": [
145 | "# build train and test datasets\n",
146 | "reviews = dataset['review'].values\n",
147 | "sentiments = dataset['sentiment'].values"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "id": "dNZgFhLvkE4S"
155 | },
156 | "outputs": [],
157 | "source": [
158 | "train_reviews = reviews[:35000]\n",
159 | "train_sentiments = sentiments[:35000]"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "id": "4rFcAlLYkE1G"
167 | },
168 | "outputs": [],
169 | "source": [
170 | "test_reviews = reviews[35000:]\n",
171 | "test_sentiments = sentiments[35000:]"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {
177 | "id": "StrRCsxjkNZW"
178 | },
179 | "source": [
180 | "## Text Preprocessing\n",
181 | "- Remove HTML/Special Characters\n",
182 | "- Remove accented characters\n",
183 | "- Lowercase"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "id": "vhv0p91EkTl8"
191 | },
192 | "outputs": [],
193 | "source": [
194 | "def strip_html_tags(text):\n",
195 | " soup = BeautifulSoup(text, \"html.parser\")\n",
196 | " [s.extract() for s in soup(['iframe', 'script'])]\n",
197 | " stripped_text = soup.get_text()\n",
198 | " stripped_text = re.sub(r'[\\r|\\n|\\r\\n]+', '\\n', stripped_text)\n",
199 | " return stripped_text\n",
200 | "\n",
201 | "def remove_accented_chars(text):\n",
202 | " text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n",
203 | " return text\n",
204 | "\n",
205 | "def pre_process_corpus(docs):\n",
206 | " norm_docs = []\n",
207 | " for doc in tqdm(docs):\n",
208 | " doc = strip_html_tags(doc)\n",
209 | " doc = doc.translate(doc.maketrans(\"\\n\\t\\r\", \" \"))\n",
210 | " doc = doc.lower()\n",
211 | " doc = remove_accented_chars(doc)\n",
212 | " doc = contractions.fix(doc)\n",
213 | " # lower case and remove special characters\\whitespaces\n",
214 | " doc = re.sub(r'[^a-zA-Z0-9\\s]', '', doc, re.I|re.A)\n",
215 | " doc = re.sub(' +', ' ', doc)\n",
216 | " doc = doc.strip() \n",
217 | " norm_docs.append(doc)\n",
218 | " \n",
219 | " return norm_docs"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "id": "9ujpM3YrkiTb"
227 | },
228 | "outputs": [],
229 | "source": [
230 | "%%time\n",
231 | "norm_train_reviews = pre_process_corpus(train_reviews)\n",
232 | "norm_test_reviews = pre_process_corpus(test_reviews)"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {
238 | "id": "utycIPiukpuq"
239 | },
240 | "source": [
241 | "## Feature Engineering"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {
248 | "id": "KvHnLU18kszI"
249 | },
250 | "outputs": [],
251 | "source": [
252 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {
259 | "id": "6BMIpQWTkt-Z"
260 | },
261 | "outputs": [],
262 | "source": [
263 | "# build BOW features on train reviews\n",
264 | "cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))\n",
265 | "cv_train_features = cv.fit_transform(norm_train_reviews)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {
272 | "id": "0wpJkPeskt7y"
273 | },
274 | "outputs": [],
275 | "source": [
276 | "# build TFIDF features on train reviews\n",
277 | "tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),\n",
278 | " sublinear_tf=True)\n",
279 | "tv_train_features = tv.fit_transform(norm_train_reviews)"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "id": "KPVPu2OUkt4X"
287 | },
288 | "outputs": [],
289 | "source": [
290 | "%%time\n",
291 | "\n",
292 | "# transform test reviews into features\n",
293 | "cv_test_features = cv.transform(norm_test_reviews)\n",
294 | "tv_test_features = tv.transform(norm_test_reviews)"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {
300 | "id": "5EiZUgr-k7EG"
301 | },
302 | "source": [
303 | "## Classification Model: Logistic Regression\n",
304 | "\n",
305 | "Also known as the logit or logistic model, it uses the logistic (popularly also known as sigmoid) mathematical function to estimate the parameter values. These are the coefficients of all our features such that the overall loss is minimized when predicting the outcome"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "id": "q5wmaYmelEh1"
313 | },
314 | "outputs": [],
315 | "source": [
316 | "# Logistic Regression model on BOW features\n",
317 | "from sklearn.linear_model import LogisticRegression\n",
318 | "from sklearn.metrics import classification_report, confusion_matrix"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {
324 | "id": "CgC2ZKRKlomp"
325 | },
326 | "source": [
327 | "### LR with Count Vectorizer"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "id": "_ZBrjG8GlIvV"
335 | },
336 | "outputs": [],
337 | "source": [
338 | "# instantiate model\n",
339 | "lr_cv = LogisticRegression(penalty='l2', \n",
340 | " max_iter=500, \n",
341 | " C=1, \n",
342 | " solver='lbfgs', \n",
343 | " random_state=42)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {
350 | "id": "IXujYFTKlIkp"
351 | },
352 | "outputs": [],
353 | "source": [
354 | "## Train with CountVectorizer Features\n",
355 | "# train model\n",
356 | "lr_cv.fit(cv_train_features, train_sentiments)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "metadata": {
363 | "id": "Ioq62lEtlRPL"
364 | },
365 | "outputs": [],
366 | "source": [
367 | "# predict on test data\n",
368 | "lr_bow_predictions = lr_cv.predict(cv_test_features)"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {
374 | "id": "wLu9mg4nlVkD"
375 | },
376 | "source": [
377 | "### Evaluate Model"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {
384 | "id": "-rmXkrnRlYH1"
385 | },
386 | "outputs": [],
387 | "source": [
388 | "print(classification_report(test_sentiments, lr_bow_predictions))"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {
395 | "id": "RgWlqugblZrc"
396 | },
397 | "outputs": [],
398 | "source": [
399 | "labels = ['negative', 'positive']\n",
400 | "pd.DataFrame(confusion_matrix(test_sentiments, lr_bow_predictions), \n",
401 | " index=labels, columns=labels)"
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {
407 | "id": "mi3_BaRMluBw"
408 | },
409 | "source": [
410 | "### LR with TFIDF"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {
417 | "id": "5DuaWtgalvlw"
418 | },
419 | "outputs": [],
420 | "source": [
421 | "# instantiate model\n",
422 | "lr_tv = LogisticRegression(penalty='l2', \n",
423 | " max_iter=500, \n",
424 | " C=1, \n",
425 | " solver='lbfgs', \n",
426 | " random_state=42)"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {
433 | "id": "tGTTCiIblyZA"
434 | },
435 | "outputs": [],
436 | "source": [
437 | "## Train with CountVectorizer Features\n",
438 | "# train model\n",
439 | "lr_tv.fit(tv_train_features, train_sentiments)"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "# predict on test data\n",
449 | "lr_tfidf_predictions = lr_tv.predict(tv_test_features)"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {
455 | "id": "LAbEFoF0l7ye"
456 | },
457 | "source": [
458 | "### Evaluate Model"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "metadata": {
465 | "id": "qx1ER0fll-Zy"
466 | },
467 | "outputs": [],
468 | "source": [
469 | "print(classification_report(test_sentiments, lr_tfidf_predictions))"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": null,
475 | "metadata": {
476 | "id": "T-yqZjmWl_Qn"
477 | },
478 | "outputs": [],
479 | "source": [
480 | "labels = ['negative', 'positive']\n",
481 | "pd.DataFrame(confusion_matrix(test_sentiments, lr_tfidf_predictions), \n",
482 | " index=labels, columns=labels)"
483 | ]
484 | }
485 | ],
486 | "metadata": {
487 | "colab": {
488 | "collapsed_sections": [],
489 | "name": "03_text_classification.ipynb",
490 | "provenance": [],
491 | "toc_visible": true
492 | },
493 | "kernelspec": {
494 | "display_name": "Python 3",
495 | "name": "python3"
496 | },
497 | "language_info": {
498 | "name": "python"
499 | }
500 | },
501 | "nbformat": 4,
502 | "nbformat_minor": 0
503 | }
504 |
--------------------------------------------------------------------------------
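
The vectorize-then-classify steps in `03_text_classification.ipynb` can also be wrapped in a single scikit-learn `Pipeline`, which keeps the vectorizer and classifier fitted together and avoids leaking the test vocabulary into training. A minimal sketch on made-up toy reviews (the notebook's `norm_train_reviews`/`train_sentiments` could be substituted directly):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# toy data purely to illustrate the pipeline shape
train_texts = ['loved this movie', 'what a great film',
               'terrible acting', 'boring and far too long']
train_labels = ['positive', 'positive', 'negative', 'negative']

clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr', LogisticRegression(max_iter=500, random_state=42)),
])
clf.fit(train_texts, train_labels)

print(clf.predict(['a great movie', 'painfully boring']))
```
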
/notebooks/04_nlp_deeplearning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "JaWAu5ujnB4g"
7 | },
8 | "source": [
9 | "# Word Embeddings and Deep Learning for NLP\n",
10 | "\n",
11 | "[](https://colab.research.google.com/github/raghavbali/workshop_text_classification/blob/main/notebooks/04_nlp_deeplearning.ipynb)\n",
12 | "\n",
13 | "This notebook covers some of basic steps involved in using Deep Learning for NLP. This notebook covers:\n",
14 | "\n",
15 | "A brief overview of Word2Vec based Embeddings.\n",
16 | "A brief on HuggingFace Transformer :hugs: based implementation of NLP tasks\n",
17 | "Note: This is just an overview and not an exhaustive material on NLP with Deep Learning"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "0h_gkXlco7E9"
24 | },
25 | "source": [
26 | "## Text Representation using Word2Vec"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "v8rbvEvSpKj2"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import pandas as pd\n",
38 | "import numpy as np\n",
39 | "from sklearn.datasets import fetch_20newsgroups\n",
40 | "\n",
41 | "pd.options.display.max_colwidth = 200"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {
47 | "id": "Cna9YhwkpMX-"
48 | },
49 | "source": [
50 | "### Get Dataset"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "id": "XFJs5ovFm-LW"
58 | },
59 | "outputs": [],
60 | "source": [
61 | "categories = ['alt.atheism', 'comp.graphics', 'sci.med']"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "id": "wgexPcuFpUjs"
69 | },
70 | "outputs": [],
71 | "source": [
72 | "twenty_corpus = fetch_20newsgroups(subset='train',\n",
73 | " categories=categories, shuffle=True, random_state=42)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "id": "Mt4n_TZWpWa_"
81 | },
82 | "outputs": [],
83 | "source": [
84 | "[news.split('\\n')[1] for news in twenty_corpus.data[:10]]"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "id": "qVhpKEv1pYX1"
92 | },
93 | "outputs": [],
94 | "source": [
95 | "twenty_corpus.target[:10]"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "id": "swQzWs1qpa_4"
103 | },
104 | "outputs": [],
105 | "source": [
106 | "corpus = [news.split('\\n')[1] for news in twenty_corpus.data]\n",
107 | "labels = [categories[i] for i in twenty_corpus.target]"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "id": "PoU7a4z0pa9C"
115 | },
116 | "outputs": [],
117 | "source": [
118 | "corpus = np.array(corpus)\n",
119 | "corpus_df = pd.DataFrame({'Document': corpus, \n",
120 | " 'Category': labels})\n",
121 | "corpus_df = corpus_df[['Document', 'Category']]\n",
122 | "corpus_df"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {
128 | "id": "C35Ma3AbphM2"
129 | },
130 | "source": [
131 | "### Preprocess Dataset"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "id": "HXdZeOx7pdf0"
139 | },
140 | "outputs": [],
141 | "source": [
142 | "import nltk\n",
143 | "import re\n",
144 | "nltk.download('stopwords')\n",
145 | "nltk.download('punkt')"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {
152 | "id": "DInq975npddO"
153 | },
154 | "outputs": [],
155 | "source": [
156 | "def normalize_document(doc):\n",
157 | " # lower case and remove special characters\\whitespaces\n",
158 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n",
159 | " doc = doc.lower()\n",
160 | " doc = doc.strip()\n",
161 | " # tokenize document\n",
162 | " tokens = nltk.word_tokenize(doc)\n",
163 | " # filter stopwords out of document\n",
164 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n",
165 | " # re-create document from filtered tokens\n",
166 | " doc = ' '.join(filtered_tokens)\n",
167 | " return doc"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "id": "4AGfGfxPpdaS"
175 | },
176 | "outputs": [],
177 | "source": [
178 | "stop_words = nltk.corpus.stopwords.words('english')\n",
179 | "normalize_corpus = np.vectorize(normalize_document)\n",
180 | "\n",
181 | "norm_corpus = normalize_corpus(corpus)\n",
182 | "norm_corpus"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {
188 | "id": "ARyquvKiptSL"
189 | },
190 | "source": [
191 | "### Train Word2Vec Model"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {
197 | "id": "y1m_QYz8qT96"
198 | },
199 | "source": [
200 | "The Skip-gram model architecture usually tries to achieve the reverse of what the CBOW model does. It tries to predict the source context words (surrounding words) given a target word (the center word).\n",
201 | "\n",
202 | "Considering our simple sentence from earlier, “the quick brown fox jumps over the lazy dog”. If we used the CBOW model, we get pairs of (context_window, target_word) where if we consider a context window of size 2, we have examples like ([quick, fox], brown), ([the, brown], quick), ([the, dog], lazy) and so on.\n",
203 | "\n",
204 | "Now considering that the skip-gram model’s aim is to predict the context from the target word, the model typically inverts the contexts and targets, and tries to predict each context word from its target word. Hence the task becomes to predict the context [quick, fox] given target word ‘brown’ or [the, brown] given target word ‘quick’ and so on.\n",
205 | "\n",
206 | "Thus the model tries to predict the context_window words based on the target_word.\n",
207 | "\n",
208 | "\n",
209 | ""
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "id": "0hKGNvMopwVV"
217 | },
218 | "outputs": [],
219 | "source": [
220 | "import nltk\n",
221 | "from gensim.models import word2vec"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "id": "Pun4KemLpwSs"
229 | },
230 | "outputs": [],
231 | "source": [
232 | "tokenized_corpus = [nltk.word_tokenize(doc) for doc in norm_corpus]"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Gensim\n",
240 | "The gensim framework, created by Radim Řehůřek consists of a robust, efficient and scalable implementation of the Word2Vec model. We will leverage the same on our sample toy corpus. In our workflow, we will tokenize our normalized corpus and then focus on the following four parameters in the Word2Vec model to build it.\n",
241 | "\n",
242 | "size: The word embedding dimensionality\n",
243 | "window: The context window size\n",
244 | "min_count: The minimum word count\n",
245 | "sample: The downsample setting for frequent words\n",
246 | "sg: Training model, 1 for skip-gram otherwise CBOW\n",
247 | "We will build a simple Word2Vec model on the corpus and visualize the embeddings."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {
254 | "id": "eZ2yy0YfpwQO"
255 | },
256 | "outputs": [],
257 | "source": [
258 | "# Set values for various parameters\n",
259 | "feature_size = 15 # Word vector dimensionality \n",
260 | "window_context = 20 # Context window size \n",
261 | "min_word_count = 1 # Minimum word count \n",
262 | "sample = 1e-3 # Downsample setting for frequent words\n",
263 | "sg = 1 # skip-gram model\n",
264 | "\n",
265 | "w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, \n",
266 | " window=window_context, min_count = min_word_count,\n",
267 | " sg=sg, sample=sample, iter=5000)\n"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {
273 | "id": "ADkVnAVzp0YD"
274 | },
275 | "source": [
276 | "### Visualize Embeddings"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "id": "39ig6TP-p3LE"
284 | },
285 | "outputs": [],
286 | "source": [
287 | "from matplotlib import pyplot as plt\n",
288 | "%matplotlib inline\n",
289 | "# visualize embeddings\n",
290 | "from sklearn.manifold import TSNE"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": null,
296 | "metadata": {
297 | "id": "7SkvFPZWp40x"
298 | },
299 | "outputs": [],
300 | "source": [
301 | "words = w2v_model.wv.index2word\n",
302 | "wvs = w2v_model.wv[words]\n",
303 | "\n",
304 | "tsne = TSNE(n_components=2, random_state=42, n_iter=5000, perplexity=5)\n",
305 | "np.set_printoptions(suppress=True)\n",
306 | "T = tsne.fit_transform(wvs)\n",
307 | "labels = words\n",
308 | "\n",
309 | "plt.figure(figsize=(12, 6))\n",
310 | "plt.scatter(T[:, 0], T[:, 1], c='blue')\n",
311 | "for label, x, y in zip(labels, T[:, 0], T[:, 1]):\n",
312 | " plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "w2v_model.similar_by_word('tradition')"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "w2v_model.doesnt_match(['tradition','medical','war'])"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "w2v_model.wv.most_similar(positive=[\"homeopathy\", \"medical\"], negative=[\"political\"], topn=3)"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {
345 | "id": "EwgNs0sSp6Xk"
346 | },
347 | "source": [
348 | "## Transformers 🤗"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "id": "XcQXF_tjqAdk"
356 | },
357 | "outputs": [],
358 | "source": [
359 | "!pip install transformers"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {
366 | "id": "_Ovs8dbmqD9b"
367 | },
368 | "outputs": [],
369 | "source": [
370 | "from transformers import pipeline"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "id": "m71pFXF-qFf4"
378 | },
379 | "outputs": [],
380 | "source": [
381 | "classifier = pipeline('sentiment-analysis')"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {
388 | "id": "EDUhgwEnqGxH"
389 | },
390 | "outputs": [],
391 | "source": [
392 | "classifier.tokenizer('We are having some fun at 1729 while learning text classification')"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {
399 | "id": "a8Xin8auqIMN"
400 | },
401 | "outputs": [],
402 | "source": [
403 | "classifier('We are having some fun at 1729 while learning text classification')"
404 | ]
405 | }
406 | ],
407 | "metadata": {
408 | "colab": {
409 | "collapsed_sections": [],
410 | "name": "04_nlp_deeplearning.ipynb",
411 | "provenance": [],
412 | "toc_visible": true
413 | },
414 | "kernelspec": {
415 | "display_name": "Python 3",
416 | "name": "python3"
417 | },
418 | "language_info": {
419 | "name": "python"
420 | }
421 | },
422 | "nbformat": 4,
423 | "nbformat_minor": 0
424 | }
425 |
--------------------------------------------------------------------------------
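
One common bridge between the Word2Vec embeddings in `04_nlp_deeplearning.ipynb` and a downstream classifier is to average the word vectors of each document. A minimal sketch on a made-up tokenized corpus; note it uses the gensim >= 4 parameter names (`vector_size`, `epochs`), whereas the notebook uses the older gensim 3.x names (`size`, `iter`):

```python
import numpy as np
from gensim.models import Word2Vec

# toy tokenized corpus purely for illustration
tokenized_corpus = [
    ['graphics', 'card', 'renders', 'images'],
    ['doctor', 'prescribed', 'new', 'medicine'],
    ['atheism', 'debate', 'about', 'religion'],
]

w2v = Word2Vec(tokenized_corpus, vector_size=15, window=5,
               min_count=1, sg=1, epochs=50)

def document_vector(tokens, model):
    """Average the vectors of the tokens found in the model's vocabulary."""
    vectors = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

doc_features = np.vstack([document_vector(doc, w2v) for doc in tokenized_corpus])
print(doc_features.shape)  # (3, 15) -> ready to feed into a classifier
```
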
/notebooks/answers/01_getting_started_answers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "nu9PsVUFiLLS"
7 | },
8 | "source": [
9 | "# Getting Started with NLP\n",
10 | "\n",
11 | "In this notebook, we will get familiar with the world on NLP. \n",
12 | "Key takeaways from this notebook are:\n",
13 | "\n",
14 | "- Learn how to load a textual dataset\n",
15 | "- Understand the dataset using basic EDA\n",
16 | "- Learn how to perform basic preprocessing/cleanup to prepare the dataset\n",
17 | "\n",
18 | "```shell\n",
19 | "Add image from the slide deck\n",
20 | "```"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "id": "F1072mTyi1VM"
27 | },
28 | "source": [
29 | "## Key NLP Libraries\n",
30 | "\n",
31 | "If you have been working in the Data Science/ML domain, you must have a set of _goto_ libraries and tools to do your magic. For instance, libraries like ``sklearn`` , ``xgboost``, etc. are a must have. \n",
32 | "\n",
33 | "Similarly, the NLP domain has its set of favorites. The following are some of the popular ones:\n",
34 | "- ``nltk``\n",
35 | "- ``gensim``\n",
36 | "- ``spacy`` \n",
37 | "- ``fasttext``\n",
38 | "- ``huggingface`` 🤗\n",
39 | "\n",
40 | "```shell\n",
41 | "$> Add library logos\n",
42 | "$> Add one liners for each \n",
43 | "```"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {
49 | "id": "7hvjs76wkK72"
50 | },
51 | "source": [
52 | "## Let's Read Some Shakespeare\n",
53 | "\n",
54 | "The __Gutenberg Project__ is an amazing project aimed at providing free access to some of the world's most amazing classical works. This makes it a wonderful source of textual data for NLP practitionars to use and improve their understanding of textual data. Ofcourse you can improve your litrary skills too 😃\n",
55 | "\n",
56 | "``NLTK`` provides us with a nice interface for the _Gutenberg_ project. Apart from some key utilities, this nice and clean interface enables us to access a number of large textual datasets to play with. For this workshop, we will focus on Shakespeare's __Hamlet__."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "id": "HLIleBHZh8o7"
64 | },
65 | "outputs": [],
66 | "source": [
67 | "import nltk\n",
68 | "import numpy as np\n",
69 | "import pandas as pd\n",
70 | "from nltk.corpus import gutenberg\n",
71 | "import seaborn as sns\n",
72 | "import re\n",
73 | "\n",
74 | "%matplotlib inline\n",
75 | "pd.options.display.max_columns=10000"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "id": "PMQdAYMElYkt"
83 | },
84 | "outputs": [],
85 | "source": [
86 | "# First things first, download the Gutenberg Project files\n",
87 | "nltk.download('gutenberg')"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "id": "jLmqfYlXlibu"
95 | },
96 | "outputs": [],
97 | "source": [
98 | "# get the text for hamlet\n",
99 | "hamlet_raw = gutenberg.open('shakespeare-hamlet.txt')\n",
100 | "hamlet_raw = hamlet_raw.readlines()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "id": "iIBUARNOlwHW"
108 | },
109 | "outputs": [],
110 | "source": [
111 | "# Let us print some text\n",
112 | "print(hamlet_raw[:10])"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {
118 | "id": "W6tB6Z_Gl-N3"
119 | },
120 | "source": [
121 | "## Quick Exploratory Analysis\n",
122 | "\n",
123 | "Just like any other data science problem, the first step is to understand the dataset itself. NLP is no different."
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {
130 | "id": "BUL7iz4im_ys"
131 | },
132 | "outputs": [],
133 | "source": [
134 | "# View a Few raw lines of text\n",
135 | "\n",
136 | "# Add your code here\n",
137 | "hamlet_raw[:10]"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "id": "28MfrMU-nJBG"
145 | },
146 | "outputs": [],
147 | "source": [
148 | "# Total Number of lines of text in Hamlet\n",
149 | "print(\"Total lines in the book/corpus={}\".format(len(hamlet_raw)))"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "id": "NG6WdhpNnXL9"
157 | },
158 | "outputs": [],
159 | "source": [
160 | "# Total Number of lines of text excluding blank lines\n",
161 | "hamlet_no_blanks = list(filter(None, [item.strip('\\n') \n",
162 | " for item in hamlet_raw]))\n",
163 | "hamlet_no_blanks[:5]"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "metadata": {
170 | "id": "qJ2eFYw8ntff"
171 | },
172 | "outputs": [],
173 | "source": [
174 | "# Total Number of non-blank lines of text in Hamlet\n",
175 | "\n",
176 | "# Add your code here\n",
177 | "print(\"Updated lines in the book/corpus={} after removing blank lines\".format(len(hamlet_no_blanks)))"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {
183 | "id": "tY_3HQe9wiuB"
184 | },
185 | "source": [
186 | "### How Long are the sentences?"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {
193 | "id": "NUeWqz3QxFAZ"
194 | },
195 | "outputs": [],
196 | "source": [
197 | "line_lengths = [len(sentence) for sentence in hamlet_no_blanks]\n",
198 | "p = sns.kdeplot(line_lengths, shade=True, color='yellow')"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {
204 | "id": "AE1-ElrPxOAd"
205 | },
206 | "source": [
207 | "## Tokenization\n",
208 | "\n",
209 | "Splitting sentences into usable terms/words is an important aspect of preprocessing textual data. Tokenization is thus the process of identifying the right word boundaries."
210 | ]
211 | },
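  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick illustrative sketch (the sentence below is made up purely for illustration), the next cell contrasts a naive ``split()`` with ``nltk.word_tokenize``, which handles punctuation and contractions; it assumes the ``punkt`` models have been downloaded via ``nltk.download``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a minimal sketch: naive whitespace splitting vs. NLTK's word tokenizer\n",
    "sample = \"Something's rotten in the state of Denmark, isn't it?\"\n",
    "print(sample.split())\n",
    "print(nltk.word_tokenize(sample))"
   ]
  },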
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "id": "nAKYdZPbxg_r"
217 | },
218 | "outputs": [],
219 | "source": [
220 | "# simple tokenizer\n",
221 | "# splitting each sentence to get words\n",
222 | "tokens = [item.split() for item in hamlet_no_blanks]\n",
223 | "print(tokens[:5])"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "id": "tbb8xbQKxp_7"
231 | },
232 | "outputs": [],
233 | "source": [
234 | "# Let us visualize the distribution of tokens per sentence\n",
235 | "\n",
236 | "## Add your code here\n",
237 | "total_tokens_per_line = [len(sentence.split()) for sentence in hamlet_no_blanks]\n",
238 | "p = sns.kdeplot(total_tokens_per_line, shade=True, color='orange')"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {
244 | "id": "Wlwlj9RuyJgw"
245 | },
246 | "source": [
247 | "## A bit more clean-up\n",
248 | "There can be a number of clean-up steps depending upon the kind of dataset and the problem we are solving. \n",
249 | "\n",
250 | "In this case, let us cleanup/remove terms which contain any kind of special characters"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {
257 | "id": "cjH5YWmiybTB"
258 | },
259 | "outputs": [],
260 | "source": [
261 | "# only keeping words and removing special characters\n",
262 | "words = list(filter(None, [re.sub(r'[^A-Za-z]', '', word) for word in words]))\n",
263 | "print(words[:20])"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "id": "ZOPPlL__yh1U"
270 | },
271 | "source": [
272 | "## Can you identify Top Occurring words?"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {
279 | "id": "lwVkovvCymBn"
280 | },
281 | "outputs": [],
282 | "source": [
283 | "# Add your code here\n",
284 | "from collections import Counter\n",
285 | "\n",
286 | "words = [word.lower() for word in words]\n",
287 | "c = Counter(words)\n",
288 | "c.most_common(10)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {
294 | "id": "0nm6CS9KytxS"
295 | },
296 | "source": [
297 | "### Stopword Removal\n",
298 | "As you can see from the above output, the top occuring terms are not of much use in terms of understanding the context, etc. In the NLP space, such terms (punctuation marks, prepositions, etc) are termed as stopwords and are typically removed to handle dimensionality and other issues.\n",
299 | "\n",
300 | "Thankfully, ``nltk`` provides a clean utility along with an extensible list of stopwords that we can use straight-away"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {
307 | "id": "Fo8Te_WfyqqR"
308 | },
309 | "outputs": [],
310 | "source": [
311 | "import nltk \n",
312 | "\n",
313 | "# print a few stop words\n",
314 | "stopwords = nltk.corpus.stopwords.words('english')\n",
315 | "stopwords[:10]"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "id": "0UHXmtI4zfR1"
323 | },
324 | "outputs": [],
325 | "source": [
326 | "# Remove stopwords\n",
327 | "words = [word.lower() for word in words if word not in stopwords]"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "id": "pJQozwulzlbh"
335 | },
336 | "outputs": [],
337 | "source": [
338 | "# Top Words by occurance after stopword removal\n",
339 | "\n",
340 | "# Add your code here\n",
341 | "c = Counter(words)\n",
342 | "c.most_common(10)"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {
348 | "id": "gslVjNc7zuWC"
349 | },
350 | "source": [
351 | "## Text Preprocessing\n",
352 | "\n",
353 | "We covered some basics of pre-processing so far, steps such as:\n",
354 | "- Lower-casing\n",
355 | "- Special character removal\n",
356 | "- Stopword removal\n",
357 | "- Removing blank lines and empty spaces\n",
358 | "\n",
359 | "are typically performed time and again. There are a number of other steps as well but those are mostly application dependent."
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {
366 | "id": "l04SS_0k0J5_"
367 | },
368 | "outputs": [],
369 | "source": [
370 | "# A utility function to perform basic cleanup\n",
371 | "def normalize_document(doc):\n",
372 | " # lower case and remove special characters\\whitespaces\n",
373 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n",
374 | " doc = doc.lower()\n",
375 | " doc = doc.strip()\n",
376 | " # tokenize document\n",
377 | " tokens = nltk.word_tokenize(doc)\n",
378 | " # filter stopwords out of document\n",
379 | " filtered_tokens = [token for token in tokens if token not in stopwords]\n",
380 | " # re-create document from filtered tokens\n",
381 | " doc = ' '.join(filtered_tokens)\n",
382 | " return doc"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "normalize_corpus = np.vectorize(normalize_document)\n",
392 | "\n",
393 | "norm_corpus = normalize_corpus(hamlet_raw)\n",
394 | "norm_corpus"
395 | ]
396 | }
397 | ],
398 | "metadata": {
399 | "colab": {
400 | "collapsed_sections": [],
401 | "name": "01_getting_started_answers.ipynb",
402 | "provenance": [],
403 | "toc_visible": true
404 | },
405 | "kernelspec": {
406 | "display_name": "Python 3",
407 | "name": "python3"
408 | },
409 | "language_info": {
410 | "name": "python"
411 | }
412 | },
413 | "nbformat": 4,
414 | "nbformat_minor": 0
415 | }
416 |
--------------------------------------------------------------------------------
/notebooks/answers/02_text_representation_answers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "5qqNXxRKU1B8"
7 | },
8 | "source": [
9 | "# Text Representation Techniques\n",
10 | "\n",
11 | "[](https://colab.research.google.com/github/raghavbali/workshop_text_classification/blob/main/notebooks/02_text_representation.ipynb)\n",
12 | "\n",
13 | "In this notebook, we will get familiar with some basic Text Representation Techniques \n",
14 | "Key takeaways from this notebook are:\n",
15 | "\n",
16 | "- Learn how to transform text into usable format using Bag of Words techniques such as:\n",
17 | " - Count Vectorizer\n",
18 | " - TF-IDF\n",
19 | " - Similarity Features\n",
20 | "\n",
21 | ""
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "id": "jxkAwOUwRrF7"
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import nltk\n",
33 | "import numpy as np\n",
34 | "import pandas as pd\n",
35 | "from nltk.corpus import gutenberg\n",
36 | "import seaborn as sns\n",
37 | "import re\n",
38 | "\n",
39 | "%matplotlib inline\n",
40 | "pd.options.display.max_columns=10000"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "id": "IVSjSzvxY4tM"
48 | },
49 | "outputs": [],
50 | "source": [
51 | "# First things first, download the Gutenberg Project files\n",
52 | "nltk.download('gutenberg')\n",
53 | "nltk.download('stopwords')\n",
54 | "nltk.download('punkt')"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "id": "i5egExbqY6q4"
62 | },
63 | "outputs": [],
64 | "source": [
65 | "# get the text for hamlet\n",
66 | "hamlet_raw = gutenberg.open('shakespeare-hamlet.txt')\n",
67 | "hamlet_raw = hamlet_raw.readlines()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "# stop words\n",
77 | "# Add your code here\n",
78 | "stopwords = nltk.corpus.stopwords.words('english')"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {
85 | "id": "g50oL62aY6l5"
86 | },
87 | "outputs": [],
88 | "source": [
89 | "# A utility function to perform basic cleanup\n",
90 | "def normalize_document(doc):\n",
91 | " # lower case and remove special characters\\whitespaces\n",
92 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n",
93 | " # Add your code here to lower case\n",
94 | " doc = doc.lower()\n",
95 | " doc = doc.strip()\n",
96 | " # tokenize document\n",
97 | " tokens = nltk.word_tokenize(doc)\n",
98 | " # filter stopwords out of document\n",
99 | " filtered_tokens = [token for token in tokens if token not in stopwords]\n",
100 | " # re-create document from filtered tokens\n",
101 | " doc = ' '.join(filtered_tokens)\n",
102 | " return doc"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {
109 | "id": "A4-Kmr0KZUAl"
110 | },
111 | "outputs": [],
112 | "source": [
113 | "normalize_corpus = np.vectorize(normalize_document)\n",
114 | "\n",
115 | "norm_corpus = normalize_corpus(hamlet_raw)\n",
116 | "norm_corpus"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {
122 | "id": "Irxq6wRwZJc1"
123 | },
124 | "source": [
125 | "## Bag of Words : Term Frequency\n",
126 | "A simple vector space representational model for text data. A vector space model is simply a mathematical model for transforming text as numeric vectors, such that each dimension of the vector is a specific feature\\attribute. The bag of words model represents each text document as a numeric vector where each dimension(column) is a specific word from the vocabulary and the value could be its frequency in the document. The model’s name is such because each document is represented literally as a ‘bag’ of its own words, disregarding word orders, sequences and grammar."
127 | ]
128 | },
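  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before vectorizing the full corpus, here is a minimal hand-rolled sketch of the same idea on a tiny made-up corpus: build the vocabulary, then count each vocabulary word per document. ``CountVectorizer`` below does this (plus tokenization and other conveniences) at scale."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a hand-rolled bag of words on a tiny, made-up corpus (illustration only)\n",
    "from collections import Counter\n",
    "\n",
    "toy_docs = ['the prince speaks', 'the ghost speaks to the prince']\n",
    "toy_vocab = sorted(set(word for doc in toy_docs for word in doc.split()))\n",
    "toy_counts = [Counter(doc.split()) for doc in toy_docs]\n",
    "pd.DataFrame([[counts[word] for word in toy_vocab] for counts in toy_counts], columns=toy_vocab)"
   ]
  },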
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "id": "Kn4N1UgpZLUz"
134 | },
135 | "outputs": [],
136 | "source": [
137 | "from sklearn.feature_extraction.text import CountVectorizer"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "id": "6LktkOWdZNlq"
145 | },
146 | "outputs": [],
147 | "source": [
148 | "cv = CountVectorizer(min_df=0., max_df=1.)\n",
149 | "cv_matrix = cv.fit_transform(norm_corpus)\n",
150 | "cv_matrix = cv_matrix.toarray()\n",
151 | "cv_matrix"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "id": "hPVvuidsZNjD"
159 | },
160 | "outputs": [],
161 | "source": [
162 | "cv_matrix.shape"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "id": "DZDQ8_zkZNgG"
170 | },
171 | "outputs": [],
172 | "source": [
173 | "vocab = cv.get_feature_names()"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "id": "Wd68S8B3Zbhw"
181 | },
182 | "outputs": [],
183 | "source": [
184 | "# show document feature vectors\n",
185 | "pd.DataFrame(cv_matrix, columns=vocab).head()"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "id": "HbE9z4Moa8dy"
192 | },
193 | "source": [
194 | "## TF-IDF\n",
195 | "Using absolute frequency counts as a measure of importance has its shortcomings. One potential issue is that there might be some terms which occur frequently across all documents and these may tend to overshadow other terms in the feature set. The TF-IDF model tries to combat this issue by using a normalizing factor. TF-IDF or Term Frequency-Inverse Document Frequency, uses a combination of two metrics in its computation, namely: __term frequency (tf)__ and __inverse document frequency (idf)__.\n",
196 | "\n",
197 | "Mathematically, we can define TF-IDF as\n",
198 | "\n",
199 | "``TF-IDF = tf x idf``\n",
200 | "\n",
201 | "Where, each element in the TF-IDF matrix is the score for word w in document D.\n",
202 | "\n",
203 | "The term **tf(w, D)** represents the term frequency of the word **w** in document **D**, which can be obtained from the Bag of Words model.\n",
204 | "The term idf(w, D) is the inverse document frequency for the term w, which can be computed as the log transform of the total number of documents in the corpus C divided by the document frequency of the word w, in other words it is the frequency of documents in the corpus where the word w occurs."
205 | ]
206 | },
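  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Putting the above into symbols, for a word $w$ and a document $D$ in a corpus of $N$ documents:\n",
    "\n",
    "$$\\mathrm{tfidf}(w, D) = \\mathrm{tf}(w, D) \\times \\mathrm{idf}(w), \\qquad \\mathrm{idf}(w) = \\log\\frac{N}{\\mathrm{df}(w)}$$\n",
    "\n",
    "where $\\mathrm{df}(w)$ is the number of documents containing $w$. Note that ``TfidfVectorizer`` (used below) applies a smoothed variant by default, $\\mathrm{idf}(w) = \\ln\\frac{1 + N}{1 + \\mathrm{df}(w)} + 1$, and then L2-normalizes each document vector, so its values differ slightly from this textbook formula."
   ]
  },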
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "id": "8rBwfbPubTv5"
212 | },
213 | "outputs": [],
214 | "source": [
215 | "from sklearn.feature_extraction.text import TfidfVectorizer"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "id": "Vs9uk2o8bZwh"
223 | },
224 | "outputs": [],
225 | "source": [
226 | "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n",
227 | "tv_matrix = tv.fit_transform(norm_corpus)\n",
228 | "tv_matrix = tv_matrix.toarray()\n",
229 | "tv_matrix.shape"
230 | ]
231 | },
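  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity-check sketch (assuming ``TfidfVectorizer``'s defaults, i.e. ``smooth_idf=True`` and the natural log): recompute the smoothed idf for one vocabulary term by hand and compare it with the value the vectorizer fitted."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# recompute idf for one term using the smoothed formula and compare with tv.idf_\n",
    "term = sorted(tv.vocabulary_)[0]\n",
    "df = sum(1 for doc in norm_corpus if term in doc.split())\n",
    "manual_idf = np.log((1 + len(norm_corpus)) / (1 + df)) + 1\n",
    "term, df, manual_idf, tv.idf_[tv.vocabulary_[term]]"
   ]
  },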
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {
236 | "id": "yFHSglOybZnX"
237 | },
238 | "outputs": [],
239 | "source": [
240 | "# Add your code here to ger feature names\n",
241 | "vocab = tv.get_feature_names()\n",
242 | "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab).head()"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {
248 | "id": "6L2dbC3acLSt"
249 | },
250 | "source": [
251 | "## Bag of N-Grams Model\n",
252 | "A word is just a single token, often known as a **unigram** or 1-gram. We already know that the Bag of Words model doesn’t consider order of words. But what if we also wanted to take into account phrases or collection of words which occur in a sequence? **N-grams** help us achieve that. An N-gram is basically a collection of word tokens from a text document such that these tokens are contiguous and occur in a sequence. Bi-grams indicate n-grams of order 2 (two words), Tri-grams indicate n-grams of order 3 (three words), and so on. The Bag of N-Grams model is hence just an extension of the Bag of Words model so we can also leverage N-gram based features. The following example depicts bi-gram based features in each document feature vector."
253 | ]
254 | },
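  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For intuition, here is a minimal sketch of the 2-grams of a short phrase, using plain Python before handing things over to the vectorizer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a minimal sketch: bigrams of a short phrase via zip (illustration only)\n",
    "phrase = 'to be or not to be that is the question'.split()\n",
    "list(zip(phrase, phrase[1:]))"
   ]
  },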
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {
259 | "id": "UkAd3s4LcThI"
260 | },
261 | "outputs": [],
262 | "source": [
263 | "# you can set the n-gram range to 1,2 to get unigrams as well as bigrams\n",
264 | "bv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True,ngram_range=(2,2))\n",
265 | "bv_matrix = bv.fit_transform(norm_corpus)\n",
266 | "\n",
267 | "bv_matrix = bv_matrix.toarray()\n",
268 | "bv_matrix.shape"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "id": "gGGohxNycgrG"
276 | },
277 | "outputs": [],
278 | "source": [
279 | "vocab = bv.get_feature_names()\n",
280 | "pd.DataFrame(bv_matrix, columns=vocab).head()"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {
286 | "id": "ikWNCAPHcqIU"
287 | },
288 | "source": [
289 | "## Similarity Based Features (Bonus)\n",
290 | "Now that we have a method to transform text into vector form, we can now build on top of such features we engineered to generate new features which can be useful in domains like search engines, document clustering and information retrieval by leveraging these similarity based features.\n",
291 | "\n",
292 | "Pairwise document/sentence/term similarity in a corpus involves computing similarity for each pair of entities in a corpus. Thus if we have N entities in a corpus, we would end up with a N x N matrix such that each row and column represents the similarity score for a given pair. \n",
293 | "\n",
294 | "There are several similarity and distance metrics that are used to compute similarity. These include :\n",
295 | "- cosine distance/similarity, \n",
296 | "- euclidean distance, \n",
297 | "- manhattan distance, \n",
298 | "- BM25 similarity, \n",
299 | "- jaccard distance and so on. "
300 | ]
301 | },
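  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference, the cosine similarity between two document vectors $A$ and $B$ is\n",
    "\n",
    "$$\\mathrm{cosine}(A, B) = \\frac{A \\cdot B}{\\|A\\| \\; \\|B\\|}$$\n",
    "\n",
    "i.e. the dot product of the two vectors scaled by their magnitudes, so it captures the angle between the vectors rather than their lengths."
   ]
  },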
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "id": "kmpC-Z0sdt_I"
307 | },
308 | "outputs": [],
309 | "source": [
310 | "from sklearn.metrics.pairwise import cosine_similarity"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {
317 | "id": "fMjIhdzhdt8D"
318 | },
319 | "outputs": [],
320 | "source": [
321 | "similarity_matrix = cosine_similarity(tv_matrix)\n",
322 | "similarity_matrix"
323 | ]
324 | },
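  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since ``TfidfVectorizer`` L2-normalizes each row by default (``norm='l2'``), the cosine similarity above reduces to a plain dot product of the TF-IDF matrix with its transpose. A quick sanity-check sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with L2-normalized rows, cosine similarity is just the dot product\n",
    "np.allclose(similarity_matrix, tv_matrix @ tv_matrix.T)"
   ]
  },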
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {
329 | "id": "i2qwGs6ndt37"
330 | },
331 | "outputs": [],
332 | "source": [
333 | "similarity_df = pd.DataFrame(similarity_matrix)\n",
334 | "similarity_df.head()"
335 | ]
336 | }
337 | ],
338 | "metadata": {
339 | "colab": {
340 | "collapsed_sections": [],
341 | "name": "02_text_representation.ipynb",
342 | "provenance": []
343 | },
344 | "kernelspec": {
345 | "display_name": "Python 3",
346 | "name": "python3"
347 | },
348 | "language_info": {
349 | "name": "python"
350 | }
351 | },
352 | "nbformat": 4,
353 | "nbformat_minor": 0
354 | }
355 |
--------------------------------------------------------------------------------
/notebooks/movie_reviews.csv.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/notebooks/movie_reviews.csv.bz2
--------------------------------------------------------------------------------
/slides/text_classification_raghavbali.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/slides/text_classification_raghavbali.pdf
--------------------------------------------------------------------------------
/slides/text_classification_raghavbali.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raghavbali/workshop_text_classification/8f980669c353b84306680452ece779be786f2d13/slides/text_classification_raghavbali.pptx
--------------------------------------------------------------------------------