├── README.md
└── preprocessing_steps.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # NLP_Preprocessing
2 |
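3 | In this project, I have curated the important text preprocessing steps used in NLP, each demonstrated in `preprocessing_steps.ipynb`: removing HTML tags, removing stop-words, removing extra spaces, converting numbers to text, lowercasing, tokenization, stemming, lemmatization, and spell checking.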
4 |
5 | I have also written a related blog post on Medium, which you can read here: [NLP Preprocessing](https://medium.com/@taunkdhaval08/nlp-preprocessing-a-useful-and-important-step-e79895c65a89)
6 |
--------------------------------------------------------------------------------
/preprocessing_steps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "**1. Removing HTML tags**"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "**Regex method**"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import re"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "html_text = '
HTML list come in two flavors: ordered and unordered. Ordered list tags automatically inserts the right numbers for each of the list items, where as the unordered list tag inserts bullets.
- First item in the list
- Second item in the list
- Third item in the list
'"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "clean = re.compile('<.*?>')\n",
42 | "cleantext = re.sub(clean, '', html_text)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/plain": [
53 | "'HTML list come in two flavors: ordered and unordered. Ordered list tags automatically inserts the right numbers for each of the list items, where as the unordered list tag inserts bullets. First item in the list Second item in the list Third item in the list '"
54 | ]
55 | },
56 | "execution_count": 4,
57 | "metadata": {},
58 | "output_type": "execute_result"
59 | }
60 | ],
61 | "source": [
62 | "cleantext"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": []
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "**BeautifulSoup Method**"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "from bs4 import BeautifulSoup"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 6,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "html_text = 'HTML list come in two flavors: ordered and unordered. Ordered list tags automatically inserts the right numbers for each of the list items, where as the unordered list tag inserts bullets.
- First item in the list
- Second item in the list
- Third item in the list
'"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "cleantext = BeautifulSoup(html_text, \"html.parser\").text"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 8,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "'HTML list come in two flavors: ordered and unordered. Ordered list tags automatically inserts the right numbers for each of the list items, where as the unordered list tag inserts bullets. First item in the list Second item in the list Third item in the list '"
115 | ]
116 | },
117 | "execution_count": 8,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "cleantext"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": []
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "**2. Removing stop-words**"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "**NLTK method**"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 9,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "from nltk.corpus import stopwords \n",
154 | "from nltk.tokenize import word_tokenize "
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 10,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "stop_words = set(stopwords.words(\"english\"))"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 11,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "'Machine Learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'"
175 | ]
176 | },
177 | "execution_count": 11,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "text = 'Machine Learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'\n",
184 | "text"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 12,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "word_tokens = word_tokenize(text)\n",
194 | "filtered_text = [word for word in word_tokens if word not in stop_words] "
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 13,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/plain": [
205 | "\"Machine Learning ( ML ) study computer algorithms improve automatically experience . [ 1 ] [ 2 ] It seen subset artificial intelligence . machine learning algorithms build mathematical model based sample data , known `` training data '' , order make predictions decisions without explicitly programmed .\""
206 | ]
207 | },
208 | "execution_count": 13,
209 | "metadata": {},
210 | "output_type": "execute_result"
211 | }
212 | ],
213 | "source": [
214 | "' '.join(filtered_text)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": []
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "**3. Removing extra spaces**"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 14,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "data": {
238 | "text/plain": [
239 | "'Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'"
240 | ]
241 | },
242 | "execution_count": 14,
243 | "metadata": {},
244 | "output_type": "execute_result"
245 | }
246 | ],
247 | "source": [
248 | "text = 'Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'\n",
249 | "text"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 15,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "txt = text.split()"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 16,
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "data": {
268 | "text/plain": [
269 | "'Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'"
270 | ]
271 | },
272 | "execution_count": 16,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "' '.join(txt)"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {},
285 | "outputs": [],
286 | "source": []
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "**4. Converting numbers to text**"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 17,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "from num2words import num2words as n2w\n",
302 | "import spacy\n",
303 | "nlp = spacy.load('en_core_web_sm')"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 18,
309 | "metadata": {},
310 | "outputs": [
311 | {
312 | "data": {
313 | "text/plain": [
314 | "'I will be there by 3. Its 5 am now. Can the meeting be shifted to 7.'"
315 | ]
316 | },
317 | "execution_count": 18,
318 | "metadata": {},
319 | "output_type": "execute_result"
320 | }
321 | ],
322 | "source": [
323 | "text = 'I will be there by 3. Its 5 am now. Can the meeting be shifted to 7.'\n",
324 | "\n",
325 | "text"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 19,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "doc = nlp(text)\n",
335 | "tokens = [n2w(token.text) if token.pos_ == 'NUM' else token.text for token in doc]"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 20,
341 | "metadata": {},
342 | "outputs": [
343 | {
344 | "data": {
345 | "text/plain": [
346 | "'I will be there by three . Its five am now . Can the meeting be shifted to seven .'"
347 | ]
348 | },
349 | "execution_count": 20,
350 | "metadata": {},
351 | "output_type": "execute_result"
352 | }
353 | ],
354 | "source": [
355 | "' '.join(tokens)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": []
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": [
369 | "**5. Lowercasing the text**"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 21,
375 | "metadata": {},
376 | "outputs": [
377 | {
378 | "data": {
379 | "text/plain": [
380 | "'Machine Learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'"
381 | ]
382 | },
383 | "execution_count": 21,
384 | "metadata": {},
385 | "output_type": "execute_result"
386 | }
387 | ],
388 | "source": [
389 | "text = 'Machine Learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'\n",
390 | "text"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": 22,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "lowered_text = text.lower()"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 23,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "data": {
409 | "text/plain": [
410 | "'machine learning (ml) is the study of computer algorithms that improve automatically through experience.[1][2] it is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'"
411 | ]
412 | },
413 | "execution_count": 23,
414 | "metadata": {},
415 | "output_type": "execute_result"
416 | }
417 | ],
418 | "source": [
419 | "lowered_text"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "metadata": {},
426 | "outputs": [],
427 | "source": []
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "**6. Tokenization**"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 24,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "from nltk.tokenize import word_tokenize"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 25,
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "data": {
452 | "text/plain": [
453 | "'Machine Learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'"
454 | ]
455 | },
456 | "execution_count": 25,
457 | "metadata": {},
458 | "output_type": "execute_result"
459 | }
460 | ],
461 | "source": [
462 | "text = 'Machine Learning (ML) is the study of computer algorithms that improve automatically through experience.[1][2] It is seen as a subset of artificial intelligence. machine learning algorithms build a mathematical model based on sample data, known as \"training data\", in order to make predictions or decisions without being explicitly programmed to do so.'\n",
463 | "text"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 26,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "word_tokens = word_tokenize(text)"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 27,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "data": {
482 | "text/plain": [
483 | "['Machine',\n",
484 | " 'Learning',\n",
485 | " '(',\n",
486 | " 'ML',\n",
487 | " ')',\n",
488 | " 'is',\n",
489 | " 'the',\n",
490 | " 'study',\n",
491 | " 'of',\n",
492 | " 'computer',\n",
493 | " 'algorithms',\n",
494 | " 'that',\n",
495 | " 'improve',\n",
496 | " 'automatically',\n",
497 | " 'through',\n",
498 | " 'experience',\n",
499 | " '.',\n",
500 | " '[',\n",
501 | " '1',\n",
502 | " ']',\n",
503 | " '[',\n",
504 | " '2',\n",
505 | " ']',\n",
506 | " 'It',\n",
507 | " 'is',\n",
508 | " 'seen',\n",
509 | " 'as',\n",
510 | " 'a',\n",
511 | " 'subset',\n",
512 | " 'of',\n",
513 | " 'artificial',\n",
514 | " 'intelligence',\n",
515 | " '.',\n",
516 | " 'machine',\n",
517 | " 'learning',\n",
518 | " 'algorithms',\n",
519 | " 'build',\n",
520 | " 'a',\n",
521 | " 'mathematical',\n",
522 | " 'model',\n",
523 | " 'based',\n",
524 | " 'on',\n",
525 | " 'sample',\n",
526 | " 'data',\n",
527 | " ',',\n",
528 | " 'known',\n",
529 | " 'as',\n",
530 | " '``',\n",
531 | " 'training',\n",
532 | " 'data',\n",
533 | " \"''\",\n",
534 | " ',',\n",
535 | " 'in',\n",
536 | " 'order',\n",
537 | " 'to',\n",
538 | " 'make',\n",
539 | " 'predictions',\n",
540 | " 'or',\n",
541 | " 'decisions',\n",
542 | " 'without',\n",
543 | " 'being',\n",
544 | " 'explicitly',\n",
545 | " 'programmed',\n",
546 | " 'to',\n",
547 | " 'do',\n",
548 | " 'so',\n",
549 | " '.']"
550 | ]
551 | },
552 | "execution_count": 27,
553 | "metadata": {},
554 | "output_type": "execute_result"
555 | }
556 | ],
557 | "source": [
558 | "word_tokens"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": []
567 | },
568 | {
569 | "cell_type": "markdown",
570 | "metadata": {},
571 | "source": [
572 | "**7. Stemming**"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 28,
578 | "metadata": {},
579 | "outputs": [],
580 | "source": [
581 | "from nltk.stem import PorterStemmer \n",
582 | "from nltk.tokenize import word_tokenize"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 29,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "ps = PorterStemmer()"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 30,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "name": "stdout",
601 | "output_type": "stream",
602 | "text": [
603 | "sitting : sit\n",
604 | "thinking : think\n",
605 | "going : go\n",
606 | "linked : link\n",
607 | "likely : like\n"
608 | ]
609 | }
610 | ],
611 | "source": [
612 | "words = [\"sitting\", \"thinking\", \"going\", \"linked\", \"likely\"] \n",
613 | " \n",
614 | "for w in words: \n",
615 | " print(w, \" : \", ps.stem(w))"
616 | ]
617 | },
618 | {
619 | "cell_type": "code",
620 | "execution_count": null,
621 | "metadata": {},
622 | "outputs": [],
623 | "source": []
624 | },
625 | {
626 | "cell_type": "markdown",
627 | "metadata": {},
628 | "source": [
629 | "**8. Lemmatization**"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 31,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "from nltk.stem import WordNetLemmatizer "
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 32,
644 | "metadata": {},
645 | "outputs": [],
646 | "source": [
647 | "lemmatizer = WordNetLemmatizer() "
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": 33,
653 | "metadata": {},
654 | "outputs": [
655 | {
656 | "name": "stdout",
657 | "output_type": "stream",
658 | "text": [
659 | "rocks : rock\n",
660 | "eating : eat\n",
661 | "worse : bad\n"
662 | ]
663 | }
664 | ],
665 | "source": [
666 | "print(\"rocks :\", lemmatizer.lemmatize(\"rocks\")) \n",
667 | "print(\"eating :\", lemmatizer.lemmatize(\"eating\", pos='v')) \n",
668 | "print(\"worse :\", lemmatizer.lemmatize(\"worse\", pos='a')) "
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": null,
674 | "metadata": {},
675 | "outputs": [],
676 | "source": []
677 | },
678 | {
679 | "cell_type": "markdown",
680 | "metadata": {},
681 | "source": [
682 | "**9. Spell Checker**"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 34,
688 | "metadata": {},
689 | "outputs": [],
690 | "source": [
691 | "from nltk.tokenize import word_tokenize\n",
692 | "from textblob import TextBlob "
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": 35,
698 | "metadata": {},
699 | "outputs": [],
700 | "source": [
701 | "text = 'I am ging there. Will brng the thngs from tem'"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": 36,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": [
710 | "tokens = word_tokenize(text)\n",
711 | "res = []\n",
712 | "for token in tokens:\n",
713 | " word = TextBlob(token)\n",
714 | " res.append(str(word.correct()))"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": 37,
720 | "metadata": {},
721 | "outputs": [
722 | {
723 | "data": {
724 | "text/plain": [
725 | "'I am going there . Will bring the things from them'"
726 | ]
727 | },
728 | "execution_count": 37,
729 | "metadata": {},
730 | "output_type": "execute_result"
731 | }
732 | ],
733 | "source": [
734 | "' '.join(res)"
735 | ]
736 | },
737 | {
738 | "cell_type": "code",
739 | "execution_count": null,
740 | "metadata": {},
741 | "outputs": [],
742 | "source": []
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": null,
747 | "metadata": {},
748 | "outputs": [],
749 | "source": []
750 | }
751 | ],
752 | "metadata": {
753 | "kernelspec": {
754 | "display_name": "Python 3",
755 | "language": "python",
756 | "name": "python3"
757 | },
758 | "language_info": {
759 | "codemirror_mode": {
760 | "name": "ipython",
761 | "version": 3
762 | },
763 | "file_extension": ".py",
764 | "mimetype": "text/x-python",
765 | "name": "python",
766 | "nbconvert_exporter": "python",
767 | "pygments_lexer": "ipython3",
768 | "version": "3.8.2"
769 | }
770 | },
771 | "nbformat": 4,
772 | "nbformat_minor": 4
773 | }
774 |
--------------------------------------------------------------------------------