├── .gitattributes
├── FAQ1.PNG
├── Dependency Parsing
│   ├── Code files
│   │   ├── images
│   │   │   ├── active.png
│   │   │   ├── regex.png
│   │   │   ├── results.png
│   │   │   ├── keywords.png
│   │   │   └── spacy_pipeline.png
│   │   ├── Dep_parsing2_ipynb_txt.ipynb
│   │   └── Dep_parsing3_ipynb_txt.ipynb
│   ├── Commented code files
│   │   ├── images
│   │   │   ├── active.png
│   │   │   ├── regex.png
│   │   │   ├── results.png
│   │   │   ├── keywords.png
│   │   │   └── spacy_pipeline.png
│   │   ├── Dep_parsing1_Commented.ipynb
│   │   ├── Dep_parsing3_Commented.ipynb
│   │   └── Dep_parsing2_Commented.ipynb
│   └── Dataset
│       └── active_passive.csv
├── POS Tagging Case Study
│   ├── Code files
│   │   ├── images
│   │   │   ├── regex.png
│   │   │   ├── active.png
│   │   │   ├── keywords.png
│   │   │   ├── results.png
│   │   │   └── spacy_pipeline.png
│   │   ├── POS1_ipynb_txt.ipynb
│   │   ├── POS3_ipynb_txt.ipynb
│   │   ├── POS2_ipynb_txt.ipynb
│   │   └── POS4_ipynb_txt.ipynb
│   └── Commented code files
│       ├── images
│       │   ├── regex.png
│       │   ├── active.png
│       │   ├── keywords.png
│       │   ├── results.png
│       │   └── spacy_pipeline.png
│       ├── POS1_Commented.ipynb
│       ├── POS3_Commented.ipynb
│       ├── POS2_Commented.ipynb
│       └── POS4_Commented.ipynb
├── Custom NER Code Demonstration
│   ├── Code files
│   │   ├── util.py
│   │   └── CRF.ipynb
│   └── Commented code files
│       └── CRF_Commented.ipynb
├── README.md
├── Heteronyms Detection POS Tagging Use Case
│   ├── Commented code files
│   │   └── Heteronyms_POS_Commented.ipynb
│   └── Code files
│       └── Heteronyms_POS.ipynb
└── requirements.txt
/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /FAQ1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/FAQ1.PNG -------------------------------------------------------------------------------- /Dependency Parsing/Code files/images/active.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Code files/images/active.png -------------------------------------------------------------------------------- /Dependency Parsing/Code files/images/regex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Code files/images/regex.png -------------------------------------------------------------------------------- /Dependency Parsing/Code files/images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Code files/images/results.png -------------------------------------------------------------------------------- /Dependency Parsing/Code files/images/keywords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Code files/images/keywords.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/images/regex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Code files/images/regex.png 
-------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/images/active.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Code files/images/active.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/images/keywords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Code files/images/keywords.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Code files/images/results.png -------------------------------------------------------------------------------- /Dependency Parsing/Code files/images/spacy_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Code files/images/spacy_pipeline.png -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/images/active.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Commented code files/images/active.png -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/images/regex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Commented code files/images/regex.png -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Commented code files/images/results.png -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/images/keywords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Commented code files/images/keywords.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/images/spacy_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Code files/images/spacy_pipeline.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/images/regex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Commented code files/images/regex.png 
-------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/images/active.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Commented code files/images/active.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/images/keywords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Commented code files/images/keywords.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/images/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Commented code files/images/results.png -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/images/spacy_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/Dependency Parsing/Commented code files/images/spacy_pipeline.png -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/images/spacy_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/Syntactic-Processing/master/POS Tagging Case Study/Commented code files/images/spacy_pipeline.png -------------------------------------------------------------------------------- /Custom NER Code Demonstration/Code files/util.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | def print_transitions(trans_features): 4 | for (label_from, label_to), weight in trans_features: 5 | print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight)) 6 | 7 | #print("Top likely transitions:") 8 | def print_top_likely_transitions(trans_features): 9 | print_transitions(Counter(trans_features).most_common(10)) 10 | 11 | #print("\nTop unlikely transitions:") 12 | def print_top_unlikely_transitions(trans_features): 13 | print_transitions(Counter(trans_features).most_common()[-10:]) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Syntactic Processing Code Demonstrations: 2 | 3 | This README file is designed to help you navigate the **code files** and the **datasets** that have been used to demonstrate the use cases of various concepts taught in the Syntactic Processing module. 4 | 5 | ### What is where? 6 | 7 | Each file is placed in its respective, well-named folder, and each folder contains up to three subfolders: 8 | 9 | 1. **Code files**: This is where you can find the notebooks that have been demonstrated in the module. 10 | 2. **Dataset:** This is where all the datasets you will need to run the notebooks are kept. 11 | 3. **Commented code files:** This is where one can find the well-commented code files to write their own code along with the videos. 
12 | 13 | There are a total of 5 folders as mentioned below: 14 | 15 | 1. **Heteronyms Detection POS Tagging Use Case:** This folder contains the code files of the heteronyms detection use case that was demonstrated in the PoS tagging session. 16 | 2. **POS Tagging Case Study:** This folder contains the code files and dataset of the feature extraction using PoS tagging case study that was demonstrated in the PoS tagging session. 17 | 3. **Dependency Parsing:** This folder contains the code files and dataset of the active/passive sentence detection use case that was demonstrated in the Parsing session. 18 | 4. **NER Code Demonstration:** This folder contains the code files of the NER use case that was demonstrated in the NER and CRF session. 19 | 5. **Custom NER Code Demonstration:** This folder contains the code files and dataset of the CRF use case that was demonstrated in the NER and CRF session. 20 | 21 | ### FAQs 22 | How do I download all the files from GitHub? 23 | 24 | ![](FAQ1.PNG) 25 | 26 | You can either download the zipped files as shown, or you can use `git clone repo_url` to clone the repository if `git` is installed on your computer. 27 | -------------------------------------------------------------------------------- /Dependency Parsing/Dataset/active_passive.csv: -------------------------------------------------------------------------------- 1 | Active,Passive 2 | He reads a novel.,A novel is read. 3 | He does not cook food.,Food is not cooked by him. 4 | Does he purchase books?,Are books being purchased by him? 5 | They grow plants.,Plants are grown by them. 6 | She teaches me.,I am taught by her. 7 | Esha is singing a song.,A song is being sung by Esha. 8 | Kritika is not chopping vegetables.,Vegetables are not being chopped by Kritika. 9 | Is Ritika buying a table?,Is a table being bought by Ritika? 10 | They are serving poor people.,Poor people are being served by them. 11 | She is disturbing Dinesh.,Dinesh is being disturbed by her. 12 | Nitesh has challenged her.,She has been challenged by Nitesh. 13 | Radhika has not written an article.,An article has not been written by Radhika. 14 | Have they left the apartment?,Has apartment been left by them? 15 | She has created this masterpiece.,This masterpiece has been created by her. 16 | I have read the newspaper.,The newspaper has been read by me. 17 | Reema cleaned the floor.,The floor was cleaned by Reema. 18 | Aisha bought a bicycle.,A bicycle was bought by Aisha. 19 | Naman called my friends.,My friends were called by Naman. 20 | I saved him.,He was saved by me. 21 | Miraya paid the bills.,The bills were paid by Miraya. 22 | Nitika was painting the wall.,The wall was being painted by Nitika. 23 | Manish was repairing the car.,The car was being repaired by Manish. 24 | Were you reciting the poem?,Was the poem being recited? 25 | She was baking the cake.,The cake was being baked by her. 26 | She was watching me.,I was being watched by her. 27 | Misha had cleaned the floor.,The floor had been cleaned by Misha. 28 | Vidhi had not received the parcel.,The parcel had not been received by Vidhi. 29 | Vishal had solved the doubt.,The doubt had been solved. 30 | Had they caught the thief?,Had the thief been caught by them? 31 | I had paid fifty thousand.,Fifty thousand had been paid by me. 32 | Kriya will sew the bag.,The bag will be sewed by Kriya. 33 | Disha will not arrange the things.,The things will not be arranged by Disha. 34 | Will you mop the floor?,Will the floor be mopped by you? 
35 | They will post the letter.,The letter will be posted. 36 | Reena will save money.,Money will be saved by Reena. 37 | They will have brought the toy.,The toy will have been brought by them. 38 | Nimesh will not have changed the table cover.,The table cover will not have been changed by Nimesh. 39 | Will she have written the notes.,Will the notes have been written by her? 40 | They will have won the match.,The match will have been won by them. 41 | Vijay will have washed a shirt.,A shirt will have been washed by Vijay. -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/POS1_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS1_Commented.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "OKcXwoZHurtB" 32 | }, 33 | "source": [ 34 | "## Importing libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "r5zWwc1_urtR" 41 | }, 42 | "source": [ 43 | "# Import libraries\n", 44 | "import pandas as pd\n", 45 | "import numpy as np\n", 46 | "import os" 47 | ], 48 | "execution_count": null, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "id": "Bd28EuN3urtT" 55 | }, 56 | "source": [ 57 | "### How do we identify product features?\n", 58 | "" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "5zP8A4ysurtU" 65 | }, 66 | "source": [ 67 | "sent1 = \"I loved the screen on this phone.\"\n", 68 | "sent2 = \"The battery life on this phone is great.\"\n", 69 | "sent3 = \"The speakers are pathetic.\"" 70 | ], 71 | "execution_count": null, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "id": "CT_yMlbiurtV" 78 | }, 79 | "source": [ 80 | "### Lets do a POS parse and see if we can figure out some patterns." 
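The tagging cells in POS1_Commented.ipynb above are intentionally left blank so that learners can fill them in along with the videos. A minimal sketch of what they might contain, assuming the `sent1`–`sent3` strings defined in the notebook and the pre-trained `en_core_web_sm` model, is:

```python
import spacy

# Load the small English model (downloaded beforehand with:
#   python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

sent1 = "I loved the screen on this phone."

# Print each token with its coarse PoS tag; the product features
# ("screen", "phone") surface with the tag NOUN.
for tok in nlp(sent1):
    print(tok.text, tok.pos_)
```

Repeating this for `sent2` and `sent3` shows the same pattern: `battery`, `life` and `speakers` all come out as NOUN.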
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "XYXVM2CrurtW" 87 | }, 88 | "source": [ 89 | "# import Spacy and load model.\n" 90 | ], 91 | "execution_count": null, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "kD53XzhEurtX" 98 | }, 99 | "source": [ 100 | "# Print the PoS tag of sent1\n" 101 | ], 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "RAHWa3uturtZ" 109 | }, 110 | "source": [ 111 | "# Print the PoS tag of sent2\n" 112 | ], 113 | "execution_count": null, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "5z-Q-Rakurta" 120 | }, 121 | "source": [ 122 | "# Print the PoS tag of sent3\n" 123 | ], 124 | "execution_count": null, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "id": "EWh298c3urte" 131 | }, 132 | "source": [ 133 | "#### **Product features such as `screen`, `battery`, `speaker` have a POS tag of NOUN**" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": "XrtRHVfUurtf" 140 | }, 141 | "source": [ 142 | "## Summary\n", 143 | "- Product features such as `screen`, `battery` and `speaker` have a POS tag of Noun\n", 144 | "- If we can find the frequency count of all the nouns in our data, then by looking at top-n nouns we can find out what product features people are talking about\n", 145 | "- Check hypothesis on a real world dataset" 146 | ] 147 | } 148 | ] 149 | } -------------------------------------------------------------------------------- /Heteronyms Detection POS Tagging Use Case/Commented code files/Heteronyms_POS_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Heteronyms_POS_Commented.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "C5jJ9kQzPjge" 23 | }, 24 | "source": [ 25 | "**Heteronyms** are the words that have same spelling but mean different things when pronounced differently. \n", 26 | "\n", 27 | "\n", 28 | "- Recall the word *lead* from the lectures. It can refer to the metal lead or the act of leadership. 
The two pronounciations have different meanings.\n", 29 | "\n", 30 | "- For machine translation systems or text to speech systems, the ability to identify the correct sense of the word is crucial.\n", 31 | "\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "sHvYVoL7i7_a" 39 | }, 40 | "source": [ 41 | "Let us have a look at this example:\n", 42 | "\n", 43 | "https://translate.google.com/?sl=en&tl=hi&text=She%20wished%20she%20could%20desert%20him%20in%20the%20desert.%0A&op=translate\n", 44 | "\n", 45 | "Example taken from: http://www-personal.umich.edu/~cellis/heteronym.html\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "metadata": { 51 | "id": "foKQ_tmOEca0" 52 | }, 53 | "source": [ 54 | "# Import SpaCy library\n" 55 | ], 56 | "execution_count": null, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "YRtKtXi9kC72" 63 | }, 64 | "source": [ 65 | "# Load pre-trained SpaCy model for performing basic \n", 66 | "# NLP tasks such as POS tagging, parsing, etc.\n" 67 | ], 68 | "execution_count": null, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "metadata": { 74 | "id": "yjtQ_cgdk3pn" 75 | }, 76 | "source": [ 77 | "#Use the model to process the input sentence\n" 78 | ], 79 | "execution_count": null, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "gXtwtkrflcPF" 86 | }, 87 | "source": [ 88 | "# Print the tokens and their respective PoS tags.\n" 89 | ], 90 | "execution_count": null, 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "G5YMNqqF-sHl" 97 | }, 98 | "source": [ 99 | "Note here that in the above example, the two instances of *desert* have different PoS tags and hence, the text to speech system can use this information to generate the correct pronounciation. \n", 100 | "\n", 101 | "The above task is a specific example of the larger NLP problem called Word Sense Disambiguation (WSD). 
For words that have more than one meaning, WSD is the problem of identifying the correct meaning of the word based on the context in which the word is used.\n", 102 | "\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "gJMNHvrRx-rA" 109 | }, 110 | "source": [ 111 | "Note that this technique will not work when the different meanings have the same PoS tags.\n", 112 | "\n", 113 | "https://translate.google.com/?sl=en&tl=hi&text=The%20bass%20swam%20around%20the%20bass%20drum%20on%20the%20ocean%20floor.&op=translate" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "Xd8l1zfNmgQz" 120 | }, 121 | "source": [ 122 | "# Let's take a new example.\n" 123 | ], 124 | "execution_count": null, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "metadata": { 130 | "id": "3IHqXulWJYVj" 131 | }, 132 | "source": [ 133 | "" 134 | ], 135 | "execution_count": null, 136 | "outputs": [] 137 | } 138 | ] 139 | } -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/Dep_parsing1_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "Dep_parsing1_Commented.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [] 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "8C1laB0dc8QJ" 33 | }, 34 | "source": [ 35 | "# Import libraries\n", 36 | "import spacy\n", 37 | "from spacy import displacy\n", 38 | "import pandas as pd\n", 39 | "nlp = spacy.load(\"en_core_web_sm\")" 40 | ], 41 | "execution_count": null, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "id": "vuzER39Rc8QK" 48 | }, 49 | "source": [ 50 | "## How to do a dependency parse?" 
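The parsing cells that follow in Dep_parsing1_Commented.ipynb are left blank; a minimal sketch, assuming the `nlp`, `displacy`, `active` and `passive` objects defined in the cells above, might be:

```python
# Parse the first passive sentence and print each token's dependency relation.
doc = nlp(passive[0])  # "Eggs are laid by hens"
for tok in doc:
    print(tok.text, tok.dep_)

# Render the dependency tree; inside a Jupyter notebook, displacy
# detects the environment and displays the tree inline.
displacy.render(doc, style="dep")
```

In the passive sentences the subject ("Eggs") typically comes out with the `nsubjpass` relation, which is exactly the pattern the later notebooks exploit.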
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "Wbqc2lDSc8QL" 57 | }, 58 | "source": [ 59 | "# Define active and passive sentences.\n", 60 | "active = ['Hens lay eggs.',\n", 61 | " 'Birds build nests.',\n", 62 | " 'The batter hit the ball.',\n", 63 | " 'The computer transmitted a copy of the manual']\n", 64 | "passive = ['Eggs are laid by hens',\n", 65 | " 'Nests are built by birds',\n", 66 | " 'The ball was hit by the batter',\n", 67 | " 'A copy of the manual was transmitted by the computer.']" 68 | ], 69 | "execution_count": null, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "LL2fx6PZc8QM" 76 | }, 77 | "source": [ 78 | "# Get the dependency parsing tags\n" 79 | ], 80 | "execution_count": null, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "id": "bwRIa71ec8QO" 87 | }, 88 | "source": [ 89 | "### Visualize this parse" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "metadata": { 95 | "id": "7lBO7vzjc8QP" 96 | }, 97 | "source": [ 98 | "# Visualise the parse tree\n" 99 | ], 100 | "execution_count": null, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "id": "X9IVNiDTc8QQ" 107 | }, 108 | "source": [ 109 | "To understand what these dependency relationships mean, one can use [this link](https://universaldependencies.org/docs/en/dep/)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "XgSCNbJZc8QR" 116 | }, 117 | "source": [ 118 | "### Going through the dependency relationships, it looks like one would need to know linguistics and grammar to be able to do analysis. This is not entirely true. Many times, being able to find out `patterns` in terms of dependency relationships is enough to perform the task at hand" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "u1WISO06c8QS" 125 | }, 126 | "source": [ 127 | "# Visualise the parse tree of all the active sentences.\n" 128 | ], 129 | "execution_count": null, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "id": "aGn8osrMc8QU" 136 | }, 137 | "source": [ 138 | "# Visualise the parse tree of all the passive sentences.\n" 139 | ], 140 | "execution_count": null, 141 | "outputs": [] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "id": "dNpEVOj5c8QV" 147 | }, 148 | "source": [ 149 | "## Summary:\n", 150 | "- Spacy's dependency parser lets us visualise the relationships\n", 151 | "- When a sentence is in passive voice, there is always the presence of an `nsubjpass` dependency relation" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "metadata": { 157 | "id": "HC6lEKZQc8QW" 158 | }, 159 | "source": [ 160 | "" 161 | ], 162 | "execution_count": null, 163 | "outputs": [] 164 | } 165 | ] 166 | } -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/POS3_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 |
"version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS3_Commented.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [] 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "vBOfYpV_U84V" 33 | }, 34 | "source": [ 35 | "## Importing libraries" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "ZqugswuAU84a" 42 | }, 43 | "source": [ 44 | "import pandas as pd\n", 45 | "import numpy as np\n", 46 | "import os\n", 47 | "import spacy \n", 48 | "from tqdm import tqdm" 49 | ], 50 | "execution_count": null, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "7cD099pXU84b" 57 | }, 58 | "source": [ 59 | "### Read reviews data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "7biN0zEoU84c" 66 | }, 67 | "source": [ 68 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n", 69 | "samsung_reviews=con.read()\n", 70 | "con.close()" 71 | ], 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "id": "ZVK2nZJ9U84c" 79 | }, 80 | "source": [ 81 | "### Can we reduce the time taken?\n", 82 | "[Pipelines (Spacy)](https://spacy.io/usage/processing-pipelines)\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": { 88 | "id": "bU_dNtImU84d" 89 | }, 90 | "source": [ 91 | "" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "XQd_68ZuU84d" 98 | }, 99 | "source": [ 100 | "# shorten the pipeline loading\n" 101 | ], 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "grYm28A9U84e" 109 | }, 110 | "source": [ 111 | "" 112 | ], 113 | "execution_count": null, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "KIqUMEsuU84g" 120 | }, 121 | "source": [ 122 | "" 123 | ], 124 | "execution_count": null, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "metadata": { 130 | "id": "WaN7ver8U84h" 131 | }, 132 | "source": [ 133 | "" 134 | ], 135 | "execution_count": null, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "id": "qMmMUUHjU84i" 142 | }, 143 | "source": [ 144 | "" 145 | ], 146 | "execution_count": null, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "id": "TKKLAcF2U84j" 153 | }, 154 | "source": [ 155 | "### Let's process all the reviews now and see if the time taken is less!" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "DpDKMbiuU84k" 162 | }, 163 | "source": [ 164 | "" 165 | ], 166 | "execution_count": null, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "id": "xDqdFstUU84k" 173 | }, 174 | "source": [ 175 | "### Does the hypothesis of nouns capturing `product features` hold?"
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "metadata": { 181 | "id": "6QMCZN6lU84k" 182 | }, 183 | "source": [ 184 | "" 185 | ], 186 | "execution_count": null, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "metadata": { 192 | "id": "3ePWgP3hU84l" 193 | }, 194 | "source": [ 195 | "" 196 | ], 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "id": "vMKO-5IvU84m" 204 | }, 205 | "source": [ 206 | "### We now know that people mention `battery`, `product`, `screen` etc. But we still don't know in what context they mention these keywords" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "id": "oP4nyL_AU84m" 213 | }, 214 | "source": [ 215 | "### Summary:\n", 216 | " - The most frequently used lemmatised noun forms inform us about the product features people are talking about in product reviews\n", 217 | " - In order to process the review data faster, spacy allows us to enable only parts of the model inference pipeline via the `spacy.load()` command and its `disable` parameter" 218 | ] 219 | } 220 | ] 221 | } -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/POS1_ipynb_txt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS1_ipynb_txt.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "OKcXwoZHurtB" 32 | }, 33 | "source": [ 34 | "## Importing libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "r5zWwc1_urtR" 41 | }, 42 | "source": [ 43 | "import pandas as pd\n", 44 | "import numpy as np\n", 45 | "import os" 46 | ], 47 | "execution_count": null, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "Bd28EuN3urtT" 54 | }, 55 | "source": [ 56 | "### How do we identify product features?\n", 57 | "" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "metadata": { 63 | "id": "5zP8A4ysurtU" 64 | }, 65 | "source": [ 66 | "sent1 = \"I loved the screen on this phone.\"\n", 67 | "sent2 = \"The battery life on this phone is great.\"\n", 68 | "sent3 = \"The speakers are pathetic.\"" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "id": "CT_yMlbiurtV" 77 | }, 78 | "source": [ 79 | "### Let's do a POS parse and see if we can figure out some patterns."
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "XYXVM2CrurtW" 86 | }, 87 | "source": [ 88 | "import spacy \n", 89 | "nlp = spacy.load(\"en_core_web_sm\")" 90 | ], 91 | "execution_count": null, 92 | "outputs": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "kD53XzhEurtX", 98 | "outputId": "c4e2e3eb-0c29-4ec0-be3b-feef37646d59" 99 | }, 100 | "source": [ 101 | "doc1 = nlp(sent1)\n", 102 | "for tok in doc1:\n", 103 | " print(tok.text,tok.pos_)" 104 | ], 105 | "execution_count": null, 106 | "outputs": [ 107 | { 108 | "output_type": "stream", 109 | "text": [ 110 | "I PRON\n", 111 | "loved VERB\n", 112 | "the DET\n", 113 | "screen NOUN\n", 114 | "on ADP\n", 115 | "this DET\n", 116 | "phone NOUN\n", 117 | ". PUNCT\n" 118 | ], 119 | "name": "stdout" 120 | } 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "metadata": { 126 | "id": "RAHWa3uturtZ", 127 | "outputId": "1e9d3adf-857f-4343-8a71-296fc7340a6d" 128 | }, 129 | "source": [ 130 | "doc2 = nlp(sent2)\n", 131 | "for tok in doc2:\n", 132 | " print(tok.text,tok.pos_)" 133 | ], 134 | "execution_count": null, 135 | "outputs": [ 136 | { 137 | "output_type": "stream", 138 | "text": [ 139 | "The DET\n", 140 | "battery NOUN\n", 141 | "life NOUN\n", 142 | "on ADP\n", 143 | "this DET\n", 144 | "phone NOUN\n", 145 | "is AUX\n", 146 | "great ADJ\n", 147 | ". PUNCT\n" 148 | ], 149 | "name": "stdout" 150 | } 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "id": "5z-Q-Rakurta", 157 | "outputId": "93b1956a-2d1a-453b-bc85-896a1a1018bc" 158 | }, 159 | "source": [ 160 | "doc3 = nlp(sent3)\n", 161 | "for tok in doc3:\n", 162 | " print(tok.text,tok.pos_)" 163 | ], 164 | "execution_count": null, 165 | "outputs": [ 166 | { 167 | "output_type": "stream", 168 | "text": [ 169 | "The DET\n", 170 | "speakers NOUN\n", 171 | "are AUX\n", 172 | "pathetic ADJ\n", 173 | ". 
PUNCT\n" 174 | ], 175 | "name": "stdout" 176 | } 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "id": "EWh298c3urte" 183 | }, 184 | "source": [ 185 | "#### **Product features such as `screen`, `battery`, `speaker` have a POS tag of NOUN**" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "id": "XrtRHVfUurtf" 192 | }, 193 | "source": [ 194 | "## Summary\n", 195 | "- Product features such as `screen`, `battery` and `speaker` have a POS tag of Noun\n", 196 | "- If we can find the frequency count of all the nouns in our data, then by looking at top-n nouns we can find out what product features people are talking about\n", 197 | "- Check hypothesis on a real world dataset" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "tmqDwHWBurtg" 204 | }, 205 | "source": [ 206 | "" 207 | ], 208 | "execution_count": null, 209 | "outputs": [] 210 | } 211 | ] 212 | } -------------------------------------------------------------------------------- /Heteronyms Detection POS Tagging Use Case/Code files/Heteronyms_POS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Heteronyms_POS.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "C5jJ9kQzPjge" 23 | }, 24 | "source": [ 25 | "**Heteronyms** are the words that have same spelling but mean different things when pronounced differently. \n", 26 | "\n", 27 | "\n", 28 | "- Recall the word *lead* from the lectures. It can refer to the metal lead or the act of leadership. 
The two pronounciations have different meanings.\n", 29 | "\n", 30 | "- For machine translation systems or text to speech systems, the ability to identify the correct sense of the word is crucial.\n", 31 | "\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "sHvYVoL7i7_a" 39 | }, 40 | "source": [ 41 | "Let us have a look at this example:\n", 42 | "\n", 43 | "https://translate.google.com/?sl=en&tl=hi&text=She%20wished%20she%20could%20desert%20him%20in%20the%20desert.%0A&op=translate\n", 44 | "\n", 45 | "Example taken from: http://www-personal.umich.edu/~cellis/heteronym.html\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "metadata": { 51 | "id": "foKQ_tmOEca0" 52 | }, 53 | "source": [ 54 | "# Import SpaCy library\n", 55 | "import spacy " 56 | ], 57 | "execution_count": null, 58 | "outputs": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "metadata": { 63 | "id": "YRtKtXi9kC72" 64 | }, 65 | "source": [ 66 | "# Load pre-trained SpaCy model for performing basic \n", 67 | "# NLP tasks such as POS tagging, parsing, etc.\n", 68 | "model = spacy.load(\"en_core_web_sm\")" 69 | ], 70 | "execution_count": null, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "yjtQ_cgdk3pn" 77 | }, 78 | "source": [ 79 | "#Use the model to process the input sentence\n", 80 | "tokens = model(\"She wished she could desert him in the desert.\")" 81 | ], 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "colab": { 89 | "base_uri": "https://localhost:8080/" 90 | }, 91 | "id": "gXtwtkrflcPF", 92 | "outputId": "a7c216e9-f1e9-4631-a382-18ef367dfe4d" 93 | }, 94 | "source": [ 95 | "# Print the tokens and their respective PoS tags.\n", 96 | "for token in tokens:\n", 97 | " print(token.text, \"--\", token.pos_, \"--\", token.tag_)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [ 101 | { 102 | "output_type": "stream", 103 | "text": [ 104 | "She -- PRON -- PRP\n", 105 | "wished -- VERB -- VBD\n", 106 | "she -- PRON -- PRP\n", 107 | "could -- VERB -- MD\n", 108 | "desert -- VERB -- VB\n", 109 | "him -- PRON -- PRP\n", 110 | "in -- ADP -- IN\n", 111 | "the -- DET -- DT\n", 112 | "desert -- NOUN -- NN\n", 113 | ". -- PUNCT -- .\n" 114 | ], 115 | "name": "stdout" 116 | } 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "id": "G5YMNqqF-sHl" 123 | }, 124 | "source": [ 125 | "Note here that in the above example, the two instances of *desert* have different PoS tags and hence, the text to speech system can use this information to generate the correct pronounciation. \n", 126 | "\n", 127 | "The above task is a specific example of the larger NLP problem called Word Sense Disambiguation (WSD). 
For words that have more than one meaning, WSD is the problem of identifying the correct meaning of the word based on the context in which the word is used.\n", 128 | "\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": { 134 | "id": "gJMNHvrRx-rA" 135 | }, 136 | "source": [ 137 | "Note that this technique will not work when the different meanings have the same PoS tags.\n", 138 | "\n", 139 | "https://translate.google.com/?sl=en&tl=hi&text=The%20bass%20swam%20around%20the%20bass%20drum%20on%20the%20ocean%20floor.&op=translate" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "metadata": { 145 | "id": "Xd8l1zfNmgQz", 146 | "colab": { 147 | "base_uri": "https://localhost:8080/" 148 | }, 149 | "outputId": "498b23e2-37c3-4269-e6b2-4b8def4f319c" 150 | }, 151 | "source": [ 152 | "# Let's take a new example.\n", 153 | "tokens = model(\"The bass swam around the bass drum on the ocean floor\")\n", 154 | "for token in tokens:\n", 155 | " print(token.text, \"--\", token.pos_, \"--\", token.tag_)" 156 | ], 157 | "execution_count": null, 158 | "outputs": [ 159 | { 160 | "output_type": "stream", 161 | "text": [ 162 | "The -- DET -- DT\n", 163 | "bass -- NOUN -- NN\n", 164 | "swam -- PROPN -- NNP\n", 165 | "around -- ADP -- IN\n", 166 | "the -- DET -- DT\n", 167 | "bass -- NOUN -- NN\n", 168 | "drum -- NOUN -- NN\n", 169 | "on -- ADP -- IN\n", 170 | "the -- DET -- DT\n", 171 | "ocean -- NOUN -- NN\n", 172 | "floor -- NOUN -- NN\n" 173 | ], 174 | "name": "stdout" 175 | } 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "metadata": { 181 | "id": "3IHqXulWJYVj" 182 | }, 183 | "source": [ 184 | "" 185 | ], 186 | "execution_count": null, 187 | "outputs": [] 188 | } 189 | ] 190 | } -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/Dep_parsing3_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "Dep_parsing3_Commented.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "mrzv9UJ-tArb" 32 | }, 33 | "source": [ 34 | "# Import libraries\n", 35 | "import spacy\n", 36 | "from spacy import displacy\n", 37 | "from spacy.matcher import Matcher\n", 38 | "import pandas as pd\n", 39 | "nlp = spacy.load(\"en_core_web_sm\")" 40 | ], 41 | "execution_count": null, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "id": "i5gre9OYtAre" 48 | }, 49 | "source": [ 50 | "### Lets check our rule on a larger corpus" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "JXDFu7f3tArf" 57 | }, 58 | "source": [ 59 | "# load the dataset csv file\n" 60 | ], 61 | "execution_count": null, 62 | "outputs": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "metadata": { 67 | "id": "9VRU3oostArg" 68 | }, 69 | "source": [ 70 | "# Print the shape of the dataframe.\n" 71 | ], 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | 
"metadata": { 78 | "id": "9yzs-odUtArg" 79 | }, 80 | "source": [ 81 | "# Separate out active and passive sentences in arrays.\n" 82 | ], 83 | "execution_count": null, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "id": "qi53fEfutArg" 90 | }, 91 | "source": [ 92 | "### Create the rule" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "E5mJnYPotArh" 99 | }, 100 | "source": [ 101 | "" 102 | ], 103 | "execution_count": null, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "metadata": { 109 | "id": "pmzbRTwgtArh" 110 | }, 111 | "source": [ 112 | "" 113 | ], 114 | "execution_count": null, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "uWDFJWzZtArh" 121 | }, 122 | "source": [ 123 | "### Check rule on active voice sentences" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "iVtObi5ItAri" 130 | }, 131 | "source": [ 132 | "" 133 | ], 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "id": "lDLw_valtAri" 141 | }, 142 | "source": [ 143 | "### Check rule on passive voice sentences" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "fr7aJK7RtAri" 150 | }, 151 | "source": [ 152 | "" 153 | ], 154 | "execution_count": null, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "bqobr0CctArj" 161 | }, 162 | "source": [ 163 | "### Let's troubleshoot" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "4EdEzFWHtArj" 170 | }, 171 | "source": [ 172 | "" 173 | ], 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "79UCxE9ItArj" 181 | }, 182 | "source": [ 183 | "" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "-iN097kQtArk" 192 | }, 193 | "source": [ 194 | "" 195 | ], 196 | "execution_count": null, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "8F9vXmDHtArk" 203 | }, 204 | "source": [ 205 | "### Let's visualize their dependency trees" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "id": "XPspRsVvtArk" 212 | }, 213 | "source": [ 214 | "" 215 | ], 216 | "execution_count": null, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "id": "9WtDcQ_XtArl" 223 | }, 224 | "source": [ 225 | "" 226 | ], 227 | "execution_count": null, 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": { 233 | "id": "0E-BKP-vtArn" 234 | }, 235 | "source": [ 236 | "[Dependencies](https://universaldependencies.org/docs/en/dep/)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": { 242 | "id": "x4_D5nq5tAro" 243 | }, 244 | "source": [ 245 | "### Update our rule\n", 246 | "[Reference](https://spacy.io/usage/rule-based-matching)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "id": "Rt2RSbsbtAro" 253 | }, 254 | "source": [ 255 | "" 256 | ], 257 | "execution_count": null, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "jqRoircqtAro" 264 | }, 265 | "source": [ 266 | "" 267 | ], 268 | "execution_count": null, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": 
"code", 273 | "metadata": { 274 | "id": "gsPNOHhttAro" 275 | }, 276 | "source": [ 277 | "" 278 | ], 279 | "execution_count": null, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "id": "BDUS7kn4tArp" 286 | }, 287 | "source": [ 288 | "## Summary\n", 289 | " - Always test your rules and hueristics on a larger corpus to see the effectiveness of the rules\n", 290 | " - One can write intricate matching rules using `matcher` object" 291 | ] 292 | } 293 | ] 294 | } -------------------------------------------------------------------------------- /Dependency Parsing/Commented code files/Dep_parsing2_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "Dep_parsing2_Commented.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [] 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "uFMZ8FAqc8GS" 33 | }, 34 | "source": [ 35 | "# Import libraries\n", 36 | "import spacy\n", 37 | "from spacy import displacy\n", 38 | "import pandas as pd\n", 39 | "nlp = spacy.load(\"en_core_web_sm\")" 40 | ], 41 | "execution_count": null, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "lMizkNfmc8GX" 48 | }, 49 | "source": [ 50 | "# Define active and passive sentences.\n", 51 | "active = ['Hens lay eggs.',\n", 52 | " 'Birds build nests.',\n", 53 | " 'The batter hit the ball.',\n", 54 | " 'The computer transmitted a copy of the manual']\n", 55 | "passive = ['Eggs are laid by hens',\n", 56 | " 'Nests are built by birds',\n", 57 | " 'The ball was hit by the batter',\n", 58 | " 'A copy of the manual was transmitted by the computer.']" 59 | ], 60 | "execution_count": null, 61 | "outputs": [] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "id": "47aHzaYTc8GZ" 67 | }, 68 | "source": [ 69 | "### How do we impliment the rule `if dep nsubjpass, then passive else not`?" 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "XmTIIuqkc8GZ" 76 | }, 77 | "source": [ 78 | "# Import matcher \n", 79 | "from spacy.matcher import Matcher" 80 | ], 81 | "execution_count": null, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "id": "Tulu8FF5c8Ga" 88 | }, 89 | "source": [ 90 | "### Read more about it [here](https://spacy.io/api/matcher)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "metadata": { 96 | "id": "PdKzGXTbc8Ga" 97 | }, 98 | "source": [ 99 | "# Visualise the dependency parse tree of the 1st sentence of the passive sentences.\n" 100 | ], 101 | "execution_count": null, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "id": "z3NmcC_Tc8Gc" 108 | }, 109 | "source": [ 110 | "### Create a rule with `Matcher`" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "NPB4mEXxc8Gd" 117 | }, 118 | "source": [ 119 | "# Create a rule with matcher\n" 120 | ], 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "t4sdkcioc8Ge" 128 | }, 129 | "source": [ 130 | "" 131 | ], 132 | "execution_count": null, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "metadata": { 138 | "id": "as4nsdejc8Gf" 139 | }, 140 | "source": [ 141 | "" 142 | ], 143 | "execution_count": null, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "2gemzSC2c8Gf" 150 | }, 151 | "source": [ 152 | "" 153 | ], 154 | "execution_count": null, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "RUSAqlZTc8Gg" 161 | }, 162 | "source": [ 163 | "### Create a rule for `passive voice`" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "jrDq70A3c8Gh" 170 | }, 171 | "source": [ 172 | "" 173 | ], 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "GA6G7_Jrc8Gh" 181 | }, 182 | "source": [ 183 | "" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "id": "Syig5C1nc8Gi" 192 | }, 193 | "source": [ 194 | "### Let's check how this rule works if we use it on a sentence with `active voice`" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "id": "BqxpwGDJc8Gi" 201 | }, 202 | "source": [ 203 | "" 204 | ], 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "id": "iQ374wu9c8Gj" 212 | }, 213 | "source": [ 214 | "" 215 | ], 216 | "execution_count": null, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "id": "tsC4QHYmc8Gj" 223 | }, 224 | "source": [ 225 | "" 226 | ], 227 | "execution_count": null, 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": { 233 | "id": "zA197pFCc8Gk" 234 | }, 235 | "source": [ 236 | "### Now let's make a function that implements this logic" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "metadata": { 242 | "id": "W4C80We_c8Gk" 243 | }, 244 | "source": [ 245 | "" 246 | ], 247 | "execution_count": null, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "5e1P9nhcc8Gl" 254 | }, 255 | "source": [ 256 | "### Let's test this function on our small sample of sentences and see how
the pipeline will work" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "metadata": { 262 | "id": "wPe6aHIMc8Gl" 263 | }, 264 | "source": [ 265 | "" 266 | ], 267 | "execution_count": null, 268 | "outputs": [] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "metadata": { 273 | "id": "GkVgfN8Wc8Gm" 274 | }, 275 | "source": [ 276 | "" 277 | ], 278 | "execution_count": null, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "id": "8VjFb4jXc8Gn" 285 | }, 286 | "source": [ 287 | "### Summary\n", 288 | " - One can go a long way by observing patterns in linguistic data; you don't always need to know the details of linguistics very well.\n", 289 | " - One can use the `matcher` object to find if certain linguistic patterns exist in data" 290 | ] 291 | } 292 | ] 293 | } -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/POS2_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS2_Commented.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "FXsu8MxUVAKT" 32 | }, 33 | "source": [ 34 | "## Importing libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "kXr4Cxh2VAKZ" 41 | }, 42 | "source": [ 43 | "import pandas as pd\n", 44 | "import numpy as np\n", 45 | "import os" 46 | ], 47 | "execution_count": null, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "metadata": { 53 | "id": "Eq-7JCT6VAKa" 54 | }, 55 | "source": [ 56 | "import spacy \n", 57 | "nlp = spacy.load(\"en_core_web_sm\")" 58 | ], 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "ItxQ587VVAKb" 66 | }, 67 | "source": [ 68 | "### Read reviews data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "metadata": { 74 | "id": "q1pr6bbsVAKb" 75 | }, 76 | "source": [ 77 | "# Load the Samsung.txt dataset\n", 78 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n", 79 | "samsung_reviews=con.read()\n", 80 | "con.close()" 81 | ], 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "f4GyRJMEVAKb" 89 | }, 90 | "source": [ 91 | "len(samsung_reviews.split(\"\\n\"))" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "fU4ZvurwVAKc" 100 | }, 101 | "source": [ 102 | "### The dataset is a text file where each review is on a new line" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "bPT-wYC2VAKd" 109 | }, 110 | "source": [ 111 | "samsung_reviews.split(\"\\n\")[0:4]" 112 | ], 113 | "execution_count": null, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "id": "EUi5gE7WVAKd" 120 | }, 121 | "source": [ 122 | "### Will our hypothesis hold on real-world data? 
`Product features---POS_NOUN`" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "yUx96iRtVAKg" 129 | }, 130 | "source": [ 131 | "review1=samsung_reviews.split(\"\\n\")[0]\n", 132 | "review1=nlp(review1)" 133 | ], 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "id": "EKt6ZNPnVAKg" 141 | }, 142 | "source": [ 143 | "### Lets do nlp parse on part of one review in our dataset" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "gOCZxFp_VAKh" 150 | }, 151 | "source": [ 152 | "" 153 | ], 154 | "execution_count": null, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "xOBlgcu0VAKh" 161 | }, 162 | "source": [ 163 | "#### Real world data is usually messy, observe the words `found` and `used`" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "yB1t1bPTVAKh" 170 | }, 171 | "source": [ 172 | "" 173 | ], 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "G3LDkWepVAKi" 181 | }, 182 | "source": [ 183 | "" 184 | ], 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "id": "FEXKFzlPVAKi" 192 | }, 193 | "source": [ 194 | "## Get most frequent lemma forms of nouns\n" 195 | ], 196 | "execution_count": null, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "x-uR8q-LVAKi" 203 | }, 204 | "source": [ 205 | "#### It seems possible that if we extract all the nouns from the reviews and look at the top 5 most frequent lemmatised noun forms, we will be able to identify `What people are talking about?`" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "QOvqBuwgVAKj" 212 | }, 213 | "source": [ 214 | "### Lets repeat this experiment on a larger set of reviews" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "metadata": { 220 | "id": "RsDUv3X1VAKj" 221 | }, 222 | "source": [ 223 | "" 224 | ], 225 | "execution_count": null, 226 | "outputs": [] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "id": "lrVi3UPTVAKj" 232 | }, 233 | "source": [ 234 | "### Lets add some way of keeping track of time" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "zNhp_45EVAKj" 241 | }, 242 | "source": [ 243 | "" 244 | ], 245 | "execution_count": null, 246 | "outputs": [] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "metadata": { 251 | "id": "jai2hnfoVAKk" 252 | }, 253 | "source": [ 254 | "" 255 | ], 256 | "execution_count": null, 257 | "outputs": [] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "id": "94gZ5gxLVAKk" 263 | }, 264 | "source": [ 265 | "### Did you notice anything? What do you think will be the time taken to process all the reviews?" 
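One way to answer that question is to time a small sample and extrapolate. A sketch, assuming the `samsung_reviews` string and `nlp` model from the cells above (the 100-review sample size is an arbitrary choice):

```python
import time

reviews = samsung_reviews.split("\n")

start = time.time()
for review in reviews[:100]:
    _ = nlp(review)
elapsed = time.time() - start

# Extrapolate from the sample to the full corpus.
est_minutes = elapsed / 100 * len(reviews) / 60
print(f"{elapsed:.1f}s for 100 reviews -> ~{est_minutes:.0f} min for all {len(reviews)}")
```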
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "metadata": { 271 | "id": "Op_ftvWyVAKk" 272 | }, 273 | "source": [ 274 | "" 275 | ], 276 | "execution_count": null, 277 | "outputs": [] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "metadata": { 282 | "id": "VDMknUOXVAKl" 283 | }, 284 | "source": [ 285 | "" 286 | ], 287 | "execution_count": null, 288 | "outputs": [] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": { 293 | "id": "1PsJmzrtVAKl" 294 | }, 295 | "source": [ 296 | "## Summary\n", 297 | "- POS tag based rule seems to be working well\n", 298 | "- We need to figure out a way to reduce the time taken to process reviews" 299 | ] 300 | } 301 | ] 302 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.12.0 2 | alabaster==0.7.12 3 | albumentations==0.1.12 4 | altair==4.1.0 5 | appdirs==1.4.4 6 | argon2-cffi==20.1.0 7 | arviz==0.11.2 8 | astor==0.8.1 9 | astropy==4.2.1 10 | astunparse==1.6.3 11 | async-generator==1.10 12 | atari-py==0.2.9 13 | atomicwrites==1.4.0 14 | attrs==21.2.0 15 | audioread==2.1.9 16 | autograd==1.3 17 | Babel==2.9.1 18 | backcall==0.2.0 19 | beautifulsoup4==4.6.3 20 | bleach==3.3.0 21 | blis==0.4.1 22 | bokeh==2.3.2 23 | Bottleneck==1.3.2 24 | branca==0.4.2 25 | bs4==0.0.1 26 | CacheControl==0.12.6 27 | cached-property==1.5.2 28 | cachetools==4.2.2 29 | catalogue==1.0.0 30 | certifi==2021.5.30 31 | cffi==1.14.5 32 | cftime==1.5.0 33 | chardet==3.0.4 34 | click==7.1.2 35 | cloudpickle==1.3.0 36 | cmake==3.12.0 37 | cmdstanpy==0.9.5 38 | colorcet==2.0.6 39 | colorlover==0.3.0 40 | community==1.0.0b1 41 | contextlib2==0.5.5 42 | convertdate==2.3.2 43 | coverage==3.7.1 44 | coveralls==0.5 45 | crcmod==1.7 46 | cufflinks==0.17.3 47 | cvxopt==1.2.6 48 | cvxpy==1.0.31 49 | cycler==0.10.0 50 | cymem==2.0.5 51 | Cython==0.29.23 52 | daft==0.0.4 53 | dask==2.12.0 54 | datascience==0.10.6 55 | debugpy==1.0.0 56 | decorator==4.4.2 57 | defusedxml==0.7.1 58 | descartes==1.1.0 59 | dill==0.3.4 60 | distributed==1.25.3 61 | dlib==19.18.0 62 | dm-tree==0.1.6 63 | docopt==0.6.2 64 | docutils==0.17.1 65 | dopamine-rl==1.0.5 66 | earthengine-api==0.1.269 67 | easydict==1.9 68 | ecos==2.0.7.post1 69 | editdistance==0.5.3 70 | en-core-web-sm==2.2.5 71 | entrypoints==0.3 72 | ephem==4.0.0.2 73 | et-xmlfile==1.1.0 74 | fa2==0.3.5 75 | fastai==1.0.61 76 | fastdtw==0.3.4 77 | fastprogress==1.0.0 78 | fastrlock==0.6 79 | fbprophet==0.7.1 80 | feather-format==0.4.1 81 | filelock==3.0.12 82 | firebase-admin==4.4.0 83 | fix-yahoo-finance==0.0.22 84 | Flask==1.1.4 85 | flatbuffers==1.12 86 | folium==0.8.3 87 | future==0.16.0 88 | gast==0.4.0 89 | GDAL==2.2.2 90 | gdown==3.6.4 91 | gensim==3.6.0 92 | geographiclib==1.50 93 | geopy==1.17.0 94 | gin-config==0.4.0 95 | glob2==0.7 96 | google==2.0.3 97 | google-api-core==1.26.3 98 | google-api-python-client==1.12.8 99 | google-auth==1.31.0 100 | google-auth-httplib2==0.0.4 101 | google-auth-oauthlib==0.4.4 102 | google-cloud-bigquery==1.21.0 103 | google-cloud-bigquery-storage==1.1.0 104 | google-cloud-core==1.0.3 105 | google-cloud-datastore==1.8.0 106 | google-cloud-firestore==1.7.0 107 | google-cloud-language==1.2.0 108 | google-cloud-storage==1.18.1 109 | google-cloud-translate==1.5.0 110 | google-colab==1.0.0 111 | google-pasta==0.2.0 112 | google-resumable-media==0.4.1 113 | googleapis-common-protos==1.53.0 114 | googledrivedownloader==0.4 115 | graphviz==0.10.1 116 | 
greenlet==1.1.0 117 | grpcio==1.34.1 118 | gspread==3.0.1 119 | gspread-dataframe==3.0.8 120 | gym==0.17.3 121 | h5py==3.1.0 122 | HeapDict==1.0.1 123 | hijri-converter==2.1.2 124 | holidays==0.10.5.2 125 | holoviews==1.14.4 126 | html5lib==1.0.1 127 | httpimport==0.5.18 128 | httplib2==0.17.4 129 | httplib2shim==0.0.3 130 | humanize==0.5.1 131 | hyperopt==0.1.2 132 | ideep4py==2.0.0.post3 133 | idna==2.10 134 | imageio==2.4.1 135 | imagesize==1.2.0 136 | imbalanced-learn==0.4.3 137 | imblearn==0.0 138 | imgaug==0.2.9 139 | importlib-metadata==4.5.0 140 | importlib-resources==5.1.4 141 | imutils==0.5.4 142 | inflect==2.1.0 143 | iniconfig==1.1.1 144 | install==1.3.4 145 | intel-openmp==2021.2.0 146 | intervaltree==2.1.0 147 | ipykernel==4.10.1 148 | ipython==5.5.0 149 | ipython-genutils==0.2.0 150 | ipython-sql==0.3.9 151 | ipywidgets==7.6.3 152 | itsdangerous==1.1.0 153 | jax==0.2.13 154 | jaxlib==0.1.66+cuda110 155 | jdcal==1.4.1 156 | jedi==0.18.0 157 | jieba==0.42.1 158 | Jinja2==2.11.3 159 | joblib==1.0.1 160 | jpeg4py==0.1.4 161 | jsonschema==2.6.0 162 | jupyter==1.0.0 163 | jupyter-client==5.3.5 164 | jupyter-console==5.2.0 165 | jupyter-core==4.7.1 166 | jupyterlab-pygments==0.1.2 167 | jupyterlab-widgets==1.0.0 168 | kaggle==1.5.12 169 | kapre==0.3.5 170 | Keras==2.4.3 171 | keras-nightly==2.5.0.dev2021032900 172 | Keras-Preprocessing==1.1.2 173 | keras-vis==0.4.1 174 | kiwisolver==1.3.1 175 | korean-lunar-calendar==0.2.1 176 | librosa==0.8.1 177 | lightgbm==2.2.3 178 | llvmlite==0.34.0 179 | lmdb==0.99 180 | LunarCalendar==0.0.9 181 | lxml==4.2.6 182 | Markdown==3.3.4 183 | MarkupSafe==2.0.1 184 | matplotlib==3.2.2 185 | matplotlib-inline==0.1.2 186 | matplotlib-venn==0.11.6 187 | missingno==0.4.2 188 | mistune==0.8.4 189 | mizani==0.6.0 190 | mkl==2019.0 191 | mlxtend==0.14.0 192 | more-itertools==8.8.0 193 | moviepy==0.2.3.5 194 | mpmath==1.2.1 195 | msgpack==1.0.2 196 | multiprocess==0.70.12.2 197 | multitasking==0.0.9 198 | murmurhash==1.0.5 199 | music21==5.5.0 200 | natsort==5.5.0 201 | nbclient==0.5.3 202 | nbconvert==5.6.1 203 | nbformat==5.1.3 204 | nest-asyncio==1.5.1 205 | netCDF4==1.5.6 206 | networkx==2.5.1 207 | nibabel==3.0.2 208 | nltk==3.2.5 209 | notebook==5.3.1 210 | numba==0.51.2 211 | numexpr==2.7.3 212 | numpy==1.19.5 213 | nvidia-ml-py3==7.352.0 214 | oauth2client==4.1.3 215 | oauthlib==3.1.1 216 | okgrade==0.4.3 217 | opencv-contrib-python==4.1.2.30 218 | opencv-python==4.1.2.30 219 | openpyxl==2.5.9 220 | opt-einsum==3.3.0 221 | osqp==0.6.2.post0 222 | packaging==20.9 223 | palettable==3.3.0 224 | pandas==1.1.5 225 | pandas-datareader==0.9.0 226 | pandas-gbq==0.13.3 227 | pandas-profiling==1.4.1 228 | pandocfilters==1.4.3 229 | panel==0.11.3 230 | param==1.10.1 231 | parso==0.8.2 232 | pathlib==1.0.1 233 | patsy==0.5.1 234 | pexpect==4.8.0 235 | pickleshare==0.7.5 236 | Pillow==7.1.2 237 | pip-tools==4.5.1 238 | plac==1.1.3 239 | plotly==4.4.1 240 | plotnine==0.6.0 241 | pluggy==0.7.1 242 | pooch==1.4.0 243 | portpicker==1.3.9 244 | prefetch-generator==1.0.1 245 | preshed==3.0.5 246 | prettytable==2.1.0 247 | progressbar2==3.38.0 248 | prometheus-client==0.11.0 249 | promise==2.3 250 | prompt-toolkit==1.0.18 251 | protobuf==3.12.4 252 | psutil==5.4.8 253 | psycopg2==2.7.6.1 254 | ptyprocess==0.7.0 255 | py==1.10.0 256 | pyarrow==3.0.0 257 | pyasn1==0.4.8 258 | pyasn1-modules==0.2.8 259 | pycocotools==2.0.2 260 | pycparser==2.20 261 | pyct==0.4.8 262 | pydata-google-auth==1.2.0 263 | pydot==1.3.0 264 | pydot-ng==2.0.0 265 | pydotplus==2.0.2 266 | 
PyDrive==1.3.1 267 | pyemd==0.5.1 268 | pyerfa==2.0.0 269 | pyglet==1.5.0 270 | Pygments==2.6.1 271 | pygobject==3.26.1 272 | pymc3==3.11.2 273 | PyMeeus==0.5.11 274 | pymongo==3.11.4 275 | pymystem3==0.2.0 276 | PyOpenGL==3.1.5 277 | pyparsing==2.4.7 278 | pyrsistent==0.17.3 279 | pysndfile==1.3.8 280 | PySocks==1.7.1 281 | pystan==2.19.1.1 282 | pytest==3.6.4 283 | python-apt==0.0.0 284 | python-chess==0.23.11 285 | python-dateutil==2.8.1 286 | python-louvain==0.15 287 | python-slugify==5.0.2 288 | python-utils==2.5.6 289 | pytz==2018.9 290 | pyviz-comms==2.0.2 291 | PyWavelets==1.1.1 292 | PyYAML==3.13 293 | pyzmq==22.1.0 294 | qdldl==0.1.5.post0 295 | qtconsole==5.1.0 296 | QtPy==1.9.0 297 | regex==2019.12.20 298 | requests==2.23.0 299 | requests-oauthlib==1.3.0 300 | resampy==0.2.2 301 | retrying==1.3.3 302 | rpy2==3.4.5 303 | rsa==4.7.2 304 | scikit-image==0.16.2 305 | scikit-learn==0.22.2.post1 306 | scipy==1.4.1 307 | screen-resolution-extra==0.0.0 308 | scs==2.1.4 309 | seaborn==0.11.1 310 | semver==2.13.0 311 | Send2Trash==1.5.0 312 | setuptools-git==1.2 313 | Shapely==1.7.1 314 | simplegeneric==0.8.1 315 | six==1.15.0 316 | sklearn==0.0 317 | sklearn-pandas==1.8.0 318 | smart-open==5.1.0 319 | snowballstemmer==2.1.0 320 | sortedcontainers==2.4.0 321 | SoundFile==0.10.3.post1 322 | spacy==2.2.4 323 | Sphinx==1.8.5 324 | sphinxcontrib-serializinghtml==1.1.5 325 | sphinxcontrib-websupport==1.2.4 326 | SQLAlchemy==1.4.18 327 | sqlparse==0.4.1 328 | srsly==1.0.5 329 | statsmodels==0.10.2 330 | sympy==1.7.1 331 | tables==3.4.4 332 | tabulate==0.8.9 333 | tblib==1.7.0 334 | tensorboard==2.5.0 335 | tensorboard-data-server==0.6.1 336 | tensorboard-plugin-wit==1.8.0 337 | tensorflow==2.5.0 338 | tensorflow-datasets==4.0.1 339 | tensorflow-estimator==2.5.0 340 | tensorflow-gcs-config==2.5.0 341 | tensorflow-hub==0.12.0 342 | tensorflow-metadata==1.0.0 343 | tensorflow-probability==0.12.1 344 | termcolor==1.1.0 345 | terminado==0.10.1 346 | testpath==0.5.0 347 | text-unidecode==1.3 348 | textblob==0.15.3 349 | Theano-PyMC==1.1.2 350 | thinc==7.4.0 351 | tifffile==2021.6.14 352 | toml==0.10.2 353 | toolz==0.11.1 354 | torch==1.9.0+cu102 355 | torchsummary==1.5.1 356 | torchtext==0.10.0 357 | torchvision==0.10.0+cu102 358 | tornado==5.1.1 359 | tqdm==4.41.1 360 | traitlets==5.0.5 361 | tweepy==3.10.0 362 | typeguard==2.7.1 363 | typing-extensions==3.7.4.3 364 | tzlocal==1.5.1 365 | uritemplate==3.0.1 366 | urllib3==1.24.3 367 | vega-datasets==0.9.0 368 | wasabi==0.8.2 369 | wcwidth==0.2.5 370 | webencodings==0.5.1 371 | Werkzeug==1.0.1 372 | widgetsnbextension==3.5.1 373 | wordcloud==1.5.0 374 | wrapt==1.12.1 375 | xarray==0.18.2 376 | xgboost==0.90 377 | xkit==0.0.0 378 | xlrd==1.1.0 379 | xlwt==1.3.0 380 | yellowbrick==0.9.1 381 | zict==2.0.0 382 | zipp==3.4.1 383 | -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/POS3_ipynb_txt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS3.ipynb.txt", 24 | 
"provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "jCy6Hw3LjXEz" 32 | }, 33 | "source": [ 34 | "## Importing libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "MBorzfsNjXE0" 41 | }, 42 | "source": [ 43 | "import pandas as pd\n", 44 | "import numpy as np\n", 45 | "import os\n", 46 | "import spacy \n", 47 | "from tqdm import tqdm" 48 | ], 49 | "execution_count": null, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "id": "gjlVHS9yjXE1" 56 | }, 57 | "source": [ 58 | "### Read reviews data" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "unZvzHyPjXE2" 65 | }, 66 | "source": [ 67 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n", 68 | "samsung_reviews=con.read()\n", 69 | "con.close()" 70 | ], 71 | "execution_count": null, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "id": "-Xp3V0sijXE4" 78 | }, 79 | "source": [ 80 | "### Can we reduce the time taken?\n", 81 | "[Pipelines (Spacy)](https://spacy.io/usage/processing-pipelines)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "id": "hcz71QGCjXE5" 88 | }, 89 | "source": [ 90 | "" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "metadata": { 96 | "id": "fStfPPgCjXE6" 97 | }, 98 | "source": [ 99 | "# shorten the pipline loading\n", 100 | "nlp=spacy.load('en_core_web_sm',disable=['parser','ner'])" 101 | ], 102 | "execution_count": null, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "metadata": { 108 | "id": "dx3Ylbe5jXE6", 109 | "outputId": "2c44d403-1088-4268-f062-98bf412fbab7" 110 | }, 111 | "source": [ 112 | "nouns = []\n", 113 | "for review in tqdm(samsung_reviews.split(\"\\n\")[0:1000]):\n", 114 | " doc = nlp(review)\n", 115 | " for tok in doc:\n", 116 | " if tok.pos_==\"NOUN\":\n", 117 | " nouns.append(tok.lemma_.lower())" 118 | ], 119 | "execution_count": null, 120 | "outputs": [ 121 | { 122 | "output_type": "stream", 123 | "text": [ 124 | "100%|██████████| 1000/1000 [00:06<00:00, 148.24it/s]\n" 125 | ], 126 | "name": "stderr" 127 | } 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "2KmwyyX-jXE8" 134 | }, 135 | "source": [ 136 | "len(samsung_reviews.split(\"\\n\"))" 137 | ], 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "XUf8-pG1jXE9", 145 | "outputId": "288bda64-52b2-48dc-ce1d-bcecf596c39f" 146 | }, 147 | "source": [ 148 | "(46355/1000)*6" 149 | ], 150 | "execution_count": null, 151 | "outputs": [ 152 | { 153 | "output_type": "execute_result", 154 | "data": { 155 | "text/plain": [ 156 | "278.13" 157 | ] 158 | }, 159 | "metadata": { 160 | "tags": [] 161 | }, 162 | "execution_count": 5 163 | } 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "metadata": { 169 | "id": "nX83oQevjXE9", 170 | "outputId": "095ef791-0bd5-4310-dc03-00fba9af3a33" 171 | }, 172 | "source": [ 173 | "278/60" 174 | ], 175 | "execution_count": null, 176 | "outputs": [ 177 | { 178 | "output_type": "execute_result", 179 | "data": { 180 | "text/plain": [ 181 | "4.633333333333334" 182 | ] 183 | }, 184 | "metadata": { 185 | "tags": [] 186 | }, 187 | "execution_count": 6 188 | } 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": { 194 | "id": "-qZDcM2njXE-" 195 | }, 196 | "source": [ 197 | "### Lets process all the reviews now and see if time taken is 
less !!!" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "KiftxQmcjXE-", 204 | "outputId": "1f035757-c703-4fd5-fc25-14b99efeca8f" 205 | }, 206 | "source": [ 207 | "nouns = []\n", 208 | "for review in tqdm(samsung_reviews.split(\"\\n\")):\n", 209 | " doc = nlp(review)\n", 210 | " for tok in doc:\n", 211 | " if tok.pos_==\"NOUN\":\n", 212 | " nouns.append(tok.lemma_.lower())" 213 | ], 214 | "execution_count": null, 215 | "outputs": [ 216 | { 217 | "output_type": "stream", 218 | "text": [ 219 | "100%|██████████| 46355/46355 [04:27<00:00, 173.42it/s]\n" 220 | ], 221 | "name": "stderr" 222 | } 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": { 228 | "id": "4tDq-KDOjXE_" 229 | }, 230 | "source": [ 231 | "### Does the hypothesis of nouns capturing `product features` hold?" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "metadata": { 237 | "id": "f_9BsBMvjXFA", 238 | "outputId": "04060afd-fb64-4c84-e55f-eaba0cc85446" 239 | }, 240 | "source": [ 241 | "nouns=pd.Series(nouns)\n", 242 | "nouns.value_counts().head(5)" 243 | ], 244 | "execution_count": null, 245 | "outputs": [ 246 | { 247 | "output_type": "execute_result", 248 | "data": { 249 | "text/plain": [ 250 | "phone 43237\n", 251 | "battery 4350\n", 252 | "product 3907\n", 253 | "time 3825\n", 254 | "screen 3746\n", 255 | "dtype: int64" 256 | ] 257 | }, 258 | "metadata": { 259 | "tags": [] 260 | }, 261 | "execution_count": 8 262 | } 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "metadata": { 268 | "id": "7g33CJOXjXFA", 269 | "outputId": "525c082f-6d7e-4755-997f-13d2b8cddfb9" 270 | }, 271 | "source": [ 272 | "nouns.value_counts().head(10)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [ 276 | { 277 | "output_type": "execute_result", 278 | "data": { 279 | "text/plain": [ 280 | "phone 43237\n", 281 | "battery 4350\n", 282 | "product 3907\n", 283 | "time 3825\n", 284 | "screen 3746\n", 285 | "card 3399\n", 286 | "price 3148\n", 287 | "problem 3120\n", 288 | "camera 2773\n", 289 | "app 2606\n", 290 | "dtype: int64" 291 | ] 292 | }, 293 | "metadata": { 294 | "tags": [] 295 | }, 296 | "execution_count": 9 297 | } 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": { 303 | "id": "qlp6FvUMjXFB" 304 | }, 305 | "source": [ 306 | "### We now know that people mention `battery`, `product`, `screen` etc. 
But we still don't know in what context they mention these keywords" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "id": "zurzvf_tjXFB" 313 | }, 314 | "source": [ 315 | "### Summary:\n", 316 | " - The most frequently used lemmatised noun forms inform us about the product features people are talking about in product reviews\n", 317 | " - In order to process the review data faster, spacy allows us to disable parts of the model inference pipeline via the `disable` parameter of the `spacy.load()` command" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "metadata": { 323 | "id": "qmCWc_9vjXFC" 324 | }, 325 | "source": [ 326 | "" 327 | ], 328 | "execution_count": null, 329 | "outputs": [] 330 | } 331 | ] 332 | } -------------------------------------------------------------------------------- /POS Tagging Case Study/Commented code files/POS4_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS4_Commented.ipynb", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "npY9MI1yIyK8" 32 | }, 33 | "source": [ 34 | "## Importing libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "xp8j7pqtIyLC" 41 | }, 42 | "source": [ 43 | "import pandas as pd\n", 44 | "import numpy as np\n", 45 | "from tqdm import tqdm" 46 | ], 47 | "execution_count": null, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "Xr_U2bl3IyLD" 54 | }, 55 | "source": [ 56 | "### Read reviews data" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "metadata": { 62 | "id": "fFKFirMfIyLE" 63 | }, 64 | "source": [ 65 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n", 66 | "samsung_reviews=con.read()\n", 67 | "con.close()" 68 | ], 69 | "execution_count": null, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "euxL-gvSIyLE" 76 | }, 77 | "source": [ 78 | "" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "id": "d_YiSRgnIyLE" 85 | }, 86 | "source": [ 87 | "" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "id": "kjQbVocSIyLF" 94 | }, 95 | "source": [ 96 | "### We can use a simple heuristic\n", 97 | " - Find out which words most commonly appeared before and after each mention of a `product feature`\n", 98 | " - Use a regex pattern to extract this information" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "metadata": { 104 | "id": "2sUkmQxMIyLF" 105 | }, 106 | "source": [ 107 | "" 108 | ], 109 | "execution_count": null, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "mJ-h6iY9IyLF" 116 | }, 117 | "source": [ 118 | "The `battery` was ===> Prefix `keyword` Suffix" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "id": "Ut-xNj9TIyLG" 125 | }, 126 | "source": [ 127 | 
"![image.png](attachment:9b4e9e8f-7d79-4d31-b370-44726e017a96.png)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "9CKpNcqOIyLG" 134 | }, 135 | "source": [ 136 | "" 137 | ], 138 | "execution_count": null, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "NfKD33r3IyLG" 145 | }, 146 | "source": [ 147 | "" 148 | ], 149 | "execution_count": null, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "3YUowmHPIyLH" 156 | }, 157 | "source": [ 158 | "" 159 | ], 160 | "execution_count": null, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "id": "KlSmNzRZIyLI" 167 | }, 168 | "source": [ 169 | "" 170 | ], 171 | "execution_count": null, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "QVMGhLTDIyLI" 178 | }, 179 | "source": [ 180 | "" 181 | ], 182 | "execution_count": null, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "f3aAJ83iIyLI" 189 | }, 190 | "source": [ 191 | "" 192 | ], 193 | "execution_count": null, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "id": "NhNY_nmIIyLI" 200 | }, 201 | "source": [ 202 | "#### Extract all the prefixes and suffixes of `battery`" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "id": "u9dcEALnIyLJ" 209 | }, 210 | "source": [ 211 | "" 212 | ], 213 | "execution_count": null, 214 | "outputs": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "metadata": { 219 | "id": "LXh3QNXBIyLJ" 220 | }, 221 | "source": [ 222 | "" 223 | ], 224 | "execution_count": null, 225 | "outputs": [] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "metadata": { 230 | "id": "o8KOaRWHIyLJ" 231 | }, 232 | "source": [ 233 | "" 234 | ], 235 | "execution_count": null, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "metadata": { 241 | "id": "NnH0ufoPIyLK" 242 | }, 243 | "source": [ 244 | "" 245 | ], 246 | "execution_count": null, 247 | "outputs": [] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "id": "otmFHjwxIyLK" 253 | }, 254 | "source": [ 255 | "" 256 | ], 257 | "execution_count": null, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "7WO2U1VkIyLK" 264 | }, 265 | "source": [ 266 | "" 267 | ], 268 | "execution_count": null, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "id": "K8-eOehUIyLK" 275 | }, 276 | "source": [ 277 | "### This doesn't make much sense as these are commonly used words. 
Let's remove `stopwords` and see what we get\n", 278 | "\n", 279 | "Get Stop Words" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "mfnu7JtWIyLL" 286 | }, 287 | "source": [ 288 | "" 289 | ], 290 | "execution_count": null, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "metadata": { 296 | "id": "jb6x_CBKIyLL" 297 | }, 298 | "source": [ 299 | "" 300 | ], 301 | "execution_count": null, 302 | "outputs": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "metadata": { 307 | "id": "rr7m8JRaIyLL" 308 | }, 309 | "source": [ 310 | "" 311 | ], 312 | "execution_count": null, 313 | "outputs": [] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "metadata": { 318 | "id": "NVemqXK7IyLL" 319 | }, 320 | "source": [ 321 | "" 322 | ], 323 | "execution_count": null, 324 | "outputs": [] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": { 329 | "id": "Nn55qmL3IyLM" 330 | }, 331 | "source": [ 332 | "### Let's pretty-print" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "metadata": { 338 | "id": "-MFkaGYBIyLM" 339 | }, 340 | "source": [ 341 | "" 342 | ], 343 | "execution_count": null, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "1aZAVsDyIyLM" 350 | }, 351 | "source": [ 352 | "" 353 | ], 354 | "execution_count": null, 355 | "outputs": [] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "id": "amgt8bWIIyLM" 361 | }, 362 | "source": [ 363 | "### Let's put all this logic in a function" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "lboxswARIyLN" 370 | }, 371 | "source": [ 372 | "" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "dOHGwpp6IyLN" 381 | }, 382 | "source": [ 383 | "" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "JdqO0FJFIyLN" 392 | }, 393 | "source": [ 394 | "" 395 | ], 396 | "execution_count": null, 397 | "outputs": [] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "id": "IPl2YjseIyLN" 403 | }, 404 | "source": [ 405 | "## Summary:\n", 406 | " - Simple heuristics are sometimes very useful\n", 407 | " - Regexes can be lifesavers\n", 408 | " - Don't forget to use simple text processing while trying to solve a non-trivial problem" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "metadata": { 414 | "id": "0BQ5J8f2IyLO" 415 | }, 416 | "source": [ 417 | "" 418 | ], 419 | "execution_count": null, 420 | "outputs": [] 421 | } 422 | ] 423 | } -------------------------------------------------------------------------------- /Custom NER Code Demonstration/Commented code files/CRF_Commented.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "CRF_Commented.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "FuJZ8hS1Gdp0" 23 | }, 24 | "source": [ 25 | "For this demo, we will use the [MIT Restaurant Corpus](https://groups.csail.mit.edu/sls/downloads/restaurant/) -- a dataset of transcriptions of spoken utterances about restaurants.\n", 26 | 
"\n", 27 | "The dataset has following entity types:\n", 28 | "\n", 29 | "* 'B-Rating'\n", 30 | "* 'I-Rating',\n", 31 | "* 'B-Amenity',\n", 32 | "* 'I-Amenity',\n", 33 | "* 'B-Location',\n", 34 | "* 'I-Location',\n", 35 | "* 'B-Restaurant_Name',\n", 36 | "* 'I-Restaurant_Name',\n", 37 | "* 'B-Price',\n", 38 | "* 'B-Hours',\n", 39 | "* 'I-Hours',\n", 40 | "* 'B-Dish',\n", 41 | "* 'I-Dish',\n", 42 | "* 'B-Cuisine',\n", 43 | "* 'I-Price',\n", 44 | "* 'I-Cuisine'\n", 45 | "\n", 46 | "Let us load the dataset and see what are we working with." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "Q0FynEFzDrvb" 53 | }, 54 | "source": [ 55 | "with open('sent_train', 'r') as train_sent_file:\n", 56 | " train_sentences = train_sent_file.readlines()\n", 57 | "\n", 58 | "with open('label_train', 'r') as train_labels_file:\n", 59 | " train_labels = train_labels_file.readlines()\n", 60 | "\n", 61 | "with open('sent_test', 'r') as test_sent_file:\n", 62 | " test_sentences = test_sent_file.readlines()\n", 63 | "\n", 64 | "with open('label_test', 'r') as test_labels_file:\n", 65 | " test_labels = test_labels_file.readlines()\n" 66 | ], 67 | "execution_count": null, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "id": "5WkhbGrdGffs" 74 | }, 75 | "source": [ 76 | "Let us see some example data points." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "metadata": { 82 | "id": "FVNI3boSFgQ1" 83 | }, 84 | "source": [ 85 | "# Print the 6th sentence in the test set i.e. index value 5.\n", 86 | "\n", 87 | "# Print the labels of this sentence\n" 88 | ], 89 | "execution_count": null, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "dQSnhug230dr" 96 | }, 97 | "source": [ 98 | "#Defining Features for Custom NER" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "jMwc9lv_3mrW" 105 | }, 106 | "source": [ 107 | "First, let us install the required modules." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "metadata": { 113 | "id": "i9KFfDxN3mWu" 114 | }, 115 | "source": [ 116 | "# Install pycrf and crfsuit packages using pip command\n" 117 | ], 118 | "execution_count": null, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": { 124 | "id": "4Hv6kAL9TMpf" 125 | }, 126 | "source": [ 127 | "\n", 128 | "\n", 129 | "We will now start with computing features for our input sequences." 
130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "HqvJ-tAz0MzR" 136 | }, 137 | "source": [ 138 | "We have defined the following features for CRF model building:\n", 139 | "\n", 140 | "- f1 = input word is in lower case; \n", 141 | "- f2 = last 3 characters of word;\n", 142 | "- f3 = last 2 characters of word;\n", 143 | "- f4 = 1; if the word is in uppercase, 0 otherwise;\n", 144 | "- f5 = 1; if word is a number; otherwise, 0 \n", 145 | "- f6 = 1; if the word starts with a capital letter; otherwise, 0\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "3YVm3JTkFiaM" 152 | }, 153 | "source": [ 154 | "#Define a function to get the above defined features for a word.\n" 155 | ], 156 | "execution_count": null, 157 | "outputs": [] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "id": "FAS3xt0u-r89" 163 | }, 164 | "source": [ 165 | "#Computing Features " 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": { 171 | "id": "wQ_ByE1ladpo" 172 | }, 173 | "source": [ 174 | "Define a function to get features for a sentence using the already defined 'getFeaturesForOneWord' function" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "KPrh_4F9RCgb" 181 | }, 182 | "source": [ 183 | "# Define a function to get features for a sentence \n", 184 | "# using the 'getFeaturesForOneWord' function.\n" 185 | ], 186 | "execution_count": null, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "id": "PZwEnrYraqi7" 193 | }, 194 | "source": [ 195 | "Define a function to get the labels for a sentence." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "N7Tt2NncirYD" 202 | }, 203 | "source": [ 204 | "# Define a function to get the labels for a sentence.\n" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "id": "ZHCv5BWRTQb9" 213 | }, 214 | "source": [ 215 | "Example features for a sentence\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "1RzuYcYgSc_x" 222 | }, 223 | "source": [ 224 | "# Apply function 'getFeaturesForOneSentence' to get features on a single sentence which is at index value 5 in train_sentences\n" 225 | ], 226 | "execution_count": null, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "id": "kf6NcY2YdPW2" 233 | }, 234 | "source": [ 235 | "Get the features for the train and test sentences as X_train and X_test, and the corresponding labels as Y_train and Y_test." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "metadata": { 241 | "id": "G0hQZN9TSjMb" 242 | }, 243 | "source": [ 244 | "" 245 | ], 246 | "execution_count": null, 247 | "outputs": [] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "id": "i4upm9ar-iKc" 253 | }, 254 | "source": [ 255 | "#CRF Model Training\n", 256 | "\n", 257 | " Now we have all the information we need to train our CRF. Let us see how we can do that." 
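The "Build the CRF model" cell below is left blank for the learner. A minimal sketch of the training and evaluation steps, assuming the standard `sklearn_crfsuite` API that the next cell imports (the hyperparameter values here are illustrative assumptions, not values from the course):

```python
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Build and train the CRF; 'lbfgs' with L1/L2 regularization is the
# library's standard setup (the c1/c2 values below are assumed).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, Y_train)

# Evaluate with a flat (token-level) weighted F1 score on the test split
Y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(Y_test, Y_pred, average='weighted'))
```

Once fitted, `crf.transition_features_` holds the learned transition weights used by the `print_top_likely_transitions` / `print_top_unlikely_transitions` helpers from `util.py` later in this notebook.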
258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "Kwm-aTb7hftp" 264 | }, 265 | "source": [ 266 | "import sklearn_crfsuite\n", 267 | "\n", 268 | "from sklearn_crfsuite import metrics" 269 | ], 270 | "execution_count": null, 271 | "outputs": [] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "id": "EudVs1-uPHFs" 277 | }, 278 | "source": [ 279 | "We create a CRF object and pass training data to it. The model then \"trains\" and learns the weights for feature functions." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "XuKjdgGwhM_e" 286 | }, 287 | "source": [ 288 | "# Build the CRF model.\n" 289 | ], 290 | "execution_count": null, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": { 296 | "id": "13ziiBZecMUJ" 297 | }, 298 | "source": [ 299 | "#Model Testing and Evaluation \n", 300 | "The model is trained; let us now see how well it performs on the test data." 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "metadata": { 306 | "id": "E75RH6VujmAs" 307 | }, 308 | "source": [ 309 | "# Calculate the f1 score using the test data\n" 310 | ], 311 | "execution_count": null, 312 | "outputs": [] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "metadata": { 317 | "id": "S8sfl1j-k3jw" 318 | }, 319 | "source": [ 320 | "# Print the original labels and predicted labels for the sentence in test data, which is at index value 10.\n" 321 | ], 322 | "execution_count": null, 323 | "outputs": [] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "id": "IxSGnW9bFrxV" 329 | }, 330 | "source": [ 331 | "#Transitions Learned by CRF" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "metadata": { 337 | "id": "AoaKikLDRunC" 338 | }, 339 | "source": [ 340 | "from util import print_top_likely_transitions\n", 341 | "from util import print_top_unlikely_transitions" 342 | ], 343 | "execution_count": null, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "4zU4ZHyUSymU" 350 | }, 351 | "source": [ 352 | "print_top_likely_transitions(crf.transition_features_)" 353 | ], 354 | "execution_count": null, 355 | "outputs": [] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "metadata": { 360 | "id": "0JgRe2HES0t1" 361 | }, 362 | "source": [ 363 | "print_top_unlikely_transitions(crf.transition_features_)" 364 | ], 365 | "execution_count": null, 366 | "outputs": [] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "3X1oXlQ3IMlZ" 372 | }, 373 | "source": [ 374 | "" 375 | ], 376 | "execution_count": null, 377 | "outputs": [] 378 | } 379 | ] 380 | } -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/POS2_ipynb_txt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "POS2.ipynb.txt", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "FXsu8MxUVAKT" 
32 | }, 33 | "source": [ 34 | "## Importing libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "kXr4Cxh2VAKZ" 41 | }, 42 | "source": [ 43 | "import pandas as pd\n", 44 | "import numpy as np\n", 45 | "import os" 46 | ], 47 | "execution_count": null, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "metadata": { 53 | "id": "Eq-7JCT6VAKa" 54 | }, 55 | "source": [ 56 | "import spacy \n", 57 | "nlp = spacy.load(\"en_core_web_sm\")" 58 | ], 59 | "execution_count": null, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "id": "ItxQ587VVAKb" 66 | }, 67 | "source": [ 68 | "### Read reviews data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "metadata": { 74 | "id": "q1pr6bbsVAKb" 75 | }, 76 | "source": [ 77 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n", 78 | "samsung_reviews=con.read()\n", 79 | "con.close()" 80 | ], 81 | "execution_count": null, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "f4GyRJMEVAKb", 88 | "outputId": "29fcc27e-8558-4e67-bec3-b677797f4601" 89 | }, 90 | "source": [ 91 | "len(samsung_reviews.split(\"\\n\"))" 92 | ], 93 | "execution_count": null, 94 | "outputs": [ 95 | { 96 | "output_type": "execute_result", 97 | "data": { 98 | "text/plain": [ 99 | "46355" 100 | ] 101 | }, 102 | "metadata": { 103 | "tags": [] 104 | }, 105 | "execution_count": 4 106 | } 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "fU4ZvurwVAKc" 113 | }, 114 | "source": [ 115 | "### Dataset is a text file where each review is in a new line" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "metadata": { 121 | "id": "bPT-wYC2VAKd", 122 | "outputId": "93ff4281-69f0-4cfc-b686-a64e9cf439ff" 123 | }, 124 | "source": [ 125 | "samsung_reviews.split(\"\\n\")[0:4]" 126 | ], 127 | "execution_count": null, 128 | "outputs": [ 129 | { 130 | "output_type": "execute_result", 131 | "data": { 132 | "text/plain": [ 133 | "[\"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!\",\n", 134 | " 'nice phone, nice up grade from my pantach revue. Very clean set up and easy set up. never had an android phone but they are fantastic to say the least. perfect size for surfing and social media. great phone samsung',\n", 135 | " 'Very pleased',\n", 136 | " 'It works good but it goes slow sometimes but its a very good phone I love it']" 137 | ] 138 | }, 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "execution_count": 5 143 | } 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "EUi5gE7WVAKd" 150 | }, 151 | "source": [ 152 | "### Will our hypothesis hold on real world data? 
`Product features---POS_NOUN`" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "metadata": { 158 | "id": "yUx96iRtVAKg" 159 | }, 160 | "source": [ 161 | "review1=samsung_reviews.split(\"\\n\")[0]\n", 162 | "review1=nlp(review1)" 163 | ], 164 | "execution_count": null, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "id": "EKt6ZNPnVAKg" 171 | }, 172 | "source": [ 173 | "### Let's run the nlp parse on part of one review in our dataset" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "metadata": { 179 | "id": "gOCZxFp_VAKh", 180 | "outputId": "1465e5eb-d533-42dc-e890-c76dfa65d589" 181 | }, 182 | "source": [ 183 | "for tok in review1[0:10]:\n", 184 | " print(tok.text,\"---\",tok.lemma_,\"---\",tok.pos_)" 185 | ], 186 | "execution_count": null, 187 | "outputs": [ 188 | { 189 | "output_type": "stream", 190 | "text": [ 191 | "I --- I --- PRON\n", 192 | "feel --- feel --- VERB\n", 193 | "so --- so --- ADV\n", 194 | "LUCKY --- lucky --- ADJ\n", 195 | "to --- to --- PART\n", 196 | "have --- have --- AUX\n", 197 | "found --- find --- VERB\n", 198 | "this --- this --- DET\n", 199 | "used --- use --- VERB\n", 200 | "( --- ( --- PUNCT\n" 201 | ], 202 | "name": "stdout" 203 | } 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": { 209 | "id": "xOBlgcu0VAKh" 210 | }, 211 | "source": [ 212 | "#### Real-world data is usually messy; observe the words `found` and `used`" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "metadata": { 218 | "id": "yB1t1bPTVAKh" 219 | }, 220 | "source": [ 221 | "pos = []\n", 222 | "lemma = []\n", 223 | "text = []\n", 224 | "for tok in review1:\n", 225 | " pos.append(tok.pos_)\n", 226 | " lemma.append(tok.lemma_)\n", 227 | " text.append(tok.text)" 228 | ], 229 | "execution_count": null, 230 | "outputs": [] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "metadata": { 235 | "id": "G3LDkWepVAKi", 236 | "outputId": "0e2c0671-4157-4ae6-a397-08ef8ad1dfdf" 237 | }, 238 | "source": [ 239 | "nlp_table = pd.DataFrame({'text':text,'lemma':lemma,'pos':pos})\n", 240 | "nlp_table.head()" 241 | ], 242 | "execution_count": null, 243 | "outputs": [ 244 | { 245 | "output_type": "execute_result", 246 | "data": { 247 | "text/html": [ 248 | "
\n", 249 | "\n", 262 | "\n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
textlemmapos
0IIPRON
1feelfeelVERB
2sosoADV
3LUCKYluckyADJ
4totoPART
\n", 304 | "
" 305 | ], 306 | "text/plain": [ 307 | " text lemma pos\n", 308 | "0 I I PRON\n", 309 | "1 feel feel VERB\n", 310 | "2 so so ADV\n", 311 | "3 LUCKY lucky ADJ\n", 312 | "4 to to PART" 313 | ] 314 | }, 315 | "metadata": { 316 | "tags": [] 317 | }, 318 | "execution_count": 9 319 | } 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "FEXKFzlPVAKi", 326 | "outputId": "15fa5020-0bcb-4ba1-8d31-938fc204de91" 327 | }, 328 | "source": [ 329 | "## Get most frequent lemma forms of nouns\n", 330 | "nlp_table[nlp_table['pos']=='NOUN']['lemma'].value_counts()" 331 | ], 332 | "execution_count": null, 333 | "outputs": [ 334 | { 335 | "output_type": "execute_result", 336 | "data": { 337 | "text/plain": [ 338 | "phone 3\n", 339 | "one 2\n", 340 | "year 1\n", 341 | "seller 1\n", 342 | "honesty 1\n", 343 | "upgrade 1\n", 344 | "line 1\n", 345 | "Name: lemma, dtype: int64" 346 | ] 347 | }, 348 | "metadata": { 349 | "tags": [] 350 | }, 351 | "execution_count": 10 352 | } 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "id": "x-uR8q-LVAKi" 359 | }, 360 | "source": [ 361 | "#### It seems possible that if we extract all the nouns from the reviews and look at the top 5 most frequent lemmatised noun forms, we will be able to identify `What people are talking about?`" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "QOvqBuwgVAKj" 368 | }, 369 | "source": [ 370 | "### Lets repeat this experiment on a larger set of reviews" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "metadata": { 376 | "id": "RsDUv3X1VAKj" 377 | }, 378 | "source": [ 379 | "nouns = []\n", 380 | "for review in samsung_reviews.split(\"\\n\")[0:100]:\n", 381 | " doc = nlp(review)\n", 382 | " for tok in doc:\n", 383 | " if tok.pos_==\"NOUN\":\n", 384 | " nouns.append(tok.lemma_.lower())" 385 | ], 386 | "execution_count": null, 387 | "outputs": [] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "id": "lrVi3UPTVAKj" 393 | }, 394 | "source": [ 395 | "### Lets add some way of keeping track of time" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "metadata": { 401 | "id": "zNhp_45EVAKj", 402 | "outputId": "bf7700fd-7e6c-46a2-f8ad-e6282812b14c" 403 | }, 404 | "source": [ 405 | "from tqdm import tqdm\n", 406 | "nouns = []\n", 407 | "for review in tqdm(samsung_reviews.split(\"\\n\")[0:1000]):\n", 408 | " doc = nlp(review)\n", 409 | " for tok in doc:\n", 410 | " if tok.pos_==\"NOUN\":\n", 411 | " nouns.append(tok.lemma_.lower())\n", 412 | "pd.Series(nouns).value_counts().head(5)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [ 416 | { 417 | "output_type": "stream", 418 | "text": [ 419 | "100%|██████████| 1000/1000 [00:17<00:00, 56.70it/s]\n" 420 | ], 421 | "name": "stderr" 422 | }, 423 | { 424 | "output_type": "execute_result", 425 | "data": { 426 | "text/plain": [ 427 | "phone 1208\n", 428 | "battery 92\n", 429 | "time 90\n", 430 | "price 87\n", 431 | "screen 86\n", 432 | "dtype: int64" 433 | ] 434 | }, 435 | "metadata": { 436 | "tags": [] 437 | }, 438 | "execution_count": 12 439 | } 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "metadata": { 445 | "id": "jai2hnfoVAKk", 446 | "outputId": "4d74cbee-10fe-444d-9c8d-f3a1a7f4ff2d" 447 | }, 448 | "source": [ 449 | "len(samsung_reviews.split(\"\\n\"))" 450 | ], 451 | "execution_count": null, 452 | "outputs": [ 453 | { 454 | "output_type": "execute_result", 455 | "data": { 456 | "text/plain": [ 457 | "46355" 458 | ] 459 | }, 460 | "metadata": { 
461 | "tags": [] 462 | }, 463 | "execution_count": 13 464 | } 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": { 470 | "id": "94gZ5gxLVAKk" 471 | }, 472 | "source": [ 473 | "### Did you notice anything? What do you think will be the time taken to process all the reviews?" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "metadata": { 479 | "id": "Op_ftvWyVAKk", 480 | "outputId": "3b9fbbd0-87f3-432b-df9d-0579267a250f" 481 | }, 482 | "source": [ 483 | "(46355//1000)*17" 484 | ], 485 | "execution_count": null, 486 | "outputs": [ 487 | { 488 | "output_type": "execute_result", 489 | "data": { 490 | "text/plain": [ 491 | "782" 492 | ] 493 | }, 494 | "metadata": { 495 | "tags": [] 496 | }, 497 | "execution_count": 14 498 | } 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "metadata": { 504 | "id": "VDMknUOXVAKl", 505 | "outputId": "fe2f2b6f-65d5-45e0-f58e-5cc9408ab0e4" 506 | }, 507 | "source": [ 508 | "782//60" 509 | ], 510 | "execution_count": null, 511 | "outputs": [ 512 | { 513 | "output_type": "execute_result", 514 | "data": { 515 | "text/plain": [ 516 | "13" 517 | ] 518 | }, 519 | "metadata": { 520 | "tags": [] 521 | }, 522 | "execution_count": 15 523 | } 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": { 529 | "id": "1PsJmzrtVAKl" 530 | }, 531 | "source": [ 532 | "## Summary\n", 533 | "- POS tag based rule seems to be working well\n", 534 | "- We need to figure out a way to reduce the time taken to process reviews" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "metadata": { 540 | "id": "rf7AFwbiVAKl" 541 | }, 542 | "source": [ 543 | "" 544 | ], 545 | "execution_count": null, 546 | "outputs": [] 547 | } 548 | ] 549 | } -------------------------------------------------------------------------------- /Dependency Parsing/Code files/Dep_parsing2_ipynb_txt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "Dep_parsing2.ipynb.txt", 24 | "provenance": [], 25 | "collapsed_sections": [] 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "uFMZ8FAqc8GS" 33 | }, 34 | "source": [ 35 | "import spacy\n", 36 | "from spacy import displacy\n", 37 | "import pandas as pd\n", 38 | "nlp = spacy.load(\"en_core_web_sm\")" 39 | ], 40 | "execution_count": null, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "lMizkNfmc8GX" 47 | }, 48 | "source": [ 49 | "active = ['Hens lay eggs.',\n", 50 | " 'Birds build nests.',\n", 51 | " 'The batter hit the ball.',\n", 52 | " 'The computer transmitted a copy of the manual']\n", 53 | "passive = ['Eggs are laid by hens',\n", 54 | " 'Nests are built by birds',\n", 55 | " 'The ball was hit by the batter',\n", 56 | " 'A copy of the manual was transmitted by the computer.']" 57 | ], 58 | "execution_count": null, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "id": "47aHzaYTc8GZ" 65 | }, 66 | "source": [ 67 | "### How do we impliment the rule `if dep nsubjpass, 
then passive else not`?" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "XmTIIuqkc8GZ" 74 | }, 75 | "source": [ 76 | "from spacy.matcher import Matcher" 77 | ], 78 | "execution_count": null, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "id": "Tulu8FF5c8Ga" 85 | }, 86 | "source": [ 87 | "### Read more about it [here](https://spacy.io/api/matcher)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "id": "PdKzGXTbc8Ga", 94 | "outputId": "8046a872-8b3f-494a-afa9-6c29771e2db7" 95 | }, 96 | "source": [ 97 | "doc = nlp(passive[0])\n", 98 | "displacy.render(doc, style=\"dep\")" 99 | ], 100 | "execution_count": null, 101 | "outputs": [ 102 | { 103 | "output_type": "display_data", 104 | "data": { 105 | "text/html": [ 106 | "[displacy dependency-parse SVG stripped during extraction; it rendered: Eggs/NOUN --nsubjpass--> laid, are/AUX --auxpass--> laid, laid/VERB, by/ADP --agent--> laid, hens/NOUN --pobj--> by]" 164 | ], 165 | "text/plain": [ 166 | "" 167 | ] 168 | }, 169 | "metadata": { 170 | "tags": [] 171 | } 172 | } 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "z3NmcC_Tc8Gc" 179 | }, 180 | "source": [ 181 | "### Create a rule with `Matcher`" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "NPB4mEXxc8Gd" 188 | }, 189 | "source": [ 190 | "rule = [{'POS':'NOUN'}]\n", 191 | "matcher = Matcher(nlp.vocab)\n", 192 | "matcher.add('Rule',[rule])" 193 | ], 194 | "execution_count": null, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "id": "t4sdkcioc8Ge", 201 | "outputId": "c784c5c0-6658-4b12-fc84-3a1cdbf2284a" 202 | }, 203 | "source": [ 204 | "matcher(doc)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [ 208 | { 209 | "output_type": "execute_result", 210 | "data": { 211 | "text/plain": [ 212 | "[(15740618714089435985, 0, 1), (15740618714089435985, 4, 5)]" 213 | ] 214 | }, 215 | "metadata": { 216 | "tags": [] 217 | }, 218 | "execution_count": 6 219 | } 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "metadata": { 225 | "id": "as4nsdejc8Gf", 226 | "outputId": "81b85451-9f66-409c-8544-68f0ca89e3dc" 227 | }, 228 | "source": [ 229 | "doc[0:1]" 230 | ], 231 | "execution_count": null, 232 | "outputs": [ 233 | { 234 | "output_type": "execute_result", 235 | "data": { 236 | "text/plain": [ 237 | "Eggs" 238 | ] 239 | }, 240 | "metadata": { 241 | "tags": [] 242 | }, 243 | "execution_count": 7 244 | } 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "metadata": { 250 | "id": "2gemzSC2c8Gf", 251 | "outputId": "f42d85fb-b030-46cb-87d5-b6bc466e9dc6" 252 | }, 253 | "source": [ 254 | "doc[4:5]" 255 | ], 256 | "execution_count": null, 257 | "outputs": [ 258 | { 259 | "output_type": "execute_result", 260 | "data": { 261 | "text/plain": [ 262 | "hens" 263 | ] 264 | }, 265 | "metadata": { 266 | "tags": [] 267 | 
}, 268 | "execution_count": 8 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "id": "RUSAqlZTc8Gg" 276 | }, 277 | "source": [ 278 | "### Create a rule for `passive voice`" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "metadata": { 284 | "id": "jrDq70A3c8Gh" 285 | }, 286 | "source": [ 287 | "passive_rule = [{'DEP':'nsubjpass'}]\n", 288 | "matcher = Matcher(nlp.vocab)\n", 289 | "matcher.add('Rule',[passive_rule])" 290 | ], 291 | "execution_count": null, 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "id": "GA6G7_Jrc8Gh", 298 | "outputId": "1b372f7c-fef5-4fbc-d976-1fac36595023" 299 | }, 300 | "source": [ 301 | "matcher(doc)" 302 | ], 303 | "execution_count": null, 304 | "outputs": [ 305 | { 306 | "output_type": "execute_result", 307 | "data": { 308 | "text/plain": [ 309 | "[(15740618714089435985, 0, 1)]" 310 | ] 311 | }, 312 | "metadata": { 313 | "tags": [] 314 | }, 315 | "execution_count": 10 316 | } 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "id": "Syig5C1nc8Gi" 323 | }, 324 | "source": [ 325 | "### Let's check how this rule works if we use it on a sentence with `active voice`" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "id": "BqxpwGDJc8Gi", 332 | "outputId": "ee17ec87-5b68-4be7-af4a-7630bc78edfe" 333 | }, 334 | "source": [ 335 | "active[0]" 336 | ], 337 | "execution_count": null, 338 | "outputs": [ 339 | { 340 | "output_type": "execute_result", 341 | "data": { 342 | "text/plain": [ 343 | "'Hens lay eggs.'" 344 | ] 345 | }, 346 | "metadata": { 347 | "tags": [] 348 | }, 349 | "execution_count": 11 350 | } 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "metadata": { 356 | "id": "iQ374wu9c8Gj", 357 | "outputId": "48e420e1-2102-4b5f-d18d-be094b4a7766" 358 | }, 359 | "source": [ 360 | "doc = nlp(active[0])\n", 361 | "displacy.render(doc, style=\"dep\")" 362 | ], 363 | "execution_count": null, 364 | "outputs": [ 365 | { 366 | "output_type": "display_data", 367 | "data": { 368 | "text/html": [ 369 | "[displacy dependency-parse SVG stripped during extraction; it rendered: Hens/NOUN --nsubj--> lay, lay/VERB, eggs./NOUN --dobj--> lay]" 401 | ], 402 | "text/plain": [ 403 | "" 404 | ] 405 | }, 406 | "metadata": { 407 | "tags": [] 408 | } 409 | } 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "metadata": { 415 | "id": "tsC4QHYmc8Gj", 416 | "outputId": "ac297b2f-7809-49e7-e669-9bb1f41572ff" 417 | }, 418 | "source": [ 419 | "matcher(doc)" 420 | ], 421 | "execution_count": null, 422 | "outputs": [ 423 | { 424 | "output_type": "execute_result", 425 | "data": { 426 | "text/plain": [ 427 | "[]" 428 | ] 429 | }, 430 | "metadata": { 431 | "tags": [] 432 | }, 433 | "execution_count": 13 434 | } 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": { 440 | "id": "zA197pFCc8Gk" 441 | }, 442 | "source": [ 443 | "### Now let's make a function that implements this logic" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "metadata": { 449 | "id": "W4C80We_c8Gk" 450 | }, 451 | "source": [ 452 | "def is_passive(doc,matcher):\n", 453 | " if len(matcher(doc))>0:\n", 454 | " return True\n", 455 | " else:\n", 456 | " return 
False" 457 | ], 458 | "execution_count": null, 459 | "outputs": [] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": { 464 | "id": "5e1P9nhcc8Gl" 465 | }, 466 | "source": [ 467 | "### Let's test this function on our small sample of sentences and see how the pipeline will work" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "metadata": { 473 | "id": "wPe6aHIMc8Gl", 474 | "outputId": "843da434-aa29-43c0-8705-6f37b65abf7a" 475 | }, 476 | "source": [ 477 | "for sent in active:\n", 478 | " doc = nlp(sent)\n", 479 | " print(is_passive(doc,matcher))" 480 | ], 481 | "execution_count": null, 482 | "outputs": [ 483 | { 484 | "output_type": "stream", 485 | "text": [ 486 | "False\n", 487 | "False\n", 488 | "False\n", 489 | "False\n" 490 | ], 491 | "name": "stdout" 492 | } 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "GkVgfN8Wc8Gm", 499 | "outputId": "a5bc5c2b-9c8c-40a1-9905-1cfe8134abb5" 500 | }, 501 | "source": [ 502 | "for sent in passive:\n", 503 | " doc = nlp(sent)\n", 504 | " print(is_passive(doc,matcher))" 505 | ], 506 | "execution_count": null, 507 | "outputs": [ 508 | { 509 | "output_type": "stream", 510 | "text": [ 511 | "True\n", 512 | "True\n", 513 | "True\n", 514 | "True\n" 515 | ], 516 | "name": "stdout" 517 | } 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": { 523 | "id": "8VjFb4jXc8Gn" 524 | }, 525 | "source": [ 526 | "### Summary\n", 527 | " - One can go a long way by observing patterns in linguistic data, you don't always need to know the details of the linguitsics very well.\n", 528 | " - Once can use the `matcher` object to find if certain linguistic patterns exist in data" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "metadata": { 534 | "id": "VaJunQ_Rc8Gn", 535 | "colab": { 536 | "base_uri": "https://localhost:8080/" 537 | }, 538 | "outputId": "d55f8185-a528-40c1-b700-2c3202d96d01" 539 | }, 540 | "source": [ 541 | "" 542 | ], 543 | "execution_count": null, 544 | "outputs": [ 545 | { 546 | "output_type": "execute_result", 547 | "data": { 548 | "text/plain": [ 549 | "[]" 550 | ] 551 | }, 552 | "metadata": { 553 | "tags": [] 554 | }, 555 | "execution_count": 8 556 | } 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "metadata": { 562 | "id": "rtYp1yRyY1kA" 563 | }, 564 | "source": [ 565 | "" 566 | ], 567 | "execution_count": null, 568 | "outputs": [] 569 | } 570 | ] 571 | } -------------------------------------------------------------------------------- /Custom NER Code Demonstration/Code files/CRF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "CRF.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [ 9 | "13ziiBZecMUJ" 10 | ] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | } 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "FuJZ8hS1Gdp0" 25 | }, 26 | "source": [ 27 | "For this demo, we will use the [MIT Restaurant Corpus](https://groups.csail.mit.edu/sls/downloads/restaurant/) -- a dataset of transcriptions of spoken utterances about restaurants.\n", 28 | "\n", 29 | "The dataset has following entity types:\n", 30 | "\n", 31 | "* 'B-Rating'\n", 32 | "* 'I-Rating',\n", 33 | "* 'B-Amenity',\n", 34 | "* 'I-Amenity',\n", 35 | "* 'B-Location',\n", 36 | "* 'I-Location',\n", 37 | "* 
'B-Restaurant_Name',\n", 38 | "* 'I-Restaurant_Name',\n", 39 | "* 'B-Price',\n", 40 | "* 'B-Hours',\n", 41 | "* 'I-Hours',\n", 42 | "* 'B-Dish',\n", 43 | "* 'I-Dish',\n", 44 | "* 'B-Cuisine',\n", 45 | "* 'I-Price',\n", 46 | "* 'I-Cuisine'\n", 47 | "\n", 48 | "Let us load the dataset and see what we are working with." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "Q0FynEFzDrvb" 55 | }, 56 | "source": [ 57 | "with open('sent_train', 'r') as train_sent_file:\n", 58 | " train_sentences = train_sent_file.readlines()\n", 59 | "\n", 60 | "with open('label_train', 'r') as train_labels_file:\n", 61 | " train_labels = train_labels_file.readlines()\n", 62 | "\n", 63 | "with open('sent_test', 'r') as test_sent_file:\n", 64 | " test_sentences = test_sent_file.readlines()\n", 65 | "\n", 66 | "with open('label_test', 'r') as test_labels_file:\n", 67 | " test_labels = test_labels_file.readlines()\n" 68 | ], 69 | "execution_count": null, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "id": "5WkhbGrdGffs" 76 | }, 77 | "source": [ 78 | "Let us see some example data points." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "metadata": { 84 | "colab": { 85 | "base_uri": "https://localhost:8080/" 86 | }, 87 | "id": "FVNI3boSFgQ1", 88 | "outputId": "ebced656-d436-41d7-f3c0-4db80bb4a621" 89 | }, 90 | "source": [ 91 | "# Print the 6th sentence in the test set, i.e. index value 5.\n", 92 | "print(test_sentences[5])\n", 93 | "\n", 94 | "# Print the labels of this sentence\n", 95 | "print(test_labels[5])" 96 | ], 97 | "execution_count": null, 98 | "outputs": [ 99 | { 100 | "output_type": "stream", 101 | "text": [ 102 | "any good ice cream parlors around \n", 103 | "\n", 104 | "O B-Rating B-Cuisine I-Cuisine I-Cuisine B-Location \n", 105 | "\n" 106 | ], 107 | "name": "stdout" 108 | } 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "id": "dQSnhug230dr" 115 | }, 116 | "source": [ 117 | "#Defining Features for Custom NER" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "id": "jMwc9lv_3mrW" 124 | }, 125 | "source": [ 126 | "First, let us install the required modules." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "i9KFfDxN3mWu" 133 | }, 134 | "source": [ 135 | "# Install the pycrf and sklearn-crfsuite packages using pip\n", 136 | "!pip install pycrf\n", 137 | "!pip install sklearn-crfsuite" 138 | ], 139 | "execution_count": null, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "4Hv6kAL9TMpf" 146 | }, 147 | "source": [ 148 | "\n", 149 | "\n", 150 | "We will now start with computing features for our input sequences."
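A quick aside on the input format: sklearn-crfsuite (via python-crfsuite) accepts per-token features either as a dict or, as this notebook does, a list of 'name=value' strings, with one list per token and one list of lists per sentence. A minimal sketch; the toy tokens, features and tags below are illustrative, not taken from the corpus files:

    # Sketch only: hypothetical toy example of the feature/label layout.
    toy_tokens = ["any", "good", "ice", "cream"]
    toy_features = [
        ["word.lower=any", "word[-3:]=any", "BEG"],                           # sentence-start marker
        ["word.lower=good", "word[-3:]=ood", "prev_word.lower=any"],
        ["word.lower=ice", "word[-3:]=ice", "prev_word.lower=good"],
        ["word.lower=cream", "word[-3:]=eam", "prev_word.lower=ice", "END"],  # sentence-end marker
    ]
    toy_labels = ["O", "B-Rating", "B-Cuisine", "I-Cuisine"]                  # one tag per token
    assert len(toy_features) == len(toy_labels)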
151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "HqvJ-tAz0MzR" 157 | }, 158 | "source": [ 159 | "We have defined the following features for CRF model building:\n", 160 | "\n", 161 | "- f1 = the input word in lowercase; \n", 162 | "- f2 = last 3 characters of the word;\n", 163 | "- f3 = last 2 characters of the word;\n", 164 | "- f4 = 1 if the word is in uppercase, 0 otherwise;\n", 165 | "- f5 = 1 if the word is a number, 0 otherwise; \n", 166 | "- f6 = 1 if the word starts with a capital letter, 0 otherwise\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "metadata": { 172 | "id": "3YVm3JTkFiaM" 173 | }, 174 | "source": [ 175 | "#Define a function to get the above defined features for a word.\n", 176 | "\n", 177 | "def getFeaturesForOneWord(sentence, pos):\n", 178 | " word = sentence[pos]\n", 179 | "\n", 180 | " features = [\n", 181 | " 'word.lower=' + word.lower(), # serves as word id\n", 182 | " 'word[-3:]=' + word[-3:], # last three characters\n", 183 | " 'word[-2:]=' + word[-2:], # last two characters\n", 184 | " 'word.isupper=%s' % word.isupper(), # is the word in all uppercase\n", 185 | " 'word.isdigit=%s' % word.isdigit(), # is the word a number\n", 186 | " 'words.startsWithCapital=%s' % word[0].isupper() # is the word starting with a capital letter\n", 187 | " ]\n", 188 | "\n", 189 | " if(pos > 0):\n", 190 | " prev_word = sentence[pos-1]\n", 191 | " features.extend([\n", 192 | " 'prev_word.lower=' + prev_word.lower(), \n", 193 | " 'prev_word.isupper=%s' % prev_word.isupper(),\n", 194 | " 'prev_word.isdigit=%s' % prev_word.isdigit(),\n", 195 | " 'prev_words.startsWithCapital=%s' % prev_word[0].isupper()\n", 196 | " ])\n", 197 | " else:\n", 198 | " features.append('BEG') # feature to track begin of sentence \n", 199 | "\n", 200 | " if(pos == len(sentence)-1):\n", 201 | " features.append('END') # feature to track end of sentence\n", 202 | "\n", 203 | " return features" 204 | ], 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "FAS3xt0u-r89" 212 | }, 213 | "source": [ 214 | "#Computing Features " 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "id": "wQ_ByE1ladpo" 221 | }, 222 | "source": [ 223 | "Define a function to get features for a sentence using the already defined 'getFeaturesForOneWord' function" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "KPrh_4F9RCgb" 230 | }, 231 | "source": [ 232 | "# Define a function to get features for a sentence \n", 233 | "# using the 'getFeaturesForOneWord' function.\n", 234 | "def getFeaturesForOneSentence(sentence):\n", 235 | " sentence_list = sentence.split()\n", 236 | " return [getFeaturesForOneWord(sentence_list, pos) for pos in range(len(sentence_list))]" 237 | ], 238 | "execution_count": null, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "id": "PZwEnrYraqi7" 245 | }, 246 | "source": [ 247 | "Define a function to get the labels for a sentence."
248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "metadata": { 253 | "id": "N7Tt2NncirYD" 254 | }, 255 | "source": [ 256 | "# Define a function to get the labels for a sentence.\n", 257 | "def getLabelsInListForOneSentence(labels):\n", 258 | " return labels.split()" 259 | ], 260 | "execution_count": null, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "id": "ZHCv5BWRTQb9" 267 | }, 268 | "source": [ 269 | "Example features for a sentence\n" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "metadata": { 275 | "colab": { 276 | "base_uri": "https://localhost:8080/" 277 | }, 278 | "id": "1RzuYcYgSc_x", 279 | "outputId": "a12da80f-0012-455a-a7ff-41431136d121" 280 | }, 281 | "source": [ 282 | "# Apply the function 'getFeaturesForOneSentence' to get features for the single sentence at index value 5 in train_sentences\n", 283 | "example_sentence = train_sentences[5]\n", 284 | "print(example_sentence)\n", 285 | "\n", 286 | "features = getFeaturesForOneSentence(example_sentence)\n", 287 | "features[2]" 288 | ], 289 | "execution_count": null, 290 | "outputs": [ 291 | { 292 | "output_type": "stream", 293 | "text": [ 294 | "a place that serves soft serve ice cream \n", 295 | "\n" 296 | ], 297 | "name": "stdout" 298 | }, 299 | { 300 | "output_type": "execute_result", 301 | "data": { 302 | "text/plain": [ 303 | "['word.lower=that',\n", 304 | " 'word[-3:]=hat',\n", 305 | " 'word[-2:]=at',\n", 306 | " 'word.isupper=False',\n", 307 | " 'word.isdigit=False',\n", 308 | " 'words.startsWithCapital=False',\n", 309 | " 'prev_word.lower=place',\n", 310 | " 'prev_word.isupper=False',\n", 311 | " 'prev_word.isdigit=False',\n", 312 | " 'prev_words.startsWithCapital=False']" 313 | ] 314 | }, 315 | "metadata": { 316 | "tags": [] 317 | }, 318 | "execution_count": 45 319 | } 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "id": "kf6NcY2YdPW2" 326 | }, 327 | "source": [ 328 | "Get the features for the sentences of X_train and X_test, and the labels for the Y_train and Y_test data." 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "metadata": { 334 | "id": "G0hQZN9TSjMb" 335 | }, 336 | "source": [ 337 | "X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]\n", 338 | "Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]\n", 339 | "\n", 340 | "X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]\n", 341 | "Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]" 342 | ], 343 | "execution_count": null, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": { 349 | "id": "i4upm9ar-iKc" 350 | }, 351 | "source": [ 352 | "#CRF Model Training\n", 353 | "\n", 354 | " Now we have all the information we need to train our CRF. Let us see how we can do that." 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "metadata": { 360 | "id": "Kwm-aTb7hftp" 361 | }, 362 | "source": [ 363 | "import sklearn_crfsuite\n", 364 | "\n", 365 | "from sklearn_crfsuite import metrics" 366 | ], 367 | "execution_count": null, 368 | "outputs": [] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "id": "EudVs1-uPHFs" 374 | }, 375 | "source": [ 376 | "We create a CRF object and pass the training data to it. The model then \"trains\" and learns the weights for the feature functions."
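The cell below constructs the CRF with only max_iterations set. A commonly used variant with sklearn-crfsuite adds explicit L1/L2 regularization; these keyword arguments do exist in the library, but the values here are illustrative, not tuned on this dataset:

    import sklearn_crfsuite

    # Sketch of a more explicit configuration (illustrative values).
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',              # gradient-based trainer
        c1=0.1,                         # L1 regularization strength (assumed, not tuned)
        c2=0.1,                         # L2 regularization strength (assumed, not tuned)
        max_iterations=100,
        all_possible_transitions=True   # also learn weights for transitions unseen in training
    )
    crf.fit(X_train, Y_train)           # same fit call as in the notebook, reusing its X_train/Y_train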
377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "metadata": { 382 | "colab": { 383 | "base_uri": "https://localhost:8080/" 384 | }, 385 | "id": "XuKjdgGwhM_e", 386 | "outputId": "a907c690-b90a-4105-f0d6-cde19e07dbd4" 387 | }, 388 | "source": [ 389 | "# Build the CRF model.\n", 390 | "crf = sklearn_crfsuite.CRF(max_iterations=100)\n", 391 | "crf.fit(X_train, Y_train)" 392 | ], 393 | "execution_count": null, 394 | "outputs": [ 395 | { 396 | "output_type": "stream", 397 | "text": [ 398 | "/usr/local/lib/python3.7/dist-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.\n", 399 | " FutureWarning)\n" 400 | ], 401 | "name": "stderr" 402 | }, 403 | { 404 | "output_type": "execute_result", 405 | "data": { 406 | "text/plain": [ 407 | "CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,\n", 408 | " averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,\n", 409 | " calibration_eta=None, calibration_max_trials=None, calibration_rate=None,\n", 410 | " calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,\n", 411 | " gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,\n", 412 | " max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,\n", 413 | " pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)" 414 | ] 415 | }, 416 | "metadata": { 417 | "tags": [] 418 | }, 419 | "execution_count": 54 420 | } 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": { 426 | "id": "13ziiBZecMUJ" 427 | }, 428 | "source": [ 429 | "#Model Testing and Evaluation \n", 430 | "The model is trained; let us now see how well it performs on the test data."
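Note that the two NameError tracebacks below come from a fresh runtime in which the loading and training cells had not been re-executed; the code itself is fine once `crf`, `X_test`, and `test_sentences` are defined. For reference, a sketch of the evaluation flow, with an optional per-label report (flat_classification_report is part of sklearn_crfsuite.metrics; its output is not reproduced in this notebook):

    from sklearn_crfsuite import metrics

    Y_pred = crf.predict(X_test)                                      # one label list per test sentence
    print(metrics.flat_f1_score(Y_test, Y_pred, average='weighted'))  # weighted F1 over all tokens
    print(metrics.flat_classification_report(Y_test, Y_pred))        # optional per-label breakdown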
431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "metadata": { 436 | "colab": { 437 | "base_uri": "https://localhost:8080/", 438 | "height": 198 439 | }, 440 | "id": "E75RH6VujmAs", 441 | "outputId": "78440723-54f5-4fb9-8765-6975ac66c1d8" 442 | }, 443 | "source": [ 444 | "# Calculate the f1 score using the test data\n", 445 | "Y_pred = crf.predict(X_test)\n", 446 | "metrics.flat_f1_score(Y_test, Y_pred, average='weighted')" 447 | ], 448 | "execution_count": null, 449 | "outputs": [ 450 | { 451 | "output_type": "error", 452 | "ename": "NameError", 453 | "evalue": "ignored", 454 | "traceback": [ 455 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 456 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 457 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate the f1 score using the test data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mY_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcrf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflat_f1_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maverage\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'weighted'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 458 | "\u001b[0;31mNameError\u001b[0m: name 'crf' is not defined" 459 | ] 460 | } 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "metadata": { 466 | "colab": { 467 | "base_uri": "https://localhost:8080/", 468 | "height": 232 469 | }, 470 | "id": "S8sfl1j-k3jw", 471 | "outputId": "7ee06369-808f-425f-9456-03f145b4058b" 472 | }, 473 | "source": [ 474 | "# Print the orginal labels and predicted labels for the sentence in test data, which is at index value 10.\n", 475 | "id = 10\n", 476 | "print(\"Sentence:\",test_sentences[id])\n", 477 | "print(\"Orig Labels:\", Y_test[id])\n", 478 | "print(\"Pred Labels:\", Y_pred[id])" 479 | ], 480 | "execution_count": null, 481 | "outputs": [ 482 | { 483 | "output_type": "error", 484 | "ename": "NameError", 485 | "evalue": "ignored", 486 | "traceback": [ 487 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 488 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 489 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Print the orginal labels and predicted labels for the sentence in test data, which is at index value 10.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Sentence:\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mtest_sentences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Orig Labels:\"\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mY_test\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pred Labels:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY_pred\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 490 | "\u001b[0;31mNameError\u001b[0m: name 'test_sentences' is not defined" 491 | ] 492 | } 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": { 498 | "id": "IxSGnW9bFrxV" 499 | }, 500 | "source": [ 501 | "#Transitions Learned by CRF" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "metadata": { 507 | "id": "AoaKikLDRunC" 508 | }, 509 | "source": [ 510 | "from util import print_top_likely_transitions\n", 511 | "from util import print_top_unlikely_transitions" 512 | ], 513 | "execution_count": null, 514 | "outputs": [] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "metadata": { 519 | "colab": { 520 | "base_uri": "https://localhost:8080/" 521 | }, 522 | "id": "4zU4ZHyUSymU", 523 | "outputId": "8c5b4dde-3a8d-4f66-97ba-b3e17b482b01" 524 | }, 525 | "source": [ 526 | "print_top_likely_transitions(crf.transition_features_)" 527 | ], 528 | "execution_count": null, 529 | "outputs": [ 530 | { 531 | "output_type": "stream", 532 | "text": [ 533 | "B-Restaurant_Name -> I-Restaurant_Name 6.780861\n", 534 | "B-Location -> I-Location 6.708175\n", 535 | "B-Amenity -> I-Amenity 6.684825\n", 536 | "I-Location -> I-Location 6.449733\n", 537 | "I-Amenity -> I-Amenity 6.185293\n", 538 | "B-Dish -> I-Dish 5.921328\n", 539 | "B-Hours -> I-Hours 5.885383\n", 540 | "I-Restaurant_Name -> I-Restaurant_Name 5.861961\n", 541 | "B-Cuisine -> I-Cuisine 5.559337\n", 542 | "I-Hours -> I-Hours 5.434019\n" 543 | ], 544 | "name": "stdout" 545 | } 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "metadata": { 551 | "colab": { 552 | "base_uri": "https://localhost:8080/" 553 | }, 554 | "id": "0JgRe2HES0t1", 555 | "outputId": "4c0e469c-4100-40b5-80ec-5f74d8e5180b" 556 | }, 557 | "source": [ 558 | "print_top_unlikely_transitions(crf.transition_features_)" 559 | ], 560 | "execution_count": null, 561 | "outputs": [ 562 | { 563 | "output_type": "stream", 564 | "text": [ 565 | "B-Price -> B-Location -0.642237\n", 566 | "I-Location -> B-Dish -0.727662\n", 567 | "I-Dish -> B-Cuisine -0.827585\n", 568 | "I-Price -> B-Location -0.886004\n", 569 | "I-Hours -> O -0.889121\n", 570 | "B-Restaurant_Name -> B-Cuisine -0.943158\n", 571 | "I-Rating -> O -0.949635\n", 572 | "I-Price -> O -0.951517\n", 573 | "I-Restaurant_Name -> B-Dish -1.097983\n", 574 | "I-Restaurant_Name -> B-Cuisine -1.127921\n" 575 | ], 576 | "name": "stdout" 577 | } 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "metadata": { 583 | "id": "3X1oXlQ3IMlZ" 584 | }, 585 | "source": [ 586 | "" 587 | ], 588 | "execution_count": null, 589 | "outputs": [] 590 | } 591 | ] 592 | } -------------------------------------------------------------------------------- /POS Tagging Case Study/Code files/POS4_ipynb_txt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "Y5Sb0P3lv2wM" 7 | }, 8 | "source": [ 9 | "## Importing libraries" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "DTUdxNDnv2wb" 17 | }, 18 | 
"outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "from tqdm import tqdm" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "id": "FmY5TNDFv2wd" 29 | }, 30 | "source": [ 31 | "### Read reviews data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "id": "c8dObNbev2we" 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n", 43 | "samsung_reviews=con.read()\n", 44 | "con.close()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "JjjtrCbJv2wf" 51 | }, 52 | "source": [ 53 | "" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "id": "FOQhbImWv2wf" 60 | }, 61 | "source": [ 62 | "" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "id": "o8NCJ4P0v2wg" 69 | }, 70 | "source": [ 71 | "### We can use a simple hueristic\n", 72 | " - Find out what were the most common words that appeared before and after each mention of `product feature`\n", 73 | " - Use regex pattern to extract this information" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "id": "KRhx9ESqv2wi" 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "import re\n", 85 | "s1 = \"The battery was great.\"" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "id": "E-uvyEcpv2wj" 92 | }, 93 | "source": [ 94 | "The `battery` was ===> Prefix `keyword` Suffix" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "id": "Pc7aMQcfv2wk" 101 | }, 102 | "source": [ 103 | "" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "id": "OHiP9iUIv2wl" 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "pattern = re.compile(\"\\w+\\sbattery\\s\\w+\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "id": "gc27nKndv2wl", 122 | "outputId": "b3d55d7c-66b2-4b5b-9679-8355deb9a956" 123 | }, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "['The battery was']" 129 | ] 130 | }, 131 | "execution_count": 5, 132 | "metadata": { 133 | "tags": [] 134 | }, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "re.findall(pattern,s1)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "id": "mNpBQN76v2wo", 147 | "outputId": "c545b9ab-0f62-458b-ebe1-b6283f757674" 148 | }, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "'The battery was'" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": { 158 | "tags": [] 159 | }, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "re.findall(pattern,s1)[0]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "id": "ykY2vGz8v2wp", 172 | "outputId": "12c309cc-97e0-421e-978e-bdf01ff19926" 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "['The', 'battery', 'was']" 179 | ] 180 | }, 181 | "execution_count": 7, 182 | "metadata": { 183 | "tags": [] 184 | }, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "re.findall(pattern,s1)[0].split(\" \")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "id": "QCIPRHiYv2wq", 197 | 
"outputId": "41ca811d-5796-4790-ca4e-0c42148c063c" 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "'The'" 204 | ] 205 | }, 206 | "execution_count": 8, 207 | "metadata": { 208 | "tags": [] 209 | }, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "re.findall(pattern,s1)[0].split(\" \")[0] ## prefix" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "id": "N9uMLlr2v2wq", 222 | "outputId": "139a0e26-06f2-416a-9311-3186cbd05465" 223 | }, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "'was'" 229 | ] 230 | }, 231 | "execution_count": 9, 232 | "metadata": { 233 | "tags": [] 234 | }, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "re.findall(pattern,s1)[0].split(\" \")[-1] ## suffix" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "id": "xbWMeBh-v2wr" 246 | }, 247 | "source": [ 248 | "#### Extract all the prefixes and suffixes of `battery`" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": { 255 | "id": "IOpdwVCsv2ws" 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "prefixes_suffixes = re.findall(pattern,samsung_reviews)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "id": "L8NnHP4-v2wt", 267 | "outputId": "c91263bd-2c45-4aa7-97bf-afe3fa4175d4" 268 | }, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "['that battery life',\n", 274 | " 'The battery was',\n", 275 | " 'great battery life',\n", 276 | " 'removable battery or',\n", 277 | " 'the battery in',\n", 278 | " 'The battery was',\n", 279 | " 'the battery is',\n", 280 | " 'Excellent battery life',\n", 281 | " 'the battery off',\n", 282 | " 'the battery goes']" 283 | ] 284 | }, 285 | "execution_count": 11, 286 | "metadata": { 287 | "tags": [] 288 | }, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "prefixes_suffixes[0:10]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "id": "N0Qt0pAdv2wt", 301 | "outputId": "355df60d-f733-403b-a5c3-7ca3bd69dd35" 302 | }, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "['that', 'battery', 'life']" 308 | ] 309 | }, 310 | "execution_count": 12, 311 | "metadata": { 312 | "tags": [] 313 | }, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "prefixes_suffixes[0].split(\" \")" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "id": "PYmp4Sobv2wu" 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "prefixes = []\n", 330 | "suffixes = []\n", 331 | "for p in prefixes_suffixes:\n", 332 | " l = p.split(\" \")\n", 333 | " prefixes.append(l[0].lower())\n", 334 | " suffixes.append(l[-1].lower())" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "id": "aQaV7j8Iv2wv", 342 | "outputId": "e6456148-fd0b-4d19-b987-ef4a1850ab18" 343 | }, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "the 1396\n", 349 | "good 122\n", 350 | "great 90\n", 351 | "and 82\n", 352 | "long 60\n", 353 | "dtype: int64" 354 | ] 355 | }, 356 | "execution_count": 14, 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | 
"pd.Series(prefixes).value_counts().head(5)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "id": "BsR3F19Fv2ww", 372 | "outputId": "ed85be8e-e695-4018-f4ad-87057b18aeae" 373 | }, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "life 1052\n", 379 | "is 210\n", 380 | "and 146\n", 381 | "lasts 83\n", 382 | "was 66\n", 383 | "dtype: int64" 384 | ] 385 | }, 386 | "execution_count": 15, 387 | "metadata": { 388 | "tags": [] 389 | }, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "pd.Series(suffixes).value_counts().head(5)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": { 400 | "id": "_4h1NcO6v2wx" 401 | }, 402 | "source": [ 403 | "### This doesn't make much sense as these are commonly used words. Let's remove `stopwords` and see what we get\n", 404 | "\n", 405 | "Get Stop Words" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": { 412 | "id": "JQJUiZN1v2wx" 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "stop_words = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \"because\", \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\", \"between\", \"into\", \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "id": "zI9upGvgv2wz" 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "prefixes = [p for p in prefixes if p not in stop_words]\n", 428 | "suffixes = [s for s in suffixes if s not in stop_words]" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "id": "6xRoSdX9v2wz", 436 | "outputId": "ccca747b-20d7-481f-c9de-9fafca5d737f" 437 | }, 438 | "outputs": [ 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "good 122\n", 443 | "great 90\n", 444 | "long 60\n", 445 | "new 51\n", 446 | "removable 48\n", 447 | "dtype: int64" 448 | ] 449 | }, 450 | "execution_count": 18, 451 | "metadata": { 452 | "tags": [] 453 | }, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "pd.Series(prefixes).value_counts().head(5)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "id": "9tjoL9ZZv2w0", 466 | "outputId": "aba8adf9-fb2c-4d7d-95a4-f31e9d3470d9" 467 | }, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": 
[ 472 | "life 1052\n", 473 | "lasts 83\n", 474 | "last 53\n", 475 | "doesn 40\n", 476 | "runs 31\n", 477 | "dtype: int64" 478 | ] 479 | }, 480 | "execution_count": 19, 481 | "metadata": { 482 | "tags": [] 483 | }, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "pd.Series(suffixes).value_counts().head(5)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "id": "kwnOFruxv2w1" 495 | }, 496 | "source": [ 497 | "### Let's pretty print" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": { 504 | "id": "ksKd61Fyv2w1" 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "prefixes=pd.Series(prefixes).value_counts().head(5).index\n", 509 | "suffixes=pd.Series(suffixes).value_counts().head(5).index" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": { 516 | "id": "Coc3GtGqv2w2", 517 | "outputId": "3d1bed33-480e-40b9-e2c5-f7a34a49c16c" 518 | }, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/html": [ 523 | "
\n", 524 | "\n", 537 | "\n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | "
prefixeskeywordsuffixes
0goodbatterylife
1greatbatterylasts
2longbatterylast
3newbatterydoesn
4removablebatteryruns
\n", 579 | "
" 580 | ], 581 | "text/plain": [ 582 | " prefixes keyword suffixes\n", 583 | "0 good battery life\n", 584 | "1 great battery lasts\n", 585 | "2 long battery last\n", 586 | "3 new battery doesn\n", 587 | "4 removable battery runs" 588 | ] 589 | }, 590 | "execution_count": 21, 591 | "metadata": { 592 | "tags": [] 593 | }, 594 | "output_type": "execute_result" 595 | } 596 | ], 597 | "source": [ 598 | "pd.DataFrame({'prefixes':prefixes,'keyword':['battery']*len(prefixes),'suffixes':suffixes})" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": { 604 | "id": "6y-7MjMUv2w3" 605 | }, 606 | "source": [ 607 | "### Lets put all this logic in a function" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": { 614 | "id": "zxi86ZU-v2w3" 615 | }, 616 | "outputs": [], 617 | "source": [ 618 | "def get_context(reviews,keyword):\n", 619 | " pattern = re.compile(f\"\\w+\\s{keyword}\\s\\w+\")\n", 620 | " prefixes_suffixes = re.findall(pattern,reviews)\n", 621 | " prefixes = []\n", 622 | " suffixes = []\n", 623 | " for p in prefixes_suffixes:\n", 624 | " l = p.split(\" \")\n", 625 | " prefixes.append(l[0].lower())\n", 626 | " suffixes.append(l[-1].lower())\n", 627 | " prefixes = [p for p in prefixes if p not in stop_words]\n", 628 | " suffixes = [s for s in suffixes if s not in stop_words]\n", 629 | " prefixes=pd.Series(prefixes).value_counts().head(5).index\n", 630 | " suffixes=pd.Series(suffixes).value_counts().head(5).index\n", 631 | " return pd.DataFrame({'prefixes':prefixes,'keyword':[f'{keyword}']*len(prefixes),'suffixes':suffixes})" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": { 638 | "id": "HGfr7wpCv2w4", 639 | "outputId": "1543466e-d0ef-4ac2-a189-57c447fceb39" 640 | }, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "text/html": [ 645 | "
\n", 646 | "\n", 659 | "\n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | "
prefixeskeywordsuffixes
0goodbatterylife
1greatbatterylasts
2longbatterylast
3newbatterydoesn
4removablebatteryruns
\n", 701 | "
" 702 | ], 703 | "text/plain": [ 704 | " prefixes keyword suffixes\n", 705 | "0 good battery life\n", 706 | "1 great battery lasts\n", 707 | "2 long battery last\n", 708 | "3 new battery doesn\n", 709 | "4 removable battery runs" 710 | ] 711 | }, 712 | "execution_count": 23, 713 | "metadata": { 714 | "tags": [] 715 | }, 716 | "output_type": "execute_result" 717 | } 718 | ], 719 | "source": [ 720 | "get_context(samsung_reviews,\"battery\")" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "metadata": { 727 | "id": "7hM31b61v2w5", 728 | "outputId": "f14fce27-d6d0-43f0-b8fd-7b688381599c" 729 | }, 730 | "outputs": [ 731 | { 732 | "data": { 733 | "text/html": [ 734 | "
\n", 735 | "\n", 748 | "\n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | "
prefixeskeywordsuffixes
0touchscreenprotector
1bigscreensize
2greatscreenresolution
3largescreenprotectors
4homescreenquality
\n", 790 | "
" 791 | ], 792 | "text/plain": [ 793 | " prefixes keyword suffixes\n", 794 | "0 touch screen protector\n", 795 | "1 big screen size\n", 796 | "2 great screen resolution\n", 797 | "3 large screen protectors\n", 798 | "4 home screen quality" 799 | ] 800 | }, 801 | "execution_count": 24, 802 | "metadata": { 803 | "tags": [] 804 | }, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "get_context(samsung_reviews,\"screen\")" 810 | ] 811 | }, 812 | { 813 | "cell_type": "markdown", 814 | "metadata": { 815 | "id": "dLWZsuxjv2w6" 816 | }, 817 | "source": [ 818 | "## Summary:\n", 819 | " - Simple hueristics sometime are very usefull\n", 820 | " - Regex can be life saviours\n", 821 | " - Don't forget to use simple text processing while trying to solve a non-trival problem" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": { 828 | "id": "2G4VXSNov2w7" 829 | }, 830 | "outputs": [], 831 | "source": [] 832 | } 833 | ], 834 | "metadata": { 835 | "colab": { 836 | "name": "POS4.ipynb.txt", 837 | "provenance": [] 838 | }, 839 | "kernelspec": { 840 | "display_name": "Python 3", 841 | "language": "python", 842 | "name": "python3" 843 | }, 844 | "language_info": { 845 | "codemirror_mode": { 846 | "name": "ipython", 847 | "version": 3 848 | }, 849 | "file_extension": ".py", 850 | "mimetype": "text/x-python", 851 | "name": "python", 852 | "nbconvert_exporter": "python", 853 | "pygments_lexer": "ipython3", 854 | "version": "3.8.5" 855 | } 856 | }, 857 | "nbformat": 4, 858 | "nbformat_minor": 1 859 | } 860 | -------------------------------------------------------------------------------- /Dependency Parsing/Code files/Dep_parsing3_ipynb_txt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "Dep_parsing3.ipynb.txt", 24 | "provenance": [] 25 | } 26 | }, 27 | "cells": [ 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "mrzv9UJ-tArb" 32 | }, 33 | "source": [ 34 | "import spacy\n", 35 | "from spacy import displacy\n", 36 | "from spacy.matcher import Matcher\n", 37 | "import pandas as pd\n", 38 | "nlp = spacy.load(\"en_core_web_sm\")" 39 | ], 40 | "execution_count": null, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "i5gre9OYtAre" 47 | }, 48 | "source": [ 49 | "### Lets check our rule on a larger corpus" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/", 57 | "height": 111 58 | }, 59 | "id": "JXDFu7f3tArf", 60 | "outputId": "3df31ca1-646a-421e-e3e8-74569215ed3d" 61 | }, 62 | "source": [ 63 | "active_passive = pd.read_csv('active_passive.csv')\n", 64 | "active_passive.head(2)" 65 | ], 66 | "execution_count": null, 67 | "outputs": [ 68 | { 69 | "output_type": "execute_result", 70 | "data": { 71 | "text/html": [ 72 | "
\n", 73 | "\n", 86 | "\n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
ActivePassive
0He reads a novel.A novel is read.
1He does not cook food.Food is not cooked by him.
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " Active Passive\n", 111 | "0 He reads a novel. A novel is read.\n", 112 | "1 He does not cook food. Food is not cooked by him." 113 | ] 114 | }, 115 | "metadata": { 116 | "tags": [] 117 | }, 118 | "execution_count": 5 119 | } 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "colab": { 126 | "base_uri": "https://localhost:8080/" 127 | }, 128 | "id": "9VRU3oostArg", 129 | "outputId": "f44119f5-8e2e-45db-baf7-5722bac45840" 130 | }, 131 | "source": [ 132 | "active_passive.shape" 133 | ], 134 | "execution_count": null, 135 | "outputs": [ 136 | { 137 | "output_type": "execute_result", 138 | "data": { 139 | "text/plain": [ 140 | "(40, 2)" 141 | ] 142 | }, 143 | "metadata": { 144 | "tags": [] 145 | }, 146 | "execution_count": 6 147 | } 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "9yzs-odUtArg" 154 | }, 155 | "source": [ 156 | "active = active_passive['Active']\n", 157 | "passive = active_passive['Passive']" 158 | ], 159 | "execution_count": null, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "id": "qi53fEfutArg" 166 | }, 167 | "source": [ 168 | "### Create the rule" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "E5mJnYPotArh" 175 | }, 176 | "source": [ 177 | "passive_rule = [{'DEP':'nsubjpass'}]\n", 178 | "matcher = Matcher(nlp.vocab)\n", 179 | "matcher.add('Rule',[passive_rule])" 180 | ], 181 | "execution_count": null, 182 | "outputs": [] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "pmzbRTwgtArh" 188 | }, 189 | "source": [ 190 | "def is_passive(doc,matcher):\n", 191 | " if len(matcher(doc))>0:\n", 192 | " return True\n", 193 | " else:\n", 194 | " return False" 195 | ], 196 | "execution_count": null, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "uWDFJWzZtArh" 203 | }, 204 | "source": [ 205 | "### Check rule on active voice sentences" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "colab": { 212 | "base_uri": "https://localhost:8080/" 213 | }, 214 | "id": "iVtObi5ItAri", 215 | "outputId": "81bb9877-d256-4622-a733-e1905eaa92d8" 216 | }, 217 | "source": [ 218 | "cnt = 0\n", 219 | "for sent in active:\n", 220 | " doc = nlp(sent)\n", 221 | " if not is_passive(doc,matcher):\n", 222 | " cnt += 1\n", 223 | "print(cnt)" 224 | ], 225 | "execution_count": null, 226 | "outputs": [ 227 | { 228 | "output_type": "stream", 229 | "text": [ 230 | "40\n" 231 | ], 232 | "name": "stdout" 233 | } 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "id": "lDLw_valtAri" 240 | }, 241 | "source": [ 242 | "### Check rule on passive voice sentences" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "metadata": { 248 | "colab": { 249 | "base_uri": "https://localhost:8080/" 250 | }, 251 | "id": "fr7aJK7RtAri", 252 | "outputId": "ddb984bf-e29a-42bb-9053-a5ea02fe9cdc" 253 | }, 254 | "source": [ 255 | "cnt = 0\n", 256 | "for sent in passive:\n", 257 | " doc = nlp(sent)\n", 258 | " if is_passive(doc,matcher):\n", 259 | " cnt += 1\n", 260 | "print(cnt)" 261 | ], 262 | "execution_count": null, 263 | "outputs": [ 264 | { 265 | "output_type": "stream", 266 | "text": [ 267 | "40\n" 268 | ], 269 | "name": "stdout" 270 | } 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "id": "bqobr0CctArj" 277 | }, 278 | "source": [ 279 | "### Let's troubleshoot" 280 | ] 281 
| }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "colab": { 286 | "base_uri": "https://localhost:8080/" 287 | }, 288 | "id": "4EdEzFWHtArj", 289 | "outputId": "c002b05c-7b0a-4baf-a767-9ee54f666f80" 290 | }, 291 | "source": [ 292 | "cnt = 0\n", 293 | "missed = []\n", 294 | "for sent in passive:\n", 295 | " doc = nlp(sent)\n", 296 | " if is_passive(doc,matcher):\n", 297 | " cnt += 1\n", 298 | " else:\n", 299 | " missed.append(doc)\n", 300 | "print(cnt)" 301 | ], 302 | "execution_count": null, 303 | "outputs": [ 304 | { 305 | "output_type": "stream", 306 | "text": [ 307 | "40\n" 308 | ], 309 | "name": "stdout" 310 | } 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "metadata": { 316 | "colab": { 317 | "base_uri": "https://localhost:8080/", 318 | "height": 163 319 | }, 320 | "id": "79UCxE9ItArj", 321 | "outputId": "6af01691-8844-421e-a389-38c95efde178" 322 | }, 323 | "source": [ 324 | "missed[0]" 325 | ], 326 | "execution_count": null, 327 | "outputs": [ 328 | { 329 | "output_type": "error", 330 | "ename": "IndexError", 331 | "evalue": "ignored", 332 | "traceback": [ 333 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 334 | "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", 335 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmissed\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 336 | "\u001b[0;31mIndexError\u001b[0m: list index out of range" 337 | ] 338 | } 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "-iN097kQtArk", 345 | "outputId": "ad05c9af-89ea-4cb7-9f3e-db6c01621e88" 346 | }, 347 | "source": [ 348 | "missed[1]" 349 | ], 350 | "execution_count": null, 351 | "outputs": [ 352 | { 353 | "output_type": "execute_result", 354 | "data": { 355 | "text/plain": [ 356 | "Has apartment been left by them?" 
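A note on the outputs in this troubleshooting passage: they appear to be out of sync across runs. With the rule as finally saved, all 40 passive sentences are caught, so `missed` is empty and `missed[0]` raises an IndexError, while the `missed[1]` result and the two trees rendered below look like stale outputs from an earlier run with the weaker `nsubjpass`-only rule, under which agentive questions such as this one slip through. A hedged sketch of how one could repopulate `missed` with that weaker rule (the `weak_matcher` name is hypothetical):

    # Re-run with the original single-token rule to reproduce the misses.
    weak_matcher = Matcher(nlp.vocab)
    weak_matcher.add('Rule', [[{'DEP': 'nsubjpass'}]])

    missed = []
    for sent in passive:
        doc = nlp(sent)
        if len(weak_matcher(doc)) == 0:   # no passive nominal subject found
            missed.append(doc)
    print(len(missed))                    # sentences the nsubjpass-only rule fails to flag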
357 | ] 358 | }, 359 | "metadata": { 360 | "tags": [] 361 | }, 362 | "execution_count": 11 363 | } 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "id": "8F9vXmDHtArk" 370 | }, 371 | "source": [ 372 | "### Let's visualize their dependency trees" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "metadata": { 378 | "id": "XPspRsVvtArk", 379 | "outputId": "6ef5de5e-488a-49c9-ff1c-5738c5273e59" 380 | }, 381 | "source": [ 382 | "for doc in missed:\n", 383 | " displacy.render(doc, style=\"dep\")" 384 | ], 385 | "execution_count": null, 386 | "outputs": [ 387 | { 388 | "output_type": "display_data", 389 | "data": { 390 | "text/html": [ 391 | "(displacy SVG stripped during extraction; recoverable parse of 'Is a table being bought by Ritika?': Is/AUX, a/DET, table/NOUN, being/AUX, bought/VERB, by/ADP, Ritika/PROPN; arcs: det, attr, auxpass, acl, agent, pobj)" 475 | ], 476 | "text/plain": [ 477 | "" 478 | ] 479 | }, 480 | "metadata": { 481 | "tags": [] 482 | } 483 | }, 484 | { 485 | "output_type": "display_data", 486 | "data": { 487 | "text/html": [ 488 | "(displacy SVG stripped during extraction; recoverable parse of 'Has apartment been left by them?': Has/AUX, apartment/NOUN, been/AUX, left/VERB, by/ADP, them/PRON; arcs: aux, dobj, auxpass, agent, pobj)" 559 | ], 560 | "text/plain": [ 561 | "" 562 | ] 563 | }, 564 | "metadata": { 565 | "tags": [] 566 | } 567 | } 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "metadata": { 573 | "id": "9WtDcQ_XtArl", 574 | "outputId": "aaa28771-02c1-41ae-df8c-cf40ae7515a1" 575 | }, 576 | "source": [ 577 | "spacy.explain(\"auxpass\")" 578 | ], 579 | "execution_count": null, 580 | "outputs": [ 581 | { 582 | "output_type": "execute_result", 583 | "data": { 584 | "text/plain": [ 585 | "'auxiliary (passive)'" 586 | ] 587 | }, 588 | "metadata": { 589 | "tags": [] 590 | }, 591 | "execution_count": 13
592 | } 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": { 598 | "id": "0E-BKP-vtArn" 599 | }, 600 | "source": [ 601 | "[Dependencies](https://universaldependencies.org/docs/en/dep/)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": { 607 | "id": "x4_D5nq5tAro" 608 | }, 609 | "source": [ 610 | "### Update our rule\n", 611 | "[Reference](https://spacy.io/usage/rule-based-matching)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "metadata": { 617 | "id": "Rt2RSbsbtAro" 618 | }, 619 | "source": [ 620 | "passive_rule = [{'DEP':{\"IN\":['nsubjpass','auxpass']}}]\n", 621 | "matcher = Matcher(nlp.vocab)\n", 622 | "matcher.add('Rule',[passive_rule])" 623 | ], 624 | "execution_count": null, 625 | "outputs": [] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "metadata": { 630 | "id": "jqRoircqtAro", 631 | "outputId": "f762bf7a-a283-43b4-bab4-bc03bf92e880" 632 | }, 633 | "source": [ 634 | "cnt = 0\n", 635 | "for sent in active:\n", 636 | " doc = nlp(sent)\n", 637 | " if not is_passive(doc,matcher):\n", 638 | " cnt += 1\n", 639 | "print(cnt)" 640 | ], 641 | "execution_count": null, 642 | "outputs": [ 643 | { 644 | "output_type": "stream", 645 | "text": [ 646 | "40\n" 647 | ], 648 | "name": "stdout" 649 | } 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "metadata": { 655 | "id": "gsPNOHhttAro", 656 | "outputId": "e2f2f1ef-b54a-41a7-b21c-7406902aa6dc" 657 | }, 658 | "source": [ 659 | "cnt = 0\n", 660 | "missed = []\n", 661 | "for sent in passive:\n", 662 | " doc = nlp(sent)\n", 663 | " if is_passive(doc,matcher):\n", 664 | " cnt += 1\n", 665 | " else:\n", 666 | " missed.append(doc)\n", 667 | "print(cnt)" 668 | ], 669 | "execution_count": null, 670 | "outputs": [ 671 | { 672 | "output_type": "stream", 673 | "text": [ 674 | "40\n" 675 | ], 676 | "name": "stdout" 677 | } 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": { 683 | "id": "BDUS7kn4tArp" 684 | }, 685 | "source": [ 686 | "## Summary\n", 687 | " - Always test your rules and heuristics on a larger corpus to see how effective they are\n", 688 | " - One can write intricate matching rules using the `matcher` object" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "metadata": { 694 | "id": "bx_VfPt6tArp" 695 | }, 696 | "source": [ 697 | "" 698 | ], 699 | "execution_count": null, 700 | "outputs": [] 701 | } 702 | ] 703 | } --------------------------------------------------------------------------------
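Pulling the pieces of these notebooks together, a self-contained sketch of the final passive-voice detector (model name and example sentences as used above):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load("en_core_web_sm")

    # Final rule from the notebook: match any token whose dependency label is
    # either nsubjpass (passive nominal subject) or auxpass (passive auxiliary).
    matcher = Matcher(nlp.vocab)
    matcher.add('Rule', [[{'DEP': {'IN': ['nsubjpass', 'auxpass']}}]])

    def is_passive(doc, matcher):
        return len(matcher(doc)) > 0

    print(is_passive(nlp("Hens lay eggs."), matcher))                      # False
    print(is_passive(nlp("Is a table being bought by Ritika?"), matcher))  # True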