├── requirements.txt
├── notebooks
│   ├── images
│   │   ├── regex.png
│   │   ├── active.png
│   │   ├── results.png
│   │   ├── keywords.png
│   │   └── spacy_pipeline.png
│   ├── POS1.ipynb
│   ├── POS3.ipynb
│   ├── POS2.ipynb
│   ├── Dep_parsing2.ipynb
│   ├── Dep_parsing3.ipynb
│   └── Dep_parsing1.ipynb
├── .gitignore
└── data
    └── active_passive.csv
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/pos_tagging/main/requirements.txt
--------------------------------------------------------------------------------
/notebooks/images/regex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/pos_tagging/main/notebooks/images/regex.png
--------------------------------------------------------------------------------
/notebooks/images/active.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/pos_tagging/main/notebooks/images/active.png
--------------------------------------------------------------------------------
/notebooks/images/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/pos_tagging/main/notebooks/images/results.png
--------------------------------------------------------------------------------
/notebooks/images/keywords.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/pos_tagging/main/notebooks/images/keywords.png
--------------------------------------------------------------------------------
/notebooks/images/spacy_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/pos_tagging/main/notebooks/images/spacy_pipeline.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
56 | # IPython notebook
57 | .ipynb_checkpoints
58 |
59 | # Repo scratch directory
60 | scratch/
61 |
62 | # Misc
63 | .DS_Store
--------------------------------------------------------------------------------
/data/active_passive.csv:
--------------------------------------------------------------------------------
1 | Active,Passive
2 | He reads a novel.,A novel is read.
3 | He does not cook food.,Food is not cooked by him.
4 | Does he purchase books?,Are books being purchased by him?
5 | They grow plants.,Plants are grown by them.
6 | She teaches me.,I am taught by her.
7 | Esha is singing a song.,A song is being sung by Esha.
8 | Kritika is not chopping vegetables.,Vegetables are not being chopped by Kritika.
9 | Is Ritika buying a table?,Is a table being bought by Ritika?
10 | They are serving poor people.,Poor people are being served by them.
11 | She is disturbing Dinesh.,Dinesh is being disturbed by her.
12 | Nitesh has challenged her.,She has been challenged by Nitesh.
13 | Radhika has not written an article.,An article has not been written by Radhika.
14 | Have they left the apartment?,Has apartment been left by them?
15 | She has created this masterpiece.,This masterpiece has been created by her.
16 | I have read the newspaper.,The newspaper has been read by me.
17 | Reema cleaned the floor.,The floor was cleaned by Reema.
18 | Aisha bought a bicycle.,A bicycle was bought by Aisha.
19 | Naman called my friends.,My friends were called by Naman.
20 | I saved him.,He was saved by me.
21 | Miraya paid the bills.,The bills were paid by Miraya.
22 | Nitika was painting the wall.,The wall was being painted by Nitika.
23 | Manish was repairing the car.,The car was being repaired by Manish.
24 | Were you reciting the poem?,Was the poem being recited?
25 | She was baking the cake.,The cake was being baked by her.
26 | She was watching me.,I was being watched by her.
27 | Misha had cleaned the floor.,The floor had been cleaned by Misha.
28 | Vidhi had not received the parcel.,The parcel had not been received by Vidhi.
29 | Vishal had solved the doubt.,The doubt had been solved.
30 | Had they caught the thief?,Had the thief been caught by them?
31 | I had paid fifty thousand.,Fifty thousand had been paid by me.
32 | Kriya will sew the bag.,The bag will be sewed by Kriya.
33 | Disha will not arrange the things.,The things will not be arranged by Disha.
34 | Will you mop the floor?,Will the floor be mopped by you?
35 | They will post the letter.,The letter will be posted.
36 | Reena will save money.,Money will be saved by Reena.
37 | They will have brought the toy.,The toy will have been brought by them.
38 | Nimesh will not have changed the table cover.,The table cover will not have been changed by Nimesh.
39 | Will she have written the notes.,Will the notes have been written by her?
40 | They will have won the match.,The match will have been won by them.
41 | Vijay will have washed a shirt.,A shirt will have been washed by Vijay.
--------------------------------------------------------------------------------
/notebooks/POS1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Importing libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "import os"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### How do we identify product features?\n",
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "sent1 = \"I loved the screen on this phone.\"\n",
36 | "sent2 = \"The battery life on this phone is great.\"\n",
37 | "sent3 = \"The speakers are pathetic.\""
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Lets do a POS parse and see if we can figure out some patterns."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import spacy \n",
54 | "nlp = spacy.load(\"en_core_web_sm\")"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "I PRON\n",
67 | "loved VERB\n",
68 | "the DET\n",
69 | "screen NOUN\n",
70 | "on ADP\n",
71 | "this DET\n",
72 | "phone NOUN\n",
73 | ". PUNCT\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "doc1 = nlp(sent1)\n",
79 | "for tok in doc1:\n",
80 | " print(tok.text,tok.pos_)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 5,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "The DET\n",
93 | "battery NOUN\n",
94 | "life NOUN\n",
95 | "on ADP\n",
96 | "this DET\n",
97 | "phone NOUN\n",
98 | "is AUX\n",
99 | "great ADJ\n",
100 | ". PUNCT\n"
101 | ]
102 | }
103 | ],
104 | "source": [
105 | "doc2 = nlp(sent2)\n",
106 | "for tok in doc2:\n",
107 | " print(tok.text,tok.pos_)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 6,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "The DET\n",
120 | "speakers NOUN\n",
121 | "are AUX\n",
122 | "pathetic ADJ\n",
123 | ". PUNCT\n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "doc3 = nlp(sent3)\n",
129 | "for tok in doc3:\n",
130 | " print(tok.text,tok.pos_)"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "#### **Product features such as `screen`, `battery`, `speaker` have a POS tag of NOUN**"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "## Summary\n",
145 | "- Product features such as `screen`, `battery` and `speaker` have a POS tag of Noun\n",
146 | "- If we can find the frequency count of all the nouns in our data, then by looking at top-n nouns we can find out what product features people are talking about\n",
147 | "- Check hypothesis on a real world dataset"
148 | ]
149 | },
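{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of that frequency-count idea, applied to the three sample sentences parsed above. It is illustrative only; `top_n` is just a hypothetical name for the number of nouns to inspect."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect the lemmatised nouns from the three sample sentences (illustrative sketch)\n",
"nouns = []\n",
"for doc in [doc1, doc2, doc3]:\n",
"    for tok in doc:\n",
"        if tok.pos_ == \"NOUN\":\n",
"            nouns.append(tok.lemma_.lower())\n",
"\n",
"# The top-n most frequent nouns are candidate product features\n",
"top_n = 3\n",
"pd.Series(nouns).value_counts().head(top_n)"
]
},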
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": []
156 | }
157 | ],
158 | "metadata": {
159 | "kernelspec": {
160 | "display_name": "Python 3",
161 | "language": "python",
162 | "name": "python3"
163 | },
164 | "language_info": {
165 | "codemirror_mode": {
166 | "name": "ipython",
167 | "version": 3
168 | },
169 | "file_extension": ".py",
170 | "mimetype": "text/x-python",
171 | "name": "python",
172 | "nbconvert_exporter": "python",
173 | "pygments_lexer": "ipython3",
174 | "version": "3.8.3"
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 4
179 | }
180 |
--------------------------------------------------------------------------------
/notebooks/POS3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Importing libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "import os\n",
19 | "import spacy \n",
20 | "from tqdm import tqdm"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "### Read reviews data"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n",
37 | "samsung_reviews=con.read()\n",
38 | "con.close()"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "### Can we reduce the time taken?\n",
46 | "[Pipelines (Spacy)](https://spacy.io/usage/processing-pipelines)\n"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "
"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "# shorten the pipline loading\n",
63 | "nlp=spacy.load('en_core_web_sm',disable=['parser','ner'])"
64 | ]
65 | },
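{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a small illustrative addition, not part of the original analysis), `nlp.pipe_names` lists the components that remain active after loading with `disable`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Components still enabled after disabling 'parser' and 'ner'\n",
"nlp.pipe_names"
]
},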
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stderr",
73 | "output_type": "stream",
74 | "text": [
75 | "100%|██████████| 1000/1000 [00:06<00:00, 148.24it/s]\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "nouns = []\n",
81 | "for review in tqdm(samsung_reviews.split(\"\\n\")[0:1000]):\n",
82 | " doc = nlp(review)\n",
83 | " for tok in doc:\n",
84 | " if tok.pos_==\"NOUN\":\n",
85 | " nouns.append(tok.lemma_.lower())"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "len(samsung_reviews.split(\"\\n\"))"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 5,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "278.13"
106 | ]
107 | },
108 | "execution_count": 5,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "(46355/1000)*6"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 6,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "4.633333333333334"
126 | ]
127 | },
128 | "execution_count": 6,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "278/60"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "### Lets process all the reviews now and see if time taken is less !!!"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 7,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stderr",
151 | "output_type": "stream",
152 | "text": [
153 | "100%|██████████| 46355/46355 [04:27<00:00, 173.42it/s]\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "nouns = []\n",
159 | "for review in tqdm(samsung_reviews.split(\"\\n\")):\n",
160 | " doc = nlp(review)\n",
161 | " for tok in doc:\n",
162 | " if tok.pos_==\"NOUN\":\n",
163 | " nouns.append(tok.lemma_.lower())"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "### Does the hypothesis of nouns capturing `product features` hold?"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 8,
176 | "metadata": {},
177 | "outputs": [
178 | {
179 | "data": {
180 | "text/plain": [
181 | "phone 43237\n",
182 | "battery 4350\n",
183 | "product 3907\n",
184 | "time 3825\n",
185 | "screen 3746\n",
186 | "dtype: int64"
187 | ]
188 | },
189 | "execution_count": 8,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "nouns=pd.Series(nouns)\n",
196 | "nouns.value_counts().head(5)"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 9,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "phone 43237\n",
208 | "battery 4350\n",
209 | "product 3907\n",
210 | "time 3825\n",
211 | "screen 3746\n",
212 | "card 3399\n",
213 | "price 3148\n",
214 | "problem 3120\n",
215 | "camera 2773\n",
216 | "app 2606\n",
217 | "dtype: int64"
218 | ]
219 | },
220 | "execution_count": 9,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "nouns.value_counts().head(10)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "### We now know that people mention `battery`, `product`, `screen` etc. But we still don't know in what context they mention these keywords"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### Summary:\n",
241 | " - Most frequently used lemmatised forms of noun, inform us about the product features people are talking about in product reviews\n",
242 | " - In order to process the review data faster spacy allows us to use the idea of enabling parts of model inference pipeline via `spacy.loads()` command and `disable` parameter"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": []
251 | }
252 | ],
253 | "metadata": {
254 | "kernelspec": {
255 | "display_name": "Python 3",
256 | "language": "python",
257 | "name": "python3"
258 | },
259 | "language_info": {
260 | "codemirror_mode": {
261 | "name": "ipython",
262 | "version": 3
263 | },
264 | "file_extension": ".py",
265 | "mimetype": "text/x-python",
266 | "name": "python",
267 | "nbconvert_exporter": "python",
268 | "pygments_lexer": "ipython3",
269 | "version": "3.8.3"
270 | }
271 | },
272 | "nbformat": 4,
273 | "nbformat_minor": 4
274 | }
275 |
--------------------------------------------------------------------------------
/notebooks/POS2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Importing libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "import os"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import spacy \n",
28 | "nlp = spacy.load(\"en_core_web_sm\")"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "### Read reviews data"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "con=open(\"../data/Samsung.txt\",'r', encoding=\"utf-8\")\n",
45 | "samsung_reviews=con.read()\n",
46 | "con.close()"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/plain": [
57 | "46355"
58 | ]
59 | },
60 | "execution_count": 4,
61 | "metadata": {},
62 | "output_type": "execute_result"
63 | }
64 | ],
65 | "source": [
66 | "len(samsung_reviews.split(\"\\n\"))"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "### Dataset is a text file where each review is in a new line"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 5,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/plain": [
84 | "[\"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!\",\n",
85 | " 'nice phone, nice up grade from my pantach revue. Very clean set up and easy set up. never had an android phone but they are fantastic to say the least. perfect size for surfing and social media. great phone samsung',\n",
86 | " 'Very pleased',\n",
87 | " 'It works good but it goes slow sometimes but its a very good phone I love it']"
88 | ]
89 | },
90 | "execution_count": 5,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "samsung_reviews.split(\"\\n\")[0:4]"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### Will our hypothesis hold on real world data? `Product features---POS_NOUN`"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 6,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "review1=samsung_reviews.split(\"\\n\")[0]\n",
113 | "review1=nlp(review1)"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "### Lets do nlp parse on part of one review in our dataset"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 7,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "I --- I --- PRON\n",
133 | "feel --- feel --- VERB\n",
134 | "so --- so --- ADV\n",
135 | "LUCKY --- lucky --- ADJ\n",
136 | "to --- to --- PART\n",
137 | "have --- have --- AUX\n",
138 | "found --- find --- VERB\n",
139 | "this --- this --- DET\n",
140 | "used --- use --- VERB\n",
141 | "( --- ( --- PUNCT\n"
142 | ]
143 | }
144 | ],
145 | "source": [
146 | "for tok in review1[0:10]:\n",
147 | " print(tok.text,\"---\",tok.lemma_,\"---\",tok.pos_)"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "#### Real world data is usually messy, observe the words `found` and `used`"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 8,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "pos = []\n",
164 | "lemma = []\n",
165 | "text = []\n",
166 | "for tok in review1:\n",
167 | " pos.append(tok.pos_)\n",
168 | " lemma.append(tok.lemma_)\n",
169 | " text.append(tok.text)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 9,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/html": [
180 | "
\n",
181 | "\n",
194 | "
\n",
195 | " \n",
196 | " \n",
197 | " | \n",
198 | " text | \n",
199 | " lemma | \n",
200 | " pos | \n",
201 | "
\n",
202 | " \n",
203 | " \n",
204 | " \n",
205 | " | 0 | \n",
206 | " I | \n",
207 | " I | \n",
208 | " PRON | \n",
209 | "
\n",
210 | " \n",
211 | " | 1 | \n",
212 | " feel | \n",
213 | " feel | \n",
214 | " VERB | \n",
215 | "
\n",
216 | " \n",
217 | " | 2 | \n",
218 | " so | \n",
219 | " so | \n",
220 | " ADV | \n",
221 | "
\n",
222 | " \n",
223 | " | 3 | \n",
224 | " LUCKY | \n",
225 | " lucky | \n",
226 | " ADJ | \n",
227 | "
\n",
228 | " \n",
229 | " | 4 | \n",
230 | " to | \n",
231 | " to | \n",
232 | " PART | \n",
233 | "
\n",
234 | " \n",
235 | "
\n",
236 | "
"
237 | ],
238 | "text/plain": [
239 | " text lemma pos\n",
240 | "0 I I PRON\n",
241 | "1 feel feel VERB\n",
242 | "2 so so ADV\n",
243 | "3 LUCKY lucky ADJ\n",
244 | "4 to to PART"
245 | ]
246 | },
247 | "execution_count": 9,
248 | "metadata": {},
249 | "output_type": "execute_result"
250 | }
251 | ],
252 | "source": [
253 | "nlp_table = pd.DataFrame({'text':text,'lemma':lemma,'pos':pos})\n",
254 | "nlp_table.head()"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 10,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "data": {
264 | "text/plain": [
265 | "phone 3\n",
266 | "one 2\n",
267 | "year 1\n",
268 | "seller 1\n",
269 | "honesty 1\n",
270 | "upgrade 1\n",
271 | "line 1\n",
272 | "Name: lemma, dtype: int64"
273 | ]
274 | },
275 | "execution_count": 10,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "## Get most frequent lemma forms of nouns\n",
282 | "nlp_table[nlp_table['pos']=='NOUN']['lemma'].value_counts()"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | "#### It seems possible that if we extract all the nouns from the reviews and look at the top 5 most frequent lemmatised noun forms, we will be able to identify `What people are talking about?`"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "### Lets repeat this experiment on a larger set of reviews"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 11,
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "nouns = []\n",
306 | "for review in samsung_reviews.split(\"\\n\")[0:100]:\n",
307 | " doc = nlp(review)\n",
308 | " for tok in doc:\n",
309 | " if tok.pos_==\"NOUN\":\n",
310 | " nouns.append(tok.lemma_.lower())"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "### Lets add some way of keeping track of time"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 12,
323 | "metadata": {},
324 | "outputs": [
325 | {
326 | "name": "stderr",
327 | "output_type": "stream",
328 | "text": [
329 | "100%|██████████| 1000/1000 [00:17<00:00, 56.70it/s]\n"
330 | ]
331 | },
332 | {
333 | "data": {
334 | "text/plain": [
335 | "phone 1208\n",
336 | "battery 92\n",
337 | "time 90\n",
338 | "price 87\n",
339 | "screen 86\n",
340 | "dtype: int64"
341 | ]
342 | },
343 | "execution_count": 12,
344 | "metadata": {},
345 | "output_type": "execute_result"
346 | }
347 | ],
348 | "source": [
349 | "from tqdm import tqdm\n",
350 | "nouns = []\n",
351 | "for review in tqdm(samsung_reviews.split(\"\\n\")[0:1000]):\n",
352 | " doc = nlp(review)\n",
353 | " for tok in doc:\n",
354 | " if tok.pos_==\"NOUN\":\n",
355 | " nouns.append(tok.lemma_.lower())\n",
356 | "pd.Series(nouns).value_counts().head(5)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 13,
362 | "metadata": {},
363 | "outputs": [
364 | {
365 | "data": {
366 | "text/plain": [
367 | "46355"
368 | ]
369 | },
370 | "execution_count": 13,
371 | "metadata": {},
372 | "output_type": "execute_result"
373 | }
374 | ],
375 | "source": [
376 | "len(samsung_reviews.split(\"\\n\"))"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "### Did you notice anything? What do you think will be the time taken to process all the reviews?"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 14,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "data": {
393 | "text/plain": [
394 | "782"
395 | ]
396 | },
397 | "execution_count": 14,
398 | "metadata": {},
399 | "output_type": "execute_result"
400 | }
401 | ],
402 | "source": [
403 | "(46355//1000)*17"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 15,
409 | "metadata": {},
410 | "outputs": [
411 | {
412 | "data": {
413 | "text/plain": [
414 | "13"
415 | ]
416 | },
417 | "execution_count": 15,
418 | "metadata": {},
419 | "output_type": "execute_result"
420 | }
421 | ],
422 | "source": [
423 | "782//60"
424 | ]
425 | },
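{
"cell_type": "markdown",
"metadata": {},
"source": [
"The estimates above extrapolate from the time taken for 1,000 reviews. A rough timing sketch with the standard-library `time` module (illustrative only, not part of the original notebook) would measure that number directly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"# Time how long the full pipeline takes on a fixed batch of reviews\n",
"start = time.perf_counter()\n",
"for review in samsung_reviews.split(\"\\n\")[0:1000]:\n",
"    nlp(review)\n",
"elapsed = time.perf_counter() - start\n",
"print(f\"{elapsed:.1f} seconds for 1,000 reviews\")"
]
},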
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "## Summary\n",
431 | "- POS tag based rule seems to be working well\n",
432 | "- We need to figure out a way to reduce the time taken to process reviews"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": []
441 | }
442 | ],
443 | "metadata": {
444 | "kernelspec": {
445 | "display_name": "Python 3",
446 | "language": "python",
447 | "name": "python3"
448 | },
449 | "language_info": {
450 | "codemirror_mode": {
451 | "name": "ipython",
452 | "version": 3
453 | },
454 | "file_extension": ".py",
455 | "mimetype": "text/x-python",
456 | "name": "python",
457 | "nbconvert_exporter": "python",
458 | "pygments_lexer": "ipython3",
459 | "version": "3.8.3"
460 | }
461 | },
462 | "nbformat": 4,
463 | "nbformat_minor": 4
464 | }
465 |
--------------------------------------------------------------------------------
/notebooks/Dep_parsing2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import spacy\n",
10 | "from spacy import displacy\n",
11 | "import pandas as pd\n",
12 | "nlp = spacy.load(\"en_core_web_sm\")"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "active = ['Hens lay eggs.',\n",
22 | " 'Birds build nests.',\n",
23 | " 'The batter hit the ball.',\n",
24 | " 'The computer transmitted a copy of the manual']\n",
25 | "passive = ['Eggs are laid by hens',\n",
26 | " 'Nests are built by birds',\n",
27 | " 'The ball was hit by the batter',\n",
28 | " 'A copy of the manual was transmitted by the computer.']"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "### How do we impliment the rule `if dep nsubjpass, then passive else not`?"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "from spacy.matcher import Matcher"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "### Read more about it [here](https://spacy.io/api/matcher)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/html": [
62 | ""
120 | ],
121 | "text/plain": [
122 | ""
123 | ]
124 | },
125 | "metadata": {},
126 | "output_type": "display_data"
127 | }
128 | ],
129 | "source": [
130 | "doc = nlp(passive[0])\n",
131 | "displacy.render(doc, style=\"dep\")"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "### Create a rule with `Matcher`"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 5,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "rule = [{'POS':'NOUN'}]\n",
148 | "matcher = Matcher(nlp.vocab)\n",
149 | "matcher.add('Rule',[rule])"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 6,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "[(15740618714089435985, 0, 1), (15740618714089435985, 4, 5)]"
161 | ]
162 | },
163 | "execution_count": 6,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "matcher(doc)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 7,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/plain": [
180 | "Eggs"
181 | ]
182 | },
183 | "execution_count": 7,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "doc[0:1]"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 8,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "data": {
199 | "text/plain": [
200 | "hens"
201 | ]
202 | },
203 | "execution_count": 8,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "doc[4:5]"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "### Create a rule for `passive voice`"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 9,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "passive_rule = [{'DEP':'nsubjpass'}]\n",
226 | "matcher = Matcher(nlp.vocab)\n",
227 | "matcher.add('Rule',[passive_rule])"
228 | ]
229 | },
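{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the `nsubjpass` label is unfamiliar, `spacy.explain` gives a short human-readable description (a small illustrative addition):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Human-readable description of the dependency label used in the rule\n",
"spacy.explain('nsubjpass')"
]
},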
230 | {
231 | "cell_type": "code",
232 | "execution_count": 10,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "[(15740618714089435985, 0, 1)]"
239 | ]
240 | },
241 | "execution_count": 10,
242 | "metadata": {},
243 | "output_type": "execute_result"
244 | }
245 | ],
246 | "source": [
247 | "matcher(doc)"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "### Let's check how this rule works if we use it on a sentence with `active voice`"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 11,
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "data": {
264 | "text/plain": [
265 | "'Hens lay eggs.'"
266 | ]
267 | },
268 | "execution_count": 11,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "active[0]"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 12,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/html": [
285 | ""
317 | ],
318 | "text/plain": [
319 | ""
320 | ]
321 | },
322 | "metadata": {},
323 | "output_type": "display_data"
324 | }
325 | ],
326 | "source": [
327 | "doc = nlp(active[0])\n",
328 | "displacy.render(doc, style=\"dep\")"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 13,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "data": {
338 | "text/plain": [
339 | "[]"
340 | ]
341 | },
342 | "execution_count": 13,
343 | "metadata": {},
344 | "output_type": "execute_result"
345 | }
346 | ],
347 | "source": [
348 | "matcher(doc)"
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {},
354 | "source": [
355 | "### Now lets make a function that impliments this logic"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 14,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "def is_passive(doc,matcher):\n",
365 | " if len(matcher(doc))>0:\n",
366 | " return True\n",
367 | " else:\n",
368 | " return False"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "### Let's test this function on our small sample of sentences and see how the pipeline will work"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 15,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "name": "stdout",
385 | "output_type": "stream",
386 | "text": [
387 | "False\n",
388 | "False\n",
389 | "False\n",
390 | "False\n"
391 | ]
392 | }
393 | ],
394 | "source": [
395 | "for sent in active:\n",
396 | " doc = nlp(sent)\n",
397 | " print(is_passive(doc,matcher))"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 16,
403 | "metadata": {},
404 | "outputs": [
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "True\n",
410 | "True\n",
411 | "True\n",
412 | "True\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "for sent in passive:\n",
418 | " doc = nlp(sent)\n",
419 | " print(is_passive(doc,matcher))"
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "### Summary\n",
427 | " - One can go a long way by observing patterns in linguistic data, you don't always need to know the details of the linguitsics very well.\n",
428 | " - Once can use the `matcher` object to find if certain linguistic patterns exist in data"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {},
435 | "outputs": [],
436 | "source": []
437 | }
438 | ],
439 | "metadata": {
440 | "kernelspec": {
441 | "display_name": "Python 3",
442 | "language": "python",
443 | "name": "python3"
444 | },
445 | "language_info": {
446 | "codemirror_mode": {
447 | "name": "ipython",
448 | "version": 3
449 | },
450 | "file_extension": ".py",
451 | "mimetype": "text/x-python",
452 | "name": "python",
453 | "nbconvert_exporter": "python",
454 | "pygments_lexer": "ipython3",
455 | "version": "3.8.3"
456 | }
457 | },
458 | "nbformat": 4,
459 | "nbformat_minor": 4
460 | }
461 |
--------------------------------------------------------------------------------
/notebooks/Dep_parsing3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import spacy\n",
10 | "from spacy import displacy\n",
11 | "from spacy.matcher import Matcher\n",
12 | "import pandas as pd\n",
13 | "nlp = spacy.load(\"en_core_web_sm\")"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "### Lets check our rule on a larger corpus"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/html": [
31 | "\n",
32 | "\n",
45 | "
\n",
46 | " \n",
47 | " \n",
48 | " | \n",
49 | " Active | \n",
50 | " Passive | \n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | " | 0 | \n",
56 | " He reads a novel. | \n",
57 | " A novel is read. | \n",
58 | "
\n",
59 | " \n",
60 | " | 1 | \n",
61 | " He does not cook food. | \n",
62 | " Food is not cooked by him. | \n",
63 | "
\n",
64 | " \n",
65 | "
\n",
66 | "
"
67 | ],
68 | "text/plain": [
69 | " Active Passive\n",
70 | "0 He reads a novel. A novel is read.\n",
71 | "1 He does not cook food. Food is not cooked by him."
72 | ]
73 | },
74 | "execution_count": 2,
75 | "metadata": {},
76 | "output_type": "execute_result"
77 | }
78 | ],
79 | "source": [
80 | "active_passive = pd.read_csv('../data/active_passive.csv')\n",
81 | "active_passive.head(2)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 3,
87 | "metadata": {},
88 | "outputs": [
89 | {
90 | "data": {
91 | "text/plain": [
92 | "(40, 2)"
93 | ]
94 | },
95 | "execution_count": 3,
96 | "metadata": {},
97 | "output_type": "execute_result"
98 | }
99 | ],
100 | "source": [
101 | "active_passive.shape"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 4,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "active = active_passive['Active']\n",
111 | "passive = active_passive['Passive']"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "### Create the rule"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 5,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "passive_rule = [{'DEP':'nsubjpass'}]\n",
128 | "matcher = Matcher(nlp.vocab)\n",
129 | "matcher.add('Rule',[passive_rule])"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "def is_passive(doc,matcher):\n",
139 | " if len(matcher(doc))>0:\n",
140 | " return True\n",
141 | " else:\n",
142 | " return False"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "### Check rule on active voice sentences"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 7,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "40\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "cnt = 0\n",
167 | "for sent in active:\n",
168 | " doc = nlp(sent)\n",
169 | " if not is_passive(doc,matcher):\n",
170 | " cnt += 1\n",
171 | "print(cnt)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "### Check rule on passive voice sentences"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 8,
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "38\n"
191 | ]
192 | }
193 | ],
194 | "source": [
195 | "cnt = 0\n",
196 | "for sent in passive:\n",
197 | " doc = nlp(sent)\n",
198 | " if is_passive(doc,matcher):\n",
199 | " cnt += 1\n",
200 | "print(cnt)"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "### Let's troubleshoot"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 9,
213 | "metadata": {},
214 | "outputs": [
215 | {
216 | "name": "stdout",
217 | "output_type": "stream",
218 | "text": [
219 | "38\n"
220 | ]
221 | }
222 | ],
223 | "source": [
224 | "cnt = 0\n",
225 | "missed = []\n",
226 | "for sent in passive:\n",
227 | " doc = nlp(sent)\n",
228 | " if is_passive(doc,matcher):\n",
229 | " cnt += 1\n",
230 | " else:\n",
231 | " missed.append(doc)\n",
232 | "print(cnt)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 10,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "Is a table being bought by Ritika?"
244 | ]
245 | },
246 | "execution_count": 10,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "missed[0]"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 11,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "data": {
262 | "text/plain": [
263 | "Has apartment been left by them?"
264 | ]
265 | },
266 | "execution_count": 11,
267 | "metadata": {},
268 | "output_type": "execute_result"
269 | }
270 | ],
271 | "source": [
272 | "missed[1]"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "### Let's visualize their dependency trees"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 12,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/html": [
290 | ""
374 | ],
375 | "text/plain": [
376 | ""
377 | ]
378 | },
379 | "metadata": {},
380 | "output_type": "display_data"
381 | },
382 | {
383 | "data": {
384 | "text/html": [
385 | ""
456 | ],
457 | "text/plain": [
458 | ""
459 | ]
460 | },
461 | "metadata": {},
462 | "output_type": "display_data"
463 | }
464 | ],
465 | "source": [
466 | "for doc in missed:\n",
467 | " displacy.render(doc, style=\"dep\")"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 13,
473 | "metadata": {},
474 | "outputs": [
475 | {
476 | "data": {
477 | "text/plain": [
478 | "'auxiliary (passive)'"
479 | ]
480 | },
481 | "execution_count": 13,
482 | "metadata": {},
483 | "output_type": "execute_result"
484 | }
485 | ],
486 | "source": [
487 | "spacy.explain(\"auxpass\")"
488 | ]
489 | },
490 | {
491 | "cell_type": "markdown",
492 | "metadata": {},
493 | "source": [
494 | "[Dependencies](https://universaldependencies.org/docs/en/dep/)"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "### Update our rule\n",
502 | "[Reference](https://spacy.io/usage/rule-based-matching)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 14,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": [
511 | "passive_rule = [{'DEP':{\"IN\":['nsubjpass','auxpass']}}]\n",
512 | "matcher = Matcher(nlp.vocab)\n",
513 | "matcher.add('Rule',[passive_rule])"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 15,
519 | "metadata": {},
520 | "outputs": [
521 | {
522 | "name": "stdout",
523 | "output_type": "stream",
524 | "text": [
525 | "40\n"
526 | ]
527 | }
528 | ],
529 | "source": [
530 | "cnt = 0\n",
531 | "for sent in active:\n",
532 | " doc = nlp(sent)\n",
533 | " if not is_passive(doc,matcher):\n",
534 | " cnt += 1\n",
535 | "print(cnt)"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": 16,
541 | "metadata": {},
542 | "outputs": [
543 | {
544 | "name": "stdout",
545 | "output_type": "stream",
546 | "text": [
547 | "40\n"
548 | ]
549 | }
550 | ],
551 | "source": [
552 | "cnt = 0\n",
553 | "missed = []\n",
554 | "for sent in passive:\n",
555 | " doc = nlp(sent)\n",
556 | " if is_passive(doc,matcher):\n",
557 | " cnt += 1\n",
558 | " else:\n",
559 | " missed.append(doc)\n",
560 | "print(cnt)"
561 | ]
562 | },
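{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same check can be summarised as an accuracy score per column. This is an illustrative sketch; `accuracy` is a hypothetical helper, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fraction of sentences the rule classifies as expected (illustrative sketch)\n",
"def accuracy(sentences, expect_passive):\n",
"    hits = sum(is_passive(nlp(s), matcher) == expect_passive for s in sentences)\n",
"    return hits / len(sentences)\n",
"\n",
"print('active :', accuracy(active, expect_passive=False))\n",
"print('passive:', accuracy(passive, expect_passive=True))"
]
},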
563 | {
564 | "cell_type": "markdown",
565 | "metadata": {},
566 | "source": [
567 | "## Summary\n",
568 | " - Always test your rules and hueristics on a larger corpus to see the effectiveness of the rules\n",
569 | " - One can write intricate matching rules using `matcher` object"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": null,
575 | "metadata": {},
576 | "outputs": [],
577 | "source": []
578 | }
579 | ],
580 | "metadata": {
581 | "kernelspec": {
582 | "display_name": "Python 3",
583 | "language": "python",
584 | "name": "python3"
585 | },
586 | "language_info": {
587 | "codemirror_mode": {
588 | "name": "ipython",
589 | "version": 3
590 | },
591 | "file_extension": ".py",
592 | "mimetype": "text/x-python",
593 | "name": "python",
594 | "nbconvert_exporter": "python",
595 | "pygments_lexer": "ipython3",
596 | "version": "3.8.3"
597 | }
598 | },
599 | "nbformat": 4,
600 | "nbformat_minor": 4
601 | }
602 |
--------------------------------------------------------------------------------
/notebooks/Dep_parsing1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | ""
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import spacy\n",
17 | "from spacy import displacy\n",
18 | "import pandas as pd\n",
19 | "nlp = spacy.load(\"en_core_web_sm\")"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## How to do a dependency parse?"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "active = ['Hens lay eggs.',\n",
36 | " 'Birds build nests.',\n",
37 | " 'The batter hit the ball.',\n",
38 | " 'The computer transmitted a copy of the manual']\n",
39 | "passive = ['Eggs are laid by hens',\n",
40 | " 'Nests are built by birds',\n",
41 | " 'The ball was hit by the batter',\n",
42 | " 'A copy of the manual was transmitted by the computer.']"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "Hens nsubj\n",
55 | "lay ROOT\n",
56 | "eggs dobj\n",
57 | ". punct\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "doc = nlp(active[0])\n",
63 | "for tok in doc:\n",
64 | " print(tok.text,tok.dep_)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### Visualize this parse"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/html": [
82 | ""
114 | ],
115 | "text/plain": [
116 | ""
117 | ]
118 | },
119 | "metadata": {},
120 | "output_type": "display_data"
121 | }
122 | ],
123 | "source": [
124 | "displacy.render(doc, style=\"dep\")"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "To understand what these dependency relationships one can use [this link](https://universaldependencies.org/docs/en/dep/)"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "### Going through the dependency relationships it looks like that one would need to know linguistics and grammar to be able to do analysis. This is not entirely true. Many times being able to find out `patterns` in terms of dependency relationships is enough to perform the task at hand"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 5,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "data": {
148 | "text/html": [
149 | ""
181 | ],
182 | "text/plain": [
183 | ""
184 | ]
185 | },
186 | "metadata": {},
187 | "output_type": "display_data"
188 | },
189 | {
190 | "data": {
191 | "text/html": [
192 | ""
224 | ],
225 | "text/plain": [
226 | ""
227 | ]
228 | },
229 | "metadata": {},
230 | "output_type": "display_data"
231 | },
232 | {
233 | "data": {
234 | "text/html": [
235 | ""
293 | ],
294 | "text/plain": [
295 | ""
296 | ]
297 | },
298 | "metadata": {},
299 | "output_type": "display_data"
300 | },
301 | {
302 | "data": {
303 | "text/html": [
304 | ""
401 | ],
402 | "text/plain": [
403 | ""
404 | ]
405 | },
406 | "metadata": {},
407 | "output_type": "display_data"
408 | }
409 | ],
410 | "source": [
411 | "for sent in active:\n",
412 | " doc = nlp(sent)\n",
413 | " displacy.render(doc, style=\"dep\")"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 6,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/html": [
424 | ""
482 | ],
483 | "text/plain": [
484 | ""
485 | ]
486 | },
487 | "metadata": {},
488 | "output_type": "display_data"
489 | },
490 | {
491 | "data": {
492 | "text/html": [
493 | ""
551 | ],
552 | "text/plain": [
553 | ""
554 | ]
555 | },
556 | "metadata": {},
557 | "output_type": "display_data"
558 | },
559 | {
560 | "data": {
561 | "text/html": [
562 | ""
646 | ],
647 | "text/plain": [
648 | ""
649 | ]
650 | },
651 | "metadata": {},
652 | "output_type": "display_data"
653 | },
654 | {
655 | "data": {
656 | "text/html": [
657 | ""
780 | ],
781 | "text/plain": [
782 | ""
783 | ]
784 | },
785 | "metadata": {},
786 | "output_type": "display_data"
787 | }
788 | ],
789 | "source": [
790 | "for sent in passive:\n",
791 | " doc = nlp(sent)\n",
792 | " displacy.render(doc, style=\"dep\")"
793 | ]
794 | },
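{
"cell_type": "markdown",
"metadata": {},
"source": [
"A plain-text view of the same parses (a small illustrative sketch): printing each token's dependency label makes the `nsubjpass` relation easy to spot without the rendered trees."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print each token with its dependency label; the subject of a passive\n",
"# sentence carries the 'nsubjpass' relation\n",
"for sent in passive:\n",
"    doc = nlp(sent)\n",
"    print([(tok.text, tok.dep_) for tok in doc])"
]
},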
795 | {
796 | "cell_type": "markdown",
797 | "metadata": {},
798 | "source": [
799 | "## Summary:\n",
800 | "- Spacy's dependency parser let's us visualise the relationships\n",
801 | "- When a sentence is in passive voice there is always a presence if `nsubjpass` dependency relation"
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": null,
807 | "metadata": {},
808 | "outputs": [],
809 | "source": []
810 | }
811 | ],
812 | "metadata": {
813 | "kernelspec": {
814 | "display_name": "Python 3",
815 | "language": "python",
816 | "name": "python3"
817 | },
818 | "language_info": {
819 | "codemirror_mode": {
820 | "name": "ipython",
821 | "version": 3
822 | },
823 | "file_extension": ".py",
824 | "mimetype": "text/x-python",
825 | "name": "python",
826 | "nbconvert_exporter": "python",
827 | "pygments_lexer": "ipython3",
828 | "version": "3.8.3"
829 | }
830 | },
831 | "nbformat": 4,
832 | "nbformat_minor": 4
833 | }
834 |
--------------------------------------------------------------------------------