├── BehindSpacy.jpg
├── Intent Classification With Rasa - Spacy
│   ├── config_spacy.yaml
│   ├── nlu_nlp_explain.png
│   ├── projects
│   │   └── default
│   │       └── model_20180602-072117
│   │           ├── entity_synonyms.json
│   │           ├── crf_model.pkl
│   │           ├── intent_classifier_sklearn.pkl
│   │           ├── regex_featurizer.json
│   │           ├── metadata.json
│   │           └── training_data.json
│   ├── demo-rasa.json
│   ├── rasa_dataset.json
│   └── Intent Classification With Rasa NLU and SpaCy.ipynb
├── SpaCy_logo.png
├── quotesfile.txt
├── textacylogo1.png
├── NLP_in_French
│   ├── SpaCy_logo.png
│   └── BehindSpacy.jpg
├── NLP_with_SpaCy
│   ├── BehindSpacy.jpg
│   ├── SpaCy_logo.png
│   ├── quotesfile.txt
│   ├── imageredacted.jpg
│   ├── samplefile.txt
│   ├── quotesfiles.txt
│   ├── spacy_summarizer.py
│   ├── spacy_pipeline.svg
│   ├── How to detect languages with SpaCy.ipynb
│   ├── NLP with SpaCy- Adding Extensions Attributes in SpaCy(How to use sentiment analysis in SpaCy).ipynb
│   ├── Automatic Redaction & Sanitization of Document Using Spacy NER.ipynb
│   ├── How to Find the Most Common Words Using Spacy.ipynb
│   └── Training the Named Entity Recognizer in SpaCy.ipynb
├── NLP_with_Textacy
│   ├── textacylogo1.png
│   ├── README.md
│   ├── example.txt
│   └── example1.txt
├── samplefile.txt
├── NLP_with_Flair
│   └── text_classification_with_flair_workflow_jcharistech.png
├── NLP with JavaScript
│   ├── index.js
│   ├── NLP-with-JavaScript.md
│   └── index.html
├── quotesfiles.txt
├── NLP-with-JavaScript.md
├── README.md
├── index.html
├── example.txt
├── spacy_pipeline.svg
├── example1.txt
├── NLP with SpaCy- Adding Extensions Attributes in SpaCy(How to use sentiment analysis in SpaCy).ipynb
├── Text Summarization with Sumy Python .ipynb
├── How to Find the Most Common Words Using Spacy.ipynb
├── NLP_with_Polyglot
│   └── NLP with Polyglot .ipynb
└── Training the Named Entity Recognizer in SpaCy.ipynb
/BehindSpacy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/BehindSpacy.jpg
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/config_spacy.yaml:
--------------------------------------------------------------------------------
1 | language: "en"
2 |
3 | pipeline: "spacy_sklearn"
4 |
--------------------------------------------------------------------------------
/SpaCy_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/SpaCy_logo.png
--------------------------------------------------------------------------------
/quotesfile.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/quotesfile.txt
--------------------------------------------------------------------------------
/textacylogo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/textacylogo1.png
--------------------------------------------------------------------------------
/NLP_in_French/SpaCy_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_in_French/SpaCy_logo.png
--------------------------------------------------------------------------------
/NLP_in_French/BehindSpacy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_in_French/BehindSpacy.jpg
--------------------------------------------------------------------------------
/NLP_with_SpaCy/BehindSpacy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/BehindSpacy.jpg
--------------------------------------------------------------------------------
/NLP_with_SpaCy/SpaCy_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/SpaCy_logo.png
--------------------------------------------------------------------------------
/NLP_with_SpaCy/quotesfile.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/quotesfile.txt
--------------------------------------------------------------------------------
/NLP_with_SpaCy/imageredacted.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_SpaCy/imageredacted.jpg
--------------------------------------------------------------------------------
/NLP_with_Textacy/textacylogo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_Textacy/textacylogo1.png
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/nlu_nlp_explain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/Intent Classification With Rasa - Spacy/nlu_nlp_explain.png
--------------------------------------------------------------------------------
/samplefile.txt:
--------------------------------------------------------------------------------
1 | The best error message is the one that never shows up.
2 | You Learn More From Failure Than From Success.
3 | The purpose of software engineering is to control complexity, not to create it
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/entity_synonyms.json:
--------------------------------------------------------------------------------
1 | {
2 | "chinese": "chinese",
3 | "chines": "chinese",
4 | "veggie": "vegetarian",
5 | "vegg": "vegetarian"
6 | }
--------------------------------------------------------------------------------
/NLP_with_Flair/text_classification_with_flair_workflow_jcharistech.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/NLP_with_Flair/text_classification_with_flair_workflow_jcharistech.png
--------------------------------------------------------------------------------
/NLP_with_SpaCy/samplefile.txt:
--------------------------------------------------------------------------------
1 | The best error message is the one that never shows up.
2 | You Learn More From Failure Than From Success.
3 | The purpose of software engineering is to control complexity, not to create it
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/crf_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/crf_model.pkl
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/intent_classifier_sklearn.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jcharis/Natural-Language-Processing-Tutorials/HEAD/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/intent_classifier_sklearn.pkl
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/projects/default/model_20180602-072117/regex_featurizer.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "greet",
4 | "pattern": "hey[^\\s]*"
5 | },
6 | {
7 | "name": "zipcode",
8 | "pattern": "[0-9]{5}"
9 | }
10 | ]
--------------------------------------------------------------------------------
/NLP_with_Textacy/README.md:
--------------------------------------------------------------------------------
1 | ### Natural Language Processing with Textacy & SpaCy
2 | - Textacy is a library built on top of the high-performance NLP library spaCy.
3 | - Useful for text preprocessing
4 | - Topic modelling
5 | - Information extraction
6 | - Keyterm extraction
7 | - Emotional valence analysis
8 | - Many more (see the short usage sketch below)
9 |
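10 | #### Minimal usage sketch
11 | - A rough illustration of the basic workflow (assumes a recent textacy release and that the `en_core_web_sm` spaCy model is installed; exact function names vary across textacy versions):
12 | 
13 | ```python
14 | import textacy
15 | from textacy import extract
16 | 
17 | text = "Textacy builds on spaCy to provide higher-level text analysis."
18 | 
19 | # Create a spaCy Doc via textacy (assumes en_core_web_sm is installed)
20 | doc = textacy.make_spacy_doc(text, lang="en_core_web_sm")
21 | 
22 | # Basic information extraction: bigrams (stop words filtered) and named entities
23 | print(list(extract.ngrams(doc, 2, filter_stops=True)))
24 | print(list(extract.entities(doc)))
25 | ```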
--------------------------------------------------------------------------------
/NLP with JavaScript/index.js:
--------------------------------------------------------------------------------
1 | var Sentiment = require('sentiment');
2 | var sentiment = new Sentiment();
3 |
4 | var docx = sentiment.analyze("I like apples");
5 | console.log(docx);
6 |
7 | // Applying to An Array
8 | var mydocx = ["I love apples","I don't eat pepper","the movie was very nice","this book is the best"]
9 |
10 | mydocx.forEach(function(s){
11 | console.log(sentiment.analyze(s));
12 | })
13 |
14 |
--------------------------------------------------------------------------------
/quotesfiles.txt:
--------------------------------------------------------------------------------
1 | First, solve the problem. Then, write the code.
2 | Fix the cause, not the symptom.
3 | Simplicity is the soul of efficiency.
4 | Good design adds value faster than it adds cost.
5 | In theory, theory and practice are the same. In practice, they’re not.
6 | There are two ways of constructing a software design.
7 | One way is to make it so simple that there are obviously no deficiencies.
8 | And the other way is to make it so complicated that there are no obvious deficiencies.
--------------------------------------------------------------------------------
/NLP_with_SpaCy/quotesfiles.txt:
--------------------------------------------------------------------------------
1 | First, solve the problem. Then, write the code.
2 | Fix the cause, not the symptom.
3 | Simplicity is the soul of efficiency.
4 | Good design adds value faster than it adds cost.
5 | In theory, theory and practice are the same. In practice, they’re not.
6 | There are two ways of constructing a software design.
7 | One way is to make it so simple that there are obviously no deficiencies.
8 | And the other way is to make it so complicated that there are no obvious deficiencies.
--------------------------------------------------------------------------------
/NLP-with-JavaScript.md:
--------------------------------------------------------------------------------
1 | ## Natural Language Processing with JavaScript
2 | + understanding everyday language
3 |
4 | #### Common Libraries & Packages
5 | + compromise.js
6 | + natural
7 | + sentiment
8 | + franc
9 | + talisman
10 | + etc
11 |
12 | #### NLP with Compromise.js
13 | + Tokenization
14 | + Part of Speech Tagging
15 | + Word transformation
16 | + Entity Recognition
17 | + Match Finding
18 | + etc
19 |
20 | #### NLP with Sentiment.js
21 | + For Sentiment Analysis
22 |
23 | #### NLP with Franc
24 | + Language Detection
25 |
26 |
27 |
28 | ###### .
29 | + J-Secur1ty
30 | + Jesus Saves @ JCharisTech
31 |
32 |
--------------------------------------------------------------------------------
/NLP with JavaScript/NLP-with-JavaScript.md:
--------------------------------------------------------------------------------
1 | ## Natural Language Processing with JavaScript
2 | + understanding everyday language
3 |
4 | #### Common Libraries & Packages
5 | + compromise.js
6 | + natural
7 | + sentiment
8 | + franc
9 | + talisman
10 | + etc
11 |
12 | #### NLP with Compromise.js
13 | + Tokenization
14 | + Part of Speech Tagging
15 | + Word transformation
16 | + Entity Recognition
17 | + Match Finding
18 | + etc
19 |
20 | #### NLP with Sentiment.js
21 | + For Sentiment Analysis
22 |
23 | #### NLP with Franc
24 | + Language Detection
25 |
26 |
27 |
28 | ###### .
29 | + J-Secur1ty
30 | + Jesus Saves @ JCharisTech
31 |
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Natural-Language-Processing-Tutorials
2 | Natural Language Processing (NLP) tutorials with Python, Julia and JavaScript
3 |
4 |
5 | #### Contents
6 | + NLP with Python
7 | - Natural Language Processing with SpaCy
8 | - Natural Language Processing with TextBlob
9 | - Natural Language Processing with Polyglot
10 | - Natural Language Processing with Textacy
11 |
12 | + NLP with JavaScript
13 | - Natural Language Processing with Compromise.js
14 | - Natural Language Processing with Natural.js
15 | - Natural Language Processing with Sentiment.js
16 |
17 | + NLP with Julia
18 | - Natural Language Processing with TextAnalysis.jl
19 | - TextSummarization.jl
20 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
The reporter said that it was [REDACTED]that gave him the news in \n",
221 | "\n",
222 | " London\n",
223 | " GPE\n",
224 | "\n",
225 | " \n",
226 | "\n",
227 | " last year\n",
228 | " DATE\n",
229 | "\n",
230 | "
"
231 | ],
232 | "text/plain": [
233 | ""
234 | ]
235 | },
236 | "metadata": {},
237 | "output_type": "display_data"
238 | }
239 | ],
240 | "source": [
241 | "displacy.render(nlp(docx2),style='ent',jupyter=True)"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "#### Redaction/Sanitization of Location/GPE"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 13,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "# Redaction of Location/GPE\n",
258 | "def sanitize_locations(text):\n",
259 | " docx = nlp(text)\n",
260 | " redacted_sentences = []\n",
261 | " for ent in docx.ents:\n",
262 | " ent.merge()\n",
263 | " for token in docx:\n",
264 | " if token.ent_type_ == 'GPE':\n",
265 | " redacted_sentences.append(\"[REDACTED]\")\n",
266 | " else:\n",
267 | " redacted_sentences.append(token.string)\n",
268 | " return \"\".join(redacted_sentences)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 14,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "'The reporter said that it was John Mark that gave him the news in [REDACTED]last year'"
280 | ]
281 | },
282 | "execution_count": 14,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "sanitize_locations(ex1)"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 15,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "## Thanks For Watching\n",
298 | "# Jesse JCharis\n",
299 | "# J-Secur1ty\n",
300 | "# Jesus Saves@JCharisTech"
301 | ]
302 | }
303 | ],
304 | "metadata": {
305 | "kernelspec": {
306 | "display_name": "Python 3",
307 | "language": "python",
308 | "name": "python3"
309 | },
310 | "language_info": {
311 | "codemirror_mode": {
312 | "name": "ipython",
313 | "version": 3
314 | },
315 | "file_extension": ".py",
316 | "mimetype": "text/x-python",
317 | "name": "python",
318 | "nbconvert_exporter": "python",
319 | "pygments_lexer": "ipython3",
320 | "version": "3.6.7"
321 | }
322 | },
323 | "nbformat": 4,
324 | "nbformat_minor": 2
325 | }
326 |
--------------------------------------------------------------------------------
/How to Find the Most Common Words Using Spacy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### How to Find the Most Common Words Using SpaCy & Python"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 7,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Load Our Packages\n",
17 | "import spacy\n",
18 | "from collections import Counter"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 8,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "nlp = spacy.load('en')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 9,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "docx = nlp(open('luke6.txt').read())"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 10,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "Blessings and Woes\n",
48 | "He went down with them and stood on a level place. A large crowd of his disciples was there and a great number of people from all over Judea, from Jerusalem, and from the coastal region around Tyre and Sidon, who had come to hear him and to be healed of their diseases. Those troubled by impure spirits were cured, and the people all tried to touch him, because power was coming from him and healing them all. Looking at his disciples, he said: \"Blessed are you who are poor, for yours is the kingdom of God. Blessed are you who hunger now, for you will be satisfied. Blessed are you who weep now, for you will laugh. Blessed are you when people hate you, when they exclude you and insult you and reject your name as evil, because of the Son of Man. \"Rejoice in that day and leap for joy, because great is your reward in heaven. For that is how their ancestors treated the prophets. \"But woe to you who are rich, for you have already received your comfort. Woe to you who are well fed now, for you will go hungry. Woe to you who laugh now, for you will mourn and weep. Woe to you when everyone speaks well of you, for that is how their ancestors treated the false prophets.\n",
49 | "Love for Enemies\n",
50 | "\"But to you who are listening I say: Love your enemies, do good to those who hate you, bless those who curse you, pray for those who mistreat you. If someone slaps you on one cheek, turn to them the other also. If someone takes your coat, do not withhold your shirt from them. Give to everyone who asks you, and if anyone takes what belongs to you, do not demand it back. Do to others as you would have them do to you. \"If you love those who love you, what credit is that to you? Even sinners love those who love them. And if you do good to those who are good to you, what credit is that to you? Even sinners do that. And if you lend to those from whom you expect repayment, what credit is that to you? Even sinners lend to sinners, expecting to be repaid in full. But love your enemies, do good to them, and lend to them without expecting to get anything back. Then your reward will be great, and you will be children of the Most High, because he is kind to the ungrateful and wicked. Be merciful, just as your Father is merciful.\n",
51 | "Judging Others\n",
52 | "\"Do not judge, and you will not be judged. Do not condemn, and you will not be condemned. Forgive, and you will be forgiven. Give, and it will be given to you. A good measure, pressed down, shaken together and running over, will be poured into your lap. For with the measure you use, it will be measured to you.\" He also told them this parable: \"Can the blind lead the blind? Will they not both fall into a pit? The student is not above the teacher, but everyone who is fully trained will be like their teacher. \"Why do you look at the speck of sawdust in your brother’s eye and pay no attention to the plank in your own eye? How can you say to your brother, 'Brother, let me take the speck out of your eye,'when you yourself fail to see the plank in your own eye? You hypocrite, first take the plank out of your eye, and then you will see clearly to remove the speck from your brother’s eye.\n",
53 | "A Tree and Its Fruit\n",
54 | "\"No good tree bears bad fruit, nor does a bad tree bear good fruit. Each tree is recognized by its own fruit. People do not pick figs from thornbushes, or grapes from briers. A good man brings good things out of the good stored up in his heart, and an evil man brings evil things out of the evil stored up in his heart. For the mouth speaks what the heart is full of.\n",
55 | "The Wise and Foolish Builders\n",
56 | "\"Why do you call me, 'Lord, Lord,'and do not do what I say? As for everyone who comes to me and hears my words and puts them into practice, I will show you what they are like. They are like a man building a house, who dug down deep and laid the foundation on rock. When a flood came, the torrent struck that house but could not shake it, because it was well built. But the one who hears my words and does not put them into practice is like a man who built a house on the ground without a foundation. The moment the torrent struck that house, it collapsed and its destruction was complete.\""
57 | ]
58 | },
59 | "execution_count": 10,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "docx"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 12,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# Remove Punct,Stop \n",
75 | "# Nouns\n",
76 | "nouns = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'NOUN']\n"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 13,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/plain": [
87 | "['Blessings',\n",
88 | " 'level',\n",
89 | " 'place',\n",
90 | " 'crowd',\n",
91 | " 'disciples',\n",
92 | " 'number',\n",
93 | " 'people',\n",
94 | " 'region',\n",
95 | " 'diseases',\n",
96 | " 'impure',\n",
97 | " 'spirits',\n",
98 | " 'people',\n",
99 | " 'power',\n",
100 | " 'disciples',\n",
101 | " 'kingdom',\n",
102 | " 'people',\n",
103 | " 'evil',\n",
104 | " 'day',\n",
105 | " 'leap',\n",
106 | " 'joy',\n",
107 | " 'reward',\n",
108 | " 'ancestors',\n",
109 | " 'prophets',\n",
110 | " 'woe',\n",
111 | " 'comfort',\n",
112 | " 'Woe',\n",
113 | " 'Woe',\n",
114 | " 'ancestors',\n",
115 | " 'prophets',\n",
116 | " 'enemies',\n",
117 | " 'good',\n",
118 | " 'cheek',\n",
119 | " 'coat',\n",
120 | " 'shirt',\n",
121 | " 'credit',\n",
122 | " 'sinners',\n",
123 | " 'good',\n",
124 | " 'credit',\n",
125 | " 'sinners',\n",
126 | " 'repayment',\n",
127 | " 'credit',\n",
128 | " 'sinners',\n",
129 | " 'sinners',\n",
130 | " 'enemies',\n",
131 | " 'good',\n",
132 | " 'reward',\n",
133 | " 'children',\n",
134 | " 'Others',\n",
135 | " 'measure',\n",
136 | " 'lap',\n",
137 | " 'measure',\n",
138 | " 'parable',\n",
139 | " 'pit',\n",
140 | " 'student',\n",
141 | " 'teacher',\n",
142 | " 'teacher',\n",
143 | " 'speck',\n",
144 | " 'sawdust',\n",
145 | " 'brotherâ€',\n",
146 | " 'eye',\n",
147 | " 'attention',\n",
148 | " 'plank',\n",
149 | " 'eye',\n",
150 | " 'brother',\n",
151 | " 'Brother',\n",
152 | " 'speck',\n",
153 | " 'plank',\n",
154 | " 'eye',\n",
155 | " 'plank',\n",
156 | " 'eye',\n",
157 | " 'speck',\n",
158 | " 'brotherâ€',\n",
159 | " 'eye',\n",
160 | " 'tree',\n",
161 | " 'fruit',\n",
162 | " 'tree',\n",
163 | " 'fruit',\n",
164 | " 'tree',\n",
165 | " 'fruit',\n",
166 | " 'People',\n",
167 | " 'figs',\n",
168 | " 'thornbushes',\n",
169 | " 'grapes',\n",
170 | " 'briers',\n",
171 | " 'man',\n",
172 | " 'things',\n",
173 | " 'good',\n",
174 | " 'heart',\n",
175 | " 'man',\n",
176 | " 'things',\n",
177 | " 'evil',\n",
178 | " 'heart',\n",
179 | " 'mouth',\n",
180 | " 'heart',\n",
181 | " 'words',\n",
182 | " 'practice',\n",
183 | " 'man',\n",
184 | " 'house',\n",
185 | " 'foundation',\n",
186 | " 'rock',\n",
187 | " 'flood',\n",
188 | " 'torrent',\n",
189 | " 'house',\n",
190 | " 'words',\n",
191 | " 'practice',\n",
192 | " 'man',\n",
193 | " 'house',\n",
194 | " 'ground',\n",
195 | " 'foundation',\n",
196 | " 'moment',\n",
197 | " 'torrent',\n",
198 | " 'house',\n",
199 | " 'destruction']"
200 | ]
201 | },
202 | "execution_count": 13,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "nouns"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 14,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "word_freq = Counter(nouns)"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 15,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "common_nouns = word_freq.most_common(10)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 16,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "name": "stdout",
236 | "output_type": "stream",
237 | "text": [
238 | "[('eye', 5), ('good', 4), ('sinners', 4), ('man', 4), ('house', 4), ('people', 3), ('credit', 3), ('speck', 3), ('plank', 3), ('tree', 3)]\n"
239 | ]
240 | }
241 | ],
242 | "source": [
243 | "print(common_nouns)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": []
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "### Most Common Verbs\n",
258 | "+ Some stops words can also be verbs"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 17,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "# Remove Punct,Stop \n",
268 | "# verbs\n",
269 | "verbs = [ token.text for token in docx if token.is_punct !=True and token.pos_ == 'VERB']\n"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 18,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "[('will', 15), ('is', 14), ('be', 12), ('do', 12), ('are', 11), ('love', 5), ('was', 4), ('Blessed', 4), ('say', 3), ('Do', 3)]\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "print(Counter(verbs).most_common(10))"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 19,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "verbs_with_stopword = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'VERB']"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 20,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "name": "stdout",
305 | "output_type": "stream",
306 | "text": [
307 | "[('love', 5), ('Blessed', 4), ('Do', 3), ('lend', 3), ('weep', 2), ('laugh', 2), ('hate', 2), ('treated', 2), ('speaks', 2), ('takes', 2)]\n"
308 | ]
309 | }
310 | ],
311 | "source": [
312 | "print(Counter(verbs_with_stopword).most_common(10))"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": []
321 | }
322 | ],
323 | "metadata": {
324 | "kernelspec": {
325 | "display_name": "Python 3",
326 | "language": "python",
327 | "name": "python3"
328 | },
329 | "language_info": {
330 | "codemirror_mode": {
331 | "name": "ipython",
332 | "version": 3
333 | },
334 | "file_extension": ".py",
335 | "mimetype": "text/x-python",
336 | "name": "python",
337 | "nbconvert_exporter": "python",
338 | "pygments_lexer": "ipython3",
339 | "version": "3.6.3"
340 | }
341 | },
342 | "nbformat": 4,
343 | "nbformat_minor": 2
344 | }
345 |
--------------------------------------------------------------------------------
/NLP_with_SpaCy/How to Find the Most Common Words Using Spacy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### How to Find the Most Common Words Using SpaCy & Python"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 7,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Load Our Packages\n",
17 | "import spacy\n",
18 | "from collections import Counter"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 8,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "nlp = spacy.load('en')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 9,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "docx = nlp(open('luke6.txt').read())"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 10,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "Blessings and Woes\n",
48 | "He went down with them and stood on a level place. A large crowd of his disciples was there and a great number of people from all over Judea, from Jerusalem, and from the coastal region around Tyre and Sidon, who had come to hear him and to be healed of their diseases. Those troubled by impure spirits were cured, and the people all tried to touch him, because power was coming from him and healing them all. Looking at his disciples, he said: \"Blessed are you who are poor, for yours is the kingdom of God. Blessed are you who hunger now, for you will be satisfied. Blessed are you who weep now, for you will laugh. Blessed are you when people hate you, when they exclude you and insult you and reject your name as evil, because of the Son of Man. \"Rejoice in that day and leap for joy, because great is your reward in heaven. For that is how their ancestors treated the prophets. \"But woe to you who are rich, for you have already received your comfort. Woe to you who are well fed now, for you will go hungry. Woe to you who laugh now, for you will mourn and weep. Woe to you when everyone speaks well of you, for that is how their ancestors treated the false prophets.\n",
49 | "Love for Enemies\n",
50 | "\"But to you who are listening I say: Love your enemies, do good to those who hate you, bless those who curse you, pray for those who mistreat you. If someone slaps you on one cheek, turn to them the other also. If someone takes your coat, do not withhold your shirt from them. Give to everyone who asks you, and if anyone takes what belongs to you, do not demand it back. Do to others as you would have them do to you. \"If you love those who love you, what credit is that to you? Even sinners love those who love them. And if you do good to those who are good to you, what credit is that to you? Even sinners do that. And if you lend to those from whom you expect repayment, what credit is that to you? Even sinners lend to sinners, expecting to be repaid in full. But love your enemies, do good to them, and lend to them without expecting to get anything back. Then your reward will be great, and you will be children of the Most High, because he is kind to the ungrateful and wicked. Be merciful, just as your Father is merciful.\n",
51 | "Judging Others\n",
52 | "\"Do not judge, and you will not be judged. Do not condemn, and you will not be condemned. Forgive, and you will be forgiven. Give, and it will be given to you. A good measure, pressed down, shaken together and running over, will be poured into your lap. For with the measure you use, it will be measured to you.\" He also told them this parable: \"Can the blind lead the blind? Will they not both fall into a pit? The student is not above the teacher, but everyone who is fully trained will be like their teacher. \"Why do you look at the speck of sawdust in your brother’s eye and pay no attention to the plank in your own eye? How can you say to your brother, 'Brother, let me take the speck out of your eye,'when you yourself fail to see the plank in your own eye? You hypocrite, first take the plank out of your eye, and then you will see clearly to remove the speck from your brother’s eye.\n",
53 | "A Tree and Its Fruit\n",
54 | "\"No good tree bears bad fruit, nor does a bad tree bear good fruit. Each tree is recognized by its own fruit. People do not pick figs from thornbushes, or grapes from briers. A good man brings good things out of the good stored up in his heart, and an evil man brings evil things out of the evil stored up in his heart. For the mouth speaks what the heart is full of.\n",
55 | "The Wise and Foolish Builders\n",
56 | "\"Why do you call me, 'Lord, Lord,'and do not do what I say? As for everyone who comes to me and hears my words and puts them into practice, I will show you what they are like. They are like a man building a house, who dug down deep and laid the foundation on rock. When a flood came, the torrent struck that house but could not shake it, because it was well built. But the one who hears my words and does not put them into practice is like a man who built a house on the ground without a foundation. The moment the torrent struck that house, it collapsed and its destruction was complete.\""
57 | ]
58 | },
59 | "execution_count": 10,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "docx"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 12,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# Remove Punct,Stop \n",
75 | "# Nouns\n",
76 | "nouns = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'NOUN']\n"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 13,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/plain": [
87 | "['Blessings',\n",
88 | " 'level',\n",
89 | " 'place',\n",
90 | " 'crowd',\n",
91 | " 'disciples',\n",
92 | " 'number',\n",
93 | " 'people',\n",
94 | " 'region',\n",
95 | " 'diseases',\n",
96 | " 'impure',\n",
97 | " 'spirits',\n",
98 | " 'people',\n",
99 | " 'power',\n",
100 | " 'disciples',\n",
101 | " 'kingdom',\n",
102 | " 'people',\n",
103 | " 'evil',\n",
104 | " 'day',\n",
105 | " 'leap',\n",
106 | " 'joy',\n",
107 | " 'reward',\n",
108 | " 'ancestors',\n",
109 | " 'prophets',\n",
110 | " 'woe',\n",
111 | " 'comfort',\n",
112 | " 'Woe',\n",
113 | " 'Woe',\n",
114 | " 'ancestors',\n",
115 | " 'prophets',\n",
116 | " 'enemies',\n",
117 | " 'good',\n",
118 | " 'cheek',\n",
119 | " 'coat',\n",
120 | " 'shirt',\n",
121 | " 'credit',\n",
122 | " 'sinners',\n",
123 | " 'good',\n",
124 | " 'credit',\n",
125 | " 'sinners',\n",
126 | " 'repayment',\n",
127 | " 'credit',\n",
128 | " 'sinners',\n",
129 | " 'sinners',\n",
130 | " 'enemies',\n",
131 | " 'good',\n",
132 | " 'reward',\n",
133 | " 'children',\n",
134 | " 'Others',\n",
135 | " 'measure',\n",
136 | " 'lap',\n",
137 | " 'measure',\n",
138 | " 'parable',\n",
139 | " 'pit',\n",
140 | " 'student',\n",
141 | " 'teacher',\n",
142 | " 'teacher',\n",
143 | " 'speck',\n",
144 | " 'sawdust',\n",
145 | " 'brotherâ€',\n",
146 | " 'eye',\n",
147 | " 'attention',\n",
148 | " 'plank',\n",
149 | " 'eye',\n",
150 | " 'brother',\n",
151 | " 'Brother',\n",
152 | " 'speck',\n",
153 | " 'plank',\n",
154 | " 'eye',\n",
155 | " 'plank',\n",
156 | " 'eye',\n",
157 | " 'speck',\n",
158 | " 'brotherâ€',\n",
159 | " 'eye',\n",
160 | " 'tree',\n",
161 | " 'fruit',\n",
162 | " 'tree',\n",
163 | " 'fruit',\n",
164 | " 'tree',\n",
165 | " 'fruit',\n",
166 | " 'People',\n",
167 | " 'figs',\n",
168 | " 'thornbushes',\n",
169 | " 'grapes',\n",
170 | " 'briers',\n",
171 | " 'man',\n",
172 | " 'things',\n",
173 | " 'good',\n",
174 | " 'heart',\n",
175 | " 'man',\n",
176 | " 'things',\n",
177 | " 'evil',\n",
178 | " 'heart',\n",
179 | " 'mouth',\n",
180 | " 'heart',\n",
181 | " 'words',\n",
182 | " 'practice',\n",
183 | " 'man',\n",
184 | " 'house',\n",
185 | " 'foundation',\n",
186 | " 'rock',\n",
187 | " 'flood',\n",
188 | " 'torrent',\n",
189 | " 'house',\n",
190 | " 'words',\n",
191 | " 'practice',\n",
192 | " 'man',\n",
193 | " 'house',\n",
194 | " 'ground',\n",
195 | " 'foundation',\n",
196 | " 'moment',\n",
197 | " 'torrent',\n",
198 | " 'house',\n",
199 | " 'destruction']"
200 | ]
201 | },
202 | "execution_count": 13,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "nouns"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 14,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "word_freq = Counter(nouns)"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 15,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "common_nouns = word_freq.most_common(10)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 16,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "name": "stdout",
236 | "output_type": "stream",
237 | "text": [
238 | "[('eye', 5), ('good', 4), ('sinners', 4), ('man', 4), ('house', 4), ('people', 3), ('credit', 3), ('speck', 3), ('plank', 3), ('tree', 3)]\n"
239 | ]
240 | }
241 | ],
242 | "source": [
243 | "print(common_nouns)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": []
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "### Most Common Verbs\n",
258 | "+ Some stops words can also be verbs"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 17,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "# Remove Punct,Stop \n",
268 | "# verbs\n",
269 | "verbs = [ token.text for token in docx if token.is_punct !=True and token.pos_ == 'VERB']\n"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 18,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "[('will', 15), ('is', 14), ('be', 12), ('do', 12), ('are', 11), ('love', 5), ('was', 4), ('Blessed', 4), ('say', 3), ('Do', 3)]\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "print(Counter(verbs).most_common(10))"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 19,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "verbs_with_stopword = [ token.text for token in docx if token.is_stop != True and token.is_punct !=True and token.pos_ == 'VERB']"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 20,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "name": "stdout",
305 | "output_type": "stream",
306 | "text": [
307 | "[('love', 5), ('Blessed', 4), ('Do', 3), ('lend', 3), ('weep', 2), ('laugh', 2), ('hate', 2), ('treated', 2), ('speaks', 2), ('takes', 2)]\n"
308 | ]
309 | }
310 | ],
311 | "source": [
312 | "print(Counter(verbs_with_stopword).most_common(10))"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": []
321 | }
322 | ],
323 | "metadata": {
324 | "kernelspec": {
325 | "display_name": "Python 3",
326 | "language": "python",
327 | "name": "python3"
328 | },
329 | "language_info": {
330 | "codemirror_mode": {
331 | "name": "ipython",
332 | "version": 3
333 | },
334 | "file_extension": ".py",
335 | "mimetype": "text/x-python",
336 | "name": "python",
337 | "nbconvert_exporter": "python",
338 | "pygments_lexer": "ipython3",
339 | "version": "3.6.3"
340 | }
341 | },
342 | "nbformat": 4,
343 | "nbformat_minor": 2
344 | }
345 |
--------------------------------------------------------------------------------
/NLP_with_Polyglot/NLP with Polyglot .ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Natural Language Processing with Polyglot\n",
8 | "\n",
9 | "#### Installation on Unix\n",
10 | "+ sudo apt-get install python-numpy libicu-dev\n",
11 | "+ pip install polyglot\n",
12 | "\n",
13 | "#### Installation on Windows\n",
14 | "\n",
15 | "##### Download the PyCLD2 and PyICU From \n",
16 | " - https://www.lfd.uci.edu/~gohlke/pythonlibs/\n",
17 | "- pip install pycld2-0.31-cp36-cp36m-win_amd64.whl\n",
18 | "- pip install PyICU-1.9.8-cp36-cp36m-win_amd64.whl\n",
19 | "- pip install Morfessor-2.0.4-py2.py3-none-any.whl\n",
20 | "- git clone https://github.com/aboSamoor/polyglot.git\n",
21 | "- python setup.py install\n",
22 | "\n",
23 | "\n",
24 | "- polyglot download embeddings2.en\n",
25 | "- polyglot download ner2.en\n",
26 | "- polyglot download sentiment2.en\n",
27 | "- polyglot download pos2.en\n",
28 | "- polyglot download morph2.en\n",
29 | "- polyglot download transliteration2.ar\n",
30 | "\n",
31 | "#### Uses and Application\n",
32 | "+ Fundamentals or Basics of NLP\n",
33 | "+ Transliteration\n",
34 | "+ Named Entity Recognition\n",
35 | "+ Sentiment Analysis\n",
36 | "\n",
37 | "##### NB similar learning curve like TextBlob API"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "#### Tokenization\n",
45 | "+ Splitting text into words"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 47,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# Load packages\n",
55 | "import polyglot\n",
56 | "from polyglot.text import Text,Word"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 48,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# Word Tokens\n",
66 | "docx = Text(u\"He likes reading and painting\")\n"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 49,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "WordList(['He', 'likes', 'reading', 'and', 'painting'])"
78 | ]
79 | },
80 | "execution_count": 49,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": [
86 | "docx.words"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 50,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "docx2 = Text(u\"He exclaimed, 'what're you doing? Reading?'.\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 51,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "WordList(['He', 'exclaimed', ',', \"'\", \"what're\", 'you', 'doing', '?', 'Reading', '?', \"'\", '.'])"
107 | ]
108 | },
109 | "execution_count": 51,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "docx2.words"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 52,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Sentence tokens\n",
125 | "docx3 = Text(u\"He likes reading and painting.He exclaimed, 'what're you doing? Reading?'.\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 53,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "data": {
135 | "text/plain": [
136 | "[Sentence(\"He likes reading and painting.He exclaimed, 'what're you doing?\"),\n",
137 | " Sentence(\"Reading?'.\")]"
138 | ]
139 | },
140 | "execution_count": 53,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "docx3.sentences"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": []
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "#### Parts of Speech Tagging\n",
161 | "+ polyglot download embeddings2.la\n",
162 | "+ pos_tags\n"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 54,
168 | "metadata": {},
169 | "outputs": [
170 | {
171 | "data": {
172 | "text/plain": [
173 | "Text(\"He likes reading and painting\")"
174 | ]
175 | },
176 | "execution_count": 54,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "docx"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 55,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "data": {
192 | "text/plain": [
193 | "[('He', 'PRON'),\n",
194 | " ('likes', 'VERB'),\n",
195 | " ('reading', 'VERB'),\n",
196 | " ('and', 'CONJ'),\n",
197 | " ('painting', 'NOUN')]"
198 | ]
199 | },
200 | "execution_count": 55,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "docx.pos_tags\n",
207 | " "
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "#### Language Detection\n",
215 | "+ polyglot.detect\n",
216 | "+ language.name\n",
217 | "+ language.code"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 56,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "data": {
227 | "text/plain": [
228 | "Text(\"He likes reading and painting\")"
229 | ]
230 | },
231 | "execution_count": 56,
232 | "metadata": {},
233 | "output_type": "execute_result"
234 | }
235 | ],
236 | "source": [
237 | "docx"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 57,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "data": {
247 | "text/plain": [
248 | "'English'"
249 | ]
250 | },
251 | "execution_count": 57,
252 | "metadata": {},
253 | "output_type": "execute_result"
254 | }
255 | ],
256 | "source": [
257 | "docx.language.name"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 58,
263 | "metadata": {},
264 | "outputs": [
265 | {
266 | "data": {
267 | "text/plain": [
268 | "'en'"
269 | ]
270 | },
271 | "execution_count": 58,
272 | "metadata": {},
273 | "output_type": "execute_result"
274 | }
275 | ],
276 | "source": [
277 | "docx.language.code"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 59,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "from polyglot.detect import Detector"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 60,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "en_text = \"He is a student \"\n",
296 | "fr_text = \"Il est un étudiant\"\n",
297 | "ru_text = \"Он студент\""
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 67,
303 | "metadata": {},
304 | "outputs": [
305 | {
306 | "name": "stderr",
307 | "output_type": "stream",
308 | "text": [
309 | "Detector is not able to detect the language reliably.\n",
310 | "Detector is not able to detect the language reliably.\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "detect_en = Detector(en_text)\n",
316 | "detect_fr = Detector(fr_text)\n",
317 | "detect_ru = Detector(ru_text)"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 63,
323 | "metadata": {},
324 | "outputs": [
325 | {
326 | "name": "stdout",
327 | "output_type": "stream",
328 | "text": [
329 | "name: English code: en confidence: 94.0 read bytes: 704\n"
330 | ]
331 | }
332 | ],
333 | "source": [
334 | "print(detect_en.language)"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 66,
340 | "metadata": {},
341 | "outputs": [
342 | {
343 | "name": "stdout",
344 | "output_type": "stream",
345 | "text": [
346 | "name: French code: fr confidence: 95.0 read bytes: 870\n"
347 | ]
348 | }
349 | ],
350 | "source": [
351 | "print(detect_fr.language)"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 68,
357 | "metadata": {},
358 | "outputs": [
359 | {
360 | "name": "stdout",
361 | "output_type": "stream",
362 | "text": [
363 | "name: Serbian code: sr confidence: 95.0 read bytes: 614\n"
364 | ]
365 | }
366 | ],
367 | "source": [
368 | "print(detect_ru.language)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": []
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "#### Sentiment Analysis\n",
383 | "+ polarity"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 71,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "docx4 = Text(u\"He hates reading and playing\")"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 69,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/plain": [
403 | "Text(\"He likes reading and painting\")"
404 | ]
405 | },
406 | "execution_count": 69,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "docx"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 70,
418 | "metadata": {},
419 | "outputs": [
420 | {
421 | "data": {
422 | "text/plain": [
423 | "1.0"
424 | ]
425 | },
426 | "execution_count": 70,
427 | "metadata": {},
428 | "output_type": "execute_result"
429 | }
430 | ],
431 | "source": [
432 | "docx.polarity"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 72,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/plain": [
443 | "-1.0"
444 | ]
445 | },
446 | "execution_count": 72,
447 | "metadata": {},
448 | "output_type": "execute_result"
449 | }
450 | ],
451 | "source": [
452 | "docx4.polarity"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "metadata": {},
458 | "source": [
459 | "#### Named Entities\n",
460 | "+ entities"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 73,
466 | "metadata": {},
467 | "outputs": [],
468 | "source": [
469 | "docx5 = Text(u\"John Jones was a FBI detector\")"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 74,
475 | "metadata": {},
476 | "outputs": [
477 | {
478 | "data": {
479 | "text/plain": [
480 | "[I-PER(['John', 'Jones']), I-ORG(['FBI'])]"
481 | ]
482 | },
483 | "execution_count": 74,
484 | "metadata": {},
485 | "output_type": "execute_result"
486 | }
487 | ],
488 | "source": [
489 | "docx5.entities"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "metadata": {},
495 | "source": [
496 | "#### Morphology\n",
497 | "+ morpheme is the smallest grammatical unit in a language. \n",
498 | "+ morpheme may or may not stand alone, word, by definition, is freestanding. \n",
499 | "+ morphemes"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 75,
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "docx6 = Text(u\"preprocessing\")"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 76,
514 | "metadata": {},
515 | "outputs": [
516 | {
517 | "data": {
518 | "text/plain": [
519 | "WordList(['pre', 'process', 'ing'])"
520 | ]
521 | },
522 | "execution_count": 76,
523 | "metadata": {},
524 | "output_type": "execute_result"
525 | }
526 | ],
527 | "source": [
528 | "docx6.morphemes"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {},
534 | "source": [
535 | "#### Transliteration"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": 77,
541 | "metadata": {},
542 | "outputs": [],
543 | "source": [
544 | "# Load \n",
545 | "from polyglot.transliteration import Transliterator\n",
546 | "translit = Transliterator(source_lang='en',target_lang='fr')"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": 78,
552 | "metadata": {},
553 | "outputs": [
554 | {
555 | "data": {
556 | "text/plain": [
557 | "'working'"
558 | ]
559 | },
560 | "execution_count": 78,
561 | "metadata": {},
562 | "output_type": "execute_result"
563 | }
564 | ],
565 | "source": [
566 | "translit.transliterate(u\"working\")"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {},
573 | "outputs": [],
574 | "source": []
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "# Jesse JCharis\n",
583 | "# J-Secur1ty\n",
584 | "# Jesus Saves @JCharisTect"
585 | ]
586 | }
587 | ],
588 | "metadata": {
589 | "kernelspec": {
590 | "display_name": "Python 3",
591 | "language": "python",
592 | "name": "python3"
593 | },
594 | "language_info": {
595 | "codemirror_mode": {
596 | "name": "ipython",
597 | "version": 3
598 | },
599 | "file_extension": ".py",
600 | "mimetype": "text/x-python",
601 | "name": "python",
602 | "nbconvert_exporter": "python",
603 | "pygments_lexer": "ipython3",
604 | "version": "3.6.7"
605 | }
606 | },
607 | "nbformat": 4,
608 | "nbformat_minor": 2
609 | }
610 |
--------------------------------------------------------------------------------
/Intent Classification With Rasa - Spacy/Intent Classification With Rasa NLU and SpaCy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Intent Classification with Rasa NLU and SpaCy \n",
8 | "+ + A Libary for intent recognition and entity extraction based on SpaCy and Sklearn\n",
9 | "\n",
10 | "##### NLP = NLU+NLG+ More\n",
11 | "+ NLP = understand,process,interprete everyday human language\n",
12 | "+ NLU = unstructured inputs and convert them into a structured form that a machine can understand and act upon\n",
13 | "\n",
14 | "#### Uses\n",
15 | "+ Chatbot task\n",
16 | "+ NL understanding\n",
17 | "+ Intent classification\n",
18 | "\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | ""
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "#### Installation\n",
33 | "+ pip install rasa_nlu\n",
34 | "+ python -m rasa_nlu.server &\n",
35 | "+ sklearn_crfsuite\n",
36 | "\n",
37 | "#### using spacy as backend\n",
38 | "+ pip install rasa_nlu[spacy]\n",
39 | "+ python -m spacy download en_core_web_md\n",
40 | "+ python -m spacy link en_core_web_md en\n",
41 | " \n",
42 | " = = Dataset = =\n",
43 | "+ demo-rasa.json\n",
44 | "+ config_spacy.yaml"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 15,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Load the Packages\n",
54 | "from rasa_nlu.training_data import load_data\n",
55 | "from rasa_nlu.config import RasaNLUModelConfig\n",
56 | "from rasa_nlu.model import Trainer\n",
57 | "from rasa_nlu import config"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 16,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "{\n",
70 | " \"rasa_nlu_data\": {\n",
71 | " \"regex_features\": [\n",
72 | " {\n",
73 | " \"name\": \"zipcode\",\n",
74 | " \"pattern\": \"[0-9]{5}\"\n",
75 | " },\n",
76 | " {\n",
77 | " \"name\": \"greet\",\n",
78 | " \"pattern\": \"hey[^\\\\s]*\"\n",
79 | " }\n",
80 | " ],\n",
81 | " \"entity_synonyms\": [\n",
82 | " {\n",
83 | " \"value\": \"chinese\",\n",
84 | " \"synonyms\": [\"Chinese\", \"Chines\", \"chines\"]\n",
85 | " },\n",
86 | " {\n",
87 | " \"value\": \"vegetarian\",\n",
88 | " \"synonyms\": [\"veggie\", \"vegg\"]\n",
89 | " }\n",
90 | " ],\n",
91 | " \"common_examples\": [\n",
92 | " {\n",
93 | " \"text\": \"hey\", \n",
94 | " \"intent\": \"greet\", \n",
95 | " \"entities\": []\n",
96 | " }, \n",
97 | " {\n",
98 | " \"text\": \"howdy\", \n",
99 | " \"intent\": \"greet\", \n",
100 | " \"entities\": []\n",
101 | " }, \n",
102 | " {\n",
103 | " \"text\": \"hey there\",\n",
104 | " \"intent\": \"greet\", \n",
105 | " \"entities\": []\n",
106 | " }, \n",
107 | " {\n",
108 | " \"text\": \"hello\", \n",
109 | " \"intent\": \"greet\", \n",
110 | " \"entities\": []\n",
111 | " }, \n",
112 | " {\n",
113 | " \"text\": \"hi\", \n",
114 | " \"intent\": \"greet\", \n",
115 | " \"entities\": []\n",
116 | " },\n",
117 | " {\n",
118 | " \"text\": \"good morning\",\n",
119 | " \"intent\": \"greet\",\n",
120 | " \"entities\": []\n",
121 | " },\n",
122 | " {\n",
123 | " \"text\": \"good evening\",\n",
124 | " \"intent\": \"greet\",\n",
125 | " \"entities\": []\n",
126 | " },\n",
127 | " {\n",
128 | " \"text\": \"dear sir\",\n",
129 | " \"intent\": \"greet\",\n",
130 | " \"entities\": []\n",
131 | " },\n",
132 | " {\n",
133 | " \"text\": \"yes\", \n",
134 | " \"intent\": \"affirm\", \n",
135 | " \"entities\": []\n",
136 | " }, \n",
137 | " {\n",
138 | " \"text\": \"yep\", \n",
139 | " \"intent\": \"affirm\", \n",
140 | " \"entities\": []\n",
141 | " }, \n",
142 | " {\n",
143 | " \"text\": \"yeah\", \n",
144 | " \"intent\": \"affirm\", \n",
145 | " \"entities\": []\n",
146 | " },\n",
147 | " {\n",
148 | " \"text\": \"indeed\",\n",
149 | " \"intent\": \"affirm\",\n",
150 | " \"entities\": []\n",
151 | " },\n",
152 | " {\n",
153 | " \"text\": \"that's right\",\n",
154 | " \"intent\": \"affirm\",\n",
155 | " \"entities\": []\n",
156 | " },\n",
157 | " {\n",
158 | " \"text\": \"ok\",\n",
159 | " \"intent\": \"affirm\",\n",
160 | " \"entities\": []\n",
161 | " },\n",
162 | " {\n",
163 | " \"text\": \"great\",\n",
164 | " \"intent\": \"affirm\",\n",
165 | " \"entities\": []\n",
166 | " },\n",
167 | " {\n",
168 | " \"text\": \"right, thank you\",\n",
169 | " \"intent\": \"affirm\",\n",
170 | " \"entities\": []\n",
171 | " },\n",
172 | " {\n",
173 | " \"text\": \"correct\",\n",
174 | " \"intent\": \"affirm\",\n",
175 | " \"entities\": []\n",
176 | " },\n",
177 | " {\n",
178 | " \"text\": \"great choice\",\n",
179 | " \"intent\": \"affirm\",\n",
180 | " \"entities\": []\n",
181 | " },\n",
182 | " {\n",
183 | " \"text\": \"sounds really good\",\n",
184 | " \"intent\": \"affirm\",\n",
185 | " \"entities\": []\n",
186 | " },\n",
187 | " {\n",
188 | " \"text\": \"i'm looking for a place to eat\",\n",
189 | " \"intent\": \"restaurant_search\",\n",
190 | " \"entities\": []\n",
191 | " },\n",
192 | " {\n",
193 | " \"text\": \"I want to grab lunch\",\n",
194 | " \"intent\": \"restaurant_search\",\n",
195 | " \"entities\": []\n",
196 | " },\n",
197 | " {\n",
198 | " \"text\": \"I am searching for a dinner spot\",\n",
199 | " \"intent\": \"restaurant_search\",\n",
200 | " \"entities\": []\n",
201 | " },\n",
202 | " {\n",
203 | " \"text\": \"i'm looking for a place in the north of town\",\n",
204 | " \"intent\": \"restaurant_search\",\n",
205 | " \"entities\": [\n",
206 | " {\n",
207 | " \"start\": 31,\n",
208 | " \"end\": 36,\n",
209 | " \"value\": \"north\",\n",
210 | " \"entity\": \"location\"\n",
211 | " }\n",
212 | " ]\n",
213 | " },\n",
214 | " {\n",
215 | " \"text\": \"show me chinese restaurants\",\n",
216 | " \"intent\": \"restaurant_search\",\n",
217 | " \"entities\": [\n",
218 | " {\n",
219 | " \"start\": 8,\n",
220 | " \"end\": 15,\n",
221 | " \"value\": \"chinese\",\n",
222 | " \"entity\": \"cuisine\"\n",
223 | " }\n",
224 | " ]\n",
225 | " },\n",
226 | " {\n",
227 | " \"text\": \"show me chines restaurants in the north\",\n",
228 | " \"intent\": \"restaurant_search\",\n",
229 | " \"entities\": [\n",
230 | " {\n",
231 | " \"start\": 8,\n",
232 | " \"end\": 14,\n",
233 | " \"value\": \"chinese\",\n",
234 | " \"entity\": \"cuisine\"\n",
235 | " },\n",
236 | " {\n",
237 | " \"start\": 34,\n",
238 | " \"end\": 39,\n",
239 | " \"value\": \"north\",\n",
240 | " \"entity\": \"location\"\n",
241 | " }\n",
242 | " ]\n",
243 | " },\n",
244 | " {\n",
245 | " \"text\": \"show me a mexican place in the centre\", \n",
246 | " \"intent\": \"restaurant_search\", \n",
247 | " \"entities\": [\n",
248 | " {\n",
249 | " \"start\": 31, \n",
250 | " \"end\": 37, \n",
251 | " \"value\": \"centre\", \n",
252 | " \"entity\": \"location\"\n",
253 | " }, \n",
254 | " {\n",
255 | " \"start\": 10, \n",
256 | " \"end\": 17, \n",
257 | " \"value\": \"mexican\", \n",
258 | " \"entity\": \"cuisine\"\n",
259 | " }\n",
260 | " ]\n",
261 | " },\n",
262 | " {\n",
263 | " \"text\": \"i am looking for an indian spot called olaolaolaolaolaola\",\n",
264 | " \"intent\": \"restaurant_search\",\n",
265 | " \"entities\": [\n",
266 | " {\n",
267 | " \"start\": 20,\n",
268 | " \"end\": 26,\n",
269 | " \"value\": \"indian\",\n",
270 | " \"entity\": \"cuisine\"\n",
271 | " }\n",
272 | " ]\n",
273 | " }, {\n",
274 | " \"text\": \"search for restaurants\",\n",
275 | " \"intent\": \"restaurant_search\",\n",
276 | " \"entities\": []\n",
277 | " },\n",
278 | " {\n",
279 | " \"text\": \"anywhere in the west\",\n",
280 | " \"intent\": \"restaurant_search\",\n",
281 | " \"entities\": [\n",
282 | " {\n",
283 | " \"start\": 16,\n",
284 | " \"end\": 20,\n",
285 | " \"value\": \"west\",\n",
286 | " \"entity\": \"location\"\n",
287 | " }\n",
288 | " ]\n",
289 | " },\n",
290 | " {\n",
291 | " \"text\": \"anywhere near 18328\",\n",
292 | " \"intent\": \"restaurant_search\",\n",
293 | " \"entities\": [\n",
294 | " {\n",
295 | " \"start\": 14,\n",
296 | " \"end\": 19,\n",
297 | " \"value\": \"18328\",\n",
298 | " \"entity\": \"location\"\n",
299 | " }\n",
300 | " ]\n",
301 | " },\n",
302 | " {\n",
303 | " \"text\": \"I am looking for asian fusion food\",\n",
304 | " \"intent\": \"restaurant_search\",\n",
305 | " \"entities\": [\n",
306 | " {\n",
307 | " \"start\": 17,\n",
308 | " \"end\": 29,\n",
309 | " \"value\": \"asian fusion\",\n",
310 | " \"entity\": \"cuisine\"\n",
311 | " }\n",
312 | " ]\n",
313 | " },\n",
314 | " {\n",
315 | " \"text\": \"I am looking a restaurant in 29432\",\n",
316 | " \"intent\": \"restaurant_search\",\n",
317 | " \"entities\": [\n",
318 | " {\n",
319 | " \"start\": 29,\n",
320 | " \"end\": 34,\n",
321 | " \"value\": \"29432\",\n",
322 | " \"entity\": \"location\"\n",
323 | " }\n",
324 | " ]\n",
325 | " },\n",
326 | " {\n",
327 | " \"text\": \"I am looking for mexican indian fusion\",\n",
328 | " \"intent\": \"restaurant_search\",\n",
329 | " \"entities\": [\n",
330 | " {\n",
331 | " \"start\": 17,\n",
332 | " \"end\": 38,\n",
333 | " \"value\": \"mexican indian fusion\",\n",
334 | " \"entity\": \"cuisine\"\n",
335 | " }\n",
336 | " ]\n",
337 | " },\n",
338 | " {\n",
339 | " \"text\": \"central indian restaurant\",\n",
340 | " \"intent\": \"restaurant_search\",\n",
341 | " \"entities\": [\n",
342 | " {\n",
343 | " \"start\": 0,\n",
344 | " \"end\": 7,\n",
345 | " \"value\": \"central\",\n",
346 | " \"entity\": \"location\"\n",
347 | " },\n",
348 | " {\n",
349 | " \"start\": 8,\n",
350 | " \"end\": 14,\n",
351 | " \"value\": \"indian\",\n",
352 | " \"entity\": \"cuisine\"\n",
353 | " }\n",
354 | " ]\n",
355 | " },\n",
356 | " {\n",
357 | " \"text\": \"bye\", \n",
358 | " \"intent\": \"goodbye\", \n",
359 | " \"entities\": []\n",
360 | " }, \n",
361 | " {\n",
362 | " \"text\": \"goodbye\", \n",
363 | " \"intent\": \"goodbye\", \n",
364 | " \"entities\": []\n",
365 | " }, \n",
366 | " {\n",
367 | " \"text\": \"good bye\", \n",
368 | " \"intent\": \"goodbye\", \n",
369 | " \"entities\": []\n",
370 | " }, \n",
371 | " {\n",
372 | " \"text\": \"stop\", \n",
373 | " \"intent\": \"goodbye\", \n",
374 | " \"entities\": []\n",
375 | " }, \n",
376 | " {\n",
377 | " \"text\": \"end\", \n",
378 | " \"intent\": \"goodbye\", \n",
379 | " \"entities\": []\n",
380 | " },\n",
381 | " {\n",
382 | " \"text\": \"farewell\",\n",
383 | " \"intent\": \"goodbye\",\n",
384 | " \"entities\": []\n",
385 | " },\n",
386 | " {\n",
387 | " \"text\": \"Bye bye\",\n",
388 | " \"intent\": \"goodbye\",\n",
389 | " \"entities\": []\n",
390 | " },\n",
391 | " {\n",
392 | " \"text\": \"have a good one\",\n",
393 | " \"intent\": \"goodbye\",\n",
394 | " \"entities\": []\n",
395 | " }\n",
396 | " ]\n",
397 | " }\n",
398 | "}\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "# Load Data \n",
404 | "!cat rasa_dataset.json"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 17,
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "# Loading DataSet\n",
414 | "train_data = load_data('rasa_dataset.json')"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 18,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "# Config Backend using Sklearn and Spacy\n",
424 | "trainer = Trainer(config.load(\"config_spacy.yaml\"))"
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "#### Content on Config\n",
432 | " language: \"en\"\n",
433 | " pipeline: \"spacy_sklearn\"\n",
434 | "\n",
435 | " =======================\n",
436 | "\n",
437 | " language: \"en\"\n",
438 | "\n",
439 | " pipeline:\n",
440 | " - name: \"nlp_spacy\"\n",
441 | " - name: \"tokenizer_spacy\"\n",
442 | " - name: \"intent_entity_featurizer_regex\"\n",
443 | " - name: \"intent_featurizer_spacy\"\n",
444 | " - name: \"ner_crf\"\n",
445 | " - name: \"ner_synonyms\"\n",
446 | " - name: \"intent_classifier_sklearn\""
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 19,
452 | "metadata": {},
453 | "outputs": [
454 | {
455 | "name": "stdout",
456 | "output_type": "stream",
457 | "text": [
458 | "Fitting 2 folds for each of 6 candidates, totalling 12 fits\n"
459 | ]
460 | },
461 | {
462 | "name": "stderr",
463 | "output_type": "stream",
464 | "text": [
465 | "[Parallel(n_jobs=1)]: Done 12 out of 12 | elapsed: 0.3s finished\n"
466 | ]
467 | },
468 | {
469 | "data": {
470 | "text/plain": [
471 | ""
472 | ]
473 | },
474 | "execution_count": 19,
475 | "metadata": {},
476 | "output_type": "execute_result"
477 | }
478 | ],
479 | "source": [
480 | "# Training Data\n",
481 | "trainer.train(train_data)"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 20,
487 | "metadata": {},
488 | "outputs": [],
489 | "source": [
490 | "# Returns the directory the model is stored in (Creat a folder to store model in)\n",
491 | "model_directory = trainer.persist('/projects/')"
492 | ]
493 | },
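  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Querying the NLU Server (optional)\n",
    "The installation notes above mention python -m rasa_nlu.server &. Below is a minimal sketch of querying that server over HTTP; it assumes the default port 5000, the /parse endpoint with the text passed as q, and that the server was started with its model path pointing at the projects folder used above. Adjust the URL to your setup."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: query a running rasa_nlu HTTP server\n",
    "# Assumes the default port 5000 and the /parse endpoint with a 'q' parameter\n",
    "import requests\n",
    "\n",
    "response = requests.get('http://localhost:5000/parse', params={'q': 'show me chinese restaurants'})\n",
    "print(response.json())"
   ]
  },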
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 | "#### Entity Extraction With SpaCy"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 22,
504 | "metadata": {},
505 | "outputs": [],
506 | "source": [
507 | "import spacy\n",
508 | "nlp = spacy.load('en')"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 23,
514 | "metadata": {},
515 | "outputs": [],
516 | "source": [
517 | "docx = nlp(u\"I am looking for an Italian Restaurant where I can eat\")"
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 24,
523 | "metadata": {},
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "value Italian entity NORP start 20 end 27\n"
530 | ]
531 | }
532 | ],
533 | "source": [
534 | "for word in docx.ents:\n",
535 | " print(\"value\",word.text,\"entity\",word.label_,\"start\",word.start_char,\"end\",word.end_char)"
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "#### Making Predictions With Model\n",
543 | "+ Interpreter.load().parse()"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": 21,
549 | "metadata": {},
550 | "outputs": [],
551 | "source": [
552 | "from rasa_nlu.model import Metadata, Interpreter"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 25,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "# where `model_directory points to the folder the model is persisted in\n",
562 | "interpreter = Interpreter.load(model_directory)"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": 26,
568 | "metadata": {},
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "{'intent': {'name': 'restaurant_search', 'confidence': 0.7455215289019911},\n",
574 | " 'entities': [{'start': 20,\n",
575 | " 'end': 27,\n",
576 | " 'value': 'italian',\n",
577 | " 'entity': 'cuisine',\n",
578 | " 'confidence': 0.6636828413532201,\n",
579 | " 'extractor': 'ner_crf'}],\n",
580 | " 'intent_ranking': [{'name': 'restaurant_search',\n",
581 | " 'confidence': 0.7455215289019911},\n",
582 | " {'name': 'affirm', 'confidence': 0.15019642212447237},\n",
583 | " {'name': 'greet', 'confidence': 0.058736824115844515},\n",
584 | " {'name': 'goodbye', 'confidence': 0.045545224857692024}],\n",
585 | " 'text': 'I am looking for an Italian Restaurant where I can eat'}"
586 | ]
587 | },
588 | "execution_count": 26,
589 | "metadata": {},
590 | "output_type": "execute_result"
591 | }
592 | ],
593 | "source": [
594 | "# Prediction of Intent\n",
595 | "interpreter.parse(u\"I am looking for an Italian Restaurant where I can eat\")"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 27,
601 | "metadata": {},
602 | "outputs": [
603 | {
604 | "data": {
605 | "text/plain": [
606 | "{'intent': {'name': 'restaurant_search', 'confidence': 0.6874972430877329},\n",
607 | " 'entities': [{'start': 10,\n",
608 | " 'end': 17,\n",
609 | " 'value': 'african',\n",
610 | " 'entity': 'cuisine',\n",
611 | " 'confidence': 0.6470976966769572,\n",
612 | " 'extractor': 'ner_crf'}],\n",
613 | " 'intent_ranking': [{'name': 'restaurant_search',\n",
614 | " 'confidence': 0.6874972430877329},\n",
615 | " {'name': 'goodbye', 'confidence': 0.12400667696797882},\n",
616 | " {'name': 'affirm', 'confidence': 0.11357435021080386},\n",
617 | " {'name': 'greet', 'confidence': 0.07492172973348454}],\n",
618 | " 'text': 'I want an African Spot to eat'}"
619 | ]
620 | },
621 | "execution_count": 27,
622 | "metadata": {},
623 | "output_type": "execute_result"
624 | }
625 | ],
626 | "source": [
627 | "interpreter.parse(u\"I want an African Spot to eat\")"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 28,
633 | "metadata": {},
634 | "outputs": [
635 | {
636 | "data": {
637 | "text/plain": [
638 | "{'intent': {'name': 'greet', 'confidence': 0.44328419685532383},\n",
639 | " 'entities': [],\n",
640 | " 'intent_ranking': [{'name': 'greet', 'confidence': 0.44328419685532383},\n",
641 | " {'name': 'goodbye', 'confidence': 0.31245698090344237},\n",
642 | " {'name': 'affirm', 'confidence': 0.1257434275305043},\n",
643 | " {'name': 'restaurant_search', 'confidence': 0.11851539471072912}],\n",
644 | " 'text': 'Good morning World'}"
645 | ]
646 | },
647 | "execution_count": 28,
648 | "metadata": {},
649 | "output_type": "execute_result"
650 | }
651 | ],
652 | "source": [
653 | "interpreter.parse(u\"Good morning World\")"
654 | ]
655 | },
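  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Working With the Parse Result\n",
    "The parse result is a plain dictionary (see the outputs above). The small helper below is a sketch of ours rather than part of rasa_nlu; it pulls out just the top intent and the entity values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: reduce a parse result to the top intent and the entity values\n",
    "def summarize_parse(result):\n",
    "    intent = result['intent']['name']\n",
    "    confidence = result['intent']['confidence']\n",
    "    entities = [(ent['entity'], ent['value']) for ent in result['entities']]\n",
    "    return intent, confidence, entities\n",
    "\n",
    "summarize_parse(interpreter.parse(u\"I want an African Spot to eat\"))"
   ]
  },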
656 | {
657 | "cell_type": "code",
658 | "execution_count": null,
659 | "metadata": {},
660 | "outputs": [],
661 | "source": []
662 | },
663 | {
664 | "cell_type": "code",
665 | "execution_count": null,
666 | "metadata": {},
667 | "outputs": [],
668 | "source": [
669 | "### Credits Rasa_nlu\n",
670 | "#### By Jesse JCharis\n",
671 | "#### Jesus Saves @ JCharisTec"
672 | ]
673 | }
674 | ],
675 | "metadata": {
676 | "kernelspec": {
677 | "display_name": "Python 3",
678 | "language": "python",
679 | "name": "python3"
680 | },
681 | "language_info": {
682 | "codemirror_mode": {
683 | "name": "ipython",
684 | "version": 3
685 | },
686 | "file_extension": ".py",
687 | "mimetype": "text/x-python",
688 | "name": "python",
689 | "nbconvert_exporter": "python",
690 | "pygments_lexer": "ipython3",
691 | "version": "3.5.2"
692 | }
693 | },
694 | "nbformat": 4,
695 | "nbformat_minor": 2
696 | }
697 |
--------------------------------------------------------------------------------
/Training the Named Entity Recognizer in SpaCy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Natural Language Processing With SpaCy\n",
8 | ""
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "#### Training the Named Entity Recognizer\n",
16 | "##### Updating our NER\n",
17 | "+ Load the model\n",
18 | " + spacy.load('en')\n",
19 | " - Disable existing pipe line (nlp.disable_pipes)\n",
20 | " + spacy.blank('en')\n",
21 | " - Added Entity Recognizer to Pipeline\n",
22 | "+ Shuffle and loop over the examples\n",
23 | " - update the model (nlp.update)\n",
24 | "+ Save the trained model (nlp.to_disk)\n",
25 | "+ Test"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# Load Packages\n",
35 | "from __future__ import unicode_literals, print_function\n",
36 | "\n",
37 | "import plac # wrapper over argparse\n",
38 | "import random\n",
39 | "from pathlib import Path\n",
40 | "import spacy\n",
41 | "from tqdm import tqdm # loading bar"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "nlp1 = spacy.load('en')"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "docx1 = nlp1(u\"Who was Kofi Annan?\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "for token in docx1.ents:\n",
69 | " print(token.text,token.start_char, token.end_char,token.label_)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "docx2 = nlp1(u\"Who is Steve Jobs?\")"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "for token in docx2.ents:\n",
88 | " print(token.text,token.start_char, token.end_char,token.label_)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "docx3 = nlp1(u\"Who is Shaka Khan?\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# training data\n",
107 | "TRAIN_DATA = [\n",
108 | " ('Who is Kofi Annan?', {\n",
109 | " 'entities': [(8, 18, 'PERSON')]\n",
110 | " }),\n",
111 | " ('Who is Steve Jobs?', {\n",
112 | " 'entities': [(7, 17, 'PERSON')]\n",
113 | " }),\n",
114 | " ('I like London and Berlin.', {\n",
115 | " 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n",
116 | " })\n",
117 | "]"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "## plac is wrapper for argparser \n",
127 | "@plac.annotations(\n",
128 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n",
129 | " output_dir=(\"C:\\Users\\This PC\\Documents\\JLabs\\JFlow\", \"option\", \"o\", Path),\n",
130 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# Define our variables\n",
140 | "model = None\n",
141 | "output_dir=Path(\"C:\\\\Users\\\\This PC\\\\Documents\\\\JLabs\\\\JFlow\")\n",
142 | "n_iter=100"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "#### Load the model"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "if model is not None:\n",
159 | " nlp = spacy.load(model) # load existing spaCy model\n",
160 | " print(\"Loaded model '%s'\" % model)\n",
161 | "else:\n",
162 | " nlp = spacy.blank('en') # create blank Language class\n",
163 | " print(\"Created blank 'en' model\")"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "#### Set Up the Pipeline"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "# create the built-in pipeline components and add them to the pipeline\n",
180 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n",
181 | "if 'ner' not in nlp.pipe_names:\n",
182 | " ner = nlp.create_pipe('ner')\n",
183 | " nlp.add_pipe(ner, last=True)\n",
184 | "# otherwise, get it so we can add labels\n",
185 | "else:\n",
186 | " ner = nlp.get_pipe('ner')"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "#### Train the Recognizer\n",
194 | "+ Add labels,Annotate them\n",
195 | "+ Pipes\n",
196 | "+ Begin_training()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "\n",
206 | " # add labels\n",
207 | "for _, annotations in TRAIN_DATA:\n",
208 | " for ent in annotations.get('entities'):\n",
209 | " ner.add_label(ent[2])\n",
210 | "\n",
211 | " # get names of other pipes to disable them during training\n",
212 | "other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n",
213 | "with nlp.disable_pipes(*other_pipes): # only train NER\n",
214 | " optimizer = nlp.begin_training()\n",
215 | " for itn in range(n_iter):\n",
216 | " random.shuffle(TRAIN_DATA)\n",
217 | " losses = {}\n",
218 | " for text, annotations in tqdm(TRAIN_DATA):\n",
219 | " nlp.update(\n",
220 | " [text], # batch of texts\n",
221 | " [annotations], # batch of annotations\n",
222 | " drop=0.5, # dropout - make it harder to memorise data\n",
223 | " sgd=optimizer, # callable to update weights\n",
224 | " losses=losses)\n",
225 | " print(losses)"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "#### Test the trained model"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "# test the trained model\n",
242 | "for text, _ in TRAIN_DATA:\n",
243 | " doc = nlp(text)\n",
244 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n",
245 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "#### Save the Model"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# save model to output directory\n",
262 | "if output_dir is not None:\n",
263 | " output_dir = Path(output_dir)\n",
264 | " if not output_dir.exists():\n",
265 | " output_dir.mkdir()\n",
266 | " nlp.to_disk(output_dir)\n",
267 | " print(\"Saved model to\", output_dir)\n",
268 | "\n",
269 | " "
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "#### Test The Saved Model\n",
277 | "+ NB Output Directory"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "# test the saved model\n",
287 | "print(\"Loading from\", output_dir)\n",
288 | "nlp2 = spacy.load(output_dir)\n",
289 | "for text, _ in TRAIN_DATA:\n",
290 | " doc = nlp2(text)\n",
291 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n",
292 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])"
293 | ]
294 | },
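  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check, the sketch below runs the reloaded model on a sentence that is not in TRAIN_DATA. With only three training examples the predictions may be unreliable, so treat the output as illustrative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try the reloaded model on an unseen sentence (illustrative only)\n",
    "doc_unseen = nlp2(u\"Kofi Annan visited London last year\")\n",
    "print('Entities', [(ent.text, ent.label_) for ent in doc_unseen.ents])"
   ]
  },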
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "### Adding Additional Entity Types\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "### Natural Language Processing With SpaCy\n",
307 | ""
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "#### Training the Named Entity Recognizer (NER)\n",
315 | "##### Adding An Additional Entity (NER)\n",
316 | "+ Load the model\n",
317 | " + spacy.load('en')\n",
318 | " - Disable existing pipe line (nlp.disable_pipes)\n",
319 | " + spacy.blank('en')\n",
320 | " - Added Entity Recognizer to Pipeline\n",
321 | "+ Add a Label eg(ner.add_label(LABEL) & (nlp.begin_training())\n",
322 | "+ Shuffle and loop over the examples\n",
323 | " - update the model (nlp.update)\n",
324 | "+ Save the trained model (nlp.to_disk)\n",
325 | "+ Test"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 78,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "from __future__ import unicode_literals, print_function\n",
335 | "\n",
336 | "import plac\n",
337 | "import random\n",
338 | "from pathlib import Path\n",
339 | "import spacy"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 79,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "# new entity label\n",
349 | "LABEL = 'ANIMAL'"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 80,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "TRAIN_DATA = [\n",
359 | " (\"Horses are too tall and they pretend to care about your feelings\", {\n",
360 | " 'entities': [(0, 6, 'ANIMAL')]\n",
361 | " }),\n",
362 | "\n",
363 | " (\"Do they bite?\", {\n",
364 | " 'entities': []\n",
365 | " }),\n",
366 | "\n",
367 | " (\"horses are too tall and they pretend to care about your feelings\", {\n",
368 | " 'entities': [(0, 6, 'ANIMAL')]\n",
369 | " }),\n",
370 | "\n",
371 | " (\"horses pretend to care about your feelings\", {\n",
372 | " 'entities': [(0, 6, 'ANIMAL')]\n",
373 | " }),\n",
374 | "\n",
375 | " (\"they pretend to care about your feelings, those horses\", {\n",
376 | " 'entities': [(48, 54, 'ANIMAL')]\n",
377 | " }),\n",
378 | "\n",
379 | " (\"horses?\", {\n",
380 | " 'entities': [(0, 6, 'ANIMAL')]\n",
381 | " })\n",
382 | "]"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 82,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "\n",
392 | "@plac.annotations(\n",
393 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n",
394 | " new_model_name=(\"New model name for model meta.\", \"option\", \"nm\", str),\n",
395 | " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n",
396 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))\n",
397 | "\n",
398 | "\n",
399 | "def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):\n",
400 | " \"\"\"Set up the pipeline and entity recognizer, and train the new entity.\"\"\"\n",
401 | " if model is not None:\n",
402 | " nlp = spacy.load(model) # load existing spaCy model\n",
403 | " print(\"Loaded model '%s'\" % model)\n",
404 | " else:\n",
405 | " nlp = spacy.blank('en') # create blank Language class\n",
406 | " print(\"Created blank 'en' model\")\n",
407 | " # Add entity recognizer to model if it's not in the pipeline\n",
408 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n",
409 | " if 'ner' not in nlp.pipe_names:\n",
410 | " ner = nlp.create_pipe('ner')\n",
411 | " nlp.add_pipe(ner)\n",
412 | " # otherwise, get it, so we can add labels to it\n",
413 | " else:\n",
414 | " ner = nlp.get_pipe('ner')\n",
415 | "\n",
416 | " ner.add_label(LABEL) # add new entity label to entity recognizer\n",
417 | " if model is None:\n",
418 | " optimizer = nlp.begin_training()\n",
419 | " else:\n",
420 | " # Note that 'begin_training' initializes the models, so it'll zero out\n",
421 | " # existing entity types.\n",
422 | " optimizer = nlp.entity.create_optimizer()\n",
423 | "\n",
424 | " # get names of other pipes to disable them during training\n",
425 | " other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n",
426 | " with nlp.disable_pipes(*other_pipes): # only train NER\n",
427 | " for itn in range(n_iter):\n",
428 | " random.shuffle(TRAIN_DATA)\n",
429 | " losses = {}\n",
430 | " for text, annotations in tqdm(TRAIN_DATA):\n",
431 | " nlp.update([text], [annotations], sgd=optimizer, drop=0.35,\n",
432 | " losses=losses)\n",
433 | " print(losses)\n",
434 | "\n",
435 | " # test the trained model\n",
436 | " test_text = 'Do you like horses?'\n",
437 | " doc = nlp(test_text)\n",
438 | " print(\"Entities in '%s'\" % test_text)\n",
439 | " for ent in doc.ents:\n",
440 | " print(ent.label_, ent.text)\n",
441 | "\n",
442 | " # save model to output directory\n",
443 | " if output_dir is not None:\n",
444 | " output_dir = Path(output_dir)\n",
445 | " if not output_dir.exists():\n",
446 | " output_dir.mkdir()\n",
447 | " nlp.meta['name'] = new_model_name # rename model\n",
448 | " nlp.to_disk(output_dir)\n",
449 | " print(\"Saved model to\", output_dir)\n",
450 | "\n",
451 | " # test the saved model\n",
452 | " print(\"Loading from\", output_dir)\n",
453 | " nlp2 = spacy.load(output_dir)\n",
454 | " doc2 = nlp2(test_text)\n",
455 | " for ent in doc2.ents:\n",
456 | " print(ent.label_, ent.text)\n",
457 | "\n",
458 | "\n",
459 | "# if __name__ == '__main__':\n",
460 | "# plac.call(main)"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 83,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "name": "stdout",
470 | "output_type": "stream",
471 | "text": [
472 | "Created blank 'en' model\n",
473 | "Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. (Shape: (0, 0))\n"
474 | ]
475 | },
476 | {
477 | "name": "stderr",
478 | "output_type": "stream",
479 | "text": [
480 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00, 1.22s/it]\n"
481 | ]
482 | },
483 | {
484 | "name": "stdout",
485 | "output_type": "stream",
486 | "text": [
487 | "{'ner': 26.770396717498016}\n"
488 | ]
489 | },
490 | {
491 | "name": "stderr",
492 | "output_type": "stream",
493 | "text": [
494 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00, 1.02s/it]\n"
495 | ]
496 | },
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "{'ner': 8.593518038099443}\n"
502 | ]
503 | },
504 | {
505 | "name": "stderr",
506 | "output_type": "stream",
507 | "text": [
508 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
509 | ]
510 | },
511 | {
512 | "name": "stdout",
513 | "output_type": "stream",
514 | "text": [
515 | "{'ner': 4.161424036550985}\n"
516 | ]
517 | },
518 | {
519 | "name": "stderr",
520 | "output_type": "stream",
521 | "text": [
522 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
523 | ]
524 | },
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "{'ner': 3.8918851538918418}\n"
530 | ]
531 | },
532 | {
533 | "name": "stderr",
534 | "output_type": "stream",
535 | "text": [
536 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.30it/s]\n"
537 | ]
538 | },
539 | {
540 | "name": "stdout",
541 | "output_type": "stream",
542 | "text": [
543 | "{'ner': 2.01546711932046}\n"
544 | ]
545 | },
546 | {
547 | "name": "stderr",
548 | "output_type": "stream",
549 | "text": [
550 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.31it/s]\n"
551 | ]
552 | },
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "{'ner': 0.000131435854561013}\n"
558 | ]
559 | },
560 | {
561 | "name": "stderr",
562 | "output_type": "stream",
563 | "text": [
564 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.32it/s]\n"
565 | ]
566 | },
567 | {
568 | "name": "stdout",
569 | "output_type": "stream",
570 | "text": [
571 | "{'ner': 1.3692610842225425e-07}\n"
572 | ]
573 | },
574 | {
575 | "name": "stderr",
576 | "output_type": "stream",
577 | "text": [
578 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.08it/s]\n"
579 | ]
580 | },
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "{'ner': 0.019683124967466954}\n"
586 | ]
587 | },
588 | {
589 | "name": "stderr",
590 | "output_type": "stream",
591 | "text": [
592 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n"
593 | ]
594 | },
595 | {
596 | "name": "stdout",
597 | "output_type": "stream",
598 | "text": [
599 | "{'ner': 2.078213820644416e-12}\n"
600 | ]
601 | },
602 | {
603 | "name": "stderr",
604 | "output_type": "stream",
605 | "text": [
606 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.11it/s]\n"
607 | ]
608 | },
609 | {
610 | "name": "stdout",
611 | "output_type": "stream",
612 | "text": [
613 | "{'ner': 1.5424355623930257e-05}\n"
614 | ]
615 | },
616 | {
617 | "name": "stderr",
618 | "output_type": "stream",
619 | "text": [
620 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
621 | ]
622 | },
623 | {
624 | "name": "stdout",
625 | "output_type": "stream",
626 | "text": [
627 | "{'ner': 0.34855798227363266}\n"
628 | ]
629 | },
630 | {
631 | "name": "stderr",
632 | "output_type": "stream",
633 | "text": [
634 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
635 | ]
636 | },
637 | {
638 | "name": "stdout",
639 | "output_type": "stream",
640 | "text": [
641 | "{'ner': 1.2020330928745637e-21}\n"
642 | ]
643 | },
644 | {
645 | "name": "stderr",
646 | "output_type": "stream",
647 | "text": [
648 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.23it/s]\n"
649 | ]
650 | },
651 | {
652 | "name": "stdout",
653 | "output_type": "stream",
654 | "text": [
655 | "{'ner': 1.1364459848434984e-19}\n"
656 | ]
657 | },
658 | {
659 | "name": "stderr",
660 | "output_type": "stream",
661 | "text": [
662 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.01it/s]\n"
663 | ]
664 | },
665 | {
666 | "name": "stdout",
667 | "output_type": "stream",
668 | "text": [
669 | "{'ner': 5.07038899221475e-16}\n"
670 | ]
671 | },
672 | {
673 | "name": "stderr",
674 | "output_type": "stream",
675 | "text": [
676 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n"
677 | ]
678 | },
679 | {
680 | "name": "stdout",
681 | "output_type": "stream",
682 | "text": [
683 | "{'ner': 7.756965635961777e-18}\n"
684 | ]
685 | },
686 | {
687 | "name": "stderr",
688 | "output_type": "stream",
689 | "text": [
690 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.21it/s]\n"
691 | ]
692 | },
693 | {
694 | "name": "stdout",
695 | "output_type": "stream",
696 | "text": [
697 | "{'ner': 4.682540175328388e-13}\n"
698 | ]
699 | },
700 | {
701 | "name": "stderr",
702 | "output_type": "stream",
703 | "text": [
704 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.17it/s]\n"
705 | ]
706 | },
707 | {
708 | "name": "stdout",
709 | "output_type": "stream",
710 | "text": [
711 | "{'ner': 4.9982126736537605e-14}\n"
712 | ]
713 | },
714 | {
715 | "name": "stderr",
716 | "output_type": "stream",
717 | "text": [
718 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.15it/s]\n"
719 | ]
720 | },
721 | {
722 | "name": "stdout",
723 | "output_type": "stream",
724 | "text": [
725 | "{'ner': 5.766438963914882e-17}\n"
726 | ]
727 | },
728 | {
729 | "name": "stderr",
730 | "output_type": "stream",
731 | "text": [
732 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.25it/s]\n"
733 | ]
734 | },
735 | {
736 | "name": "stdout",
737 | "output_type": "stream",
738 | "text": [
739 | "{'ner': 4.4997379863434744e-20}\n"
740 | ]
741 | },
742 | {
743 | "name": "stderr",
744 | "output_type": "stream",
745 | "text": [
746 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
747 | ]
748 | },
749 | {
750 | "name": "stdout",
751 | "output_type": "stream",
752 | "text": [
753 | "{'ner': 1.4565571602945852e-16}\n",
754 | "Entities in 'Do you like horses?'\n",
755 | "ANIMAL horses\n"
756 | ]
757 | }
758 | ],
759 | "source": [
760 | "# Run our Function\n",
761 | "main()"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": null,
767 | "metadata": {},
768 | "outputs": [],
769 | "source": [
770 | "# Our model was able to recognize horses as ANIMAL"
771 | ]
772 | },
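  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To keep the new ANIMAL entity for later use, main() can be re-run with an output directory. This is only a sketch; 'animal_model' is an example folder name, and the call retrains from scratch, repeating the loop above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: retrain and persist the model with the new ANIMAL label\n",
    "# 'animal_model' is an example folder name, not a fixed convention\n",
    "main(output_dir='animal_model', n_iter=20)"
   ]
  },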
773 | {
774 | "cell_type": "code",
775 | "execution_count": null,
776 | "metadata": {},
777 | "outputs": [],
778 | "source": []
779 | }
780 | ],
781 | "metadata": {
782 | "kernelspec": {
783 | "display_name": "Python 3",
784 | "language": "python",
785 | "name": "python3"
786 | },
787 | "language_info": {
788 | "codemirror_mode": {
789 | "name": "ipython",
790 | "version": 3
791 | },
792 | "file_extension": ".py",
793 | "mimetype": "text/x-python",
794 | "name": "python",
795 | "nbconvert_exporter": "python",
796 | "pygments_lexer": "ipython3",
797 | "version": "3.6.6"
798 | }
799 | },
800 | "nbformat": 4,
801 | "nbformat_minor": 2
802 | }
803 |
--------------------------------------------------------------------------------
/NLP_with_SpaCy/Training the Named Entity Recognizer in SpaCy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Natural Language Processing With SpaCy\n",
8 | ""
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "#### Training the Named Entity Recognizer\n",
16 | "##### Updating our NER\n",
17 | "+ Load the model\n",
18 | " + spacy.load('en')\n",
19 | " - Disable existing pipe line (nlp.disable_pipes)\n",
20 | " + spacy.blank('en')\n",
21 | " - Added Entity Recognizer to Pipeline\n",
22 | "+ Shuffle and loop over the examples\n",
23 | " - update the model (nlp.update)\n",
24 | "+ Save the trained model (nlp.to_disk)\n",
25 | "+ Test"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# Load Packages\n",
35 | "from __future__ import unicode_literals, print_function\n",
36 | "\n",
37 | "import plac # wrapper over argparse\n",
38 | "import random\n",
39 | "from pathlib import Path\n",
40 | "import spacy\n",
41 | "from tqdm import tqdm # loading bar"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "nlp1 = spacy.load('en')"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "docx1 = nlp1(u\"Who was Kofi Annan?\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "for token in docx1.ents:\n",
69 | " print(token.text,token.start_char, token.end_char,token.label_)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "docx2 = nlp1(u\"Who is Steve Jobs?\")"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "for token in docx2.ents:\n",
88 | " print(token.text,token.start_char, token.end_char,token.label_)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "docx3 = nlp1(u\"Who is Shaka Khan?\")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# training data\n",
107 | "TRAIN_DATA = [\n",
108 | " ('Who is Kofi Annan?', {\n",
109 | " 'entities': [(8, 18, 'PERSON')]\n",
110 | " }),\n",
111 | " ('Who is Steve Jobs?', {\n",
112 | " 'entities': [(7, 17, 'PERSON')]\n",
113 | " }),\n",
114 | " ('I like London and Berlin.', {\n",
115 | " 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n",
116 | " })\n",
117 | "]"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "## plac is wrapper for argparser \n",
127 | "@plac.annotations(\n",
128 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n",
129 | " output_dir=(\"C:\\Users\\This PC\\Documents\\JLabs\\JFlow\", \"option\", \"o\", Path),\n",
130 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# Define our variables\n",
140 | "model = None\n",
141 | "output_dir=Path(\"C:\\\\Users\\\\This PC\\\\Documents\\\\JLabs\\\\JFlow\")\n",
142 | "n_iter=100"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "#### Load the model"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "if model is not None:\n",
159 | " nlp = spacy.load(model) # load existing spaCy model\n",
160 | " print(\"Loaded model '%s'\" % model)\n",
161 | "else:\n",
162 | " nlp = spacy.blank('en') # create blank Language class\n",
163 | " print(\"Created blank 'en' model\")"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "#### Set Up the Pipeline"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "# create the built-in pipeline components and add them to the pipeline\n",
180 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n",
181 | "if 'ner' not in nlp.pipe_names:\n",
182 | " ner = nlp.create_pipe('ner')\n",
183 | " nlp.add_pipe(ner, last=True)\n",
184 | "# otherwise, get it so we can add labels\n",
185 | "else:\n",
186 | " ner = nlp.get_pipe('ner')"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "#### Train the Recognizer\n",
194 | "+ Add labels,Annotate them\n",
195 | "+ Pipes\n",
196 | "+ Begin_training()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "\n",
206 | " # add labels\n",
207 | "for _, annotations in TRAIN_DATA:\n",
208 | " for ent in annotations.get('entities'):\n",
209 | " ner.add_label(ent[2])\n",
210 | "\n",
211 | " # get names of other pipes to disable them during training\n",
212 | "other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n",
213 | "with nlp.disable_pipes(*other_pipes): # only train NER\n",
214 | " optimizer = nlp.begin_training()\n",
215 | " for itn in range(n_iter):\n",
216 | " random.shuffle(TRAIN_DATA)\n",
217 | " losses = {}\n",
218 | " for text, annotations in tqdm(TRAIN_DATA):\n",
219 | " nlp.update(\n",
220 | " [text], # batch of texts\n",
221 | " [annotations], # batch of annotations\n",
222 | " drop=0.5, # dropout - make it harder to memorise data\n",
223 | " sgd=optimizer, # callable to update weights\n",
224 | " losses=losses)\n",
225 | " print(losses)"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "#### Test the trained model"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "# test the trained model\n",
242 | "for text, _ in TRAIN_DATA:\n",
243 | " doc = nlp(text)\n",
244 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n",
245 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "#### Save the Model"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# save model to output directory\n",
262 | "if output_dir is not None:\n",
263 | " output_dir = Path(output_dir)\n",
264 | " if not output_dir.exists():\n",
265 | " output_dir.mkdir()\n",
266 | " nlp.to_disk(output_dir)\n",
267 | " print(\"Saved model to\", output_dir)\n",
268 | "\n",
269 | " "
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "#### Test The Saved Model\n",
277 | "+ NB Output Directory"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "# test the saved model\n",
287 | "print(\"Loading from\", output_dir)\n",
288 | "nlp2 = spacy.load(output_dir)\n",
289 | "for text, _ in TRAIN_DATA:\n",
290 | " doc = nlp2(text)\n",
291 | " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n",
292 | " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "### Adding Additional Entity Types\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "### Natural Language Processing With SpaCy\n",
307 | ""
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "#### Training the Named Entity Recognizer (NER)\n",
315 | "##### Adding An Additional Entity (NER)\n",
316 | "+ Load the model\n",
317 | " + spacy.load('en')\n",
318 | " - Disable existing pipe line (nlp.disable_pipes)\n",
319 | " + spacy.blank('en')\n",
320 | " - Added Entity Recognizer to Pipeline\n",
321 | "+ Add a Label eg(ner.add_label(LABEL) & (nlp.begin_training())\n",
322 | "+ Shuffle and loop over the examples\n",
323 | " - update the model (nlp.update)\n",
324 | "+ Save the trained model (nlp.to_disk)\n",
325 | "+ Test"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 78,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "from __future__ import unicode_literals, print_function\n",
335 | "\n",
336 | "import plac\n",
337 | "import random\n",
338 | "from pathlib import Path\n",
339 | "import spacy"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 79,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "# new entity label\n",
349 | "LABEL = 'ANIMAL'"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 80,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "TRAIN_DATA = [\n",
359 | " (\"Horses are too tall and they pretend to care about your feelings\", {\n",
360 | " 'entities': [(0, 6, 'ANIMAL')]\n",
361 | " }),\n",
362 | "\n",
363 | " (\"Do they bite?\", {\n",
364 | " 'entities': []\n",
365 | " }),\n",
366 | "\n",
367 | " (\"horses are too tall and they pretend to care about your feelings\", {\n",
368 | " 'entities': [(0, 6, 'ANIMAL')]\n",
369 | " }),\n",
370 | "\n",
371 | " (\"horses pretend to care about your feelings\", {\n",
372 | " 'entities': [(0, 6, 'ANIMAL')]\n",
373 | " }),\n",
374 | "\n",
375 | " (\"they pretend to care about your feelings, those horses\", {\n",
376 | " 'entities': [(48, 54, 'ANIMAL')]\n",
377 | " }),\n",
378 | "\n",
379 | " (\"horses?\", {\n",
380 | " 'entities': [(0, 6, 'ANIMAL')]\n",
381 | " })\n",
382 | "]"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 82,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "\n",
392 | "@plac.annotations(\n",
393 | " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n",
394 | " new_model_name=(\"New model name for model meta.\", \"option\", \"nm\", str),\n",
395 | " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n",
396 | " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))\n",
397 | "\n",
398 | "\n",
399 | "def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):\n",
400 | " \"\"\"Set up the pipeline and entity recognizer, and train the new entity.\"\"\"\n",
401 | " if model is not None:\n",
402 | " nlp = spacy.load(model) # load existing spaCy model\n",
403 | " print(\"Loaded model '%s'\" % model)\n",
404 | " else:\n",
405 | " nlp = spacy.blank('en') # create blank Language class\n",
406 | " print(\"Created blank 'en' model\")\n",
407 | " # Add entity recognizer to model if it's not in the pipeline\n",
408 | " # nlp.create_pipe works for built-ins that are registered with spaCy\n",
409 | " if 'ner' not in nlp.pipe_names:\n",
410 | " ner = nlp.create_pipe('ner')\n",
411 | " nlp.add_pipe(ner)\n",
412 | " # otherwise, get it, so we can add labels to it\n",
413 | " else:\n",
414 | " ner = nlp.get_pipe('ner')\n",
415 | "\n",
416 | " ner.add_label(LABEL) # add new entity label to entity recognizer\n",
417 | " if model is None:\n",
418 | " optimizer = nlp.begin_training()\n",
419 | " else:\n",
420 | " # Note that 'begin_training' initializes the models, so it'll zero out\n",
421 | " # existing entity types.\n",
422 | " optimizer = nlp.entity.create_optimizer()\n",
423 | "\n",
424 | " # get names of other pipes to disable them during training\n",
425 | " other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n",
426 | " with nlp.disable_pipes(*other_pipes): # only train NER\n",
427 | " for itn in range(n_iter):\n",
428 | " random.shuffle(TRAIN_DATA)\n",
429 | " losses = {}\n",
430 | " for text, annotations in tqdm(TRAIN_DATA):\n",
431 | " nlp.update([text], [annotations], sgd=optimizer, drop=0.35,\n",
432 | " losses=losses)\n",
433 | " print(losses)\n",
434 | "\n",
435 | " # test the trained model\n",
436 | " test_text = 'Do you like horses?'\n",
437 | " doc = nlp(test_text)\n",
438 | " print(\"Entities in '%s'\" % test_text)\n",
439 | " for ent in doc.ents:\n",
440 | " print(ent.label_, ent.text)\n",
441 | "\n",
442 | " # save model to output directory\n",
443 | " if output_dir is not None:\n",
444 | " output_dir = Path(output_dir)\n",
445 | " if not output_dir.exists():\n",
446 | " output_dir.mkdir()\n",
447 | " nlp.meta['name'] = new_model_name # rename model\n",
448 | " nlp.to_disk(output_dir)\n",
449 | " print(\"Saved model to\", output_dir)\n",
450 | "\n",
451 | " # test the saved model\n",
452 | " print(\"Loading from\", output_dir)\n",
453 | " nlp2 = spacy.load(output_dir)\n",
454 | " doc2 = nlp2(test_text)\n",
455 | " for ent in doc2.ents:\n",
456 | " print(ent.label_, ent.text)\n",
457 | "\n",
458 | "\n",
459 | "# if __name__ == '__main__':\n",
460 | "# plac.call(main)"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 83,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "name": "stdout",
470 | "output_type": "stream",
471 | "text": [
472 | "Created blank 'en' model\n",
473 | "Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. (Shape: (0, 0))\n"
474 | ]
475 | },
476 | {
477 | "name": "stderr",
478 | "output_type": "stream",
479 | "text": [
480 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00, 1.22s/it]\n"
481 | ]
482 | },
483 | {
484 | "name": "stdout",
485 | "output_type": "stream",
486 | "text": [
487 | "{'ner': 26.770396717498016}\n"
488 | ]
489 | },
490 | {
491 | "name": "stderr",
492 | "output_type": "stream",
493 | "text": [
494 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00, 1.02s/it]\n"
495 | ]
496 | },
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "{'ner': 8.593518038099443}\n"
502 | ]
503 | },
504 | {
505 | "name": "stderr",
506 | "output_type": "stream",
507 | "text": [
508 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
509 | ]
510 | },
511 | {
512 | "name": "stdout",
513 | "output_type": "stream",
514 | "text": [
515 | "{'ner': 4.161424036550985}\n"
516 | ]
517 | },
518 | {
519 | "name": "stderr",
520 | "output_type": "stream",
521 | "text": [
522 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
523 | ]
524 | },
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "{'ner': 3.8918851538918418}\n"
530 | ]
531 | },
532 | {
533 | "name": "stderr",
534 | "output_type": "stream",
535 | "text": [
536 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.30it/s]\n"
537 | ]
538 | },
539 | {
540 | "name": "stdout",
541 | "output_type": "stream",
542 | "text": [
543 | "{'ner': 2.01546711932046}\n"
544 | ]
545 | },
546 | {
547 | "name": "stderr",
548 | "output_type": "stream",
549 | "text": [
550 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.31it/s]\n"
551 | ]
552 | },
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "{'ner': 0.000131435854561013}\n"
558 | ]
559 | },
560 | {
561 | "name": "stderr",
562 | "output_type": "stream",
563 | "text": [
564 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.32it/s]\n"
565 | ]
566 | },
567 | {
568 | "name": "stdout",
569 | "output_type": "stream",
570 | "text": [
571 | "{'ner': 1.3692610842225425e-07}\n"
572 | ]
573 | },
574 | {
575 | "name": "stderr",
576 | "output_type": "stream",
577 | "text": [
578 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.08it/s]\n"
579 | ]
580 | },
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "{'ner': 0.019683124967466954}\n"
586 | ]
587 | },
588 | {
589 | "name": "stderr",
590 | "output_type": "stream",
591 | "text": [
592 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n"
593 | ]
594 | },
595 | {
596 | "name": "stdout",
597 | "output_type": "stream",
598 | "text": [
599 | "{'ner': 2.078213820644416e-12}\n"
600 | ]
601 | },
602 | {
603 | "name": "stderr",
604 | "output_type": "stream",
605 | "text": [
606 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.11it/s]\n"
607 | ]
608 | },
609 | {
610 | "name": "stdout",
611 | "output_type": "stream",
612 | "text": [
613 | "{'ner': 1.5424355623930257e-05}\n"
614 | ]
615 | },
616 | {
617 | "name": "stderr",
618 | "output_type": "stream",
619 | "text": [
620 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
621 | ]
622 | },
623 | {
624 | "name": "stdout",
625 | "output_type": "stream",
626 | "text": [
627 | "{'ner': 0.34855798227363266}\n"
628 | ]
629 | },
630 | {
631 | "name": "stderr",
632 | "output_type": "stream",
633 | "text": [
634 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
635 | ]
636 | },
637 | {
638 | "name": "stdout",
639 | "output_type": "stream",
640 | "text": [
641 | "{'ner': 1.2020330928745637e-21}\n"
642 | ]
643 | },
644 | {
645 | "name": "stderr",
646 | "output_type": "stream",
647 | "text": [
648 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.23it/s]\n"
649 | ]
650 | },
651 | {
652 | "name": "stdout",
653 | "output_type": "stream",
654 | "text": [
655 | "{'ner': 1.1364459848434984e-19}\n"
656 | ]
657 | },
658 | {
659 | "name": "stderr",
660 | "output_type": "stream",
661 | "text": [
662 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.01it/s]\n"
663 | ]
664 | },
665 | {
666 | "name": "stdout",
667 | "output_type": "stream",
668 | "text": [
669 | "{'ner': 5.07038899221475e-16}\n"
670 | ]
671 | },
672 | {
673 | "name": "stderr",
674 | "output_type": "stream",
675 | "text": [
676 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n"
677 | ]
678 | },
679 | {
680 | "name": "stdout",
681 | "output_type": "stream",
682 | "text": [
683 | "{'ner': 7.756965635961777e-18}\n"
684 | ]
685 | },
686 | {
687 | "name": "stderr",
688 | "output_type": "stream",
689 | "text": [
690 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.21it/s]\n"
691 | ]
692 | },
693 | {
694 | "name": "stdout",
695 | "output_type": "stream",
696 | "text": [
697 | "{'ner': 4.682540175328388e-13}\n"
698 | ]
699 | },
700 | {
701 | "name": "stderr",
702 | "output_type": "stream",
703 | "text": [
704 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.17it/s]\n"
705 | ]
706 | },
707 | {
708 | "name": "stdout",
709 | "output_type": "stream",
710 | "text": [
711 | "{'ner': 4.9982126736537605e-14}\n"
712 | ]
713 | },
714 | {
715 | "name": "stderr",
716 | "output_type": "stream",
717 | "text": [
718 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.15it/s]\n"
719 | ]
720 | },
721 | {
722 | "name": "stdout",
723 | "output_type": "stream",
724 | "text": [
725 | "{'ner': 5.766438963914882e-17}\n"
726 | ]
727 | },
728 | {
729 | "name": "stderr",
730 | "output_type": "stream",
731 | "text": [
732 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.25it/s]\n"
733 | ]
734 | },
735 | {
736 | "name": "stdout",
737 | "output_type": "stream",
738 | "text": [
739 | "{'ner': 4.4997379863434744e-20}\n"
740 | ]
741 | },
742 | {
743 | "name": "stderr",
744 | "output_type": "stream",
745 | "text": [
746 | "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n"
747 | ]
748 | },
749 | {
750 | "name": "stdout",
751 | "output_type": "stream",
752 | "text": [
753 | "{'ner': 1.4565571602945852e-16}\n",
754 | "Entities in 'Do you like horses?'\n",
755 | "ANIMAL horses\n"
756 | ]
757 | }
758 | ],
759 | "source": [
760 | "# Run our Function\n",
761 | "main()"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": null,
767 | "metadata": {},
768 | "outputs": [],
769 | "source": [
770 | "# Our model was able to recognize horses as ANIMAL"
771 | ]
772 | },
773 | {
774 | "cell_type": "code",
775 | "execution_count": null,
776 | "metadata": {},
777 | "outputs": [],
778 | "source": []
779 | }
780 | ],
781 | "metadata": {
782 | "kernelspec": {
783 | "display_name": "Python 3",
784 | "language": "python",
785 | "name": "python3"
786 | },
787 | "language_info": {
788 | "codemirror_mode": {
789 | "name": "ipython",
790 | "version": 3
791 | },
792 | "file_extension": ".py",
793 | "mimetype": "text/x-python",
794 | "name": "python",
795 | "nbconvert_exporter": "python",
796 | "pygments_lexer": "ipython3",
797 | "version": "3.6.6"
798 | }
799 | },
800 | "nbformat": 4,
801 | "nbformat_minor": 2
802 | }
803 |
--------------------------------------------------------------------------------