├── Intro_spaCy_NLP.ipynb ├── Intro_spaCy_NLP.md ├── LICENSE ├── README.md └── subject_object_extraction.py /Intro_spaCy_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:6842b637870ccb1c99f940c90e75c511dc193e15ea69d132c85630a4dc6ce0df" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "# Set up spaCy\n", 16 | "from spacy.en import English\n", 17 | "parser = English()\n", 18 | "\n", 19 | "# Test Data\n", 20 | "multiSentence = \"There is an art, it says, or rather, a knack to flying.\" \\\n", 21 | " \"The knack lies in learning how to throw yourself at the ground and miss.\" \\\n", 22 | " \"In the beginning the Universe was created. This has made a lot of people \"\\\n", 23 | " \"very angry and been widely regarded as a bad move.\"" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [], 28 | "prompt_number": 2 29 | }, 30 | { 31 | "cell_type": "heading", 32 | "level": 1, 33 | "metadata": {}, 34 | "source": [ 35 | "spaCy does tokenization, sentence recognition, part of speech tagging, lemmatization, dependency parsing, and named entity recognition all at once!" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "collapsed": false, 41 | "input": [ 42 | "# all you have to do to parse text is this:\n", 43 | "#note: the first time you run spaCy in a file it takes a little while to load up its modules\n", 44 | "parsedData = parser(multiSentence)" 45 | ], 46 | "language": "python", 47 | "metadata": {}, 48 | "outputs": [], 49 | "prompt_number": 59 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "# Let's look at the tokens\n", 56 | "# All you have to do is iterate through the parsedData\n", 57 | "# Each token is an object with lots of different properties\n", 58 | "# A property with an underscore at the end returns the string representation\n", 59 | "# while a property without the underscore returns an index (int) into spaCy's vocabulary\n", 60 | "# The probability estimate is based on counts from a 3 billion word corpus, smoothed using the Simple Good-Turing method.\n", 61 | "for i, token in enumerate(parsedData):\n", 62 | " print(\"original:\", token.orth, token.orth_)\n", 63 | " print(\"lowercased:\", token.lower, token.lower_)\n", 64 | " print(\"lemma:\", token.lemma, token.lemma_)\n", 65 | " print(\"shape:\", token.shape, token.shape_)\n", 66 | " print(\"prefix:\", token.prefix, token.prefix_)\n", 67 | " print(\"suffix:\", token.suffix, token.suffix_)\n", 68 | " print(\"log probability:\", token.prob)\n", 69 | " print(\"Brown cluster id:\", token.cluster)\n", 70 | " print(\"----------------------------------------\")\n", 71 | " if i > 10:\n", 72 | " break" 73 | ], 74 | "language": "python", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "output_type": "stream", 79 | "stream": "stdout", 80 | "text": [ 81 | "original: 300 There\n", 82 | "lowercased: 144 there\n", 83 | "lemma: 300 There\n", 84 | "shape: 187 Xxxxx\n", 85 | "prefix: 32 T\n", 86 | "suffix: 66 ere\n", 87 | "log probability: -7.663576126098633\n", 88 | "Brown cluster id: 1918\n", 89 | "----------------------------------------\n", 90 | "original: 29 is\n", 91 | "lowercased: 29 is\n", 92 | "lemma: 52 be\n", 93 | "shape: 7 xx\n", 94 | "prefix: 14 i\n", 95 | "suffix: 29 is\n", 96 | "log probability: -5.002371311187744\n", 97 | "Brown cluster id: 
762\n", 98 | "----------------------------------------\n", 99 | "original: 59 an\n", 100 | "lowercased: 59 an\n", 101 | "lemma: 59 an\n", 102 | "shape: 7 xx\n", 103 | "prefix: 11 a\n", 104 | "suffix: 59 an\n", 105 | "log probability: -5.829381465911865\n", 106 | "Brown cluster id: 3\n", 107 | "----------------------------------------\n", 108 | "original: 334 art\n", 109 | "lowercased: 334 art\n", 110 | "lemma: 334 art\n", 111 | "shape: 3 xxx\n", 112 | "prefix: 11 a\n", 113 | "suffix: 334 art\n", 114 | "log probability: -9.482678413391113\n", 115 | "Brown cluster id: 633\n", 116 | "----------------------------------------\n", 117 | "original: 1 ,\n", 118 | "lowercased: 1 ,\n", 119 | "lemma: 1 ,\n", 120 | "shape: 1 ,\n", 121 | "prefix: 1 ,\n", 122 | "suffix: 1 ,\n", 123 | "log probability: -3.0368354320526123\n", 124 | "Brown cluster id: 4\n", 125 | "----------------------------------------\n", 126 | "original: 44 it\n", 127 | "lowercased: 44 it\n", 128 | "lemma: 906264 -PRON-\n", 129 | "shape: 7 xx\n", 130 | "prefix: 14 i\n", 131 | "suffix: 44 it\n", 132 | "log probability: -5.498129367828369\n", 133 | "Brown cluster id: 474\n", 134 | "----------------------------------------\n", 135 | "original: 274 says\n", 136 | "lowercased: 274 says\n", 137 | "lemma: 253 say\n", 138 | "shape: 20 xxxx\n", 139 | "prefix: 27 s\n", 140 | "suffix: 275 ays\n", 141 | "log probability: -7.604108810424805\n", 142 | "Brown cluster id: 244\n", 143 | "----------------------------------------\n", 144 | "original: 1 ,\n", 145 | "lowercased: 1 ,\n", 146 | "lemma: 1 ,\n", 147 | "shape: 1 ,\n", 148 | "prefix: 1 ,\n", 149 | "suffix: 1 ,\n", 150 | "log probability: -3.0368354320526123\n", 151 | "Brown cluster id: 4\n", 152 | "----------------------------------------\n", 153 | "original: 79 or\n", 154 | "lowercased: 79 or\n", 155 | "lemma: 79 or\n", 156 | "shape: 7 xx\n", 157 | "prefix: 8 o\n", 158 | "suffix: 79 or\n", 159 | "log probability: -6.262600898742676\n", 160 | "Brown cluster id: 404\n", 161 | "----------------------------------------\n", 162 | "original: 1400 rather\n", 163 | "lowercased: 1400 rather\n", 164 | "lemma: 1400 rather\n", 165 | "shape: 20 xxxx\n", 166 | "prefix: 357 r\n", 167 | "suffix: 131 her\n", 168 | "log probability: -9.074186325073242\n", 169 | "Brown cluster id: 6698\n", 170 | "----------------------------------------\n", 171 | "original: 1 ,\n", 172 | "lowercased: 1 ,\n", 173 | "lemma: 1 ,\n", 174 | "shape: 1 ,\n", 175 | "prefix: 1 ,\n", 176 | "suffix: 1 ,\n", 177 | "log probability: -3.0368354320526123\n", 178 | "Brown cluster id: 4\n", 179 | "----------------------------------------\n", 180 | "original:" 181 | ] 182 | }, 183 | { 184 | "output_type": "stream", 185 | "stream": "stdout", 186 | "text": [ 187 | " 11 a\n", 188 | "lowercased: 11 a\n", 189 | "lemma: 11 a\n", 190 | "shape: 12 x\n", 191 | "prefix: 11 a\n", 192 | "suffix: 11 a\n", 193 | "log probability: -4.003841400146484\n", 194 | "Brown cluster id: 19\n", 195 | "----------------------------------------\n" 196 | ] 197 | } 198 | ], 199 | "prompt_number": 60 200 | }, 201 | { 202 | "cell_type": "code", 203 | "collapsed": false, 204 | "input": [ 205 | "# Let's look at the sentences\n", 206 | "sents = []\n", 207 | "# the \"sents\" property returns spans\n", 208 | "# spans have indices into the original string\n", 209 | "# where each index value represents a token\n", 210 | "for span in parsedData.sents:\n", 211 | " # go from the start to the end of each span, returning each token in the sentence\n", 212 | " # combine each token using 
join()\n", 213 | " sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip()\n", 214 | " sents.append(sent)\n", 215 | "\n", 216 | "for sentence in sents:\n", 217 | " print(sentence)" 218 | ], 219 | "language": "python", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "output_type": "stream", 224 | "stream": "stdout", 225 | "text": [ 226 | "There is an art, it says, or rather, a knack to flying.\n", 227 | "The knack lies in learning how to throw yourself at the ground and miss.\n", 228 | "In the beginning the Universe was created.\n", 229 | "This has made a lot of people very angry and been widely regarded as a bad move.\n" 230 | ] 231 | } 232 | ], 233 | "prompt_number": 61 234 | }, 235 | { 236 | "cell_type": "code", 237 | "collapsed": false, 238 | "input": [ 239 | "# Let's look at the part of speech tags of the first sentence\n", 240 | "for span in parsedData.sents:\n", 241 | " sent = [parsedData[i] for i in range(span.start, span.end)]\n", 242 | " break\n", 243 | "\n", 244 | "for token in sent:\n", 245 | " print(token.orth_, token.pos_)" 246 | ], 247 | "language": "python", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "output_type": "stream", 252 | "stream": "stdout", 253 | "text": [ 254 | "There DET\n", 255 | "is VERB\n", 256 | "an DET\n", 257 | "art NOUN\n", 258 | ", PUNCT\n", 259 | "it PRON\n", 260 | "says VERB\n", 261 | ", PUNCT\n", 262 | "or CONJ\n", 263 | "rather ADV\n", 264 | ", PUNCT\n", 265 | "a DET\n", 266 | "knack NOUN\n", 267 | "to ADP\n", 268 | "flying NOUN\n", 269 | ". PUNCT\n" 270 | ] 271 | } 272 | ], 273 | "prompt_number": 62 274 | }, 275 | { 276 | "cell_type": "code", 277 | "collapsed": false, 278 | "input": [ 279 | "# Let's look at the dependencies of this example:\n", 280 | "example = \"The boy with the spotted dog quickly ran after the firetruck.\"\n", 281 | "parsedEx = parser(example)\n", 282 | "# shown as: original token, dependency tag, head word, left dependents, right dependents\n", 283 | "for token in parsedEx:\n", 284 | " print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights])" 285 | ], 286 | "language": "python", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "output_type": "stream", 291 | "stream": "stdout", 292 | "text": [ 293 | "The det boy [] []\n", 294 | "boy nsubj ran ['The'] ['with']\n", 295 | "with prep boy [] ['dog']\n", 296 | "the det dog [] []\n", 297 | "spotted amod dog [] []\n", 298 | "dog pobj with ['the', 'spotted'] []\n", 299 | "quickly advmod ran [] []\n", 300 | "ran ROOT ran ['boy', 'quickly'] ['after', '.']\n", 301 | "after prep ran [] ['firetruck']\n", 302 | "the det firetruck [] []\n", 303 | "firetruck pobj after ['the'] []\n", 304 | ". 
punct ran [] []\n" 305 | ] 306 | } 307 | ], 308 | "prompt_number": 63 309 | }, 310 | { 311 | "cell_type": "code", 312 | "collapsed": false, 313 | "input": [ 314 | "# Let's look at the named entities of this example:\n", 315 | "example = \"Apple's stocks dropped dramatically after the death of Steve Jobs in October.\"\n", 316 | "parsedEx = parser(example)\n", 317 | "for token in parsedEx:\n", 318 | " print(token.orth_, token.ent_type_ if token.ent_type_ != \"\" else \"(not an entity)\")\n", 319 | "\n", 320 | "print(\"-------------- entities only ---------------\")\n", 321 | "# if you just want the entities and nothing else, you can do access the parsed examples \"ents\" property like this:\n", 322 | "ents = list(parsedEx.ents)\n", 323 | "for entity in ents:\n", 324 | " print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity))" 325 | ], 326 | "language": "python", 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "output_type": "stream", 331 | "stream": "stdout", 332 | "text": [ 333 | "Apple ORG\n", 334 | "'s (not an entity)\n", 335 | "stocks (not an entity)\n", 336 | "dropped (not an entity)\n", 337 | "dramatically (not an entity)\n", 338 | "after (not an entity)\n", 339 | "the (not an entity)\n", 340 | "death (not an entity)\n", 341 | "of (not an entity)\n", 342 | "Steve PERSON\n", 343 | "Jobs (not an entity)\n", 344 | "in (not an entity)\n", 345 | "October DATE\n", 346 | ". (not an entity)\n", 347 | "-------------- entities only ---------------\n", 348 | "274530 ORG Apple\n", 349 | "112504 PERSON Steve Jobs\n", 350 | "71288 DATE October\n" 351 | ] 352 | } 353 | ], 354 | "prompt_number": 3 355 | }, 356 | { 357 | "cell_type": "heading", 358 | "level": 1, 359 | "metadata": {}, 360 | "source": [ 361 | "spaCy is trained to attempt to handle messy data, including emoticons and other web-based features" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "collapsed": false, 367 | "input": [ 368 | "messyData = \"lol that is rly funny :) This is gr8 i rate it 8/8!!!\"\n", 369 | "parsedData = parser(messyData)\n", 370 | "for token in parsedData:\n", 371 | " print(token.orth_, token.pos_, token.lemma_)\n", 372 | " \n", 373 | "# it does pretty well! Note that it does fail on the token \"gr8\", taking it as a verb rather than an adjective meaning \"great\"\n", 374 | "# and \"lol\" probably isn't a noun...it's more like an interjection" 375 | ], 376 | "language": "python", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "output_type": "stream", 381 | "stream": "stdout", 382 | "text": [ 383 | "lol NOUN lol\n", 384 | "that DET that\n", 385 | "is VERB be\n", 386 | "rly ADV rly\n", 387 | "funny ADJ funny\n", 388 | ":) PUNCT :)\n", 389 | "This DET This\n", 390 | "is VERB be\n", 391 | "gr8 VERB gr8\n", 392 | "i PRON i\n", 393 | "rate VERB rate\n", 394 | "it PRON -PRON-\n", 395 | "8/8 NUM 8/8\n", 396 | "! PUNCT !\n", 397 | "! PUNCT !\n", 398 | "! PUNCT !\n" 399 | ] 400 | } 401 | ], 402 | "prompt_number": 65 403 | }, 404 | { 405 | "cell_type": "heading", 406 | "level": 1, 407 | "metadata": {}, 408 | "source": [ 409 | "spaCy has word vector representations built in!" 
410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "collapsed": false, 415 | "input": [ 416 | "from numpy import dot\n", 417 | "from numpy.linalg import norm\n", 418 | "\n", 419 | "# you can access known words from the parser's vocabulary\n", 420 | "nasa = parser.vocab['NASA']\n", 421 | "\n", 422 | "# cosine similarity\n", 423 | "cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))\n", 424 | "\n", 425 | "# gather all known words, take only the lowercased versions\n", 426 | "allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != \"nasa\"})\n", 427 | "\n", 428 | "# sort by similarity to NASA\n", 429 | "allWords.sort(key=lambda w: cosine(w.repvec, nasa.repvec))\n", 430 | "allWords.reverse()\n", 431 | "print(\"Top 20 most similar words to NASA:\")\n", 432 | "for word in allWords[:20]: \n", 433 | " print(word.orth_)\n", 434 | " \n", 435 | "# Let's see if it can figure out this analogy\n", 436 | "# Man is to King as Woman is to ??\n", 437 | "king = parser.vocab['king']\n", 438 | "man = parser.vocab['man']\n", 439 | "woman = parser.vocab['woman']\n", 440 | "\n", 441 | "result = king.repvec - man.repvec + woman.repvec\n", 442 | "\n", 443 | "# gather all known words, take only the lowercased versions\n", 444 | "allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != \"king\" and w.lower_ != \"man\" and w.lower_ != \"woman\"})\n", 445 | "# sort by similarity to the result\n", 446 | "allWords.sort(key=lambda w: cosine(w.repvec, result))\n", 447 | "allWords.reverse()\n", 448 | "print(\"\\n----------------------------\\nTop 3 closest results for king - man + woman:\")\n", 449 | "for word in allWords[:3]: \n", 450 | " print(word.orth_)\n", 451 | " \n", 452 | "# it got it! Queen!" 453 | ], 454 | "language": "python", 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "output_type": "stream", 459 | "stream": "stdout", 460 | "text": [ 461 | "Top 20 most similar words to NASA:\n", 462 | "jpl\n", 463 | "noaa\n", 464 | "esa\n", 465 | "cern\n", 466 | "nih\n", 467 | "norad\n", 468 | "fema\n", 469 | "isro\n", 470 | "usaid\n", 471 | "nsf\n", 472 | "nsa\n", 473 | "dod\n", 474 | "usda\n", 475 | "caltech\n", 476 | "defra\n", 477 | "raytheon\n", 478 | "cia\n", 479 | "unhcr\n", 480 | "fermilab\n", 481 | "cdc\n", 482 | "\n", 483 | "----------------------------\n", 484 | "Top 3 closest results for king - man + woman:" 485 | ] 486 | }, 487 | { 488 | "output_type": "stream", 489 | "stream": "stdout", 490 | "text": [ 491 | "\n", 492 | "queen\n", 493 | "monarch\n", 494 | "princess\n" 495 | ] 496 | } 497 | ], 498 | "prompt_number": 66 499 | }, 500 | { 501 | "cell_type": "heading", 502 | "level": 1, 503 | "metadata": {}, 504 | "source": [ 505 | "You can do cool things like extract Subject, Verb, Object triples from the dependency parse if you use my code in subject_object_extraction.py. Note: Doesn't work on complicated sentences. Fails if the dependency parse is incorrect." 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "collapsed": false, 511 | "input": [ 512 | "from subject_object_extraction import findSVOs\n", 513 | "\n", 514 | "# can still work even without punctuation\n", 515 | "parse = parser(\"he and his brother shot me and my sister\")\n", 516 | "print(findSVOs(parse))\n", 517 | "\n", 518 | "# very complex sample. Only some are correct. 
Some are missed.\n", 519 | "parse = parser(\"Far out in the uncharted backwaters of the unfashionable end of the Western Spiral arm of the Galaxy lies a small unregarded yellow sun. \"\n", 520 | " \"Orbiting this at a distance of roughly ninety-two million miles is an utterly insignificant little blue green planet whose ape-descended \"\n", 521 | " \"life forms are so amazingly primitive that they still think digital watches are a pretty neat idea. \"\n", 522 | " \"This planet has \u2013 or rather had \u2013 a problem, which was this: most of the people living on it were unhappy for pretty much of the time. \"\n", 523 | " \"Many solutions were suggested for this problem, but most of these were largely concerned with the movements of small green pieces of paper, \"\n", 524 | " \"which is odd because on the whole it wasn\u2019t the small green pieces of paper that were unhappy. And so the problem remained; lots of the \"\n", 525 | " \"people were mean, and most of them were miserable, even the ones with digital watches.\")\n", 526 | "print(findSVOs(parse))" 527 | ], 528 | "language": "python", 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "output_type": "stream", 533 | "stream": "stdout", 534 | "text": [ 535 | "[('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')]\n", 536 | "[('orbiting', 'is', 'planet'), ('watches', 'are', 'idea'), ('problem', 'was', 'this'), ('it', 'wasn\u2019t', 'pieces'), ('most', 'were', 'ones')]\n" 537 | ] 538 | } 539 | ], 540 | "prompt_number": 67 541 | }, 542 | { 543 | "cell_type": "heading", 544 | "level": 1, 545 | "metadata": {}, 546 | "source": [ 547 | "If you want to include spaCy in your machine learning it is not too difficult" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "collapsed": false, 553 | "input": [ 554 | "from sklearn.feature_extraction.text import CountVectorizer\n", 555 | "from sklearn.base import TransformerMixin\n", 556 | "from sklearn.pipeline import Pipeline\n", 557 | "from sklearn.svm import LinearSVC\n", 558 | "from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS\n", 559 | "from sklearn.metrics import accuracy_score\n", 560 | "from nltk.corpus import stopwords\n", 561 | "import string\n", 562 | "import re\n", 563 | "\n", 564 | "# A custom stoplist\n", 565 | "STOPLIST = set(stopwords.words('english') + [\"n't\", \"'s\", \"'m\", \"ca\"] + list(ENGLISH_STOP_WORDS))\n", 566 | "# List of symbols we don't care about\n", 567 | "SYMBOLS = \" \".join(string.punctuation).split(\" \") + [\"-----\", \"---\", \"...\", \"\u201c\", \"\u201d\", \"'ve\"]\n", 568 | "\n", 569 | "# Every step in a pipeline needs to be a \"transformer\". 
Define a custom transformer to clean text using spaCy\n", 570 | "class CleanTextTransformer(TransformerMixin):\n", 571 | " \"\"\"\n", 572 | " Convert text to cleaned text\n", 573 | " \"\"\"\n", 574 | "\n", 575 | " def transform(self, X, **transform_params):\n", 576 | " return [cleanText(text) for text in X]\n", 577 | "\n", 578 | " def fit(self, X, y=None, **fit_params):\n", 579 | " return self\n", 580 | "\n", 581 | " def get_params(self, deep=True):\n", 582 | " return {}\n", 583 | " \n", 584 | "# A custom function to clean the text before sending it into the vectorizer\n", 585 | "def cleanText(text):\n", 586 | " # get rid of newlines\n", 587 | " text = text.strip().replace(\"\\n\", \" \").replace(\"\\r\", \" \")\n", 588 | " \n", 589 | " # replace twitter @mentions\n", 590 | " mentionFinder = re.compile(r\"@[a-z0-9_]{1,15}\", re.IGNORECASE)\n", 591 | " text = mentionFinder.sub(\"@MENTION\", text)\n", 592 | " \n", 593 | " # replace HTML symbols\n", 594 | " text = text.replace(\"&\", \"and\").replace(\">\", \">\").replace(\"<\", \"<\")\n", 595 | " \n", 596 | " # lowercase\n", 597 | " text = text.lower()\n", 598 | "\n", 599 | " return text\n", 600 | "\n", 601 | "# A custom function to tokenize the text using spaCy\n", 602 | "# and convert to lemmas\n", 603 | "def tokenizeText(sample):\n", 604 | "\n", 605 | " # get the tokens using spaCy\n", 606 | " tokens = parser(sample)\n", 607 | "\n", 608 | " # lemmatize\n", 609 | " lemmas = []\n", 610 | " for tok in tokens:\n", 611 | " lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != \"-PRON-\" else tok.lower_)\n", 612 | " tokens = lemmas\n", 613 | "\n", 614 | " # stoplist the tokens\n", 615 | " tokens = [tok for tok in tokens if tok not in STOPLIST]\n", 616 | "\n", 617 | " # stoplist symbols\n", 618 | " tokens = [tok for tok in tokens if tok not in SYMBOLS]\n", 619 | "\n", 620 | " # remove large strings of whitespace\n", 621 | " while \"\" in tokens:\n", 622 | " tokens.remove(\"\")\n", 623 | " while \" \" in tokens:\n", 624 | " tokens.remove(\" \")\n", 625 | " while \"\\n\" in tokens:\n", 626 | " tokens.remove(\"\\n\")\n", 627 | " while \"\\n\\n\" in tokens:\n", 628 | " tokens.remove(\"\\n\\n\")\n", 629 | "\n", 630 | " return tokens\n", 631 | "\n", 632 | "def printNMostInformative(vectorizer, clf, N):\n", 633 | " \"\"\"Prints features with the highest coefficient values, per class\"\"\"\n", 634 | " feature_names = vectorizer.get_feature_names()\n", 635 | " coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))\n", 636 | " topClass1 = coefs_with_fns[:N]\n", 637 | " topClass2 = coefs_with_fns[:-(N + 1):-1]\n", 638 | " print(\"Class 1 best: \")\n", 639 | " for feat in topClass1:\n", 640 | " print(feat)\n", 641 | " print(\"Class 2 best: \")\n", 642 | " for feat in topClass2:\n", 643 | " print(feat)\n", 644 | "\n", 645 | "# the vectorizer and classifer to use\n", 646 | "# note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer\n", 647 | "vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))\n", 648 | "clf = LinearSVC()\n", 649 | "# the pipeline to clean, tokenize, vectorize, and classify\n", 650 | "pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])\n", 651 | "\n", 652 | "# data\n", 653 | "train = [\"I love space. Space is great.\", \"Planets are cool. I am glad they exist in space\", \"lol @twitterdude that is gr8\", \n", 654 | " \"twitter & reddit are fun.\", \"Mars is a planet. 
It is red.\", \"@Microsoft: y u skip windows 9?\", \"Rockets launch from Earth and go to other planets.\",\n", 655 | " \"twitter social media > <\", \"@someguy @somegirl @twitter #hashtag\", \"Orbiting the sun is a little blue-green planet.\"]\n", 656 | "labelsTrain = [\"space\", \"space\", \"twitter\", \"twitter\", \"space\", \"twitter\", \"space\", \"twitter\", \"twitter\", \"space\"]\n", 657 | "\n", 658 | "test = [\"i h8 riting comprehensibly #skoolsux\", \"planets and stars and rockets and stuff\"]\n", 659 | "labelsTest = [\"twitter\", \"space\"]\n", 660 | "\n", 661 | "# train\n", 662 | "pipe.fit(train, labelsTrain)\n", 663 | "\n", 664 | "# test\n", 665 | "preds = pipe.predict(test)\n", 666 | "print(\"----------------------------------------------------------------------------------------------\")\n", 667 | "print(\"results:\")\n", 668 | "for (sample, pred) in zip(test, preds):\n", 669 | " print(sample, \":\", pred)\n", 670 | "print(\"accuracy:\", accuracy_score(labelsTest, preds))\n", 671 | "\n", 672 | "print(\"----------------------------------------------------------------------------------------------\")\n", 673 | "print(\"Top 10 features used to predict: \")\n", 674 | "# show the top features\n", 675 | "printNMostInformative(vectorizer, clf, 10)\n", 676 | "\n", 677 | "print(\"----------------------------------------------------------------------------------------------\")\n", 678 | "print(\"The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc\")\n", 679 | "# let's see what the pipeline was transforming the data into\n", 680 | "pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])\n", 681 | "transform = pipe.fit_transform(train, labelsTrain)\n", 682 | "\n", 683 | "# get the features that the vectorizer learned (its vocabulary)\n", 684 | "vocab = vectorizer.get_feature_names()\n", 685 | "\n", 686 | "# the values from the vectorizer transformed data (each item is a row,column index with value as # times occuring in the sample, stored as a sparse matrix)\n", 687 | "for i in range(len(train)):\n", 688 | " s = \"\"\n", 689 | " indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]\n", 690 | " numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]\n", 691 | " for idx, num in zip(indexIntoVocab, numOccurences):\n", 692 | " s += str((vocab[idx], num))\n", 693 | " print(\"Sample {}: {}\".format(i, s))" 694 | ], 695 | "language": "python", 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "output_type": "stream", 700 | "stream": "stdout", 701 | "text": [ 702 | "----------------------------------------------------------------------------------------------\n", 703 | "results:\n", 704 | "i h8 riting comprehensibly #skoolsux : twitter\n", 705 | "planets and stars and rockets and stuff : space\n", 706 | "accuracy: 1.0\n", 707 | "----------------------------------------------------------------------------------------------\n", 708 | "Top 10 features used to predict: \n", 709 | "Class 1 best: \n", 710 | "(-0.52882810587037121, 'planet')\n", 711 | "(-0.35193565503626856, 'space')\n", 712 | "(-0.2182987490483107, 'mar')\n", 713 | "(-0.2182987490483107, 'red')\n", 714 | "(-0.15592826214493352, 'earth')\n", 715 | "(-0.15592826214493352, 'launch')\n", 716 | "(-0.15592826214493352, 'rocket')\n", 717 | "(-0.1482804579342584, 'great')\n", 718 | "(-0.1482804579342584, 'love')\n", 719 | "(-0.099226355509375405, 'blue')\n", 720 | "Class 2 best: \n", 721 | "(0.41129938045689757, 
'twitter')\n", 722 | "(0.34038557663231445, '@mention')\n", 723 | "(0.23401502570811406, 'lol')\n", 724 | "(0.23401502570811406, 'gr8')\n", 725 | "(0.20564996854629114, 'social')\n", 726 | "(0.20564996854629114, 'medium')\n", 727 | "(0.20564941191060651, 'reddit')\n", 728 | "(0.20564941191060651, 'fun')\n", 729 | "(0.10637055092420053, 'y')\n", 730 | "(0.10637055092420053, 'window')\n", 731 | "----------------------------------------------------------------------------------------------\n", 732 | "The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc\n", 733 | "Sample 0: ('love', 1)('space', 2)('great', 1)\n", 734 | "Sample 1: ('space', 1)('planet', 1)('cool', 1)('glad', 1)('exist', 1)\n", 735 | "Sample 2: ('lol', 1)('@mention', 1)('gr8', 1)\n", 736 | "Sample 3: ('twitter', 1)('reddit', 1)('fun', 1)\n", 737 | "Sample 4: ('planet', 1)('mar', 1)('red', 1)\n", 738 | "Sample 5: ('@mention', 1)('y', 1)('u', 1)('skip', 1)('window', 1)('9', 1)\n", 739 | "Sample 6: ('planet', 1)('rocket', 1)('launch', 1)('earth', 1)\n", 740 | "Sample 7: ('twitter', 1)('social', 1)('medium', 1)\n", 741 | "Sample 8: ('@mention', 3)('hashtag', 1)\n", 742 | "Sample 9: ('planet', 1)('orbit', 1)('sun', 1)('little', 1)('blue', 1)('green', 1)\n" 743 | ] 744 | } 745 | ], 746 | "prompt_number": 68 747 | }, 748 | { 749 | "cell_type": "code", 750 | "collapsed": false, 751 | "input": [], 752 | "language": "python", 753 | "metadata": {}, 754 | "outputs": [] 755 | } 756 | ], 757 | "metadata": {} 758 | } 759 | ] 760 | } -------------------------------------------------------------------------------- /Intro_spaCy_NLP.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Set up spaCy 4 | from spacy.en import English 5 | parser = English() 6 | 7 | # Test Data 8 | multiSentence = "There is an art, it says, or rather, a knack to flying." \ 9 | "The knack lies in learning how to throw yourself at the ground and miss." \ 10 | "In the beginning the Universe was created. This has made a lot of people "\ 11 | "very angry and been widely regarded as a bad move." 12 | 13 | # spaCy does tokenization, sentence recognition, part of speech tagging, lemmatization, dependency parsing, and named entity recognition all at once! 14 | 15 | 16 | # all you have to do to parse text is this: 17 | #note: the first time you run spaCy in a file it takes a little while to load up its modules 18 | parsedData = parser(multiSentence) 19 | 20 | 21 | # Let's look at the tokens 22 | # All you have to do is iterate through the parsedData 23 | # Each token is an object with lots of different properties 24 | # A property with an underscore at the end returns the string representation 25 | # while a property without the underscore returns an index (int) into spaCy's vocabulary 26 | # The probability estimate is based on counts from a 3 billion word corpus, smoothed using the Simple Good-Turing method. 
27 | for i, token in enumerate(parsedData): 28 | print("original:", token.orth, token.orth_) 29 | print("lowercased:", token.lower, token.lower_) 30 | print("lemma:", token.lemma, token.lemma_) 31 | print("shape:", token.shape, token.shape_) 32 | print("prefix:", token.prefix, token.prefix_) 33 | print("suffix:", token.suffix, token.suffix_) 34 | print("log probability:", token.prob) 35 | print("Brown cluster id:", token.cluster) 36 | print("----------------------------------------") 37 | if i > 10: 38 | break 39 | 40 | original: 300 There 41 | lowercased: 144 there 42 | lemma: 300 There 43 | shape: 187 Xxxxx 44 | prefix: 32 T 45 | suffix: 66 ere 46 | log probability: -7.663576126098633 47 | Brown cluster id: 1918 48 | ---------------------------------------- 49 | original: 29 is 50 | lowercased: 29 is 51 | lemma: 52 be 52 | shape: 7 xx 53 | prefix: 14 i 54 | suffix: 29 is 55 | log probability: -5.002371311187744 56 | Brown cluster id: 762 57 | ---------------------------------------- 58 | original: 59 an 59 | lowercased: 59 an 60 | lemma: 59 an 61 | shape: 7 xx 62 | prefix: 11 a 63 | suffix: 59 an 64 | log probability: -5.829381465911865 65 | Brown cluster id: 3 66 | ---------------------------------------- 67 | original: 334 art 68 | lowercased: 334 art 69 | lemma: 334 art 70 | shape: 3 xxx 71 | prefix: 11 a 72 | suffix: 334 art 73 | log probability: -9.482678413391113 74 | Brown cluster id: 633 75 | ---------------------------------------- 76 | original: 1 , 77 | lowercased: 1 , 78 | lemma: 1 , 79 | shape: 1 , 80 | prefix: 1 , 81 | suffix: 1 , 82 | log probability: -3.0368354320526123 83 | Brown cluster id: 4 84 | ---------------------------------------- 85 | original: 44 it 86 | lowercased: 44 it 87 | lemma: 906264 -PRON- 88 | shape: 7 xx 89 | prefix: 14 i 90 | suffix: 44 it 91 | log probability: -5.498129367828369 92 | Brown cluster id: 474 93 | ---------------------------------------- 94 | original: 274 says 95 | lowercased: 274 says 96 | lemma: 253 say 97 | shape: 20 xxxx 98 | prefix: 27 s 99 | suffix: 275 ays 100 | log probability: -7.604108810424805 101 | Brown cluster id: 244 102 | ---------------------------------------- 103 | original: 1 , 104 | lowercased: 1 , 105 | lemma: 1 , 106 | shape: 1 , 107 | prefix: 1 , 108 | suffix: 1 , 109 | log probability: -3.0368354320526123 110 | Brown cluster id: 4 111 | ---------------------------------------- 112 | original: 79 or 113 | lowercased: 79 or 114 | lemma: 79 or 115 | shape: 7 xx 116 | prefix: 8 o 117 | suffix: 79 or 118 | log probability: -6.262600898742676 119 | Brown cluster id: 404 120 | ---------------------------------------- 121 | original: 1400 rather 122 | lowercased: 1400 rather 123 | lemma: 1400 rather 124 | shape: 20 xxxx 125 | prefix: 357 r 126 | suffix: 131 her 127 | log probability: -9.074186325073242 128 | Brown cluster id: 6698 129 | ---------------------------------------- 130 | original: 1 , 131 | lowercased: 1 , 132 | lemma: 1 , 133 | shape: 1 , 134 | prefix: 1 , 135 | suffix: 1 , 136 | log probability: -3.0368354320526123 137 | Brown cluster id: 4 138 | ---------------------------------------- 139 | original: 11 a 140 | lowercased: 11 a 141 | lemma: 11 a 142 | shape: 12 x 143 | prefix: 11 a 144 | suffix: 11 a 145 | log probability: -4.003841400146484 146 | Brown cluster id: 19 147 | ---------------------------------------- 148 | 149 | 150 | 151 | # Let's look at the sentences 152 | sents = [] 153 | # the "sents" property returns spans 154 | # spans have indices into the original string 155 | # where each index 
value represents a token 156 | for span in parsedData.sents: 157 | # go from the start to the end of each span, returning each token in the sentence 158 | # combine each token using join() 159 | sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip() 160 | sents.append(sent) 161 | 162 | for sentence in sents: 163 | print(sentence) 164 | 165 | There is an art, it says, or rather, a knack to flying. 166 | The knack lies in learning how to throw yourself at the ground and miss. 167 | In the beginning the Universe was created. 168 | This has made a lot of people very angry and been widely regarded as a bad move. 169 | 170 | 171 | 172 | # Let's look at the part of speech tags of the first sentence 173 | for span in parsedData.sents: 174 | sent = [parsedData[i] for i in range(span.start, span.end)] 175 | break 176 | 177 | for token in sent: 178 | print(token.orth_, token.pos_) 179 | 180 | There DET 181 | is VERB 182 | an DET 183 | art NOUN 184 | , PUNCT 185 | it PRON 186 | says VERB 187 | , PUNCT 188 | or CONJ 189 | rather ADV 190 | , PUNCT 191 | a DET 192 | knack NOUN 193 | to ADP 194 | flying NOUN 195 | . PUNCT 196 | 197 | 198 | 199 | # Let's look at the dependencies of this example: 200 | example = "The boy with the spotted dog quickly ran after the firetruck." 201 | parsedEx = parser(example) 202 | # shown as: original token, dependency tag, head word, left dependents, right dependents 203 | for token in parsedEx: 204 | print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights]) 205 | 206 | The det boy [] [] 207 | boy nsubj ran ['The'] ['with'] 208 | with prep boy [] ['dog'] 209 | the det dog [] [] 210 | spotted amod dog [] [] 211 | dog pobj with ['the', 'spotted'] [] 212 | quickly advmod ran [] [] 213 | ran ROOT ran ['boy', 'quickly'] ['after', '.'] 214 | after prep ran [] ['firetruck'] 215 | the det firetruck [] [] 216 | firetruck pobj after ['the'] [] 217 | . punct ran [] [] 218 | 219 | 220 | 221 | # Let's look at the named entities of this example: 222 | example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October." 223 | parsedEx = parser(example) 224 | for token in parsedEx: 225 | print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)") 226 | 227 | print("-------------- entities only ---------------") 228 | # if you just want the entities and nothing else, you can do access the parsed examples "ents" property like this: 229 | ents = list(parsedEx.ents) 230 | for entity in ents: 231 | print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity)) 232 | 233 | Apple ORG 234 | 's (not an entity) 235 | stocks (not an entity) 236 | dropped (not an entity) 237 | dramatically (not an entity) 238 | after (not an entity) 239 | the (not an entity) 240 | death (not an entity) 241 | of (not an entity) 242 | Steve PERSON 243 | Jobs (not an entity) 244 | in (not an entity) 245 | October DATE 246 | . (not an entity) 247 | -------------- entities only --------------- 248 | 274530 ORG Apple 249 | 112504 PERSON Steve Jobs 250 | 71288 DATE October 251 | 252 | 253 | # spaCy is trained to attempt to handle messy data, including emoticons and other web-based features 254 | 255 | 256 | messyData = "lol that is rly funny :) This is gr8 i rate it 8/8!!!" 257 | parsedData = parser(messyData) 258 | for token in parsedData: 259 | print(token.orth_, token.pos_, token.lemma_) 260 | 261 | # it does pretty well! 
Note that it does fail on the token "gr8", taking it as a verb rather than an adjective meaning "great" 262 | # and "lol" probably isn't a noun...it's more like an interjection 263 | 264 | lol NOUN lol 265 | that DET that 266 | is VERB be 267 | rly ADV rly 268 | funny ADJ funny 269 | :) PUNCT :) 270 | This DET This 271 | is VERB be 272 | gr8 VERB gr8 273 | i PRON i 274 | rate VERB rate 275 | it PRON -PRON- 276 | 8/8 NUM 8/8 277 | ! PUNCT ! 278 | ! PUNCT ! 279 | ! PUNCT ! 280 | 281 | 282 | # spaCy has word vector representations built in! 283 | 284 | 285 | from numpy import dot 286 | from numpy.linalg import norm 287 | 288 | # you can access known words from the parser's vocabulary 289 | nasa = parser.vocab['NASA'] 290 | 291 | # cosine similarity 292 | cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) 293 | 294 | # gather all known words, take only the lowercased versions 295 | allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != "nasa"}) 296 | 297 | # sort by similarity to NASA 298 | allWords.sort(key=lambda w: cosine(w.repvec, nasa.repvec)) 299 | allWords.reverse() 300 | print("Top 20 most similar words to NASA:") 301 | for word in allWords[:20]: 302 | print(word.orth_) 303 | 304 | # Let's see if it can figure out this analogy 305 | # Man is to King as Woman is to ?? 306 | king = parser.vocab['king'] 307 | man = parser.vocab['man'] 308 | woman = parser.vocab['woman'] 309 | 310 | result = king.repvec - man.repvec + woman.repvec 311 | 312 | # gather all known words, take only the lowercased versions 313 | allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"}) 314 | # sort by similarity to the result 315 | allWords.sort(key=lambda w: cosine(w.repvec, result)) 316 | allWords.reverse() 317 | print("\n----------------------------\nTop 3 closest results for king - man + woman:") 318 | for word in allWords[:3]: 319 | print(word.orth_) 320 | 321 | # it got it! Queen! 322 | 323 | Top 20 most similar words to NASA: 324 | jpl 325 | noaa 326 | esa 327 | cern 328 | nih 329 | norad 330 | fema 331 | isro 332 | usaid 333 | nsf 334 | nsa 335 | dod 336 | usda 337 | caltech 338 | defra 339 | raytheon 340 | cia 341 | unhcr 342 | fermilab 343 | cdc 344 | 345 | ---------------------------- 346 | Top 3 closest results for king - man + woman: 347 | queen 348 | monarch 349 | princess 350 | 351 | 352 | # You can do cool things like extract Subject, Verb, Object triples from the dependency parse if you use my code in subject_object_extraction.py. Note: Doesn't work on complicated sentences. Fails if the dependency parse is incorrect. 353 | 354 | 355 | from subject_object_extraction import findSVOs 356 | 357 | # can still work even without punctuation 358 | parse = parser("he and his brother shot me and my sister") 359 | print(findSVOs(parse)) 360 | 361 | # very complex sample. Only some are correct. Some are missed. 362 | parse = parser("Far out in the uncharted backwaters of the unfashionable end of the Western Spiral arm of the Galaxy lies a small unregarded yellow sun. " 363 | "Orbiting this at a distance of roughly ninety-two million miles is an utterly insignificant little blue green planet whose ape-descended " 364 | "life forms are so amazingly primitive that they still think digital watches are a pretty neat idea. " 365 | "This planet has – or rather had – a problem, which was this: most of the people living on it were unhappy for pretty much of the time. 
" 366 | "Many solutions were suggested for this problem, but most of these were largely concerned with the movements of small green pieces of paper, " 367 | "which is odd because on the whole it wasn’t the small green pieces of paper that were unhappy. And so the problem remained; lots of the " 368 | "people were mean, and most of them were miserable, even the ones with digital watches.") 369 | print(findSVOs(parse)) 370 | 371 | [('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')] 372 | [('orbiting', 'is', 'planet'), ('watches', 'are', 'idea'), ('problem', 'was', 'this'), ('it', 'wasn’t', 'pieces'), ('most', 'were', 'ones')] 373 | 374 | 375 | # If you want to include spaCy in your machine learning it is not too difficult 376 | 377 | 378 | from sklearn.feature_extraction.text import CountVectorizer 379 | from sklearn.base import TransformerMixin 380 | from sklearn.pipeline import Pipeline 381 | from sklearn.svm import LinearSVC 382 | from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS 383 | from sklearn.metrics import accuracy_score 384 | from nltk.corpus import stopwords 385 | import string 386 | import re 387 | 388 | # A custom stoplist 389 | STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS)) 390 | # List of symbols we don't care about 391 | SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"] 392 | 393 | # Every step in a pipeline needs to be a "transformer". Define a custom transformer to clean text using spaCy 394 | class CleanTextTransformer(TransformerMixin): 395 | """ 396 | Convert text to cleaned text 397 | """ 398 | 399 | def transform(self, X, **transform_params): 400 | return [cleanText(text) for text in X] 401 | 402 | def fit(self, X, y=None, **fit_params): 403 | return self 404 | 405 | def get_params(self, deep=True): 406 | return {} 407 | 408 | # A custom function to clean the text before sending it into the vectorizer 409 | def cleanText(text): 410 | # get rid of newlines 411 | text = text.strip().replace("\n", " ").replace("\r", " ") 412 | 413 | # replace twitter @mentions 414 | mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) 415 | text = mentionFinder.sub("@MENTION", text) 416 | 417 | # replace HTML symbols 418 | text = text.replace("&", "and").replace(">", ">").replace("<", "<") 419 | 420 | # lowercase 421 | text = text.lower() 422 | 423 | return text 424 | 425 | # A custom function to tokenize the text using spaCy 426 | # and convert to lemmas 427 | def tokenizeText(sample): 428 | 429 | # get the tokens using spaCy 430 | tokens = parser(sample) 431 | 432 | # lemmatize 433 | lemmas = [] 434 | for tok in tokens: 435 | lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_) 436 | tokens = lemmas 437 | 438 | # stoplist the tokens 439 | tokens = [tok for tok in tokens if tok not in STOPLIST] 440 | 441 | # stoplist symbols 442 | tokens = [tok for tok in tokens if tok not in SYMBOLS] 443 | 444 | # remove large strings of whitespace 445 | while "" in tokens: 446 | tokens.remove("") 447 | while " " in tokens: 448 | tokens.remove(" ") 449 | while "\n" in tokens: 450 | tokens.remove("\n") 451 | while "\n\n" in tokens: 452 | tokens.remove("\n\n") 453 | 454 | return tokens 455 | 456 | def printNMostInformative(vectorizer, clf, N): 457 | """Prints features with the highest coefficient values, per class""" 458 | feature_names = vectorizer.get_feature_names() 459 | coefs_with_fns = 
sorted(zip(clf.coef_[0], feature_names)) 460 | topClass1 = coefs_with_fns[:N] 461 | topClass2 = coefs_with_fns[:-(N + 1):-1] 462 | print("Class 1 best: ") 463 | for feat in topClass1: 464 | print(feat) 465 | print("Class 2 best: ") 466 | for feat in topClass2: 467 | print(feat) 468 | 469 | # the vectorizer and classifer to use 470 | # note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer 471 | vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1)) 472 | clf = LinearSVC() 473 | # the pipeline to clean, tokenize, vectorize, and classify 474 | pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)]) 475 | 476 | # data 477 | train = ["I love space. Space is great.", "Planets are cool. I am glad they exist in space", "lol @twitterdude that is gr8", 478 | "twitter & reddit are fun.", "Mars is a planet. It is red.", "@Microsoft: y u skip windows 9?", "Rockets launch from Earth and go to other planets.", 479 | "twitter social media > <", "@someguy @somegirl @twitter #hashtag", "Orbiting the sun is a little blue-green planet."] 480 | labelsTrain = ["space", "space", "twitter", "twitter", "space", "twitter", "space", "twitter", "twitter", "space"] 481 | 482 | test = ["i h8 riting comprehensibly #skoolsux", "planets and stars and rockets and stuff"] 483 | labelsTest = ["twitter", "space"] 484 | 485 | # train 486 | pipe.fit(train, labelsTrain) 487 | 488 | # test 489 | preds = pipe.predict(test) 490 | print("----------------------------------------------------------------------------------------------") 491 | print("results:") 492 | for (sample, pred) in zip(test, preds): 493 | print(sample, ":", pred) 494 | print("accuracy:", accuracy_score(labelsTest, preds)) 495 | 496 | print("----------------------------------------------------------------------------------------------") 497 | print("Top 10 features used to predict: ") 498 | # show the top features 499 | printNMostInformative(vectorizer, clf, 10) 500 | 501 | print("----------------------------------------------------------------------------------------------") 502 | print("The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc") 503 | # let's see what the pipeline was transforming the data into 504 | pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)]) 505 | transform = pipe.fit_transform(train, labelsTrain) 506 | 507 | # get the features that the vectorizer learned (its vocabulary) 508 | vocab = vectorizer.get_feature_names() 509 | 510 | # the values from the vectorizer transformed data (each item is a row,column index with value as # times occuring in the sample, stored as a sparse matrix) 511 | for i in range(len(train)): 512 | s = "" 513 | indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]] 514 | numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]] 515 | for idx, num in zip(indexIntoVocab, numOccurences): 516 | s += str((vocab[idx], num)) 517 | print("Sample {}: {}".format(i, s)) 518 | 519 | ---------------------------------------------------------------------------------------------- 520 | results: 521 | i h8 riting comprehensibly #skoolsux : twitter 522 | planets and stars and rockets and stuff : space 523 | accuracy: 1.0 524 | ---------------------------------------------------------------------------------------------- 525 | Top 10 features used to predict: 526 | Class 1 best: 527 | (-0.52882810587037121, 'planet') 
528 | (-0.35193565503626856, 'space') 529 | (-0.2182987490483107, 'mar') 530 | (-0.2182987490483107, 'red') 531 | (-0.15592826214493352, 'earth') 532 | (-0.15592826214493352, 'launch') 533 | (-0.15592826214493352, 'rocket') 534 | (-0.1482804579342584, 'great') 535 | (-0.1482804579342584, 'love') 536 | (-0.099226355509375405, 'blue') 537 | Class 2 best: 538 | (0.41129938045689757, 'twitter') 539 | (0.34038557663231445, '@mention') 540 | (0.23401502570811406, 'lol') 541 | (0.23401502570811406, 'gr8') 542 | (0.20564996854629114, 'social') 543 | (0.20564996854629114, 'medium') 544 | (0.20564941191060651, 'reddit') 545 | (0.20564941191060651, 'fun') 546 | (0.10637055092420053, 'y') 547 | (0.10637055092420053, 'window') 548 | ---------------------------------------------------------------------------------------------- 549 | The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc 550 | Sample 0: ('love', 1)('space', 2)('great', 1) 551 | Sample 1: ('space', 1)('planet', 1)('cool', 1)('glad', 1)('exist', 1) 552 | Sample 2: ('lol', 1)('@mention', 1)('gr8', 1) 553 | Sample 3: ('twitter', 1)('reddit', 1)('fun', 1) 554 | Sample 4: ('planet', 1)('mar', 1)('red', 1) 555 | Sample 5: ('@mention', 1)('y', 1)('u', 1)('skip', 1)('window', 1)('9', 1) 556 | Sample 6: ('planet', 1)('rocket', 1)('launch', 1)('earth', 1) 557 | Sample 7: ('twitter', 1)('social', 1)('medium', 1) 558 | Sample 8: ('@mention', 3)('hashtag', 1) 559 | Sample 9: ('planet', 1)('orbit', 1)('sun', 1)('little', 1)('blue', 1)('green', 1) 560 | 561 | 562 | 563 | 564 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (C) 2016 J Nicolas Schrading 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to spaCy for NLP and Machine Learning 2 | 3 | ## Dependencies 4 | spaCy 5 | 6 | Scikit-learn 7 | 8 | NLTK 9 | 10 | To get the above requirements (except spaCy) download and install anaconda: 11 | https://store.continuum.io/cshop/anaconda/ 12 | 13 | To install spaCy: 14 | ``` 15 | pip install spacy 16 | python -m spacy.en.download all 17 | ``` 18 | 19 | Make sure to run the above python command, this downloads the models that spaCy needs. 20 | Python 3 is recommended, although Python 2 should work as long as you convert the strings to unicode objects. 21 | 22 | ## Running the files 23 | 24 | 1. From the command line (cmd) or terminal navigate to where this readme and .ipynb file is. 25 | 2. execute "ipython notebook Intro_spaCy_NLP" 26 | -------------------------------------------------------------------------------- /subject_object_extraction.py: -------------------------------------------------------------------------------- 1 | from nltk.stem.wordnet import WordNetLemmatizer 2 | from spacy.en import English 3 | 4 | SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"] 5 | OBJECTS = ["dobj", "dative", "attr", "oprd"] 6 | 7 | def getSubsFromConjunctions(subs): 8 | moreSubs = [] 9 | for sub in subs: 10 | # rights is a generator 11 | rights = list(sub.rights) 12 | rightDeps = {tok.lower_ for tok in rights} 13 | if "and" in rightDeps: 14 | moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"]) 15 | if len(moreSubs) > 0: 16 | moreSubs.extend(getSubsFromConjunctions(moreSubs)) 17 | return moreSubs 18 | 19 | def getObjsFromConjunctions(objs): 20 | moreObjs = [] 21 | for obj in objs: 22 | # rights is a generator 23 | rights = list(obj.rights) 24 | rightDeps = {tok.lower_ for tok in rights} 25 | if "and" in rightDeps: 26 | moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"]) 27 | if len(moreObjs) > 0: 28 | moreObjs.extend(getObjsFromConjunctions(moreObjs)) 29 | return moreObjs 30 | 31 | def getVerbsFromConjunctions(verbs): 32 | moreVerbs = [] 33 | for verb in verbs: 34 | rightDeps = {tok.lower_ for tok in verb.rights} 35 | if "and" in rightDeps: 36 | moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"]) 37 | if len(moreVerbs) > 0: 38 | moreVerbs.extend(getVerbsFromConjunctions(moreVerbs)) 39 | return moreVerbs 40 | 41 | def findSubs(tok): 42 | head = tok.head 43 | while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head: 44 | head = head.head 45 | if head.pos_ == "VERB": 46 | subs = [tok for tok in head.lefts if tok.dep_ == "SUB"] 47 | if len(subs) > 0: 48 | verbNegated = isNegated(head) 49 | subs.extend(getSubsFromConjunctions(subs)) 50 | return subs, verbNegated 51 | elif head.head != head: 52 | return findSubs(head) 53 | elif head.pos_ == "NOUN": 54 | return [head], isNegated(tok) 55 | return [], False 56 | 57 | def isNegated(tok): 58 | negations = {"no", "not", "n't", "never", "none"} 59 | for dep in list(tok.lefts) + list(tok.rights): 60 | if dep.lower_ in negations: 61 | return True 62 | return False 63 | 64 | def findSVs(tokens): 65 | svs = [] 66 | verbs = [tok for tok in tokens if tok.pos_ == "VERB"] 67 | for v in verbs: 68 | subs, verbNegated = getAllSubs(v) 69 | if len(subs) > 0: 70 | for sub in subs: 71 | svs.append((sub.orth_, "!" 
+ v.orth_ if verbNegated else v.orth_)) 72 | return svs 73 | 74 | def getObjsFromPrepositions(deps): 75 | objs = [] 76 | for dep in deps: 77 | if dep.pos_ == "ADP" and dep.dep_ == "prep": 78 | objs.extend([tok for tok in dep.rights if tok.dep_ in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")]) 79 | return objs 80 | 81 | def getObjsFromAttrs(deps): 82 | for dep in deps: 83 | if dep.pos_ == "NOUN" and dep.dep_ == "attr": 84 | verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"] 85 | if len(verbs) > 0: 86 | for v in verbs: 87 | rights = list(v.rights) 88 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 89 | objs.extend(getObjsFromPrepositions(rights)) 90 | if len(objs) > 0: 91 | return v, objs 92 | return None, None 93 | 94 | def getObjFromXComp(deps): 95 | for dep in deps: 96 | if dep.pos_ == "VERB" and dep.dep_ == "xcomp": 97 | v = dep 98 | rights = list(v.rights) 99 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 100 | objs.extend(getObjsFromPrepositions(rights)) 101 | if len(objs) > 0: 102 | return v, objs 103 | return None, None 104 | 105 | def getAllSubs(v): 106 | verbNegated = isNegated(v) 107 | subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"] 108 | if len(subs) > 0: 109 | subs.extend(getSubsFromConjunctions(subs)) 110 | else: 111 | foundSubs, verbNegated = findSubs(v) 112 | subs.extend(foundSubs) 113 | return subs, verbNegated 114 | 115 | def getAllObjs(v): 116 | # rights is a generator 117 | rights = list(v.rights) 118 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 119 | objs.extend(getObjsFromPrepositions(rights)) 120 | 121 | #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights) 122 | #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0: 123 | # objs.extend(potentialNewObjs) 124 | # v = potentialNewVerb 125 | 126 | potentialNewVerb, potentialNewObjs = getObjFromXComp(rights) 127 | if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0: 128 | objs.extend(potentialNewObjs) 129 | v = potentialNewVerb 130 | if len(objs) > 0: 131 | objs.extend(getObjsFromConjunctions(objs)) 132 | return v, objs 133 | 134 | def findSVOs(tokens): 135 | svos = [] 136 | verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"] 137 | for v in verbs: 138 | subs, verbNegated = getAllSubs(v) 139 | # hopefully there are subs, if not, don't examine this verb any longer 140 | if len(subs) > 0: 141 | v, objs = getAllObjs(v) 142 | for sub in subs: 143 | for obj in objs: 144 | objNegated = isNegated(obj) 145 | svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_)) 146 | return svos 147 | 148 | def getAbuserOntoVictimSVOs(tokens): 149 | maleAbuser = {'he', 'boyfriend', 'bf', 'father', 'dad', 'husband', 'brother', 'man'} 150 | femaleAbuser = {'she', 'girlfriend', 'gf', 'mother', 'mom', 'wife', 'sister', 'woman'} 151 | neutralAbuser = {'pastor', 'abuser', 'offender', 'ex', 'x', 'lover', 'church', 'they'} 152 | victim = {'me', 'sister', 'brother', 'child', 'kid', 'baby', 'friend', 'her', 'him', 'man', 'woman'} 153 | 154 | svos = findSVOs(tokens) 155 | wnl = WordNetLemmatizer() 156 | passed = [] 157 | for s, v, o in svos: 158 | s = wnl.lemmatize(s) 159 | v = "!" + wnl.lemmatize(v[1:], 'v') if v[0] == "!" else wnl.lemmatize(v, 'v') 160 | o = "!" + wnl.lemmatize(o[1:]) if o[0] == "!" 
else wnl.lemmatize(o) 161 | if s in maleAbuser.union(femaleAbuser).union(neutralAbuser) and o in victim: 162 | passed.append((s, v, o)) 163 | return passed 164 | 165 | def printDeps(toks): 166 | for tok in toks: 167 | print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights]) 168 | 169 | def testSVOs(): 170 | nlp = English() 171 | 172 | tok = nlp("making $12 an hour? where am i going to go? i have no other financial assistance available and he certainly won't provide support.") 173 | svos = findSVOs(tok) 174 | printDeps(tok) 175 | assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')} 176 | print(svos) 177 | 178 | tok = nlp("i don't have other assistance") 179 | svos = findSVOs(tok) 180 | printDeps(tok) 181 | assert set(svos) == {('i', '!have', 'assistance')} 182 | 183 | print("-----------------------------------------------") 184 | tok = nlp("They ate the pizza with anchovies.") 185 | svos = findSVOs(tok) 186 | printDeps(tok) 187 | print(svos) 188 | assert set(svos) == {('they', 'ate', 'pizza')} 189 | 190 | print("--------------------------------------------------") 191 | tok = nlp("I have no other financial assistance available and he certainly won't provide support.") 192 | svos = findSVOs(tok) 193 | printDeps(tok) 194 | print(svos) 195 | assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')} 196 | 197 | print("--------------------------------------------------") 198 | tok = nlp("I have no other financial assistance available, and he certainly won't provide support.") 199 | svos = findSVOs(tok) 200 | printDeps(tok) 201 | print(svos) 202 | assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')} 203 | 204 | print("--------------------------------------------------") 205 | tok = nlp("he did not kill me") 206 | svos = findSVOs(tok) 207 | printDeps(tok) 208 | print(svos) 209 | assert set(svos) == {('he', '!kill', 'me')} 210 | 211 | #print("--------------------------------------------------") 212 | #tok = nlp("he is an evil man that hurt my child and sister") 213 | #svos = findSVOs(tok) 214 | #printDeps(tok) 215 | #print(svos) 216 | #assert set(svos) == {('he', 'hurt', 'child'), ('he', 'hurt', 'sister'), ('man', 'hurt', 'child'), ('man', 'hurt', 'sister')} 217 | 218 | print("--------------------------------------------------") 219 | tok = nlp("he told me i would die alone with nothing but my career someday") 220 | svos = findSVOs(tok) 221 | printDeps(tok) 222 | print(svos) 223 | assert set(svos) == {('he', 'told', 'me')} 224 | 225 | print("--------------------------------------------------") 226 | tok = nlp("I wanted to kill him with a hammer.") 227 | svos = findSVOs(tok) 228 | printDeps(tok) 229 | print(svos) 230 | assert set(svos) == {('i', 'kill', 'him')} 231 | 232 | print("--------------------------------------------------") 233 | tok = nlp("because he hit me and also made me so angry i wanted to kill him with a hammer.") 234 | svos = findSVOs(tok) 235 | printDeps(tok) 236 | print(svos) 237 | assert set(svos) == {('he', 'hit', 'me'), ('i', 'kill', 'him')} 238 | 239 | print("--------------------------------------------------") 240 | tok = nlp("he and his brother shot me") 241 | svos = findSVOs(tok) 242 | printDeps(tok) 243 | print(svos) 244 | assert set(svos) == {('he', 'shot', 'me'), ('brother', 'shot', 'me')} 245 | 246 | print("--------------------------------------------------") 247 | tok = nlp("he and his brother shot me and my sister") 248 | svos = 
findSVOs(tok) 249 | printDeps(tok) 250 | print(svos) 251 | assert set(svos) == {('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')} 252 | 253 | print("--------------------------------------------------") 254 | tok = nlp("the annoying person that was my boyfriend hit me") 255 | svos = findSVOs(tok) 256 | printDeps(tok) 257 | print(svos) 258 | assert set(svos) == {('person', 'was', 'boyfriend'), ('person', 'hit', 'me')} 259 | 260 | print("--------------------------------------------------") 261 | tok = nlp("the boy raced the girl who had a hat that had spots.") 262 | svos = findSVOs(tok) 263 | printDeps(tok) 264 | print(svos) 265 | assert set(svos) == {('boy', 'raced', 'girl'), ('who', 'had', 'hat'), ('hat', 'had', 'spots')} 266 | 267 | print("--------------------------------------------------") 268 | tok = nlp("he spit on me") 269 | svos = findSVOs(tok) 270 | printDeps(tok) 271 | print(svos) 272 | assert set(svos) == {('he', 'spit', 'me')} 273 | 274 | print("--------------------------------------------------") 275 | tok = nlp("he didn't spit on me") 276 | svos = findSVOs(tok) 277 | printDeps(tok) 278 | print(svos) 279 | assert set(svos) == {('he', '!spit', 'me')} 280 | 281 | print("--------------------------------------------------") 282 | tok = nlp("the boy raced the girl who had a hat that didn't have spots.") 283 | svos = findSVOs(tok) 284 | printDeps(tok) 285 | print(svos) 286 | assert set(svos) == {('boy', 'raced', 'girl'), ('who', 'had', 'hat'), ('hat', '!have', 'spots')} 287 | 288 | print("--------------------------------------------------") 289 | tok = nlp("he is a nice man that didn't hurt my child and sister") 290 | svos = findSVOs(tok) 291 | printDeps(tok) 292 | print(svos) 293 | assert set(svos) == {('he', 'is', 'man'), ('man', '!hurt', 'child'), ('man', '!hurt', 'sister')} 294 | 295 | print("--------------------------------------------------") 296 | tok = nlp("he didn't spit on me and my child") 297 | svos = findSVOs(tok) 298 | printDeps(tok) 299 | print(svos) 300 | assert set(svos) == {('he', '!spit', 'me'), ('he', '!spit', 'child')} 301 | 302 | print("--------------------------------------------------") 303 | tok = nlp("he beat and hurt me") 304 | svos = findSVOs(tok) 305 | printDeps(tok) 306 | print(svos) 307 | # tok = nlp("he beat and hurt me") 308 | 309 | def main(): 310 | testSVOs() 311 | 312 | if __name__ == "__main__": 313 | main() --------------------------------------------------------------------------------
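
A short, hedged usage sketch for `getAbuserOntoVictimSVOs`, the one helper in subject_object_extraction.py that the notebook does not demonstrate. It reuses a sample sentence from the notebook and assumes the same legacy `spacy.en` English model used throughout this repo, plus NLTK's WordNet data (required by `WordNetLemmatizer`); the printed output is illustrative, not guaranteed.

```
# Usage sketch (assumes the legacy spacy.en model and NLTK WordNet data,
# e.g. downloaded via nltk.download('wordnet')).
from spacy.en import English
from subject_object_extraction import getAbuserOntoVictimSVOs

parser = English()
parse = parser("he and his brother shot me and my sister")

# getAbuserOntoVictimSVOs runs findSVOs, lemmatizes each triple, and keeps only
# triples whose subject is in the abuser word sets and whose object is in the
# victim set; negated verbs carry a leading "!".
print(getAbuserOntoVictimSVOs(parse))
# illustrative output:
# [('he', 'shoot', 'me'), ('he', 'shoot', 'sister'),
#  ('brother', 'shoot', 'me'), ('brother', 'shoot', 'sister')]
```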