├── Intro_spaCy_NLP.ipynb ├── Intro_spaCy_NLP.md ├── LICENSE ├── README.md └── subject_object_extraction.py /Intro_spaCy_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:6842b637870ccb1c99f940c90e75c511dc193e15ea69d132c85630a4dc6ce0df" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "# Set up spaCy\n", 16 | "from spacy.en import English\n", 17 | "parser = English()\n", 18 | "\n", 19 | "# Test Data\n", 20 | "multiSentence = \"There is an art, it says, or rather, a knack to flying.\" \\\n", 21 | " \"The knack lies in learning how to throw yourself at the ground and miss.\" \\\n", 22 | " \"In the beginning the Universe was created. This has made a lot of people \"\\\n", 23 | " \"very angry and been widely regarded as a bad move.\"" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [], 28 | "prompt_number": 2 29 | }, 30 | { 31 | "cell_type": "heading", 32 | "level": 1, 33 | "metadata": {}, 34 | "source": [ 35 | "spaCy does tokenization, sentence recognition, part of speech tagging, lemmatization, dependency parsing, and named entity recognition all at once!" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "collapsed": false, 41 | "input": [ 42 | "# all you have to do to parse text is this:\n", 43 | "#note: the first time you run spaCy in a file it takes a little while to load up its modules\n", 44 | "parsedData = parser(multiSentence)" 45 | ], 46 | "language": "python", 47 | "metadata": {}, 48 | "outputs": [], 49 | "prompt_number": 59 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "# Let's look at the tokens\n", 56 | "# All you have to do is iterate through the parsedData\n", 57 | "# Each token is an object with lots of different properties\n", 58 | "# A property with an underscore at the end returns the string representation\n", 59 | "# while a property without the underscore returns an index (int) into spaCy's vocabulary\n", 60 | "# The probability estimate is based on counts from a 3 billion word corpus, smoothed using the Simple Good-Turing method.\n", 61 | "for i, token in enumerate(parsedData):\n", 62 | " print(\"original:\", token.orth, token.orth_)\n", 63 | " print(\"lowercased:\", token.lower, token.lower_)\n", 64 | " print(\"lemma:\", token.lemma, token.lemma_)\n", 65 | " print(\"shape:\", token.shape, token.shape_)\n", 66 | " print(\"prefix:\", token.prefix, token.prefix_)\n", 67 | " print(\"suffix:\", token.suffix, token.suffix_)\n", 68 | " print(\"log probability:\", token.prob)\n", 69 | " print(\"Brown cluster id:\", token.cluster)\n", 70 | " print(\"----------------------------------------\")\n", 71 | " if i > 10:\n", 72 | " break" 73 | ], 74 | "language": "python", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "output_type": "stream", 79 | "stream": "stdout", 80 | "text": [ 81 | "original: 300 There\n", 82 | "lowercased: 144 there\n", 83 | "lemma: 300 There\n", 84 | "shape: 187 Xxxxx\n", 85 | "prefix: 32 T\n", 86 | "suffix: 66 ere\n", 87 | "log probability: -7.663576126098633\n", 88 | "Brown cluster id: 1918\n", 89 | "----------------------------------------\n", 90 | "original: 29 is\n", 91 | "lowercased: 29 is\n", 92 | "lemma: 52 be\n", 93 | "shape: 7 xx\n", 94 | "prefix: 14 i\n", 95 | "suffix: 29 is\n", 96 | "log probability: -5.002371311187744\n", 97 | "Brown cluster id: 
762\n", 98 | "----------------------------------------\n", 99 | "original: 59 an\n", 100 | "lowercased: 59 an\n", 101 | "lemma: 59 an\n", 102 | "shape: 7 xx\n", 103 | "prefix: 11 a\n", 104 | "suffix: 59 an\n", 105 | "log probability: -5.829381465911865\n", 106 | "Brown cluster id: 3\n", 107 | "----------------------------------------\n", 108 | "original: 334 art\n", 109 | "lowercased: 334 art\n", 110 | "lemma: 334 art\n", 111 | "shape: 3 xxx\n", 112 | "prefix: 11 a\n", 113 | "suffix: 334 art\n", 114 | "log probability: -9.482678413391113\n", 115 | "Brown cluster id: 633\n", 116 | "----------------------------------------\n", 117 | "original: 1 ,\n", 118 | "lowercased: 1 ,\n", 119 | "lemma: 1 ,\n", 120 | "shape: 1 ,\n", 121 | "prefix: 1 ,\n", 122 | "suffix: 1 ,\n", 123 | "log probability: -3.0368354320526123\n", 124 | "Brown cluster id: 4\n", 125 | "----------------------------------------\n", 126 | "original: 44 it\n", 127 | "lowercased: 44 it\n", 128 | "lemma: 906264 -PRON-\n", 129 | "shape: 7 xx\n", 130 | "prefix: 14 i\n", 131 | "suffix: 44 it\n", 132 | "log probability: -5.498129367828369\n", 133 | "Brown cluster id: 474\n", 134 | "----------------------------------------\n", 135 | "original: 274 says\n", 136 | "lowercased: 274 says\n", 137 | "lemma: 253 say\n", 138 | "shape: 20 xxxx\n", 139 | "prefix: 27 s\n", 140 | "suffix: 275 ays\n", 141 | "log probability: -7.604108810424805\n", 142 | "Brown cluster id: 244\n", 143 | "----------------------------------------\n", 144 | "original: 1 ,\n", 145 | "lowercased: 1 ,\n", 146 | "lemma: 1 ,\n", 147 | "shape: 1 ,\n", 148 | "prefix: 1 ,\n", 149 | "suffix: 1 ,\n", 150 | "log probability: -3.0368354320526123\n", 151 | "Brown cluster id: 4\n", 152 | "----------------------------------------\n", 153 | "original: 79 or\n", 154 | "lowercased: 79 or\n", 155 | "lemma: 79 or\n", 156 | "shape: 7 xx\n", 157 | "prefix: 8 o\n", 158 | "suffix: 79 or\n", 159 | "log probability: -6.262600898742676\n", 160 | "Brown cluster id: 404\n", 161 | "----------------------------------------\n", 162 | "original: 1400 rather\n", 163 | "lowercased: 1400 rather\n", 164 | "lemma: 1400 rather\n", 165 | "shape: 20 xxxx\n", 166 | "prefix: 357 r\n", 167 | "suffix: 131 her\n", 168 | "log probability: -9.074186325073242\n", 169 | "Brown cluster id: 6698\n", 170 | "----------------------------------------\n", 171 | "original: 1 ,\n", 172 | "lowercased: 1 ,\n", 173 | "lemma: 1 ,\n", 174 | "shape: 1 ,\n", 175 | "prefix: 1 ,\n", 176 | "suffix: 1 ,\n", 177 | "log probability: -3.0368354320526123\n", 178 | "Brown cluster id: 4\n", 179 | "----------------------------------------\n", 180 | "original:" 181 | ] 182 | }, 183 | { 184 | "output_type": "stream", 185 | "stream": "stdout", 186 | "text": [ 187 | " 11 a\n", 188 | "lowercased: 11 a\n", 189 | "lemma: 11 a\n", 190 | "shape: 12 x\n", 191 | "prefix: 11 a\n", 192 | "suffix: 11 a\n", 193 | "log probability: -4.003841400146484\n", 194 | "Brown cluster id: 19\n", 195 | "----------------------------------------\n" 196 | ] 197 | } 198 | ], 199 | "prompt_number": 60 200 | }, 201 | { 202 | "cell_type": "code", 203 | "collapsed": false, 204 | "input": [ 205 | "# Let's look at the sentences\n", 206 | "sents = []\n", 207 | "# the \"sents\" property returns spans\n", 208 | "# spans have indices into the original string\n", 209 | "# where each index value represents a token\n", 210 | "for span in parsedData.sents:\n", 211 | " # go from the start to the end of each span, returning each token in the sentence\n", 212 | " # combine each token using 
join()\n", 213 | " sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip()\n", 214 | " sents.append(sent)\n", 215 | "\n", 216 | "for sentence in sents:\n", 217 | " print(sentence)" 218 | ], 219 | "language": "python", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "output_type": "stream", 224 | "stream": "stdout", 225 | "text": [ 226 | "There is an art, it says, or rather, a knack to flying.\n", 227 | "The knack lies in learning how to throw yourself at the ground and miss.\n", 228 | "In the beginning the Universe was created.\n", 229 | "This has made a lot of people very angry and been widely regarded as a bad move.\n" 230 | ] 231 | } 232 | ], 233 | "prompt_number": 61 234 | }, 235 | { 236 | "cell_type": "code", 237 | "collapsed": false, 238 | "input": [ 239 | "# Let's look at the part of speech tags of the first sentence\n", 240 | "for span in parsedData.sents:\n", 241 | " sent = [parsedData[i] for i in range(span.start, span.end)]\n", 242 | " break\n", 243 | "\n", 244 | "for token in sent:\n", 245 | " print(token.orth_, token.pos_)" 246 | ], 247 | "language": "python", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "output_type": "stream", 252 | "stream": "stdout", 253 | "text": [ 254 | "There DET\n", 255 | "is VERB\n", 256 | "an DET\n", 257 | "art NOUN\n", 258 | ", PUNCT\n", 259 | "it PRON\n", 260 | "says VERB\n", 261 | ", PUNCT\n", 262 | "or CONJ\n", 263 | "rather ADV\n", 264 | ", PUNCT\n", 265 | "a DET\n", 266 | "knack NOUN\n", 267 | "to ADP\n", 268 | "flying NOUN\n", 269 | ". PUNCT\n" 270 | ] 271 | } 272 | ], 273 | "prompt_number": 62 274 | }, 275 | { 276 | "cell_type": "code", 277 | "collapsed": false, 278 | "input": [ 279 | "# Let's look at the dependencies of this example:\n", 280 | "example = \"The boy with the spotted dog quickly ran after the firetruck.\"\n", 281 | "parsedEx = parser(example)\n", 282 | "# shown as: original token, dependency tag, head word, left dependents, right dependents\n", 283 | "for token in parsedEx:\n", 284 | " print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights])" 285 | ], 286 | "language": "python", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "output_type": "stream", 291 | "stream": "stdout", 292 | "text": [ 293 | "The det boy [] []\n", 294 | "boy nsubj ran ['The'] ['with']\n", 295 | "with prep boy [] ['dog']\n", 296 | "the det dog [] []\n", 297 | "spotted amod dog [] []\n", 298 | "dog pobj with ['the', 'spotted'] []\n", 299 | "quickly advmod ran [] []\n", 300 | "ran ROOT ran ['boy', 'quickly'] ['after', '.']\n", 301 | "after prep ran [] ['firetruck']\n", 302 | "the det firetruck [] []\n", 303 | "firetruck pobj after ['the'] []\n", 304 | ". 
punct ran [] []\n" 305 | ] 306 | } 307 | ], 308 | "prompt_number": 63 309 | }, 310 | { 311 | "cell_type": "code", 312 | "collapsed": false, 313 | "input": [ 314 | "# Let's look at the named entities of this example:\n", 315 | "example = \"Apple's stocks dropped dramatically after the death of Steve Jobs in October.\"\n", 316 | "parsedEx = parser(example)\n", 317 | "for token in parsedEx:\n", 318 | " print(token.orth_, token.ent_type_ if token.ent_type_ != \"\" else \"(not an entity)\")\n", 319 | "\n", 320 | "print(\"-------------- entities only ---------------\")\n", 321 | "# if you just want the entities and nothing else, you can do access the parsed examples \"ents\" property like this:\n", 322 | "ents = list(parsedEx.ents)\n", 323 | "for entity in ents:\n", 324 | " print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity))" 325 | ], 326 | "language": "python", 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "output_type": "stream", 331 | "stream": "stdout", 332 | "text": [ 333 | "Apple ORG\n", 334 | "'s (not an entity)\n", 335 | "stocks (not an entity)\n", 336 | "dropped (not an entity)\n", 337 | "dramatically (not an entity)\n", 338 | "after (not an entity)\n", 339 | "the (not an entity)\n", 340 | "death (not an entity)\n", 341 | "of (not an entity)\n", 342 | "Steve PERSON\n", 343 | "Jobs (not an entity)\n", 344 | "in (not an entity)\n", 345 | "October DATE\n", 346 | ". (not an entity)\n", 347 | "-------------- entities only ---------------\n", 348 | "274530 ORG Apple\n", 349 | "112504 PERSON Steve Jobs\n", 350 | "71288 DATE October\n" 351 | ] 352 | } 353 | ], 354 | "prompt_number": 3 355 | }, 356 | { 357 | "cell_type": "heading", 358 | "level": 1, 359 | "metadata": {}, 360 | "source": [ 361 | "spaCy is trained to attempt to handle messy data, including emoticons and other web-based features" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "collapsed": false, 367 | "input": [ 368 | "messyData = \"lol that is rly funny :) This is gr8 i rate it 8/8!!!\"\n", 369 | "parsedData = parser(messyData)\n", 370 | "for token in parsedData:\n", 371 | " print(token.orth_, token.pos_, token.lemma_)\n", 372 | " \n", 373 | "# it does pretty well! Note that it does fail on the token \"gr8\", taking it as a verb rather than an adjective meaning \"great\"\n", 374 | "# and \"lol\" probably isn't a noun...it's more like an interjection" 375 | ], 376 | "language": "python", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "output_type": "stream", 381 | "stream": "stdout", 382 | "text": [ 383 | "lol NOUN lol\n", 384 | "that DET that\n", 385 | "is VERB be\n", 386 | "rly ADV rly\n", 387 | "funny ADJ funny\n", 388 | ":) PUNCT :)\n", 389 | "This DET This\n", 390 | "is VERB be\n", 391 | "gr8 VERB gr8\n", 392 | "i PRON i\n", 393 | "rate VERB rate\n", 394 | "it PRON -PRON-\n", 395 | "8/8 NUM 8/8\n", 396 | "! PUNCT !\n", 397 | "! PUNCT !\n", 398 | "! PUNCT !\n" 399 | ] 400 | } 401 | ], 402 | "prompt_number": 65 403 | }, 404 | { 405 | "cell_type": "heading", 406 | "level": 1, 407 | "metadata": {}, 408 | "source": [ 409 | "spaCy has word vector representations built in!" 
410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "collapsed": false, 415 | "input": [ 416 | "from numpy import dot\n", 417 | "from numpy.linalg import norm\n", 418 | "\n", 419 | "# you can access known words from the parser's vocabulary\n", 420 | "nasa = parser.vocab['NASA']\n", 421 | "\n", 422 | "# cosine similarity\n", 423 | "cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))\n", 424 | "\n", 425 | "# gather all known words, take only the lowercased versions\n", 426 | "allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != \"nasa\"})\n", 427 | "\n", 428 | "# sort by similarity to NASA\n", 429 | "allWords.sort(key=lambda w: cosine(w.repvec, nasa.repvec))\n", 430 | "allWords.reverse()\n", 431 | "print(\"Top 20 most similar words to NASA:\")\n", 432 | "for word in allWords[:20]: \n", 433 | " print(word.orth_)\n", 434 | " \n", 435 | "# Let's see if it can figure out this analogy\n", 436 | "# Man is to King as Woman is to ??\n", 437 | "king = parser.vocab['king']\n", 438 | "man = parser.vocab['man']\n", 439 | "woman = parser.vocab['woman']\n", 440 | "\n", 441 | "result = king.repvec - man.repvec + woman.repvec\n", 442 | "\n", 443 | "# gather all known words, take only the lowercased versions\n", 444 | "allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != \"king\" and w.lower_ != \"man\" and w.lower_ != \"woman\"})\n", 445 | "# sort by similarity to the result\n", 446 | "allWords.sort(key=lambda w: cosine(w.repvec, result))\n", 447 | "allWords.reverse()\n", 448 | "print(\"\\n----------------------------\\nTop 3 closest results for king - man + woman:\")\n", 449 | "for word in allWords[:3]: \n", 450 | " print(word.orth_)\n", 451 | " \n", 452 | "# it got it! Queen!" 453 | ], 454 | "language": "python", 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "output_type": "stream", 459 | "stream": "stdout", 460 | "text": [ 461 | "Top 20 most similar words to NASA:\n", 462 | "jpl\n", 463 | "noaa\n", 464 | "esa\n", 465 | "cern\n", 466 | "nih\n", 467 | "norad\n", 468 | "fema\n", 469 | "isro\n", 470 | "usaid\n", 471 | "nsf\n", 472 | "nsa\n", 473 | "dod\n", 474 | "usda\n", 475 | "caltech\n", 476 | "defra\n", 477 | "raytheon\n", 478 | "cia\n", 479 | "unhcr\n", 480 | "fermilab\n", 481 | "cdc\n", 482 | "\n", 483 | "----------------------------\n", 484 | "Top 3 closest results for king - man + woman:" 485 | ] 486 | }, 487 | { 488 | "output_type": "stream", 489 | "stream": "stdout", 490 | "text": [ 491 | "\n", 492 | "queen\n", 493 | "monarch\n", 494 | "princess\n" 495 | ] 496 | } 497 | ], 498 | "prompt_number": 66 499 | }, 500 | { 501 | "cell_type": "heading", 502 | "level": 1, 503 | "metadata": {}, 504 | "source": [ 505 | "You can do cool things like extract Subject, Verb, Object triples from the dependency parse if you use my code in subject_object_extraction.py. Note: Doesn't work on complicated sentences. Fails if the dependency parse is incorrect." 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "collapsed": false, 511 | "input": [ 512 | "from subject_object_extraction import findSVOs\n", 513 | "\n", 514 | "# can still work even without punctuation\n", 515 | "parse = parser(\"he and his brother shot me and my sister\")\n", 516 | "print(findSVOs(parse))\n", 517 | "\n", 518 | "# very complex sample. Only some are correct. 
Some are missed.\n", 519 | "parse = parser(\"Far out in the uncharted backwaters of the unfashionable end of the Western Spiral arm of the Galaxy lies a small unregarded yellow sun. \"\n", 520 | " \"Orbiting this at a distance of roughly ninety-two million miles is an utterly insignificant little blue green planet whose ape-descended \"\n", 521 | " \"life forms are so amazingly primitive that they still think digital watches are a pretty neat idea. \"\n", 522 | " \"This planet has \u2013 or rather had \u2013 a problem, which was this: most of the people living on it were unhappy for pretty much of the time. \"\n", 523 | " \"Many solutions were suggested for this problem, but most of these were largely concerned with the movements of small green pieces of paper, \"\n", 524 | " \"which is odd because on the whole it wasn\u2019t the small green pieces of paper that were unhappy. And so the problem remained; lots of the \"\n", 525 | " \"people were mean, and most of them were miserable, even the ones with digital watches.\")\n", 526 | "print(findSVOs(parse))" 527 | ], 528 | "language": "python", 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "output_type": "stream", 533 | "stream": "stdout", 534 | "text": [ 535 | "[('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')]\n", 536 | "[('orbiting', 'is', 'planet'), ('watches', 'are', 'idea'), ('problem', 'was', 'this'), ('it', 'wasn\u2019t', 'pieces'), ('most', 'were', 'ones')]\n" 537 | ] 538 | } 539 | ], 540 | "prompt_number": 67 541 | }, 542 | { 543 | "cell_type": "heading", 544 | "level": 1, 545 | "metadata": {}, 546 | "source": [ 547 | "If you want to include spaCy in your machine learning it is not too difficult" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "collapsed": false, 553 | "input": [ 554 | "from sklearn.feature_extraction.text import CountVectorizer\n", 555 | "from sklearn.base import TransformerMixin\n", 556 | "from sklearn.pipeline import Pipeline\n", 557 | "from sklearn.svm import LinearSVC\n", 558 | "from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS\n", 559 | "from sklearn.metrics import accuracy_score\n", 560 | "from nltk.corpus import stopwords\n", 561 | "import string\n", 562 | "import re\n", 563 | "\n", 564 | "# A custom stoplist\n", 565 | "STOPLIST = set(stopwords.words('english') + [\"n't\", \"'s\", \"'m\", \"ca\"] + list(ENGLISH_STOP_WORDS))\n", 566 | "# List of symbols we don't care about\n", 567 | "SYMBOLS = \" \".join(string.punctuation).split(\" \") + [\"-----\", \"---\", \"...\", \"\u201c\", \"\u201d\", \"'ve\"]\n", 568 | "\n", 569 | "# Every step in a pipeline needs to be a \"transformer\". 
Define a custom transformer to clean text using spaCy\n", 570 | "class CleanTextTransformer(TransformerMixin):\n", 571 | " \"\"\"\n", 572 | " Convert text to cleaned text\n", 573 | " \"\"\"\n", 574 | "\n", 575 | " def transform(self, X, **transform_params):\n", 576 | " return [cleanText(text) for text in X]\n", 577 | "\n", 578 | " def fit(self, X, y=None, **fit_params):\n", 579 | " return self\n", 580 | "\n", 581 | " def get_params(self, deep=True):\n", 582 | " return {}\n", 583 | " \n", 584 | "# A custom function to clean the text before sending it into the vectorizer\n", 585 | "def cleanText(text):\n", 586 | " # get rid of newlines\n", 587 | " text = text.strip().replace(\"\\n\", \" \").replace(\"\\r\", \" \")\n", 588 | " \n", 589 | " # replace twitter @mentions\n", 590 | " mentionFinder = re.compile(r\"@[a-z0-9_]{1,15}\", re.IGNORECASE)\n", 591 | " text = mentionFinder.sub(\"@MENTION\", text)\n", 592 | " \n", 593 | " # replace HTML symbols\n", 594 | " text = text.replace(\"&\", \"and\").replace(\">\", \">\").replace(\"<\", \"<\")\n", 595 | " \n", 596 | " # lowercase\n", 597 | " text = text.lower()\n", 598 | "\n", 599 | " return text\n", 600 | "\n", 601 | "# A custom function to tokenize the text using spaCy\n", 602 | "# and convert to lemmas\n", 603 | "def tokenizeText(sample):\n", 604 | "\n", 605 | " # get the tokens using spaCy\n", 606 | " tokens = parser(sample)\n", 607 | "\n", 608 | " # lemmatize\n", 609 | " lemmas = []\n", 610 | " for tok in tokens:\n", 611 | " lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != \"-PRON-\" else tok.lower_)\n", 612 | " tokens = lemmas\n", 613 | "\n", 614 | " # stoplist the tokens\n", 615 | " tokens = [tok for tok in tokens if tok not in STOPLIST]\n", 616 | "\n", 617 | " # stoplist symbols\n", 618 | " tokens = [tok for tok in tokens if tok not in SYMBOLS]\n", 619 | "\n", 620 | " # remove large strings of whitespace\n", 621 | " while \"\" in tokens:\n", 622 | " tokens.remove(\"\")\n", 623 | " while \" \" in tokens:\n", 624 | " tokens.remove(\" \")\n", 625 | " while \"\\n\" in tokens:\n", 626 | " tokens.remove(\"\\n\")\n", 627 | " while \"\\n\\n\" in tokens:\n", 628 | " tokens.remove(\"\\n\\n\")\n", 629 | "\n", 630 | " return tokens\n", 631 | "\n", 632 | "def printNMostInformative(vectorizer, clf, N):\n", 633 | " \"\"\"Prints features with the highest coefficient values, per class\"\"\"\n", 634 | " feature_names = vectorizer.get_feature_names()\n", 635 | " coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))\n", 636 | " topClass1 = coefs_with_fns[:N]\n", 637 | " topClass2 = coefs_with_fns[:-(N + 1):-1]\n", 638 | " print(\"Class 1 best: \")\n", 639 | " for feat in topClass1:\n", 640 | " print(feat)\n", 641 | " print(\"Class 2 best: \")\n", 642 | " for feat in topClass2:\n", 643 | " print(feat)\n", 644 | "\n", 645 | "# the vectorizer and classifer to use\n", 646 | "# note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer\n", 647 | "vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))\n", 648 | "clf = LinearSVC()\n", 649 | "# the pipeline to clean, tokenize, vectorize, and classify\n", 650 | "pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])\n", 651 | "\n", 652 | "# data\n", 653 | "train = [\"I love space. Space is great.\", \"Planets are cool. I am glad they exist in space\", \"lol @twitterdude that is gr8\", \n", 654 | " \"twitter & reddit are fun.\", \"Mars is a planet. 
It is red.\", \"@Microsoft: y u skip windows 9?\", \"Rockets launch from Earth and go to other planets.\",\n", 655 | " \"twitter social media > <\", \"@someguy @somegirl @twitter #hashtag\", \"Orbiting the sun is a little blue-green planet.\"]\n", 656 | "labelsTrain = [\"space\", \"space\", \"twitter\", \"twitter\", \"space\", \"twitter\", \"space\", \"twitter\", \"twitter\", \"space\"]\n", 657 | "\n", 658 | "test = [\"i h8 riting comprehensibly #skoolsux\", \"planets and stars and rockets and stuff\"]\n", 659 | "labelsTest = [\"twitter\", \"space\"]\n", 660 | "\n", 661 | "# train\n", 662 | "pipe.fit(train, labelsTrain)\n", 663 | "\n", 664 | "# test\n", 665 | "preds = pipe.predict(test)\n", 666 | "print(\"----------------------------------------------------------------------------------------------\")\n", 667 | "print(\"results:\")\n", 668 | "for (sample, pred) in zip(test, preds):\n", 669 | " print(sample, \":\", pred)\n", 670 | "print(\"accuracy:\", accuracy_score(labelsTest, preds))\n", 671 | "\n", 672 | "print(\"----------------------------------------------------------------------------------------------\")\n", 673 | "print(\"Top 10 features used to predict: \")\n", 674 | "# show the top features\n", 675 | "printNMostInformative(vectorizer, clf, 10)\n", 676 | "\n", 677 | "print(\"----------------------------------------------------------------------------------------------\")\n", 678 | "print(\"The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc\")\n", 679 | "# let's see what the pipeline was transforming the data into\n", 680 | "pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])\n", 681 | "transform = pipe.fit_transform(train, labelsTrain)\n", 682 | "\n", 683 | "# get the features that the vectorizer learned (its vocabulary)\n", 684 | "vocab = vectorizer.get_feature_names()\n", 685 | "\n", 686 | "# the values from the vectorizer transformed data (each item is a row,column index with value as # times occuring in the sample, stored as a sparse matrix)\n", 687 | "for i in range(len(train)):\n", 688 | " s = \"\"\n", 689 | " indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]\n", 690 | " numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]\n", 691 | " for idx, num in zip(indexIntoVocab, numOccurences):\n", 692 | " s += str((vocab[idx], num))\n", 693 | " print(\"Sample {}: {}\".format(i, s))" 694 | ], 695 | "language": "python", 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "output_type": "stream", 700 | "stream": "stdout", 701 | "text": [ 702 | "----------------------------------------------------------------------------------------------\n", 703 | "results:\n", 704 | "i h8 riting comprehensibly #skoolsux : twitter\n", 705 | "planets and stars and rockets and stuff : space\n", 706 | "accuracy: 1.0\n", 707 | "----------------------------------------------------------------------------------------------\n", 708 | "Top 10 features used to predict: \n", 709 | "Class 1 best: \n", 710 | "(-0.52882810587037121, 'planet')\n", 711 | "(-0.35193565503626856, 'space')\n", 712 | "(-0.2182987490483107, 'mar')\n", 713 | "(-0.2182987490483107, 'red')\n", 714 | "(-0.15592826214493352, 'earth')\n", 715 | "(-0.15592826214493352, 'launch')\n", 716 | "(-0.15592826214493352, 'rocket')\n", 717 | "(-0.1482804579342584, 'great')\n", 718 | "(-0.1482804579342584, 'love')\n", 719 | "(-0.099226355509375405, 'blue')\n", 720 | "Class 2 best: \n", 721 | "(0.41129938045689757, 
'twitter')\n", 722 | "(0.34038557663231445, '@mention')\n", 723 | "(0.23401502570811406, 'lol')\n", 724 | "(0.23401502570811406, 'gr8')\n", 725 | "(0.20564996854629114, 'social')\n", 726 | "(0.20564996854629114, 'medium')\n", 727 | "(0.20564941191060651, 'reddit')\n", 728 | "(0.20564941191060651, 'fun')\n", 729 | "(0.10637055092420053, 'y')\n", 730 | "(0.10637055092420053, 'window')\n", 731 | "----------------------------------------------------------------------------------------------\n", 732 | "The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc\n", 733 | "Sample 0: ('love', 1)('space', 2)('great', 1)\n", 734 | "Sample 1: ('space', 1)('planet', 1)('cool', 1)('glad', 1)('exist', 1)\n", 735 | "Sample 2: ('lol', 1)('@mention', 1)('gr8', 1)\n", 736 | "Sample 3: ('twitter', 1)('reddit', 1)('fun', 1)\n", 737 | "Sample 4: ('planet', 1)('mar', 1)('red', 1)\n", 738 | "Sample 5: ('@mention', 1)('y', 1)('u', 1)('skip', 1)('window', 1)('9', 1)\n", 739 | "Sample 6: ('planet', 1)('rocket', 1)('launch', 1)('earth', 1)\n", 740 | "Sample 7: ('twitter', 1)('social', 1)('medium', 1)\n", 741 | "Sample 8: ('@mention', 3)('hashtag', 1)\n", 742 | "Sample 9: ('planet', 1)('orbit', 1)('sun', 1)('little', 1)('blue', 1)('green', 1)\n" 743 | ] 744 | } 745 | ], 746 | "prompt_number": 68 747 | }, 748 | { 749 | "cell_type": "code", 750 | "collapsed": false, 751 | "input": [], 752 | "language": "python", 753 | "metadata": {}, 754 | "outputs": [] 755 | } 756 | ], 757 | "metadata": {} 758 | } 759 | ] 760 | } -------------------------------------------------------------------------------- /Intro_spaCy_NLP.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Set up spaCy 4 | from spacy.en import English 5 | parser = English() 6 | 7 | # Test Data 8 | multiSentence = "There is an art, it says, or rather, a knack to flying." \ 9 | "The knack lies in learning how to throw yourself at the ground and miss." \ 10 | "In the beginning the Universe was created. This has made a lot of people "\ 11 | "very angry and been widely regarded as a bad move." 12 | 13 | # spaCy does tokenization, sentence recognition, part of speech tagging, lemmatization, dependency parsing, and named entity recognition all at once! 14 | 15 | 16 | # all you have to do to parse text is this: 17 | #note: the first time you run spaCy in a file it takes a little while to load up its modules 18 | parsedData = parser(multiSentence) 19 | 20 | 21 | # Let's look at the tokens 22 | # All you have to do is iterate through the parsedData 23 | # Each token is an object with lots of different properties 24 | # A property with an underscore at the end returns the string representation 25 | # while a property without the underscore returns an index (int) into spaCy's vocabulary 26 | # The probability estimate is based on counts from a 3 billion word corpus, smoothed using the Simple Good-Turing method. 
27 | for i, token in enumerate(parsedData): 28 | print("original:", token.orth, token.orth_) 29 | print("lowercased:", token.lower, token.lower_) 30 | print("lemma:", token.lemma, token.lemma_) 31 | print("shape:", token.shape, token.shape_) 32 | print("prefix:", token.prefix, token.prefix_) 33 | print("suffix:", token.suffix, token.suffix_) 34 | print("log probability:", token.prob) 35 | print("Brown cluster id:", token.cluster) 36 | print("----------------------------------------") 37 | if i > 10: 38 | break 39 | 40 | original: 300 There 41 | lowercased: 144 there 42 | lemma: 300 There 43 | shape: 187 Xxxxx 44 | prefix: 32 T 45 | suffix: 66 ere 46 | log probability: -7.663576126098633 47 | Brown cluster id: 1918 48 | ---------------------------------------- 49 | original: 29 is 50 | lowercased: 29 is 51 | lemma: 52 be 52 | shape: 7 xx 53 | prefix: 14 i 54 | suffix: 29 is 55 | log probability: -5.002371311187744 56 | Brown cluster id: 762 57 | ---------------------------------------- 58 | original: 59 an 59 | lowercased: 59 an 60 | lemma: 59 an 61 | shape: 7 xx 62 | prefix: 11 a 63 | suffix: 59 an 64 | log probability: -5.829381465911865 65 | Brown cluster id: 3 66 | ---------------------------------------- 67 | original: 334 art 68 | lowercased: 334 art 69 | lemma: 334 art 70 | shape: 3 xxx 71 | prefix: 11 a 72 | suffix: 334 art 73 | log probability: -9.482678413391113 74 | Brown cluster id: 633 75 | ---------------------------------------- 76 | original: 1 , 77 | lowercased: 1 , 78 | lemma: 1 , 79 | shape: 1 , 80 | prefix: 1 , 81 | suffix: 1 , 82 | log probability: -3.0368354320526123 83 | Brown cluster id: 4 84 | ---------------------------------------- 85 | original: 44 it 86 | lowercased: 44 it 87 | lemma: 906264 -PRON- 88 | shape: 7 xx 89 | prefix: 14 i 90 | suffix: 44 it 91 | log probability: -5.498129367828369 92 | Brown cluster id: 474 93 | ---------------------------------------- 94 | original: 274 says 95 | lowercased: 274 says 96 | lemma: 253 say 97 | shape: 20 xxxx 98 | prefix: 27 s 99 | suffix: 275 ays 100 | log probability: -7.604108810424805 101 | Brown cluster id: 244 102 | ---------------------------------------- 103 | original: 1 , 104 | lowercased: 1 , 105 | lemma: 1 , 106 | shape: 1 , 107 | prefix: 1 , 108 | suffix: 1 , 109 | log probability: -3.0368354320526123 110 | Brown cluster id: 4 111 | ---------------------------------------- 112 | original: 79 or 113 | lowercased: 79 or 114 | lemma: 79 or 115 | shape: 7 xx 116 | prefix: 8 o 117 | suffix: 79 or 118 | log probability: -6.262600898742676 119 | Brown cluster id: 404 120 | ---------------------------------------- 121 | original: 1400 rather 122 | lowercased: 1400 rather 123 | lemma: 1400 rather 124 | shape: 20 xxxx 125 | prefix: 357 r 126 | suffix: 131 her 127 | log probability: -9.074186325073242 128 | Brown cluster id: 6698 129 | ---------------------------------------- 130 | original: 1 , 131 | lowercased: 1 , 132 | lemma: 1 , 133 | shape: 1 , 134 | prefix: 1 , 135 | suffix: 1 , 136 | log probability: -3.0368354320526123 137 | Brown cluster id: 4 138 | ---------------------------------------- 139 | original: 11 a 140 | lowercased: 11 a 141 | lemma: 11 a 142 | shape: 12 x 143 | prefix: 11 a 144 | suffix: 11 a 145 | log probability: -4.003841400146484 146 | Brown cluster id: 19 147 | ---------------------------------------- 148 | 149 | 150 | 151 | # Let's look at the sentences 152 | sents = [] 153 | # the "sents" property returns spans 154 | # spans have indices into the original string 155 | # where each index 
value represents a token 156 | for span in parsedData.sents: 157 | # go from the start to the end of each span, returning each token in the sentence 158 | # combine each token using join() 159 | sent = ''.join(parsedData[i].string for i in range(span.start, span.end)).strip() 160 | sents.append(sent) 161 | 162 | for sentence in sents: 163 | print(sentence) 164 | 165 | There is an art, it says, or rather, a knack to flying. 166 | The knack lies in learning how to throw yourself at the ground and miss. 167 | In the beginning the Universe was created. 168 | This has made a lot of people very angry and been widely regarded as a bad move. 169 | 170 | 171 | 172 | # Let's look at the part of speech tags of the first sentence 173 | for span in parsedData.sents: 174 | sent = [parsedData[i] for i in range(span.start, span.end)] 175 | break 176 | 177 | for token in sent: 178 | print(token.orth_, token.pos_) 179 | 180 | There DET 181 | is VERB 182 | an DET 183 | art NOUN 184 | , PUNCT 185 | it PRON 186 | says VERB 187 | , PUNCT 188 | or CONJ 189 | rather ADV 190 | , PUNCT 191 | a DET 192 | knack NOUN 193 | to ADP 194 | flying NOUN 195 | . PUNCT 196 | 197 | 198 | 199 | # Let's look at the dependencies of this example: 200 | example = "The boy with the spotted dog quickly ran after the firetruck." 201 | parsedEx = parser(example) 202 | # shown as: original token, dependency tag, head word, left dependents, right dependents 203 | for token in parsedEx: 204 | print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights]) 205 | 206 | The det boy [] [] 207 | boy nsubj ran ['The'] ['with'] 208 | with prep boy [] ['dog'] 209 | the det dog [] [] 210 | spotted amod dog [] [] 211 | dog pobj with ['the', 'spotted'] [] 212 | quickly advmod ran [] [] 213 | ran ROOT ran ['boy', 'quickly'] ['after', '.'] 214 | after prep ran [] ['firetruck'] 215 | the det firetruck [] [] 216 | firetruck pobj after ['the'] [] 217 | . punct ran [] [] 218 | 219 | 220 | 221 | # Let's look at the named entities of this example: 222 | example = "Apple's stocks dropped dramatically after the death of Steve Jobs in October." 223 | parsedEx = parser(example) 224 | for token in parsedEx: 225 | print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)") 226 | 227 | print("-------------- entities only ---------------") 228 | # if you just want the entities and nothing else, you can do access the parsed examples "ents" property like this: 229 | ents = list(parsedEx.ents) 230 | for entity in ents: 231 | print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity)) 232 | 233 | Apple ORG 234 | 's (not an entity) 235 | stocks (not an entity) 236 | dropped (not an entity) 237 | dramatically (not an entity) 238 | after (not an entity) 239 | the (not an entity) 240 | death (not an entity) 241 | of (not an entity) 242 | Steve PERSON 243 | Jobs (not an entity) 244 | in (not an entity) 245 | October DATE 246 | . (not an entity) 247 | -------------- entities only --------------- 248 | 274530 ORG Apple 249 | 112504 PERSON Steve Jobs 250 | 71288 DATE October 251 | 252 | 253 | # spaCy is trained to attempt to handle messy data, including emoticons and other web-based features 254 | 255 | 256 | messyData = "lol that is rly funny :) This is gr8 i rate it 8/8!!!" 257 | parsedData = parser(messyData) 258 | for token in parsedData: 259 | print(token.orth_, token.pos_, token.lemma_) 260 | 261 | # it does pretty well! 
Note that it does fail on the token "gr8", taking it as a verb rather than an adjective meaning "great" 262 | # and "lol" probably isn't a noun...it's more like an interjection 263 | 264 | lol NOUN lol 265 | that DET that 266 | is VERB be 267 | rly ADV rly 268 | funny ADJ funny 269 | :) PUNCT :) 270 | This DET This 271 | is VERB be 272 | gr8 VERB gr8 273 | i PRON i 274 | rate VERB rate 275 | it PRON -PRON- 276 | 8/8 NUM 8/8 277 | ! PUNCT ! 278 | ! PUNCT ! 279 | ! PUNCT ! 280 | 281 | 282 | # spaCy has word vector representations built in! 283 | 284 | 285 | from numpy import dot 286 | from numpy.linalg import norm 287 | 288 | # you can access known words from the parser's vocabulary 289 | nasa = parser.vocab['NASA'] 290 | 291 | # cosine similarity 292 | cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) 293 | 294 | # gather all known words, take only the lowercased versions 295 | allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != "nasa"}) 296 | 297 | # sort by similarity to NASA 298 | allWords.sort(key=lambda w: cosine(w.repvec, nasa.repvec)) 299 | allWords.reverse() 300 | print("Top 20 most similar words to NASA:") 301 | for word in allWords[:20]: 302 | print(word.orth_) 303 | 304 | # Let's see if it can figure out this analogy 305 | # Man is to King as Woman is to ?? 306 | king = parser.vocab['king'] 307 | man = parser.vocab['man'] 308 | woman = parser.vocab['woman'] 309 | 310 | result = king.repvec - man.repvec + woman.repvec 311 | 312 | # gather all known words, take only the lowercased versions 313 | allWords = list({w for w in parser.vocab if w.has_repvec and w.orth_.islower() and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"}) 314 | # sort by similarity to the result 315 | allWords.sort(key=lambda w: cosine(w.repvec, result)) 316 | allWords.reverse() 317 | print("\n----------------------------\nTop 3 closest results for king - man + woman:") 318 | for word in allWords[:3]: 319 | print(word.orth_) 320 | 321 | # it got it! Queen! 322 | 323 | Top 20 most similar words to NASA: 324 | jpl 325 | noaa 326 | esa 327 | cern 328 | nih 329 | norad 330 | fema 331 | isro 332 | usaid 333 | nsf 334 | nsa 335 | dod 336 | usda 337 | caltech 338 | defra 339 | raytheon 340 | cia 341 | unhcr 342 | fermilab 343 | cdc 344 | 345 | ---------------------------- 346 | Top 3 closest results for king - man + woman: 347 | queen 348 | monarch 349 | princess 350 | 351 | 352 | # You can do cool things like extract Subject, Verb, Object triples from the dependency parse if you use my code in subject_object_extraction.py. Note: Doesn't work on complicated sentences. Fails if the dependency parse is incorrect. 353 | 354 | 355 | from subject_object_extraction import findSVOs 356 | 357 | # can still work even without punctuation 358 | parse = parser("he and his brother shot me and my sister") 359 | print(findSVOs(parse)) 360 | 361 | # very complex sample. Only some are correct. Some are missed. 362 | parse = parser("Far out in the uncharted backwaters of the unfashionable end of the Western Spiral arm of the Galaxy lies a small unregarded yellow sun. " 363 | "Orbiting this at a distance of roughly ninety-two million miles is an utterly insignificant little blue green planet whose ape-descended " 364 | "life forms are so amazingly primitive that they still think digital watches are a pretty neat idea. " 365 | "This planet has – or rather had – a problem, which was this: most of the people living on it were unhappy for pretty much of the time. 
" 366 | "Many solutions were suggested for this problem, but most of these were largely concerned with the movements of small green pieces of paper, " 367 | "which is odd because on the whole it wasn’t the small green pieces of paper that were unhappy. And so the problem remained; lots of the " 368 | "people were mean, and most of them were miserable, even the ones with digital watches.") 369 | print(findSVOs(parse)) 370 | 371 | [('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')] 372 | [('orbiting', 'is', 'planet'), ('watches', 'are', 'idea'), ('problem', 'was', 'this'), ('it', 'wasn’t', 'pieces'), ('most', 'were', 'ones')] 373 | 374 | 375 | # If you want to include spaCy in your machine learning it is not too difficult 376 | 377 | 378 | from sklearn.feature_extraction.text import CountVectorizer 379 | from sklearn.base import TransformerMixin 380 | from sklearn.pipeline import Pipeline 381 | from sklearn.svm import LinearSVC 382 | from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS 383 | from sklearn.metrics import accuracy_score 384 | from nltk.corpus import stopwords 385 | import string 386 | import re 387 | 388 | # A custom stoplist 389 | STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS)) 390 | # List of symbols we don't care about 391 | SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"] 392 | 393 | # Every step in a pipeline needs to be a "transformer". Define a custom transformer to clean text using spaCy 394 | class CleanTextTransformer(TransformerMixin): 395 | """ 396 | Convert text to cleaned text 397 | """ 398 | 399 | def transform(self, X, **transform_params): 400 | return [cleanText(text) for text in X] 401 | 402 | def fit(self, X, y=None, **fit_params): 403 | return self 404 | 405 | def get_params(self, deep=True): 406 | return {} 407 | 408 | # A custom function to clean the text before sending it into the vectorizer 409 | def cleanText(text): 410 | # get rid of newlines 411 | text = text.strip().replace("\n", " ").replace("\r", " ") 412 | 413 | # replace twitter @mentions 414 | mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE) 415 | text = mentionFinder.sub("@MENTION", text) 416 | 417 | # replace HTML symbols 418 | text = text.replace("&", "and").replace(">", ">").replace("<", "<") 419 | 420 | # lowercase 421 | text = text.lower() 422 | 423 | return text 424 | 425 | # A custom function to tokenize the text using spaCy 426 | # and convert to lemmas 427 | def tokenizeText(sample): 428 | 429 | # get the tokens using spaCy 430 | tokens = parser(sample) 431 | 432 | # lemmatize 433 | lemmas = [] 434 | for tok in tokens: 435 | lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_) 436 | tokens = lemmas 437 | 438 | # stoplist the tokens 439 | tokens = [tok for tok in tokens if tok not in STOPLIST] 440 | 441 | # stoplist symbols 442 | tokens = [tok for tok in tokens if tok not in SYMBOLS] 443 | 444 | # remove large strings of whitespace 445 | while "" in tokens: 446 | tokens.remove("") 447 | while " " in tokens: 448 | tokens.remove(" ") 449 | while "\n" in tokens: 450 | tokens.remove("\n") 451 | while "\n\n" in tokens: 452 | tokens.remove("\n\n") 453 | 454 | return tokens 455 | 456 | def printNMostInformative(vectorizer, clf, N): 457 | """Prints features with the highest coefficient values, per class""" 458 | feature_names = vectorizer.get_feature_names() 459 | coefs_with_fns = 
sorted(zip(clf.coef_[0], feature_names)) 460 | topClass1 = coefs_with_fns[:N] 461 | topClass2 = coefs_with_fns[:-(N + 1):-1] 462 | print("Class 1 best: ") 463 | for feat in topClass1: 464 | print(feat) 465 | print("Class 2 best: ") 466 | for feat in topClass2: 467 | print(feat) 468 | 469 | # the vectorizer and classifer to use 470 | # note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer 471 | vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1)) 472 | clf = LinearSVC() 473 | # the pipeline to clean, tokenize, vectorize, and classify 474 | pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)]) 475 | 476 | # data 477 | train = ["I love space. Space is great.", "Planets are cool. I am glad they exist in space", "lol @twitterdude that is gr8", 478 | "twitter & reddit are fun.", "Mars is a planet. It is red.", "@Microsoft: y u skip windows 9?", "Rockets launch from Earth and go to other planets.", 479 | "twitter social media > <", "@someguy @somegirl @twitter #hashtag", "Orbiting the sun is a little blue-green planet."] 480 | labelsTrain = ["space", "space", "twitter", "twitter", "space", "twitter", "space", "twitter", "twitter", "space"] 481 | 482 | test = ["i h8 riting comprehensibly #skoolsux", "planets and stars and rockets and stuff"] 483 | labelsTest = ["twitter", "space"] 484 | 485 | # train 486 | pipe.fit(train, labelsTrain) 487 | 488 | # test 489 | preds = pipe.predict(test) 490 | print("----------------------------------------------------------------------------------------------") 491 | print("results:") 492 | for (sample, pred) in zip(test, preds): 493 | print(sample, ":", pred) 494 | print("accuracy:", accuracy_score(labelsTest, preds)) 495 | 496 | print("----------------------------------------------------------------------------------------------") 497 | print("Top 10 features used to predict: ") 498 | # show the top features 499 | printNMostInformative(vectorizer, clf, 10) 500 | 501 | print("----------------------------------------------------------------------------------------------") 502 | print("The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc") 503 | # let's see what the pipeline was transforming the data into 504 | pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)]) 505 | transform = pipe.fit_transform(train, labelsTrain) 506 | 507 | # get the features that the vectorizer learned (its vocabulary) 508 | vocab = vectorizer.get_feature_names() 509 | 510 | # the values from the vectorizer transformed data (each item is a row,column index with value as # times occuring in the sample, stored as a sparse matrix) 511 | for i in range(len(train)): 512 | s = "" 513 | indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]] 514 | numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]] 515 | for idx, num in zip(indexIntoVocab, numOccurences): 516 | s += str((vocab[idx], num)) 517 | print("Sample {}: {}".format(i, s)) 518 | 519 | ---------------------------------------------------------------------------------------------- 520 | results: 521 | i h8 riting comprehensibly #skoolsux : twitter 522 | planets and stars and rockets and stuff : space 523 | accuracy: 1.0 524 | ---------------------------------------------------------------------------------------------- 525 | Top 10 features used to predict: 526 | Class 1 best: 527 | (-0.52882810587037121, 'planet') 
528 | (-0.35193565503626856, 'space') 529 | (-0.2182987490483107, 'mar') 530 | (-0.2182987490483107, 'red') 531 | (-0.15592826214493352, 'earth') 532 | (-0.15592826214493352, 'launch') 533 | (-0.15592826214493352, 'rocket') 534 | (-0.1482804579342584, 'great') 535 | (-0.1482804579342584, 'love') 536 | (-0.099226355509375405, 'blue') 537 | Class 2 best: 538 | (0.41129938045689757, 'twitter') 539 | (0.34038557663231445, '@mention') 540 | (0.23401502570811406, 'lol') 541 | (0.23401502570811406, 'gr8') 542 | (0.20564996854629114, 'social') 543 | (0.20564996854629114, 'medium') 544 | (0.20564941191060651, 'reddit') 545 | (0.20564941191060651, 'fun') 546 | (0.10637055092420053, 'y') 547 | (0.10637055092420053, 'window') 548 | ---------------------------------------------------------------------------------------------- 549 | The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc 550 | Sample 0: ('love', 1)('space', 2)('great', 1) 551 | Sample 1: ('space', 1)('planet', 1)('cool', 1)('glad', 1)('exist', 1) 552 | Sample 2: ('lol', 1)('@mention', 1)('gr8', 1) 553 | Sample 3: ('twitter', 1)('reddit', 1)('fun', 1) 554 | Sample 4: ('planet', 1)('mar', 1)('red', 1) 555 | Sample 5: ('@mention', 1)('y', 1)('u', 1)('skip', 1)('window', 1)('9', 1) 556 | Sample 6: ('planet', 1)('rocket', 1)('launch', 1)('earth', 1) 557 | Sample 7: ('twitter', 1)('social', 1)('medium', 1) 558 | Sample 8: ('@mention', 3)('hashtag', 1) 559 | Sample 9: ('planet', 1)('orbit', 1)('sun', 1)('little', 1)('blue', 1)('green', 1) 560 | 561 | 562 | 563 | 564 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (C) 2016 J Nicolas Schrading 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to spaCy for NLP and Machine Learning 2 | 3 | ## Dependencies 4 | spaCy 5 | 6 | Scikit-learn 7 | 8 | NLTK 9 | 10 | To get the above requirements (except spaCy) download and install anaconda: 11 | https://store.continuum.io/cshop/anaconda/ 12 | 13 | To install spaCy: 14 | ``` 15 | pip install spacy 16 | python -m spacy.en.download all 17 | ``` 18 | 19 | Make sure to run the above python command, this downloads the models that spaCy needs. 20 | Python 3 is recommended, although Python 2 should work as long as you convert the strings to unicode objects. 21 | 22 | ## Running the files 23 | 24 | 1. From the command line (cmd) or terminal navigate to where this readme and .ipynb file is. 25 | 2. execute "ipython notebook Intro_spaCy_NLP" 26 | -------------------------------------------------------------------------------- /subject_object_extraction.py: -------------------------------------------------------------------------------- 1 | from nltk.stem.wordnet import WordNetLemmatizer 2 | from spacy.en import English 3 | 4 | SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"] 5 | OBJECTS = ["dobj", "dative", "attr", "oprd"] 6 | 7 | def getSubsFromConjunctions(subs): 8 | moreSubs = [] 9 | for sub in subs: 10 | # rights is a generator 11 | rights = list(sub.rights) 12 | rightDeps = {tok.lower_ for tok in rights} 13 | if "and" in rightDeps: 14 | moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"]) 15 | if len(moreSubs) > 0: 16 | moreSubs.extend(getSubsFromConjunctions(moreSubs)) 17 | return moreSubs 18 | 19 | def getObjsFromConjunctions(objs): 20 | moreObjs = [] 21 | for obj in objs: 22 | # rights is a generator 23 | rights = list(obj.rights) 24 | rightDeps = {tok.lower_ for tok in rights} 25 | if "and" in rightDeps: 26 | moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"]) 27 | if len(moreObjs) > 0: 28 | moreObjs.extend(getObjsFromConjunctions(moreObjs)) 29 | return moreObjs 30 | 31 | def getVerbsFromConjunctions(verbs): 32 | moreVerbs = [] 33 | for verb in verbs: 34 | rightDeps = {tok.lower_ for tok in verb.rights} 35 | if "and" in rightDeps: 36 | moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"]) 37 | if len(moreVerbs) > 0: 38 | moreVerbs.extend(getVerbsFromConjunctions(moreVerbs)) 39 | return moreVerbs 40 | 41 | def findSubs(tok): 42 | head = tok.head 43 | while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head: 44 | head = head.head 45 | if head.pos_ == "VERB": 46 | subs = [tok for tok in head.lefts if tok.dep_ == "SUB"] 47 | if len(subs) > 0: 48 | verbNegated = isNegated(head) 49 | subs.extend(getSubsFromConjunctions(subs)) 50 | return subs, verbNegated 51 | elif head.head != head: 52 | return findSubs(head) 53 | elif head.pos_ == "NOUN": 54 | return [head], isNegated(tok) 55 | return [], False 56 | 57 | def isNegated(tok): 58 | negations = {"no", "not", "n't", "never", "none"} 59 | for dep in list(tok.lefts) + list(tok.rights): 60 | if dep.lower_ in negations: 61 | return True 62 | return False 63 | 64 | def findSVs(tokens): 65 | svs = [] 66 | verbs = [tok for tok in tokens if tok.pos_ == "VERB"] 67 | for v in verbs: 68 | subs, verbNegated = getAllSubs(v) 69 | if len(subs) > 0: 70 | for sub in subs: 71 | svs.append((sub.orth_, "!" 
+ v.orth_ if verbNegated else v.orth_)) 72 | return svs 73 | 74 | def getObjsFromPrepositions(deps): 75 | objs = [] 76 | for dep in deps: 77 | if dep.pos_ == "ADP" and dep.dep_ == "prep": 78 | objs.extend([tok for tok in dep.rights if tok.dep_ in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")]) 79 | return objs 80 | 81 | def getObjsFromAttrs(deps): 82 | for dep in deps: 83 | if dep.pos_ == "NOUN" and dep.dep_ == "attr": 84 | verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"] 85 | if len(verbs) > 0: 86 | for v in verbs: 87 | rights = list(v.rights) 88 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 89 | objs.extend(getObjsFromPrepositions(rights)) 90 | if len(objs) > 0: 91 | return v, objs 92 | return None, None 93 | 94 | def getObjFromXComp(deps): 95 | for dep in deps: 96 | if dep.pos_ == "VERB" and dep.dep_ == "xcomp": 97 | v = dep 98 | rights = list(v.rights) 99 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 100 | objs.extend(getObjsFromPrepositions(rights)) 101 | if len(objs) > 0: 102 | return v, objs 103 | return None, None 104 | 105 | def getAllSubs(v): 106 | verbNegated = isNegated(v) 107 | subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"] 108 | if len(subs) > 0: 109 | subs.extend(getSubsFromConjunctions(subs)) 110 | else: 111 | foundSubs, verbNegated = findSubs(v) 112 | subs.extend(foundSubs) 113 | return subs, verbNegated 114 | 115 | def getAllObjs(v): 116 | # rights is a generator 117 | rights = list(v.rights) 118 | objs = [tok for tok in rights if tok.dep_ in OBJECTS] 119 | objs.extend(getObjsFromPrepositions(rights)) 120 | 121 | #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights) 122 | #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0: 123 | # objs.extend(potentialNewObjs) 124 | # v = potentialNewVerb 125 | 126 | potentialNewVerb, potentialNewObjs = getObjFromXComp(rights) 127 | if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0: 128 | objs.extend(potentialNewObjs) 129 | v = potentialNewVerb 130 | if len(objs) > 0: 131 | objs.extend(getObjsFromConjunctions(objs)) 132 | return v, objs 133 | 134 | def findSVOs(tokens): 135 | svos = [] 136 | verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"] 137 | for v in verbs: 138 | subs, verbNegated = getAllSubs(v) 139 | # hopefully there are subs, if not, don't examine this verb any longer 140 | if len(subs) > 0: 141 | v, objs = getAllObjs(v) 142 | for sub in subs: 143 | for obj in objs: 144 | objNegated = isNegated(obj) 145 | svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_)) 146 | return svos 147 | 148 | def getAbuserOntoVictimSVOs(tokens): 149 | maleAbuser = {'he', 'boyfriend', 'bf', 'father', 'dad', 'husband', 'brother', 'man'} 150 | femaleAbuser = {'she', 'girlfriend', 'gf', 'mother', 'mom', 'wife', 'sister', 'woman'} 151 | neutralAbuser = {'pastor', 'abuser', 'offender', 'ex', 'x', 'lover', 'church', 'they'} 152 | victim = {'me', 'sister', 'brother', 'child', 'kid', 'baby', 'friend', 'her', 'him', 'man', 'woman'} 153 | 154 | svos = findSVOs(tokens) 155 | wnl = WordNetLemmatizer() 156 | passed = [] 157 | for s, v, o in svos: 158 | s = wnl.lemmatize(s) 159 | v = "!" + wnl.lemmatize(v[1:], 'v') if v[0] == "!" else wnl.lemmatize(v, 'v') 160 | o = "!" + wnl.lemmatize(o[1:]) if o[0] == "!" 
else wnl.lemmatize(o) 161 | if s in maleAbuser.union(femaleAbuser).union(neutralAbuser) and o in victim: 162 | passed.append((s, v, o)) 163 | return passed 164 | 165 | def printDeps(toks): 166 | for tok in toks: 167 | print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights]) 168 | 169 | def testSVOs(): 170 | nlp = English() 171 | 172 | tok = nlp("making $12 an hour? where am i going to go? i have no other financial assistance available and he certainly won't provide support.") 173 | svos = findSVOs(tok) 174 | printDeps(tok) 175 | assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')} 176 | print(svos) 177 | 178 | tok = nlp("i don't have other assistance") 179 | svos = findSVOs(tok) 180 | printDeps(tok) 181 | assert set(svos) == {('i', '!have', 'assistance')} 182 | 183 | print("-----------------------------------------------") 184 | tok = nlp("They ate the pizza with anchovies.") 185 | svos = findSVOs(tok) 186 | printDeps(tok) 187 | print(svos) 188 | assert set(svos) == {('they', 'ate', 'pizza')} 189 | 190 | print("--------------------------------------------------") 191 | tok = nlp("I have no other financial assistance available and he certainly won't provide support.") 192 | svos = findSVOs(tok) 193 | printDeps(tok) 194 | print(svos) 195 | assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')} 196 | 197 | print("--------------------------------------------------") 198 | tok = nlp("I have no other financial assistance available, and he certainly won't provide support.") 199 | svos = findSVOs(tok) 200 | printDeps(tok) 201 | print(svos) 202 | assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')} 203 | 204 | print("--------------------------------------------------") 205 | tok = nlp("he did not kill me") 206 | svos = findSVOs(tok) 207 | printDeps(tok) 208 | print(svos) 209 | assert set(svos) == {('he', '!kill', 'me')} 210 | 211 | #print("--------------------------------------------------") 212 | #tok = nlp("he is an evil man that hurt my child and sister") 213 | #svos = findSVOs(tok) 214 | #printDeps(tok) 215 | #print(svos) 216 | #assert set(svos) == {('he', 'hurt', 'child'), ('he', 'hurt', 'sister'), ('man', 'hurt', 'child'), ('man', 'hurt', 'sister')} 217 | 218 | print("--------------------------------------------------") 219 | tok = nlp("he told me i would die alone with nothing but my career someday") 220 | svos = findSVOs(tok) 221 | printDeps(tok) 222 | print(svos) 223 | assert set(svos) == {('he', 'told', 'me')} 224 | 225 | print("--------------------------------------------------") 226 | tok = nlp("I wanted to kill him with a hammer.") 227 | svos = findSVOs(tok) 228 | printDeps(tok) 229 | print(svos) 230 | assert set(svos) == {('i', 'kill', 'him')} 231 | 232 | print("--------------------------------------------------") 233 | tok = nlp("because he hit me and also made me so angry i wanted to kill him with a hammer.") 234 | svos = findSVOs(tok) 235 | printDeps(tok) 236 | print(svos) 237 | assert set(svos) == {('he', 'hit', 'me'), ('i', 'kill', 'him')} 238 | 239 | print("--------------------------------------------------") 240 | tok = nlp("he and his brother shot me") 241 | svos = findSVOs(tok) 242 | printDeps(tok) 243 | print(svos) 244 | assert set(svos) == {('he', 'shot', 'me'), ('brother', 'shot', 'me')} 245 | 246 | print("--------------------------------------------------") 247 | tok = nlp("he and his brother shot me and my sister") 248 | svos = 
findSVOs(tok) 249 | printDeps(tok) 250 | print(svos) 251 | assert set(svos) == {('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')} 252 | 253 | print("--------------------------------------------------") 254 | tok = nlp("the annoying person that was my boyfriend hit me") 255 | svos = findSVOs(tok) 256 | printDeps(tok) 257 | print(svos) 258 | assert set(svos) == {('person', 'was', 'boyfriend'), ('person', 'hit', 'me')} 259 | 260 | print("--------------------------------------------------") 261 | tok = nlp("the boy raced the girl who had a hat that had spots.") 262 | svos = findSVOs(tok) 263 | printDeps(tok) 264 | print(svos) 265 | assert set(svos) == {('boy', 'raced', 'girl'), ('who', 'had', 'hat'), ('hat', 'had', 'spots')} 266 | 267 | print("--------------------------------------------------") 268 | tok = nlp("he spit on me") 269 | svos = findSVOs(tok) 270 | printDeps(tok) 271 | print(svos) 272 | assert set(svos) == {('he', 'spit', 'me')} 273 | 274 | print("--------------------------------------------------") 275 | tok = nlp("he didn't spit on me") 276 | svos = findSVOs(tok) 277 | printDeps(tok) 278 | print(svos) 279 | assert set(svos) == {('he', '!spit', 'me')} 280 | 281 | print("--------------------------------------------------") 282 | tok = nlp("the boy raced the girl who had a hat that didn't have spots.") 283 | svos = findSVOs(tok) 284 | printDeps(tok) 285 | print(svos) 286 | assert set(svos) == {('boy', 'raced', 'girl'), ('who', 'had', 'hat'), ('hat', '!have', 'spots')} 287 | 288 | print("--------------------------------------------------") 289 | tok = nlp("he is a nice man that didn't hurt my child and sister") 290 | svos = findSVOs(tok) 291 | printDeps(tok) 292 | print(svos) 293 | assert set(svos) == {('he', 'is', 'man'), ('man', '!hurt', 'child'), ('man', '!hurt', 'sister')} 294 | 295 | print("--------------------------------------------------") 296 | tok = nlp("he didn't spit on me and my child") 297 | svos = findSVOs(tok) 298 | printDeps(tok) 299 | print(svos) 300 | assert set(svos) == {('he', '!spit', 'me'), ('he', '!spit', 'child')} 301 | 302 | print("--------------------------------------------------") 303 | tok = nlp("he beat and hurt me") 304 | svos = findSVOs(tok) 305 | printDeps(tok) 306 | print(svos) 307 | # tok = nlp("he beat and hurt me") 308 | 309 | def main(): 310 | testSVOs() 311 | 312 | if __name__ == "__main__": 313 | main() --------------------------------------------------------------------------------
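
A short, hedged usage sketch for `getAbuserOntoVictimSVOs`, the one helper in subject_object_extraction.py that the notebook does not demonstrate. It reuses a sample sentence from the notebook and assumes the same legacy `spacy.en` English model used throughout this repo, plus NLTK's WordNet data (required by `WordNetLemmatizer`); the printed output is illustrative, not guaranteed.

```
# Usage sketch (assumes the legacy spacy.en model and NLTK WordNet data,
# e.g. downloaded via nltk.download('wordnet')).
from spacy.en import English
from subject_object_extraction import getAbuserOntoVictimSVOs

parser = English()
parse = parser("he and his brother shot me and my sister")

# getAbuserOntoVictimSVOs runs findSVOs, lemmatizes each triple, and keeps only
# triples whose subject is in the abuser word sets and whose object is in the
# victim set; negated verbs carry a leading "!".
print(getAbuserOntoVictimSVOs(parse))
# illustrative output:
# [('he', 'shoot', 'me'), ('he', 'shoot', 'sister'),
#  ('brother', 'shoot', 'me'), ('brother', 'shoot', 'sister')]
```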