├── LICENSE ├── .gitignore ├── long_stopwords.txt ├── README.md └── TextRank.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jishnu Ray Chowdhury 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /long_stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | abst 6 | accordance 7 | according 8 | accordingly 9 | across 10 | act 11 | actually 12 | added 13 | adj 14 | affected 15 | affecting 16 | affects 17 | after 18 | afterwards 19 | again 20 | against 21 | ah 22 | all 23 | almost 24 | alone 25 | along 26 | already 27 | also 28 | although 29 | always 30 | am 31 | among 32 | amongst 33 | an 34 | and 35 | announce 36 | another 37 | any 38 | anybody 39 | anyhow 40 | anymore 41 | anyone 42 | anything 43 | anyway 44 | anyways 45 | anywhere 46 | apparently 47 | approximately 48 | are 49 | aren 50 | arent 51 | arise 52 | around 53 | as 54 | aside 55 | ask 56 | asking 57 | at 58 | auth 59 | available 60 | away 61 | awfully 62 | b 63 | back 64 | be 65 | became 66 | because 67 | become 68 | becomes 69 | becoming 70 | been 71 | before 72 | beforehand 73 | begin 74 | beginning 75 | beginnings 76 | begins 77 | behind 78 | being 79 | believe 80 | below 81 | beside 82 | besides 83 | between 84 | beyond 85 | biol 86 | both 87 | brief 88 | briefly 89 | but 90 | by 91 | c 92 | ca 93 | came 94 | can 95 | cannot 96 | can't 97 | cause 98 | causes 99 | certain 100 | certainly 101 | co 102 | com 103 | come 104 | comes 105 | contain 106 | containing 107 | corresponding 108 | contains 109 | could 110 | couldnt 111 | d 112 | date 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | done 121 | don't 122 | down 123 | downwards 124 | due 125 | during 126 | e 127 | each 128 | ed 129 | edu 130 | effect 131 | eg 132 | eight 133 | eighty 134 | either 135 | else 136 | elsewhere 137 | end 138 | ending 139 | enough 140 | especially 141 | et 142 | et-al 143 | etc 144 | even 145 | ever 146 | every 147 | everybody 148 | everyone 149 | everything 150 | everywhere 151 | ex 152 | except 153 | f 154 | far 155 | few 156 | ff 157 | fifth 158 | first 159 | five 160 | fix 161 | followed 162 | following 163 | follows 164 | for 165 | former 166 | formerly 167 | forth 168 | found 169 | four 170 | from 171 | further 172 | furthermore 173 | g 174 | gave 175 | get 176 | gets 177 | getting 178 | give 179 | given 180 | gives 181 | giving 182 | go 183 | goes 184 | gone 185 | got 186 | gotten 187 | h 188 | had 189 | happens 190 | hardly 191 | has 192 | hasn't 193 | have 194 | haven't 195 
| having 196 | he 197 | hed 198 | hence 199 | her 200 | here 201 | hereafter 202 | hereby 203 | herein 204 | heres 205 | hereupon 206 | hers 207 | herself 208 | hes 209 | hi 210 | hid 211 | him 212 | himself 213 | his 214 | hither 215 | home 216 | how 217 | howbeit 218 | however 219 | hundred 220 | i 221 | id 222 | ie 223 | if 224 | i'll 225 | im 226 | immediate 227 | immediately 228 | importance 229 | important 230 | in 231 | inc 232 | indeed 233 | index 234 | information 235 | instead 236 | into 237 | invention 238 | inward 239 | is 240 | isn't 241 | it 242 | itd 243 | it'll 244 | its 245 | itself 246 | i've 247 | j 248 | just 249 | k 250 | keep 251 | keeps 252 | kept 253 | kg 254 | km 255 | know 256 | known 257 | knows 258 | l 259 | largely 260 | last 261 | lately 262 | later 263 | latter 264 | latterly 265 | least 266 | less 267 | lest 268 | let 269 | lets 270 | like 271 | liked 272 | likely 273 | line 274 | little 275 | 'll 276 | look 277 | looking 278 | looks 279 | ltd 280 | m 281 | made 282 | mainly 283 | make 284 | makes 285 | many 286 | may 287 | maybe 288 | me 289 | mean 290 | means 291 | meantime 292 | meanwhile 293 | merely 294 | mg 295 | might 296 | million 297 | miss 298 | ml 299 | more 300 | moreover 301 | most 302 | mostly 303 | mr 304 | mrs 305 | much 306 | mug 307 | must 308 | my 309 | myself 310 | n 311 | na 312 | name 313 | namely 314 | nay 315 | nd 316 | near 317 | nearly 318 | necessarily 319 | necessary 320 | need 321 | needs 322 | neither 323 | never 324 | nevertheless 325 | new 326 | next 327 | nine 328 | ninety 329 | no 330 | nobody 331 | non 332 | none 333 | nonetheless 334 | noone 335 | nor 336 | normally 337 | nos 338 | not 339 | noted 340 | nothing 341 | now 342 | nowhere 343 | o 344 | obtain 345 | obtained 346 | obviously 347 | of 348 | off 349 | often 350 | oh 351 | ok 352 | okay 353 | old 354 | omitted 355 | on 356 | once 357 | one 358 | ones 359 | only 360 | onto 361 | or 362 | ord 363 | other 364 | others 365 | otherwise 366 | ought 367 | our 368 | ours 369 | ourselves 370 | out 371 | outside 372 | over 373 | overall 374 | owing 375 | own 376 | p 377 | page 378 | pages 379 | part 380 | particular 381 | particularly 382 | past 383 | per 384 | perhaps 385 | placed 386 | please 387 | plus 388 | poorly 389 | possible 390 | possibly 391 | potentially 392 | pp 393 | predominantly 394 | present 395 | previously 396 | primarily 397 | probably 398 | promptly 399 | proud 400 | provides 401 | put 402 | q 403 | que 404 | quickly 405 | quite 406 | qv 407 | r 408 | ran 409 | rather 410 | rd 411 | re 412 | readily 413 | really 414 | recent 415 | recently 416 | ref 417 | refs 418 | regarding 419 | regardless 420 | regards 421 | related 422 | relatively 423 | research 424 | respectively 425 | resulted 426 | resulting 427 | results 428 | right 429 | run 430 | s 431 | said 432 | same 433 | saw 434 | say 435 | saying 436 | says 437 | sec 438 | section 439 | see 440 | seeing 441 | seem 442 | seemed 443 | seeming 444 | seems 445 | seen 446 | self 447 | selves 448 | sent 449 | seven 450 | several 451 | shall 452 | she 453 | shed 454 | she'll 455 | shes 456 | should 457 | shouldn't 458 | show 459 | showed 460 | shown 461 | showns 462 | shows 463 | significant 464 | significantly 465 | similar 466 | similarly 467 | since 468 | six 469 | slightly 470 | so 471 | some 472 | somebody 473 | somehow 474 | someone 475 | somethan 476 | something 477 | sometime 478 | sometimes 479 | somewhat 480 | somewhere 481 | soon 482 | sorry 483 | specifically 484 | specified 485 | specify 486 | 
specifying 487 | still 488 | stop 489 | strongly 490 | sub 491 | substantially 492 | successfully 493 | such 494 | sufficiently 495 | suggest 496 | sup 497 | sure t 498 | take 499 | taken 500 | taking 501 | tell 502 | tends 503 | th 504 | than 505 | thank 506 | thanks 507 | thanx 508 | that 509 | that'll 510 | thats 511 | that've 512 | the 513 | their 514 | theirs 515 | them 516 | themselves 517 | then 518 | thence 519 | there 520 | thereafter 521 | thereby 522 | thered 523 | therefore 524 | therein 525 | there'll 526 | thereof 527 | therere 528 | theres 529 | thereto 530 | thereupon 531 | there've 532 | these 533 | they 534 | theyd 535 | they'll 536 | theyre 537 | they've 538 | think 539 | this 540 | those 541 | thou 542 | though 543 | thoughh 544 | thousand 545 | throug 546 | through 547 | throughout 548 | thru 549 | thus 550 | til 551 | tip 552 | to 553 | together 554 | too 555 | took 556 | toward 557 | towards 558 | tried 559 | tries 560 | truly 561 | try 562 | trying 563 | ts 564 | twice 565 | two 566 | u 567 | un 568 | under 569 | unfortunately 570 | unless 571 | unlike 572 | unlikely 573 | until 574 | unto 575 | up 576 | upon 577 | ups 578 | us 579 | use 580 | used 581 | useful 582 | usefully 583 | usefulness 584 | uses 585 | using 586 | usually 587 | v 588 | value 589 | various 590 | 've 591 | very 592 | via 593 | viz 594 | vol 595 | vols 596 | vs 597 | w 598 | want 599 | wants 600 | was 601 | wasnt 602 | way 603 | we 604 | wed 605 | welcome 606 | we'll 607 | went 608 | were 609 | werent 610 | we've 611 | what 612 | whatever 613 | what'll 614 | whats 615 | when 616 | whence 617 | whenever 618 | where 619 | whereafter 620 | whereas 621 | whereby 622 | wherein 623 | wheres 624 | whereupon 625 | wherever 626 | whether 627 | which 628 | while 629 | whim 630 | whither 631 | who 632 | whod 633 | whoever 634 | whole 635 | who'll 636 | whom 637 | whomever 638 | whos 639 | whose 640 | why 641 | widely 642 | willing 643 | wish 644 | with 645 | within 646 | without 647 | wont 648 | words 649 | world 650 | would 651 | wouldnt 652 | www 653 | x 654 | y 655 | yes 656 | yet 657 | you 658 | youd 659 | you'll 660 | your 661 | youre 662 | yours 663 | yourself 664 | yourselves 665 | you've 666 | z 667 | zero 668 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Implementation of TextRank for keyword Extraction 3 | 4 | Based on: 5 | 6 | [TextRank: Bringing Order into Texts - by Rada Mihalcea and Paul Tarau](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) 7 | 8 | The input text is given below 9 | 10 | 11 | ```python 12 | #Source of text: 13 | #https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents 14 | 15 | Text = "Compatibility of systems of linear constraints over the set of natural numbers. \ 16 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and \ 17 | nonstrict inequations are considered. \ 18 | Upper bounds for components of a minimal set of solutions and \ 19 | algorithms of construction of minimal generating sets of solutions for all \ 20 | types of systems are given. \ 21 | These criteria and the corresponding algorithms for constructing \ 22 | a minimal supporting set of solutions can be used in solving all the \ 23 | considered types of systems and systems of mixed types." 
24 | ``` 25 | 26 | ### Cleaning Text Data 27 | 28 | The raw input text is cleaned off non-printable characters (if any) and turned into lower case. 29 | The processed input text is then tokenized using NLTK library functions. 30 | 31 | 32 | ```python 33 | 34 | import nltk 35 | from nltk import word_tokenize 36 | import string 37 | 38 | #nltk.download('punkt') 39 | 40 | def clean(text): 41 | text = text.lower() 42 | printable = set(string.printable) 43 | text = filter(lambda x: x in printable, text) #filter funny characters, if any. 44 | return text 45 | 46 | Cleaned_text = clean(Text) 47 | 48 | text = word_tokenize(Cleaned_text) 49 | 50 | print "Tokenized Text: \n" 51 | print text 52 | ``` 53 | 54 | Tokenized Text: 55 | 56 | ['compatibility', 'of', 'systems', 'of', 'linear', 'constraints', 'over', 'the', 'set', 'of', 'natural', 'numbers', '.', 'criteria', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equations', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bounds', 'for', 'components', 'of', 'a', 'minimal', 'set', 'of', 'solutions', 'and', 'algorithms', 'of', 'construction', 'of', 'minimal', 'generating', 'sets', 'of', 'solutions', 'for', 'all', 'types', 'of', 'systems', 'are', 'given', '.', 'these', 'criteria', 'and', 'the', 'corresponding', 'algorithms', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solutions', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'types', 'of', 'systems', 'and', 'systems', 'of', 'mixed', 'types', '.'] 57 | 58 | 59 | ### POS Tagging For Lemmatization 60 | 61 | NLTK is again used for POS tagging the input text so that the words can be lemmatized based on their POS tags. 62 | 63 | Description of POS tags: 64 | 65 | 66 | http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html 67 | 68 | 69 | ```python 70 | #nltk.download('averaged_perceptron_tagger') 71 | 72 | POS_tag = nltk.pos_tag(text) 73 | 74 | print "Tokenized Text with POS tags: \n" 75 | print POS_tag 76 | ``` 77 | 78 | Tokenized Text with POS tags: 79 | 80 | [('compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ('of', 'IN'), ('linear', 'JJ'), ('constraints', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('.', '.'), ('criteria', 'NNS'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'NN'), ('equations', 'NNS'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bounds', 'NNS'), ('for', 'IN'), ('components', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('and', 'CC'), ('algorithms', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('sets', 'NNS'), ('of', 'IN'), ('solutions', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criteria', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithms', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), 
('the', 'DT'), ('considered', 'VBN'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('and', 'CC'), ('systems', 'NNS'), ('of', 'IN'), ('mixed', 'JJ'), ('types', 'NNS'), ('.', '.')] 81 | 82 | 83 | ### Lemmatization 84 | 85 | The tokenized text (mainly the nouns and adjectives) is normalized by lemmatization. 86 | In lemmatization different grammatical counterparts of a word will be replaced by single 87 | basic lemma. For example, 'glasses' may be replaced by 'glass'. 88 | 89 | Details about lemmatization: 90 | 91 | https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html 92 | 93 | 94 | ```python 95 | #nltk.download('wordnet') 96 | 97 | from nltk.stem import WordNetLemmatizer 98 | 99 | wordnet_lemmatizer = WordNetLemmatizer() 100 | 101 | adjective_tags = ['JJ','JJR','JJS'] 102 | 103 | lemmatized_text = [] 104 | 105 | for word in POS_tag: 106 | if word[1] in adjective_tags: 107 | lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a"))) 108 | else: 109 | lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun 110 | 111 | print "Text tokens after lemmatization of adjectives and nouns: \n" 112 | print lemmatized_text 113 | ``` 114 | 115 | Text tokens after lemmatization of adjectives and nouns: 116 | 117 | ['compatibility', 'of', 'system', 'of', 'linear', 'constraint', 'over', 'the', 'set', 'of', 'natural', 'number', '.', 'criterion', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equation', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bound', 'for', 'component', 'of', 'a', 'minimal', 'set', 'of', 'solution', 'and', 'algorithm', 'of', 'construction', 'of', 'minimal', 'generating', 'set', 'of', 'solution', 'for', 'all', 'type', 'of', 'system', 'are', 'given', '.', 'these', 'criterion', 'and', 'the', 'corresponding', 'algorithm', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solution', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'type', 'of', 'system', 'and', 'system', 'of', 'mixed', 'type', '.'] 118 | 119 | 120 | ### POS tagging for Filtering 121 | 122 | The lemmatized text is POS tagged here. The tags will be used for filtering later on. 
123 | 124 | 125 | ```python 126 | POS_tag = nltk.pos_tag(lemmatized_text) 127 | 128 | print "Lemmatized text with POS tags: \n" 129 | print POS_tag 130 | ``` 131 | 132 | Lemmatized text with POS tags: 133 | 134 | [('compatibility', 'NN'), ('of', 'IN'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('constraint', 'NN'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('number', 'NN'), ('.', '.'), ('criterion', 'NN'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'JJ'), ('equation', 'NN'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bound', 'NN'), ('for', 'IN'), ('component', 'NN'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('and', 'CC'), ('algorithm', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('for', 'IN'), ('all', 'DT'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criterion', 'NN'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithm', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('and', 'CC'), ('system', 'NN'), ('of', 'IN'), ('mixed', 'JJ'), ('type', 'NN'), ('.', '.')] 135 | 136 | 137 | ## POS Based Filtering 138 | 139 | Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here 140 | considered as a stopword (non-content). This is based on the assumption that usually keywords are noun, 141 | adjectives or gerunds. 142 | 143 | Punctuations are added to the stopword list too. 144 | 145 | 146 | ```python 147 | stopwords = [] 148 | 149 | wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 150 | 151 | for word in POS_tag: 152 | if word[1] not in wanted_POS: 153 | stopwords.append(word[0]) 154 | 155 | punctuations = list(str(string.punctuation)) 156 | 157 | stopwords = stopwords + punctuations 158 | ``` 159 | 160 | ### Complete stopword generation 161 | 162 | Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may 163 | remain which are very bad candidates for being keywords (or part of it). 164 | 165 | An external file constituting a long list of stopwords is loaded and all the words are added with the previous 166 | stopwords to create the final list 'stopwords-plus' which is then converted into a set. 167 | 168 | (Source of stopwords data: https://www.ranks.nl/stopwords) 169 | 170 | Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. 171 | 172 | (The contents of this set will be later used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords, and work with a 'bag-of-words' approach. 
I will be developing the graph using unigrams as vertices.) 173 | 174 | ```python 175 | stopword_file = open("long_stopwords.txt", "r") 176 | #Source = https://www.ranks.nl/stopwords 177 | 178 | lots_of_stopwords = [] 179 | 180 | for line in stopword_file.readlines(): 181 | lots_of_stopwords.append(str(line.strip())) 182 | 183 | stopwords_plus = [] 184 | stopwords_plus = stopwords + lots_of_stopwords 185 | stopwords_plus = set(stopwords_plus) 186 | 187 | #stopwords_plus contains the total set of all stopwords 188 | ``` 189 | 190 | ### Removing Stopwords 191 | 192 | Removing stopwords from lemmatized_text. 193 | processed_text contains the result. 194 | 195 | 196 | ```python 197 | processed_text = [] 198 | for word in lemmatized_text: 199 | if word not in stopwords_plus: 200 | processed_text.append(word) 201 | print processed_text 202 | ``` 203 | 204 | ['compatibility', 'system', 'linear', 'constraint', 'set', 'natural', 'number', 'criterion', 'compatibility', 'system', 'linear', 'diophantine', 'equation', 'strict', 'inequations', 'nonstrict', 'inequations', 'upper', 'bound', 'component', 'minimal', 'set', 'solution', 'algorithm', 'construction', 'minimal', 'generating', 'set', 'solution', 'type', 'system', 'criterion', 'algorithm', 'constructing', 'minimal', 'supporting', 'set', 'solution', 'solving', 'type', 'system', 'system', 'mixed', 'type'] 205 | 206 | 207 | ## Vocabulary Creation 208 | 209 | Vocabulary will only contain unique words from processed_text. 210 | 211 | 212 | ```python 213 | vocabulary = list(set(processed_text)) 214 | print vocabulary 215 | ``` 216 | 217 | ['upper', 'set', 'constructing', 'number', 'solving', 'system', 'compatibility', 'strict', 'criterion', 'type', 'minimal', 'supporting', 'generating', 'linear', 'diophantine', 'component', 'bound', 'nonstrict', 'inequations', 'natural', 'algorithm', 'constraint', 'equation', 'solution', 'construction', 'mixed'] 218 | 219 | 220 | ### Building Graph 221 | 222 | TextRank is a graph-based model, and thus it requires us to build a graph. Each word in the vocabulary will serve as a vertex of the graph. Each word is represented in its vertex by its index in the vocabulary list. 223 | 224 | The weighted_edge matrix contains the information about edge connections among all vertices. 225 | I am building a graph with weighted undirected edges. 226 | 227 | weighted_edge[i][j] contains the weight of the connecting edge between the word vertex represented by vocabulary index i and the word vertex represented by vocabulary index j. 228 | 229 | If weighted_edge[i][j] is zero, it means no edge or connection is present between the words represented by index i and j. 230 | 231 | There is a connection between the words (and thus between i and j which represent them) if the words co-occur within a window of a specified 'window_size' in the processed_text. 232 | 233 | The value of weighted_edge[i][j] is increased by 1/(distance between the positions of the words currently represented by i and j) for every connection discovered between the same words in different locations of the text. 234 | 235 | The covered_coocurrences list (which contains the pairs of absolute positions in processed_text of the words whose co-occurrence at that location has already been checked) is maintained so that the same two words located in the same positions in processed_text are not counted repeatedly while sliding the window one text unit at a time. 236 | 237 | The scores of all vertices are initialized to one.
238 | 239 | Self-connections are not considered, so weighted_edge[i][i] will be zero. 240 | 241 | 242 | ```python 243 | import numpy as np 244 | import math 245 | vocab_len = len(vocabulary) 246 | 247 | weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32) 248 | 249 | score = np.zeros((vocab_len),dtype=np.float32) 250 | window_size = 3 251 | covered_coocurrences = [] 252 | 253 | for i in xrange(0,vocab_len): 254 | score[i]=1 255 | for j in xrange(0,vocab_len): 256 | if j==i: 257 | weighted_edge[i][j]=0 258 | else: 259 | for window_start in xrange(0,(len(processed_text)-window_size+1)): 260 | 261 | window_end = window_start+window_size 262 | 263 | window = processed_text[window_start:window_end] 264 | 265 | if (vocabulary[i] in window) and (vocabulary[j] in window): 266 | 267 | index_of_i = window_start + window.index(vocabulary[i]) 268 | index_of_j = window_start + window.index(vocabulary[j]) 269 | 270 | # index_of_x is the absolute position of the xth term in the window 271 | # (counting from 0) 272 | # in the processed_text 273 | 274 | if [index_of_i,index_of_j] not in covered_coocurrences: 275 | weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j) 276 | covered_coocurrences.append([index_of_i,index_of_j]) 277 | 278 | ``` 279 | 280 | ### Calculating weighted summation of connections of a vertex 281 | 282 | inout[i] will contain the sum of the weights of all undirected connections\edges associated with the vertex represented by i. 283 | 284 | 285 | ```python 286 | inout = np.zeros((vocab_len),dtype=np.float32) 287 | 288 | for i in xrange(0,vocab_len): 289 | for j in xrange(0,vocab_len): 290 | inout[i]+=weighted_edge[i][j] 291 | ``` 292 | 293 | ### Scoring Vertices 294 | 295 | The formula used for scoring a vertex represented by i is: 296 | 297 | score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ], where j belongs to the list of vertices that have a connection with i. 298 | 299 | d is the damping factor. 300 | 301 | The score is iteratively updated until convergence. 302 | 303 | 304 | ```python 305 | MAX_ITERATIONS = 50 306 | d=0.85 307 | threshold = 0.0001 #convergence threshold 308 | 309 | for iter in xrange(0,MAX_ITERATIONS): 310 | prev_score = np.copy(score) 311 | 312 | for i in xrange(0,vocab_len): 313 | 314 | summation = 0 315 | for j in xrange(0,vocab_len): 316 | if weighted_edge[i][j] != 0: 317 | summation += (weighted_edge[i][j]/inout[j])*score[j] 318 | 319 | score[i] = (1-d) + d*(summation) 320 | 321 | if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition 322 | print "Converging at iteration "+str(iter)+"...." 323 | break 324 | 325 | ``` 326 | 327 | Converging at iteration 29....
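The loop above applies the update one vertex at a time. For reference, the same update can be written in a vectorized form. The sketch below is not part of the original code; the helper name textrank_update is only for illustration, and it assumes the weighted_edge, inout, and score arrays and the damping factor d defined earlier, plus that every word has at least one connection (inout[j] > 0), which holds for this input text.

```python
import numpy as np

def textrank_update(score, weighted_edge, inout, d=0.85):
    # normalized[i][j] = weighted_edge[i][j] / inout[j]  (each column j divided by inout[j])
    normalized = weighted_edge / inout
    # score[i] = (1-d) + d * sum_j( normalized[i][j] * score[j] )
    return (1 - d) + d * normalized.dot(score)

# One pass of the iterative loop above is then equivalent to:
# score = textrank_update(score, weighted_edge, inout, d)
```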
328 | 329 | 330 | 331 | ```python 332 | for i in xrange(0,vocab_len): 333 | print "Score of "+vocabulary[i]+": "+str(score[i]) 334 | ``` 335 | 336 | Score of upper: 0.816792 337 | Score of set: 2.27184 338 | Score of constructing: 0.667288 339 | Score of number: 0.688316 340 | Score of solving: 0.642318 341 | Score of system: 2.12032 342 | Score of compatibility: 0.944584 343 | Score of strict: 0.823772 344 | Score of criterion: 1.22559 345 | Score of type: 1.08101 346 | Score of minimal: 1.78693 347 | Score of supporting: 0.653705 348 | Score of generating: 0.652645 349 | Score of linear: 1.2717 350 | Score of diophantine: 0.759295 351 | Score of component: 0.737641 352 | Score of bound: 0.786006 353 | Score of nonstrict: 0.827216 354 | Score of inequations: 1.30824 355 | Score of natural: 0.688299 356 | Score of algorithm: 1.19365 357 | Score of constraint: 0.674411 358 | Score of equation: 0.799815 359 | Score of solution: 1.6832 360 | Score of construction: 0.659809 361 | Score of mixed: 0.235822 362 | 363 | 364 | ### Phrase Partitioning 365 | 366 | Partitioning lemmatized_text into phrases using the stopwords in it as delimiters. 367 | The phrases are also candidates for keyphrases to be extracted. 368 | 369 | 370 | ```python 371 | phrases = [] 372 | 373 | phrase = " " 374 | for word in lemmatized_text: 375 | 376 | if word in stopwords_plus: 377 | if phrase!= " ": 378 | phrases.append(str(phrase).strip().split()) 379 | phrase = " " 380 | elif word not in stopwords_plus: 381 | phrase+=str(word) 382 | phrase+=" " 383 | 384 | print "Partitioned Phrases (Candidate Keyphrases): \n" 385 | print phrases 386 | ``` 387 | 388 | Partitioned Phrases (Candidate Keyphrases): 389 | 390 | [['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['compatibility'], ['system'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['solution'], ['type'], ['system'], ['criterion'], ['algorithm'], ['constructing'], ['minimal', 'supporting', 'set'], ['solution'], ['solving'], ['type'], ['system'], ['system'], ['mixed', 'type']] 391 | 392 | 393 | ### Create a list of unique phrases. 394 | 395 | Repeated phrases\keyphrase candidates no longer serve any purpose here. 396 | 397 | 398 | ```python 399 | unique_phrases = [] 400 | 401 | for phrase in phrases: 402 | if phrase not in unique_phrases: 403 | unique_phrases.append(phrase) 404 | 405 | print "Unique Phrases (Candidate Keyphrases): \n" 406 | print unique_phrases 407 | ``` 408 | 409 | Unique Phrases (Candidate Keyphrases): 410 | 411 | [['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['type'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']] 412 | 413 | 414 | ### Thinning the list of candidate-keyphrases. 415 | 416 | Removing single-word keyphrase candidates that are present in multi-word alternatives.
417 | 418 | 419 | ```python 420 | for word in vocabulary: 421 | #print word 422 | for phrase in unique_phrases: 423 | if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1): 424 | #if len(phrase)>1 then the current phrase is multi-worded. 425 | #if the word in vocabulary is present in unique_phrases as a single-word-phrase 426 | # and at the same time present as a word within a multi-worded phrase, 427 | # then I will remove the single-word-phrase from the list. 428 | unique_phrases.remove([word]) 429 | 430 | print "Thinned Unique Phrases (Candidate Keyphrases): \n" 431 | print unique_phrases 432 | ``` 433 | 434 | Thinned Unique Phrases (Candidate Keyphrases): 435 | 436 | [['compatibility'], ['system'], ['linear', 'constraint'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']] 437 | 438 | 439 | ### Scoring Keyphrases 440 | 441 | Scoring the phrases (candidate keyphrases) and building up a list of keyphrases 442 | by listing untokenized versions of tokenized phrases\candidate-keyphrases. 443 | Phrases are scored by adding the score of their members (words\text-units that were ranked by the graph algorithm) 444 | 445 | 446 | 447 | ```python 448 | phrase_scores = [] 449 | keywords = [] 450 | for phrase in unique_phrases: 451 | phrase_score=0 452 | keyword = '' 453 | for word in phrase: 454 | keyword += str(word) 455 | keyword += " " 456 | phrase_score+=score[vocabulary.index(word)] 457 | phrase_scores.append(phrase_score) 458 | keywords.append(keyword.strip()) 459 | 460 | i=0 461 | for keyword in keywords: 462 | print "Keyword: '"+str(keyword)+"', Score: "+str(phrase_scores[i]) 463 | i+=1 464 | ``` 465 | 466 | Keyword: 'compatibility', Score: 0.944583714008 467 | Keyword: 'system', Score: 2.12031626701 468 | Keyword: 'linear constraint', Score: 1.94610738754 469 | Keyword: 'natural number', Score: 1.37661552429 470 | Keyword: 'criterion', Score: 1.2255872488 471 | Keyword: 'linear diophantine equation', Score: 2.83080631495 472 | Keyword: 'strict inequations', Score: 2.13201224804 473 | Keyword: 'nonstrict inequations', Score: 2.135455966 474 | Keyword: 'upper bound', Score: 1.60279768705 475 | Keyword: 'component', Score: 0.737640619278 476 | Keyword: 'minimal set', Score: 4.05876886845 477 | Keyword: 'solution', Score: 1.68319940567 478 | Keyword: 'algorithm', Score: 1.19365406036 479 | Keyword: 'construction', Score: 0.659808635712 480 | Keyword: 'minimal generating set', Score: 4.71141409874 481 | Keyword: 'constructing', Score: 0.66728836298 482 | Keyword: 'minimal supporting set', Score: 4.71247345209 483 | Keyword: 'solving', Score: 0.642318367958 484 | Keyword: 'mixed type', Score: 1.31682945788 485 | 486 | 487 | ### Ranking Keyphrases 488 | 489 | Ranking keyphrases based on their calculated scores. Displaying top 'keywords_num' no. of keyphrases. 
490 | 491 | 492 | ```python 493 | sorted_index = np.flip(np.argsort(phrase_scores),0) 494 | 495 | keywords_num = 10 496 | 497 | print "Keywords:\n" 498 | 499 | for i in xrange(0,keywords_num): 500 | print str(keywords[sorted_index[i]])+", ", 501 | ``` 502 | 503 | Keywords: 504 | 505 | minimal supporting set, minimal generating set, minimal set, linear diophantine equation, nonstrict inequations, strict inequations, system, linear constraint, solution, upper bound, 506 | 507 | 508 | # Input: 509 | 510 | Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types. 511 | 512 | # Extracted Keywords: 513 | 514 | * minimal supporting set, 515 | * minimal generating set, 516 | * minimal set, 517 | * linear diophantine equation, 518 | * nonstrict inequations, 519 | * strict inequations, 520 | * system, 521 | * linear constraint, 522 | * solution, 523 | * upper bound, 524 | -------------------------------------------------------------------------------- /TextRank.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Implementation of TextRank\n", 8 | "(Based on: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "The input text is given below" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "#Source of text:\n", 25 | "#https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents\n", 26 | "\n", 27 | "Text = \"Compatibility of systems of linear constraints over the set of natural numbers. \\\n", 28 | "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and \\\n", 29 | "nonstrict inequations are considered. \\\n", 30 | "Upper bounds for components of a minimal set of solutions and \\\n", 31 | "algorithms of construction of minimal generating sets of solutions for all \\\n", 32 | "types of systems are given. \\\n", 33 | "These criteria and the corresponding algorithms for constructing \\\n", 34 | "a minimal supporting set of solutions can be used in solving all the \\\n", 35 | "considered types of systems and systems of mixed types.\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Cleaning Text Data\n", 43 | "\n", 44 | "The raw input text is cleaned off non-printable characters (if any) and turned into lower case.\n", 45 | "The processed input text is then tokenized using NLTK library functions. 
" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 20, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Tokenized Text: \n", 58 | "\n", 59 | "['compatibility', 'of', 'systems', 'of', 'linear', 'constraints', 'over', 'the', 'set', 'of', 'natural', 'numbers', '.', 'criteria', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equations', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bounds', 'for', 'components', 'of', 'a', 'minimal', 'set', 'of', 'solutions', 'and', 'algorithms', 'of', 'construction', 'of', 'minimal', 'generating', 'sets', 'of', 'solutions', 'for', 'all', 'types', 'of', 'systems', 'are', 'given', '.', 'these', 'criteria', 'and', 'the', 'corresponding', 'algorithms', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solutions', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'types', 'of', 'systems', 'and', 'systems', 'of', 'mixed', 'types', '.']\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "\n", 65 | "import nltk\n", 66 | "from nltk import word_tokenize\n", 67 | "import string\n", 68 | "\n", 69 | "#nltk.download('punkt')\n", 70 | "\n", 71 | "def clean(text):\n", 72 | " text = text.lower()\n", 73 | " printable = set(string.printable)\n", 74 | " text = filter(lambda x: x in printable, text)\n", 75 | " text = \"\".join(list(text))\n", 76 | " return text\n", 77 | "\n", 78 | "Cleaned_text = clean(Text)\n", 79 | "# print(Cleaned_text)\n", 80 | "text = word_tokenize(Cleaned_text)\n", 81 | "\n", 82 | "print (\"Tokenized Text: \\n\")\n", 83 | "print (text)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### POS Tagging For Lemmatization\n", 91 | "\n", 92 | "NLTK is again used for POS tagging the input text so that the words can be lemmatized based on their POS tags.\n", 93 | "\n", 94 | "Description of POS tags: \n", 95 | "\n", 96 | "\n", 97 | "http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 22, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Tokenized Text with POS tags: \n", 110 | "\n", 111 | "[('compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ('of', 'IN'), ('linear', 'JJ'), ('constraints', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('.', '.'), ('criteria', 'NNS'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'NN'), ('equations', 'NNS'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bounds', 'NNS'), ('for', 'IN'), ('components', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('and', 'CC'), ('algorithms', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('sets', 'NNS'), ('of', 'IN'), ('solutions', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criteria', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), 
('algorithms', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('and', 'CC'), ('systems', 'NNS'), ('of', 'IN'), ('mixed', 'JJ'), ('types', 'NNS'), ('.', '.')]\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "#nltk.download('averaged_perceptron_tagger')\n", 117 | " \n", 118 | "POS_tag = nltk.pos_tag(text)\n", 119 | "\n", 120 | "print (\"Tokenized Text with POS tags: \\n\")\n", 121 | "print (POS_tag)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Lemmatization\n", 129 | "\n", 130 | "The tokenized text (mainly the nouns and adjectives) is normalized by lemmatization.\n", 131 | "In lemmatization different grammatical counterparts of a word will be replaced by single\n", 132 | "basic lemma. For example, 'glasses' may be replaced by 'glass'. \n", 133 | "\n", 134 | "Details about lemmatization: \n", 135 | " \n", 136 | "https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 23, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Text tokens after lemmatization of adjectives and nouns: \n", 149 | "\n", 150 | "['compatibility', 'of', 'system', 'of', 'linear', 'constraint', 'over', 'the', 'set', 'of', 'natural', 'number', '.', 'criterion', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equation', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bound', 'for', 'component', 'of', 'a', 'minimal', 'set', 'of', 'solution', 'and', 'algorithm', 'of', 'construction', 'of', 'minimal', 'generating', 'set', 'of', 'solution', 'for', 'all', 'type', 'of', 'system', 'are', 'given', '.', 'these', 'criterion', 'and', 'the', 'corresponding', 'algorithm', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solution', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'type', 'of', 'system', 'and', 'system', 'of', 'mixed', 'type', '.']\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "#nltk.download('wordnet')\n", 156 | "\n", 157 | "from nltk.stem import WordNetLemmatizer\n", 158 | "\n", 159 | "wordnet_lemmatizer = WordNetLemmatizer()\n", 160 | "\n", 161 | "adjective_tags = ['JJ','JJR','JJS']\n", 162 | "\n", 163 | "lemmatized_text = []\n", 164 | "\n", 165 | "for word in POS_tag:\n", 166 | " if word[1] in adjective_tags:\n", 167 | " lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos=\"a\")))\n", 168 | " else:\n", 169 | " lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun\n", 170 | " \n", 171 | "print (\"Text tokens after lemmatization of adjectives and nouns: \\n\")\n", 172 | "print (lemmatized_text)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### POS tagging for Filtering\n", 180 | "\n", 181 | "The lemmatized text is POS tagged here. The tags will be used for filtering later on." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 24, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "Lemmatized text with POS tags: \n", 194 | "\n", 195 | "[('compatibility', 'NN'), ('of', 'IN'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('constraint', 'NN'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('number', 'NN'), ('.', '.'), ('criterion', 'NN'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'JJ'), ('equation', 'NN'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bound', 'NN'), ('for', 'IN'), ('component', 'NN'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('and', 'CC'), ('algorithm', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('for', 'IN'), ('all', 'DT'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criterion', 'NN'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithm', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('and', 'CC'), ('system', 'NN'), ('of', 'IN'), ('mixed', 'JJ'), ('type', 'NN'), ('.', '.')]\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "POS_tag = nltk.pos_tag(lemmatized_text)\n", 201 | "\n", 202 | "print (\"Lemmatized text with POS tags: \\n\")\n", 203 | "print (POS_tag)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## POS Based Filtering\n", 211 | "\n", 212 | "Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here\n", 213 | "considered as a stopword (non-content). This is based on the assumption that usually keywords are noun,\n", 214 | "adjectives or gerunds. \n", 215 | "\n", 216 | "Punctuations are added to the stopword list too." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 25, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "stopwords = []\n", 226 | "\n", 227 | "wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] \n", 228 | "\n", 229 | "for word in POS_tag:\n", 230 | " if word[1] not in wanted_POS:\n", 231 | " stopwords.append(word[0])\n", 232 | "\n", 233 | "punctuations = list(str(string.punctuation))\n", 234 | "\n", 235 | "stopwords = stopwords + punctuations" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Complete stopword generation\n", 243 | "\n", 244 | "Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may\n", 245 | "remain which are very bad candidates for being keywords (or part of it). 
\n", 246 | "\n", 247 | "An external file constituting a long list of stopwords is loaded and all the words are added with the previous\n", 248 | "stopwords to create the final list 'stopwords-plus' which is then converted into a set. \n", 249 | "\n", 250 | "(Source of stopwords data: https://www.ranks.nl/stopwords)\n", 251 | "\n", 252 | "Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. \n", 253 | "\n", 254 | "(The contents of this set will be later used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords, and work with a 'bag-of-words' approach. I will be developing the graph using unigram texts as vertices)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 27, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "stopword_file = open(\"long_stopwords.txt\", \"r\")\n", 264 | "#Source = https://www.ranks.nl/stopwords\n", 265 | "\n", 266 | "lots_of_stopwords = []\n", 267 | "\n", 268 | "for line in stopword_file.readlines():\n", 269 | " lots_of_stopwords.append(str(line.strip()))\n", 270 | "\n", 271 | "stopwords_plus = []\n", 272 | "stopwords_plus = stopwords + lots_of_stopwords\n", 273 | "stopwords_plus = set(stopwords_plus)\n", 274 | "\n", 275 | "#Stopwords_plus contain total set of all stopwords" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Removing Stopwords \n", 283 | "\n", 284 | "Removing stopwords from lemmatized_text. \n", 285 | "Processeced_text condtains the result." 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 29, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "['compatibility', 'system', 'linear', 'constraint', 'set', 'natural', 'number', 'criterion', 'compatibility', 'system', 'linear', 'diophantine', 'equation', 'strict', 'inequations', 'nonstrict', 'inequations', 'upper', 'bound', 'component', 'minimal', 'set', 'solution', 'algorithm', 'construction', 'minimal', 'generating', 'set', 'solution', 'type', 'system', 'criterion', 'algorithm', 'constructing', 'minimal', 'supporting', 'set', 'solution', 'solving', 'type', 'system', 'system', 'mixed', 'type']\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "processed_text = []\n", 303 | "for word in lemmatized_text:\n", 304 | " if word not in stopwords_plus:\n", 305 | " processed_text.append(word)\n", 306 | "print (processed_text)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "## Vocabulary Creation\n", 314 | "\n", 315 | "Vocabulary will only contain unique words from processed_text." 
316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 31, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "['solving', 'equation', 'generating', 'diophantine', 'construction', 'set', 'mixed', 'minimal', 'compatibility', 'component', 'system', 'natural', 'inequations', 'constraint', 'criterion', 'type', 'upper', 'solution', 'linear', 'algorithm', 'strict', 'bound', 'nonstrict', 'number', 'supporting', 'constructing']\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "vocabulary = list(set(processed_text))\n", 333 | "print (vocabulary)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Building Graph\n", 341 | "\n", 342 | "TextRank is a graph based model, and thus it requires us to build a graph. Each words in the vocabulary will serve as a vertex for graph. The words will be represented in the vertices by their index in vocabulary list. \n", 343 | "\n", 344 | "The weighted_edge matrix contains the information of edge connections among all vertices.\n", 345 | "I am building wieghted undirected edges.\n", 346 | "\n", 347 | "weighted_edge[i][j] contains the weight of the connecting edge between the word vertex represented by vocabulary index i and the word vertex represented by vocabulary j.\n", 348 | "\n", 349 | "If weighted_edge[i][j] is zero, it means no edge connection is present between the words represented by index i and j.\n", 350 | "\n", 351 | "There is a connection between the words (and thus between i and j which represents them) if the words co-occur within a window of a specified 'window_size' in the processed_text.\n", 352 | "\n", 353 | "The value of the weighted_edge[i][j] is increased by (1/(distance between positions of words currently represented by i and j)) for every connection discovered between the same words in different locations of the text. \n", 354 | "\n", 355 | "The covered_coocurrences list (which is contain the list of pairs of absolute positions in processed_text of the words whose coocurrence at that location is already checked) is managed so that the same two words located in the same positions in processed_text are not repetitively counted while sliding the window one text unit at a time.\n", 356 | "\n", 357 | "The score of all vertices are intialized to one. \n", 358 | "\n", 359 | "Self-connections are not considered, so weighted_edge[i][i] will be zero." 
360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 33, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "import numpy as np\n", 369 | "import math\n", 370 | "vocab_len = len(vocabulary)\n", 371 | "\n", 372 | "weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)\n", 373 | "\n", 374 | "score = np.zeros((vocab_len),dtype=np.float32)\n", 375 | "window_size = 3\n", 376 | "covered_coocurrences = []\n", 377 | "\n", 378 | "for i in range(0,vocab_len):\n", 379 | " score[i]=1\n", 380 | " for j in range(0,vocab_len):\n", 381 | " if j==i:\n", 382 | " weighted_edge[i][j]=0\n", 383 | " else:\n", 384 | " for window_start in range(0,(len(processed_text)-window_size)):\n", 385 | " \n", 386 | " window_end = window_start+window_size\n", 387 | " \n", 388 | " window = processed_text[window_start:window_end]\n", 389 | " \n", 390 | " if (vocabulary[i] in window) and (vocabulary[j] in window):\n", 391 | " \n", 392 | " index_of_i = window_start + window.index(vocabulary[i])\n", 393 | " index_of_j = window_start + window.index(vocabulary[j])\n", 394 | " \n", 395 | " # index_of_x is the absolute position of the xth term in the window \n", 396 | " # (counting from 0) \n", 397 | " # in the processed_text\n", 398 | " \n", 399 | " if [index_of_i,index_of_j] not in covered_coocurrences:\n", 400 | " weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)\n", 401 | " covered_coocurrences.append([index_of_i,index_of_j])\n" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "### Calculating weighted summation of connections of a vertex\n", 409 | "\n", 410 | "inout[i] will contain the sum of the weights of all the undirected connections\edges associated with the vertex represented by i." 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 34, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "inout = np.zeros((vocab_len),dtype=np.float32)\n", 420 | "\n", 421 | "for i in range(0,vocab_len):\n", 422 | " for j in range(0,vocab_len):\n", 423 | " inout[i]+=weighted_edge[i][j]" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "### Scoring Vertices\n", 431 | "\n", 432 | "The formula used for scoring a vertex represented by i is:\n", 433 | "\n", 434 | "score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ] where j belongs to the list of vertices that have a connection with i. \n", 435 | "\n", 436 | "d is the damping factor.\n", 437 | "\n", 438 | "The score is iteratively updated until convergence. 
" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 35, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "Converging at iteration 23....\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "MAX_ITERATIONS = 50\n", 456 | "d=0.85\n", 457 | "threshold = 0.0001 #convergence threshold\n", 458 | "\n", 459 | "for iter in range(0,MAX_ITERATIONS):\n", 460 | " prev_score = np.copy(score)\n", 461 | " \n", 462 | " for i in range(0,vocab_len):\n", 463 | " \n", 464 | " summation = 0\n", 465 | " for j in range(0,vocab_len):\n", 466 | " if weighted_edge[i][j] != 0:\n", 467 | " summation += (weighted_edge[i][j]/inout[j])*score[j]\n", 468 | " \n", 469 | " score[i] = (1-d) + d*(summation)\n", 470 | " \n", 471 | " if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition\n", 472 | " print(\"Converging at iteration \"+str(iter)+\"....\")\n", 473 | " break\n" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 36, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Score of solving: 0.64231944\n", 486 | "Score of equation: 0.79981786\n", 487 | "Score of generating: 0.65264744\n", 488 | "Score of diophantine: 0.759297\n", 489 | "Score of construction: 0.6598107\n", 490 | "Score of set: 2.2718465\n", 491 | "Score of mixed: 0.2358227\n", 492 | "Score of minimal: 1.7869267\n", 493 | "Score of compatibility: 0.9445859\n", 494 | "Score of component: 0.73764145\n", 495 | "Score of system: 2.1203177\n", 496 | "Score of natural: 0.6883006\n", 497 | "Score of inequations: 1.308244\n", 498 | "Score of constraint: 0.67441183\n", 499 | "Score of criterion: 1.2255884\n", 500 | "Score of type: 1.0810083\n", 501 | "Score of upper: 0.8167923\n", 502 | "Score of solution: 1.683202\n", 503 | "Score of linear: 1.2716976\n", 504 | "Score of algorithm: 1.1936545\n", 505 | "Score of strict: 0.8237729\n", 506 | "Score of bound: 0.78600633\n", 507 | "Score of nonstrict: 0.8272164\n", 508 | "Score of number: 0.6883157\n", 509 | "Score of supporting: 0.6537049\n", 510 | "Score of constructing: 0.66728705\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "for i in range(0,vocab_len):\n", 516 | " print(\"Score of \"+vocabulary[i]+\": \"+str(score[i]))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "### Phrase Partitioning\n", 524 | "\n", 525 | "Partitioning lemmatized_text into phrases using the stopwords in it as delimiters.\n", 526 | "The phrases are also candidates for keyphrases to be extracted. 
" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 37, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "Partitioned Phrases (Candidate Keyphrases): \n", 539 | "\n", 540 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['compatibility'], ['system'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['solution'], ['type'], ['system'], ['criterion'], ['algorithm'], ['constructing'], ['minimal', 'supporting', 'set'], ['solution'], ['solving'], ['type'], ['system'], ['system'], ['mixed', 'type']]\n" 541 | ] 542 | } 543 | ], 544 | "source": [ 545 | "phrases = []\n", 546 | "\n", 547 | "phrase = \" \"\n", 548 | "for word in lemmatized_text:\n", 549 | " \n", 550 | " if word in stopwords_plus:\n", 551 | " if phrase!= \" \":\n", 552 | " phrases.append(str(phrase).strip().split())\n", 553 | " phrase = \" \"\n", 554 | " elif word not in stopwords_plus:\n", 555 | " phrase+=str(word)\n", 556 | " phrase+=\" \"\n", 557 | "\n", 558 | "print(\"Partitioned Phrases (Candidate Keyphrases): \\n\")\n", 559 | "print(phrases)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "### Create a list of unique phrases.\n", 567 | "\n", 568 | "Repeating phrases\keyphrase candidates have no purpose here anymore. " 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 38, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "Unique Phrases (Candidate Keyphrases): \n", 581 | "\n", 582 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['type'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "unique_phrases = []\n", 588 | "\n", 589 | "for phrase in phrases:\n", 590 | " if phrase not in unique_phrases:\n", 591 | " unique_phrases.append(phrase)\n", 592 | "\n", 593 | "print(\"Unique Phrases (Candidate Keyphrases): \\n\")\n", 594 | "print(unique_phrases)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "### Thinning the list of candidate-keyphrases.\n", 602 | "\n", 603 | "Removing single-word keyphrase candidates that are present in multi-word alternatives. 
" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 39, 609 | "metadata": {}, 610 | "outputs": [ 611 | { 612 | "name": "stdout", 613 | "output_type": "stream", 614 | "text": [ 615 | "Thinned Unique Phrases (Candidate Keyphrases): \n", 616 | "\n", 617 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "for word in vocabulary:\n", 623 | " #print word\n", 624 | " for phrase in unique_phrases:\n", 625 | " if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):\n", 626 | " #if len(phrase)>1 then the current phrase is multi-worded.\n", 627 | " #if the word in vocabulary is present in unique_phrases as a single-word-phrase\n", 628 | " # and at the same time present as a word within a multi-worded phrase,\n", 629 | " # then I will remove the single-word-phrase from the list.\n", 630 | " unique_phrases.remove([word])\n", 631 | " \n", 632 | "print(\"Thinned Unique Phrases (Candidate Keyphrases): \\n\")\n", 633 | "print(unique_phrases) " 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "### Scoring Keyphrases\n", 641 | "\n", 642 | "Scoring the phrases (candidate keyphrases) and building up a list of keyphrases\\keywords\n", 643 | "by listing untokenized versions of tokenized phrases\\candidate-keyphrases.\n", 644 | "Phrases are scored by adding the score of their members (words\\text-units that were ranked by the graph algorithm)\n" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 40, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "name": "stdout", 654 | "output_type": "stream", 655 | "text": [ 656 | "Keyword: 'compatibility', Score: 0.944585919380188\n", 657 | "Keyword: 'system', Score: 2.1203176975250244\n", 658 | "Keyword: 'linear constraint', Score: 1.9461094737052917\n", 659 | "Keyword: 'natural number', Score: 1.3766162991523743\n", 660 | "Keyword: 'criterion', Score: 1.2255884408950806\n", 661 | "Keyword: 'linear diophantine equation', Score: 2.8308125138282776\n", 662 | "Keyword: 'strict inequations', Score: 2.132016897201538\n", 663 | "Keyword: 'nonstrict inequations', Score: 2.135460376739502\n", 664 | "Keyword: 'upper bound', Score: 1.6027986407279968\n", 665 | "Keyword: 'component', Score: 0.737641453742981\n", 666 | "Keyword: 'minimal set', Score: 4.0587732791900635\n", 667 | "Keyword: 'solution', Score: 1.6832020282745361\n", 668 | "Keyword: 'algorithm', Score: 1.1936545372009277\n", 669 | "Keyword: 'construction', Score: 0.6598107218742371\n", 670 | "Keyword: 'minimal generating set', Score: 4.711420714855194\n", 671 | "Keyword: 'constructing', Score: 0.6672870516777039\n", 672 | "Keyword: 'minimal supporting set', Score: 4.712478160858154\n", 673 | "Keyword: 'solving', Score: 0.6423194408416748\n", 674 | "Keyword: 'mixed type', Score: 1.3168310225009918\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "phrase_scores = []\n", 680 | "keywords = []\n", 681 | "for phrase in unique_phrases:\n", 682 | " phrase_score=0\n", 683 | " keyword = ''\n", 684 | " for word in phrase:\n", 685 | " keyword += str(word)\n", 686 | " keyword += \" \"\n", 687 | " 
phrase_score+=score[vocabulary.index(word)]\n", 688 | " phrase_scores.append(phrase_score)\n", 689 | " keywords.append(keyword.strip())\n", 690 | "\n", 691 | "i=0\n", 692 | "for keyword in keywords:\n", 693 | " print (\"Keyword: '\"+str(keyword)+\"', Score: \"+str(phrase_scores[i]))\n", 694 | " i+=1" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "### Ranking Keyphrases\n", 702 | "\n", 703 | "Ranking keyphrases based on their calculated scores. Displaying top keywords_num no. of keyphrases." 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 43, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "name": "stdout", 713 | "output_type": "stream", 714 | "text": [ 715 | "Keywords:\n", 716 | "\n", 717 | "minimal supporting set, minimal generating set, minimal set, linear diophantine equation, nonstrict inequations, strict inequations, system, linear constraint, solution, upper bound, " 718 | ] 719 | } 720 | ], 721 | "source": [ 722 | "sorted_index = np.flip(np.argsort(phrase_scores),0)\n", 723 | "\n", 724 | "keywords_num = 10\n", 725 | "\n", 726 | "print(\"Keywords:\\n\")\n", 727 | "\n", 728 | "for i in range(0,keywords_num):\n", 729 | " print(str(keywords[sorted_index[i]])+\", \", end=' ')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "# Input:\n", 737 | "\n", 738 | "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.\n", 739 | "\n", 740 | "# Extracted Keywords:\n", 741 | "\n", 742 | "* minimal supporting set, \n", 743 | "* minimal generating set, \n", 744 | "* minimal set, \n", 745 | "* linear diophantine equation, \n", 746 | "* nonstrict inequations, \n", 747 | "* strict inequations, \n", 748 | "* system, \n", 749 | "* linear constraint, \n", 750 | "* solution, \n", 751 | "* upper bound, \n" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [] 767 | } 768 | ], 769 | "metadata": { 770 | "kernelspec": { 771 | "display_name": "Python 3", 772 | "language": "python", 773 | "name": "python3" 774 | }, 775 | "language_info": { 776 | "codemirror_mode": { 777 | "name": "ipython", 778 | "version": 3 779 | }, 780 | "file_extension": ".py", 781 | "mimetype": "text/x-python", 782 | "name": "python", 783 | "nbconvert_exporter": "python", 784 | "pygments_lexer": "ipython3", 785 | "version": "3.7.4" 786 | } 787 | }, 788 | "nbformat": 4, 789 | "nbformat_minor": 2 790 | } 791 | --------------------------------------------------------------------------------
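A condensed sketch of the pipeline in TextRank.ipynb. The function below restates the notebook's steps (window-based co-occurrence graph, PageRank-style vertex scoring, stopword-delimited phrase partitioning, phrase scoring and ranking) in one self-contained place. It is illustrative rather than the notebook's exact code: the name `textrank_keyphrases` and its parameters are made up for this sketch, the co-occurrence bookkeeping is a close variant of the `covered_coocurrences` logic above, and the single-word-phrase thinning step is omitted. It assumes `processed_text`, `lemmatized_text`, `vocabulary`, and `stopwords_plus` come from the notebook's earlier preprocessing, with every non-stopword token of `lemmatized_text` present in `vocabulary`.

```python
import numpy as np


def textrank_keyphrases(processed_text, lemmatized_text, vocabulary,
                        stopwords_plus, window_size=3, d=0.85,
                        max_iterations=50, threshold=1e-4, top_n=10):
    # Illustrative sketch only; mirrors the notebook's cells, not a drop-in replacement.
    n = len(vocabulary)
    index = {w: i for i, w in enumerate(vocabulary)}

    # Build a symmetric co-occurrence graph: words appearing in the same
    # sliding window are connected, weighted by 1 / positional distance.
    # Each pair of absolute positions is counted once (variant of covered_coocurrences).
    weighted_edge = np.zeros((n, n), dtype=np.float32)
    counted = set()
    for start in range(max(len(processed_text) - window_size + 1, 1)):
        window = processed_text[start:start + window_size]
        for a in range(len(window)):
            for b in range(a + 1, len(window)):
                if window[a] == window[b]:
                    continue
                pair = (start + a, start + b)
                if pair in counted:
                    continue
                counted.add(pair)
                i, j = index[window[a]], index[window[b]]
                weighted_edge[i][j] += 1.0 / (b - a)
                weighted_edge[j][i] += 1.0 / (b - a)

    # inout[i] = total weight of edges touching vertex i.
    inout = weighted_edge.sum(axis=1)

    # PageRank-style updates until the scores stop changing.
    score = np.ones(n, dtype=np.float32)
    for _ in range(max_iterations):
        prev = score.copy()
        for i in range(n):
            summation = sum((weighted_edge[i][j] / inout[j]) * score[j]
                            for j in range(n) if weighted_edge[i][j] != 0)
            score[i] = (1 - d) + d * summation
        if np.sum(np.fabs(prev - score)) <= threshold:
            break

    # Partition the lemmatized text into candidate phrases at stopwords.
    phrases, current = [], []
    for word in lemmatized_text:
        if word in stopwords_plus:
            if current:
                phrases.append(current)
                current = []
        else:
            current.append(word)
    if current:
        phrases.append(current)

    # Score each unique phrase by summing the scores of its words, then rank.
    seen, ranked = set(), []
    for phrase in phrases:
        key = " ".join(phrase)
        if key not in seen:
            seen.add(key)
            ranked.append((key, float(sum(score[index[w]] for w in phrase))))
    ranked.sort(key=lambda kv: kv[1], reverse=True)
    return ranked[:top_n]
```

Called as `textrank_keyphrases(processed_text, lemmatized_text, vocabulary, stopwords_plus)`, it should produce roughly the same top keyphrases as the ranked list printed in the notebook, with small differences possible where the co-occurrence counting and the omitted thinning step diverge from the original cells.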