├── LICENSE ├── .gitignore ├── long_stopwords.txt ├── README.md └── TextRank.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jishnu Ray Chowdhury 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /long_stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | abst 6 | accordance 7 | according 8 | accordingly 9 | across 10 | act 11 | actually 12 | added 13 | adj 14 | affected 15 | affecting 16 | affects 17 | after 18 | afterwards 19 | again 20 | against 21 | ah 22 | all 23 | almost 24 | alone 25 | along 26 | already 27 | also 28 | although 29 | always 30 | am 31 | among 32 | amongst 33 | an 34 | and 35 | announce 36 | another 37 | any 38 | anybody 39 | anyhow 40 | anymore 41 | anyone 42 | anything 43 | anyway 44 | anyways 45 | anywhere 46 | apparently 47 | approximately 48 | are 49 | aren 50 | arent 51 | arise 52 | around 53 | as 54 | aside 55 | ask 56 | asking 57 | at 58 | auth 59 | available 60 | away 61 | awfully 62 | b 63 | back 64 | be 65 | became 66 | because 67 | become 68 | becomes 69 | becoming 70 | been 71 | before 72 | beforehand 73 | begin 74 | beginning 75 | beginnings 76 | begins 77 | behind 78 | being 79 | believe 80 | below 81 | beside 82 | besides 83 | between 84 | beyond 85 | biol 86 | both 87 | brief 88 | briefly 89 | but 90 | by 91 | c 92 | ca 93 | came 94 | can 95 | cannot 96 | can't 97 | cause 98 | causes 99 | certain 100 | certainly 101 | co 102 | com 103 | come 104 | comes 105 | contain 106 | containing 107 | corresponding 108 | contains 109 | could 110 | couldnt 111 | d 112 | date 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | done 121 | don't 122 | down 123 | downwards 124 | due 125 | during 126 | e 127 | each 128 | ed 129 | edu 130 | effect 131 | eg 132 | eight 133 | eighty 134 | either 135 | else 136 | elsewhere 137 | end 138 | ending 139 | enough 140 | especially 141 | et 142 | et-al 143 | etc 144 | even 145 | ever 146 | every 147 | everybody 148 | everyone 149 | everything 150 | everywhere 151 | ex 152 | except 153 | f 154 | far 155 | few 156 | ff 157 | fifth 158 | first 159 | five 160 | fix 161 | followed 162 | following 163 | follows 164 | for 165 | former 166 | formerly 167 | forth 168 | found 169 | four 170 | from 171 | further 172 | furthermore 173 | g 174 | gave 175 | get 176 | gets 177 | getting 178 | give 179 | given 180 | gives 181 | giving 182 | go 183 | goes 184 | gone 185 | got 186 | gotten 187 | h 188 | had 189 | happens 190 | hardly 191 | has 192 | hasn't 193 | have 194 | haven't 195 
| having 196 | he 197 | hed 198 | hence 199 | her 200 | here 201 | hereafter 202 | hereby 203 | herein 204 | heres 205 | hereupon 206 | hers 207 | herself 208 | hes 209 | hi 210 | hid 211 | him 212 | himself 213 | his 214 | hither 215 | home 216 | how 217 | howbeit 218 | however 219 | hundred 220 | i 221 | id 222 | ie 223 | if 224 | i'll 225 | im 226 | immediate 227 | immediately 228 | importance 229 | important 230 | in 231 | inc 232 | indeed 233 | index 234 | information 235 | instead 236 | into 237 | invention 238 | inward 239 | is 240 | isn't 241 | it 242 | itd 243 | it'll 244 | its 245 | itself 246 | i've 247 | j 248 | just 249 | k 250 | keep 251 | keeps 252 | kept 253 | kg 254 | km 255 | know 256 | known 257 | knows 258 | l 259 | largely 260 | last 261 | lately 262 | later 263 | latter 264 | latterly 265 | least 266 | less 267 | lest 268 | let 269 | lets 270 | like 271 | liked 272 | likely 273 | line 274 | little 275 | 'll 276 | look 277 | looking 278 | looks 279 | ltd 280 | m 281 | made 282 | mainly 283 | make 284 | makes 285 | many 286 | may 287 | maybe 288 | me 289 | mean 290 | means 291 | meantime 292 | meanwhile 293 | merely 294 | mg 295 | might 296 | million 297 | miss 298 | ml 299 | more 300 | moreover 301 | most 302 | mostly 303 | mr 304 | mrs 305 | much 306 | mug 307 | must 308 | my 309 | myself 310 | n 311 | na 312 | name 313 | namely 314 | nay 315 | nd 316 | near 317 | nearly 318 | necessarily 319 | necessary 320 | need 321 | needs 322 | neither 323 | never 324 | nevertheless 325 | new 326 | next 327 | nine 328 | ninety 329 | no 330 | nobody 331 | non 332 | none 333 | nonetheless 334 | noone 335 | nor 336 | normally 337 | nos 338 | not 339 | noted 340 | nothing 341 | now 342 | nowhere 343 | o 344 | obtain 345 | obtained 346 | obviously 347 | of 348 | off 349 | often 350 | oh 351 | ok 352 | okay 353 | old 354 | omitted 355 | on 356 | once 357 | one 358 | ones 359 | only 360 | onto 361 | or 362 | ord 363 | other 364 | others 365 | otherwise 366 | ought 367 | our 368 | ours 369 | ourselves 370 | out 371 | outside 372 | over 373 | overall 374 | owing 375 | own 376 | p 377 | page 378 | pages 379 | part 380 | particular 381 | particularly 382 | past 383 | per 384 | perhaps 385 | placed 386 | please 387 | plus 388 | poorly 389 | possible 390 | possibly 391 | potentially 392 | pp 393 | predominantly 394 | present 395 | previously 396 | primarily 397 | probably 398 | promptly 399 | proud 400 | provides 401 | put 402 | q 403 | que 404 | quickly 405 | quite 406 | qv 407 | r 408 | ran 409 | rather 410 | rd 411 | re 412 | readily 413 | really 414 | recent 415 | recently 416 | ref 417 | refs 418 | regarding 419 | regardless 420 | regards 421 | related 422 | relatively 423 | research 424 | respectively 425 | resulted 426 | resulting 427 | results 428 | right 429 | run 430 | s 431 | said 432 | same 433 | saw 434 | say 435 | saying 436 | says 437 | sec 438 | section 439 | see 440 | seeing 441 | seem 442 | seemed 443 | seeming 444 | seems 445 | seen 446 | self 447 | selves 448 | sent 449 | seven 450 | several 451 | shall 452 | she 453 | shed 454 | she'll 455 | shes 456 | should 457 | shouldn't 458 | show 459 | showed 460 | shown 461 | showns 462 | shows 463 | significant 464 | significantly 465 | similar 466 | similarly 467 | since 468 | six 469 | slightly 470 | so 471 | some 472 | somebody 473 | somehow 474 | someone 475 | somethan 476 | something 477 | sometime 478 | sometimes 479 | somewhat 480 | somewhere 481 | soon 482 | sorry 483 | specifically 484 | specified 485 | specify 486 | 
specifying 487 | still 488 | stop 489 | strongly 490 | sub 491 | substantially 492 | successfully 493 | such 494 | sufficiently 495 | suggest 496 | sup 497 | sure t 498 | take 499 | taken 500 | taking 501 | tell 502 | tends 503 | th 504 | than 505 | thank 506 | thanks 507 | thanx 508 | that 509 | that'll 510 | thats 511 | that've 512 | the 513 | their 514 | theirs 515 | them 516 | themselves 517 | then 518 | thence 519 | there 520 | thereafter 521 | thereby 522 | thered 523 | therefore 524 | therein 525 | there'll 526 | thereof 527 | therere 528 | theres 529 | thereto 530 | thereupon 531 | there've 532 | these 533 | they 534 | theyd 535 | they'll 536 | theyre 537 | they've 538 | think 539 | this 540 | those 541 | thou 542 | though 543 | thoughh 544 | thousand 545 | throug 546 | through 547 | throughout 548 | thru 549 | thus 550 | til 551 | tip 552 | to 553 | together 554 | too 555 | took 556 | toward 557 | towards 558 | tried 559 | tries 560 | truly 561 | try 562 | trying 563 | ts 564 | twice 565 | two 566 | u 567 | un 568 | under 569 | unfortunately 570 | unless 571 | unlike 572 | unlikely 573 | until 574 | unto 575 | up 576 | upon 577 | ups 578 | us 579 | use 580 | used 581 | useful 582 | usefully 583 | usefulness 584 | uses 585 | using 586 | usually 587 | v 588 | value 589 | various 590 | 've 591 | very 592 | via 593 | viz 594 | vol 595 | vols 596 | vs 597 | w 598 | want 599 | wants 600 | was 601 | wasnt 602 | way 603 | we 604 | wed 605 | welcome 606 | we'll 607 | went 608 | were 609 | werent 610 | we've 611 | what 612 | whatever 613 | what'll 614 | whats 615 | when 616 | whence 617 | whenever 618 | where 619 | whereafter 620 | whereas 621 | whereby 622 | wherein 623 | wheres 624 | whereupon 625 | wherever 626 | whether 627 | which 628 | while 629 | whim 630 | whither 631 | who 632 | whod 633 | whoever 634 | whole 635 | who'll 636 | whom 637 | whomever 638 | whos 639 | whose 640 | why 641 | widely 642 | willing 643 | wish 644 | with 645 | within 646 | without 647 | wont 648 | words 649 | world 650 | would 651 | wouldnt 652 | www 653 | x 654 | y 655 | yes 656 | yet 657 | you 658 | youd 659 | you'll 660 | your 661 | youre 662 | yours 663 | yourself 664 | yourselves 665 | you've 666 | z 667 | zero 668 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Implementation of TextRank for keyword Extraction 3 | 4 | Based on: 5 | 6 | [TextRank: Bringing Order into Texts - by Rada Mihalcea and Paul Tarau](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) 7 | 8 | The input text is given below 9 | 10 | 11 | ```python 12 | #Source of text: 13 | #https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents 14 | 15 | Text = "Compatibility of systems of linear constraints over the set of natural numbers. \ 16 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and \ 17 | nonstrict inequations are considered. \ 18 | Upper bounds for components of a minimal set of solutions and \ 19 | algorithms of construction of minimal generating sets of solutions for all \ 20 | types of systems are given. \ 21 | These criteria and the corresponding algorithms for constructing \ 22 | a minimal supporting set of solutions can be used in solving all the \ 23 | considered types of systems and systems of mixed types." 
24 | ``` 25 | 26 | ### Cleaning Text Data 27 | 28 | The raw input text is cleaned off non-printable characters (if any) and turned into lower case. 29 | The processed input text is then tokenized using NLTK library functions. 30 | 31 | 32 | ```python 33 | 34 | import nltk 35 | from nltk import word_tokenize 36 | import string 37 | 38 | #nltk.download('punkt') 39 | 40 | def clean(text): 41 | text = text.lower() 42 | printable = set(string.printable) 43 | text = filter(lambda x: x in printable, text) #filter funny characters, if any. 44 | return text 45 | 46 | Cleaned_text = clean(Text) 47 | 48 | text = word_tokenize(Cleaned_text) 49 | 50 | print "Tokenized Text: \n" 51 | print text 52 | ``` 53 | 54 | Tokenized Text: 55 | 56 | ['compatibility', 'of', 'systems', 'of', 'linear', 'constraints', 'over', 'the', 'set', 'of', 'natural', 'numbers', '.', 'criteria', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equations', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bounds', 'for', 'components', 'of', 'a', 'minimal', 'set', 'of', 'solutions', 'and', 'algorithms', 'of', 'construction', 'of', 'minimal', 'generating', 'sets', 'of', 'solutions', 'for', 'all', 'types', 'of', 'systems', 'are', 'given', '.', 'these', 'criteria', 'and', 'the', 'corresponding', 'algorithms', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solutions', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'types', 'of', 'systems', 'and', 'systems', 'of', 'mixed', 'types', '.'] 57 | 58 | 59 | ### POS Tagging For Lemmatization 60 | 61 | NLTK is again used for POS tagging the input text so that the words can be lemmatized based on their POS tags. 62 | 63 | Description of POS tags: 64 | 65 | 66 | http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html 67 | 68 | 69 | ```python 70 | #nltk.download('averaged_perceptron_tagger') 71 | 72 | POS_tag = nltk.pos_tag(text) 73 | 74 | print "Tokenized Text with POS tags: \n" 75 | print POS_tag 76 | ``` 77 | 78 | Tokenized Text with POS tags: 79 | 80 | [('compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ('of', 'IN'), ('linear', 'JJ'), ('constraints', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('.', '.'), ('criteria', 'NNS'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'NN'), ('equations', 'NNS'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bounds', 'NNS'), ('for', 'IN'), ('components', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('and', 'CC'), ('algorithms', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('sets', 'NNS'), ('of', 'IN'), ('solutions', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criteria', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithms', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), 
('the', 'DT'), ('considered', 'VBN'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('and', 'CC'), ('systems', 'NNS'), ('of', 'IN'), ('mixed', 'JJ'), ('types', 'NNS'), ('.', '.')] 81 | 82 | 83 | ### Lemmatization 84 | 85 | The tokenized text (mainly the nouns and adjectives) is normalized by lemmatization. 86 | In lemmatization different grammatical counterparts of a word will be replaced by single 87 | basic lemma. For example, 'glasses' may be replaced by 'glass'. 88 | 89 | Details about lemmatization: 90 | 91 | https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html 92 | 93 | 94 | ```python 95 | #nltk.download('wordnet') 96 | 97 | from nltk.stem import WordNetLemmatizer 98 | 99 | wordnet_lemmatizer = WordNetLemmatizer() 100 | 101 | adjective_tags = ['JJ','JJR','JJS'] 102 | 103 | lemmatized_text = [] 104 | 105 | for word in POS_tag: 106 | if word[1] in adjective_tags: 107 | lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a"))) 108 | else: 109 | lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun 110 | 111 | print "Text tokens after lemmatization of adjectives and nouns: \n" 112 | print lemmatized_text 113 | ``` 114 | 115 | Text tokens after lemmatization of adjectives and nouns: 116 | 117 | ['compatibility', 'of', 'system', 'of', 'linear', 'constraint', 'over', 'the', 'set', 'of', 'natural', 'number', '.', 'criterion', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equation', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bound', 'for', 'component', 'of', 'a', 'minimal', 'set', 'of', 'solution', 'and', 'algorithm', 'of', 'construction', 'of', 'minimal', 'generating', 'set', 'of', 'solution', 'for', 'all', 'type', 'of', 'system', 'are', 'given', '.', 'these', 'criterion', 'and', 'the', 'corresponding', 'algorithm', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solution', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'type', 'of', 'system', 'and', 'system', 'of', 'mixed', 'type', '.'] 118 | 119 | 120 | ### POS tagging for Filtering 121 | 122 | The lemmatized text is POS tagged here. The tags will be used for filtering later on. 
123 | 124 | 125 | ```python 126 | POS_tag = nltk.pos_tag(lemmatized_text) 127 | 128 | print "Lemmatized text with POS tags: \n" 129 | print POS_tag 130 | ``` 131 | 132 | Lemmatized text with POS tags: 133 | 134 | [('compatibility', 'NN'), ('of', 'IN'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('constraint', 'NN'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('number', 'NN'), ('.', '.'), ('criterion', 'NN'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'JJ'), ('equation', 'NN'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bound', 'NN'), ('for', 'IN'), ('component', 'NN'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('and', 'CC'), ('algorithm', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('for', 'IN'), ('all', 'DT'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criterion', 'NN'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithm', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('and', 'CC'), ('system', 'NN'), ('of', 'IN'), ('mixed', 'JJ'), ('type', 'NN'), ('.', '.')] 135 | 136 | 137 | ## POS Based Filtering 138 | 139 | Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here 140 | considered as a stopword (non-content). This is based on the assumption that usually keywords are noun, 141 | adjectives or gerunds. 142 | 143 | Punctuations are added to the stopword list too. 144 | 145 | 146 | ```python 147 | stopwords = [] 148 | 149 | wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 150 | 151 | for word in POS_tag: 152 | if word[1] not in wanted_POS: 153 | stopwords.append(word[0]) 154 | 155 | punctuations = list(str(string.punctuation)) 156 | 157 | stopwords = stopwords + punctuations 158 | ``` 159 | 160 | ### Complete stopword generation 161 | 162 | Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may 163 | remain which are very bad candidates for being keywords (or part of it). 164 | 165 | An external file constituting a long list of stopwords is loaded and all the words are added with the previous 166 | stopwords to create the final list 'stopwords-plus' which is then converted into a set. 167 | 168 | (Source of stopwords data: https://www.ranks.nl/stopwords) 169 | 170 | Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. 171 | 172 | (The contents of this set will be later used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords, and work with a 'bag-of-words' approach. 
I will be developing the graph using unigrams as vertices.) 173 | 174 | ```python 175 | stopword_file = open("long_stopwords.txt", "r") 176 | #Source = https://www.ranks.nl/stopwords 177 | 178 | lots_of_stopwords = [] 179 | 180 | for line in stopword_file.readlines(): 181 | lots_of_stopwords.append(str(line.strip())) 182 | 183 | stopwords_plus = [] 184 | stopwords_plus = stopwords + lots_of_stopwords 185 | stopwords_plus = set(stopwords_plus) 186 | 187 | #stopwords_plus contains the total set of all stopwords 188 | ``` 189 | 190 | ### Removing Stopwords 191 | 192 | Removing stopwords from lemmatized_text. 193 | processed_text contains the result. 194 | 195 | 196 | ```python 197 | processed_text = [] 198 | for word in lemmatized_text: 199 | if word not in stopwords_plus: 200 | processed_text.append(word) 201 | print processed_text 202 | ``` 203 | 204 | ['compatibility', 'system', 'linear', 'constraint', 'set', 'natural', 'number', 'criterion', 'compatibility', 'system', 'linear', 'diophantine', 'equation', 'strict', 'inequations', 'nonstrict', 'inequations', 'upper', 'bound', 'component', 'minimal', 'set', 'solution', 'algorithm', 'construction', 'minimal', 'generating', 'set', 'solution', 'type', 'system', 'criterion', 'algorithm', 'constructing', 'minimal', 'supporting', 'set', 'solution', 'solving', 'type', 'system', 'system', 'mixed', 'type'] 205 | 206 | 207 | ## Vocabulary Creation 208 | 209 | Vocabulary will only contain unique words from processed_text. 210 | 211 | 212 | ```python 213 | vocabulary = list(set(processed_text)) 214 | print vocabulary 215 | ``` 216 | 217 | ['upper', 'set', 'constructing', 'number', 'solving', 'system', 'compatibility', 'strict', 'criterion', 'type', 'minimal', 'supporting', 'generating', 'linear', 'diophantine', 'component', 'bound', 'nonstrict', 'inequations', 'natural', 'algorithm', 'constraint', 'equation', 'solution', 'construction', 'mixed'] 218 | 219 | 220 | ### Building Graph 221 | 222 | TextRank is a graph-based model, and thus it requires us to build a graph. Each word in the vocabulary will serve as a vertex of the graph. Each word is represented in its vertex by its index in the vocabulary list. 223 | 224 | The weighted_edge matrix contains the information about edge connections among all vertices. 225 | I am building a graph with weighted undirected edges. 226 | 227 | weighted_edge[i][j] contains the weight of the connecting edge between the word vertex represented by vocabulary index i and the word vertex represented by vocabulary index j. 228 | 229 | If weighted_edge[i][j] is zero, it means no edge or connection is present between the words represented by index i and j. 230 | 231 | There is a connection between the words (and thus between i and j which represent them) if the words co-occur within a window of a specified 'window_size' in the processed_text. 232 | 233 | The value of weighted_edge[i][j] is increased by 1/(distance between the positions of the words currently represented by i and j) for every connection discovered between the same words in different locations of the text. 234 | 235 | The covered_coocurrences list (which contains the pairs of absolute positions in processed_text of the words whose co-occurrence at that location has already been checked) is maintained so that the same two words located in the same positions in processed_text are not counted repeatedly while sliding the window one text unit at a time. 236 | 237 | The scores of all vertices are initialized to one.
238 | 239 | Self-connections are not considered, so weighted_edge[i][i] will be zero. 240 | 241 | 242 | ```python 243 | import numpy as np 244 | import math 245 | vocab_len = len(vocabulary) 246 | 247 | weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32) 248 | 249 | score = np.zeros((vocab_len),dtype=np.float32) 250 | window_size = 3 251 | covered_coocurrences = [] 252 | 253 | for i in xrange(0,vocab_len): 254 | score[i]=1 255 | for j in xrange(0,vocab_len): 256 | if j==i: 257 | weighted_edge[i][j]=0 258 | else: 259 | for window_start in xrange(0,(len(processed_text)-window_size+1)): 260 | 261 | window_end = window_start+window_size 262 | 263 | window = processed_text[window_start:window_end] 264 | 265 | if (vocabulary[i] in window) and (vocabulary[j] in window): 266 | 267 | index_of_i = window_start + window.index(vocabulary[i]) 268 | index_of_j = window_start + window.index(vocabulary[j]) 269 | 270 | # index_of_x is the absolute position of the xth term in the window 271 | # (counting from 0) 272 | # in the processed_text 273 | 274 | if [index_of_i,index_of_j] not in covered_coocurrences: 275 | weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j) 276 | covered_coocurrences.append([index_of_i,index_of_j]) 277 | 278 | ``` 279 | 280 | ### Calculating weighted summation of connections of a vertex 281 | 282 | inout[i] will contain the sum of the weights of all undirected connections\edges associated with the vertex represented by i. 283 | 284 | 285 | ```python 286 | inout = np.zeros((vocab_len),dtype=np.float32) 287 | 288 | for i in xrange(0,vocab_len): 289 | for j in xrange(0,vocab_len): 290 | inout[i]+=weighted_edge[i][j] 291 | ``` 292 | 293 | ### Scoring Vertices 294 | 295 | The formula used for scoring a vertex represented by i is: 296 | 297 | score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ], where j belongs to the list of vertices that have a connection with i. 298 | 299 | d is the damping factor. 300 | 301 | The score is iteratively updated until convergence. 302 | 303 | 304 | ```python 305 | MAX_ITERATIONS = 50 306 | d=0.85 307 | threshold = 0.0001 #convergence threshold 308 | 309 | for iter in xrange(0,MAX_ITERATIONS): 310 | prev_score = np.copy(score) 311 | 312 | for i in xrange(0,vocab_len): 313 | 314 | summation = 0 315 | for j in xrange(0,vocab_len): 316 | if weighted_edge[i][j] != 0: 317 | summation += (weighted_edge[i][j]/inout[j])*score[j] 318 | 319 | score[i] = (1-d) + d*(summation) 320 | 321 | if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition 322 | print "Converging at iteration "+str(iter)+"...." 323 | break 324 | 325 | ``` 326 | 327 | Converging at iteration 29....
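The loop above applies the update one vertex at a time. For reference, the same update can be written in a vectorized form. The sketch below is not part of the original code; the helper name textrank_update is only for illustration, and it assumes the weighted_edge, inout, and score arrays and the damping factor d defined earlier, plus that every word has at least one connection (inout[j] > 0), which holds for this input text.

```python
import numpy as np

def textrank_update(score, weighted_edge, inout, d=0.85):
    # normalized[i][j] = weighted_edge[i][j] / inout[j]  (each column j divided by inout[j])
    normalized = weighted_edge / inout
    # score[i] = (1-d) + d * sum_j( normalized[i][j] * score[j] )
    return (1 - d) + d * normalized.dot(score)

# One pass of the iterative loop above is then equivalent to:
# score = textrank_update(score, weighted_edge, inout, d)
```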
328 | 329 | 330 | 331 | ```python 332 | for i in xrange(0,vocab_len): 333 | print "Score of "+vocabulary[i]+": "+str(score[i]) 334 | ``` 335 | 336 | Score of upper: 0.816792 337 | Score of set: 2.27184 338 | Score of constructing: 0.667288 339 | Score of number: 0.688316 340 | Score of solving: 0.642318 341 | Score of system: 2.12032 342 | Score of compatibility: 0.944584 343 | Score of strict: 0.823772 344 | Score of criterion: 1.22559 345 | Score of type: 1.08101 346 | Score of minimal: 1.78693 347 | Score of supporting: 0.653705 348 | Score of generating: 0.652645 349 | Score of linear: 1.2717 350 | Score of diophantine: 0.759295 351 | Score of component: 0.737641 352 | Score of bound: 0.786006 353 | Score of nonstrict: 0.827216 354 | Score of inequations: 1.30824 355 | Score of natural: 0.688299 356 | Score of algorithm: 1.19365 357 | Score of constraint: 0.674411 358 | Score of equation: 0.799815 359 | Score of solution: 1.6832 360 | Score of construction: 0.659809 361 | Score of mixed: 0.235822 362 | 363 | 364 | ### Phrase Partitioning 365 | 366 | Partitioning lemmatized_text into phrases using the stopwords in it as delimiters. 367 | The phrases are also candidates for keyphrases to be extracted. 368 | 369 | 370 | ```python 371 | phrases = [] 372 | 373 | phrase = " " 374 | for word in lemmatized_text: 375 | 376 | if word in stopwords_plus: 377 | if phrase!= " ": 378 | phrases.append(str(phrase).strip().split()) 379 | phrase = " " 380 | elif word not in stopwords_plus: 381 | phrase+=str(word) 382 | phrase+=" " 383 | 384 | print "Partitioned Phrases (Candidate Keyphrases): \n" 385 | print phrases 386 | ``` 387 | 388 | Partitioned Phrases (Candidate Keyphrases): 389 | 390 | [['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['compatibility'], ['system'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['solution'], ['type'], ['system'], ['criterion'], ['algorithm'], ['constructing'], ['minimal', 'supporting', 'set'], ['solution'], ['solving'], ['type'], ['system'], ['system'], ['mixed', 'type']] 391 | 392 | 393 | ### Create a list of unique phrases. 394 | 395 | Repeated phrases\keyphrase candidates no longer serve any purpose here. 396 | 397 | 398 | ```python 399 | unique_phrases = [] 400 | 401 | for phrase in phrases: 402 | if phrase not in unique_phrases: 403 | unique_phrases.append(phrase) 404 | 405 | print "Unique Phrases (Candidate Keyphrases): \n" 406 | print unique_phrases 407 | ``` 408 | 409 | Unique Phrases (Candidate Keyphrases): 410 | 411 | [['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['type'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']] 412 | 413 | 414 | ### Thinning the list of candidate-keyphrases. 415 | 416 | Removing single-word keyphrase candidates that are present in multi-word alternatives.
417 | 418 | 419 | ```python 420 | for word in vocabulary: 421 | #print word 422 | for phrase in unique_phrases: 423 | if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1): 424 | #if len(phrase)>1 then the current phrase is multi-worded. 425 | #if the word in vocabulary is present in unique_phrases as a single-word-phrase 426 | # and at the same time present as a word within a multi-worded phrase, 427 | # then I will remove the single-word-phrase from the list. 428 | unique_phrases.remove([word]) 429 | 430 | print "Thinned Unique Phrases (Candidate Keyphrases): \n" 431 | print unique_phrases 432 | ``` 433 | 434 | Thinned Unique Phrases (Candidate Keyphrases): 435 | 436 | [['compatibility'], ['system'], ['linear', 'constraint'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']] 437 | 438 | 439 | ### Scoring Keyphrases 440 | 441 | Scoring the phrases (candidate keyphrases) and building up a list of keyphrases 442 | by listing untokenized versions of tokenized phrases\candidate-keyphrases. 443 | Phrases are scored by adding the score of their members (words\text-units that were ranked by the graph algorithm) 444 | 445 | 446 | 447 | ```python 448 | phrase_scores = [] 449 | keywords = [] 450 | for phrase in unique_phrases: 451 | phrase_score=0 452 | keyword = '' 453 | for word in phrase: 454 | keyword += str(word) 455 | keyword += " " 456 | phrase_score+=score[vocabulary.index(word)] 457 | phrase_scores.append(phrase_score) 458 | keywords.append(keyword.strip()) 459 | 460 | i=0 461 | for keyword in keywords: 462 | print "Keyword: '"+str(keyword)+"', Score: "+str(phrase_scores[i]) 463 | i+=1 464 | ``` 465 | 466 | Keyword: 'compatibility', Score: 0.944583714008 467 | Keyword: 'system', Score: 2.12031626701 468 | Keyword: 'linear constraint', Score: 1.94610738754 469 | Keyword: 'natural number', Score: 1.37661552429 470 | Keyword: 'criterion', Score: 1.2255872488 471 | Keyword: 'linear diophantine equation', Score: 2.83080631495 472 | Keyword: 'strict inequations', Score: 2.13201224804 473 | Keyword: 'nonstrict inequations', Score: 2.135455966 474 | Keyword: 'upper bound', Score: 1.60279768705 475 | Keyword: 'component', Score: 0.737640619278 476 | Keyword: 'minimal set', Score: 4.05876886845 477 | Keyword: 'solution', Score: 1.68319940567 478 | Keyword: 'algorithm', Score: 1.19365406036 479 | Keyword: 'construction', Score: 0.659808635712 480 | Keyword: 'minimal generating set', Score: 4.71141409874 481 | Keyword: 'constructing', Score: 0.66728836298 482 | Keyword: 'minimal supporting set', Score: 4.71247345209 483 | Keyword: 'solving', Score: 0.642318367958 484 | Keyword: 'mixed type', Score: 1.31682945788 485 | 486 | 487 | ### Ranking Keyphrases 488 | 489 | Ranking keyphrases based on their calculated scores. Displaying top 'keywords_num' no. of keyphrases. 
490 | 491 | 492 | ```python 493 | sorted_index = np.flip(np.argsort(phrase_scores),0) 494 | 495 | keywords_num = 10 496 | 497 | print "Keywords:\n" 498 | 499 | for i in xrange(0,keywords_num): 500 | print str(keywords[sorted_index[i]])+", ", 501 | ``` 502 | 503 | Keywords: 504 | 505 | minimal supporting set, minimal generating set, minimal set, linear diophantine equation, nonstrict inequations, strict inequations, system, linear constraint, solution, upper bound, 506 | 507 | 508 | # Input: 509 | 510 | Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types. 511 | 512 | # Extracted Keywords: 513 | 514 | * minimal supporting set, 515 | * minimal generating set, 516 | * minimal set, 517 | * linear diophantine equation, 518 | * nonstrict inequations, 519 | * strict inequations, 520 | * system, 521 | * linear constraint, 522 | * solution, 523 | * upper bound, 524 | -------------------------------------------------------------------------------- /TextRank.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Implementation of TextRank\n", 8 | "(Based on: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "The input text is given below" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "#Source of text:\n", 25 | "#https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents\n", 26 | "\n", 27 | "Text = \"Compatibility of systems of linear constraints over the set of natural numbers. \\\n", 28 | "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and \\\n", 29 | "nonstrict inequations are considered. \\\n", 30 | "Upper bounds for components of a minimal set of solutions and \\\n", 31 | "algorithms of construction of minimal generating sets of solutions for all \\\n", 32 | "types of systems are given. \\\n", 33 | "These criteria and the corresponding algorithms for constructing \\\n", 34 | "a minimal supporting set of solutions can be used in solving all the \\\n", 35 | "considered types of systems and systems of mixed types.\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Cleaning Text Data\n", 43 | "\n", 44 | "The raw input text is cleaned off non-printable characters (if any) and turned into lower case.\n", 45 | "The processed input text is then tokenized using NLTK library functions. 
" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 20, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Tokenized Text: \n", 58 | "\n", 59 | "['compatibility', 'of', 'systems', 'of', 'linear', 'constraints', 'over', 'the', 'set', 'of', 'natural', 'numbers', '.', 'criteria', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equations', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bounds', 'for', 'components', 'of', 'a', 'minimal', 'set', 'of', 'solutions', 'and', 'algorithms', 'of', 'construction', 'of', 'minimal', 'generating', 'sets', 'of', 'solutions', 'for', 'all', 'types', 'of', 'systems', 'are', 'given', '.', 'these', 'criteria', 'and', 'the', 'corresponding', 'algorithms', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solutions', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'types', 'of', 'systems', 'and', 'systems', 'of', 'mixed', 'types', '.']\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "\n", 65 | "import nltk\n", 66 | "from nltk import word_tokenize\n", 67 | "import string\n", 68 | "\n", 69 | "#nltk.download('punkt')\n", 70 | "\n", 71 | "def clean(text):\n", 72 | " text = text.lower()\n", 73 | " printable = set(string.printable)\n", 74 | " text = filter(lambda x: x in printable, text)\n", 75 | " text = \"\".join(list(text))\n", 76 | " return text\n", 77 | "\n", 78 | "Cleaned_text = clean(Text)\n", 79 | "# print(Cleaned_text)\n", 80 | "text = word_tokenize(Cleaned_text)\n", 81 | "\n", 82 | "print (\"Tokenized Text: \\n\")\n", 83 | "print (text)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### POS Tagging For Lemmatization\n", 91 | "\n", 92 | "NLTK is again used for POS tagging the input text so that the words can be lemmatized based on their POS tags.\n", 93 | "\n", 94 | "Description of POS tags: \n", 95 | "\n", 96 | "\n", 97 | "http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 22, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Tokenized Text with POS tags: \n", 110 | "\n", 111 | "[('compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ('of', 'IN'), ('linear', 'JJ'), ('constraints', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('.', '.'), ('criteria', 'NNS'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'NN'), ('equations', 'NNS'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bounds', 'NNS'), ('for', 'IN'), ('components', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('and', 'CC'), ('algorithms', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('sets', 'NNS'), ('of', 'IN'), ('solutions', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criteria', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), 
('algorithms', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('and', 'CC'), ('systems', 'NNS'), ('of', 'IN'), ('mixed', 'JJ'), ('types', 'NNS'), ('.', '.')]\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "#nltk.download('averaged_perceptron_tagger')\n", 117 | " \n", 118 | "POS_tag = nltk.pos_tag(text)\n", 119 | "\n", 120 | "print (\"Tokenized Text with POS tags: \\n\")\n", 121 | "print (POS_tag)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Lemmatization\n", 129 | "\n", 130 | "The tokenized text (mainly the nouns and adjectives) is normalized by lemmatization.\n", 131 | "In lemmatization different grammatical counterparts of a word will be replaced by single\n", 132 | "basic lemma. For example, 'glasses' may be replaced by 'glass'. \n", 133 | "\n", 134 | "Details about lemmatization: \n", 135 | " \n", 136 | "https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 23, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Text tokens after lemmatization of adjectives and nouns: \n", 149 | "\n", 150 | "['compatibility', 'of', 'system', 'of', 'linear', 'constraint', 'over', 'the', 'set', 'of', 'natural', 'number', '.', 'criterion', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equation', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bound', 'for', 'component', 'of', 'a', 'minimal', 'set', 'of', 'solution', 'and', 'algorithm', 'of', 'construction', 'of', 'minimal', 'generating', 'set', 'of', 'solution', 'for', 'all', 'type', 'of', 'system', 'are', 'given', '.', 'these', 'criterion', 'and', 'the', 'corresponding', 'algorithm', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solution', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'type', 'of', 'system', 'and', 'system', 'of', 'mixed', 'type', '.']\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "#nltk.download('wordnet')\n", 156 | "\n", 157 | "from nltk.stem import WordNetLemmatizer\n", 158 | "\n", 159 | "wordnet_lemmatizer = WordNetLemmatizer()\n", 160 | "\n", 161 | "adjective_tags = ['JJ','JJR','JJS']\n", 162 | "\n", 163 | "lemmatized_text = []\n", 164 | "\n", 165 | "for word in POS_tag:\n", 166 | " if word[1] in adjective_tags:\n", 167 | " lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos=\"a\")))\n", 168 | " else:\n", 169 | " lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun\n", 170 | " \n", 171 | "print (\"Text tokens after lemmatization of adjectives and nouns: \\n\")\n", 172 | "print (lemmatized_text)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### POS tagging for Filtering\n", 180 | "\n", 181 | "The lemmatized text is POS tagged here. The tags will be used for filtering later on." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 24, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "Lemmatized text with POS tags: \n", 194 | "\n", 195 | "[('compatibility', 'NN'), ('of', 'IN'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('constraint', 'NN'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('number', 'NN'), ('.', '.'), ('criterion', 'NN'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'JJ'), ('equation', 'NN'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bound', 'NN'), ('for', 'IN'), ('component', 'NN'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('and', 'CC'), ('algorithm', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('for', 'IN'), ('all', 'DT'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criterion', 'NN'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithm', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('and', 'CC'), ('system', 'NN'), ('of', 'IN'), ('mixed', 'JJ'), ('type', 'NN'), ('.', '.')]\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "POS_tag = nltk.pos_tag(lemmatized_text)\n", 201 | "\n", 202 | "print (\"Lemmatized text with POS tags: \\n\")\n", 203 | "print (POS_tag)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## POS Based Filtering\n", 211 | "\n", 212 | "Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here\n", 213 | "considered as a stopword (non-content). This is based on the assumption that usually keywords are noun,\n", 214 | "adjectives or gerunds. \n", 215 | "\n", 216 | "Punctuations are added to the stopword list too." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 25, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "stopwords = []\n", 226 | "\n", 227 | "wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] \n", 228 | "\n", 229 | "for word in POS_tag:\n", 230 | " if word[1] not in wanted_POS:\n", 231 | " stopwords.append(word[0])\n", 232 | "\n", 233 | "punctuations = list(str(string.punctuation))\n", 234 | "\n", 235 | "stopwords = stopwords + punctuations" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Complete stopword generation\n", 243 | "\n", 244 | "Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may\n", 245 | "remain which are very bad candidates for being keywords (or part of it). 
\n", 246 | "\n", 247 | "An external file constituting a long list of stopwords is loaded and all the words are added with the previous\n", 248 | "stopwords to create the final list 'stopwords-plus' which is then converted into a set. \n", 249 | "\n", 250 | "(Source of stopwords data: https://www.ranks.nl/stopwords)\n", 251 | "\n", 252 | "Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. \n", 253 | "\n", 254 | "(The contents of this set will be later used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords, and work with a 'bag-of-words' approach. I will be developing the graph using unigram texts as vertices)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 27, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "stopword_file = open(\"long_stopwords.txt\", \"r\")\n", 264 | "#Source = https://www.ranks.nl/stopwords\n", 265 | "\n", 266 | "lots_of_stopwords = []\n", 267 | "\n", 268 | "for line in stopword_file.readlines():\n", 269 | " lots_of_stopwords.append(str(line.strip()))\n", 270 | "\n", 271 | "stopwords_plus = []\n", 272 | "stopwords_plus = stopwords + lots_of_stopwords\n", 273 | "stopwords_plus = set(stopwords_plus)\n", 274 | "\n", 275 | "#Stopwords_plus contain total set of all stopwords" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Removing Stopwords \n", 283 | "\n", 284 | "Removing stopwords from lemmatized_text. \n", 285 | "Processeced_text condtains the result." 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 29, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "['compatibility', 'system', 'linear', 'constraint', 'set', 'natural', 'number', 'criterion', 'compatibility', 'system', 'linear', 'diophantine', 'equation', 'strict', 'inequations', 'nonstrict', 'inequations', 'upper', 'bound', 'component', 'minimal', 'set', 'solution', 'algorithm', 'construction', 'minimal', 'generating', 'set', 'solution', 'type', 'system', 'criterion', 'algorithm', 'constructing', 'minimal', 'supporting', 'set', 'solution', 'solving', 'type', 'system', 'system', 'mixed', 'type']\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "processed_text = []\n", 303 | "for word in lemmatized_text:\n", 304 | " if word not in stopwords_plus:\n", 305 | " processed_text.append(word)\n", 306 | "print (processed_text)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "## Vocabulary Creation\n", 314 | "\n", 315 | "Vocabulary will only contain unique words from processed_text." 
316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 31, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "['solving', 'equation', 'generating', 'diophantine', 'construction', 'set', 'mixed', 'minimal', 'compatibility', 'component', 'system', 'natural', 'inequations', 'constraint', 'criterion', 'type', 'upper', 'solution', 'linear', 'algorithm', 'strict', 'bound', 'nonstrict', 'number', 'supporting', 'constructing']\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "vocabulary = list(set(processed_text))\n", 333 | "print (vocabulary)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Building Graph\n", 341 | "\n", 342 | "TextRank is a graph based model, and thus it requires us to build a graph. Each words in the vocabulary will serve as a vertex for graph. The words will be represented in the vertices by their index in vocabulary list. \n", 343 | "\n", 344 | "The weighted_edge matrix contains the information of edge connections among all vertices.\n", 345 | "I am building wieghted undirected edges.\n", 346 | "\n", 347 | "weighted_edge[i][j] contains the weight of the connecting edge between the word vertex represented by vocabulary index i and the word vertex represented by vocabulary j.\n", 348 | "\n", 349 | "If weighted_edge[i][j] is zero, it means no edge connection is present between the words represented by index i and j.\n", 350 | "\n", 351 | "There is a connection between the words (and thus between i and j which represents them) if the words co-occur within a window of a specified 'window_size' in the processed_text.\n", 352 | "\n", 353 | "The value of the weighted_edge[i][j] is increased by (1/(distance between positions of words currently represented by i and j)) for every connection discovered between the same words in different locations of the text. \n", 354 | "\n", 355 | "The covered_coocurrences list (which is contain the list of pairs of absolute positions in processed_text of the words whose coocurrence at that location is already checked) is managed so that the same two words located in the same positions in processed_text are not repetitively counted while sliding the window one text unit at a time.\n", 356 | "\n", 357 | "The score of all vertices are intialized to one. \n", 358 | "\n", 359 | "Self-connections are not considered, so weighted_edge[i][i] will be zero." 
360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 33, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "import numpy as np\n", 369 | "import math\n", 370 | "vocab_len = len(vocabulary)\n", 371 | "\n", 372 | "weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)\n", 373 | "\n", 374 | "score = np.zeros((vocab_len),dtype=np.float32)\n", 375 | "window_size = 3\n", 376 | "covered_coocurrences = []\n", 377 | "\n", 378 | "for i in range(0,vocab_len):\n", 379 | " score[i]=1\n", 380 | " for j in range(0,vocab_len):\n", 381 | " if j==i:\n", 382 | " weighted_edge[i][j]=0\n", 383 | " else:\n", 384 | " for window_start in range(0,(len(processed_text)-window_size)):\n", 385 | " \n", 386 | " window_end = window_start+window_size\n", 387 | " \n", 388 | " window = processed_text[window_start:window_end]\n", 389 | " \n", 390 | " if (vocabulary[i] in window) and (vocabulary[j] in window):\n", 391 | " \n", 392 | " index_of_i = window_start + window.index(vocabulary[i])\n", 393 | " index_of_j = window_start + window.index(vocabulary[j])\n", 394 | " \n", 395 | " # index_of_x is the absolute position of the xth term in the window \n", 396 | " # (counting from 0) \n", 397 | " # in the processed_text\n", 398 | " \n", 399 | " if [index_of_i,index_of_j] not in covered_coocurrences:\n", 400 | " weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)\n", 401 | " covered_coocurrences.append([index_of_i,index_of_j])\n" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "### Calculating weighted summation of connections of a vertex\n", 409 | "\n", 410 | "inout[i] will contain the sum of the weights of all the undirected connections\edges associated with the vertex represented by i." 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 34, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "inout = np.zeros((vocab_len),dtype=np.float32)\n", 420 | "\n", 421 | "for i in range(0,vocab_len):\n", 422 | " for j in range(0,vocab_len):\n", 423 | " inout[i]+=weighted_edge[i][j]" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "### Scoring Vertices\n", 431 | "\n", 432 | "The formula used for scoring a vertex represented by i is:\n", 433 | "\n", 434 | "score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ] where j belongs to the list of vertices that have a connection with i. \n", 435 | "\n", 436 | "d is the damping factor.\n", 437 | "\n", 438 | "The score is iteratively updated until convergence. 
" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 35, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "Converging at iteration 23....\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "MAX_ITERATIONS = 50\n", 456 | "d=0.85\n", 457 | "threshold = 0.0001 #convergence threshold\n", 458 | "\n", 459 | "for iter in range(0,MAX_ITERATIONS):\n", 460 | " prev_score = np.copy(score)\n", 461 | " \n", 462 | " for i in range(0,vocab_len):\n", 463 | " \n", 464 | " summation = 0\n", 465 | " for j in range(0,vocab_len):\n", 466 | " if weighted_edge[i][j] != 0:\n", 467 | " summation += (weighted_edge[i][j]/inout[j])*score[j]\n", 468 | " \n", 469 | " score[i] = (1-d) + d*(summation)\n", 470 | " \n", 471 | " if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition\n", 472 | " print(\"Converging at iteration \"+str(iter)+\"....\")\n", 473 | " break\n" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 36, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Score of solving: 0.64231944\n", 486 | "Score of equation: 0.79981786\n", 487 | "Score of generating: 0.65264744\n", 488 | "Score of diophantine: 0.759297\n", 489 | "Score of construction: 0.6598107\n", 490 | "Score of set: 2.2718465\n", 491 | "Score of mixed: 0.2358227\n", 492 | "Score of minimal: 1.7869267\n", 493 | "Score of compatibility: 0.9445859\n", 494 | "Score of component: 0.73764145\n", 495 | "Score of system: 2.1203177\n", 496 | "Score of natural: 0.6883006\n", 497 | "Score of inequations: 1.308244\n", 498 | "Score of constraint: 0.67441183\n", 499 | "Score of criterion: 1.2255884\n", 500 | "Score of type: 1.0810083\n", 501 | "Score of upper: 0.8167923\n", 502 | "Score of solution: 1.683202\n", 503 | "Score of linear: 1.2716976\n", 504 | "Score of algorithm: 1.1936545\n", 505 | "Score of strict: 0.8237729\n", 506 | "Score of bound: 0.78600633\n", 507 | "Score of nonstrict: 0.8272164\n", 508 | "Score of number: 0.6883157\n", 509 | "Score of supporting: 0.6537049\n", 510 | "Score of constructing: 0.66728705\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "for i in range(0,vocab_len):\n", 516 | " print(\"Score of \"+vocabulary[i]+\": \"+str(score[i]))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "### Phrase Partitioning\n", 524 | "\n", 525 | "Partitioning lemmatized_text into phrases using the stopwords in it as delimiters.\n", 526 | "The phrases are also candidates for keyphrases to be extracted. 
" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 37, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "Partitioned Phrases (Candidate Keyphrases): \n", 539 | "\n", 540 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['compatibility'], ['system'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['solution'], ['type'], ['system'], ['criterion'], ['algorithm'], ['constructing'], ['minimal', 'supporting', 'set'], ['solution'], ['solving'], ['type'], ['system'], ['system'], ['mixed', 'type']]\n" 541 | ] 542 | } 543 | ], 544 | "source": [ 545 | "phrases = []\n", 546 | "\n", 547 | "phrase = \" \"\n", 548 | "for word in lemmatized_text:\n", 549 | " \n", 550 | " if word in stopwords_plus:\n", 551 | " if phrase!= \" \":\n", 552 | " phrases.append(str(phrase).strip().split())\n", 553 | " phrase = \" \"\n", 554 | " elif word not in stopwords_plus:\n", 555 | " phrase+=str(word)\n", 556 | " phrase+=\" \"\n", 557 | "\n", 558 | "print(\"Partitioned Phrases (Candidate Keyphrases): \\n\")\n", 559 | "print(phrases)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "### Create a list of unique phrases.\n", 567 | "\n", 568 | "Repeating phrases\keyphrase candidates have no purpose here anymore. " 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 38, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "Unique Phrases (Candidate Keyphrases): \n", 581 | "\n", 582 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['type'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "unique_phrases = []\n", 588 | "\n", 589 | "for phrase in phrases:\n", 590 | " if phrase not in unique_phrases:\n", 591 | " unique_phrases.append(phrase)\n", 592 | "\n", 593 | "print(\"Unique Phrases (Candidate Keyphrases): \\n\")\n", 594 | "print(unique_phrases)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "### Thinning the list of candidate-keyphrases.\n", 602 | "\n", 603 | "Removing single-word keyphrase candidates that are present in multi-word alternatives. 
" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 39, 609 | "metadata": {}, 610 | "outputs": [ 611 | { 612 | "name": "stdout", 613 | "output_type": "stream", 614 | "text": [ 615 | "Thinned Unique Phrases (Candidate Keyphrases): \n", 616 | "\n", 617 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "for word in vocabulary:\n", 623 | " #print word\n", 624 | " for phrase in unique_phrases:\n", 625 | " if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):\n", 626 | " #if len(phrase)>1 then the current phrase is multi-worded.\n", 627 | " #if the word in vocabulary is present in unique_phrases as a single-word-phrase\n", 628 | " # and at the same time present as a word within a multi-worded phrase,\n", 629 | " # then I will remove the single-word-phrase from the list.\n", 630 | " unique_phrases.remove([word])\n", 631 | " \n", 632 | "print(\"Thinned Unique Phrases (Candidate Keyphrases): \\n\")\n", 633 | "print(unique_phrases) " 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "### Scoring Keyphrases\n", 641 | "\n", 642 | "Scoring the phrases (candidate keyphrases) and building up a list of keyphrases\\keywords\n", 643 | "by listing untokenized versions of tokenized phrases\\candidate-keyphrases.\n", 644 | "Phrases are scored by adding the score of their members (words\\text-units that were ranked by the graph algorithm)\n" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 40, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "name": "stdout", 654 | "output_type": "stream", 655 | "text": [ 656 | "Keyword: 'compatibility', Score: 0.944585919380188\n", 657 | "Keyword: 'system', Score: 2.1203176975250244\n", 658 | "Keyword: 'linear constraint', Score: 1.9461094737052917\n", 659 | "Keyword: 'natural number', Score: 1.3766162991523743\n", 660 | "Keyword: 'criterion', Score: 1.2255884408950806\n", 661 | "Keyword: 'linear diophantine equation', Score: 2.8308125138282776\n", 662 | "Keyword: 'strict inequations', Score: 2.132016897201538\n", 663 | "Keyword: 'nonstrict inequations', Score: 2.135460376739502\n", 664 | "Keyword: 'upper bound', Score: 1.6027986407279968\n", 665 | "Keyword: 'component', Score: 0.737641453742981\n", 666 | "Keyword: 'minimal set', Score: 4.0587732791900635\n", 667 | "Keyword: 'solution', Score: 1.6832020282745361\n", 668 | "Keyword: 'algorithm', Score: 1.1936545372009277\n", 669 | "Keyword: 'construction', Score: 0.6598107218742371\n", 670 | "Keyword: 'minimal generating set', Score: 4.711420714855194\n", 671 | "Keyword: 'constructing', Score: 0.6672870516777039\n", 672 | "Keyword: 'minimal supporting set', Score: 4.712478160858154\n", 673 | "Keyword: 'solving', Score: 0.6423194408416748\n", 674 | "Keyword: 'mixed type', Score: 1.3168310225009918\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "phrase_scores = []\n", 680 | "keywords = []\n", 681 | "for phrase in unique_phrases:\n", 682 | " phrase_score=0\n", 683 | " keyword = ''\n", 684 | " for word in phrase:\n", 685 | " keyword += str(word)\n", 686 | " keyword += \" \"\n", 687 | " 
phrase_score+=score[vocabulary.index(word)]\n", 688 | " phrase_scores.append(phrase_score)\n", 689 | " keywords.append(keyword.strip())\n", 690 | "\n", 691 | "i=0\n", 692 | "for keyword in keywords:\n", 693 | " print (\"Keyword: '\"+str(keyword)+\"', Score: \"+str(phrase_scores[i]))\n", 694 | " i+=1" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "### Ranking Keyphrases\n", 702 | "\n", 703 | "Ranking keyphrases based on their calculated scores. Displaying top keywords_num no. of keyphrases." 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 43, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "name": "stdout", 713 | "output_type": "stream", 714 | "text": [ 715 | "Keywords:\n", 716 | "\n", 717 | "minimal supporting set, minimal generating set, minimal set, linear diophantine equation, nonstrict inequations, strict inequations, system, linear constraint, solution, upper bound, " 718 | ] 719 | } 720 | ], 721 | "source": [ 722 | "sorted_index = np.flip(np.argsort(phrase_scores),0)\n", 723 | "\n", 724 | "keywords_num = 10\n", 725 | "\n", 726 | "print(\"Keywords:\\n\")\n", 727 | "\n", 728 | "for i in range(0,keywords_num):\n", 729 | " print(str(keywords[sorted_index[i]])+\", \", end=' ')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "# Input:\n", 737 | "\n", 738 | "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.\n", 739 | "\n", 740 | "# Extracted Keywords:\n", 741 | "\n", 742 | "* minimal supporting set, \n", 743 | "* minimal generating set, \n", 744 | "* minimal set, \n", 745 | "* linear diophantine equation, \n", 746 | "* nonstrict inequations, \n", 747 | "* strict inequations, \n", 748 | "* system, \n", 749 | "* linear constraint, \n", 750 | "* solution, \n", 751 | "* upper bound, \n" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [] 767 | } 768 | ], 769 | "metadata": { 770 | "kernelspec": { 771 | "display_name": "Python 3", 772 | "language": "python", 773 | "name": "python3" 774 | }, 775 | "language_info": { 776 | "codemirror_mode": { 777 | "name": "ipython", 778 | "version": 3 779 | }, 780 | "file_extension": ".py", 781 | "mimetype": "text/x-python", 782 | "name": "python", 783 | "nbconvert_exporter": "python", 784 | "pygments_lexer": "ipython3", 785 | "version": "3.7.4" 786 | } 787 | }, 788 | "nbformat": 4, 789 | "nbformat_minor": 2 790 | } 791 | --------------------------------------------------------------------------------
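A condensed sketch of the pipeline in TextRank.ipynb. The function below restates the notebook's steps (window-based co-occurrence graph, PageRank-style vertex scoring, stopword-delimited phrase partitioning, phrase scoring and ranking) in one self-contained place. It is illustrative rather than the notebook's exact code: the name `textrank_keyphrases` and its parameters are made up for this sketch, the co-occurrence bookkeeping is a close variant of the `covered_coocurrences` logic above, and the single-word-phrase thinning step is omitted. It assumes `processed_text`, `lemmatized_text`, `vocabulary`, and `stopwords_plus` come from the notebook's earlier preprocessing, with every non-stopword token of `lemmatized_text` present in `vocabulary`.

```python
import numpy as np


def textrank_keyphrases(processed_text, lemmatized_text, vocabulary,
                        stopwords_plus, window_size=3, d=0.85,
                        max_iterations=50, threshold=1e-4, top_n=10):
    # Illustrative sketch only; mirrors the notebook's cells, not a drop-in replacement.
    n = len(vocabulary)
    index = {w: i for i, w in enumerate(vocabulary)}

    # Build a symmetric co-occurrence graph: words appearing in the same
    # sliding window are connected, weighted by 1 / positional distance.
    # Each pair of absolute positions is counted once (variant of covered_coocurrences).
    weighted_edge = np.zeros((n, n), dtype=np.float32)
    counted = set()
    for start in range(max(len(processed_text) - window_size + 1, 1)):
        window = processed_text[start:start + window_size]
        for a in range(len(window)):
            for b in range(a + 1, len(window)):
                if window[a] == window[b]:
                    continue
                pair = (start + a, start + b)
                if pair in counted:
                    continue
                counted.add(pair)
                i, j = index[window[a]], index[window[b]]
                weighted_edge[i][j] += 1.0 / (b - a)
                weighted_edge[j][i] += 1.0 / (b - a)

    # inout[i] = total weight of edges touching vertex i.
    inout = weighted_edge.sum(axis=1)

    # PageRank-style updates until the scores stop changing.
    score = np.ones(n, dtype=np.float32)
    for _ in range(max_iterations):
        prev = score.copy()
        for i in range(n):
            summation = sum((weighted_edge[i][j] / inout[j]) * score[j]
                            for j in range(n) if weighted_edge[i][j] != 0)
            score[i] = (1 - d) + d * summation
        if np.sum(np.fabs(prev - score)) <= threshold:
            break

    # Partition the lemmatized text into candidate phrases at stopwords.
    phrases, current = [], []
    for word in lemmatized_text:
        if word in stopwords_plus:
            if current:
                phrases.append(current)
                current = []
        else:
            current.append(word)
    if current:
        phrases.append(current)

    # Score each unique phrase by summing the scores of its words, then rank.
    seen, ranked = set(), []
    for phrase in phrases:
        key = " ".join(phrase)
        if key not in seen:
            seen.add(key)
            ranked.append((key, float(sum(score[index[w]] for w in phrase))))
    ranked.sort(key=lambda kv: kv[1], reverse=True)
    return ranked[:top_n]
```

Called as `textrank_keyphrases(processed_text, lemmatized_text, vocabulary, stopwords_plus)`, it should produce roughly the same top keyphrases as the ranked list printed in the notebook, with small differences possible where the co-occurrence counting and the omitted thinning step diverge from the original cells.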