├── LICENSE
├── .gitignore
├── long_stopwords.txt
├── README.md
└── TextRank.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Jishnu Ray Chowdhury
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/long_stopwords.txt:
--------------------------------------------------------------------------------
1 | a
2 | able
3 | about
4 | above
5 | abst
6 | accordance
7 | according
8 | accordingly
9 | across
10 | act
11 | actually
12 | added
13 | adj
14 | affected
15 | affecting
16 | affects
17 | after
18 | afterwards
19 | again
20 | against
21 | ah
22 | all
23 | almost
24 | alone
25 | along
26 | already
27 | also
28 | although
29 | always
30 | am
31 | among
32 | amongst
33 | an
34 | and
35 | announce
36 | another
37 | any
38 | anybody
39 | anyhow
40 | anymore
41 | anyone
42 | anything
43 | anyway
44 | anyways
45 | anywhere
46 | apparently
47 | approximately
48 | are
49 | aren
50 | arent
51 | arise
52 | around
53 | as
54 | aside
55 | ask
56 | asking
57 | at
58 | auth
59 | available
60 | away
61 | awfully
62 | b
63 | back
64 | be
65 | became
66 | because
67 | become
68 | becomes
69 | becoming
70 | been
71 | before
72 | beforehand
73 | begin
74 | beginning
75 | beginnings
76 | begins
77 | behind
78 | being
79 | believe
80 | below
81 | beside
82 | besides
83 | between
84 | beyond
85 | biol
86 | both
87 | brief
88 | briefly
89 | but
90 | by
91 | c
92 | ca
93 | came
94 | can
95 | cannot
96 | can't
97 | cause
98 | causes
99 | certain
100 | certainly
101 | co
102 | com
103 | come
104 | comes
105 | contain
106 | containing
107 | corresponding
108 | contains
109 | could
110 | couldnt
111 | d
112 | date
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | done
121 | don't
122 | down
123 | downwards
124 | due
125 | during
126 | e
127 | each
128 | ed
129 | edu
130 | effect
131 | eg
132 | eight
133 | eighty
134 | either
135 | else
136 | elsewhere
137 | end
138 | ending
139 | enough
140 | especially
141 | et
142 | et-al
143 | etc
144 | even
145 | ever
146 | every
147 | everybody
148 | everyone
149 | everything
150 | everywhere
151 | ex
152 | except
153 | f
154 | far
155 | few
156 | ff
157 | fifth
158 | first
159 | five
160 | fix
161 | followed
162 | following
163 | follows
164 | for
165 | former
166 | formerly
167 | forth
168 | found
169 | four
170 | from
171 | further
172 | furthermore
173 | g
174 | gave
175 | get
176 | gets
177 | getting
178 | give
179 | given
180 | gives
181 | giving
182 | go
183 | goes
184 | gone
185 | got
186 | gotten
187 | h
188 | had
189 | happens
190 | hardly
191 | has
192 | hasn't
193 | have
194 | haven't
195 | having
196 | he
197 | hed
198 | hence
199 | her
200 | here
201 | hereafter
202 | hereby
203 | herein
204 | heres
205 | hereupon
206 | hers
207 | herself
208 | hes
209 | hi
210 | hid
211 | him
212 | himself
213 | his
214 | hither
215 | home
216 | how
217 | howbeit
218 | however
219 | hundred
220 | i
221 | id
222 | ie
223 | if
224 | i'll
225 | im
226 | immediate
227 | immediately
228 | importance
229 | important
230 | in
231 | inc
232 | indeed
233 | index
234 | information
235 | instead
236 | into
237 | invention
238 | inward
239 | is
240 | isn't
241 | it
242 | itd
243 | it'll
244 | its
245 | itself
246 | i've
247 | j
248 | just
249 | k
250 | keep
251 | keeps
252 | kept
253 | kg
254 | km
255 | know
256 | known
257 | knows
258 | l
259 | largely
260 | last
261 | lately
262 | later
263 | latter
264 | latterly
265 | least
266 | less
267 | lest
268 | let
269 | lets
270 | like
271 | liked
272 | likely
273 | line
274 | little
275 | 'll
276 | look
277 | looking
278 | looks
279 | ltd
280 | m
281 | made
282 | mainly
283 | make
284 | makes
285 | many
286 | may
287 | maybe
288 | me
289 | mean
290 | means
291 | meantime
292 | meanwhile
293 | merely
294 | mg
295 | might
296 | million
297 | miss
298 | ml
299 | more
300 | moreover
301 | most
302 | mostly
303 | mr
304 | mrs
305 | much
306 | mug
307 | must
308 | my
309 | myself
310 | n
311 | na
312 | name
313 | namely
314 | nay
315 | nd
316 | near
317 | nearly
318 | necessarily
319 | necessary
320 | need
321 | needs
322 | neither
323 | never
324 | nevertheless
325 | new
326 | next
327 | nine
328 | ninety
329 | no
330 | nobody
331 | non
332 | none
333 | nonetheless
334 | noone
335 | nor
336 | normally
337 | nos
338 | not
339 | noted
340 | nothing
341 | now
342 | nowhere
343 | o
344 | obtain
345 | obtained
346 | obviously
347 | of
348 | off
349 | often
350 | oh
351 | ok
352 | okay
353 | old
354 | omitted
355 | on
356 | once
357 | one
358 | ones
359 | only
360 | onto
361 | or
362 | ord
363 | other
364 | others
365 | otherwise
366 | ought
367 | our
368 | ours
369 | ourselves
370 | out
371 | outside
372 | over
373 | overall
374 | owing
375 | own
376 | p
377 | page
378 | pages
379 | part
380 | particular
381 | particularly
382 | past
383 | per
384 | perhaps
385 | placed
386 | please
387 | plus
388 | poorly
389 | possible
390 | possibly
391 | potentially
392 | pp
393 | predominantly
394 | present
395 | previously
396 | primarily
397 | probably
398 | promptly
399 | proud
400 | provides
401 | put
402 | q
403 | que
404 | quickly
405 | quite
406 | qv
407 | r
408 | ran
409 | rather
410 | rd
411 | re
412 | readily
413 | really
414 | recent
415 | recently
416 | ref
417 | refs
418 | regarding
419 | regardless
420 | regards
421 | related
422 | relatively
423 | research
424 | respectively
425 | resulted
426 | resulting
427 | results
428 | right
429 | run
430 | s
431 | said
432 | same
433 | saw
434 | say
435 | saying
436 | says
437 | sec
438 | section
439 | see
440 | seeing
441 | seem
442 | seemed
443 | seeming
444 | seems
445 | seen
446 | self
447 | selves
448 | sent
449 | seven
450 | several
451 | shall
452 | she
453 | shed
454 | she'll
455 | shes
456 | should
457 | shouldn't
458 | show
459 | showed
460 | shown
461 | showns
462 | shows
463 | significant
464 | significantly
465 | similar
466 | similarly
467 | since
468 | six
469 | slightly
470 | so
471 | some
472 | somebody
473 | somehow
474 | someone
475 | somethan
476 | something
477 | sometime
478 | sometimes
479 | somewhat
480 | somewhere
481 | soon
482 | sorry
483 | specifically
484 | specified
485 | specify
486 | specifying
487 | still
488 | stop
489 | strongly
490 | sub
491 | substantially
492 | successfully
493 | such
494 | sufficiently
495 | suggest
496 | sup
497 | sure t
498 | take
499 | taken
500 | taking
501 | tell
502 | tends
503 | th
504 | than
505 | thank
506 | thanks
507 | thanx
508 | that
509 | that'll
510 | thats
511 | that've
512 | the
513 | their
514 | theirs
515 | them
516 | themselves
517 | then
518 | thence
519 | there
520 | thereafter
521 | thereby
522 | thered
523 | therefore
524 | therein
525 | there'll
526 | thereof
527 | therere
528 | theres
529 | thereto
530 | thereupon
531 | there've
532 | these
533 | they
534 | theyd
535 | they'll
536 | theyre
537 | they've
538 | think
539 | this
540 | those
541 | thou
542 | though
543 | thoughh
544 | thousand
545 | throug
546 | through
547 | throughout
548 | thru
549 | thus
550 | til
551 | tip
552 | to
553 | together
554 | too
555 | took
556 | toward
557 | towards
558 | tried
559 | tries
560 | truly
561 | try
562 | trying
563 | ts
564 | twice
565 | two
566 | u
567 | un
568 | under
569 | unfortunately
570 | unless
571 | unlike
572 | unlikely
573 | until
574 | unto
575 | up
576 | upon
577 | ups
578 | us
579 | use
580 | used
581 | useful
582 | usefully
583 | usefulness
584 | uses
585 | using
586 | usually
587 | v
588 | value
589 | various
590 | 've
591 | very
592 | via
593 | viz
594 | vol
595 | vols
596 | vs
597 | w
598 | want
599 | wants
600 | was
601 | wasnt
602 | way
603 | we
604 | wed
605 | welcome
606 | we'll
607 | went
608 | were
609 | werent
610 | we've
611 | what
612 | whatever
613 | what'll
614 | whats
615 | when
616 | whence
617 | whenever
618 | where
619 | whereafter
620 | whereas
621 | whereby
622 | wherein
623 | wheres
624 | whereupon
625 | wherever
626 | whether
627 | which
628 | while
629 | whim
630 | whither
631 | who
632 | whod
633 | whoever
634 | whole
635 | who'll
636 | whom
637 | whomever
638 | whos
639 | whose
640 | why
641 | widely
642 | willing
643 | wish
644 | with
645 | within
646 | without
647 | wont
648 | words
649 | world
650 | would
651 | wouldnt
652 | www
653 | x
654 | y
655 | yes
656 | yet
657 | you
658 | youd
659 | you'll
660 | your
661 | youre
662 | yours
663 | yourself
664 | yourselves
665 | you've
666 | z
667 | zero
668 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Implementation of TextRank for Keyword Extraction
3 |
4 | Based on:
5 |
6 | [TextRank: Bringing Order into Texts - by Rada Mihalcea and Paul Tarau](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf)
7 |
8 | The input text is given below
9 |
10 |
11 | ```python
12 | #Source of text:
13 | #https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents
14 |
15 | Text = "Compatibility of systems of linear constraints over the set of natural numbers. \
16 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and \
17 | nonstrict inequations are considered. \
18 | Upper bounds for components of a minimal set of solutions and \
19 | algorithms of construction of minimal generating sets of solutions for all \
20 | types of systems are given. \
21 | These criteria and the corresponding algorithms for constructing \
22 | a minimal supporting set of solutions can be used in solving all the \
23 | considered types of systems and systems of mixed types."
24 | ```
25 |
26 | ### Cleaning Text Data
27 |
28 | The raw input text is cleaned of non-printable characters (if any) and converted to lower case.
29 | The processed input text is then tokenized using NLTK library functions.
30 |
31 |
32 | ```python
33 |
34 | import nltk
35 | from nltk import word_tokenize
36 | import string
37 |
38 | #nltk.download('punkt')
39 |
40 | def clean(text):
41 | text = text.lower()
42 | printable = set(string.printable)
43 |     text = "".join(filter(lambda x: x in printable, text)) #filter out non-printable characters, if any
44 | return text
45 |
46 | Cleaned_text = clean(Text)
47 |
48 | text = word_tokenize(Cleaned_text)
49 |
50 | print ("Tokenized Text: \n")
51 | print (text)
52 | ```
53 |
54 | Tokenized Text:
55 |
56 | ['compatibility', 'of', 'systems', 'of', 'linear', 'constraints', 'over', 'the', 'set', 'of', 'natural', 'numbers', '.', 'criteria', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equations', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bounds', 'for', 'components', 'of', 'a', 'minimal', 'set', 'of', 'solutions', 'and', 'algorithms', 'of', 'construction', 'of', 'minimal', 'generating', 'sets', 'of', 'solutions', 'for', 'all', 'types', 'of', 'systems', 'are', 'given', '.', 'these', 'criteria', 'and', 'the', 'corresponding', 'algorithms', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solutions', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'types', 'of', 'systems', 'and', 'systems', 'of', 'mixed', 'types', '.']
57 |
58 |
59 | ### POS Tagging For Lemmatization
60 |
61 | NLTK is again used for POS tagging the input text so that the words can be lemmatized based on their POS tags.
62 |
63 | Description of POS tags:
64 |
65 |
66 | http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
67 |
68 |
69 | ```python
70 | #nltk.download('averaged_perceptron_tagger')
71 |
72 | POS_tag = nltk.pos_tag(text)
73 |
74 | print ("Tokenized Text with POS tags: \n")
75 | print (POS_tag)
76 | ```
77 |
78 | Tokenized Text with POS tags:
79 |
80 | [('compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ('of', 'IN'), ('linear', 'JJ'), ('constraints', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('.', '.'), ('criteria', 'NNS'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'NN'), ('equations', 'NNS'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bounds', 'NNS'), ('for', 'IN'), ('components', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('and', 'CC'), ('algorithms', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('sets', 'NNS'), ('of', 'IN'), ('solutions', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criteria', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithms', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('and', 'CC'), ('systems', 'NNS'), ('of', 'IN'), ('mixed', 'JJ'), ('types', 'NNS'), ('.', '.')]
81 |
82 |
83 | ### Lemmatization
84 |
85 | The tokenized text (mainly the nouns and adjectives) is normalized by lemmatization.
86 | In lemmatization, the different grammatical forms of a word are replaced by a single
87 | base lemma. For example, 'glasses' may be replaced by 'glass'.
88 |
89 | Details about lemmatization:
90 |
91 | https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
92 |
93 |
94 | ```python
95 | #nltk.download('wordnet')
96 |
97 | from nltk.stem import WordNetLemmatizer
98 |
99 | wordnet_lemmatizer = WordNetLemmatizer()
100 |
101 | adjective_tags = ['JJ','JJR','JJS']
102 |
103 | lemmatized_text = []
104 |
105 | for word in POS_tag:
106 | if word[1] in adjective_tags:
107 | lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
108 | else:
109 | lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun
110 |
111 | print ("Text tokens after lemmatization of adjectives and nouns: \n")
112 | print (lemmatized_text)
113 | ```
114 |
115 | Text tokens after lemmatization of adjectives and nouns:
116 |
117 | ['compatibility', 'of', 'system', 'of', 'linear', 'constraint', 'over', 'the', 'set', 'of', 'natural', 'number', '.', 'criterion', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equation', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bound', 'for', 'component', 'of', 'a', 'minimal', 'set', 'of', 'solution', 'and', 'algorithm', 'of', 'construction', 'of', 'minimal', 'generating', 'set', 'of', 'solution', 'for', 'all', 'type', 'of', 'system', 'are', 'given', '.', 'these', 'criterion', 'and', 'the', 'corresponding', 'algorithm', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solution', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'type', 'of', 'system', 'and', 'system', 'of', 'mixed', 'type', '.']
118 |
119 |
120 | ### POS tagging for Filtering
121 |
122 | The lemmatized text is POS tagged here. The tags will be used for filtering later on.
123 |
124 |
125 | ```python
126 | POS_tag = nltk.pos_tag(lemmatized_text)
127 |
128 | print ("Lemmatized text with POS tags: \n")
129 | print (POS_tag)
130 | ```
131 |
132 | Lemmatized text with POS tags:
133 |
134 | [('compatibility', 'NN'), ('of', 'IN'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('constraint', 'NN'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('number', 'NN'), ('.', '.'), ('criterion', 'NN'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'JJ'), ('equation', 'NN'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bound', 'NN'), ('for', 'IN'), ('component', 'NN'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('and', 'CC'), ('algorithm', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('for', 'IN'), ('all', 'DT'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criterion', 'NN'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithm', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('and', 'CC'), ('system', 'NN'), ('of', 'IN'), ('mixed', 'JJ'), ('type', 'NN'), ('.', '.')]
135 |
136 |
137 | ## POS Based Filtering
138 |
139 | Any word from the lemmatized text that isn't a noun, adjective, or gerund (or a 'foreign word') is
140 | considered a stopword (non-content) here. This is based on the assumption that keywords are usually
141 | nouns, adjectives, or gerunds.
142 | 
143 | Punctuation marks are added to the stopword list too.
144 |
145 |
146 | ```python
147 | stopwords = []
148 |
149 | wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW']
150 |
151 | for word in POS_tag:
152 | if word[1] not in wanted_POS:
153 | stopwords.append(word[0])
154 |
155 | punctuations = list(str(string.punctuation))
156 |
157 | stopwords = stopwords + punctuations
158 | ```
159 |
160 | ### Complete stopword generation
161 |
162 | Even after removing the stopwords above, some extremely common nouns, adjectives, or gerunds may
163 | remain that are very poor candidates for keywords (or parts of keywords).
164 | 
165 | An external file containing a long list of stopwords is loaded, and its words are combined with the previous
166 | stopwords to create the final list 'stopwords_plus', which is then converted into a set.
167 | 
168 | (Source of stopwords data: https://www.ranks.nl/stopwords)
169 | 
170 | stopwords_plus constitutes the sum total of all stopwords and potential phrase delimiters.
171 | 
172 | (The contents of this set will later be used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords and work with a 'bag-of-words' approach, developing the graph with unigrams as vertices.)
173 |
174 | ```python
175 | stopword_file = open("long_stopwords.txt", "r")
176 | #Source = https://www.ranks.nl/stopwords
177 |
178 | lots_of_stopwords = []
179 |
180 | for line in stopword_file.readlines():
181 | lots_of_stopwords.append(str(line.strip()))
182 |
183 | stopwords_plus = []
184 | stopwords_plus = stopwords + lots_of_stopwords
185 | stopwords_plus = set(stopwords_plus)
186 |
187 | #stopwords_plus contains the total set of all stopwords
188 | ```
189 |
190 | ### Removing Stopwords
191 |
192 | Removing stopwords from lemmatized_text.
193 | processed_text contains the result.
194 |
195 |
196 | ```python
197 | processed_text = []
198 | for word in lemmatized_text:
199 | if word not in stopwords_plus:
200 | processed_text.append(word)
201 | print (processed_text)
202 | ```
203 |
204 | ['compatibility', 'system', 'linear', 'constraint', 'set', 'natural', 'number', 'criterion', 'compatibility', 'system', 'linear', 'diophantine', 'equation', 'strict', 'inequations', 'nonstrict', 'inequations', 'upper', 'bound', 'component', 'minimal', 'set', 'solution', 'algorithm', 'construction', 'minimal', 'generating', 'set', 'solution', 'type', 'system', 'criterion', 'algorithm', 'constructing', 'minimal', 'supporting', 'set', 'solution', 'solving', 'type', 'system', 'system', 'mixed', 'type']
205 |
206 |
207 | ## Vocabulary Creation
208 |
209 | Vocabulary will only contain unique words from processed_text.
210 |
211 |
212 | ```python
213 | vocabulary = list(set(processed_text))
214 | print (vocabulary)
215 | ```
216 |
217 | ['upper', 'set', 'constructing', 'number', 'solving', 'system', 'compatibility', 'strict', 'criterion', 'type', 'minimal', 'supporting', 'generating', 'linear', 'diophantine', 'component', 'bound', 'nonstrict', 'inequations', 'natural', 'algorithm', 'constraint', 'equation', 'solution', 'construction', 'mixed']
218 |
219 |
220 | ### Building Graph
221 |
222 | TextRank is a graph-based model, and thus it requires us to build a graph. Each word in the vocabulary will serve as a vertex of the graph. Each word is represented in the graph by its index in the vocabulary list.
223 | 
224 | The weighted_edge matrix contains the information about edge connections among all vertices.
225 | I am building a graph with weighted undirected edges.
226 | 
227 | weighted_edge[i][j] contains the weight of the edge connecting the word vertex represented by vocabulary index i and the word vertex represented by vocabulary index j.
228 | 
229 | If weighted_edge[i][j] is zero, no edge or connection is present between the words represented by indices i and j.
230 | 
231 | There is a connection between two words (and thus between the indices i and j that represent them) if the words co-occur within a window of a specified 'window_size' in processed_text.
232 | 
233 | The value of weighted_edge[i][j] is increased by 1/(distance between the positions of the words currently represented by i and j) for every connection discovered between the same words in different locations of the text (see the toy illustration below).
234 | 
235 | The covered_coocurrences list (which contains the pairs of absolute positions in processed_text whose co-occurrence at that location has already been counted) is maintained so that the same two words located at the same positions in processed_text are not counted repeatedly while sliding the window one text unit at a time.
236 | 
237 | The scores of all vertices are initialized to one.
238 |
239 | Self-connections are not considered, so weighted_edge[i][i] will be zero.
240 |
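For intuition, here is a tiny self-contained sketch (a hypothetical toy window, not part of the pipeline below) of how the 1/distance weighting accumulates within a single window:

```python
# Toy illustration only: with window_size = 3, a window such as
# ['linear', 'diophantine', 'equation'] contributes 1/1 to the
# linear--diophantine edge (distance 1), 1/2 to the linear--equation
# edge (distance 2), and 1/1 to the diophantine--equation edge.
toy_window = ['linear', 'diophantine', 'equation']
for a in range(len(toy_window)):
    for b in range(a + 1, len(toy_window)):
        print(toy_window[a], '--', toy_window[b], '-> +', 1 / abs(a - b))
```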
241 |
242 | ```python
243 | import numpy as np
244 | import math
245 | vocab_len = len(vocabulary)
246 |
247 | weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)
248 |
249 | score = np.zeros((vocab_len),dtype=np.float32)
250 | window_size = 3
251 | covered_coocurrences = []
252 |
253 | for i in range(0,vocab_len):
254 |     score[i]=1
255 |     for j in range(0,vocab_len):
256 |         if j==i:
257 |             weighted_edge[i][j]=0
258 |         else:
259 |             for window_start in range(0,(len(processed_text)-window_size+1)):
260 |
261 | window_end = window_start+window_size
262 |
263 | window = processed_text[window_start:window_end]
264 |
265 | if (vocabulary[i] in window) and (vocabulary[j] in window):
266 |
267 | index_of_i = window_start + window.index(vocabulary[i])
268 | index_of_j = window_start + window.index(vocabulary[j])
269 |
270 | # index_of_x is the absolute position of the xth term in the window
271 | # (counting from 0)
272 | # in the processed_text
273 |
274 | if [index_of_i,index_of_j] not in covered_coocurrences:
275 | weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)
276 | covered_coocurrences.append([index_of_i,index_of_j])
277 |
278 | ```
279 |
280 | ### Calculating weighted summation of connections of a vertex
281 |
282 | inout[i] will contain the sum of the weights of all undirected connections/edges associated with the vertex represented by i.
283 |
284 |
285 | ```python
286 | inout = np.zeros((vocab_len),dtype=np.float32)
287 |
288 | for i in range(0,vocab_len):
289 |     for j in range(0,vocab_len):
290 | inout[i]+=weighted_edge[i][j]
291 | ```
292 |
293 | ### Scoring Vertices
294 |
295 | The formula used for scoring a vertex represented by i is:
296 |
297 | score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ] where j ranges over the vertices that have a connection with i.
298 |
299 | d is the damping factor.
300 |
301 | The score is iteratively updated until convergence.
302 |
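For reference, this is the weighted PageRank update from the TextRank paper; written out in that notation (with In(V_i) and Out(V_i) being the same neighbour set here, since the graph is undirected, and inout[j] playing the role of the denominator sum):

```latex
WS(V_i) = (1-d) + d \cdot \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} \, WS(V_j)
```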
303 |
304 | ```python
305 | MAX_ITERATIONS = 50
306 | d=0.85
307 | threshold = 0.0001 #convergence threshold
308 |
309 | for iter in range(0,MAX_ITERATIONS):
310 |     prev_score = np.copy(score)
311 | 
312 |     for i in range(0,vocab_len):
313 | 
314 |         summation = 0
315 |         for j in range(0,vocab_len):
316 |             if weighted_edge[i][j] != 0:
317 |                 summation += (weighted_edge[i][j]/inout[j])*score[j]
318 | 
319 |         score[i] = (1-d) + d*(summation)
320 | 
321 |     if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition
322 |         print ("Converging at iteration "+str(iter)+"....")
323 | break
324 |
325 | ```
326 |
327 | Converging at iteration 29....
328 |
329 |
330 |
331 | ```python
332 | for i in range(0,vocab_len):
333 |     print ("Score of "+vocabulary[i]+": "+str(score[i]))
334 | ```
335 |
336 | Score of upper: 0.816792
337 | Score of set: 2.27184
338 | Score of constructing: 0.667288
339 | Score of number: 0.688316
340 | Score of solving: 0.642318
341 | Score of system: 2.12032
342 | Score of compatibility: 0.944584
343 | Score of strict: 0.823772
344 | Score of criterion: 1.22559
345 | Score of type: 1.08101
346 | Score of minimal: 1.78693
347 | Score of supporting: 0.653705
348 | Score of generating: 0.652645
349 | Score of linear: 1.2717
350 | Score of diophantine: 0.759295
351 | Score of component: 0.737641
352 | Score of bound: 0.786006
353 | Score of nonstrict: 0.827216
354 | Score of inequations: 1.30824
355 | Score of natural: 0.688299
356 | Score of algorithm: 1.19365
357 | Score of constraint: 0.674411
358 | Score of equation: 0.799815
359 | Score of solution: 1.6832
360 | Score of construction: 0.659809
361 | Score of mixed: 0.235822
362 |
363 |
364 | ### Phrase Partitioning
365 |
366 | Partitioning lemmatized_text into phrases, using the stopwords in it as delimiters.
367 | The phrases are also candidates for keyphrases to be extracted.
368 |
369 |
370 | ```python
371 | phrases = []
372 |
373 | phrase = " "
374 | for word in lemmatized_text:
375 |
376 | if word in stopwords_plus:
377 | if phrase!= " ":
378 | phrases.append(str(phrase).strip().split())
379 | phrase = " "
380 | elif word not in stopwords_plus:
381 | phrase+=str(word)
382 | phrase+=" "
383 |
384 | print ("Partitioned Phrases (Candidate Keyphrases): \n")
385 | print (phrases)
386 | ```
387 |
388 | Partitioned Phrases (Candidate Keyphrases):
389 |
390 | [['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['compatibility'], ['system'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['solution'], ['type'], ['system'], ['criterion'], ['algorithm'], ['constructing'], ['minimal', 'supporting', 'set'], ['solution'], ['solving'], ['type'], ['system'], ['system'], ['mixed', 'type']]
391 |
392 |
393 | ### Create a list of unique phrases.
394 |
395 | Repeated phrases / keyphrase candidates serve no purpose here anymore.
396 |
397 |
398 | ```python
399 | unique_phrases = []
400 |
401 | for phrase in phrases:
402 | if phrase not in unique_phrases:
403 | unique_phrases.append(phrase)
404 |
405 | print ("Unique Phrases (Candidate Keyphrases): \n")
406 | print (unique_phrases)
407 | ```
408 |
409 | Unique Phrases (Candidate Keyphrases):
410 |
411 | [['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['type'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]
412 |
413 |
414 | ### Thinning the list of candidate-keyphrases.
415 |
416 | Removing single-word keyphrase candidates that are also present in multi-word alternatives.
417 |
418 |
419 | ```python
420 | for word in vocabulary:
421 | #print word
422 | for phrase in unique_phrases:
423 | if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):
424 | #if len(phrase)>1 then the current phrase is multi-worded.
425 | #if the word in vocabulary is present in unique_phrases as a single-word-phrase
426 | # and at the same time present as a word within a multi-worded phrase,
427 | # then I will remove the single-word-phrase from the list.
428 | unique_phrases.remove([word])
429 |
430 | print ("Thinned Unique Phrases (Candidate Keyphrases): \n")
431 | print (unique_phrases)
432 | ```
433 |
434 | Thinned Unique Phrases (Candidate Keyphrases):
435 |
436 | [['compatibility'], ['system'], ['linear', 'constraint'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]
437 |
438 |
439 | ### Scoring Keyphrases
440 |
441 | Scoring the phrases (candidate keyphrases) and building up a list of keyphrases
442 | by joining the tokenized phrases / candidate keyphrases back into untokenized strings.
443 | Each phrase is scored by summing the scores of its member words (the text units that were ranked by the graph algorithm).
444 |
445 |
446 |
447 | ```python
448 | phrase_scores = []
449 | keywords = []
450 | for phrase in unique_phrases:
451 | phrase_score=0
452 | keyword = ''
453 | for word in phrase:
454 | keyword += str(word)
455 | keyword += " "
456 | phrase_score+=score[vocabulary.index(word)]
457 | phrase_scores.append(phrase_score)
458 | keywords.append(keyword.strip())
459 |
460 | i=0
461 | for keyword in keywords:
462 |     print ("Keyword: '"+str(keyword)+"', Score: "+str(phrase_scores[i]))
463 | i+=1
464 | ```
465 |
466 | Keyword: 'compatibility', Score: 0.944583714008
467 | Keyword: 'system', Score: 2.12031626701
468 | Keyword: 'linear constraint', Score: 1.94610738754
469 | Keyword: 'natural number', Score: 1.37661552429
470 | Keyword: 'criterion', Score: 1.2255872488
471 | Keyword: 'linear diophantine equation', Score: 2.83080631495
472 | Keyword: 'strict inequations', Score: 2.13201224804
473 | Keyword: 'nonstrict inequations', Score: 2.135455966
474 | Keyword: 'upper bound', Score: 1.60279768705
475 | Keyword: 'component', Score: 0.737640619278
476 | Keyword: 'minimal set', Score: 4.05876886845
477 | Keyword: 'solution', Score: 1.68319940567
478 | Keyword: 'algorithm', Score: 1.19365406036
479 | Keyword: 'construction', Score: 0.659808635712
480 | Keyword: 'minimal generating set', Score: 4.71141409874
481 | Keyword: 'constructing', Score: 0.66728836298
482 | Keyword: 'minimal supporting set', Score: 4.71247345209
483 | Keyword: 'solving', Score: 0.642318367958
484 | Keyword: 'mixed type', Score: 1.31682945788
485 |
486 |
487 | ### Ranking Keyphrases
488 |
489 | Ranking keyphrases based on their calculated scores. Displaying the top 'keywords_num' keyphrases.
490 |
491 |
492 | ```python
493 | sorted_index = np.flip(np.argsort(phrase_scores),0)
494 |
495 | keywords_num = 10
496 |
497 | print ("Keywords:\n")
498 | 
499 | for i in range(0,keywords_num):
500 |     print (str(keywords[sorted_index[i]])+", ", end=' ')
501 | ```
502 |
503 | Keywords:
504 |
505 | minimal supporting set, minimal generating set, minimal set, linear diophantine equation, nonstrict inequations, strict inequations, system, linear constraint, solution, upper bound,
506 |
507 |
508 | # Input:
509 |
510 | Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.
511 |
512 | # Extracted Keywords:
513 |
514 | * minimal supporting set,
515 | * minimal generating set,
516 | * minimal set,
517 | * linear diophantine equation,
518 | * nonstrict inequations,
519 | * strict inequations,
520 | * system,
521 | * linear constraint,
522 | * solution,
523 | * upper bound,
524 |
--------------------------------------------------------------------------------
/TextRank.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Implementation of TextRank\n",
8 | "(Based on: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf)"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "The input text is given below"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 6,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "#Source of text:\n",
25 | "#https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents\n",
26 | "\n",
27 | "Text = \"Compatibility of systems of linear constraints over the set of natural numbers. \\\n",
28 | "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and \\\n",
29 | "nonstrict inequations are considered. \\\n",
30 | "Upper bounds for components of a minimal set of solutions and \\\n",
31 | "algorithms of construction of minimal generating sets of solutions for all \\\n",
32 | "types of systems are given. \\\n",
33 | "These criteria and the corresponding algorithms for constructing \\\n",
34 | "a minimal supporting set of solutions can be used in solving all the \\\n",
35 | "considered types of systems and systems of mixed types.\""
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Cleaning Text Data\n",
43 | "\n",
44 |     "The raw input text is cleaned of non-printable characters (if any) and converted to lower case.\n",
45 | "The processed input text is then tokenized using NLTK library functions. "
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 20,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "Tokenized Text: \n",
58 | "\n",
59 | "['compatibility', 'of', 'systems', 'of', 'linear', 'constraints', 'over', 'the', 'set', 'of', 'natural', 'numbers', '.', 'criteria', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equations', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bounds', 'for', 'components', 'of', 'a', 'minimal', 'set', 'of', 'solutions', 'and', 'algorithms', 'of', 'construction', 'of', 'minimal', 'generating', 'sets', 'of', 'solutions', 'for', 'all', 'types', 'of', 'systems', 'are', 'given', '.', 'these', 'criteria', 'and', 'the', 'corresponding', 'algorithms', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solutions', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'types', 'of', 'systems', 'and', 'systems', 'of', 'mixed', 'types', '.']\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "\n",
65 | "import nltk\n",
66 | "from nltk import word_tokenize\n",
67 | "import string\n",
68 | "\n",
69 | "#nltk.download('punkt')\n",
70 | "\n",
71 | "def clean(text):\n",
72 | " text = text.lower()\n",
73 | " printable = set(string.printable)\n",
74 | " text = filter(lambda x: x in printable, text)\n",
75 | " text = \"\".join(list(text))\n",
76 | " return text\n",
77 | "\n",
78 | "Cleaned_text = clean(Text)\n",
79 | "# print(Cleaned_text)\n",
80 | "text = word_tokenize(Cleaned_text)\n",
81 | "\n",
82 | "print (\"Tokenized Text: \\n\")\n",
83 | "print (text)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "### POS Tagging For Lemmatization\n",
91 | "\n",
92 | "NLTK is again used for POS tagging the input text so that the words can be lemmatized based on their POS tags.\n",
93 | "\n",
94 | "Description of POS tags: \n",
95 | "\n",
96 | "\n",
97 | "http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 22,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "Tokenized Text with POS tags: \n",
110 | "\n",
111 | "[('compatibility', 'NN'), ('of', 'IN'), ('systems', 'NNS'), ('of', 'IN'), ('linear', 'JJ'), ('constraints', 'NNS'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('.', '.'), ('criteria', 'NNS'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'NN'), ('equations', 'NNS'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bounds', 'NNS'), ('for', 'IN'), ('components', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('and', 'CC'), ('algorithms', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('sets', 'NNS'), ('of', 'IN'), ('solutions', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criteria', 'NNS'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithms', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solutions', 'NNS'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('types', 'NNS'), ('of', 'IN'), ('systems', 'NNS'), ('and', 'CC'), ('systems', 'NNS'), ('of', 'IN'), ('mixed', 'JJ'), ('types', 'NNS'), ('.', '.')]\n"
112 | ]
113 | }
114 | ],
115 | "source": [
116 | "#nltk.download('averaged_perceptron_tagger')\n",
117 | " \n",
118 | "POS_tag = nltk.pos_tag(text)\n",
119 | "\n",
120 | "print (\"Tokenized Text with POS tags: \\n\")\n",
121 | "print (POS_tag)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "### Lemmatization\n",
129 | "\n",
130 | "The tokenized text (mainly the nouns and adjectives) is normalized by lemmatization.\n",
131 |     "In lemmatization, the different grammatical forms of a word are replaced by a single\n",
132 |     "base lemma. For example, 'glasses' may be replaced by 'glass'. \n",
133 | "\n",
134 | "Details about lemmatization: \n",
135 | " \n",
136 | "https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 23,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Text tokens after lemmatization of adjectives and nouns: \n",
149 | "\n",
150 | "['compatibility', 'of', 'system', 'of', 'linear', 'constraint', 'over', 'the', 'set', 'of', 'natural', 'number', '.', 'criterion', 'of', 'compatibility', 'of', 'a', 'system', 'of', 'linear', 'diophantine', 'equation', ',', 'strict', 'inequations', ',', 'and', 'nonstrict', 'inequations', 'are', 'considered', '.', 'upper', 'bound', 'for', 'component', 'of', 'a', 'minimal', 'set', 'of', 'solution', 'and', 'algorithm', 'of', 'construction', 'of', 'minimal', 'generating', 'set', 'of', 'solution', 'for', 'all', 'type', 'of', 'system', 'are', 'given', '.', 'these', 'criterion', 'and', 'the', 'corresponding', 'algorithm', 'for', 'constructing', 'a', 'minimal', 'supporting', 'set', 'of', 'solution', 'can', 'be', 'used', 'in', 'solving', 'all', 'the', 'considered', 'type', 'of', 'system', 'and', 'system', 'of', 'mixed', 'type', '.']\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "#nltk.download('wordnet')\n",
156 | "\n",
157 | "from nltk.stem import WordNetLemmatizer\n",
158 | "\n",
159 | "wordnet_lemmatizer = WordNetLemmatizer()\n",
160 | "\n",
161 | "adjective_tags = ['JJ','JJR','JJS']\n",
162 | "\n",
163 | "lemmatized_text = []\n",
164 | "\n",
165 | "for word in POS_tag:\n",
166 | " if word[1] in adjective_tags:\n",
167 | " lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos=\"a\")))\n",
168 | " else:\n",
169 | " lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun\n",
170 | " \n",
171 | "print (\"Text tokens after lemmatization of adjectives and nouns: \\n\")\n",
172 | "print (lemmatized_text)"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### POS tagging for Filtering\n",
180 | "\n",
181 | "The lemmatized text is POS tagged here. The tags will be used for filtering later on."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 24,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "Lemmatized text with POS tags: \n",
194 | "\n",
195 | "[('compatibility', 'NN'), ('of', 'IN'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('constraint', 'NN'), ('over', 'IN'), ('the', 'DT'), ('set', 'NN'), ('of', 'IN'), ('natural', 'JJ'), ('number', 'NN'), ('.', '.'), ('criterion', 'NN'), ('of', 'IN'), ('compatibility', 'NN'), ('of', 'IN'), ('a', 'DT'), ('system', 'NN'), ('of', 'IN'), ('linear', 'JJ'), ('diophantine', 'JJ'), ('equation', 'NN'), (',', ','), ('strict', 'JJ'), ('inequations', 'NNS'), (',', ','), ('and', 'CC'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('are', 'VBP'), ('considered', 'VBN'), ('.', '.'), ('upper', 'JJ'), ('bound', 'NN'), ('for', 'IN'), ('component', 'NN'), ('of', 'IN'), ('a', 'DT'), ('minimal', 'JJ'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('and', 'CC'), ('algorithm', 'NN'), ('of', 'IN'), ('construction', 'NN'), ('of', 'IN'), ('minimal', 'JJ'), ('generating', 'VBG'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('for', 'IN'), ('all', 'DT'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('are', 'VBP'), ('given', 'VBN'), ('.', '.'), ('these', 'DT'), ('criterion', 'NN'), ('and', 'CC'), ('the', 'DT'), ('corresponding', 'JJ'), ('algorithm', 'NN'), ('for', 'IN'), ('constructing', 'VBG'), ('a', 'DT'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('of', 'IN'), ('solution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('in', 'IN'), ('solving', 'VBG'), ('all', 'PDT'), ('the', 'DT'), ('considered', 'VBN'), ('type', 'NN'), ('of', 'IN'), ('system', 'NN'), ('and', 'CC'), ('system', 'NN'), ('of', 'IN'), ('mixed', 'JJ'), ('type', 'NN'), ('.', '.')]\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "POS_tag = nltk.pos_tag(lemmatized_text)\n",
201 | "\n",
202 | "print (\"Lemmatized text with POS tags: \\n\")\n",
203 | "print (POS_tag)"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "## POS Based Filtering\n",
211 | "\n",
212 |     "Any word from the lemmatized text that isn't a noun, adjective, or gerund (or a 'foreign word') is\n",
213 |     "considered a stopword (non-content) here. This is based on the assumption that keywords are usually\n",
214 |     "nouns, adjectives, or gerunds. \n",
215 |     "\n",
216 |     "Punctuation marks are added to the stopword list too."
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 25,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "stopwords = []\n",
226 | "\n",
227 | "wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] \n",
228 | "\n",
229 | "for word in POS_tag:\n",
230 | " if word[1] not in wanted_POS:\n",
231 | " stopwords.append(word[0])\n",
232 | "\n",
233 | "punctuations = list(str(string.punctuation))\n",
234 | "\n",
235 | "stopwords = stopwords + punctuations"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### Complete stopword generation\n",
243 | "\n",
244 |     "Even after removing the stopwords above, some extremely common nouns, adjectives, or gerunds may\n",
245 |     "remain that are very poor candidates for keywords (or parts of keywords). \n",
246 |     "\n",
247 |     "An external file containing a long list of stopwords is loaded, and its words are combined with the previous\n",
248 |     "stopwords to create the final list 'stopwords_plus', which is then converted into a set. \n",
249 |     "\n",
250 |     "(Source of stopwords data: https://www.ranks.nl/stopwords)\n",
251 |     "\n",
252 |     "stopwords_plus constitutes the sum total of all stopwords and potential phrase delimiters. \n",
253 | "\n",
254 |     "(The contents of this set will later be used to partition the lemmatized text into n-gram phrases. But, for now, I will simply remove the stopwords and work with a 'bag-of-words' approach, developing the graph with unigrams as vertices.)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 27,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "stopword_file = open(\"long_stopwords.txt\", \"r\")\n",
264 | "#Source = https://www.ranks.nl/stopwords\n",
265 | "\n",
266 | "lots_of_stopwords = []\n",
267 | "\n",
268 | "for line in stopword_file.readlines():\n",
269 | " lots_of_stopwords.append(str(line.strip()))\n",
270 | "\n",
271 | "stopwords_plus = []\n",
272 | "stopwords_plus = stopwords + lots_of_stopwords\n",
273 | "stopwords_plus = set(stopwords_plus)\n",
274 | "\n",
275 |     "#stopwords_plus contains the total set of all stopwords"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "### Removing Stopwords \n",
283 | "\n",
284 | "Removing stopwords from lemmatized_text. \n",
285 |     "processed_text contains the result."
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 29,
291 | "metadata": {},
292 | "outputs": [
293 | {
294 | "name": "stdout",
295 | "output_type": "stream",
296 | "text": [
297 | "['compatibility', 'system', 'linear', 'constraint', 'set', 'natural', 'number', 'criterion', 'compatibility', 'system', 'linear', 'diophantine', 'equation', 'strict', 'inequations', 'nonstrict', 'inequations', 'upper', 'bound', 'component', 'minimal', 'set', 'solution', 'algorithm', 'construction', 'minimal', 'generating', 'set', 'solution', 'type', 'system', 'criterion', 'algorithm', 'constructing', 'minimal', 'supporting', 'set', 'solution', 'solving', 'type', 'system', 'system', 'mixed', 'type']\n"
298 | ]
299 | }
300 | ],
301 | "source": [
302 | "processed_text = []\n",
303 | "for word in lemmatized_text:\n",
304 | " if word not in stopwords_plus:\n",
305 | " processed_text.append(word)\n",
306 | "print (processed_text)"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "## Vocabulary Creation\n",
314 | "\n",
315 | "Vocabulary will only contain unique words from processed_text."
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 31,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "['solving', 'equation', 'generating', 'diophantine', 'construction', 'set', 'mixed', 'minimal', 'compatibility', 'component', 'system', 'natural', 'inequations', 'constraint', 'criterion', 'type', 'upper', 'solution', 'linear', 'algorithm', 'strict', 'bound', 'nonstrict', 'number', 'supporting', 'constructing']\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "vocabulary = list(set(processed_text))\n",
333 | "print (vocabulary)"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "### Building Graph\n",
341 | "\n",
342 |     "TextRank is a graph-based model, and thus it requires us to build a graph. Each word in the vocabulary will serve as a vertex of the graph. Each word is represented in the graph by its index in the vocabulary list. \n",
343 |     "\n",
344 |     "The weighted_edge matrix contains the information about edge connections among all vertices.\n",
345 |     "I am building weighted undirected edges.\n",
346 | "\n",
347 |     "weighted_edge[i][j] contains the weight of the edge connecting the word vertex represented by vocabulary index i and the word vertex represented by vocabulary index j.\n",
348 |     "\n",
349 |     "If weighted_edge[i][j] is zero, no edge connection is present between the words represented by indices i and j.\n",
350 |     "\n",
351 |     "There is a connection between two words (and thus between the indices i and j that represent them) if the words co-occur within a window of a specified 'window_size' in processed_text.\n",
352 |     "\n",
353 |     "The value of the weighted_edge[i][j] is increased by (1/(distance between positions of words currently represented by i and j)) for every connection discovered between the same words in different locations of the text. \n",
354 |     "\n",
355 |     "The covered_coocurrences list (which contains the pairs of absolute positions in processed_text whose co-occurrence at that location has already been counted) is maintained so that the same two words located at the same positions in processed_text are not counted repeatedly while sliding the window one text unit at a time.\n",
356 |     "\n",
357 |     "The scores of all vertices are initialized to one. \n",
358 | "\n",
359 | "Self-connections are not considered, so weighted_edge[i][i] will be zero."
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 33,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "import numpy as np\n",
369 | "import math\n",
370 | "vocab_len = len(vocabulary)\n",
371 | "\n",
372 | "weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)\n",
373 | "\n",
374 | "score = np.zeros((vocab_len),dtype=np.float32)\n",
375 | "window_size = 3\n",
376 | "covered_coocurrences = []\n",
377 | "\n",
378 | "for i in range(0,vocab_len):\n",
379 | " score[i]=1\n",
380 | " for j in range(0,vocab_len):\n",
381 | " if j==i:\n",
382 | " weighted_edge[i][j]=0\n",
383 | " else:\n",
384 | " for window_start in range(0,(len(processed_text)-window_size)):\n",
385 | " \n",
386 | " window_end = window_start+window_size\n",
387 | " \n",
388 | " window = processed_text[window_start:window_end]\n",
389 | " \n",
390 | " if (vocabulary[i] in window) and (vocabulary[j] in window):\n",
391 | " \n",
392 | " index_of_i = window_start + window.index(vocabulary[i])\n",
393 | " index_of_j = window_start + window.index(vocabulary[j])\n",
394 | " \n",
395 | " # index_of_x is the absolute position of the xth term in the window \n",
396 | " # (counting from 0) \n",
397 | " # in the processed_text\n",
398 | " \n",
399 | " if [index_of_i,index_of_j] not in covered_coocurrences:\n",
400 | " weighted_edge[i][j]+=1/math.fabs(index_of_i-index_of_j)\n",
401 | " covered_coocurrences.append([index_of_i,index_of_j])\n"
402 | ]
403 | },
404 | {
405 | "cell_type": "markdown",
406 | "metadata": {},
407 | "source": [
408 | "### Calculating weighted summation of connections of a vertex\n",
409 | "\n",
410 |     "inout[i] will contain the sum of the weights of all undirected connections/edges associated with the vertex represented by i."
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 34,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "inout = np.zeros((vocab_len),dtype=np.float32)\n",
420 | "\n",
421 | "for i in range(0,vocab_len):\n",
422 | " for j in range(0,vocab_len):\n",
423 | " inout[i]+=weighted_edge[i][j]"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "### Scoring Vertices\n",
431 | "\n",
432 | "The formula used for scoring a vertex represented by i is:\n",
433 | "\n",
434 |     "score[i] = (1-d) + d x [ Summation(j) ( (weighted_edge[i][j]/inout[j]) x score[j] ) ] where j ranges over the vertices that have a connection with i. \n",
435 | "\n",
436 | "d is the damping factor.\n",
437 | "\n",
438 | "The score is iteratively updated until convergence. "
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": 35,
444 | "metadata": {},
445 | "outputs": [
446 | {
447 | "name": "stdout",
448 | "output_type": "stream",
449 | "text": [
450 | "Converging at iteration 23....\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "MAX_ITERATIONS = 50\n",
456 | "d=0.85\n",
457 | "threshold = 0.0001 #convergence threshold\n",
458 | "\n",
459 | "for iter in range(0,MAX_ITERATIONS):\n",
460 | " prev_score = np.copy(score)\n",
461 | " \n",
462 | " for i in range(0,vocab_len):\n",
463 | " \n",
464 | " summation = 0\n",
465 | " for j in range(0,vocab_len):\n",
466 | " if weighted_edge[i][j] != 0:\n",
467 | " summation += (weighted_edge[i][j]/inout[j])*score[j]\n",
468 | " \n",
469 | " score[i] = (1-d) + d*(summation)\n",
470 | " \n",
471 | " if np.sum(np.fabs(prev_score-score)) <= threshold: #convergence condition\n",
472 | " print(\"Converging at iteration \"+str(iter)+\"....\")\n",
473 | " break\n"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 36,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "name": "stdout",
483 | "output_type": "stream",
484 | "text": [
485 | "Score of solving: 0.64231944\n",
486 | "Score of equation: 0.79981786\n",
487 | "Score of generating: 0.65264744\n",
488 | "Score of diophantine: 0.759297\n",
489 | "Score of construction: 0.6598107\n",
490 | "Score of set: 2.2718465\n",
491 | "Score of mixed: 0.2358227\n",
492 | "Score of minimal: 1.7869267\n",
493 | "Score of compatibility: 0.9445859\n",
494 | "Score of component: 0.73764145\n",
495 | "Score of system: 2.1203177\n",
496 | "Score of natural: 0.6883006\n",
497 | "Score of inequations: 1.308244\n",
498 | "Score of constraint: 0.67441183\n",
499 | "Score of criterion: 1.2255884\n",
500 | "Score of type: 1.0810083\n",
501 | "Score of upper: 0.8167923\n",
502 | "Score of solution: 1.683202\n",
503 | "Score of linear: 1.2716976\n",
504 | "Score of algorithm: 1.1936545\n",
505 | "Score of strict: 0.8237729\n",
506 | "Score of bound: 0.78600633\n",
507 | "Score of nonstrict: 0.8272164\n",
508 | "Score of number: 0.6883157\n",
509 | "Score of supporting: 0.6537049\n",
510 | "Score of constructing: 0.66728705\n"
511 | ]
512 | }
513 | ],
514 | "source": [
515 | "for i in range(0,vocab_len):\n",
516 | " print(\"Score of \"+vocabulary[i]+\": \"+str(score[i]))"
517 | ]
518 | },
519 | {
520 | "cell_type": "markdown",
521 | "metadata": {},
522 | "source": [
523 | "### Phrase Partiotioning\n",
524 | "\n",
525 | "Paritioning lemmatized_text into phrases using the stopwords in it as delimeters.\n",
526 | "The phrases are also candidates for keyphrases to be extracted. "
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 37,
532 | "metadata": {},
533 | "outputs": [
534 | {
535 | "name": "stdout",
536 | "output_type": "stream",
537 | "text": [
538 | "Partitioned Phrases (Candidate Keyphrases): \n",
539 | "\n",
540 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['compatibility'], ['system'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['solution'], ['type'], ['system'], ['criterion'], ['algorithm'], ['constructing'], ['minimal', 'supporting', 'set'], ['solution'], ['solving'], ['type'], ['system'], ['system'], ['mixed', 'type']]\n"
541 | ]
542 | }
543 | ],
544 | "source": [
545 | "phrases = []\n",
546 | "\n",
547 | "phrase = \" \"\n",
548 | "for word in lemmatized_text:\n",
549 | " \n",
550 | " if word in stopwords_plus:\n",
551 | " if phrase!= \" \":\n",
552 | " phrases.append(str(phrase).strip().split())\n",
553 | " phrase = \" \"\n",
554 | " elif word not in stopwords_plus:\n",
555 | " phrase+=str(word)\n",
556 | " phrase+=\" \"\n",
557 | "\n",
558 | "print(\"Partitioned Phrases (Candidate Keyphrases): \\n\")\n",
559 | "print(phrases)"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {},
565 | "source": [
566 | "### Create a list of unique phrases.\n",
567 | "\n",
568 | "Repeating phrases\\keyphrase candidates has no purpose here, anymore. "
569 | ]
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": 38,
574 | "metadata": {},
575 | "outputs": [
576 | {
577 | "name": "stdout",
578 | "output_type": "stream",
579 | "text": [
580 | "Unique Phrases (Candidate Keyphrases): \n",
581 | "\n",
582 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['set'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['type'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]\n"
583 | ]
584 | }
585 | ],
586 | "source": [
587 | "unique_phrases = []\n",
588 | "\n",
589 | "for phrase in phrases:\n",
590 | " if phrase not in unique_phrases:\n",
591 | " unique_phrases.append(phrase)\n",
592 | "\n",
593 | "print(\"Unique Phrases (Candidate Keyphrases): \\n\")\n",
594 | "print(unique_phrases)"
595 | ]
596 | },
597 | {
598 | "cell_type": "markdown",
599 | "metadata": {},
600 | "source": [
601 | "### Thinning the list of candidate-keyphrases.\n",
602 | "\n",
603 | "Removing single word keyphrases-candidates that are present multi-word alternatives. "
604 | ]
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": 39,
609 | "metadata": {},
610 | "outputs": [
611 | {
612 | "name": "stdout",
613 | "output_type": "stream",
614 | "text": [
615 | "Thinned Unique Phrases (Candidate Keyphrases): \n",
616 | "\n",
617 | "[['compatibility'], ['system'], ['linear', 'constraint'], ['natural', 'number'], ['criterion'], ['linear', 'diophantine', 'equation'], ['strict', 'inequations'], ['nonstrict', 'inequations'], ['upper', 'bound'], ['component'], ['minimal', 'set'], ['solution'], ['algorithm'], ['construction'], ['minimal', 'generating', 'set'], ['constructing'], ['minimal', 'supporting', 'set'], ['solving'], ['mixed', 'type']]\n"
618 | ]
619 | }
620 | ],
621 | "source": [
622 | "for word in vocabulary:\n",
623 | " #print word\n",
624 | " for phrase in unique_phrases:\n",
625 | " if (word in phrase) and ([word] in unique_phrases) and (len(phrase)>1):\n",
626 | " #if len(phrase)>1 then the current phrase is multi-worded.\n",
627 | " #if the word in vocabulary is present in unique_phrases as a single-word-phrase\n",
628 | " # and at the same time present as a word within a multi-worded phrase,\n",
629 | " # then I will remove the single-word-phrase from the list.\n",
630 | " unique_phrases.remove([word])\n",
631 | " \n",
632 | "print(\"Thinned Unique Phrases (Candidate Keyphrases): \\n\")\n",
633 | "print(unique_phrases) "
634 | ]
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {},
639 | "source": [
640 | "### Scoring Keyphrases\n",
641 | "\n",
642 | "Scoring the phrases (candidate keyphrases) and building up a list of keyphrases\\keywords\n",
643 | "by listing untokenized versions of tokenized phrases\\candidate-keyphrases.\n",
644 | "Phrases are scored by adding the score of their members (words\\text-units that were ranked by the graph algorithm)\n"
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": 40,
650 | "metadata": {},
651 | "outputs": [
652 | {
653 | "name": "stdout",
654 | "output_type": "stream",
655 | "text": [
656 | "Keyword: 'compatibility', Score: 0.944585919380188\n",
657 | "Keyword: 'system', Score: 2.1203176975250244\n",
658 | "Keyword: 'linear constraint', Score: 1.9461094737052917\n",
659 | "Keyword: 'natural number', Score: 1.3766162991523743\n",
660 | "Keyword: 'criterion', Score: 1.2255884408950806\n",
661 | "Keyword: 'linear diophantine equation', Score: 2.8308125138282776\n",
662 | "Keyword: 'strict inequations', Score: 2.132016897201538\n",
663 | "Keyword: 'nonstrict inequations', Score: 2.135460376739502\n",
664 | "Keyword: 'upper bound', Score: 1.6027986407279968\n",
665 | "Keyword: 'component', Score: 0.737641453742981\n",
666 | "Keyword: 'minimal set', Score: 4.0587732791900635\n",
667 | "Keyword: 'solution', Score: 1.6832020282745361\n",
668 | "Keyword: 'algorithm', Score: 1.1936545372009277\n",
669 | "Keyword: 'construction', Score: 0.6598107218742371\n",
670 | "Keyword: 'minimal generating set', Score: 4.711420714855194\n",
671 | "Keyword: 'constructing', Score: 0.6672870516777039\n",
672 | "Keyword: 'minimal supporting set', Score: 4.712478160858154\n",
673 | "Keyword: 'solving', Score: 0.6423194408416748\n",
674 | "Keyword: 'mixed type', Score: 1.3168310225009918\n"
675 | ]
676 | }
677 | ],
678 | "source": [
679 | "phrase_scores = []\n",
680 | "keywords = []\n",
681 | "for phrase in unique_phrases:\n",
682 | " phrase_score=0\n",
683 | " keyword = ''\n",
684 | " for word in phrase:\n",
685 | " keyword += str(word)\n",
686 | " keyword += \" \"\n",
687 | " phrase_score+=score[vocabulary.index(word)]\n",
688 | " phrase_scores.append(phrase_score)\n",
689 | " keywords.append(keyword.strip())\n",
690 | "\n",
691 | "i=0\n",
692 | "for keyword in keywords:\n",
693 | " print (\"Keyword: '\"+str(keyword)+\"', Score: \"+str(phrase_scores[i]))\n",
694 | " i+=1"
695 | ]
696 | },
697 | {
698 | "cell_type": "markdown",
699 | "metadata": {},
700 | "source": [
701 | "### Ranking Keyphrases\n",
702 | "\n",
703 | "Ranking keyphrases based on their calculated scores. Displaying top keywords_num no. of keyphrases."
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 43,
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "name": "stdout",
713 | "output_type": "stream",
714 | "text": [
715 | "Keywords:\n",
716 | "\n",
717 | "minimal supporting set, minimal generating set, minimal set, linear diophantine equation, nonstrict inequations, strict inequations, system, linear constraint, solution, upper bound, "
718 | ]
719 | }
720 | ],
721 | "source": [
722 | "sorted_index = np.flip(np.argsort(phrase_scores),0)\n",
723 | "\n",
724 | "keywords_num = 10\n",
725 | "\n",
726 | "print(\"Keywords:\\n\")\n",
727 | "\n",
728 | "for i in range(0,keywords_num):\n",
729 | " print(str(keywords[sorted_index[i]])+\", \", end=' ')"
730 | ]
731 | },
732 | {
733 | "cell_type": "markdown",
734 | "metadata": {},
735 | "source": [
736 | "# Input:\n",
737 | "\n",
738 | "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.\n",
739 | "\n",
740 | "# Extracted Keywords:\n",
741 | "\n",
742 | "* minimal supporting set, \n",
743 | "* minimal generating set, \n",
744 | "* minimal set, \n",
745 | "* linear diophantine equation, \n",
746 | "* nonstrict inequations, \n",
747 | "* strict inequations, \n",
748 | "* system, \n",
749 | "* linear constraint, \n",
750 | "* solution, \n",
751 | "* upper bound, \n"
752 | ]
753 | }
768 | ],
769 | "metadata": {
770 | "kernelspec": {
771 | "display_name": "Python 3",
772 | "language": "python",
773 | "name": "python3"
774 | },
775 | "language_info": {
776 | "codemirror_mode": {
777 | "name": "ipython",
778 | "version": 3
779 | },
780 | "file_extension": ".py",
781 | "mimetype": "text/x-python",
782 | "name": "python",
783 | "nbconvert_exporter": "python",
784 | "pygments_lexer": "ipython3",
785 | "version": "3.7.4"
786 | }
787 | },
788 | "nbformat": 4,
789 | "nbformat_minor": 2
790 | }
791 |
--------------------------------------------------------------------------------