├── README.md ├── MIT-License.txt ├── .gitignore ├── FoxStoplist.txt ├── SmartStoplist.txt └── rake.py /README.md: -------------------------------------------------------------------------------- 1 | RAKE 2 | ==== 3 | 4 | A Python implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm as described in: Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.), Text Mining: Theory and Applications: John Wiley & Sons. 5 | 6 | The source code is released under the MIT License. 7 | -------------------------------------------------------------------------------- /MIT-License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by http://www.gitignore.io 2 | 3 | ### Python ### 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | bin/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # Installer logs 29 | pip-log.txt 30 | pip-delete-this-directory.txt 31 | 32 | # Unit test / coverage reports 33 | .tox/ 34 | .coverage 35 | .cache 36 | nosetests.xml 37 | coverage.xml 38 | 39 | # Translations 40 | *.mo 41 | 42 | # Mr Developer 43 | .mr.developer.cfg 44 | .project 45 | .pydevproject 46 | 47 | # Rope 48 | .ropeproject 49 | 50 | # Django stuff: 51 | *.log 52 | *.pot 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | 58 | 59 | ### PyCharm ### 60 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode 61 | 62 | ## Directory-based project format 63 | .idea/ 64 | # if you remove the above rule, at least ignore user-specific stuff: 65 | # .idea/workspace.xml 66 | # .idea/tasks.xml 67 | # and these sensitive or high-churn files: 68 | # .idea/dataSources.ids 69 | # .idea/dataSources.xml 70 | # .idea/sqlDataSources.xml 71 | # .idea/dynamic.xml 72 | 73 | ## File-based project format 74 | *.ipr 75 | *.iws 76 | *.iml 77 | 78 | ## Additional for IntelliJ 79 | out/ 80 | 81 | # generated by mpeltonen/sbt-idea plugin 82 | .idea_modules/ 83 | 84 | # generated by JIRA plugin 85 | atlassian-ide-plugin.xml 86 | 87 | # generated by Crashlytics plugin (for Android Studio and Intellij) 88 | com_crashlytics_export_strings.xml 89 | 90 | -------------------------------------------------------------------------------- /FoxStoplist.txt: 
-------------------------------------------------------------------------------- 1 | #From "A stop list for general text" Fox 1989 2 | a 3 | about 4 | above 5 | across 6 | after 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | among 18 | an 19 | and 20 | another 21 | any 22 | anybody 23 | anyone 24 | anything 25 | anywhere 26 | are 27 | area 28 | areas 29 | around 30 | as 31 | ask 32 | asked 33 | asking 34 | asks 35 | at 36 | away 37 | b 38 | back 39 | backed 40 | backing 41 | backs 42 | be 43 | because 44 | became 45 | become 46 | becomes 47 | been 48 | before 49 | began 50 | behind 51 | being 52 | beings 53 | best 54 | better 55 | between 56 | big 57 | both 58 | but 59 | by 60 | c 61 | came 62 | can 63 | cannot 64 | case 65 | cases 66 | certain 67 | certainly 68 | clear 69 | clearly 70 | come 71 | could 72 | d 73 | did 74 | differ 75 | different 76 | differently 77 | do 78 | does 79 | done 80 | down 81 | downed 82 | downing 83 | downs 84 | during 85 | e 86 | each 87 | early 88 | either 89 | end 90 | ended 91 | ending 92 | ends 93 | enough 94 | even 95 | evenly 96 | ever 97 | every 98 | everybody 99 | everyone 100 | everything 101 | everywhere 102 | f 103 | face 104 | faces 105 | fact 106 | facts 107 | far 108 | felt 109 | few 110 | find 111 | finds 112 | first 113 | for 114 | four 115 | from 116 | full 117 | fully 118 | further 119 | furthered 120 | furthering 121 | furthers 122 | g 123 | gave 124 | general 125 | generally 126 | get 127 | gets 128 | give 129 | given 130 | gives 131 | go 132 | going 133 | good 134 | goods 135 | got 136 | great 137 | greater 138 | greatest 139 | group 140 | grouped 141 | grouping 142 | groups 143 | h 144 | had 145 | has 146 | have 147 | having 148 | he 149 | her 150 | herself 151 | here 152 | high 153 | higher 154 | highest 155 | him 156 | himself 157 | his 158 | how 159 | however 160 | i 161 | if 162 | important 163 | in 164 | interest 165 | interested 166 | 
interesting 167 | interests 168 | into 169 | is 170 | it 171 | its 172 | itself 173 | j 174 | just 175 | k 176 | keep 177 | keeps 178 | kind 179 | knew 180 | know 181 | known 182 | knows 183 | l 184 | large 185 | largely 186 | last 187 | later 188 | latest 189 | least 190 | less 191 | let 192 | lets 193 | like 194 | likely 195 | long 196 | longer 197 | longest 198 | m 199 | made 200 | make 201 | making 202 | man 203 | many 204 | may 205 | me 206 | member 207 | members 208 | men 209 | might 210 | more 211 | most 212 | mostly 213 | mr 214 | mrs 215 | much 216 | must 217 | my 218 | myself 219 | n 220 | necessary 221 | need 222 | needed 223 | needing 224 | needs 225 | never 226 | new 227 | newer 228 | newest 229 | next 230 | no 231 | non 232 | not 233 | nobody 234 | noone 235 | nothing 236 | now 237 | nowhere 238 | number 239 | numbered 240 | numbering 241 | numbers 242 | o 243 | of 244 | off 245 | often 246 | old 247 | older 248 | oldest 249 | on 250 | once 251 | one 252 | only 253 | open 254 | opened 255 | opening 256 | opens 257 | or 258 | order 259 | ordered 260 | ordering 261 | orders 262 | other 263 | others 264 | our 265 | out 266 | over 267 | p 268 | part 269 | parted 270 | parting 271 | parts 272 | per 273 | perhaps 274 | place 275 | places 276 | point 277 | pointed 278 | pointing 279 | points 280 | possible 281 | present 282 | presented 283 | presenting 284 | presents 285 | problem 286 | problems 287 | put 288 | puts 289 | q 290 | quite 291 | r 292 | rather 293 | really 294 | right 295 | room 296 | rooms 297 | s 298 | said 299 | same 300 | saw 301 | say 302 | says 303 | second 304 | seconds 305 | see 306 | seem 307 | seemed 308 | seeming 309 | seems 310 | sees 311 | several 312 | shall 313 | she 314 | should 315 | show 316 | showed 317 | showing 318 | shows 319 | side 320 | sides 321 | since 322 | small 323 | smaller 324 | smallest 325 | so 326 | some 327 | somebody 328 | someone 329 | something 330 | somewhere 331 | state 332 | states 333 | still 334 | such 
335 | sure 336 | t 337 | take 338 | taken 339 | than 340 | that 341 | the 342 | their 343 | them 344 | then 345 | there 346 | therefore 347 | these 348 | they 349 | thing 350 | things 351 | think 352 | thinks 353 | this 354 | those 355 | though 356 | thought 357 | thoughts 358 | three 359 | through 360 | thus 361 | to 362 | today 363 | together 364 | too 365 | took 366 | toward 367 | turn 368 | turned 369 | turning 370 | turns 371 | two 372 | u 373 | under 374 | until 375 | up 376 | upon 377 | us 378 | use 379 | uses 380 | used 381 | v 382 | very 383 | w 384 | want 385 | wanted 386 | wanting 387 | wants 388 | was 389 | way 390 | ways 391 | we 392 | well 393 | wells 394 | went 395 | were 396 | what 397 | when 398 | where 399 | whether 400 | which 401 | while 402 | who 403 | whole 404 | whose 405 | why 406 | will 407 | with 408 | within 409 | without 410 | work 411 | worked 412 | working 413 | works 414 | would 415 | x 416 | y 417 | year 418 | years 419 | yet 420 | you 421 | young 422 | younger 423 | youngest 424 | your 425 | yours 426 | z 427 | -------------------------------------------------------------------------------- /SmartStoplist.txt: -------------------------------------------------------------------------------- 1 | #stop word list from SMART (Salton,1971). 
Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop 2 | a 3 | a's 4 | able 5 | about 6 | above 7 | according 8 | accordingly 9 | across 10 | actually 11 | after 12 | afterwards 13 | again 14 | against 15 | ain't 16 | all 17 | allow 18 | allows 19 | almost 20 | alone 21 | along 22 | already 23 | also 24 | although 25 | always 26 | am 27 | among 28 | amongst 29 | an 30 | and 31 | another 32 | any 33 | anybody 34 | anyhow 35 | anyone 36 | anything 37 | anyway 38 | anyways 39 | anywhere 40 | apart 41 | appear 42 | appreciate 43 | appropriate 44 | are 45 | aren't 46 | around 47 | as 48 | aside 49 | ask 50 | asking 51 | associated 52 | at 53 | available 54 | away 55 | awfully 56 | b 57 | be 58 | became 59 | because 60 | become 61 | becomes 62 | becoming 63 | been 64 | before 65 | beforehand 66 | behind 67 | being 68 | believe 69 | below 70 | beside 71 | besides 72 | best 73 | better 74 | between 75 | beyond 76 | both 77 | brief 78 | but 79 | by 80 | c 81 | c'mon 82 | c's 83 | came 84 | can 85 | can't 86 | cannot 87 | cant 88 | cause 89 | causes 90 | certain 91 | certainly 92 | changes 93 | clearly 94 | co 95 | com 96 | come 97 | comes 98 | concerning 99 | consequently 100 | consider 101 | considering 102 | contain 103 | containing 104 | contains 105 | corresponding 106 | could 107 | couldn't 108 | course 109 | currently 110 | d 111 | definitely 112 | described 113 | despite 114 | did 115 | didn't 116 | different 117 | do 118 | does 119 | doesn't 120 | doing 121 | don't 122 | done 123 | down 124 | downwards 125 | during 126 | e 127 | each 128 | edu 129 | eg 130 | eight 131 | either 132 | else 133 | elsewhere 134 | enough 135 | entirely 136 | especially 137 | et 138 | etc 139 | even 140 | ever 141 | every 142 | everybody 143 | everyone 144 | everything 145 | everywhere 146 | ex 147 | exactly 148 | example 149 | except 150 | f 151 | far 152 | few 153 | fifth 154 | first 155 | five 156 | followed 157 | following 158 | follows 159 | for 160 | former 161 | formerly 
162 | forth 163 | four 164 | from 165 | further 166 | furthermore 167 | g 168 | get 169 | gets 170 | getting 171 | given 172 | gives 173 | go 174 | goes 175 | going 176 | gone 177 | got 178 | gotten 179 | greetings 180 | h 181 | had 182 | hadn't 183 | happens 184 | hardly 185 | has 186 | hasn't 187 | have 188 | haven't 189 | having 190 | he 191 | he's 192 | hello 193 | help 194 | hence 195 | her 196 | here 197 | here's 198 | hereafter 199 | hereby 200 | herein 201 | hereupon 202 | hers 203 | herself 204 | hi 205 | him 206 | himself 207 | his 208 | hither 209 | hopefully 210 | how 211 | howbeit 212 | however 213 | i 214 | i'd 215 | i'll 216 | i'm 217 | i've 218 | ie 219 | if 220 | ignored 221 | immediate 222 | in 223 | inasmuch 224 | inc 225 | indeed 226 | indicate 227 | indicated 228 | indicates 229 | inner 230 | insofar 231 | instead 232 | into 233 | inward 234 | is 235 | isn't 236 | it 237 | it'd 238 | it'll 239 | it's 240 | its 241 | itself 242 | j 243 | just 244 | k 245 | keep 246 | keeps 247 | kept 248 | know 249 | knows 250 | known 251 | l 252 | last 253 | lately 254 | later 255 | latter 256 | latterly 257 | least 258 | less 259 | lest 260 | let 261 | let's 262 | like 263 | liked 264 | likely 265 | little 266 | look 267 | looking 268 | looks 269 | ltd 270 | m 271 | mainly 272 | many 273 | may 274 | maybe 275 | me 276 | mean 277 | meanwhile 278 | merely 279 | might 280 | more 281 | moreover 282 | most 283 | mostly 284 | much 285 | must 286 | my 287 | myself 288 | n 289 | name 290 | namely 291 | nd 292 | near 293 | nearly 294 | necessary 295 | need 296 | needs 297 | neither 298 | never 299 | nevertheless 300 | new 301 | next 302 | nine 303 | no 304 | nobody 305 | non 306 | none 307 | noone 308 | nor 309 | normally 310 | not 311 | nothing 312 | novel 313 | now 314 | nowhere 315 | o 316 | obviously 317 | of 318 | off 319 | often 320 | oh 321 | ok 322 | okay 323 | old 324 | on 325 | once 326 | one 327 | ones 328 | only 329 | onto 330 | or 331 | other 332 | others 
333 | otherwise 334 | ought 335 | our 336 | ours 337 | ourselves 338 | out 339 | outside 340 | over 341 | overall 342 | own 343 | p 344 | particular 345 | particularly 346 | per 347 | perhaps 348 | placed 349 | please 350 | plus 351 | possible 352 | presumably 353 | probably 354 | provides 355 | q 356 | que 357 | quite 358 | qv 359 | r 360 | rather 361 | rd 362 | re 363 | really 364 | reasonably 365 | regarding 366 | regardless 367 | regards 368 | relatively 369 | respectively 370 | right 371 | s 372 | said 373 | same 374 | saw 375 | say 376 | saying 377 | says 378 | second 379 | secondly 380 | see 381 | seeing 382 | seem 383 | seemed 384 | seeming 385 | seems 386 | seen 387 | self 388 | selves 389 | sensible 390 | sent 391 | serious 392 | seriously 393 | seven 394 | several 395 | shall 396 | she 397 | should 398 | shouldn't 399 | since 400 | six 401 | so 402 | some 403 | somebody 404 | somehow 405 | someone 406 | something 407 | sometime 408 | sometimes 409 | somewhat 410 | somewhere 411 | soon 412 | sorry 413 | specified 414 | specify 415 | specifying 416 | still 417 | sub 418 | such 419 | sup 420 | sure 421 | t 422 | t's 423 | take 424 | taken 425 | tell 426 | tends 427 | th 428 | than 429 | thank 430 | thanks 431 | thanx 432 | that 433 | that's 434 | thats 435 | the 436 | their 437 | theirs 438 | them 439 | themselves 440 | then 441 | thence 442 | there 443 | there's 444 | thereafter 445 | thereby 446 | therefore 447 | therein 448 | theres 449 | thereupon 450 | these 451 | they 452 | they'd 453 | they'll 454 | they're 455 | they've 456 | think 457 | third 458 | this 459 | thorough 460 | thoroughly 461 | those 462 | though 463 | three 464 | through 465 | throughout 466 | thru 467 | thus 468 | to 469 | together 470 | too 471 | took 472 | toward 473 | towards 474 | tried 475 | tries 476 | truly 477 | try 478 | trying 479 | twice 480 | two 481 | u 482 | un 483 | under 484 | unfortunately 485 | unless 486 | unlikely 487 | until 488 | unto 489 | up 490 | upon 491 | 
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.
# John Wiley and Sons, Ltd.
#
# The code below runs unchanged on Python 2 and Python 3.

import re
import operator

debug = False
test = True

# Characters that may appear inside a word; anything else separates words.
_WORD_SPLITTER = re.compile('[^a-zA-Z0-9_\\+\\-/]')

# Punctuation (and " - ") that ends a sentence fragment.
_SENTENCE_DELIMITERS = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')

# A pattern that can never match; used when the stop word list is empty.
_NEVER_MATCH = re.compile(r'(?!)')


def is_number(s):
    """
    Utility function to tell whether a string is a plain numeric literal.
    @param s The string to test.
    @return bool True if s parses as an int (or as a float when it contains a '.').
    """
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    # 'with' guarantees the file is closed even if reading fails.
    with open(stop_word_file) as stop_word_handle:
        for line in stop_word_handle:
            # Lines whose first non-blank character is '#' are comments.
            if line.strip().startswith("#"):
                continue
            stop_words.extend(line.split())  # in case more than one per line
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    @return list The lower-cased words, in order of appearance, excluding numbers.
    """
    words = []
    for single_word in _WORD_SPLITTER.split(text):
        current_word = single_word.strip().lower()
        # leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    @return list The fragments of text found between sentence delimiters.
    """
    return _SENTENCE_DELIMITERS.split(text)


def build_stop_word_regex(stop_word_file_path):
    """
    Build one case-insensitive regex that matches any stop word as a whole word.
    @param stop_word_file_path Path and file name of a file containing stop words.
    @return A compiled regular expression object.
    """
    stop_word_list = load_stop_words(stop_word_file_path)
    if not stop_word_list:
        # '|'.join([]) would give the empty pattern, which matches at every
        # position and would shred the text into single characters.
        return _NEVER_MATCH
    # re.escape keeps stop words containing regex metacharacters literal.
    # The look ahead stops a stop word from matching the head of a longer or
    # hyphenated word.
    stop_word_regex_list = [r'\b' + re.escape(word) + r'(?![\w-])' for word in stop_word_list]
    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)


def generate_candidate_keywords(sentence_list, stopword_pattern):
    """
    Split each sentence at stop words to obtain candidate keyword phrases.
    @param sentence_list The sentences, as returned by split_sentences.
    @param stopword_pattern The regex, as returned by build_stop_word_regex.
    @return list The lower-cased, non-empty candidate phrases.
    """
    phrase_list = []
    for s in sentence_list:
        # Every stop word becomes a '|' marker, then the sentence is cut there.
        tmp = re.sub(stopword_pattern, '|', s.strip())
        for phrase in tmp.split("|"):
            phrase = phrase.strip().lower()
            if phrase != "":
                phrase_list.append(phrase)
    return phrase_list


def calculate_word_scores(phraseList):
    """
    Score every word as deg(w) / freq(w).
    @param phraseList The candidate phrases, as returned by generate_candidate_keywords.
    @return dict Maps each word to its score (a float).
    """
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        # Each word co-occurs with every other word of its phrase.
        word_list_degree = len(word_list) - 1
        for word in word_list:
            word_frequency[word] = word_frequency.get(word, 0) + 1
            word_degree[word] = word_degree.get(word, 0) + word_list_degree

    word_score = {}
    for item, frequency in word_frequency.items():
        # deg(w) also counts the word's co-occurrence with itself.
        degree = word_degree[item] + frequency
        word_score[item] = degree / (frequency * 1.0)
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score):
    """
    Score every candidate phrase as the sum of the scores of its words.
    @param phrase_list The candidate phrases.
    @param word_score The word scores, as returned by calculate_word_scores.
    @return dict Maps each distinct phrase to its score.
    """
    keyword_candidates = {}
    for phrase in phrase_list:
        word_list = separate_words(phrase, 0)
        keyword_candidates[phrase] = sum(word_score[word] for word in word_list)
    return keyword_candidates


class Rake(object):
    """Keyword extractor bound to one stop word file."""

    def __init__(self, stop_words_path):
        """
        @param stop_words_path Path and file name of a file containing stop words.
        """
        self.stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        """
        Extract keywords from text.
        @param text The document to analyse.
        @return list (phrase, score) tuples sorted by descending score.
        """
        sentence_list = split_sentences(text)

        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)

        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        sorted_keywords = sorted(keyword_candidates.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_keywords


# The demo only runs when the file is executed as a script, so importing the
# module neither prints anything nor requires the stop list in the working
# directory. The 'test' flag can still be used to switch it off.
if test and __name__ == "__main__":
    text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

    # Split text into sentences
    sentenceList = split_sentences(text)
    #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
    stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug:
        print(keywordcandidates)

    sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
    if debug:
        print(sortedKeywords)

    totalKeywords = len(sortedKeywords)
    if debug:
        print(totalKeywords)
    # Floor division keeps the slice bound an int on Python 3.
    print(sortedKeywords[0:(totalKeywords // 3)])

    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    print(keywords)