├── rake_text-0.0.1.gem ├── rake_text.gemspec ├── LICENSE.txt ├── usage └── usage.rb ├── README.md ├── stoplists ├── FoxStoplist.txt └── SmartStoplist.txt └── lib └── rake_text.rb /rake_text-0.0.1.gem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nok/rake-text-ruby/HEAD/rake_text-0.0.1.gem -------------------------------------------------------------------------------- /rake_text.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |s| 2 | s.name = 'rake_text' 3 | s.version = '0.0.1' 4 | s.date = '2013-12-23' 5 | s.summary = "Rapid Automatic Keyword Extraction" 6 | s.description = "Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm in Ruby, a multi-word keywords extraction." 7 | s.authors = ["Darius Morawiec"] 8 | s.email = 'github@voidplus.de' 9 | s.files = ["lib/rake_text.rb"] 10 | s.homepage = 'https://github.com/voidplus/rake-text-ruby' 11 | s.license = 'MIT' 12 | end -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Darius Morawiec 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /usage/usage.rb: -------------------------------------------------------------------------------- 1 | require 'rake_text' 2 | 3 | 4 | text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types." 
5 | 6 | 7 | rake = RakeText.new 8 | 9 | 10 | # USE SMART STOPLIST 11 | rake.analyse text, RakeText.SMART 12 | # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural numbers"=>4.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.0, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667} 13 | 14 | # USE FOX STOPLIST 15 | rake.analyse text, RakeText.FOX 16 | # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural"=>1.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.5, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "corresponding algorithms"=>3.5, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667} 17 | 18 | # ACTIVATE VERBOSE 19 | rake.analyse text, RakeText.SMART, true 20 | # 8.67 - minimal generating sets 21 | # 8.50 - linear diophantine equations 22 | # 7.67 - minimal supporting set 23 | # 4.67 - minimal set 24 | # 4.50 - linear constraints 25 | # 4.00 - upper bounds 26 | # 4.00 - strict inequations 27 | # [...] 
28 | # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural numbers"=>4.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.0, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667} 29 | 30 | # USE CUSTOM STOPLIST 31 | rake.analyse text, ["custom","stopword","list"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAKE 2 | 3 | Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm in Ruby, a multi-word keywords extraction. 4 | 5 | > Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.), [Text Mining: Theory and Applications: John Wiley & Sons](http://books.google.de/books?id=u-SrKyUrafsC&lpg=PP1&hl=de&pg=PA1#v=onepage&q&f=false). 
6 | 7 | 8 | 9 | ## Installation 10 | 11 | Install [RAKE](https://rubygems.org/gems/rake_text) via RubyGem: 12 | 13 | ``` 14 | gem install rake_text 15 | ``` 16 | 17 | 18 | ## Usage 19 | 20 | Import the library and create an instance: 21 | 22 | ``` 23 | require 'rake_text' 24 | rake = RakeText.new 25 | ``` 26 | 27 | Use the Smart Stoplist: 28 | 29 | ``` 30 | rake.analyse text, RakeText.SMART 31 | # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural numbers"=>4.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.0, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667} 32 | ``` 33 | 34 | Use the Fox Stoplist: 35 | 36 | ``` 37 | rake.analyse text, RakeText.FOX 38 | # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural"=>1.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.5, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "corresponding algorithms"=>3.5, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667} 39 | ``` 40 | 41 | Use a custom stopword list: 42 | 43 | ``` 44 | rake.analyse text, ["custom","stopword","list"] 45 | ``` 46 | 47 | Show sorted results: 48 | 49 | ``` 50 | rake.analyse text, RakeText.SMART, true 51 | # 8.67 - minimal generating sets 
52 | # 8.50 - linear diophantine equations 53 | # 7.67 - minimal supporting set 54 | # 4.67 - minimal set 55 | # 4.50 - linear constraints 56 | # 4.00 - upper bounds 57 | # 4.00 - strict inequations 58 | # [...] 59 | # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural numbers"=>4.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.0, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667} 60 | ``` 61 | 62 | 63 | ## License 64 | 65 | The package is Open Source Software released under the [License](https://raw.github.com/voidplus/rake-text-ruby/master/LICENSE.txt). 
-------------------------------------------------------------------------------- /stoplists/FoxStoplist.txt: -------------------------------------------------------------------------------- 1 | #From "A stop list for general text" Fox 1989 2 | a 3 | about 4 | above 5 | across 6 | after 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | among 18 | an 19 | and 20 | another 21 | any 22 | anybody 23 | anyone 24 | anything 25 | anywhere 26 | are 27 | area 28 | areas 29 | around 30 | as 31 | ask 32 | asked 33 | asking 34 | asks 35 | at 36 | away 37 | b 38 | back 39 | backed 40 | backing 41 | backs 42 | be 43 | because 44 | became 45 | become 46 | becomes 47 | been 48 | before 49 | began 50 | behind 51 | being 52 | beings 53 | best 54 | better 55 | between 56 | big 57 | both 58 | but 59 | by 60 | c 61 | came 62 | can 63 | cannot 64 | case 65 | cases 66 | certain 67 | certainly 68 | clear 69 | clearly 70 | come 71 | could 72 | d 73 | did 74 | differ 75 | different 76 | differently 77 | do 78 | does 79 | done 80 | down 81 | downed 82 | downing 83 | downs 84 | during 85 | e 86 | each 87 | early 88 | either 89 | end 90 | ended 91 | ending 92 | ends 93 | enough 94 | even 95 | evenly 96 | ever 97 | every 98 | everybody 99 | everyone 100 | everything 101 | everywhere 102 | f 103 | face 104 | faces 105 | fact 106 | facts 107 | far 108 | felt 109 | few 110 | find 111 | finds 112 | first 113 | for 114 | four 115 | from 116 | full 117 | fully 118 | further 119 | furthered 120 | furthering 121 | furthers 122 | g 123 | gave 124 | general 125 | generally 126 | get 127 | gets 128 | give 129 | given 130 | gives 131 | go 132 | going 133 | good 134 | goods 135 | got 136 | great 137 | greater 138 | greatest 139 | group 140 | grouped 141 | grouping 142 | groups 143 | h 144 | had 145 | has 146 | have 147 | having 148 | he 149 | her 150 | herself 151 | here 152 | high 153 | higher 154 | highest 155 | him 156 | himself 157 | his 
158 | how 159 | however 160 | i 161 | if 162 | important 163 | in 164 | interest 165 | interested 166 | interesting 167 | interests 168 | into 169 | is 170 | it 171 | its 172 | itself 173 | j 174 | just 175 | k 176 | keep 177 | keeps 178 | kind 179 | knew 180 | know 181 | known 182 | knows 183 | l 184 | large 185 | largely 186 | last 187 | later 188 | latest 189 | least 190 | less 191 | let 192 | lets 193 | like 194 | likely 195 | long 196 | longer 197 | longest 198 | m 199 | made 200 | make 201 | making 202 | man 203 | many 204 | may 205 | me 206 | member 207 | members 208 | men 209 | might 210 | more 211 | most 212 | mostly 213 | mr 214 | mrs 215 | much 216 | must 217 | my 218 | myself 219 | n 220 | necessary 221 | need 222 | needed 223 | needing 224 | needs 225 | never 226 | new 227 | newer 228 | newest 229 | next 230 | no 231 | non 232 | not 233 | nobody 234 | noone 235 | nothing 236 | now 237 | nowhere 238 | number 239 | numbered 240 | numbering 241 | numbers 242 | o 243 | of 244 | off 245 | often 246 | old 247 | older 248 | oldest 249 | on 250 | once 251 | one 252 | only 253 | open 254 | opened 255 | opening 256 | opens 257 | or 258 | order 259 | ordered 260 | ordering 261 | orders 262 | other 263 | others 264 | our 265 | out 266 | over 267 | p 268 | part 269 | parted 270 | parting 271 | parts 272 | per 273 | perhaps 274 | place 275 | places 276 | point 277 | pointed 278 | pointing 279 | points 280 | possible 281 | present 282 | presented 283 | presenting 284 | presents 285 | problem 286 | problems 287 | put 288 | puts 289 | q 290 | quite 291 | r 292 | rather 293 | really 294 | right 295 | room 296 | rooms 297 | s 298 | said 299 | same 300 | saw 301 | say 302 | says 303 | second 304 | seconds 305 | see 306 | seem 307 | seemed 308 | seeming 309 | seems 310 | sees 311 | several 312 | shall 313 | she 314 | should 315 | show 316 | showed 317 | showing 318 | shows 319 | side 320 | sides 321 | since 322 | small 323 | smaller 324 | smallest 325 | so 326 | some 327 | 
somebody 328 | someone 329 | something 330 | somewhere 331 | state 332 | states 333 | still 334 | such 335 | sure 336 | t 337 | take 338 | taken 339 | than 340 | that 341 | the 342 | their 343 | them 344 | then 345 | there 346 | therefore 347 | these 348 | they 349 | thing 350 | things 351 | think 352 | thinks 353 | this 354 | those 355 | though 356 | thought 357 | thoughts 358 | three 359 | through 360 | thus 361 | to 362 | today 363 | together 364 | too 365 | took 366 | toward 367 | turn 368 | turned 369 | turning 370 | turns 371 | two 372 | u 373 | under 374 | until 375 | up 376 | upon 377 | us 378 | use 379 | uses 380 | used 381 | v 382 | very 383 | w 384 | want 385 | wanted 386 | wanting 387 | wants 388 | was 389 | way 390 | ways 391 | we 392 | well 393 | wells 394 | went 395 | were 396 | what 397 | when 398 | where 399 | whether 400 | which 401 | while 402 | who 403 | whole 404 | whose 405 | why 406 | will 407 | with 408 | within 409 | without 410 | work 411 | worked 412 | working 413 | works 414 | would 415 | x 416 | y 417 | year 418 | years 419 | yet 420 | you 421 | young 422 | younger 423 | youngest 424 | your 425 | yours 426 | z 427 | -------------------------------------------------------------------------------- /stoplists/SmartStoplist.txt: -------------------------------------------------------------------------------- 1 | #stop word list from SMART (Salton,1971). 
Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop 2 | a 3 | a's 4 | able 5 | about 6 | above 7 | according 8 | accordingly 9 | across 10 | actually 11 | after 12 | afterwards 13 | again 14 | against 15 | ain't 16 | all 17 | allow 18 | allows 19 | almost 20 | alone 21 | along 22 | already 23 | also 24 | although 25 | always 26 | am 27 | among 28 | amongst 29 | an 30 | and 31 | another 32 | any 33 | anybody 34 | anyhow 35 | anyone 36 | anything 37 | anyway 38 | anyways 39 | anywhere 40 | apart 41 | appear 42 | appreciate 43 | appropriate 44 | are 45 | aren't 46 | around 47 | as 48 | aside 49 | ask 50 | asking 51 | associated 52 | at 53 | available 54 | away 55 | awfully 56 | b 57 | be 58 | became 59 | because 60 | become 61 | becomes 62 | becoming 63 | been 64 | before 65 | beforehand 66 | behind 67 | being 68 | believe 69 | below 70 | beside 71 | besides 72 | best 73 | better 74 | between 75 | beyond 76 | both 77 | brief 78 | but 79 | by 80 | c 81 | c'mon 82 | c's 83 | came 84 | can 85 | can't 86 | cannot 87 | cant 88 | cause 89 | causes 90 | certain 91 | certainly 92 | changes 93 | clearly 94 | co 95 | com 96 | come 97 | comes 98 | concerning 99 | consequently 100 | consider 101 | considering 102 | contain 103 | containing 104 | contains 105 | corresponding 106 | could 107 | couldn't 108 | course 109 | currently 110 | d 111 | definitely 112 | described 113 | despite 114 | did 115 | didn't 116 | different 117 | do 118 | does 119 | doesn't 120 | doing 121 | don't 122 | done 123 | down 124 | downwards 125 | during 126 | e 127 | each 128 | edu 129 | eg 130 | eight 131 | either 132 | else 133 | elsewhere 134 | enough 135 | entirely 136 | especially 137 | et 138 | etc 139 | even 140 | ever 141 | every 142 | everybody 143 | everyone 144 | everything 145 | everywhere 146 | ex 147 | exactly 148 | example 149 | except 150 | f 151 | far 152 | few 153 | fifth 154 | first 155 | five 156 | followed 157 | following 158 | follows 159 | for 160 | former 161 | formerly 
162 | forth 163 | four 164 | from 165 | further 166 | furthermore 167 | g 168 | get 169 | gets 170 | getting 171 | given 172 | gives 173 | go 174 | goes 175 | going 176 | gone 177 | got 178 | gotten 179 | greetings 180 | h 181 | had 182 | hadn't 183 | happens 184 | hardly 185 | has 186 | hasn't 187 | have 188 | haven't 189 | having 190 | he 191 | he's 192 | hello 193 | help 194 | hence 195 | her 196 | here 197 | here's 198 | hereafter 199 | hereby 200 | herein 201 | hereupon 202 | hers 203 | herself 204 | hi 205 | him 206 | himself 207 | his 208 | hither 209 | hopefully 210 | how 211 | howbeit 212 | however 213 | i 214 | i'd 215 | i'll 216 | i'm 217 | i've 218 | ie 219 | if 220 | ignored 221 | immediate 222 | in 223 | inasmuch 224 | inc 225 | indeed 226 | indicate 227 | indicated 228 | indicates 229 | inner 230 | insofar 231 | instead 232 | into 233 | inward 234 | is 235 | isn't 236 | it 237 | it'd 238 | it'll 239 | it's 240 | its 241 | itself 242 | j 243 | just 244 | k 245 | keep 246 | keeps 247 | kept 248 | know 249 | knows 250 | known 251 | l 252 | last 253 | lately 254 | later 255 | latter 256 | latterly 257 | least 258 | less 259 | lest 260 | let 261 | let's 262 | like 263 | liked 264 | likely 265 | little 266 | look 267 | looking 268 | looks 269 | ltd 270 | m 271 | mainly 272 | many 273 | may 274 | maybe 275 | me 276 | mean 277 | meanwhile 278 | merely 279 | might 280 | more 281 | moreover 282 | most 283 | mostly 284 | much 285 | must 286 | my 287 | myself 288 | n 289 | name 290 | namely 291 | nd 292 | near 293 | nearly 294 | necessary 295 | need 296 | needs 297 | neither 298 | never 299 | nevertheless 300 | new 301 | next 302 | nine 303 | no 304 | nobody 305 | non 306 | none 307 | noone 308 | nor 309 | normally 310 | not 311 | nothing 312 | novel 313 | now 314 | nowhere 315 | o 316 | obviously 317 | of 318 | off 319 | often 320 | oh 321 | ok 322 | okay 323 | old 324 | on 325 | once 326 | one 327 | ones 328 | only 329 | onto 330 | or 331 | other 332 | others 
333 | otherwise 334 | ought 335 | our 336 | ours 337 | ourselves 338 | out 339 | outside 340 | over 341 | overall 342 | own 343 | p 344 | particular 345 | particularly 346 | per 347 | perhaps 348 | placed 349 | please 350 | plus 351 | possible 352 | presumably 353 | probably 354 | provides 355 | q 356 | que 357 | quite 358 | qv 359 | r 360 | rather 361 | rd 362 | re 363 | really 364 | reasonably 365 | regarding 366 | regardless 367 | regards 368 | relatively 369 | respectively 370 | right 371 | s 372 | said 373 | same 374 | saw 375 | say 376 | saying 377 | says 378 | second 379 | secondly 380 | see 381 | seeing 382 | seem 383 | seemed 384 | seeming 385 | seems 386 | seen 387 | self 388 | selves 389 | sensible 390 | sent 391 | serious 392 | seriously 393 | seven 394 | several 395 | shall 396 | she 397 | should 398 | shouldn't 399 | since 400 | six 401 | so 402 | some 403 | somebody 404 | somehow 405 | someone 406 | something 407 | sometime 408 | sometimes 409 | somewhat 410 | somewhere 411 | soon 412 | sorry 413 | specified 414 | specify 415 | specifying 416 | still 417 | sub 418 | such 419 | sup 420 | sure 421 | t 422 | t's 423 | take 424 | taken 425 | tell 426 | tends 427 | th 428 | than 429 | thank 430 | thanks 431 | thanx 432 | that 433 | that's 434 | thats 435 | the 436 | their 437 | theirs 438 | them 439 | themselves 440 | then 441 | thence 442 | there 443 | there's 444 | thereafter 445 | thereby 446 | therefore 447 | therein 448 | theres 449 | thereupon 450 | these 451 | they 452 | they'd 453 | they'll 454 | they're 455 | they've 456 | think 457 | third 458 | this 459 | thorough 460 | thoroughly 461 | those 462 | though 463 | three 464 | through 465 | throughout 466 | thru 467 | thus 468 | to 469 | together 470 | too 471 | took 472 | toward 473 | towards 474 | tried 475 | tries 476 | truly 477 | try 478 | trying 479 | twice 480 | two 481 | u 482 | un 483 | under 484 | unfortunately 485 | unless 486 | unlikely 487 | until 488 | unto 489 | up 490 | upon 491 | 
# Rapid Automatic Keyword Extraction (RAKE) for plain text.
#
# The text is cut into "sentences" at punctuation, each sentence is cut into
# candidate phrases at stop words, every word gets the score
# degree(word) / frequency(word), and every candidate phrase is scored with
# the sum of the scores of its words.
#
# Usage:
#   rake = RakeText.new
#   rake.analyse(text, RakeText.SMART)        # => { "phrase" => score, ... }
#   rake.analyse(text, RakeText.FOX, true)    # additionally prints a ranking
#   rake.analyse(text, ["custom", "stopword", "list"])
class RakeText

  # Stop word list from SMART (Salton, 1971).
  @@stoplist_smart = %w[
    a a's able about above according accordingly across actually after
    afterwards again against ain't all allow allows almost alone along already
    also although always am among amongst an and another any anybody anyhow
    anyone anything anyway anyways anywhere apart appear appreciate appropriate
    are aren't around as aside ask asking associated at available away awfully
    b be became because become becomes becoming been before beforehand behind
    being believe below beside besides best better between beyond both brief
    but by
    c c'mon c's came can can't cannot cant cause causes certain certainly
    changes clearly co com come comes concerning consequently consider
    considering contain containing contains corresponding could couldn't
    course currently
    d definitely described despite did didn't different do does doesn't doing
    don't done down downwards during
    e each edu eg eight either else elsewhere enough entirely especially et
    etc even ever every everybody everyone everything everywhere ex exactly
    example except
    f far few fifth first five followed following follows for former formerly
    forth four from further furthermore
    g get gets getting given gives go goes going gone got gotten greetings
    h had hadn't happens hardly has hasn't have haven't having he he's hello
    help hence her here here's hereafter hereby herein hereupon hers herself
    hi him himself his hither hopefully how howbeit however
    i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed
    indicate indicated indicates inner insofar instead into inward is isn't
    it it'd it'll it's its itself
    j just
    k keep keeps kept know knows known
    l last lately later latter latterly least less lest let let's like liked
    likely little look looking looks ltd
    m mainly many may maybe me mean meanwhile merely might more moreover most
    mostly much must my myself
    n name namely nd near nearly necessary need needs neither never
    nevertheless new next nine no nobody non none noone nor normally not
    nothing novel now nowhere
    o obviously of off often oh ok okay old on once one ones only onto or
    other others otherwise ought our ours ourselves out outside over overall
    own
    p particular particularly per perhaps placed please plus possible
    presumably probably provides
    q que quite qv
    r rather rd re really reasonably regarding regardless regards relatively
    respectively right
    s said same saw say saying says second secondly see seeing seem seemed
    seeming seems seen self selves sensible sent serious seriously seven
    several shall she should shouldn't since six so some somebody somehow
    someone something sometime sometimes somewhat somewhere soon sorry
    specified specify specifying still sub such sup sure
    t t's take taken tell tends th than thank thanks thanx that that's thats
    the their theirs them themselves then thence there there's thereafter
    thereby therefore therein theres thereupon these they they'd they'll
    they're they've think third this thorough thoroughly those though three
    through throughout thru thus to together too took toward towards tried
    tries truly try trying twice two
    u un under unfortunately unless unlikely until unto up upon us use used
    useful uses using usually uucp
    v value various very via viz vs
    w want wants was wasn't way we we'd we'll we're we've welcome well went
    were weren't what what's whatever when whence whenever where where's
    whereafter whereas whereby wherein whereupon wherever whether which while
    whither who who's whoever whole whom whose why will willing wish with
    within without won't wonder would would wouldn't
    x y yes yet you you'd you'll you're you've your yours yourself yourselves
    z zero
  ]

  # Stop word list from "A stop list for general text" (Fox, 1989).
  @@stoplist_fox = %w[
    a about above across after again against all almost alone along already
    also although always among an and another any anybody anyone anything
    anywhere are area areas around as ask asked asking asks at away
    b back backed backing backs be because became become becomes been before
    began behind being beings best better between big both but by
    c came can cannot case cases certain certainly clear clearly come could
    d did differ different differently do does done down downed downing downs
    during
    e each early either end ended ending ends enough even evenly ever every
    everybody everyone everything everywhere
    f face faces fact facts far felt few find finds first for four from full
    fully further furthered furthering furthers
    g gave general generally get gets give given gives go going good goods
    got great greater greatest group grouped grouping groups
    h had has have having he her herself here high higher highest him himself
    his how however
    i if important in interest interested interesting interests into is it
    its itself
    j just
    k keep keeps kind knew know known knows
    l large largely last later latest least less let lets like likely long
    longer longest
    m made make making man many may me member members men might more most
    mostly mr mrs much must my myself
    n necessary need needed needing needs never new newer newest next no non
    not nobody noone nothing now nowhere number numbered numbering numbers
    o of off often old older oldest on once one only open opened opening
    opens or order ordered ordering orders other others our out over
    p part parted parting parts per perhaps place places point pointed
    pointing points possible present presented presenting presents problem
    problems put puts
    q quite
    r rather really right room rooms
    s said same saw say says second seconds see seem seemed seeming seems
    sees several shall she should show showed showing shows side sides since
    small smaller smallest so some somebody someone something somewhere state
    states still such sure
    t take taken than that the their them then there therefore these they
    thing things think thinks this those though thought thoughts three
    through thus to today together too took toward turn turned turning turns
    two
    u under until up upon us use uses used
    v very
    w want wanted wanting wants was way ways we well wells went were what
    when where whether which while who whole whose why will with within
    without work worked working works would
    x y year years yet you young younger youngest your yours
    z
  ]

  # Characters that end a "sentence": . ! ? , ; : TAB - " ( ) ' plus the
  # typographic apostrophe (U+2019) and the en dash (U+2013).
  # The previous literal spelled the hyphen as `\\-\\`, which Ruby reads as a
  # backslash-to-backslash range: the hyphen was never a delimiter and a stray
  # backslash was.
  SENTENCE_DELIMITERS = /[.!?,;:\t\-"()'\u2019\u2013]/u

  # Anything that is not a letter, digit, underscore, plus or hyphen
  # separates two words inside a phrase (ASCII only).
  WORD_SEPARATORS = /[^a-zA-Z0-9_+\-]/

  # A pattern that can never match; used when the stoplist has no usable
  # entry, because an empty pattern would match at every position.
  NEVER_MATCHES = /(?!)/

  private_constant :SENTENCE_DELIMITERS, :WORD_SEPARATORS, :NEVER_MATCHES

  # Returns the built-in SMART stop word list (Array of String).
  def self.SMART
    @@stoplist_smart
  end

  # Returns the built-in Fox stop word list (Array of String).
  def self.FOX
    @@stoplist_fox
  end

  # Extracts candidate keywords from +text+ and scores them.
  #
  # text     - String to analyse.
  # stoplist - Array of stop words (for example RakeText.SMART, RakeText.FOX
  #            or a custom list). Entries are matched literally, as whole
  #            words and case-insensitively.
  # verbose  - when exactly +true+, prints one "score - phrase" line per
  #            candidate to standard output, highest score first.
  #
  # Returns a Hash mapping each lower-cased candidate phrase to its score.
  # Looking up a phrase that is not a candidate yields 0.
  def analyse(text, stoplist, verbose = false)
    pattern = buildStopwordRegExPattern(stoplist)
    sentences = text.split(SENTENCE_DELIMITERS)
    phrases = generateCandidateKeywords(sentences, pattern)
    wordscores = calculateWordScores(phrases)
    candidates = generateCandidateKeywordScores(phrases, wordscores)

    if verbose == true
      candidates.sort_by { |_phrase, score| score }.reverse_each do |phrase, score|
        puts format('%.2f - %s', score, phrase)
      end
    end

    candidates
  end

  private

  # Step 1: builds one case-insensitive regular expression that matches any
  # stop word as a whole word.
  #
  # Every entry is converted to a String, stripped and escaped, so that
  # entries containing regex metacharacters ("(", "a.b", "c++") are matched
  # literally instead of raising or matching unrelated text. Blank and nil
  # entries are ignored.
  def buildStopwordRegExPattern(words)
    alternatives = Array(words).map { |word| word.to_s.strip }
                               .reject(&:empty?)
                               .map { |word| '\\b' + Regexp.escape(word) + '\\b' }
    return NEVER_MATCHES if alternatives.empty?

    Regexp.new(alternatives.join('|'), Regexp::IGNORECASE)
  end

  # Step 2: cuts every sentence into candidate phrases at the stop words.
  #
  # Stop words are replaced by "|" and the sentence is split there, so a
  # literal "|" already present in the sentence also acts as a boundary.
  # Phrases are stripped and lower-cased; empty ones are dropped.
  def generateCandidateKeywords(sentences, pattern)
    phrases = []

    sentences.each do |sentence|
      sentence.strip.gsub(pattern, '|').split('|').each do |part|
        phrase = part.strip.downcase
        phrases.push(phrase) unless phrase.empty?
      end
    end

    phrases
  end

  # Step 3: scores every word as degree / frequency.
  #
  # frequency - number of phrase occurrences containing the word.
  # degree    - for each occurrence, the number of other words in that
  #             phrase, plus the frequency itself.
  #
  # Returns a Hash (default 0) mapping word to Float score.
  def calculateWordScores(phrases)
    word_freq = Hash.new(0)
    word_degree = Hash.new(0)

    phrases.each do |phrase|
      words = seperateWords(phrase)
      neighbours = words.length - 1

      words.each do |word|
        word_freq[word] += 1
        word_degree[word] += neighbours
      end
    end

    word_score = Hash.new(0)
    word_freq.each do |word, freq|
      word_score[word] = (word_degree[word] + freq) / freq.to_f
    end

    word_score
  end

  # Step 4: scores every phrase with the sum of the scores of its words.
  #
  # The sum is accumulated left to right starting from Integer 0, so a phrase
  # without any countable word (for example a bare number) scores 0.
  #
  # Returns a Hash (default 0) mapping phrase to score.
  def generateCandidateKeywordScores(phrases, scores)
    candidates = Hash.new(0)

    phrases.each do |phrase|
      candidates[phrase] = seperateWords(phrase).inject(0) do |total, word|
        total + scores[word]
      end
    end

    candidates
  end

  # Splits +text+ into lower-cased words, dropping empty tokens and tokens
  # that Kernel#Float accepts as a number.
  def seperateWords(text)
    text.split(WORD_SEPARATORS)
        .map { |token| token.strip.downcase }
        .reject { |token| token.empty? || numeric?(token) }
  end

  # True when Kernel#Float can parse +token+.
  def numeric?(token)
    Float(token)
    true
  rescue ArgumentError, TypeError
    false
  end

end