├── .gitignore ├── LICENSE ├── README.md ├── pom.xml ├── src ├── main │ ├── FoxStopListEn │ ├── RakePunctDefaultStopList │ ├── SmartStopListEn │ ├── SpanishCustomEs │ └── java │ │ └── edu │ │ └── ehu │ │ └── galan │ │ └── rake │ │ ├── RakeAlgorithm.java │ │ └── model │ │ ├── AbstractAlgorithm.java │ │ ├── Document.java │ │ ├── Term.java │ │ └── Token.java └── test │ └── java │ └── edu │ └── ehu │ └── galan │ └── rake │ └── AppTest.java └── stopLists ├── FoxStopListEn ├── RakePunctDefaultStopList ├── SmartStopListEn └── SpanishCustomEs /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | /nb-configuration.xml 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Angel Conde 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | RAKE-Java 2 | ===================== 3 | 4 | A Java 8 implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm as described in: Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.), Text Mining: Theory and Applications: John Wiley & Sons. 5 | 6 | The implementation is based on the Python one from https://github.com/aneesha/RAKE (however, some changes have been made). 7 | The source code is released under the GPL v3 license. 8 | 9 | Add this repository to your pom.xml if you want to use it with Maven: 10 | ````xml 11 | <repository> 12 | <id>galan-maven-repo</id> 13 | <name>galan-maven-repo-releases</name> 14 | <url>http://galan.ehu.es/artifactory/ext-release-local</url> 15 | </repository> 16 | 17 | ```` 18 | 19 | This implementation requires a POS tagger in order to work. For example, the Illinois POS tagger could be used for English. 
20 | 21 | 22 | http://cogcomp.cs.illinois.edu/page/software_view/POS 23 | 24 | For Spanish or other languages: 25 | 26 | FreeLing --> http://nlp.lsi.upc.edu/freeling/ 27 | 28 | or Stanford POS tagger --> http://nlp.stanford.edu/software/tagger.shtml 29 | 30 | 31 | The implementation is in a beta state. 32 | 33 | TODO: 34 | 35 | - More testing 36 | 37 | 38 | Below is an example parser for English that provides the required data (using the Illinois POS Tagger): 39 | 40 | 41 | ```java 42 | 43 | import LBJ2.nlp.SentenceSplitter; 44 | import LBJ2.nlp.WordSplitter; 45 | import LBJ2.nlp.seg.PlainToTokenParser; 46 | import LBJ2.parse.Parser; 47 | import edu.illinois.cs.cogcomp.lbj.chunk.Chunker; 48 | import edu.illinois.cs.cogcomp.lbj.pos.POSTagger; 49 | import edu.ehu.galan.rake.model.Token; 50 | ...... 51 | 52 | List<LinkedList<Token>> tokenizedSentenceList; 53 | List<String> sentenceList; 54 | POSTagger tagger = new POSTagger(); 55 | Chunker chunker = new Chunker(); 56 | boolean first = true; 57 | parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(pFile))); 58 | String sentence = ""; 59 | LinkedList<Token> tokenList = null; 60 | for (LBJ2.nlp.seg.Token word = (LBJ2.nlp.seg.Token) parser.next(); word != null; 61 | word = (LBJ2.nlp.seg.Token) parser.next()) { 62 | String chunked = chunker.discreteValue(word); 63 | tagger.discreteValue(word); 64 | if (first) { 65 | tokenList = new LinkedList<>(); 66 | tokenizedSentenceList.add(tokenList); 67 | first = false; 68 | } 69 | tokenList.add(new Token(word.form, word.partOfSpeech, null, chunked)); 70 | sentence = sentence + " " + (word.form); 71 | if (word.next == null) { 72 | sentenceList.add(sentence); 73 | first = true; 74 | sentence = ""; 75 | } 76 | } 77 | parser.reset(); 78 | 79 | ``` 80 | 81 | RAKE can then be run on the parsed data as follows: 
82 | 83 | 84 | ```java 85 | 86 | Document doc=new Document(full_path,name); 87 | doc.setSentenceList(sentences); 88 | doc.setTokenList(tokenized_sentences); 89 | RakeAlgorithm ex = new RakeAlgorithm(); 90 | ex.loadStopWordsList("resources/lite/stopWordLists/RakeStopLists/SmartStopListEn"); 91 | ex.loadPunctStopWord("resources/lite/stopWordLists/RakeStopLists/RakePunctDefaultStopList"); 92 | PlainTextDocumentReaderLBJEn parser = new PlainTextDocumentReaderLBJEn(); 93 | parser.readSource("testCorpus/textAstronomy"); 94 | Document doc = new Document("full_path", "name"); 95 | ex.init(doc); 96 | ex.runAlgorithm(); 97 | doc.getTermList(); 98 | ``` 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | edu.ehu.galan.rake 6 | RAKE 7 | 1.0 8 | jar 9 | 10 | 11 | 12 | false 13 | 14 | central 15 | bintray-plugins 16 | http://jcenter.bintray.com 17 | 18 | 19 | RAKE 20 | http://maven.apache.org 21 | 22 | 23 | 24 | org.apache.maven.plugins 25 | maven-compiler-plugin 26 | 2.3.2 27 | 28 | 1.8 29 | 1.8 30 | 31 | 32 | 33 | org.jfrog.buildinfo 34 | artifactory-maven-plugin 35 | 2.2.2 36 | false 37 | 38 | 39 | build-info 40 | 41 | publish 42 | 43 | 44 | 45 | http://sips72.si.ehu.es:8080/artifactory/ 46 | admin 47 | 48 | ext-release-local 49 | ext-snapshots-local 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | UTF-8 59 | 60 | 61 | 62 | junit 63 | junit 64 | 3.8.1 65 | test 66 | 67 | 68 | com.google.guava 69 | guava 70 | 15.0 71 | jar 72 | 73 | 74 | org.slf4j 75 | slf4j-api 76 | 1.6.6 77 | jar 78 | 79 | 80 | com.google.code.gson 81 | gson 82 | 2.2.4 83 | jar 84 | 85 | 86 | commons-io 87 | commons-io 88 | 2.4 89 | jar 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /src/main/FoxStopListEn: -------------------------------------------------------------------------------- 1 | a 2 | 
about 3 | above 4 | across 5 | after 6 | again 7 | against 8 | all 9 | almost 10 | alone 11 | along 12 | already 13 | also 14 | although 15 | always 16 | among 17 | an 18 | and 19 | another 20 | any 21 | anybody 22 | anyone 23 | anything 24 | anywhere 25 | are 26 | area 27 | areas 28 | around 29 | as 30 | ask 31 | asked 32 | asking 33 | asks 34 | at 35 | away 36 | b 37 | back 38 | backed 39 | backing 40 | backs 41 | be 42 | because 43 | became 44 | become 45 | becomes 46 | been 47 | before 48 | began 49 | behind 50 | being 51 | beings 52 | best 53 | better 54 | between 55 | big 56 | both 57 | but 58 | by 59 | c 60 | came 61 | can 62 | cannot 63 | case 64 | cases 65 | certain 66 | certainly 67 | clear 68 | clearly 69 | come 70 | could 71 | d 72 | did 73 | differ 74 | different 75 | differently 76 | do 77 | does 78 | done 79 | down 80 | downed 81 | downing 82 | downs 83 | during 84 | e 85 | each 86 | early 87 | either 88 | end 89 | ended 90 | ending 91 | ends 92 | enough 93 | even 94 | evenly 95 | ever 96 | every 97 | everybody 98 | everyone 99 | everything 100 | everywhere 101 | f 102 | face 103 | faces 104 | fact 105 | facts 106 | far 107 | felt 108 | few 109 | find 110 | finds 111 | first 112 | for 113 | four 114 | from 115 | full 116 | fully 117 | further 118 | furthered 119 | furthering 120 | furthers 121 | g 122 | gave 123 | general 124 | generally 125 | get 126 | gets 127 | give 128 | given 129 | gives 130 | go 131 | going 132 | good 133 | goods 134 | got 135 | great 136 | greater 137 | greatest 138 | group 139 | grouped 140 | grouping 141 | groups 142 | h 143 | had 144 | has 145 | have 146 | having 147 | he 148 | her 149 | herself 150 | here 151 | high 152 | higher 153 | highest 154 | him 155 | himself 156 | his 157 | how 158 | however 159 | i 160 | if 161 | important 162 | in 163 | interest 164 | interested 165 | interesting 166 | interests 167 | into 168 | is 169 | it 170 | its 171 | itself 172 | j 173 | just 174 | k 175 | keep 176 | keeps 177 | kind 178 | 
knew 179 | know 180 | known 181 | knows 182 | l 183 | large 184 | largely 185 | last 186 | later 187 | latest 188 | least 189 | less 190 | let 191 | lets 192 | like 193 | likely 194 | long 195 | longer 196 | longest 197 | m 198 | made 199 | make 200 | making 201 | man 202 | many 203 | may 204 | me 205 | member 206 | members 207 | men 208 | might 209 | more 210 | most 211 | mostly 212 | mr 213 | mrs 214 | much 215 | must 216 | my 217 | myself 218 | n 219 | necessary 220 | need 221 | needed 222 | needing 223 | needs 224 | never 225 | new 226 | newer 227 | newest 228 | next 229 | no 230 | non 231 | not 232 | nobody 233 | noone 234 | nothing 235 | now 236 | nowhere 237 | number 238 | numbered 239 | numbering 240 | numbers 241 | o 242 | of 243 | off 244 | often 245 | old 246 | older 247 | oldest 248 | on 249 | once 250 | one 251 | only 252 | open 253 | opened 254 | opening 255 | opens 256 | or 257 | order 258 | ordered 259 | ordering 260 | orders 261 | other 262 | others 263 | our 264 | out 265 | over 266 | p 267 | part 268 | parted 269 | parting 270 | parts 271 | per 272 | perhaps 273 | place 274 | places 275 | point 276 | pointed 277 | pointing 278 | points 279 | possible 280 | present 281 | presented 282 | presenting 283 | presents 284 | problem 285 | problems 286 | put 287 | puts 288 | q 289 | quite 290 | r 291 | rather 292 | really 293 | right 294 | room 295 | rooms 296 | s 297 | said 298 | same 299 | saw 300 | say 301 | says 302 | second 303 | seconds 304 | see 305 | seem 306 | seemed 307 | seeming 308 | seems 309 | sees 310 | several 311 | shall 312 | she 313 | should 314 | show 315 | showed 316 | showing 317 | shows 318 | side 319 | sides 320 | since 321 | small 322 | smaller 323 | smallest 324 | so 325 | some 326 | somebody 327 | someone 328 | something 329 | somewhere 330 | state 331 | states 332 | still 333 | such 334 | sure 335 | t 336 | take 337 | taken 338 | than 339 | that 340 | the 341 | their 342 | them 343 | then 344 | there 345 | therefore 346 | these 
347 | they 348 | thing 349 | things 350 | think 351 | thinks 352 | this 353 | those 354 | though 355 | thought 356 | thoughts 357 | three 358 | through 359 | thus 360 | to 361 | today 362 | together 363 | too 364 | took 365 | toward 366 | turn 367 | turned 368 | turning 369 | turns 370 | two 371 | u 372 | under 373 | until 374 | up 375 | upon 376 | us 377 | use 378 | uses 379 | used 380 | v 381 | very 382 | w 383 | want 384 | wanted 385 | wanting 386 | wants 387 | was 388 | way 389 | ways 390 | we 391 | well 392 | wells 393 | went 394 | were 395 | what 396 | when 397 | where 398 | whether 399 | which 400 | while 401 | who 402 | whole 403 | whose 404 | why 405 | will 406 | with 407 | within 408 | without 409 | work 410 | worked 411 | working 412 | works 413 | would 414 | x 415 | y 416 | year 417 | years 418 | yet 419 | you 420 | young 421 | younger 422 | youngest 423 | your 424 | yours 425 | z -------------------------------------------------------------------------------- /src/main/RakePunctDefaultStopList: -------------------------------------------------------------------------------- 1 | . 2 | / 3 | , 4 | ! 5 | ? 
6 | { 7 | } 8 | [ 9 | ] 10 | ; 11 | : 12 | ( 13 | ) 14 | - 15 | _ 16 | @ -------------------------------------------------------------------------------- /src/main/SmartStopListEn: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 
145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | 
nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | 
tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero 572 | true 573 | false 574 | additional 575 | shown -------------------------------------------------------------------------------- /src/main/SpanishCustomEs: -------------------------------------------------------------------------------- 1 | algún 2 | alguna 3 | algunas 4 | alguno 5 | algunos 6 | ambos 7 | ampleamos 8 | ante 9 | antes 10 | aquel 11 | aquellas 12 | aquellos 13 | aqui 14 | arriba 15 | atras 16 | bajo 17 | bastante 18 | bien 19 | cada 20 | cierta 21 | ciertas 22 | cierto 23 | ciertos 24 | como 25 | con 26 | conseguimos 27 | conseguir 28 | consigo 29 | consigue 30 | consiguen 31 | consigues 32 | cual 33 | cuando 34 | dentro 35 | desde 36 | donde 37 | dos 38 | el 39 | ellas 40 | ellos 41 | empleais 42 | emplean 43 | emplear 44 | empleas 45 | empleo 46 | en 47 | encima 48 | entonces 49 | entre 50 
| era 51 | eramos 52 | eran 53 | eras 54 | eres 55 | es 56 | esta 57 | estaba 58 | estado 59 | estais 60 | estamos 61 | estan 62 | estoy 63 | fin 64 | fue 65 | fueron 66 | fui 67 | fuimos 68 | gueno 69 | ha 70 | hace 71 | haceis 72 | hacemos 73 | hacen 74 | hacer 75 | haces 76 | hago 77 | incluso 78 | intenta 79 | intentais 80 | intentamos 81 | intentan 82 | intentar 83 | intentas 84 | intento 85 | ir 86 | la 87 | largo 88 | las 89 | lo 90 | los 91 | mientras 92 | mio 93 | modo 94 | muchos 95 | muy 96 | nos 97 | nosotros 98 | otro 99 | para 100 | pero 101 | podeis 102 | podemos 103 | poder 104 | podria 105 | podriais 106 | podriamos 107 | podrian 108 | podrias 109 | por 110 | por qué 111 | porque 112 | primero 113 | puede 114 | pueden 115 | puedo 116 | quien 117 | sabe 118 | sabeis 119 | sabemos 120 | saben 121 | saber 122 | sabes 123 | ser 124 | si 125 | siendo 126 | sin 127 | sobre 128 | sois 129 | solamente 130 | solo 131 | somos 132 | soy 133 | su 134 | sus 135 | también 136 | teneis 137 | tenemos 138 | tener 139 | tengo 140 | tiempo 141 | tiene 142 | tienen 143 | todo 144 | trabaja 145 | trabajais 146 | trabajamos 147 | trabajan 148 | trabajar 149 | trabajas 150 | trabajo 151 | tras 152 | tuyo 153 | ultimo 154 | un 155 | una 156 | unas 157 | uno 158 | unos 159 | usa 160 | usais 161 | usamos 162 | usan 163 | usar 164 | usas 165 | uso 166 | va 167 | vais 168 | valor 169 | vamos 170 | van 171 | vaya 172 | verdad 173 | verdadera 174 | verdadero 175 | vosotras 176 | vosotros 177 | voy 178 | yo 179 | -------------------------------------------------------------------------------- /src/main/java/edu/ehu/galan/rake/RakeAlgorithm.java: -------------------------------------------------------------------------------- 1 | package edu.ehu.galan.rake; 2 | 3 | /* 4 | * RakeAlgorithm.java 5 | * Copyright (C) 2014 Angel Conde, neuw84 at gmail dot com 6 | * 7 | * This program is free software; you can redistribute it and/or modify 8 | * it under the terms of the GNU General 
Public License as published by 9 | * the Free Software Foundation; either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program; if not, write to the Free Software 19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 | */ 21 | 22 | 23 | import edu.ehu.galan.rake.model.AbstractAlgorithm; 24 | import edu.ehu.galan.rake.model.Document; 25 | import edu.ehu.galan.rake.model.Term; 26 | import java.io.IOException; 27 | import java.nio.charset.StandardCharsets; 28 | import java.nio.file.Files; 29 | import java.nio.file.Paths; 30 | import java.util.ArrayList; 31 | import java.util.Arrays; 32 | import java.util.Comparator; 33 | import java.util.HashMap; 34 | import java.util.List; 35 | import java.util.Map; 36 | import java.util.regex.Matcher; 37 | import java.util.regex.Pattern; 38 | import static java.util.stream.Collectors.toList; 39 | import org.slf4j.Logger; 40 | import org.slf4j.LoggerFactory; 41 | 42 | /** 43 | * 44 | * An implementation of RAKE (Rapid Automatic Keyword Extraction): 45 | * Rose, Stuart, et al. "Automatic keyword extraction from individual 46 | * documents." Text Mining (2010): 1-20. 47 | * 48 | * 49 | * This implementation is based on JATE https://code.google.com/p/jatetoolkit/ 50 | * and on https://github.com/aneesha/RAKE; it gives results similar to the 51 | * Python script, provided a good stopword list and a punctuation list are supplied. 52 | * 53 | * Numbers are handled using the JATE method. The algorithm 54 | * expects the punctuation marks to be separated by whitespace, e.g. 
55 | * " The red table , that is in front of you , is mine . " 56 | * To achieve this you should use a parser like OpenNLP, Illinois POS Tagger, 57 | * Freeling parsers etc. 58 | * 59 | * 60 | * TODO: use POS tags to avoid verbs and other unwanted type of words in the 61 | * process of keyword generation 62 | * 63 | * @author Angel Conde Manjon 64 | */ 65 | 66 | public class RakeAlgorithm extends AbstractAlgorithm { 67 | 68 | private transient Document doc = null; 69 | private final transient List termList; 70 | private List stopWordList; 71 | transient private final Logger logger = LoggerFactory.getLogger(this.getClass()); 72 | private List regexList = null; 73 | private List punctList; 74 | private int minNumberOfletters = 2; 75 | 76 | /** 77 | * 78 | */ 79 | public RakeAlgorithm() { 80 | super(true, "RAKE"); 81 | termList = super.getTermList(); 82 | stopWordList = new ArrayList<>(); 83 | regexList = new ArrayList<>(); 84 | punctList = new ArrayList<>(); 85 | } 86 | 87 | @Override 88 | public void init(Document pDoc, String pPropsDir) { 89 | setDoc(pDoc); 90 | doc = pDoc; 91 | } 92 | 93 | /** 94 | * This methods requires a list of stopwords to build a the candidate list, 95 | * will search in each different sentence for this stopwords to delimite the 96 | * candidate generation 97 | * 98 | * 99 | * @param pStopWords - a list of stopWords 100 | */ 101 | public void loadStopWordsList(List pStopWords) { 102 | stopWordList = pStopWords; 103 | } 104 | 105 | /** 106 | * This method requires a list of stopwords to build a the candidate list, 107 | * will search in each different sentence for this stopwords to delimite the 108 | * candidate generation 109 | * 110 | * 111 | * @param pLoc - the location of the file where the stopwords are 112 | */ 113 | public void loadStopWordsList(String pLoc) { 114 | List stops = new ArrayList<>(); 115 | try { 116 | List words = Files.readAllLines(Paths.get(pLoc), StandardCharsets.UTF_8); 117 | for (String string : words) { 118 | 
stops.add(string.trim()); 119 | } 120 | stopWordList = stops; 121 | } catch (IOException ex) { 122 | logger.error("Error loading RAKE stopWordList from: " + pLoc, ex); 123 | } 124 | } 125 | 126 | /** 127 | * As this method uses Regex for candidate generation, custom regex 128 | * expresions could be added using this method (uses Java Pattern/Matcher 129 | * mechanism) 130 | * 131 | * @param pat 132 | */ 133 | public void addCustomRegex(Pattern pat) { 134 | regexList.add(pat); 135 | } 136 | 137 | private Pattern buildStopWordRegex(List pStopWords) { 138 | StringBuilder sb = new StringBuilder(); 139 | for (String string : pStopWords) { 140 | sb.append("\\b").append(string.trim()).append("\\b").append("|"); 141 | } 142 | String pattern = sb.substring(0, sb.length() - 1); 143 | Pattern pat = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE| Pattern.UNICODE_CASE); 144 | return pat; 145 | } 146 | 147 | /** 148 | * This method works better with a list of punctuation stop list, for 149 | * example for english, spanish and in general in latin based languages the 150 | * list could be (.,/{}[];:) 151 | * 152 | * @param pLoc - the location of the file where the stopwords are 153 | */ 154 | public void loadPunctStopWord(String pLoc) { 155 | List stops = new ArrayList<>(); 156 | try { 157 | List words = Files.readAllLines(Paths.get(pLoc), StandardCharsets.UTF_8); 158 | for (String string : words) { 159 | stops.add(string.trim()); 160 | } 161 | punctList = stops; 162 | } catch (IOException ex) { 163 | logger.error("Error loading RAKE punctList from: " + pLoc, ex); 164 | } 165 | } 166 | 167 | /** 168 | * (OPTIONAL)This method works better with a list of punctuation stop list, 169 | * for example for english, spanish and in general in latin based languages 170 | * the list could be (.,/{}[];:) 171 | * 172 | * @param pPunt - the string list to be added 173 | */ 174 | public void loadPunctStopWord(List pPunt) { 175 | punctList = pPunt; 176 | 177 | } 178 | 179 | private Pattern 
buildPunctStopWord(List pPunctStop) { 180 | StringBuilder sb = new StringBuilder(); 181 | for (String string : pPunctStop) { 182 | sb.append("\\").append(string.trim()).append("|"); 183 | } 184 | String pattern = sb.substring(0, sb.length() - 1); 185 | Pattern pat = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE |Pattern.UNICODE_CASE); 186 | return pat; 187 | } 188 | 189 | private List generateCandidateKeywords(List pSentenceList, List pStopWordPattern) { 190 | List candidates = new ArrayList<>(); 191 | StringBuffer sb = new StringBuffer(); 192 | for (String string : pSentenceList) { 193 | for (Pattern pat : pStopWordPattern) { 194 | Matcher matcher = pat.matcher(string.trim()); 195 | while (matcher.find()) { 196 | matcher.appendReplacement(sb, "|"); 197 | } 198 | matcher.appendTail(sb); 199 | if (sb.length() > 0) { 200 | 201 | string = sb.toString(); 202 | } 203 | sb = new StringBuffer(); 204 | } 205 | List cands = Arrays.asList(string.split("\\|")); 206 | for (String string1 : cands) { 207 | if (string1.trim().length() > 0) { 208 | String[] p = string1.trim().split("\\s+"); 209 | if (string1.length() > 2 && p.length > 1 && !containsDigit(string1)) { 210 | candidates.add(string1.trim()); 211 | } 212 | } 213 | } 214 | } 215 | return candidates; 216 | } 217 | 218 | @Override 219 | public void runAlgorithm() { 220 | if (stopWordList.isEmpty()) { 221 | logger.error("The method " + this.getName() + " requires a StopWordList to build the candidate list"); 222 | } else { 223 | Map wordfreq = new HashMap<>(); 224 | Map worddegree = new HashMap<>(); 225 | Map wordscore = new HashMap<>(); 226 | Pattern pat = buildStopWordRegex(stopWordList); 227 | regexList.add(pat); 228 | if (!punctList.isEmpty()) { 229 | Pattern pat2 = buildPunctStopWord(punctList); 230 | regexList.add(pat2); 231 | } 232 | List candidates = generateCandidateKeywords(doc.getSentenceList(), regexList); 233 | for (String phrase : candidates) { 234 | String[] wordlist = phrase.split("\\s+"); 235 | int 
wordlistlength = wordlist.length; 236 | int wordlistdegree = wordlistlength - 1; 237 | for (String word : wordlist) { 238 | int freq; 239 | if (wordfreq.containsKey(word) == false) { 240 | wordfreq.put(word, 1); 241 | } else { 242 | freq = wordfreq.get(word) + 1; 243 | wordfreq.remove(word); 244 | wordfreq.put(word, freq); 245 | } 246 | 247 | if (worddegree.containsKey(word) == false) { 248 | worddegree.put(word, wordlistdegree); 249 | } else { 250 | int deg = worddegree.get(word) + wordlistdegree; 251 | worddegree.remove(word); 252 | worddegree.put(word, deg); 253 | } 254 | } 255 | } 256 | for (Map.Entry entry : worddegree.entrySet()) { 257 | entry.setValue(entry.getValue() + wordfreq.get(entry.getKey())); 258 | } 259 | List termLi = new ArrayList<>(); 260 | for (Map.Entry entry : wordfreq.entrySet()) { 261 | wordscore.put(entry.getKey(), worddegree.get(entry.getKey()) / (wordfreq.get(entry.getKey()) * 1.0f)); 262 | } 263 | for (String phrase : candidates) { 264 | String[] words = phrase.split("\\s+"); 265 | float score = 0.0f; 266 | for (String word : words) { 267 | score += wordscore.get(word); 268 | } 269 | termLi.add(new Term(phrase, score)); 270 | } 271 | Comparator sorter = (o1, o2) -> o1.getScore() > o2.getScore() ? -1 : o1.getScore() == o2.getScore() ? 
0 : 1; 272 | List orderedList = termLi.parallelStream().sorted(sorter).distinct().collect(toList()); 273 | doc.setTermList(orderedList); 274 | 275 | } 276 | } 277 | 278 | /** 279 | * 280 | * @return the doc 281 | */ 282 | public Document getDoc() { 283 | return doc; 284 | } 285 | 286 | /** 287 | * @param doc the doc to set 288 | */ 289 | public void setDoc(Document doc) { 290 | this.doc = doc; 291 | } 292 | 293 | 294 | 295 | /** 296 | * 297 | * Returns the current (Default 2) 298 | * 299 | * @return the minNumberOfletters required to a word to be included 300 | */ 301 | public int getMinNumberOfletters() { 302 | return minNumberOfletters; 303 | } 304 | 305 | /** 306 | * Default 2 307 | * 308 | * @param minNumberOfletters the minNumberOfletters to set to a word to be 309 | * included 310 | */ 311 | public void setMinNumberOfletters(int minNumberOfletters) { 312 | this.minNumberOfletters = minNumberOfletters; 313 | } 314 | 315 | private boolean containsDigit(String string) { 316 | for (char c : string.toCharArray()) { 317 | if (Character.isDigit(c)) { 318 | return true; 319 | } 320 | } 321 | return false; 322 | } 323 | } 324 | -------------------------------------------------------------------------------- /src/main/java/edu/ehu/galan/rake/model/AbstractAlgorithm.java: -------------------------------------------------------------------------------- 1 | package edu.ehu.galan.rake.model; 2 | /* 3 | * AbstractAlgorithm.java 4 | * Copyright (C) 2013 Angel Conde, neuw84 at gmail dot com 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 | */ 20 | 21 | import com.google.gson.Gson; 22 | import java.io.File; 23 | import java.io.FileWriter; 24 | import java.io.IOException; 25 | import java.io.PrintWriter; 26 | import java.nio.charset.StandardCharsets; 27 | import java.nio.file.Files; 28 | import java.nio.file.Path; 29 | import java.nio.file.Paths; 30 | import java.util.ArrayList; 31 | import java.util.Comparator; 32 | import java.util.List; 33 | import java.util.Properties; 34 | import java.util.concurrent.Callable; 35 | import java.util.stream.Collectors; 36 | import org.slf4j.Logger; 37 | import org.slf4j.LoggerFactory; 38 | 39 | /** 40 | * An abstract class that represents an Algorithm for term extraction, all the 41 | * different extraction methods should extend this. It keeps the extracted term list, a scored flag, the algorithm name and the document to process. 42 | * 43 | * @author Angel Conde Manjon 44 | */ 45 | public abstract class AbstractAlgorithm implements Callable<Integer> { 46 | 47 | /* TODO if we process a corpus instead of a document, the termList in each 48 | * document is unusable, think about the model and refactor */ 49 | 50 | private List<Term> termList; 51 | private boolean scored; 52 | private transient Document doc; 53 | private String name; 54 | private transient Properties properties = null; 55 | transient final Logger logger = LoggerFactory.getLogger(AbstractAlgorithm.class); 56 | 57 | /** 58 | * Creates an algorithm with an empty term list. 59 | * @param pScored - if the results of the algorithm will be scored 60 | * @param pName - The name of the algorithm 61 | */ 62 | public AbstractAlgorithm(boolean pScored, String pName) { 63 | termList = new ArrayList<>(); 64 | scored = pScored; 65 | name = pName; 66 | } 67 | 68 | /** 69 | * Returns the term list 70 | * 71 | * @return the list held by this algorithm (the same instance, not a copy) 72 | */ 73 | public List<Term> getTermList() { 74 | return termList; 75 | } 76 | 77 | /** 78 | *
Returns the terms whose score is strictly greater than the passed 79 | * threshold 80 | * 81 | * @param pThreshold exclusive lower bound for the score 82 | * @return the matching terms as a new list, or {@code null} (after logging a warning) when this algorithm is not scored 83 | */ 84 | public List<Term> getThresholdedTermList(float pThreshold) { 85 | if (isScored()) { 86 | return getTermList().stream().filter((scoredTerm) -> (scoredTerm.getScore() > pThreshold)).collect(Collectors.toList()); 87 | } else { 88 | logger.warn("You can't get a thresholded list because this is not a scored algorithm"); 89 | return null; 90 | } 91 | } 92 | 93 | /** 94 | * Removes from the term list every term that contains a stopword: each whitespace-separated 95 | * component of the term is compared, ignoring case, with every stopword (for "solar system" 96 | * both "solar" and "system" are checked). A term is kept, exactly once, only when none of its 97 | * components matches. Logs a message and changes nothing when the term list is empty. 98 | * 99 | * @param pStopwordList the stopwords to filter with 100 | */ 101 | public final void applyStopwordList(List<String> pStopwordList) { 102 | List<Term> stopList = new ArrayList<>(); 103 | boolean stop; 104 | if (getTermList().size() > 0) { 105 | for (Term term : getTermList()) { 106 | String[] nGrams = term.getTerm().split("\\s"); 107 | stop = false; 108 | for (String string : pStopwordList) { 109 | /* a one-word term is simply the single-component case of this loop */ 110 | for (String string1 : nGrams) { 111 | if (string1.equalsIgnoreCase(string)) { 112 | stop = true; 113 | break; 114 | } 115 | } 116 | if (stop) { 117 | /* already rejected, the remaining stopwords cannot change the outcome */ 118 | break; 119 | } 120 | } 121 | /* decide once per term, only after the stopwords have been checked */ 122 | if (!stop) { 123 | stopList.add(term); 124 | } 125 | 126 | } 127 | setTermList(stopList); 128 | } else { 129 | logger.info("The term list appears to be empty, have you ran the algorithm?"); 130 | } 131 | } 132 | 133 | /** 134 | * Removes the terms whose first whitespace-separated component equals, ignoring case, one of the 135 | * given stopwords; every other term is kept once, in its original order 136 | * 137 | * @param pFirstTermStopWordList the stopwords to match against the first component 138 | */ 139 | public final void firstTermStopWordList(List<String> pFirstTermStopWordList) { 140 | List<Term> stopList = new ArrayList<>(); 141 | boolean stop; 142 | if (getTermList().size() > 0) { 143 | for (Term term :
getTermList()) { 144 | String[] nGrams = term.getTerm().split("\\s"); 145 | stop = false; 146 | for (String string : pFirstTermStopWordList) { 147 | if (nGrams[0].equalsIgnoreCase(string)) { 148 | stop = true; 149 | } 150 | } 151 | if (!stop) { 152 | stopList.add(term); 153 | } 154 | } 155 | setTermList(stopList); 156 | } else { 157 | logger.info("The term list appears to be empty, have you ran the algorithm?"); 158 | } 159 | } 160 | 161 | /** 162 | * Removes the terms whose last whitespace-separated component equals, ignoring case, one of the 163 | * given stopwords; every other term is kept once, in its original order 164 | * 165 | * @param pFirstTermStopWordList the stopwords to match against the last component 166 | */ 167 | public void lastTermStopWordList(List<String> pFirstTermStopWordList) { 168 | List<Term> stopList = new ArrayList<>(); 169 | boolean stop; 170 | if (getTermList().size() > 0) { 171 | for (Term term : getTermList()) { 172 | String[] nGrams = term.getTerm().split("\\s"); 173 | stop = false; 174 | for (String string : pFirstTermStopWordList) { 175 | if (nGrams[nGrams.length - 1].equalsIgnoreCase(string)) { 176 | stop = true; 177 | } 178 | } 179 | if (!stop) { 180 | stopList.add(term); 181 | } 182 | } 183 | setTermList(stopList); 184 | } else { 185 | logger.info("The term list appears to be empty, have you ran the algorithm?"); 186 | } 187 | 188 | } 189 | 190 | /** 191 | * Prints the algorithm results to the standard output, one term per line (followed by its score when the algorithm is scored) 192 | */ 193 | public final void print() { 194 | if (isScored()) { 195 | getTermList().stream().forEach((scoredTerm) -> { 196 | System.out.printf("%s \t %f%n", scoredTerm.getTerm(), scoredTerm.getScore()); 197 | }); 198 | } else { 199 | getTermList().stream().forEach((scoredTerm) -> { 200 | System.out.printf("%s%n", scoredTerm.getTerm()); 201 | }); 202 | } 203 | } 204 | 205 | /** 206 | * The method that represents the action of running an algorithm in a corpus 207 | * must be implemented 208 | */ 209 | public abstract void runAlgorithm(); 210 | 211 | /** 212 | * This will be used by the ThreadPool to execute the algorithm: it runs 213 | * runAlgorithm() and then returns 214 | * 215 | * @return
always {@code 0}, returned once runAlgorithm() has finished 216 | * @throws Exception declared by the Callable contract 217 | */ 218 | @Override 219 | public final Integer call() throws Exception { 220 | runAlgorithm(); 221 | return Integer.valueOf(0); 222 | } 223 | 224 | /** 225 | * Sets the term list of this algorithm 226 | * 227 | * @param termList the termList to set 228 | */ 229 | public final void setTermList(List<Term> termList) { 230 | this.termList = termList; 231 | } 232 | 233 | /** 234 | * returns whether this algorithm is scored 235 | * 236 | * @return the scored 237 | */ 238 | public final boolean isScored() { 239 | return scored; 240 | } 241 | 242 | /** 243 | * Sets if this algorithm has scored results 244 | * 245 | * @param scored the scored to set 246 | */ 247 | public final void setScored(boolean scored) { 248 | this.scored = scored; 249 | } 250 | 251 | /** 252 | * Returns the document assigned to this algorithm 253 | * 254 | * @return the document 255 | */ 256 | public final Document getDocument() { 257 | return doc; 258 | } 259 | 260 | /** 261 | * The document that will be processed by the algorithm 262 | * 263 | * @param pDoc the document to set 264 | */ 265 | public final void setDocument(Document pDoc) { 266 | this.doc = pDoc; 267 | } 268 | 269 | /** 270 | * Writes the current term list to a file named "kpminer" (a relative path), one Term.toString() per line; 271 | * every entry is preceded by a line break, so the file starts with an empty line. 272 | * An IOException is logged as a warning and not rethrown. 273 | */ 274 | public void saveToTmp() { 275 | try (FileWriter outFile = new FileWriter("kpminer")) { 276 | /* the PrintWriter is closed first, then the FileWriter it wraps */ 277 | try (PrintWriter out = new PrintWriter(outFile)) { 278 | for (Term term : termList) { 279 | out.printf("\n%s", term); 280 | } 281 | } 282 | } catch (IOException ex) { 283 | logger.warn("couldn't save the algorithm results to temp directory", ex); 284 | } 285 | } 286 | 287 | /** 288 | * Returns this algorithm serialized with Gson: the extracted terms, the name of the algorithm and whether 289 | * it is scored (the transient fields are not part of the output) 290 | * 291 | * @return - String with the contents of this algorithm in JSON format 292 | *
(name, scored, termList) 293 | */ 294 | public String toJson() { 295 | Gson son = new Gson(); 296 | return son.toJson(this); 297 | } 298 | 299 | /** Replaces the term list with a new list holding the same terms sorted by the given comparator. */ 300 | public void sort(Comparator<Term> comparator){ 301 | termList= this.getTermList().stream().sorted(comparator).collect(Collectors.toList()); 302 | } 303 | 304 | /** 305 | * Saves this algorithm in Json format to the file named after the algorithm (name + ".json") inside the directory 306 | * given by the "tmpDir" property. Logs a warning and writes nothing when no properties were set or the file cannot be written. 307 | * 308 | */ 309 | public void saveGsonToTmp() { 310 | if (properties == null) { 311 | logger.warn("couldn't save the algorithm results in json format: no properties were set"); 312 | return; 313 | } 314 | try (FileWriter outFile = new FileWriter(properties.getProperty("tmpDir") + File.separator + this.getName() + ".json"); PrintWriter out = new PrintWriter(outFile)) { 315 | out.print(new Gson().toJson(this)); 316 | } catch (IOException ex) { 317 | logger.warn("couldn't save the algorithm results to temp directory in json format", ex); 318 | } 319 | } 320 | /** 321 | * Reads terms from a UTF-8 text file in which every line holds a candidate and its score separated by a 322 | * single space (only the first two space-separated fields of a line are used) 323 | * 324 | * @param pFile path of the file to read 325 | * @return the terms read (an empty list for an empty file), or {@code null} after logging an error when the file cannot be read or a line is not in that format 326 | */ 327 | public List<Term> readCandidates(String pFile) { 328 | /* TODO improve the char recognition using YAGO char tools */ 329 | List<Term> list = new ArrayList<>(); 330 | try { 331 | Path path = Paths.get(pFile); 332 | List<String> listC = Files.readAllLines(path, StandardCharsets.UTF_8); 333 | for (String string : listC) { 334 | string = string.trim(); 335 | String[] line = string.split(" "); /* NOTE(review): saveToTmp writes Term.toString(), which separates term and score with a tab - confirm which format this reader is meant to parse */ 336 | String candidate; 337 | candidate = line[0]; 338 | candidate = candidate.trim(); 339 | float value = Float.parseFloat(line[1]); 340 | list.add(new Term(candidate, value)); 341 | } 342 | return list; 343 | } catch (IOException ex) { 344 | logger.error("error while reading algorithm results", ex); 345 | } catch (ArrayIndexOutOfBoundsException | NumberFormatException | NullPointerException ex1){ 346 | logger.error("The file is not in the required format", ex1); 347 | } 348 | return null; 349 | 350 | } 351 | 352 | /** 353 | * @return the name
354 | */ 355 | public String getName() { 356 | return name; 357 | } 358 | 359 | /** 360 | * @param name the name to set 361 | */ 362 | public void setName(String name) { 363 | this.name = name; 364 | } 365 | 366 | /** 367 | * Method for class initialization, initializes the document that will 368 | * be processed for the given algorithm, and the directory where the program 369 | * is executed (standalone vs web server differences...) 370 | * 371 | * @param pDoc the document to process 372 | * @param pPropsDir the directory where the program is executed 373 | */ 374 | public abstract void init(Document pDoc, String pPropsDir); 375 | /** Sets the properties read by saveGsonToTmp (the "tmpDir" key). */ 376 | public void setProperties(Properties pProps) { 377 | properties=pProps; 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/main/java/edu/ehu/galan/rake/model/Document.java: -------------------------------------------------------------------------------- 1 | package edu.ehu.galan.rake.model; 2 | 3 | /* 4 | * Document.java 5 | * Copyright (C) 2014 Angel Conde, neuw84 at gmail dot com 6 | * 7 | * This program is free software; you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation; either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program; if not, write to the Free Software 19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 | */ 21 | 22 | 23 | import java.io.BufferedInputStream; 24 | import java.io.BufferedWriter; 25 | import java.io.File; 26 | import java.io.FileInputStream; 27 | import java.io.FileNotFoundException; 28 | import java.io.FileOutputStream; 29 | import java.io.IOException; 30 | import java.io.InputStreamReader; 31 | import java.io.OutputStreamWriter; 32 | import java.io.Reader; 33 | import java.io.StringWriter; 34 | import java.io.Writer; 35 | import java.nio.charset.Charset; 36 | import java.nio.charset.CharsetDecoder; 37 | import java.nio.charset.CodingErrorAction; 38 | import java.util.ArrayList; 39 | import java.util.LinkedList; 40 | import java.util.List; 41 | import org.apache.commons.io.FileUtils; 42 | import org.apache.commons.io.IOUtils; 43 | import org.slf4j.Logger; 44 | import org.slf4j.LoggerFactory; 45 | 46 | /** 47 | * A document represents the piece of a corpus containing text. 48 | * NOTE(review): the type arguments of sentenceList and tokenList were missing in this copy of the file and are restored as {@code List<String>} and {@code List<LinkedList<Token>>} (LinkedList is imported and not used anywhere else) - confirm against the callers. 49 | * @author Angel Conde Manjon 50 | */ 51 | public class Document { 52 | 53 | private transient String path; 54 | private transient List<String> sentenceList; 55 | private transient List<LinkedList<Token>> tokenList; 56 | private String name; 57 | private transient List<Term> termList; 58 | private transient static final Logger logger = LoggerFactory.getLogger(Document.class); 59 | 60 | /** 61 | * Creates a document with an empty term list. 62 | * @param pPath the path of the file backing this document 63 | * @param pName the name of the document 64 | */ 65 | public Document(String pPath, String pName) { 66 | path = pPath; 67 | name = pName; 68 | termList = new ArrayList<>(); 69 | } 70 | 71 | /** 72 | * @return the path 73 | */ 74 | public String getPath() { 75 | return path; 76 | } 77 | 78 | /** 79 | * @param path the path to set 80 | */ 81 | public void setPath(String path) { 82 | this.path = path; 83 | } 84 | 85 | /** 86 | * @return the sentenceList 87 | */ 88 | public List<String> getSentenceList() { 89 | return sentenceList; 90 | } 91 | 92 | /** 93 | * @param sentenceList the sentenceList to set 94 | */ 95 | public void setSentenceList(List<String> sentenceList) { 96 | this.sentenceList = sentenceList; 97 | } 98 | 99 |
/** 100 | * @return the tokenList 101 | */ 102 | public List<LinkedList<Token>> getTokenList() { 103 | return tokenList; 104 | } 105 | 106 | /** 107 | * Sets the token list. The method name is kept for existing callers; prefer {@link #setTokenList(List)}. 108 | * @param tokenList the tokenList to set 109 | */ 110 | public void List(List<LinkedList<Token>> tokenList) { 111 | this.tokenList = tokenList; 112 | } 113 | 114 | /** 115 | * @param tokenList the tokenList to set 116 | */ 117 | public void setTokenList(List<LinkedList<Token>> tokenList) { 118 | this.tokenList = tokenList; 119 | } 120 | 121 | /** 122 | * @return the name 123 | */ 124 | public String getName() { 125 | return name; 126 | } 127 | 128 | /** 129 | * @param name the name to set 130 | */ 131 | public void setName(String name) { 132 | this.name = name; 133 | } 134 | 135 | /** 136 | * 137 | * @return the termList 138 | */ 139 | public List<Term> getTermList() { 140 | return termList; 141 | } 142 | 143 | /** 144 | * Rewrites the file at {@code path} as UTF-8: its bytes are decoded as UTF-8, malformed or unmappable 145 | * input is replaced by the decoder instead of failing, and the decoded text is written back to the same path. 146 | * I/O failures are logged as errors and not rethrown; when reading fails the file is left untouched. 147 | */ 148 | public void convertToUTF8() { 149 | CharsetDecoder charsetDecoder = Charset.forName("UTF-8").newDecoder(); 150 | charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE); 151 | charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 152 | String theString; 153 | /* read everything first; try-with-resources closes the input before the file is replaced */ 154 | try (Reader inputReader = new InputStreamReader(new BufferedInputStream(new FileInputStream(path)), charsetDecoder)) { 155 | StringWriter writer = new StringWriter(); 156 | IOUtils.copy(inputReader, writer); 157 | theString = writer.toString(); 158 | } catch (IOException ex) { 159 |
logger.error("Error converting the file to utf8", ex); 160 | return; 161 | } 162 | FileUtils.deleteQuietly(new File(path)); 163 | try (Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) { 164 | out.write(theString); 165 | } catch (IOException ex) { 166 | logger.error("Error converting the file to utf8", ex); 167 | } 168 | } 169 | 170 | /** 171 | * @param termList the termList to set 172 | */ 173 | public void setTermList(List<Term> termList) { 174 | this.termList = termList; 175 | } 176 | 177 | } 178 | -------------------------------------------------------------------------------- /src/main/java/edu/ehu/galan/rake/model/Term.java: -------------------------------------------------------------------------------- 1 | package edu.ehu.galan.rake.model; 2 | 3 | /* 4 | * Term.java 5 | * Copyright (C) 2013 Angel Conde, neuw84 at gmail dot com 6 | * 7 | * This program is free software; you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation; either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program; if not, write to the Free Software 19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 | */ 21 | 22 | 23 | import java.util.Objects; 24 | 25 | /** 26 | * A term represents a candidate of the term extraction methods, it's need that it will 27 | * pass a validation with a knowledge base before knowing if it is a topic 28 | * A term contains an String with the term's text and a score if the algorithm 29 | * used for extracting the term has one.
(if not the score must be -1) 30 | * Two terms are equal when their texts are equal ignoring case; the score is not part of equality. 31 | * @author Angel Conde Manjon 32 | */ 33 | 34 | public class Term { 35 | 36 | private String term; 37 | private float score; 38 | 39 | /** 40 | * Creates a term without text ({@code null}) and with a score of 0. 41 | */ 42 | public Term() { 43 | 44 | } 45 | 46 | /** 47 | * Creates an unscored term: the score is set to -1. 48 | * @param pTerm the text of the term 49 | */ 50 | 51 | 52 | public Term(String pTerm) { 53 | term = pTerm; 54 | score = -1; 55 | 56 | } 57 | 58 | /** 59 | * Creates a scored term. 60 | * @param pTerm the text of the term 61 | * @param pScore the score given to the term 62 | */ 63 | public Term(String pTerm, float pScore) { 64 | term = pTerm; 65 | score = pScore; 66 | } 67 | 68 | /** 69 | * @return the text of the extracted term 70 | */ 71 | public String getTerm() { 72 | return term; 73 | } 74 | 75 | /** 76 | * @param term the term to set 77 | */ 78 | public void setTerm(String term) { 79 | this.term = term; 80 | } 81 | 82 | /** 83 | * @return the score 84 | */ 85 | public float getScore() { 86 | return score; 87 | } 88 | 89 | /** 90 | * @param score the score to set 91 | */ 92 | public void setScore(float score) { 93 | this.score = score; 94 | } 95 | 96 | @Override 97 | public String toString() { 98 | return term + "\t" + score; 99 | } 100 | 101 | 102 | @Override 103 | public boolean equals(Object pObject) { 104 | if (pObject instanceof Term) { 105 | String other = ((Term) pObject).getTerm(); 106 | /* a term without text only equals another term without text */ 107 | return this.term == null ? other == null : this.term.equalsIgnoreCase(other); 108 | } else { 109 | return false; 110 | } 111 | } 112 | 113 | @Override 114 | public int hashCode() { 115 | /* fold every char the way String.equalsIgnoreCase compares chars, so that equal terms share a hash */ 116 | int folded = 0; 117 | if (this.term != null) { 118 | for (int i = 0; i < this.term.length(); i++) { 119 | folded = 31 * folded + Character.toLowerCase(Character.toUpperCase(this.term.charAt(i))); 120 | } 121 | } 122 | int hash = 7; 123 | hash = 97 * hash + folded; 124 | return hash; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/main/java/edu/ehu/galan/rake/model/Token.java: -------------------------------------------------------------------------------- 1 | 2 | package edu.ehu.galan.rake.model; 3 | /* 4 | * Token.java 5 | * Copyright (C) 2013 Angel Conde, neuw84 at gmail dot com 6 | * 7 | * This program is free software; you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as
published by 9 | * the Free Software Foundation; either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program; if not, write to the Free Software 19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 | */ 21 | 22 | 23 | /** 24 | * A token is a simple "word" containing the word form, POS tag, lemma, etc.... 25 | * The fields are plain mutable strings; the word form is the only one set by every constructor. 26 | * @author Angel Conde Manjon 27 | */ 28 | 29 | public class Token{ 30 | private String wordForm; 31 | private String posTag; 32 | private String chunkerTag; 33 | private String lemma; 34 | private int pos; //position inside the sentence? 35 | 36 | /** 37 | * Creates a token that only has a word form. 38 | * @param pWordForm the word form 39 | */ 40 | public Token(String pWordForm){ 41 | wordForm=pWordForm; 42 | } 43 | 44 | /** 45 | * Creates a token with a word form and a POS tag. 46 | * @param pWordForm the word form 47 | * @param pPostag the POS tag 48 | */ 49 | public Token(String pWordForm,String pPostag){ 50 | wordForm=pWordForm; 51 | posTag=pPostag; 52 | } 53 | /** 54 | * Creates a token with a word form, a POS tag and a lemma. 55 | * @param pWordForm the word form 56 | * @param pPostag the POS tag 57 | * @param pLemma the lemma 58 | */ 59 | public Token(String pWordForm,String pPostag,String pLemma){ 60 | wordForm=pWordForm; 61 | posTag=pPostag; 62 | lemma=pLemma; 63 | } 64 | 65 | /** 66 | * @param pChunker the chunker tag 67 | * @param pWordForm the word form 68 | * @param pPostag the POS tag 69 | * @param pLemma the lemma 70 | */ 71 | public Token(String pWordForm,String pPostag,String pLemma, String pChunker){ 72 | wordForm=pWordForm; 73 | posTag=pPostag; 74 | lemma=pLemma; 75 | chunkerTag=pChunker; 76 | } 77 | /** 78 | * @return the wordForm 79 | */ 80 | public String getWordForm() { 81 | return wordForm; 82 | } 83 | 84 | /** 85 | * @param wordForm the wordForm to set 86 | */ 87 | public void
setWordForm(String wordForm) { 88 | this.wordForm = wordForm; 89 | } 90 | 91 | /** 92 | * @return the posTag 93 | */ 94 | public String getPosTag() { 95 | return posTag; 96 | } 97 | 98 | /** 99 | * @param posTag the posTag to set 100 | */ 101 | public void setPosTag(String posTag) { 102 | this.posTag = posTag; 103 | } 104 | /** Returns the word form and the POS tag separated by a tab. */ 105 | @Override 106 | public String toString(){ 107 | return wordForm+ "\t" + posTag; 108 | } 109 | 110 | /** 111 | * @return the lemma 112 | */ 113 | public String getLemma() { 114 | return lemma; 115 | } 116 | 117 | /** 118 | * @param lemma the lemma to set 119 | */ 120 | public void setLemma(String lemma) { 121 | this.lemma = lemma; 122 | } 123 | 124 | /** 125 | * @return the chunkerTag 126 | */ 127 | public String getChunkerTag() { 128 | return chunkerTag; 129 | } 130 | 131 | /** 132 | * @param chunkerTag the chunkerTag to set 133 | */ 134 | public void setChunkerTag(String chunkerTag) { 135 | this.chunkerTag = chunkerTag; 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/test/java/edu/ehu/galan/rake/AppTest.java: -------------------------------------------------------------------------------- 1 | package edu.ehu.galan.rake; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App.
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case, passed on to TestCase 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return a TestSuite built from this class 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Placeholder test: it only asserts {@code true}, so it always passes 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /stopLists/FoxStopListEn: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | across 5 | after 6 | again 7 | against 8 | all 9 | almost 10 | alone 11 | along 12 | already 13 | also 14 | although 15 | always 16 | among 17 | an 18 | and 19 | another 20 | any 21 | anybody 22 | anyone 23 | anything 24 | anywhere 25 | are 26 | area 27 | areas 28 | around 29 | as 30 | ask 31 | asked 32 | asking 33 | asks 34 | at 35 | away 36 | b 37 | back 38 | backed 39 | backing 40 | backs 41 | be 42 | because 43 | became 44 | become 45 | becomes 46 | been 47 | before 48 | began 49 | behind 50 | being 51 | beings 52 | best 53 | better 54 | between 55 | big 56 | both 57 | but 58 | by 59 | c 60 | came 61 | can 62 | cannot 63 | case 64 | cases 65 | certain 66 | certainly 67 | clear 68 | clearly 69 | come 70 | could 71 | d 72 | did 73 | differ 74 | different 75 | differently 76 | do 77 | does 78 | done 79 | down 80 | downed 81 | downing 82 | downs 83 | during 84 | e 85 | each 86 | early 87 | either 88 | end 89 | ended 90 | ending 91 | ends 92 | enough 93 | even 94 | evenly 95 | ever 96 | every 97 | everybody 98 | everyone 99 | everything 100 | everywhere 101 | f 102 | face 103 | faces 104 | fact 105 | facts 106 | far 107 | felt 108 | few 109 | find 110 | finds 111 | first 112 | for 113 | four 114 | from 115 | full 116 | fully 117 | further
118 | furthered 119 | furthering 120 | furthers 121 | g 122 | gave 123 | general 124 | generally 125 | get 126 | gets 127 | give 128 | given 129 | gives 130 | go 131 | going 132 | good 133 | goods 134 | got 135 | great 136 | greater 137 | greatest 138 | group 139 | grouped 140 | grouping 141 | groups 142 | h 143 | had 144 | has 145 | have 146 | having 147 | he 148 | her 149 | herself 150 | here 151 | high 152 | higher 153 | highest 154 | him 155 | himself 156 | his 157 | how 158 | however 159 | i 160 | if 161 | important 162 | in 163 | interest 164 | interested 165 | interesting 166 | interests 167 | into 168 | is 169 | it 170 | its 171 | itself 172 | j 173 | just 174 | k 175 | keep 176 | keeps 177 | kind 178 | knew 179 | know 180 | known 181 | knows 182 | l 183 | large 184 | largely 185 | last 186 | later 187 | latest 188 | least 189 | less 190 | let 191 | lets 192 | like 193 | likely 194 | long 195 | longer 196 | longest 197 | m 198 | made 199 | make 200 | making 201 | man 202 | many 203 | may 204 | me 205 | member 206 | members 207 | men 208 | might 209 | more 210 | most 211 | mostly 212 | mr 213 | mrs 214 | much 215 | must 216 | my 217 | myself 218 | n 219 | necessary 220 | need 221 | needed 222 | needing 223 | needs 224 | never 225 | new 226 | newer 227 | newest 228 | next 229 | no 230 | non 231 | not 232 | nobody 233 | noone 234 | nothing 235 | now 236 | nowhere 237 | number 238 | numbered 239 | numbering 240 | numbers 241 | o 242 | of 243 | off 244 | often 245 | old 246 | older 247 | oldest 248 | on 249 | once 250 | one 251 | only 252 | open 253 | opened 254 | opening 255 | opens 256 | or 257 | order 258 | ordered 259 | ordering 260 | orders 261 | other 262 | others 263 | our 264 | out 265 | over 266 | p 267 | part 268 | parted 269 | parting 270 | parts 271 | per 272 | perhaps 273 | place 274 | places 275 | point 276 | pointed 277 | pointing 278 | points 279 | possible 280 | present 281 | presented 282 | presenting 283 | presents 284 | problem 285 | problems 
286 | put 287 | puts 288 | q 289 | quite 290 | r 291 | rather 292 | really 293 | right 294 | room 295 | rooms 296 | s 297 | said 298 | same 299 | saw 300 | say 301 | says 302 | second 303 | seconds 304 | see 305 | seem 306 | seemed 307 | seeming 308 | seems 309 | sees 310 | several 311 | shall 312 | she 313 | should 314 | show 315 | showed 316 | showing 317 | shows 318 | side 319 | sides 320 | since 321 | small 322 | smaller 323 | smallest 324 | so 325 | some 326 | somebody 327 | someone 328 | something 329 | somewhere 330 | state 331 | states 332 | still 333 | such 334 | sure 335 | t 336 | take 337 | taken 338 | than 339 | that 340 | the 341 | their 342 | them 343 | then 344 | there 345 | therefore 346 | these 347 | they 348 | thing 349 | things 350 | think 351 | thinks 352 | this 353 | those 354 | though 355 | thought 356 | thoughts 357 | three 358 | through 359 | thus 360 | to 361 | today 362 | together 363 | too 364 | took 365 | toward 366 | turn 367 | turned 368 | turning 369 | turns 370 | two 371 | u 372 | under 373 | until 374 | up 375 | upon 376 | us 377 | use 378 | uses 379 | used 380 | v 381 | very 382 | w 383 | want 384 | wanted 385 | wanting 386 | wants 387 | was 388 | way 389 | ways 390 | we 391 | well 392 | wells 393 | went 394 | were 395 | what 396 | when 397 | where 398 | whether 399 | which 400 | while 401 | who 402 | whole 403 | whose 404 | why 405 | will 406 | with 407 | within 408 | without 409 | work 410 | worked 411 | working 412 | works 413 | would 414 | x 415 | y 416 | year 417 | years 418 | yet 419 | you 420 | young 421 | younger 422 | youngest 423 | your 424 | yours 425 | z -------------------------------------------------------------------------------- /stopLists/RakePunctDefaultStopList: -------------------------------------------------------------------------------- 1 | . 2 | / 3 | , 4 | ! 5 | ? 
6 | { 7 | } 8 | [ 9 | ] 10 | ; 11 | : 12 | ( 13 | ) 14 | - 15 | _ 16 | @ -------------------------------------------------------------------------------- /stopLists/SmartStopListEn: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | 
everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | 
now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 
473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero 572 | true 573 | false 574 | additional 575 | shown -------------------------------------------------------------------------------- /stopLists/SpanishCustomEs: -------------------------------------------------------------------------------- 1 | algún 2 | alguna 3 | algunas 4 | alguno 5 | algunos 6 | ambos 7 | ampleamos 8 | ante 9 | antes 10 | aquel 11 | aquellas 12 | aquellos 13 | aqui 14 | arriba 15 | atras 16 | bajo 17 | bastante 18 | bien 19 | cada 20 | cierta 21 | ciertas 22 | cierto 23 | ciertos 24 | como 25 | con 26 | conseguimos 27 | conseguir 28 | consigo 29 | consigue 30 | consiguen 31 | consigues 32 | cual 33 | cuando 34 | dentro 35 | desde 36 | donde 37 | dos 38 | el 39 | ellas 40 | ellos 41 | empleais 42 | emplean 43 | emplear 44 | empleas 45 | empleo 46 | en 47 | encima 48 | entonces 49 | 
entre 50 | era 51 | eramos 52 | eran 53 | eras 54 | eres 55 | es 56 | esta 57 | estaba 58 | estado 59 | estais 60 | estamos 61 | estan 62 | estoy 63 | fin 64 | fue 65 | fueron 66 | fui 67 | fuimos 68 | gueno 69 | ha 70 | hace 71 | haceis 72 | hacemos 73 | hacen 74 | hacer 75 | haces 76 | hago 77 | incluso 78 | intenta 79 | intentais 80 | intentamos 81 | intentan 82 | intentar 83 | intentas 84 | intento 85 | ir 86 | la 87 | largo 88 | las 89 | lo 90 | los 91 | mientras 92 | mio 93 | modo 94 | muchos 95 | muy 96 | nos 97 | nosotros 98 | otro 99 | para 100 | pero 101 | podeis 102 | podemos 103 | poder 104 | podria 105 | podriais 106 | podriamos 107 | podrian 108 | podrias 109 | por 110 | por qué 111 | porque 112 | primero 113 | puede 114 | pueden 115 | puedo 116 | quien 117 | sabe 118 | sabeis 119 | sabemos 120 | saben 121 | saber 122 | sabes 123 | ser 124 | si 125 | siendo 126 | sin 127 | sobre 128 | sois 129 | solamente 130 | solo 131 | somos 132 | soy 133 | su 134 | sus 135 | también 136 | teneis 137 | tenemos 138 | tener 139 | tengo 140 | tiempo 141 | tiene 142 | tienen 143 | todo 144 | trabaja 145 | trabajais 146 | trabajamos 147 | trabajan 148 | trabajar 149 | trabajas 150 | trabajo 151 | tras 152 | tuyo 153 | ultimo 154 | un 155 | una 156 | unas 157 | uno 158 | unos 159 | usa 160 | usais 161 | usamos 162 | usan 163 | usar 164 | usas 165 | uso 166 | va 167 | vais 168 | valor 169 | vamos 170 | van 171 | vaya 172 | verdad 173 | verdadera 174 | verdadero 175 | vosotras 176 | vosotros 177 | voy 178 | yo 179 | --------------------------------------------------------------------------------