├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
├── src
├── main
│ ├── FoxStopListEn
│ ├── RakePunctDefaultStopList
│ ├── SmartStopListEn
│ ├── SpanishCustomEs
│ └── java
│ │ └── edu
│ │ └── ehu
│ │ └── galan
│ │ └── rake
│ │ ├── RakeAlgorithm.java
│ │ └── model
│ │ ├── AbstractAlgorithm.java
│ │ ├── Document.java
│ │ ├── Term.java
│ │ └── Token.java
└── test
│ └── java
│ └── edu
│ └── ehu
│ └── galan
│ └── rake
│ └── AppTest.java
└── stopLists
├── FoxStopListEn
├── RakePunctDefaultStopList
├── SmartStopListEn
└── SpanishCustomEs
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 | /nb-configuration.xml
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018, Angel Conde
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RAKE-Java
2 | =====================
3 |
4 | A Java 8 implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm as described in: Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.), Text Mining: Theory and Applications: John Wiley & Sons.
5 |
6 | The implementation is based on the python one from https://github.com/aneesha/RAKE (however some changes have been made)
7 | The source code is released under the GPL V3 License.
8 |
9 | Add this repository to your pom.xml if you want to use it with Maven:
10 | ````xml
11 | <repository>
12 |     <id>galan-maven-repo</id>
13 |     <name>galan-maven-repo-releases</name>
14 |     <url>http://galan.ehu.es/artifactory/ext-release-local</url>
15 | </repository>
16 |
17 | ````
18 |
19 | This implementation requires a POS tagger in order to work. For example, the Illinois POS tagger could be used for English:
20 |
21 |
22 | http://cogcomp.cs.illinois.edu/page/software_view/POS
23 |
24 | For Spanish or other languages:
25 |
26 | FreeLing --> http://nlp.lsi.upc.edu/freeling/
27 |
28 | or the Stanford POS tagger --> http://nlp.stanford.edu/software/tagger.shtml
29 |
30 |
31 | The implementation is in beta state
32 |
33 | TODO:
34 |
35 | - More testing
36 |
37 |
38 | Here is an example parser for English that provides the required data (using the Illinois POS Tagger):
39 |
40 |
41 | ```java
42 |
43 | import LBJ2.nlp.SentenceSplitter;
44 | import LBJ2.nlp.WordSplitter;
45 | import LBJ2.nlp.seg.PlainToTokenParser;
46 | import LBJ2.parse.Parser;
47 | import edu.illinois.cs.cogcomp.lbj.chunk.Chunker;
48 | import edu.illinois.cs.cogcomp.lbj.pos.POSTagger;
49 | import edu.ehu.galan.cvalue.model.Token;
50 | ......
51 |
52 | List<LinkedList<Token>> tokenizedSentenceList;
53 | List<String> sentenceList;
54 | POSTagger tagger = new POSTagger();
55 | Chunker chunker = new Chunker();
56 | boolean first = true;
57 | parser = new PlainToTokenParser(new WordSplitter(new SentenceSplitter(pFile)));
58 | String sentence = "";
59 | LinkedList<Token> tokenList = null;
60 | for (LBJ2.nlp.seg.Token word = (LBJ2.nlp.seg.Token) parser.next(); word != null;
61 | word = (LBJ2.nlp.seg.Token) parser.next()) {
62 | String chunked = chunker.discreteValue(word);
63 | tagger.discreteValue(word);
64 | if (first) {
65 | tokenList = new LinkedList<>();
66 | tokenizedSentenceList.add(tokenList);
67 | first = false;
68 | }
69 | tokenList.add(new Token(word.form, word.partOfSpeech, null, chunked));
70 | sentence = sentence + " " + (word.form);
71 | if (word.next == null) {
72 | sentenceList.add(sentence);
73 | first = true;
74 | sentence = "";
75 | }
76 | }
77 | parser.reset();
78 |
79 | ```
80 |
81 | RAKE can then be run on the parsed document:
82 |
83 |
84 | ```java
85 |
86 | Document doc=new Document(full_path,name);
87 | doc.setSentenceList(sentences);
88 | doc.setTokenList(tokenized_sentences);
89 | RakeAlgorithm ex = new RakeAlgorithm();
90 | ex.loadStopWordsList("resources/lite/stopWordLists/RakeStopLists/SmartStopListEn");
91 | ex.loadPunctStopWord("resources/lite/stopWordLists/RakeStopLists/RakePunctDefaultStopList");
92 | PlainTextDocumentReaderLBJEn parser = new PlainTextDocumentReaderLBJEn();
93 | parser.readSource("testCorpus/textAstronomy");
94 | Document doc = new Document("full_path", "name");
95 | ex.init(doc);
96 | ex.runAlgorithm();
97 | doc.getTermList();
98 | ```
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | edu.ehu.galan.rake
6 | RAKE
7 | 1.0
8 | jar
9 |
10 |
11 |
12 | false
13 |
14 | central
15 | bintray-plugins
16 | http://jcenter.bintray.com
17 |
18 |
19 | RAKE
20 | http://maven.apache.org
21 |
22 |
23 |
24 | org.apache.maven.plugins
25 | maven-compiler-plugin
26 | 2.3.2
27 |
28 | 1.8
29 | 1.8
30 |
31 |
32 |
33 | org.jfrog.buildinfo
34 | artifactory-maven-plugin
35 | 2.2.2
36 | false
37 |
38 |
39 | build-info
40 |
41 | publish
42 |
43 |
44 |
45 | http://sips72.si.ehu.es:8080/artifactory/
46 | admin
47 |
48 | ext-release-local
49 | ext-snapshots-local
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | UTF-8
59 |
60 |
61 |
62 | junit
63 | junit
64 | 3.8.1
65 | test
66 |
67 |
68 | com.google.guava
69 | guava
70 | 15.0
71 | jar
72 |
73 |
74 | org.slf4j
75 | slf4j-api
76 | 1.6.6
77 | jar
78 |
79 |
80 | com.google.code.gson
81 | gson
82 | 2.2.4
83 | jar
84 |
85 |
86 | commons-io
87 | commons-io
88 | 2.4
89 | jar
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/src/main/FoxStopListEn:
--------------------------------------------------------------------------------
1 | a
2 | about
3 | above
4 | across
5 | after
6 | again
7 | against
8 | all
9 | almost
10 | alone
11 | along
12 | already
13 | also
14 | although
15 | always
16 | among
17 | an
18 | and
19 | another
20 | any
21 | anybody
22 | anyone
23 | anything
24 | anywhere
25 | are
26 | area
27 | areas
28 | around
29 | as
30 | ask
31 | asked
32 | asking
33 | asks
34 | at
35 | away
36 | b
37 | back
38 | backed
39 | backing
40 | backs
41 | be
42 | because
43 | became
44 | become
45 | becomes
46 | been
47 | before
48 | began
49 | behind
50 | being
51 | beings
52 | best
53 | better
54 | between
55 | big
56 | both
57 | but
58 | by
59 | c
60 | came
61 | can
62 | cannot
63 | case
64 | cases
65 | certain
66 | certainly
67 | clear
68 | clearly
69 | come
70 | could
71 | d
72 | did
73 | differ
74 | different
75 | differently
76 | do
77 | does
78 | done
79 | down
80 | downed
81 | downing
82 | downs
83 | during
84 | e
85 | each
86 | early
87 | either
88 | end
89 | ended
90 | ending
91 | ends
92 | enough
93 | even
94 | evenly
95 | ever
96 | every
97 | everybody
98 | everyone
99 | everything
100 | everywhere
101 | f
102 | face
103 | faces
104 | fact
105 | facts
106 | far
107 | felt
108 | few
109 | find
110 | finds
111 | first
112 | for
113 | four
114 | from
115 | full
116 | fully
117 | further
118 | furthered
119 | furthering
120 | furthers
121 | g
122 | gave
123 | general
124 | generally
125 | get
126 | gets
127 | give
128 | given
129 | gives
130 | go
131 | going
132 | good
133 | goods
134 | got
135 | great
136 | greater
137 | greatest
138 | group
139 | grouped
140 | grouping
141 | groups
142 | h
143 | had
144 | has
145 | have
146 | having
147 | he
148 | her
149 | herself
150 | here
151 | high
152 | higher
153 | highest
154 | him
155 | himself
156 | his
157 | how
158 | however
159 | i
160 | if
161 | important
162 | in
163 | interest
164 | interested
165 | interesting
166 | interests
167 | into
168 | is
169 | it
170 | its
171 | itself
172 | j
173 | just
174 | k
175 | keep
176 | keeps
177 | kind
178 | knew
179 | know
180 | known
181 | knows
182 | l
183 | large
184 | largely
185 | last
186 | later
187 | latest
188 | least
189 | less
190 | let
191 | lets
192 | like
193 | likely
194 | long
195 | longer
196 | longest
197 | m
198 | made
199 | make
200 | making
201 | man
202 | many
203 | may
204 | me
205 | member
206 | members
207 | men
208 | might
209 | more
210 | most
211 | mostly
212 | mr
213 | mrs
214 | much
215 | must
216 | my
217 | myself
218 | n
219 | necessary
220 | need
221 | needed
222 | needing
223 | needs
224 | never
225 | new
226 | newer
227 | newest
228 | next
229 | no
230 | non
231 | not
232 | nobody
233 | noone
234 | nothing
235 | now
236 | nowhere
237 | number
238 | numbered
239 | numbering
240 | numbers
241 | o
242 | of
243 | off
244 | often
245 | old
246 | older
247 | oldest
248 | on
249 | once
250 | one
251 | only
252 | open
253 | opened
254 | opening
255 | opens
256 | or
257 | order
258 | ordered
259 | ordering
260 | orders
261 | other
262 | others
263 | our
264 | out
265 | over
266 | p
267 | part
268 | parted
269 | parting
270 | parts
271 | per
272 | perhaps
273 | place
274 | places
275 | point
276 | pointed
277 | pointing
278 | points
279 | possible
280 | present
281 | presented
282 | presenting
283 | presents
284 | problem
285 | problems
286 | put
287 | puts
288 | q
289 | quite
290 | r
291 | rather
292 | really
293 | right
294 | room
295 | rooms
296 | s
297 | said
298 | same
299 | saw
300 | say
301 | says
302 | second
303 | seconds
304 | see
305 | seem
306 | seemed
307 | seeming
308 | seems
309 | sees
310 | several
311 | shall
312 | she
313 | should
314 | show
315 | showed
316 | showing
317 | shows
318 | side
319 | sides
320 | since
321 | small
322 | smaller
323 | smallest
324 | so
325 | some
326 | somebody
327 | someone
328 | something
329 | somewhere
330 | state
331 | states
332 | still
333 | such
334 | sure
335 | t
336 | take
337 | taken
338 | than
339 | that
340 | the
341 | their
342 | them
343 | then
344 | there
345 | therefore
346 | these
347 | they
348 | thing
349 | things
350 | think
351 | thinks
352 | this
353 | those
354 | though
355 | thought
356 | thoughts
357 | three
358 | through
359 | thus
360 | to
361 | today
362 | together
363 | too
364 | took
365 | toward
366 | turn
367 | turned
368 | turning
369 | turns
370 | two
371 | u
372 | under
373 | until
374 | up
375 | upon
376 | us
377 | use
378 | uses
379 | used
380 | v
381 | very
382 | w
383 | want
384 | wanted
385 | wanting
386 | wants
387 | was
388 | way
389 | ways
390 | we
391 | well
392 | wells
393 | went
394 | were
395 | what
396 | when
397 | where
398 | whether
399 | which
400 | while
401 | who
402 | whole
403 | whose
404 | why
405 | will
406 | with
407 | within
408 | without
409 | work
410 | worked
411 | working
412 | works
413 | would
414 | x
415 | y
416 | year
417 | years
418 | yet
419 | you
420 | young
421 | younger
422 | youngest
423 | your
424 | yours
425 | z
--------------------------------------------------------------------------------
/src/main/RakePunctDefaultStopList:
--------------------------------------------------------------------------------
1 | .
2 | /
3 | ,
4 | !
5 | ?
6 | {
7 | }
8 | [
9 | ]
10 | ;
11 | :
12 | (
13 | )
14 | -
15 | _
16 | @
--------------------------------------------------------------------------------
/src/main/SmartStopListEn:
--------------------------------------------------------------------------------
1 | a
2 | a's
3 | able
4 | about
5 | above
6 | according
7 | accordingly
8 | across
9 | actually
10 | after
11 | afterwards
12 | again
13 | against
14 | ain't
15 | all
16 | allow
17 | allows
18 | almost
19 | alone
20 | along
21 | already
22 | also
23 | although
24 | always
25 | am
26 | among
27 | amongst
28 | an
29 | and
30 | another
31 | any
32 | anybody
33 | anyhow
34 | anyone
35 | anything
36 | anyway
37 | anyways
38 | anywhere
39 | apart
40 | appear
41 | appreciate
42 | appropriate
43 | are
44 | aren't
45 | around
46 | as
47 | aside
48 | ask
49 | asking
50 | associated
51 | at
52 | available
53 | away
54 | awfully
55 | b
56 | be
57 | became
58 | because
59 | become
60 | becomes
61 | becoming
62 | been
63 | before
64 | beforehand
65 | behind
66 | being
67 | believe
68 | below
69 | beside
70 | besides
71 | best
72 | better
73 | between
74 | beyond
75 | both
76 | brief
77 | but
78 | by
79 | c
80 | c'mon
81 | c's
82 | came
83 | can
84 | can't
85 | cannot
86 | cant
87 | cause
88 | causes
89 | certain
90 | certainly
91 | changes
92 | clearly
93 | co
94 | com
95 | come
96 | comes
97 | concerning
98 | consequently
99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero
572 | true
573 | false
574 | additional
575 | shown
--------------------------------------------------------------------------------
/src/main/SpanishCustomEs:
--------------------------------------------------------------------------------
1 | algún
2 | alguna
3 | algunas
4 | alguno
5 | algunos
6 | ambos
7 | ampleamos
8 | ante
9 | antes
10 | aquel
11 | aquellas
12 | aquellos
13 | aqui
14 | arriba
15 | atras
16 | bajo
17 | bastante
18 | bien
19 | cada
20 | cierta
21 | ciertas
22 | cierto
23 | ciertos
24 | como
25 | con
26 | conseguimos
27 | conseguir
28 | consigo
29 | consigue
30 | consiguen
31 | consigues
32 | cual
33 | cuando
34 | dentro
35 | desde
36 | donde
37 | dos
38 | el
39 | ellas
40 | ellos
41 | empleais
42 | emplean
43 | emplear
44 | empleas
45 | empleo
46 | en
47 | encima
48 | entonces
49 | entre
50 | era
51 | eramos
52 | eran
53 | eras
54 | eres
55 | es
56 | esta
57 | estaba
58 | estado
59 | estais
60 | estamos
61 | estan
62 | estoy
63 | fin
64 | fue
65 | fueron
66 | fui
67 | fuimos
68 | gueno
69 | ha
70 | hace
71 | haceis
72 | hacemos
73 | hacen
74 | hacer
75 | haces
76 | hago
77 | incluso
78 | intenta
79 | intentais
80 | intentamos
81 | intentan
82 | intentar
83 | intentas
84 | intento
85 | ir
86 | la
87 | largo
88 | las
89 | lo
90 | los
91 | mientras
92 | mio
93 | modo
94 | muchos
95 | muy
96 | nos
97 | nosotros
98 | otro
99 | para
100 | pero
101 | podeis
102 | podemos
103 | poder
104 | podria
105 | podriais
106 | podriamos
107 | podrian
108 | podrias
109 | por
110 | por qué
111 | porque
112 | primero
113 | puede
114 | pueden
115 | puedo
116 | quien
117 | sabe
118 | sabeis
119 | sabemos
120 | saben
121 | saber
122 | sabes
123 | ser
124 | si
125 | siendo
126 | sin
127 | sobre
128 | sois
129 | solamente
130 | solo
131 | somos
132 | soy
133 | su
134 | sus
135 | también
136 | teneis
137 | tenemos
138 | tener
139 | tengo
140 | tiempo
141 | tiene
142 | tienen
143 | todo
144 | trabaja
145 | trabajais
146 | trabajamos
147 | trabajan
148 | trabajar
149 | trabajas
150 | trabajo
151 | tras
152 | tuyo
153 | ultimo
154 | un
155 | una
156 | unas
157 | uno
158 | unos
159 | usa
160 | usais
161 | usamos
162 | usan
163 | usar
164 | usas
165 | uso
166 | va
167 | vais
168 | valor
169 | vamos
170 | van
171 | vaya
172 | verdad
173 | verdadera
174 | verdadero
175 | vosotras
176 | vosotros
177 | voy
178 | yo
179 |
--------------------------------------------------------------------------------
/src/main/java/edu/ehu/galan/rake/RakeAlgorithm.java:
--------------------------------------------------------------------------------
1 | package edu.ehu.galan.rake;
2 |
3 | /*
4 | * RakeAlgorithm.java
5 | * Copyright (C) 2014 Angel Conde, neuw84 at gmail dot com
6 | *
7 | * This program is free software; you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation; either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program; if not, write to the Free Software
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 | */
21 |
22 |
23 | import edu.ehu.galan.rake.model.AbstractAlgorithm;
24 | import edu.ehu.galan.rake.model.Document;
25 | import edu.ehu.galan.rake.model.Term;
26 | import java.io.IOException;
27 | import java.nio.charset.StandardCharsets;
28 | import java.nio.file.Files;
29 | import java.nio.file.Paths;
30 | import java.util.ArrayList;
31 | import java.util.Arrays;
32 | import java.util.Comparator;
33 | import java.util.HashMap;
34 | import java.util.List;
35 | import java.util.Map;
36 | import java.util.regex.Matcher;
37 | import java.util.regex.Pattern;
38 | import static java.util.stream.Collectors.toList;
39 | import org.slf4j.Logger;
40 | import org.slf4j.LoggerFactory;
41 |
42 | /**
43 | *
44 | * An Implementation of the RAKE (Rapid Automatic Keyword Extraction)
45 | * Rose, Stuart, et al. "Automatic keyword extraction from individual
46 | * documents." Text Mining (2010): 1-20.
47 | *
48 | *
49 | * This implementation is based on JATE https://code.google.com/p/jatetoolkit/
50 | * and on https://github.com/aneesha/RAKE, it gives similar results as the
51 | * python script provided a good stopword list with a punctuation list
52 | *
53 | * The numbers have been taken into account using JATE method. The algorithm
54 | * expects that the puntuaction marks are separated within a whitespace.
55 | * " The red table , that is in front of you , is mine . "
56 | * To achieve this you should use a parser like OpenNLP, Illinois POS Tagger,
57 | * Freeling parsers etc.
58 | *
59 | *
60 | * TODO: use POS tags to avoid verbs and other unwanted type of words in the
61 | * process of keyword generation
62 | *
63 | * @author Angel Conde Manjon
64 | */
65 |
66 | public class RakeAlgorithm extends AbstractAlgorithm {
67 |
68 | private transient Document doc = null;
69 | private final transient List termList;
70 | private List stopWordList;
71 | transient private final Logger logger = LoggerFactory.getLogger(this.getClass());
72 | private List regexList = null;
73 | private List punctList;
74 | private int minNumberOfletters = 2;
75 |
76 | /**
77 | *
78 | */
79 | public RakeAlgorithm() {
80 | super(true, "RAKE");
81 | termList = super.getTermList();
82 | stopWordList = new ArrayList<>();
83 | regexList = new ArrayList<>();
84 | punctList = new ArrayList<>();
85 | }
86 |
87 | @Override
88 | public void init(Document pDoc, String pPropsDir) {
89 | setDoc(pDoc);
90 | doc = pDoc;
91 | }
92 |
93 | /**
94 | * This methods requires a list of stopwords to build a the candidate list,
95 | * will search in each different sentence for this stopwords to delimite the
96 | * candidate generation
97 | *
98 | *
99 | * @param pStopWords - a list of stopWords
100 | */
101 | public void loadStopWordsList(List pStopWords) {
102 | stopWordList = pStopWords;
103 | }
104 |
105 | /**
106 | * This method requires a list of stopwords to build a the candidate list,
107 | * will search in each different sentence for this stopwords to delimite the
108 | * candidate generation
109 | *
110 | *
111 | * @param pLoc - the location of the file where the stopwords are
112 | */
113 | public void loadStopWordsList(String pLoc) {
114 | List stops = new ArrayList<>();
115 | try {
116 | List words = Files.readAllLines(Paths.get(pLoc), StandardCharsets.UTF_8);
117 | for (String string : words) {
118 | stops.add(string.trim());
119 | }
120 | stopWordList = stops;
121 | } catch (IOException ex) {
122 | logger.error("Error loading RAKE stopWordList from: " + pLoc, ex);
123 | }
124 | }
125 |
126 | /**
127 | * As this method uses Regex for candidate generation, custom regex
128 | * expresions could be added using this method (uses Java Pattern/Matcher
129 | * mechanism)
130 | *
131 | * @param pat
132 | */
133 | public void addCustomRegex(Pattern pat) {
134 | regexList.add(pat);
135 | }
136 |
137 | private Pattern buildStopWordRegex(List pStopWords) {
138 | StringBuilder sb = new StringBuilder();
139 | for (String string : pStopWords) {
140 | sb.append("\\b").append(string.trim()).append("\\b").append("|");
141 | }
142 | String pattern = sb.substring(0, sb.length() - 1);
143 | Pattern pat = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE| Pattern.UNICODE_CASE);
144 | return pat;
145 | }
146 |
147 | /**
148 | * This method works better with a list of punctuation stop list, for
149 | * example for english, spanish and in general in latin based languages the
150 | * list could be (.,/{}[];:)
151 | *
152 | * @param pLoc - the location of the file where the stopwords are
153 | */
154 | public void loadPunctStopWord(String pLoc) {
155 | List stops = new ArrayList<>();
156 | try {
157 | List words = Files.readAllLines(Paths.get(pLoc), StandardCharsets.UTF_8);
158 | for (String string : words) {
159 | stops.add(string.trim());
160 | }
161 | punctList = stops;
162 | } catch (IOException ex) {
163 | logger.error("Error loading RAKE punctList from: " + pLoc, ex);
164 | }
165 | }
166 |
167 | /**
168 | * (OPTIONAL)This method works better with a list of punctuation stop list,
169 | * for example for english, spanish and in general in latin based languages
170 | * the list could be (.,/{}[];:)
171 | *
172 | * @param pPunt - the string list to be added
173 | */
174 | public void loadPunctStopWord(List pPunt) {
175 | punctList = pPunt;
176 |
177 | }
178 |
179 | private Pattern buildPunctStopWord(List pPunctStop) {
180 | StringBuilder sb = new StringBuilder();
181 | for (String string : pPunctStop) {
182 | sb.append("\\").append(string.trim()).append("|");
183 | }
184 | String pattern = sb.substring(0, sb.length() - 1);
185 | Pattern pat = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE |Pattern.UNICODE_CASE);
186 | return pat;
187 | }
188 |
189 | private List generateCandidateKeywords(List pSentenceList, List pStopWordPattern) {
190 | List candidates = new ArrayList<>();
191 | StringBuffer sb = new StringBuffer();
192 | for (String string : pSentenceList) {
193 | for (Pattern pat : pStopWordPattern) {
194 | Matcher matcher = pat.matcher(string.trim());
195 | while (matcher.find()) {
196 | matcher.appendReplacement(sb, "|");
197 | }
198 | matcher.appendTail(sb);
199 | if (sb.length() > 0) {
200 |
201 | string = sb.toString();
202 | }
203 | sb = new StringBuffer();
204 | }
205 | List cands = Arrays.asList(string.split("\\|"));
206 | for (String string1 : cands) {
207 | if (string1.trim().length() > 0) {
208 | String[] p = string1.trim().split("\\s+");
209 | if (string1.length() > 2 && p.length > 1 && !containsDigit(string1)) {
210 | candidates.add(string1.trim());
211 | }
212 | }
213 | }
214 | }
215 | return candidates;
216 | }
217 |
218 | @Override
219 | public void runAlgorithm() {
220 | if (stopWordList.isEmpty()) {
221 | logger.error("The method " + this.getName() + " requires a StopWordList to build the candidate list");
222 | } else {
223 | Map wordfreq = new HashMap<>();
224 | Map worddegree = new HashMap<>();
225 | Map wordscore = new HashMap<>();
226 | Pattern pat = buildStopWordRegex(stopWordList);
227 | regexList.add(pat);
228 | if (!punctList.isEmpty()) {
229 | Pattern pat2 = buildPunctStopWord(punctList);
230 | regexList.add(pat2);
231 | }
232 | List candidates = generateCandidateKeywords(doc.getSentenceList(), regexList);
233 | for (String phrase : candidates) {
234 | String[] wordlist = phrase.split("\\s+");
235 | int wordlistlength = wordlist.length;
236 | int wordlistdegree = wordlistlength - 1;
237 | for (String word : wordlist) {
238 | int freq;
239 | if (wordfreq.containsKey(word) == false) {
240 | wordfreq.put(word, 1);
241 | } else {
242 | freq = wordfreq.get(word) + 1;
243 | wordfreq.remove(word);
244 | wordfreq.put(word, freq);
245 | }
246 |
247 | if (worddegree.containsKey(word) == false) {
248 | worddegree.put(word, wordlistdegree);
249 | } else {
250 | int deg = worddegree.get(word) + wordlistdegree;
251 | worddegree.remove(word);
252 | worddegree.put(word, deg);
253 | }
254 | }
255 | }
256 | for (Map.Entry entry : worddegree.entrySet()) {
257 | entry.setValue(entry.getValue() + wordfreq.get(entry.getKey()));
258 | }
259 | List termLi = new ArrayList<>();
260 | for (Map.Entry entry : wordfreq.entrySet()) {
261 | wordscore.put(entry.getKey(), worddegree.get(entry.getKey()) / (wordfreq.get(entry.getKey()) * 1.0f));
262 | }
263 | for (String phrase : candidates) {
264 | String[] words = phrase.split("\\s+");
265 | float score = 0.0f;
266 | for (String word : words) {
267 | score += wordscore.get(word);
268 | }
269 | termLi.add(new Term(phrase, score));
270 | }
271 | Comparator super Term> sorter = (o1, o2) -> o1.getScore() > o2.getScore() ? -1 : o1.getScore() == o2.getScore() ? 0 : 1;
272 | List orderedList = termLi.parallelStream().sorted(sorter).distinct().collect(toList());
273 | doc.setTermList(orderedList);
274 |
275 | }
276 | }
277 |
278 | /**
279 | *
280 | * @return the doc
281 | */
282 | public Document getDoc() {
283 | return doc;
284 | }
285 |
286 | /**
287 | * @param doc the doc to set
288 | */
289 | public void setDoc(Document doc) {
290 | this.doc = doc; // keeps the reference as given; no copy is made
291 | }
292 |
293 |
294 |
295 | /**
296 | *
297 | * Returns the current minimum number of letters (default 2)
298 | *
299 | * @return the minNumberOfletters required for a word to be included
300 | */
301 | public int getMinNumberOfletters() {
302 |     return this.minNumberOfletters; // current value of the field
303 | }
304 |
305 | /**
306 | * Sets the minimum number of letters a word needs to be included (default 2)
307 | *
308 | * @param minNumberOfletters the minimum number of letters required for a
309 | * word to be included
310 | */
311 | public void setMinNumberOfletters(int minNumberOfletters) {
312 | this.minNumberOfletters = minNumberOfletters; // stored as given, no range check
313 | }
314 |
315 | private boolean containsDigit(String string) {
316 |     // Walk the characters by index and stop at the first digit found.
317 |     int index = 0;
318 |     while (index < string.length() && !Character.isDigit(string.charAt(index))) {
319 |         index++;
320 |     }
321 |     return index < string.length(); // stopped early only if a digit was hit
322 | }
323 | }
324 |
--------------------------------------------------------------------------------
/src/main/java/edu/ehu/galan/rake/model/AbstractAlgorithm.java:
--------------------------------------------------------------------------------
1 | package edu.ehu.galan.rake.model;
2 | /*
3 | * AbstractAlgorithm.java
4 | * Copyright (C) 2013 Angel Conde, neuw84 at gmail dot com
5 | *
6 | * This program is free software; you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation; either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program; if not, write to the Free Software
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 | */
20 |
21 | import com.google.gson.Gson;
22 | import java.io.File;
23 | import java.io.FileWriter;
24 | import java.io.IOException;
25 | import java.io.PrintWriter;
26 | import java.nio.charset.StandardCharsets;
27 | import java.nio.file.Files;
28 | import java.nio.file.Path;
29 | import java.nio.file.Paths;
30 | import java.util.ArrayList;
31 | import java.util.Comparator;
32 | import java.util.List;
33 | import java.util.Properties;
34 | import java.util.concurrent.Callable;
35 | import java.util.stream.Collectors;
36 | import org.slf4j.Logger;
37 | import org.slf4j.LoggerFactory;
38 |
39 | /**
40 | * An abstract class that represents an Algorithm for term extraction, all the
41 | * different extraction methods should extend this
42 | *
43 | * @author Angel Conde Manjon
44 | */
45 | public abstract class AbstractAlgorithm implements Callable<Integer> {
46 |
47 |     //TODO if we process a corpus instead of a document, the termList in each
48 |     //document is unusable; think about the model and refactor
49 |
50 |     private List<Term> termList;
51 |     private boolean scored;
52 |     private transient Document doc;
53 |     private String name;
54 |     private transient Properties properties = null;
55 |     transient final Logger logger = LoggerFactory.getLogger(AbstractAlgorithm.class);
56 |
57 |     /**
58 |      * Creates an algorithm with an empty term list.
59 |      *
60 |      * @param pScored - if the results of the algorithm will be scored
61 |      * @param pName - The name of the algorithm
62 |      */
63 |     public AbstractAlgorithm(boolean pScored, String pName) {
64 |         termList = new ArrayList<>();
65 |         scored = pScored;
66 |         name = pName;
67 |     }
68 |
69 |     /**
70 |      * Returns the term list.
71 |      *
72 |      * @return the list held by this algorithm (the same instance, not a copy)
73 |      */
74 |     public List<Term> getTermList() {
75 |         return termList;
76 |     }
77 |
78 |     /**
79 |      * Returns the terms whose score is strictly greater than the passed
80 |      * threshold.
81 |      *
82 |      * @param pThreshold exclusive lower bound for the score
83 |      * @return a new list with the matching terms, or {@code null} (after
84 |      * logging a warning) when this algorithm is not scored
85 |      */
86 |     public List<Term> getThresholdedTermList(float pThreshold) {
87 |         if (!isScored()) {
88 |             logger.warn("You can't get a thresholded list because this is not a scored algorithm");
89 |             return null;
90 |         }
91 |         return getTermList().stream()
92 |                 .filter((scoredTerm) -> scoredTerm.getScore() > pThreshold)
93 |                 .collect(Collectors.toList());
94 |     }
95 |
96 |     /**
97 |      * Applies a stopword list to the term list: a term is dropped when any of
98 |      * its whitespace-separated components equals (ignoring case) one of the
99 |      * stopwords, e.g. "solar system" is checked as "solar" and "system".
100 |      * Every remaining term is kept exactly once, in its original order.
101 |      *
102 |      * @param pStopwordList the stopwords to filter with
103 |      */
104 |     public final void applyStopwordList(List<String> pStopwordList) {
105 |         if (getTermList().isEmpty()) {
106 |             logger.info("The term list appears to be empty, have you ran the algorithm?");
107 |             return;
108 |         }
109 |         List<Term> kept = new ArrayList<>();
110 |         for (Term term : getTermList()) {
111 |             boolean stop = false;
112 |             for (String component : term.getTerm().split("\\s")) {
113 |                 if (matchesAny(component, pStopwordList)) {
114 |                     stop = true;
115 |                     break;
116 |                 }
117 |             }
118 |             // Decide once per term, after every component has been checked;
119 |             // deciding per stopword would add the same term several times.
120 |             if (!stop) {
121 |                 kept.add(term);
122 |             }
123 |         }
124 |         setTermList(kept);
125 |     }
126 |
127 |     /**
128 |      * Drops every term whose first whitespace-separated component equals
129 |      * (ignoring case) one of the given stopwords.
130 |      *
131 |      * @param pFirstTermStopWordList the stopwords to match against the first
132 |      * component
133 |      */
134 |     public final void firstTermStopWordList(List<String> pFirstTermStopWordList) {
135 |         filterByEdgeComponent(pFirstTermStopWordList, true);
136 |     }
137 |
138 |     /**
139 |      * Drops every term whose last whitespace-separated component equals
140 |      * (ignoring case) one of the given stopwords.
141 |      *
142 |      * @param pFirstTermStopWordList the stopwords to match against the last
143 |      * component
144 |      */
145 |     public void lastTermStopWordList(List<String> pFirstTermStopWordList) {
146 |         filterByEdgeComponent(pFirstTermStopWordList, false);
147 |     }
148 |
149 |     /**
150 |      * Shared implementation of the first/last component filters: keeps, once
151 |      * each and in order, the terms whose first (or last) component matches
152 |      * none of the stopwords.
153 |      *
154 |      * @param pStopwords the stopwords to match
155 |      * @param pFirst {@code true} to test the first component, {@code false}
156 |      * to test the last one
157 |      */
158 |     private void filterByEdgeComponent(List<String> pStopwords, boolean pFirst) {
159 |         if (getTermList().isEmpty()) {
160 |             logger.info("The term list appears to be empty, have you ran the algorithm?");
161 |             return;
162 |         }
163 |         List<Term> kept = new ArrayList<>();
164 |         for (Term term : getTermList()) {
165 |             String[] nGrams = term.getTerm().split("\\s");
166 |             // split() yields an empty array for whitespace-only text; such a
167 |             // term has no component that could match, so it is kept.
168 |             boolean stop = nGrams.length > 0
169 |                     && matchesAny(nGrams[pFirst ? 0 : nGrams.length - 1], pStopwords);
170 |             if (!stop) {
171 |                 kept.add(term);
172 |             }
173 |         }
174 |         setTermList(kept);
175 |     }
176 |
177 |     /**
178 |      * Tells whether the word equals, ignoring case, any of the stopwords.
179 |      */
180 |     private static boolean matchesAny(String pWord, List<String> pStopwords) {
181 |         for (String stopword : pStopwords) {
182 |             if (pWord.equalsIgnoreCase(stopword)) {
183 |                 return true;
184 |             }
185 |         }
186 |         return false;
187 |     }
188 |
189 |     /**
190 |      * Prints the algorithm results to the standard output, one term per line
191 |      * (followed by its score when the algorithm is scored).
192 |      */
193 |     public final void print() {
194 |         for (Term term : getTermList()) {
195 |             if (isScored()) {
196 |                 System.out.printf("%s \t %f%n", term.getTerm(), term.getScore());
197 |             } else {
198 |                 // No "%f" here: an unscored line has no second argument to format.
199 |                 System.out.printf("%s%n", term.getTerm());
200 |             }
201 |         }
202 |     }
203 |
204 |     /**
205 |      * The class that represents the action of running an algorithm in a corpus
206 |      * must be implemented
207 |      */
208 |     public abstract void runAlgorithm();
209 |
210 |     /**
211 |      * Runs {@link #runAlgorithm()}, so the algorithm can be submitted to a
212 |      * thread pool as a {@link Callable}.
213 |      *
214 |      * @return always 0
215 |      * @throws Exception if {@link #runAlgorithm()} fails
216 |      */
217 |     @Override
218 |     public final Integer call() throws Exception {
219 |         runAlgorithm();
220 |         return 0;
221 |     }
222 |
223 |     /**
224 |      * Sets the term list of this algorithm
225 |      *
226 |      * @param termList the termList to set
227 |      */
228 |     public final void setTermList(List<Term> termList) {
229 |         this.termList = termList;
230 |     }
231 |
232 |     /**
233 |      * returns whether this algorithm is scored
234 |      *
235 |      * @return the scored
236 |      */
237 |     public final boolean isScored() {
238 |         return scored;
239 |     }
240 |
241 |     /**
242 |      * Sets if this algorithm has scored results
243 |      *
244 |      * @param scored the scored to set
245 |      */
246 |     public final void setScored(boolean scored) {
247 |         this.scored = scored;
248 |     }
249 |
250 |     /**
251 |      * Returns the document assigned to this algorithm
252 |      *
253 |      * @return the document
254 |      */
255 |     public final Document getDocument() {
256 |         return doc;
257 |     }
258 |
259 |     /**
260 |      * The document that will be processed by the algorithm
261 |      *
262 |      * @param pDoc the document to set
263 |      */
264 |     public final void setDocument(Document pDoc) {
265 |         this.doc = pDoc;
266 |     }
267 |
268 |     /**
269 |      * Saves the current term list, one term per line and encoded as UTF-8, to
270 |      * the file "kpminer" in the working directory. Failures are logged, not
271 |      * thrown.
272 |      */
273 |     public void saveToTmp() {
274 |         try (PrintWriter out = new PrintWriter(
275 |                 Files.newBufferedWriter(Paths.get("kpminer"), StandardCharsets.UTF_8))) {
276 |             boolean first = true;
277 |             for (Term term : termList) {
278 |                 // Separate the terms with newlines without a leading empty line.
279 |                 out.printf(first ? "%s" : "\n%s", term);
280 |                 first = false;
281 |             }
282 |             // PrintWriter swallows write errors; surface them in the log.
283 |             if (out.checkError()) {
284 |                 logger.warn("couldn't save the algorithm results to temp directory");
285 |             }
286 |         } catch (IOException ex) {
287 |             logger.warn("couldn't save the algorithm results to temp directory", ex);
288 |         }
289 |     }
290 |
291 |     /**
292 |      * Returns this algorithm serialized with Gson; transient fields are not
293 |      * part of the output.
294 |      *
295 |      * @return - String with the contents of this algorithm in JSON format
296 |      * (name,scored,termlist)
297 |      */
298 |     public String toJson() {
299 |         Gson son = new Gson();
300 |         return son.toJson(this);
301 |     }
302 |
303 |     /**
304 |      * Replaces the term list with a copy sorted by the given comparator.
305 |      *
306 |      * @param comparator the ordering to apply
307 |      */
308 |     public void sort(Comparator<? super Term> comparator) {
309 |         termList = this.getTermList().stream().sorted(comparator).collect(Collectors.toList());
310 |     }
311 |
312 |     /**
313 |      * Saves the algorithm results in JSON format to the file "name.json"
314 |      * (name being {@link #getName()}) inside the directory given by the
315 |      * "tmpDir" property. When the properties or that entry are missing
316 |      * nothing is written and a warning is logged; I/O failures are logged too.
317 |      */
318 |     public void saveGsonToTmp() {
319 |         String tmpDir = properties == null ? null : properties.getProperty("tmpDir");
320 |         if (tmpDir == null) {
321 |             logger.warn("couldn't save the algorithm results in json format, the tmpDir property is not set");
322 |             return;
323 |         }
324 |         try (FileWriter outFile = new FileWriter(tmpDir + File.separator + this.getName() + ".json");
325 |                 PrintWriter out = new PrintWriter(outFile)) {
326 |             out.print(new Gson().toJson(this));
327 |         } catch (IOException ex) {
328 |             logger.warn("couldn't save the algorithm results to temp directory in json format", ex);
329 |         }
330 |     }
331 |
332 |     /**
333 |      * Reads a list of terms from a text file that contains the results of
334 |      * running an algorithm: one term per line, its score being the last
335 |      * whitespace-separated field, so both "term score" and a multi-word term
336 |      * followed by a tab and its score are accepted. Blank lines are skipped.
337 |      *
338 |      * @param pFile path of the UTF-8 encoded file to read
339 |      * @return the terms read, or {@code null} (after logging the error) when
340 |      * the file can't be read or a line is not in the required format
341 |      */
342 |     public List<Term> readCandidates(String pFile) {
343 |         //TODO improve the char recognition using YAGO char tools
344 |         if (pFile == null) {
345 |             logger.error("error while reading algorithm results, no file was given");
346 |             return null;
347 |         }
348 |         List<Term> list = new ArrayList<>();
349 |         try {
350 |             for (String raw : Files.readAllLines(Paths.get(pFile), StandardCharsets.UTF_8)) {
351 |                 String line = raw.trim();
352 |                 if (line.isEmpty()) {
353 |                     continue;
354 |                 }
355 |                 int split = lastWhitespaceIndex(line);
356 |                 if (split < 0) {
357 |                     logger.error("The file is not in the required format, line without score: {}", line);
358 |                     return null;
359 |                 }
360 |                 String candidate = line.substring(0, split).trim();
361 |                 float value = Float.parseFloat(line.substring(split + 1));
362 |                 list.add(new Term(candidate, value));
363 |             }
364 |             return list;
365 |         } catch (IOException ex) {
366 |             logger.error("error while reading algorithm results", ex);
367 |         } catch (NumberFormatException ex1) {
368 |             logger.error("The file is not in the required format", ex1);
369 |         }
370 |         return null;
371 |     }
372 |
373 |     /**
374 |      * Returns the index of the last whitespace character of the text, or -1
375 |      * when it contains none.
376 |      */
377 |     private static int lastWhitespaceIndex(String pText) {
378 |         for (int i = pText.length() - 1; i >= 0; i--) {
379 |             if (Character.isWhitespace(pText.charAt(i))) {
380 |                 return i;
381 |             }
382 |         }
383 |         return -1;
384 |     }
385 |
386 |     /**
387 |      * @return the name
388 |      */
389 |     public String getName() {
390 |         return name;
391 |     }
392 |
393 |     /**
394 |      * @param name the name to set
395 |      */
396 |     public void setName(String name) {
397 |         this.name = name;
398 |     }
399 |
400 |     /**
401 |      * Method for class initialization, initializes the document that will
402 |      * be processed for the given algorithm, and the directory where the program
403 |      * is executed (standalone vs web server differences...)
404 |      *
405 |      * @param pDoc
406 |      * @param pPropsDir
407 |      */
408 |     public abstract void init(Document pDoc, String pPropsDir);
409 |
410 |     /**
411 |      * Sets the properties; {@link #saveGsonToTmp()} reads "tmpDir" from them.
412 |      *
413 |      * @param pProps the properties to use
414 |      */
415 |     public void setProperties(Properties pProps) {
416 |         properties = pProps;
417 |     }
418 | }
419 |
--------------------------------------------------------------------------------
/src/main/java/edu/ehu/galan/rake/model/Document.java:
--------------------------------------------------------------------------------
1 | package edu.ehu.galan.rake.model;
2 |
3 | /*
4 | * Document.java
5 | * Copyright (C) 2014 Angel Conde, neuw84 at gmail dot com
6 | *
7 | * This program is free software; you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation; either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program; if not, write to the Free Software
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 | */
21 |
22 |
23 | import java.io.BufferedInputStream;
24 | import java.io.BufferedWriter;
25 | import java.io.File;
26 | import java.io.FileInputStream;
27 | import java.io.FileNotFoundException;
28 | import java.io.FileOutputStream;
29 | import java.io.IOException;
30 | import java.io.InputStreamReader;
31 | import java.io.OutputStreamWriter;
32 | import java.io.Reader;
33 | import java.io.StringWriter;
34 | import java.io.Writer;
35 | import java.nio.charset.Charset;
36 | import java.nio.charset.CharsetDecoder;
37 | import java.nio.charset.CodingErrorAction;
38 | import java.util.ArrayList;
39 | import java.util.LinkedList;
40 | import java.util.List;
41 | import org.apache.commons.io.FileUtils;
42 | import org.apache.commons.io.IOUtils;
43 | import org.slf4j.Logger;
44 | import org.slf4j.LoggerFactory;
45 |
46 | /**
47 | * A document represents the piece of a corpus containing text.
48 | *
49 | * @author Angel Conde Manjon
50 | */
51 | public class Document {
52 |
53 | private transient String path;
54 | private transient List sentenceList;
55 | private transient List> tokenList;
56 | private String name;
57 | private transient List termList;
58 | private transient static final Logger logger = LoggerFactory.getLogger(Document.class);
59 |
60 | /**
61 | *
62 | * @param pPath
63 | * @param pName
64 | */
65 | public Document(String pPath, String pName) {
66 | path = pPath;
67 | name = pName;
68 | termList = new ArrayList<>();
69 | }
70 |
71 | /**
72 | * @return the path
73 | */
74 | public String getPath() {
75 | return path;
76 | }
77 |
78 | /**
79 | * @param path the path to set
80 | */
81 | public void setPath(String path) {
82 | this.path = path;
83 | }
84 |
85 | /**
86 | * @return the sentenceList
87 | */
88 | public List getSentenceList() {
89 | return sentenceList;
90 | }
91 |
92 | /**
93 | * @param sentenceList the sentenceList to set
94 | */
95 | public void setSentenceList(List sentenceList) {
96 | this.sentenceList = sentenceList;
97 | }
98 |
99 | /**
100 | * @return the tokenList
101 | */
102 | public List> getTokenList() {
103 | return tokenList;
104 | }
105 |
106 | /**
107 | * @param tokenList the tokenList to set
108 | */
109 | public void List(List> tokenList) {
110 | this.tokenList = tokenList;
111 | }
112 |
113 | /**
114 | * @return the name
115 | */
116 | public String getName() {
117 | return name;
118 | }
119 |
120 | /**
121 | * @param name the name to set
122 | */
123 | public void setName(String name) {
124 | this.name = name;
125 | }
126 |
127 | /**
128 | *
129 | * @return
130 | */
131 | public List getTermList() {
132 | return termList;
133 | }
134 |
135 | /**
136 | * Tries to convert the content of this document to UTF-8 using java
137 | * CharsetDecoders
138 | */
139 | public void convertToUTF8() {
140 | FileInputStream istream = null;
141 | Writer out = null;
142 | try {
143 | istream = new FileInputStream(path);
144 | BufferedInputStream in = new BufferedInputStream(istream);
145 | CharsetDecoder charsetDecoder = Charset.forName("UTF-8").newDecoder();
146 | charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
147 | charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
148 | Reader inputReader = new InputStreamReader(in, charsetDecoder);
149 | StringWriter writer = new StringWriter();
150 | IOUtils.copy(inputReader, writer);
151 | String theString = writer.toString();
152 | FileUtils.deleteQuietly(new File(path));
153 | out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"));
154 | out.write(theString);
155 | out.close();
156 | // System.out.println("");
157 | } catch (FileNotFoundException ex) {
158 | logger.error("Error converting the file to utf8", ex);
159 | } catch (IOException ex) {
160 | logger.error("Error converting the file to utf8", ex);
161 | } finally {
162 | try {
163 | if (out != null) {
164 | out.close();
165 | }
166 | if (istream != null) {
167 | istream.close();
168 | }
169 | } catch (IOException ex) {
170 | logger.error("Error converting the file to utf8", ex);
171 | }
172 | }
173 |
174 | }
175 |
176 | /**
177 | * @param termList the termList to set
178 | */
179 | public void setTermList(List termList) {
180 | this.termList = termList;
181 | }
182 |
183 | }
184 |
--------------------------------------------------------------------------------
/src/main/java/edu/ehu/galan/rake/model/Term.java:
--------------------------------------------------------------------------------
1 | package edu.ehu.galan.rake.model;
2 |
3 | /*
4 | * Term.java
5 | * Copyright (C) 2013 Angel Conde, neuw84 at gmail dot com
6 | *
7 | * This program is free software; you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation; either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program; if not, write to the Free Software
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 | */
21 |
22 |
23 | import java.util.Objects;
24 |
25 | /**
26 | * A term represents a candidate produced by the term extraction methods; it needs
27 | * to pass a validation against a knowledge base before it is known to be a topic.
28 | * A term contains a String with the term's text and a score if the algorithm
29 | * used for extracting the term has one (if not, the score must be -1).
30 | *
31 | * @author Angel Conde Manjon
32 | */
33 |
34 | public class Term {
35 |
36 | private String term;
37 | private float score;
38 |
39 | /**
40 | *
41 | */
42 | public Term() {
43 |
44 | }
45 |
46 | /**
47 | *
48 | * @param pTerm
49 | */
50 |
51 |
52 | public Term(String pTerm) {
53 | term = pTerm;
54 | score = -1;
55 |
56 | }
57 |
58 | /**
59 | *
60 | * @param pTerm
61 | * @param pScore
62 | */
63 | public Term(String pTerm, float pScore) {
64 | term = pTerm;
65 | score = pScore;
66 | }
67 |
68 | /**
69 | * @return the extracted termterm
70 | */
71 | public String getTerm() {
72 | return term;
73 | }
74 |
75 | /**
76 | * @param term the term to set
77 | */
78 | public void setTerm(String term) {
79 | this.term = term;
80 | }
81 |
82 | /**
83 | * @return the score
84 | */
85 | public float getScore() {
86 | return score;
87 | }
88 |
89 | /**
90 | * @param score the score to set
91 | */
92 | public void setScore(float score) {
93 | this.score = score;
94 | }
95 |
96 | @Override
97 | public String toString() {
98 | return term + "\t" + score;
99 | }
100 |
101 |
102 | @Override
103 | public boolean equals(Object pObject) {
104 | if (pObject instanceof Term) {
105 | return this.term.equalsIgnoreCase(((Term) pObject).getTerm());
106 | } else {
107 | return false;
108 |
109 | }
110 | }
111 |
112 | @Override
113 | public int hashCode() {
114 | int hash = 7;
115 | hash = 97 * hash + Objects.hashCode(this.term);
116 | return hash;
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/src/main/java/edu/ehu/galan/rake/model/Token.java:
--------------------------------------------------------------------------------
1 |
2 | package edu.ehu.galan.rake.model;
3 | /*
4 | * Token.java
5 | * Copyright (C) 2013 Angel Conde, neuw84 at gmail dot com
6 | *
7 | * This program is free software; you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation; either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program; if not, write to the Free Software
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 | */
21 |
22 |
23 | /**
24 | * A token is a simple "word" containing the word form, POS tag, lemma, etc....
25 | *
26 | * @author Angel Conde Manjon
27 | */
28 |
29 | public class Token{
30 | private String wordForm;
31 | private String posTag;
32 | private String chunkerTag;
33 | private String lemma;
34 | private int pos; //position inside the sentence?
35 |
36 | /**
37 | *
38 | * @param pWordForm
39 | */
40 | public Token(String pWordForm){
41 | wordForm=pWordForm;
42 | }
43 |
44 | /**
45 | *
46 | * @param pWordForm
47 | * @param pPostag
48 | */
49 | public Token(String pWordForm,String pPostag){
50 | wordForm=pWordForm;
51 | posTag=pPostag;
52 | }
53 | /**
54 | *
55 | * @param pWordForm
56 | * @param pPostag
57 | * @param pLemma
58 | */
59 | public Token(String pWordForm,String pPostag,String pLemma){
60 | wordForm=pWordForm;
61 | posTag=pPostag;
62 | lemma=pLemma;
63 | }
64 |
65 | /**
66 | * @param pChunker
67 | * @param pWordForm
68 | * @param pPostag
69 | * @param pLemma
70 | */
71 | public Token(String pWordForm,String pPostag,String pLemma, String pChunker){
72 | wordForm=pWordForm;
73 | posTag=pPostag;
74 | lemma=pLemma;
75 | chunkerTag=pChunker;
76 | }
77 | /**
78 | * @return the wordForm
79 | */
80 | public String getWordForm() {
81 | return wordForm;
82 | }
83 |
84 | /**
85 | * @param wordForm the wordForm to set
86 | */
87 | public void setWordForm(String wordForm) {
88 | this.wordForm = wordForm;
89 | }
90 |
91 | /**
92 | * @return the posTag
93 | */
94 | public String getPosTag() {
95 | return posTag;
96 | }
97 |
98 | /**
99 | * @param posTag the posTag to set
100 | */
101 | public void setPosTag(String posTag) {
102 | this.posTag = posTag;
103 | }
104 |
105 | @Override
106 | public String toString(){
107 | return wordForm+ "\t" + posTag;
108 | }
109 |
110 | /**
111 | * @return the lemma
112 | */
113 | public String getLemma() {
114 | return lemma;
115 | }
116 |
117 | /**
118 | * @param lemma the lemma to set
119 | */
120 | public void setLemma(String lemma) {
121 | this.lemma = lemma;
122 | }
123 |
124 | /**
125 | * @return the chunkerTag
126 | */
127 | public String getChunkerTag() {
128 | return chunkerTag;
129 | }
130 |
131 | /**
132 | * @param chunkerTag the chunkerTag to set
133 | */
134 | public void setChunkerTag(String chunkerTag) {
135 | this.chunkerTag = chunkerTag;
136 | }
137 | }
138 |
--------------------------------------------------------------------------------
/src/test/java/edu/ehu/galan/rake/AppTest.java:
--------------------------------------------------------------------------------
1 | package edu.ehu.galan.rake;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest extends TestCase {
11 |
12 |     /**
13 |      * Create the test case
14 |      *
15 |      * @param testName name of the test case
16 |      */
17 |     public AppTest(String testName) {
18 |         super(testName);
19 |     }
20 |
21 |     /**
22 |      * @return a suite built from this test class
23 |      */
24 |     public static Test suite() {
25 |         TestSuite allTests = new TestSuite(AppTest.class);
26 |         return allTests;
27 |     }
28 |
29 |     /**
30 |      * Placeholder test: asserts a constant, so it always passes.
31 |      */
32 |     public void testApp() {
33 |         final boolean alwaysTrue = true;
34 |         assertTrue(alwaysTrue);
35 |     }
36 | }
37 |
--------------------------------------------------------------------------------
/stopLists/FoxStopListEn:
--------------------------------------------------------------------------------
1 | a
2 | about
3 | above
4 | across
5 | after
6 | again
7 | against
8 | all
9 | almost
10 | alone
11 | along
12 | already
13 | also
14 | although
15 | always
16 | among
17 | an
18 | and
19 | another
20 | any
21 | anybody
22 | anyone
23 | anything
24 | anywhere
25 | are
26 | area
27 | areas
28 | around
29 | as
30 | ask
31 | asked
32 | asking
33 | asks
34 | at
35 | away
36 | b
37 | back
38 | backed
39 | backing
40 | backs
41 | be
42 | because
43 | became
44 | become
45 | becomes
46 | been
47 | before
48 | began
49 | behind
50 | being
51 | beings
52 | best
53 | better
54 | between
55 | big
56 | both
57 | but
58 | by
59 | c
60 | came
61 | can
62 | cannot
63 | case
64 | cases
65 | certain
66 | certainly
67 | clear
68 | clearly
69 | come
70 | could
71 | d
72 | did
73 | differ
74 | different
75 | differently
76 | do
77 | does
78 | done
79 | down
80 | downed
81 | downing
82 | downs
83 | during
84 | e
85 | each
86 | early
87 | either
88 | end
89 | ended
90 | ending
91 | ends
92 | enough
93 | even
94 | evenly
95 | ever
96 | every
97 | everybody
98 | everyone
99 | everything
100 | everywhere
101 | f
102 | face
103 | faces
104 | fact
105 | facts
106 | far
107 | felt
108 | few
109 | find
110 | finds
111 | first
112 | for
113 | four
114 | from
115 | full
116 | fully
117 | further
118 | furthered
119 | furthering
120 | furthers
121 | g
122 | gave
123 | general
124 | generally
125 | get
126 | gets
127 | give
128 | given
129 | gives
130 | go
131 | going
132 | good
133 | goods
134 | got
135 | great
136 | greater
137 | greatest
138 | group
139 | grouped
140 | grouping
141 | groups
142 | h
143 | had
144 | has
145 | have
146 | having
147 | he
148 | her
149 | herself
150 | here
151 | high
152 | higher
153 | highest
154 | him
155 | himself
156 | his
157 | how
158 | however
159 | i
160 | if
161 | important
162 | in
163 | interest
164 | interested
165 | interesting
166 | interests
167 | into
168 | is
169 | it
170 | its
171 | itself
172 | j
173 | just
174 | k
175 | keep
176 | keeps
177 | kind
178 | knew
179 | know
180 | known
181 | knows
182 | l
183 | large
184 | largely
185 | last
186 | later
187 | latest
188 | least
189 | less
190 | let
191 | lets
192 | like
193 | likely
194 | long
195 | longer
196 | longest
197 | m
198 | made
199 | make
200 | making
201 | man
202 | many
203 | may
204 | me
205 | member
206 | members
207 | men
208 | might
209 | more
210 | most
211 | mostly
212 | mr
213 | mrs
214 | much
215 | must
216 | my
217 | myself
218 | n
219 | necessary
220 | need
221 | needed
222 | needing
223 | needs
224 | never
225 | new
226 | newer
227 | newest
228 | next
229 | no
230 | non
231 | not
232 | nobody
233 | noone
234 | nothing
235 | now
236 | nowhere
237 | number
238 | numbered
239 | numbering
240 | numbers
241 | o
242 | of
243 | off
244 | often
245 | old
246 | older
247 | oldest
248 | on
249 | once
250 | one
251 | only
252 | open
253 | opened
254 | opening
255 | opens
256 | or
257 | order
258 | ordered
259 | ordering
260 | orders
261 | other
262 | others
263 | our
264 | out
265 | over
266 | p
267 | part
268 | parted
269 | parting
270 | parts
271 | per
272 | perhaps
273 | place
274 | places
275 | point
276 | pointed
277 | pointing
278 | points
279 | possible
280 | present
281 | presented
282 | presenting
283 | presents
284 | problem
285 | problems
286 | put
287 | puts
288 | q
289 | quite
290 | r
291 | rather
292 | really
293 | right
294 | room
295 | rooms
296 | s
297 | said
298 | same
299 | saw
300 | say
301 | says
302 | second
303 | seconds
304 | see
305 | seem
306 | seemed
307 | seeming
308 | seems
309 | sees
310 | several
311 | shall
312 | she
313 | should
314 | show
315 | showed
316 | showing
317 | shows
318 | side
319 | sides
320 | since
321 | small
322 | smaller
323 | smallest
324 | so
325 | some
326 | somebody
327 | someone
328 | something
329 | somewhere
330 | state
331 | states
332 | still
333 | such
334 | sure
335 | t
336 | take
337 | taken
338 | than
339 | that
340 | the
341 | their
342 | them
343 | then
344 | there
345 | therefore
346 | these
347 | they
348 | thing
349 | things
350 | think
351 | thinks
352 | this
353 | those
354 | though
355 | thought
356 | thoughts
357 | three
358 | through
359 | thus
360 | to
361 | today
362 | together
363 | too
364 | took
365 | toward
366 | turn
367 | turned
368 | turning
369 | turns
370 | two
371 | u
372 | under
373 | until
374 | up
375 | upon
376 | us
377 | use
378 | uses
379 | used
380 | v
381 | very
382 | w
383 | want
384 | wanted
385 | wanting
386 | wants
387 | was
388 | way
389 | ways
390 | we
391 | well
392 | wells
393 | went
394 | were
395 | what
396 | when
397 | where
398 | whether
399 | which
400 | while
401 | who
402 | whole
403 | whose
404 | why
405 | will
406 | with
407 | within
408 | without
409 | work
410 | worked
411 | working
412 | works
413 | would
414 | x
415 | y
416 | year
417 | years
418 | yet
419 | you
420 | young
421 | younger
422 | youngest
423 | your
424 | yours
425 | z
--------------------------------------------------------------------------------
/stopLists/RakePunctDefaultStopList:
--------------------------------------------------------------------------------
1 | .
2 | /
3 | ,
4 | !
5 | ?
6 | {
7 | }
8 | [
9 | ]
10 | ;
11 | :
12 | (
13 | )
14 | -
15 | _
16 | @
--------------------------------------------------------------------------------
/stopLists/SmartStopListEn:
--------------------------------------------------------------------------------
1 | a
2 | a's
3 | able
4 | about
5 | above
6 | according
7 | accordingly
8 | across
9 | actually
10 | after
11 | afterwards
12 | again
13 | against
14 | ain't
15 | all
16 | allow
17 | allows
18 | almost
19 | alone
20 | along
21 | already
22 | also
23 | although
24 | always
25 | am
26 | among
27 | amongst
28 | an
29 | and
30 | another
31 | any
32 | anybody
33 | anyhow
34 | anyone
35 | anything
36 | anyway
37 | anyways
38 | anywhere
39 | apart
40 | appear
41 | appreciate
42 | appropriate
43 | are
44 | aren't
45 | around
46 | as
47 | aside
48 | ask
49 | asking
50 | associated
51 | at
52 | available
53 | away
54 | awfully
55 | b
56 | be
57 | became
58 | because
59 | become
60 | becomes
61 | becoming
62 | been
63 | before
64 | beforehand
65 | behind
66 | being
67 | believe
68 | below
69 | beside
70 | besides
71 | best
72 | better
73 | between
74 | beyond
75 | both
76 | brief
77 | but
78 | by
79 | c
80 | c'mon
81 | c's
82 | came
83 | can
84 | can't
85 | cannot
86 | cant
87 | cause
88 | causes
89 | certain
90 | certainly
91 | changes
92 | clearly
93 | co
94 | com
95 | come
96 | comes
97 | concerning
98 | consequently
99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero
572 | true
573 | false
574 | additional
575 | shown
--------------------------------------------------------------------------------
/stopLists/SpanishCustomEs:
--------------------------------------------------------------------------------
1 | algún
2 | alguna
3 | algunas
4 | alguno
5 | algunos
6 | ambos
7 | ampleamos
8 | ante
9 | antes
10 | aquel
11 | aquellas
12 | aquellos
13 | aqui
14 | arriba
15 | atras
16 | bajo
17 | bastante
18 | bien
19 | cada
20 | cierta
21 | ciertas
22 | cierto
23 | ciertos
24 | como
25 | con
26 | conseguimos
27 | conseguir
28 | consigo
29 | consigue
30 | consiguen
31 | consigues
32 | cual
33 | cuando
34 | dentro
35 | desde
36 | donde
37 | dos
38 | el
39 | ellas
40 | ellos
41 | empleais
42 | emplean
43 | emplear
44 | empleas
45 | empleo
46 | en
47 | encima
48 | entonces
49 | entre
50 | era
51 | eramos
52 | eran
53 | eras
54 | eres
55 | es
56 | esta
57 | estaba
58 | estado
59 | estais
60 | estamos
61 | estan
62 | estoy
63 | fin
64 | fue
65 | fueron
66 | fui
67 | fuimos
68 | gueno
69 | ha
70 | hace
71 | haceis
72 | hacemos
73 | hacen
74 | hacer
75 | haces
76 | hago
77 | incluso
78 | intenta
79 | intentais
80 | intentamos
81 | intentan
82 | intentar
83 | intentas
84 | intento
85 | ir
86 | la
87 | largo
88 | las
89 | lo
90 | los
91 | mientras
92 | mio
93 | modo
94 | muchos
95 | muy
96 | nos
97 | nosotros
98 | otro
99 | para
100 | pero
101 | podeis
102 | podemos
103 | poder
104 | podria
105 | podriais
106 | podriamos
107 | podrian
108 | podrias
109 | por
110 | por qué
111 | porque
112 | primero
113 | puede
114 | pueden
115 | puedo
116 | quien
117 | sabe
118 | sabeis
119 | sabemos
120 | saben
121 | saber
122 | sabes
123 | ser
124 | si
125 | siendo
126 | sin
127 | sobre
128 | sois
129 | solamente
130 | solo
131 | somos
132 | soy
133 | su
134 | sus
135 | también
136 | teneis
137 | tenemos
138 | tener
139 | tengo
140 | tiempo
141 | tiene
142 | tienen
143 | todo
144 | trabaja
145 | trabajais
146 | trabajamos
147 | trabajan
148 | trabajar
149 | trabajas
150 | trabajo
151 | tras
152 | tuyo
153 | ultimo
154 | un
155 | una
156 | unas
157 | uno
158 | unos
159 | usa
160 | usais
161 | usamos
162 | usan
163 | usar
164 | usas
165 | uso
166 | va
167 | vais
168 | valor
169 | vamos
170 | van
171 | vaya
172 | verdad
173 | verdadera
174 | verdadero
175 | vosotras
176 | vosotros
177 | voy
178 | yo
179 |
--------------------------------------------------------------------------------