├── .babelrc
├── .gitignore
├── LICENCE
├── README.md
├── package.json
├── src
    └── index.js
└── test
    ├── data
        ├── fox_1989_stoplist.txt
        └── salton_1971_smartstoplist.txt
    └── rake.test.js


/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 |   "presets": ["es2015"],
3 |   "plugins": ["transform-async-to-generator", ["transform-runtime", { "polyfill": false, "regenerator": true }]]
4 | }
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | dist
3 | *.swp
4 | 


--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
 1 | (The MIT License)
 2 | 
 3 | Copyright (c) 2017 Mike Williamson <mike@korora.ca>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | 'Software'), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # RAKE: Rapid automatic keyword extraction
 2 | 
 3 | The goal of this library is to provide a well-tested JavaScript translation of the
 4 | [Python implementation](https://github.com/zelandiya/RAKE-tutorial).
 5 | 
 6 | Differences in regular expressions and stopword lists have a big impact on this algorithm, and
 7 | sticking close to the Python code makes it easy to compare the two implementations and confirm
 8 | that the results are in the ballpark.
 9 | 
10 | This algorithm is described in [Text Mining: Applications and
11 | Theory](https://www.amazon.ca/Text-Mining-Applications-Michael-Berry/dp/0470749822)
12 | and also in this [excellent blog
13 | post](https://www.airpair.com/nlp/keyword-extraction-tutorial) by Alyona
14 | Medelyan.
15 | 
16 | It operates using only the text you give it and produces surprisingly good
17 | results. There are likely [better results
18 | possible](http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/)
19 | but these mostly seem to involve a combination of Python, Machine Learning and
20 | a corpus of data.
21 | 
22 | The appeal of RAKE is of the "bang for the buck" variety.
23 | 
24 | Currently this library produces subtly different results than either the paper
25 | or the original Python implementation. While the results (especially the top
26 | scoring ones) line up nicely, these little deviations represent something to
27 | understand and resolve.
28 | 
29 | ## What's next
30 | 
31 | After hammering out differences in the results, plans are to focus on
32 | 
33 | * Fully embracing JS idioms (Promises/ES201X)
34 | * Explore ways to improve the results as described
35 |   [here](https://www.ijarcsse.com/docs/papers/Volume_6/5_May2016/V6I5-0392.pdf)
36 | * Options to control result format (number, result|result+rank, etc)
37 | * Include default stopword list.
38 | * Improve handling of special characters and italics
39 | * Deal with sentences that have been split over multiple lines (sentence now ends with -)
40 | 
41 | # Usage
42 | 
43 | ```javascript
44 | > var rake = require('../dist/index').default
45 | undefined
46 | > rake('Compatibility of systems of linear constraints over the set of natural numbers', 'test/data/salton_1971_smartstoplist.txt').then(console.log)
47 | { 'natural numbers': 4,
48 |   'linear constraints': 4,
49 |   set: 1,
50 |   systems: 1,
51 |   compatibility: 1 }
52 | ```
53 | 
54 | ## Stopword lists
55 | 
56 | The stopword list used by the Python version is [here](https://github.com/zelandiya/RAKE-tutorial/blob/master/SmartStoplist.txt).
57 | Its first line is a comment rather than a stopword, which may cause problems; consider removing that line before using the file.
58 | 
59 | Links to other stopword lists can be found [here](http://trialstravails.blogspot.ca/2014/04/fox-stop-words-list.html)
60 | 
61 | Any file with one word per line should be fine.
62 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "rapid-automated-keyword-extraction",
 3 |   "version": "1.0.0",
 4 |   "description": "A JavaScript implementation of the Rapid Automated Keyword Extraction (RAKE) algorithm",
 5 |   "main": "dist/index.js",
 6 |   "jsnext:main": "src/index.js",
 7 |   "scripts": {
 8 |     "test": "jest",
 9 |     "build": "babel --copy-files --out-dir dist/ src/"
10 |   },
11 |   "keywords": [
12 |     "keyword",
13 |     "extraction",
14 |     "rake"
15 |   ],
16 |   "author": "Mike Williamson",
17 |   "license": "MIT",
18 |   "devDependencies": {
19 |     "babel-jest": "^19.0.0",
20 |     "babel-plugin-transform-async-to-generator": "^6.22.0",
21 |     "babel-plugin-transform-runtime": "^6.23.0",
22 |     "babel-polyfill": "^6.23.0",
23 |     "babel-preset-es2015": "^6.22.0"
24 |   },
25 |   "dependencies": {
26 |     "fs-promise": "^2.0.0",
27 |     "lodash": "^4.17.4"
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
  1 | import fsp from 'fs-promise'
  2 | import { fromPairs, sortBy, toPairs } from 'lodash'
  3 | 
  4 | function isNumber(str) {
  5 |   return /\d/.test(str)
  6 | }
  7 | 
  8 | // TODO: smaller functions should be extracted from this
  9 | export function isAcceptable (phrase, minCharLength, maxWordsLength) {
 10 |   // a phrase must have a min length in characters
 11 |   if(phrase < minCharLength) {
 12 |     return false
 13 |   }
 14 |   // a phrase must have a max number of words
 15 |   let words = phrase.split(' ')
 16 |   if(words.length > maxWordsLength) {
 17 |     return false
 18 |   }
 19 | 
 20 |   let digits = 0
 21 |   let alpha = 0
 22 |   //is there a better way to do this?
 23 |   for(let i = 0; i < phrase.length; i++) {
 24 |     if(/\d/.test(phrase[i])) digits += 1
 25 |     if(/[a-zA-Z]/.test(phrase[i])) alpha += 1
 26 |   }
 27 | 
 28 |   // a phrase must have at least one alpha character
 29 |   if(alpha == 0) {
 30 |     return false
 31 |   }
 32 | 
 33 |   // a phrase must have more alpha than digits characters
 34 |   if(digits > alpha) {
 35 |     return false
 36 |   }
 37 | 
 38 |   return true
 39 | }
 40 | 
 41 | export function countOccurances (haystack, needle) {
 42 |   return haystack.reduce((n, value) => {
 43 |         return n + (value === needle)
 44 |   }, 0)
 45 | }
 46 | 
 47 | export function generateCandidateKeywordScores (phraseList, wordScore, minKeywordFrequency = 1) {
 48 | 
 49 |   let keywordCandidates = {}
 50 | 
 51 |   phraseList.forEach(phrase => {
 52 |     if(minKeywordFrequency > 1) {
 53 |       if(countOccurances(phraseList, phrase) < minKeywordFrequency) {
 54 | 	return
 55 |       }
 56 |     }
 57 |     phrase in keywordCandidates || (keywordCandidates[phrase] = 0)
 58 |     let wordList = separateWords(phrase, 0)
 59 |     let candidateScore = 0
 60 |     wordList.forEach(word => {
 61 |       candidateScore += wordScore[word]
 62 |       keywordCandidates[phrase] = candidateScore
 63 |     })
 64 |   })
 65 |   return keywordCandidates
 66 | }
 67 | 
 68 | export function separateWords (text, minWordReturnSize) {
 69 |   let wordDelimiters = /[^a-zA-Z0-9_\+\-/]/
 70 |   let words = []
 71 |   text.split(wordDelimiters).forEach(singleWord => {
 72 |     let currentWord = singleWord.trim().toLowerCase()
 73 |     //leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
 74 |     if(currentWord.length > minWordReturnSize && currentWord != '' && !isNumber(currentWord)) {
 75 |       words.push(currentWord)
 76 |     }
 77 |   })
 78 |   return words
 79 | }
 80 | 
 81 | export function calculateWordScores (phraseList) {
 82 |   let wordFrequency = {}
 83 |   let wordDegree = {}
 84 |   phraseList.forEach(phrase => {
 85 |     let wordList = separateWords(phrase, 0)
 86 |     let wordListLength = wordList.length
 87 |     let wordListDegree = wordListLength - 1
 88 |     wordList.forEach(word => {
 89 |       word in wordFrequency || (wordFrequency[word] = 0)
 90 |       wordFrequency[word] += 1
 91 |       word in wordDegree || (wordDegree[word] = 0)
 92 |       wordDegree[word] += wordListDegree
 93 |     })
 94 |   })
 95 | 
 96 |   Object.keys(wordFrequency).forEach(item => {
 97 |     wordDegree[item] = wordDegree[item] + wordFrequency[item]
 98 |   })
 99 | 
100 |   // Calculate Word scores = deg(w)/frew(w)
101 |   let wordScore = {}
102 |   Object.keys(wordFrequency).forEach(item => {
103 |     item in wordScore || (wordScore[item] = 0)
104 |     wordScore[item] = wordDegree[item] / (wordFrequency[item] * 1.0)
105 |   })
106 | 
107 |   return wordScore
108 | }
109 | 
110 | 
/**
 * Turn sentences into candidate phrases by cutting them at stop words.
 *
 * Every stop-word match in a sentence is replaced by `|`, the sentence is
 * split on `|`, and each fragment is trimmed and lower-cased. Empty fragments
 * and fragments rejected by `isAcceptable` are dropped.
 *
 * @param {string[]} sentenceList - sentences to process
 * @param {RegExp} stopWordPattern - pattern matching stop words
 * @param {number} [minCharLength=1] - minimum phrase length in characters
 * @param {number} [maxWordsLength=5] - maximum number of words per phrase
 * @returns {string[]} candidate phrases in order of appearance (duplicates kept)
 */
export function generateCandidateKeywords (sentenceList, stopWordPattern, minCharLength = 1, maxWordsLength = 5) {
  const phraseList = []
  sentenceList.forEach(sentence => {
    sentence.replace(stopWordPattern, '|').split('|').forEach(fragment => {
      const phrase = fragment.trim().toLowerCase()
      if (phrase !== '' && isAcceptable(phrase, minCharLength, maxWordsLength)) {
        phraseList.push(phrase)
      }
    })
  })
  return phraseList
}
127 | 
/**
 * Build a case-insensitive, global regular expression that matches any stop
 * word from the given file as a whole word.
 *
 * @param {string} path - location of the stop-word file, passed to `loadStopWords`
 * @returns {Promise<RegExp>} resolves to the combined stop-word pattern; when
 *   the list has no usable words the pattern matches nothing
 */
export async function buildStopWordRegex (path) {
  const stopWordList = await loadStopWords(path)
  const stopWordRegexList = []
  stopWordList.forEach(rawWord => {
    // Drop stray whitespace (e.g. "\r" from CRLF files) so \b can match.
    const word = rawWord.trim()
    if (/\w+/.test(word)) {
      // Escape regex metacharacters so entries like "c++" are taken literally.
      const escaped = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
      // match only stop words surrounded by word boundaries (\b)
      stopWordRegexList.push(`\\b${escaped}\\b`)
    }
  })

  if (stopWordRegexList.length === 0) {
    // An empty pattern would match at every position of the text;
    // use a pattern that can never match instead.
    return /(?!)/ig
  }
  return new RegExp(stopWordRegexList.join('|'), 'ig')
}
141 | 
/**
 * Split text into sentence-like fragments.
 *
 * Delimiters are: square brackets, newline, tab, `.` `!` `?` `,` `;` `:`,
 * hyphen, double quote, parentheses, apostrophe, the right single quotation
 * mark (U+2019) and the en dash (U+2013). Empty fragments are kept.
 *
 * @param {string} text - the text to split
 * @returns {string[]} the fragments between delimiters
 */
export function splitSentences (text) {
  // The hyphen is escaped so it is a literal delimiter. Written as `\\-\\`
  // it would form a backslash-to-backslash range and match `\` instead of `-`.
  const sentenceDelimiters = /[\[\]\n.!?,;:\t\-"()'\u2019\u2013]/
  return text.split(sentenceDelimiters)
}
146 | 
/**
 * Read a stop-word file and return its entries.
 *
 * The file is expected to hold one stop word per line. Both LF and CRLF line
 * endings are accepted; surrounding whitespace is trimmed, and blank lines and
 * lines starting with `#` (comments) are skipped.
 *
 * @param {string} path - location of the stop-word file
 * @returns {Promise<string[]>} resolves to the stop words in file order
 */
export async function loadStopWords (path) {
  const contents = await fsp.readFile(path, {encoding: 'utf8'})

  return contents
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line !== '' && line.charAt(0) !== '#')
}
153 | 
/**
 * Extract keywords from text with the RAKE pipeline: build the stop-word
 * pattern, split the text into sentences, cut the sentences into candidate
 * phrases, score the words, score the phrases, and order the phrases from
 * the highest score to the lowest.
 *
 * @param {string} text - the text to analyse
 * @param {string} stopWordsPath - path of the stop-word file
 * @param {number} [minCharLength=3] - minimum phrase length in characters
 * @param {number} [maxWordsLength=5] - maximum number of words per phrase
 * @param {number} [minKeywordFrequency=1] - minimum occurrences of a phrase
 * @returns {Promise<Object<string, number>>} phrase -> score, highest first
 */
export default async function rake (text, stopWordsPath, minCharLength=3, maxWordsLength=5, minKeywordFrequency=1) {
  const stopWordPattern = await buildStopWordRegex(stopWordsPath)
  const phrases = generateCandidateKeywords(splitSentences(text), stopWordPattern, minCharLength, maxWordsLength)
  const scoredPhrases = generateCandidateKeywordScores(phrases, calculateWordScores(phrases), minKeywordFrequency)

  // sortBy orders ascending by score; reversing puts the best phrases first.
  const ascending = sortBy(toPairs(scoredPhrases), ([, score]) => score)
  return fromPairs(ascending.reverse())
}
163 | 
164 | 


--------------------------------------------------------------------------------
/test/data/fox_1989_stoplist.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | about
  3 | above
  4 | across
  5 | after
  6 | again
  7 | against
  8 | all
  9 | almost
 10 | alone
 11 | along
 12 | already
 13 | also
 14 | although
 15 | always
 16 | among
 17 | an
 18 | and
 19 | another
 20 | any
 21 | anybody
 22 | anyone
 23 | anything
 24 | anywhere
 25 | are
 26 | area
 27 | areas
 28 | around
 29 | as
 30 | ask
 31 | asked
 32 | asking
 33 | asks
 34 | at
 35 | away
 36 | b
 37 | back
 38 | backed
 39 | backing
 40 | backs
 41 | be
 42 | because
 43 | become
 44 | becomes
 45 | became
 46 | been
 47 | before
 48 | began
 49 | behind
 50 | being
 51 | beings
 52 | best
 53 | better
 54 | between
 55 | big
 56 | both
 57 | but
 58 | by
 59 | c
 60 | came
 61 | can
 62 | cannot
 63 | case
 64 | cases
 65 | certain
 66 | certainly
 67 | clear
 68 | clearly
 69 | come
 70 | could
 71 | d
 72 | did
 73 | differ
 74 | different
 75 | differently
 76 | do
 77 | does
 78 | done
 79 | down
 80 | downed
 81 | downing
 82 | downs
 83 | during
 84 | e
 85 | each
 86 | early
 87 | either
 88 | end
 89 | ended
 90 | ending
 91 | ends
 92 | enough
 93 | even
 94 | evenly
 95 | ever
 96 | every
 97 | everybody
 98 | everyone
 99 | everything
100 | everywhere
101 | f
102 | face
103 | faces
104 | fact
105 | facts
106 | far
107 | felt
108 | few
109 | find
110 | finds
111 | first
112 | for
113 | four
114 | from
115 | full
116 | fully
117 | further
118 | furthered
119 | furthering
120 | furthers
121 | g
122 | gave
123 | general
124 | generally
125 | get
126 | gets
127 | give
128 | given
129 | gives
130 | go
131 | going
132 | good
133 | goods
134 | got
135 | great
136 | greater
137 | greatest
138 | group
139 | grouped
140 | grouping
141 | groups
142 | h
143 | had
144 | has
145 | have
146 | having
147 | he
148 | her
149 | herself
150 | here
151 | high
152 | higher
153 | highest
154 | him
155 | himself
156 | his
157 | how
158 | however
159 | i
160 | if
161 | important
162 | in
163 | interest
164 | interested
165 | interesting
166 | interests
167 | into
168 | is
169 | it
170 | its
171 | itself
172 | j
173 | just
174 | k
175 | keep
176 | keeps
177 | kind
178 | knew
179 | know
180 | known
181 | knows
182 | l
183 | large
184 | largely
185 | last
186 | later
187 | latest
188 | least
189 | less
190 | let
191 | lets
192 | like
193 | likely
194 | long
195 | longer
196 | longest
197 | m
198 | made
199 | make
200 | making
201 | man
202 | many
203 | may
204 | me
205 | member
206 | members
207 | men
208 | might
209 | more
210 | most
211 | mostly
212 | mr
213 | mrs
214 | much
215 | must
216 | my
217 | myself
218 | n
219 | necessary
220 | need
221 | needed
222 | needing
223 | needs
224 | never
225 | new
226 | newer
227 | newest
228 | next
229 | no
230 | non
231 | not
232 | nobody
233 | noone
234 | nothing
235 | now
236 | nowhere
237 | number
238 | numbers
239 | o
240 | of
241 | off
242 | often
243 | old
244 | older
245 | oldest
246 | on
247 | once
248 | one
249 | only
250 | open
251 | opened
252 | opening
253 | opens
254 | or
255 | order
256 | ordered
257 | ordering
258 | orders
259 | other
260 | others
261 | our
262 | out
263 | over
264 | p
265 | part
266 | parted
267 | parting
268 | parts
269 | per
270 | perhaps
271 | place
272 | places
273 | point
274 | pointed
275 | pointing
276 | points
277 | possible
278 | present
279 | presented
280 | presenting
281 | presents
282 | problem
283 | problems
284 | put
285 | puts
286 | q
287 | quite
288 | r
289 | rather
290 | really
291 | right
292 | room
293 | rooms
294 | s
295 | said
296 | same
297 | saw
298 | say
299 | says
300 | second
301 | seconds
302 | see
303 | sees
304 | seem
305 | seemed
306 | seeming
307 | seems
308 | several
309 | shall
310 | she
311 | should
312 | show
313 | showed
314 | showing
315 | shows
316 | side
317 | sides
318 | since
319 | small
320 | smaller
321 | smallest
322 | so
323 | some
324 | somebody
325 | someone
326 | something
327 | somewhere
328 | state
329 | states
330 | still
331 | such
332 | sure
333 | t
334 | take
335 | taken
336 | than
337 | that
338 | the
339 | their
340 | them
341 | then
342 | there
343 | therefore
344 | these
345 | they
346 | thing
347 | things
348 | think
349 | thinks
350 | this
351 | those
352 | though
353 | thought
354 | thoughts
355 | three
356 | through
357 | thus
358 | to
359 | today
360 | together
361 | too
362 | took
363 | toward
364 | turn
365 | turned
366 | turning
367 | turns
368 | two
369 | u
370 | under
371 | until
372 | up
373 | upon
374 | us
375 | use
376 | uses
377 | used
378 | v
379 | very
380 | w
381 | want
382 | wanted
383 | wanting
384 | wants
385 | was
386 | way
387 | ways
388 | we
389 | well
390 | wells
391 | went
392 | were
393 | what
394 | when
395 | where
396 | whether
397 | which
398 | while
399 | who
400 | whole
401 | whose
402 | why
403 | will
404 | with
405 | within
406 | without
407 | work
408 | worked
409 | working
410 | works
411 | would
412 | y
413 | year
414 | years
415 | yet
416 | you
417 | young
418 | younger
419 | youngest
420 | your
421 | yours
422 | 


--------------------------------------------------------------------------------
/test/data/salton_1971_smartstoplist.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | a's
  3 | able
  4 | about
  5 | above
  6 | according
  7 | accordingly
  8 | across
  9 | actually
 10 | after
 11 | afterwards
 12 | again
 13 | against
 14 | ain't
 15 | all
 16 | allow
 17 | allows
 18 | almost
 19 | alone
 20 | along
 21 | already
 22 | also
 23 | although
 24 | always
 25 | am
 26 | among
 27 | amongst
 28 | an
 29 | and
 30 | another
 31 | any
 32 | anybody
 33 | anyhow
 34 | anyone
 35 | anything
 36 | anyway
 37 | anyways
 38 | anywhere
 39 | apart
 40 | appear
 41 | appreciate
 42 | appropriate
 43 | are
 44 | aren't
 45 | around
 46 | as
 47 | aside
 48 | ask
 49 | asking
 50 | associated
 51 | at
 52 | available
 53 | away
 54 | awfully
 55 | b
 56 | be
 57 | became
 58 | because
 59 | become
 60 | becomes
 61 | becoming
 62 | been
 63 | before
 64 | beforehand
 65 | behind
 66 | being
 67 | believe
 68 | below
 69 | beside
 70 | besides
 71 | best
 72 | better
 73 | between
 74 | beyond
 75 | both
 76 | brief
 77 | but
 78 | by
 79 | c
 80 | c'mon
 81 | c's
 82 | came
 83 | can
 84 | can't
 85 | cannot
 86 | cant
 87 | cause
 88 | causes
 89 | certain
 90 | certainly
 91 | changes
 92 | clearly
 93 | co
 94 | com
 95 | come
 96 | comes
 97 | concerning
 98 | consequently
 99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero
572 | 


--------------------------------------------------------------------------------
/test/rake.test.js:
--------------------------------------------------------------------------------
  1 | import rake, {
  2 |   countOccurances,
  3 |   loadStopWords,
  4 |   separateWords,
  5 |   splitSentences,
  6 |   isAcceptable,
  7 |   buildStopWordRegex,
  8 |   generateCandidateKeywords,
  9 |   generateCandidateKeywordScores,
 10 |   calculateWordScores,
 11 | } from '../src/index'
 12 | 
 13 | 
 14 | // This is the text used in the paper
 15 | let text = `Compatibility of systems of linear constraints over the set of natural numbers.
 16 | 
 17 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.`
 18 | 
 19 | //let path = './test/data/salton_1971_smartstoplist.txt'
 20 | let path = './test/data/fox_1989_stoplist.txt'
 21 | //let path = './test/data/stop-words_english_6_en.txt'
 22 | 
 23 | describe('countOccurances', () => {
 24 | 
 25 |   it('counts the number of occurances within an array', () => {
 26 |     var dataset = [2,2,4,2,6,4,7,8]
 27 |     expect(countOccurances(dataset, 2)).toEqual(3)
 28 |   })
 29 | 
 30 | })
 31 | 
 32 | describe('rake', () => {
 33 | 
 34 |   it('can be imported', () => {
 35 |     expect(rake).toBeTruthy()
 36 |   })
 37 | 
 38 |   it.skip('Matches the python version', async () => {
 39 |     // TODO: explore why this test doesn't pass.
 40 |     // We are getting very similar results but not the same.
 41 |     // The python implementation uses the Salton smartstoplist:
 42 |     let saltonList = './test/data/salton_1971_smartstoplist.txt'
 43 |     let results = await rake(text, saltonList)
 44 |     expect(Object.keys(results)).toEqual([
 45 |       {'minimal generating sets': 8.666666666666666},
 46 |       {'linear diophantine equations': 8.5},
 47 |       {'minimal supporting set': 7.666666666666666},
 48 |       {'minimal set': 4.666666666666666},
 49 |       {'linear constraints': 4.5},
 50 |       {'upper bounds': 4.0},
 51 |       {'natural numbers': 4.0},
 52 |       {'nonstrict inequations': 4.0},
 53 |       {'strict inequations': 4.0},
 54 |       {'mixed types': 3.666666666666667},
 55 |       {'considered types': 3.166666666666667},
 56 |       {'set': 2.0},
 57 |       {'types': 1.6666666666666667},
 58 |       {'considered': 1.5},
 59 |       {'constructing': 1.0},
 60 |       {'solutions': 1.0},
 61 |       {'solving': 1.0},
 62 |       {'system': 1.0},
 63 |       {'compatibility': 1.0},
 64 |       {'systems': 1.0},
 65 |       {'criteria': 1.0},
 66 |       {'construction': 1.0},
 67 |       {'algorithms': 1.0},
 68 |       {'components': 1.0}
 69 |     ].map(el => Object.keys(el)[0])
 70 |     )
 71 |   })
 72 | 
 73 |   it.skip('produces the output from the paper', async () => {
 74 |     // This test likely can't pass at the same time as the "matching the python version"
 75 |     // It seems the original paper is using the Fox 1989 stoplist.
 76 |     // Just like the python test, this implementation generates slightly different results,
 77 |     // but with enough overlap to know that we are in the ballpark.
 78 |     let results = await rake(text, path)
 79 |     expect(Object.keys(results)).toEqual([
 80 |       "minimal generating sets",
 81 |       "linear diophantine equations",
 82 |       "minimal set",
 83 |       "minimal supporting set",
 84 |       "linear constraints",
 85 |       "natural numbers",
 86 |       "strict inequations",
 87 |       "nonstrict inequations",
 88 |       "upper bound",
 89 |       "corresponding algorithms",
 90 |       "considered types",
 91 |       "mixed types"
 92 |     ])
 93 |   })
 94 | 
 95 | })
 96 | 
 97 | describe('loadStopWords', () => {
 98 | 
 99 |   it('accepts a file path', async () => {
100 |     let [first, second, third, ...rest] = await loadStopWords(path)
101 |     expect(first).toEqual('a')
102 |     expect(second).toEqual('about')
103 |     expect(third).toEqual('above')
104 |   })
105 | 
106 | })
107 | 
// separateWords: the size argument is an exclusive lower bound on word length.
describe('separateWords', () => {
  it('returns all words greater than a given length', async () => {
    const minSize = 3
    const longWords = separateWords('a aa aaa aaaa aaaaa', minSize)

    expect(longWords).toEqual(['aaaa', 'aaaaa'])
  })
})
116 | 
// splitSentences: empty fragments are filtered out before counting.
describe('splitSentences', () => {
  it('splits the given text into an array of sentences', async () => {
    const nonEmpty = splitSentences(text).filter(fragment => fragment != '')

    expect(nonEmpty.length).toEqual(6)
  })
})
126 | 
// isAcceptable: phrase filters on length, word count and digit/letter balance.
describe('isAcceptable', () => {

  it('returns true for phrases longer than the minimum phrase length', async () => {
    const min = 1
    const max = 5
    const phrase = "criteria and the corresponding"
    const verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeTruthy()
  })

  // TODO: add a case asserting that a phrase shorter than the minimum is rejected.
  it('returns true for a phrase whose length equals the minimum phrase length', async () => {
    const min = 1
    const max = 5
    const phrase = "a"
    const verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeTruthy()
  })

  it('returns false for phrases longer than the maxWordsLength ', async () => {
    const min = 1
    const max = 2
    const phrase = "criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types"
    const verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeFalsy()
  })

  it('returns false for phrases with mostly digits', async () => {
    const min = 1
    // Allow all six words so the digit check, not the word limit, rejects the phrase.
    const max = 6
    const phrase = 'this 7777 is 7777 it 7777'
    const verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeFalsy()
  })

  it('returns false for phrases that are only digits', async () => {
    const min = 1
    const max = 5
    const phrase = '777'
    const verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeFalsy()
  })

})
170 | 
// buildStopWordRegex: the pattern built from the stop-word file.
describe('buildStopWordRegex', () => {

  it('builds a regex based on the stop words file', async () => {
    const stopWordPattern = await buildStopWordRegex(path)
    expect(stopWordPattern.toString()).toContain('|\\babout\\b|')
  })

  it('should not allow newlines to have crept into the regex |\\b\\b|', async () => {
    const stopWordPattern = await buildStopWordRegex(path)
    expect(stopWordPattern.toString()).not.toContain('|\\b\\b|')
  })

  it('produces a regex that replaces globally', async () => {
    const stopWordPattern = await buildStopWordRegex(path)
    const phrase = 'Compatibility of systems of linear constraints over the set of natural numbers'
    const modifiedText = phrase.replace(stopWordPattern, '|')
    // We are expecting more than one replacement value. The pipe is escaped:
    // an unescaped /|/ matches the empty string at every position.
    expect((modifiedText.match(/\|/g) || []).length).toBeGreaterThan(1)
  })

})
192 | 
// generateCandidateKeywords: sentences are cut into phrases at stop words.
describe('generateCandidateKeywords', () => {

  //TODO: The output from the function is not yet perfect.
  // The book says it should be something like:
  // Compatibility – systems – linear constraints – set – natural numbers – Criteria –
  // compatibility – system – linear Diophantine equations – strict inequations – nonstrict
  // inequations – Upper bounds – components – minimal set – solutions – algorithms –
  // minimal generating sets – solutions – systems – criteria – corresponding algorithms –
  // constructing – minimal supporting set – solving – systems – systems

  it('generates keywords from a list of sentences and a stopword list', async () => {
    const sentenceList = splitSentences(text)
    const stopWordPattern = await buildStopWordRegex(path)

    const candidateKeywords = generateCandidateKeywords(sentenceList, stopWordPattern)
    // toContain checks a single item, so each phrase gets its own assertion.
    expect(candidateKeywords).toContain("strict inequations")
    expect(candidateKeywords).toContain("nonstrict inequations")
  })

})
212 | 
// calculateWordScores: per-word scores computed from a list of phrases.
describe('calculateWordScores', () => {
  it('calculates the word score for phrases given a phrase list', async () => {
    const expectedScores = {"are": 4, "considered": 4, "inequations": 3, "nonstrict": 4, "strict": 2}

    const actualScores = calculateWordScores([
      "strict inequations",
      "nonstrict inequations are considered"
    ])

    expect(actualScores).toEqual(expectedScores)
  })
})
222 | 
// generateCandidateKeywordScores: a phrase scores the sum of its word scores.
describe('generateCandidateKeywordScores', () => {

  it('generates scores for candidate keywords', async () => {
    const phraseList = ["strict inequations", "nonstrict inequations are considered"]
    const wordScores = calculateWordScores(phraseList)
    const scores = generateCandidateKeywordScores(phraseList, wordScores, 1)
    expect(scores).toEqual({"nonstrict inequations are considered": 15, "strict inequations": 5})
  })

})
233 | 


--------------------------------------------------------------------------------