├── .babelrc ├── .gitignore ├── LICENCE ├── README.md ├── package.json ├── src └── index.js └── test ├── data ├── fox_1989_stoplist.txt └── salton_1971_smartstoplist.txt └── rake.test.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["es2015"], 3 | "plugins": ["transform-async-to-generator", ["transform-runtime", { "polyfill": false, "regenerator": true }]] 4 | } 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | *.swp 4 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | (The MIT License) 2 | 3 | Copyright (c) 2017 Mike Williamson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAKE: Rapid automatic keyword extraction 2 | 3 | The goal of this library was to create a well-tested JavaScript translation of the 4 | [Python implementation](https://github.com/zelandiya/RAKE-tutorial). 5 | 6 | Differences in regular expressions and stopword lists have a big impact on this algorithm, and 7 | sticking close to the Python code made it easy to compare the two and ensure 8 | that this version was in the ballpark. 9 | 10 | This algorithm is described in [Text Mining: Applications and 11 | Theory](https://www.amazon.ca/Text-Mining-Applications-Michael-Berry/dp/0470749822) 12 | and also in this [excellent blog 13 | post](https://www.airpair.com/nlp/keyword-extraction-tutorial) by Alyona 14 | Medelyan. 15 | 16 | It operates using only the text you give it and produces surprisingly good 17 | results. There are likely [better results 18 | possible](http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/) 19 | but these mostly seem to involve a combination of Python, Machine Learning and 20 | a corpus of data. 21 | 22 | The appeal of RAKE is of the "bang for the buck" variety. 23 | 24 | Currently this library produces subtly different results from either the paper 25 | or the original Python implementation. While the results (especially the top 26 | scoring ones) line up nicely, these little deviations represent something to 27 | understand and resolve. 28 | 29 | ## What's next 30 | 31 | After hammering out differences in the results, the plan is to focus on: 32 | 33 | * Fully embrace JS idioms (Promises/ES201X) 34 | * Explore ways to improve the results as described 35 | [here](https://www.ijarcsse.com/docs/papers/Volume_6/5_May2016/V6I5-0392.pdf) 36 | * Add options to control the result format (number, result|result+rank, etc) 37 | * Include a default stopword list. 
38 | * Improve handling of special characters and italics 39 | * Deal with sentences that have been split over multiple lines (the line currently ends with `-`) 40 | 41 | # Usage 42 | 43 | ```javascript 44 | > var rake = require('../dist/index').default 45 | undefined 46 | > rake('Compatibility of systems of linear constraints over the set of natural numbers', 'test/data/salton_1971_smartstoplist.txt').then(console.log) 47 | { 'natural numbers': 4, 48 | 'linear constraints': 4, 49 | set: 1, 50 | systems: 1, 51 | compatibility: 1 } 52 | ``` 53 | 54 | ## Stopword lists 55 | 56 | The stopword list used by the Python version is [here](https://github.com/zelandiya/RAKE-tutorial/blob/master/SmartStoplist.txt). 57 | Its first line is a comment rather than a stop word, so consider removing that line before using the file. 58 | 59 | Links to other stopword lists can be found [here](http://trialstravails.blogspot.ca/2014/04/fox-stop-words-list.html). 60 | 61 | Any file with one word per line should be fine. 62 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rapid-automated-keyword-extraction", 3 | "version": "1.0.0", 4 | "description": "A JavaScript implementation of the Rapid Automated Keyword Extraction (RAKE) algorithm", 5 | "main": "dist/index.js", 6 | "jsnext:main": "src/index.js", 7 | "scripts": { 8 | "test": "jest", 9 | "build": "babel --copy-files --out-dir dist/ src/" 10 | }, 11 | "keywords": [ 12 | "keyword", 13 | "extraction", 14 | "rake" 15 | ], 16 | "author": "Mike Williamson", 17 | "license": "MIT", 18 | "devDependencies": { 19 | "babel-jest": "^19.0.0", 20 | "babel-plugin-transform-async-to-generator": "^6.22.0", 21 | "babel-plugin-transform-runtime": "^6.23.0", 22 | "babel-polyfill": "^6.23.0", 23 | "babel-preset-es2015": "^6.22.0" 24 | }, 25 | "dependencies": { 26 | "fs-promise": "^2.0.0", 27 | "lodash": "^4.17.4" 28 | } 29 | } 30 | 
// --------------------------------------------------------------------------------
// /src/index.js:
// --------------------------------------------------------------------------------
//
// RAKE (Rapid Automatic Keyword Extraction).
//
// Pipeline: split the text into sentences, cut every sentence at stop words to
// obtain candidate phrases, score each word as degree(word) / frequency(word),
// then score each phrase as the sum of its word scores.

import { readFile } from 'fs'

/**
 * Escapes every character that has a special meaning in a regular expression,
 * so that the string can be embedded in a pattern and matched literally.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text that is safe to interpolate into a RegExp source.
 */
function escapeRegExp (str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

/**
 * Own-property test that ignores keys inherited from Object.prototype
 * (for example "constructor").
 *
 * @param {Object} obj
 * @param {string} key
 * @returns {boolean}
 */
function hasOwn (obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key)
}

/**
 * Defines `key` as a normal own property of `obj`. Unlike plain assignment
 * this also works for the key "__proto__", which would otherwise hit the
 * prototype setter instead of creating a property.
 *
 * @param {Object} obj
 * @param {string} key
 * @param {*} value
 */
function setOwn (obj, key, value) {
  Object.defineProperty(obj, key, {
    value,
    writable: true,
    enumerable: true,
    configurable: true
  })
}

/**
 * Promise wrapper around the callback based `fs.readFile`.
 *
 * @param {string} path - File to read.
 * @returns {Promise<string>} The file contents decoded as UTF-8.
 */
function readTextFile (path) {
  return new Promise((resolve, reject) => {
    readFile(path, { encoding: 'utf8' }, (err, contents) => {
      if (err) {
        reject(err)
      } else {
        resolve(contents)
      }
    })
  })
}

/**
 * Tells whether a word is an integer such as "42", "-7" or "+3".
 * Words that merely contain a digit ("es2015", "h2o") are not numbers.
 *
 * @param {string} str
 * @returns {boolean}
 */
function isNumber (str) {
  return /^[+-]?\d+$/.test(str)
}

/**
 * Decides whether a candidate phrase may be kept as a keyword.
 *
 * A phrase is rejected when it is shorter than `minCharLength` characters,
 * has more than `maxWordsLength` words, contains no ASCII letter, or contains
 * more digits than letters.
 *
 * @param {string} phrase - Candidate phrase.
 * @param {number} minCharLength - Minimum number of characters.
 * @param {number} maxWordsLength - Maximum number of words.
 * @returns {boolean} true when the phrase is acceptable.
 */
export function isAcceptable (phrase, minCharLength, maxWordsLength) {
  // Compare the *length* of the phrase; comparing the string itself with a
  // number is always false for textual phrases.
  if (phrase.length < minCharLength) {
    return false
  }

  // Split on runs of whitespace so repeated spaces do not count as words.
  const words = phrase.trim().split(/\s+/)
  if (words.length > maxWordsLength) {
    return false
  }

  const digits = (phrase.match(/\d/g) || []).length
  const alpha = (phrase.match(/[a-zA-Z]/g) || []).length

  // At least one letter, and never more digits than letters.
  return alpha > 0 && digits <= alpha
}

/**
 * Counts how many elements of `haystack` are strictly equal to `needle`.
 *
 * @param {Array} haystack
 * @param {*} needle
 * @returns {number}
 */
export function countOccurances (haystack, needle) {
  return haystack.reduce((count, value) => (value === needle ? count + 1 : count), 0)
}

/**
 * Scores every distinct phrase as the sum of the scores of its words.
 *
 * @param {string[]} phraseList - Candidate phrases (duplicates allowed).
 * @param {Object<string, number>} wordScore - Scores from calculateWordScores.
 *   Words without a score contribute 0.
 * @param {number} [minKeywordFrequency=1] - Phrases occurring fewer times
 *   than this are dropped.
 * @returns {Object<string, number>} Phrase -> score, in first-seen order.
 */
export function generateCandidateKeywordScores (phraseList, wordScore, minKeywordFrequency = 1) {
  // Count every phrase once instead of rescanning the whole list per phrase.
  const phraseCounts = new Map()
  phraseList.forEach(phrase => {
    phraseCounts.set(phrase, (phraseCounts.get(phrase) || 0) + 1)
  })

  const keywordCandidates = {}
  phraseCounts.forEach((count, phrase) => {
    if (minKeywordFrequency > 1 && count < minKeywordFrequency) {
      return
    }
    const score = separateWords(phrase, 0).reduce((sum, word) => {
      return sum + (hasOwn(wordScore, word) ? wordScore[word] : 0)
    }, 0)
    setOwn(keywordCandidates, phrase, score)
  })
  return keywordCandidates
}

/**
 * Splits text into lower-cased words.
 *
 * Numbers stay in their phrase but are not returned as words, because they
 * tend to distort the scores of the phrases that contain them.
 *
 * @param {string} text
 * @param {number} minWordReturnSize - Only words strictly longer than this
 *   are returned.
 * @returns {string[]}
 */
export function separateWords (text, minWordReturnSize) {
  return text
    .split(/[^a-zA-Z0-9_+\-\/]/)
    .map(piece => piece.trim().toLowerCase())
    .filter(word => word !== '' && word.length > minWordReturnSize && !isNumber(word))
}

/**
 * Computes score(word) = degree(word) / frequency(word).
 *
 * frequency is the number of occurrences of the word; degree is the number
 * of words it co-occurs with inside phrases, including itself.
 *
 * @param {string[]} phraseList
 * @returns {Object<string, number>} Word -> score.
 */
export function calculateWordScores (phraseList) {
  // Maps are used for counting so that words such as "constructor" cannot
  // collide with properties inherited from Object.prototype.
  const frequency = new Map()
  const degree = new Map()

  phraseList.forEach(phrase => {
    const words = separateWords(phrase, 0)
    const coOccurrences = words.length - 1
    words.forEach(word => {
      frequency.set(word, (frequency.get(word) || 0) + 1)
      degree.set(word, (degree.get(word) || 0) + coOccurrences)
    })
  })

  const wordScore = {}
  frequency.forEach((freq, word) => {
    // Adding the frequency makes every word co-occur with itself.
    setOwn(wordScore, word, (degree.get(word) + freq) / freq)
  })
  return wordScore
}

/**
 * Cuts every sentence at its stop words and keeps the acceptable fragments.
 *
 * @param {string[]} sentenceList
 * @param {RegExp} stopWordPattern - Global pattern matching any stop word.
 * @param {number} [minCharLength=1]
 * @param {number} [maxWordsLength=5]
 * @returns {string[]} Lower-cased candidate phrases, duplicates preserved.
 */
export function generateCandidateKeywords (sentenceList, stopWordPattern, minCharLength = 1, maxWordsLength = 5) {
  const phraseList = []
  sentenceList.forEach(sentence => {
    // Stop words become "|" markers, which are then used as cut points.
    sentence.replace(stopWordPattern, '|').split('|').forEach(fragment => {
      const phrase = fragment.trim().toLowerCase()
      if (phrase !== '' && isAcceptable(phrase, minCharLength, maxWordsLength)) {
        phraseList.push(phrase)
      }
    })
  })
  return phraseList
}

/**
 * Builds one case-insensitive, global regular expression that matches any
 * stop word from the given file as a whole word.
 *
 * @param {string} path - Stop word file, see loadStopWords.
 * @returns {Promise<RegExp>}
 */
export async function buildStopWordRegex (path) {
  const stopWordList = await loadStopWords(path)
  const alternatives = stopWordList
    .map(word => word.trim())
    .filter(word => /\w/.test(word))
    // Escape the word so entries such as "c++" are matched literally, and
    // only match stop words surrounded by word boundaries (\b).
    .map(word => `\\b${escapeRegExp(word)}\\b`)

  if (alternatives.length === 0) {
    // An empty pattern would match between all characters; use a pattern
    // that never matches instead.
    return /(?!)/gi
  }
  return new RegExp(alternatives.join('|'), 'ig')
}

/**
 * Splits text at sentence and clause delimiters: brackets, parentheses, line
 * breaks, tabs, quotes, hyphens, backslashes, the en dash and the punctuation
 * . ! ? , ; :
 *
 * @param {string} text
 * @returns {string[]} Fragments, possibly including empty strings.
 */
export function splitSentences (text) {
  return text.split(/[\[\]\n.!?,;:\t\\\-"()'\u2019\u2013]/)
}

/**
 * Reads a stop word file that holds one entry per line.
 *
 * Blank lines and lines starting with "#" are skipped; surrounding
 * whitespace (including the "\r" of Windows line endings) and a leading
 * byte order mark are removed.
 *
 * @param {string} path
 * @returns {Promise<string[]>}
 */
export async function loadStopWords (path) {
  const contents = await readTextFile(path)
  return contents
    .replace(/^\uFEFF/, '')
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line !== '' && line[0] !== '#')
}

/**
 * Extracts keywords from `text`.
 *
 * @param {string} text - Text to analyse.
 * @param {string} stopWordsPath - Path of the stop word file.
 * @param {number} [minCharLength=3] - Minimum characters per keyword.
 * @param {number} [maxWordsLength=5] - Maximum words per keyword.
 * @param {number} [minKeywordFrequency=1] - Minimum occurrences per keyword.
 * @returns {Promise<Object<string, number>>} Keyword -> score, highest score
 *   first. Among equal scores the keyword found later in the text comes first.
 */
export default async function rake (text, stopWordsPath, minCharLength = 3, maxWordsLength = 5, minKeywordFrequency = 1) {
  const stopWordPattern = await buildStopWordRegex(stopWordsPath)
  const sentenceList = splitSentences(text)
  const phraseList = generateCandidateKeywords(sentenceList, stopWordPattern, minCharLength, maxWordsLength)
  const wordScores = calculateWordScores(phraseList)
  const keywordCandidates = generateCandidateKeywordScores(phraseList, wordScores, minKeywordFrequency)

  // The explicit index tie-break keeps the order deterministic on engines
  // whose Array.prototype.sort is not stable.
  const ranked = Object.keys(keywordCandidates)
    .map((phrase, index) => ({ phrase, index, score: keywordCandidates[phrase] }))
    .sort((a, b) => (b.score - a.score) || (b.index - a.index))

  const sortedKeywords = {}
  ranked.forEach(({ phrase, score }) => {
    setOwn(sortedKeywords, phrase, score)
  })
  return sortedKeywords
}

/*
 * End of /src/index.js. The text below is the start of the next file of this
 * repository dump, kept verbatim; its remaining entries follow this comment.
 *
 * --------------------------------------------------------------------------------
 * /test/data/fox_1989_stoplist.txt:
 * --------------------------------------------------------------------------------
 * 1 | a 2 | about 3 | above 4 | across 5 | after 6 | again 7 | against 8 | all 9 | almost 10 | alone 11 | along 12 | already 13 | also 14 | although 15 | always 16 | among 17 | an 18 | and 19 | another 20 | any 21 | anybody 22 | anyone
 */
23 | anything 24 | anywhere 25 | are 26 | area 27 | areas 28 | around 29 | as 30 | ask 31 | asked 32 | asking 33 | asks 34 | at 35 | away 36 | b 37 | back 38 | backed 39 | backing 40 | backs 41 | be 42 | because 43 | become 44 | becomes 45 | became 46 | been 47 | before 48 | began 49 | behind 50 | being 51 | beings 52 | best 53 | better 54 | between 55 | big 56 | both 57 | but 58 | by 59 | c 60 | came 61 | can 62 | cannot 63 | case 64 | cases 65 | certain 66 | certainly 67 | clear 68 | clearly 69 | come 70 | could 71 | d 72 | did 73 | differ 74 | different 75 | differently 76 | do 77 | does 78 | done 79 | down 80 | downed 81 | downing 82 | downs 83 | during 84 | e 85 | each 86 | early 87 | either 88 | end 89 | ended 90 | ending 91 | ends 92 | enough 93 | even 94 | evenly 95 | ever 96 | every 97 | everybody 98 | everyone 99 | everything 100 | everywhere 101 | f 102 | face 103 | faces 104 | fact 105 | facts 106 | far 107 | felt 108 | few 109 | find 110 | finds 111 | first 112 | for 113 | four 114 | from 115 | full 116 | fully 117 | further 118 | furthered 119 | furthering 120 | furthers 121 | g 122 | gave 123 | general 124 | generally 125 | get 126 | gets 127 | give 128 | given 129 | gives 130 | go 131 | going 132 | good 133 | goods 134 | got 135 | great 136 | greater 137 | greatest 138 | group 139 | grouped 140 | grouping 141 | groups 142 | h 143 | had 144 | has 145 | have 146 | having 147 | he 148 | her 149 | herself 150 | here 151 | high 152 | higher 153 | highest 154 | him 155 | himself 156 | his 157 | how 158 | however 159 | i 160 | if 161 | important 162 | in 163 | interest 164 | interested 165 | interesting 166 | interests 167 | into 168 | is 169 | it 170 | its 171 | itself 172 | j 173 | just 174 | k 175 | keep 176 | keeps 177 | kind 178 | knew 179 | know 180 | known 181 | knows 182 | l 183 | large 184 | largely 185 | last 186 | later 187 | latest 188 | least 189 | less 190 | let 191 | lets 192 | like 193 | likely 194 | long 195 | longer 196 | longest 197 | m 
198 | made 199 | make 200 | making 201 | man 202 | many 203 | may 204 | me 205 | member 206 | members 207 | men 208 | might 209 | more 210 | most 211 | mostly 212 | mr 213 | mrs 214 | much 215 | must 216 | my 217 | myself 218 | n 219 | necessary 220 | need 221 | needed 222 | needing 223 | needs 224 | never 225 | new 226 | newer 227 | newest 228 | next 229 | no 230 | non 231 | not 232 | nobody 233 | noone 234 | nothing 235 | now 236 | nowhere 237 | number 238 | numbers 239 | o 240 | of 241 | off 242 | often 243 | old 244 | older 245 | oldest 246 | on 247 | once 248 | one 249 | only 250 | open 251 | opened 252 | opening 253 | opens 254 | or 255 | order 256 | ordered 257 | ordering 258 | orders 259 | other 260 | others 261 | our 262 | out 263 | over 264 | p 265 | part 266 | parted 267 | parting 268 | parts 269 | per 270 | perhaps 271 | place 272 | places 273 | point 274 | pointed 275 | pointing 276 | points 277 | possible 278 | present 279 | presented 280 | presenting 281 | presents 282 | problem 283 | problems 284 | put 285 | puts 286 | q 287 | quite 288 | r 289 | rather 290 | really 291 | right 292 | room 293 | rooms 294 | s 295 | said 296 | same 297 | saw 298 | say 299 | says 300 | second 301 | seconds 302 | see 303 | sees 304 | seem 305 | seemed 306 | seeming 307 | seems 308 | several 309 | shall 310 | she 311 | should 312 | show 313 | showed 314 | showing 315 | shows 316 | side 317 | sides 318 | since 319 | small 320 | smaller 321 | smallest 322 | so 323 | some 324 | somebody 325 | someone 326 | something 327 | somewhere 328 | state 329 | states 330 | still 331 | such 332 | sure 333 | t 334 | take 335 | taken 336 | than 337 | that 338 | the 339 | their 340 | them 341 | then 342 | there 343 | therefore 344 | these 345 | they 346 | thing 347 | things 348 | think 349 | thinks 350 | this 351 | those 352 | though 353 | thought 354 | thoughts 355 | three 356 | through 357 | thus 358 | to 359 | today 360 | together 361 | too 362 | took 363 | toward 364 | turn 365 | 
turned 366 | turning 367 | turns 368 | two 369 | u 370 | under 371 | until 372 | up 373 | upon 374 | us 375 | use 376 | uses 377 | used 378 | v 379 | very 380 | w 381 | want 382 | wanted 383 | wanting 384 | wants 385 | was 386 | way 387 | ways 388 | we 389 | well 390 | wells 391 | went 392 | were 393 | what 394 | when 395 | where 396 | whether 397 | which 398 | while 399 | who 400 | whole 401 | whose 402 | why 403 | will 404 | with 405 | within 406 | without 407 | work 408 | worked 409 | working 410 | works 411 | would 412 | y 413 | year 414 | years 415 | yet 416 | you 417 | young 418 | younger 419 | youngest 420 | your 421 | yours 422 | -------------------------------------------------------------------------------- /test/data/salton_1971_smartstoplist.txt: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | 
considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | 
likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 
/*
 * Tail of /test/data/salton_1971_smartstoplist.txt from this repository dump,
 * kept verbatim (it continues the entry numbered 428 just above this comment):
 *
 * | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero 572 |
 *
 * --------------------------------------------------------------------------------
 * /test/rake.test.js:
 * --------------------------------------------------------------------------------
 */
import rake, {
  countOccurances,
  loadStopWords,
  separateWords,
  splitSentences,
  isAcceptable,
  buildStopWordRegex,
  generateCandidateKeywords,
  generateCandidateKeywordScores,
  calculateWordScores,
} from '../src/index'


// This is the text used in the paper
let text = `Compatibility of systems of linear constraints over the set of natural numbers.

Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.`

// Stop word list used by most tests. Alternatives:
//let path = './test/data/salton_1971_smartstoplist.txt'
let path = './test/data/fox_1989_stoplist.txt'
//let path = './test/data/stop-words_english_6_en.txt'

describe('countOccurances', () => {

  it('counts the number of occurances within an array', () => {
    const dataset = [2,2,4,2,6,4,7,8]
    expect(countOccurances(dataset, 2)).toEqual(3)
  })

})

describe('rake', () => {

  it('can be imported', () => {
    expect(rake).toBeTruthy()
  })

  it.skip('Matches the python version', async () => {
    // TODO: explore why this test doesn't pass.
    // We are getting very similar results but not the same.
    // The python implementation uses the Salton smartstoplist:
    let saltonList = './test/data/salton_1971_smartstoplist.txt'
    let results = await rake(text, saltonList)
    // Only the order of the keywords is compared, not their scores.
    expect(Object.keys(results)).toEqual([
      {'minimal generating sets': 8.666666666666666},
      {'linear diophantine equations': 8.5},
      {'minimal supporting set': 7.666666666666666},
      {'minimal set': 4.666666666666666},
      {'linear constraints': 4.5},
      {'upper bounds': 4.0},
      {'natural numbers': 4.0},
      {'nonstrict inequations': 4.0},
      {'strict inequations': 4.0},
      {'mixed types': 3.666666666666667},
      {'considered types': 3.166666666666667},
      {'set': 2.0},
      {'types': 1.6666666666666667},
      {'considered': 1.5},
      {'constructing': 1.0},
      {'solutions': 1.0},
      {'solving': 1.0},
      {'system': 1.0},
      {'compatibility': 1.0},
      {'systems': 1.0},
      {'criteria': 1.0},
      {'construction': 1.0},
      {'algorithms': 1.0},
      {'components': 1.0}
    ].map(el => Object.keys(el)[0])
    )
  })

  it.skip('produces the output from the paper', async () => {
    // This test likely can't pass at the same time as the "matching the python version"
    // It seems the original paper is using the Fox 1989 stoplist.
    // Just like the python test, this implementation generates slightly different results,
    // but with enough overlap to know that we are in the ballpark.
    let results = await rake(text, path)
    expect(Object.keys(results)).toEqual([
      "minimal generating sets",
      "linear diophantine equations",
      "minimal set",
      "minimal supporting set",
      "linear constraints",
      "natural numbers",
      "strict inequations",
      "nonstrict inequations",
      "upper bound",
      "corresponding algorithms",
      "considered types",
      "mixed types"
    ])
  })

})

describe('loadStopWords', () => {

  it('accepts a file path', async () => {
    let [first, second, third] = await loadStopWords(path)
    expect(first).toEqual('a')
    expect(second).toEqual('about')
    expect(third).toEqual('above')
  })

})

describe('separateWords', () => {

  it('returns all words greater than a given length', () => {
    let words = separateWords('a aa aaa aaaa aaaaa', 3)
    expect(words).toEqual(['aaaa', 'aaaaa'])
  })

})

describe('splitSentences', () => {

  it('splits the given text into an array of sentences', () => {
    let sentences = splitSentences(text)
    let sentencesWithoutEmptyLines = sentences.filter(sentence => sentence != '')
    expect(sentencesWithoutEmptyLines.length).toEqual(6)
  })

})

describe('isAcceptable', () => {

  it('returns true for phrases longer than the minimum phrase length', () => {
    let min = 1
    let max = 5
    let phrase = "criteria and the corresponding"
    let verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeTruthy()
  })

  it('returns true for a phrase that is exactly the minimum phrase length', () => {
    let min = 1
    let max = 5
    let phrase = "a"
    let verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeTruthy()
  })

  it('returns false for phrases longer than the maxWordsLength ', () => {
    let min = 1
    let max = 2
    let phrase = "criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types"
    let verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeFalsy()
  })

  it('returns false for phrases with mostly digits', () => {
    let min = 1
    let max = 5
    let phrase = 'this 7777 is 7777 it 7777'
    let verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeFalsy()
  })

  it('returns false for phrases that are only digits', () => {
    let min = 1
    let max = 5
    let phrase = '777'
    let verdict = isAcceptable(phrase, min, max)
    expect(verdict).toBeFalsy()
  })

})

describe('buildStopWordRegex', () => {

  it('builds a regex based on the stop words file', async () => {
    let stopWordPattern = await buildStopWordRegex(path)
    expect(stopWordPattern.toString()).toContain('|\\babout\\b|')
  })

  it('should not allow newlines to have crept into the regex |\\b\\b|', async () => {
    let stopWordPattern = await buildStopWordRegex(path)
    expect(stopWordPattern.toString()).not.toContain('|\\b\\b|')
  })

  it('produces a regex that replaces globally', async () => {
    let stopWordPattern = await buildStopWordRegex(path)
    let modifiedText = text.replace(stopWordPattern, '|')
    // We are expecting more than one replacement value. The pipe has to be
    // escaped: an unescaped /|/ matches the empty string at every position.
    expect((modifiedText.match(/\|/g) || []).length).toBeGreaterThan(1)
  })

})

describe('generateCandidateKeywords', () => {

  //TODO: The output from the function is not yet perfect.
  // The book says it should be something like:
  // Compatibility – systems – linear constraints – set – natural numbers – Criteria –
  // compatibility – system – linear Diophantine equations – strict inequations – nonstrict
  // inequations – Upper bounds – components – minimal set – solutions – algorithms –
  // minimal generating sets – solutions – systems – criteria – corresponding algorithms –
  // constructing – minimal supporting set – solving – systems – systems

  it('generates keywords from a list of sentences and a stopword list', async () => {
    let sentenceList = splitSentences(text)
    let stopWordPattern = await buildStopWordRegex(path)

    let candidateKeywords = generateCandidateKeywords(sentenceList, stopWordPattern)
    // toContain checks a single item, so every phrase gets its own assertion.
    expect(candidateKeywords).toContain("strict inequations")
    expect(candidateKeywords).toContain("nonstrict inequations")
  })

})

describe('calculateWordScores', () => {

  it('calculates the word score for phrases given a phrase list', () => {
    let phraseList = ["strict inequations", "nonstrict inequations are considered"]
    let scores = calculateWordScores(phraseList)
    expect(scores).toEqual({"are": 4, "considered": 4, "inequations": 3, "nonstrict": 4, "strict": 2})
  })

})

describe('generateCandidateKeywordScores', () => {

  it('generates scores for candiate keywords', () => {
    let phraseList = ["strict inequations", "nonstrict inequations are considered"]
    let wordScores = calculateWordScores(phraseList)
    let scores = generateCandidateKeywordScores(phraseList, wordScores, 1)
    expect(scores).toEqual({"nonstrict inequations are considered": 15, "strict inequations": 5})
  })

})
// --------------------------------------------------------------------------------