├── .jshintrc ├── Gruntfile.js ├── README.md ├── index.js ├── package.json └── test ├── Automatic_Summarization-sents.json ├── Automatic_Summarization-tokens.json └── tr-test.js /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "node": true, 3 | "browser": false, 4 | "es5": true, 5 | "esnext": true, 6 | "bitwise": true, 7 | "camelcase": true, 8 | "curly": true, 9 | "eqeqeq": true, 10 | "immed": true, 11 | "indent": 4, 12 | "latedef": true, 13 | "newcap": true, 14 | "noarg": true, 15 | "quotmark": "single", 16 | "regexp": true, 17 | "undef": true, 18 | "unused": true, 19 | "strict": true, 20 | "trailing": true, 21 | "smarttabs": true, 22 | "white": false, 23 | "globals": { "window": false } 24 | } -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = function (grunt) { 4 | // load all grunt tasks 5 | require('matchdep').filterDev('grunt-*').forEach(grunt.loadNpmTasks); 6 | 7 | grunt.initConfig({ 8 | clean: { 9 | dist: ['.tmp', 'dist/*'], 10 | server: '.tmp' 11 | }, 12 | uglify: { 13 | my_target: { 14 | files: { 15 | 'dist/textrank.min.js': ['index.js'] 16 | } 17 | } 18 | }, 19 | jshint: { 20 | options: { 21 | jshintrc: '.jshintrc' 22 | }, 23 | all: [ 24 | 'lib/*.js' 25 | ] 26 | }, 27 | mochaTest: { 28 | test: { 29 | options: { 30 | reporter: 'spec' 31 | }, 32 | src: ['test/*.js'] 33 | } 34 | } 35 | }); 36 | 37 | grunt.registerTask('test', [ 38 | 'clean', 39 | 'mochaTest' 40 | ]); 41 | 42 | grunt.registerTask('build', [ 43 | 'clean:dist', 44 | 'uglify' 45 | ]); 46 | 47 | grunt.registerTask('default', [ 48 | 'jshint', 49 | 'test', 50 | 'build' 51 | ]); 52 | }; 53 | 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
textrank-js 2 | =========== 3 | 4 | TextRank is an algorithm for Text Summarization, by Rada Mihalcea & Paul Tarau. The code here is based on their paper "TextRank: Bringing Order into Texts". I've noticed that there are many implementations out there, but this one is intended to demonstrate the algorithm without any additional baggage. Also, unlike many other implementations I have seen, this has no algorithm dependencies and could also work in the browser. I wanted to show how elegant, simple and clean the algorithm is, so I kept it short -- about 130 lines of JavaScript (ES5). It currently depends only on lodash ('_'), a standard JS library used in many (most?) projects, for a few choice zingy one-liners. 5 | 6 | The algorithm itself can extend to any type of graph, as they note in their paper, but I have provided the two types of graphs explored in the paper: a keyword extraction graph (an undirected graph derived from collocation), and a sentence extraction graph using the similarity weighting (as described in the paper) on the edges. There is a function for building a graph of each type, and once the graph has been built, the textRank function performs the algorithm on the generated graph. 7 | 8 | Note that this code only implements the TextRank algorithm itself; the sentences must be properly formatted up front. I have provided example tokenization for both tasks in the `test` directory, both derived from tokenizing the Wikipedia entry for "Automatic summarization", both minimally processed using a custom (very minimal) tokenizer, and OpenNLP's default models for sentence splitting and POS, and converted to JSON. As long as you produce the format that this code expects, you should be able to use whatever library you want to preprocess. The keyword extraction builder needs the format to include POS tags since it filters the content while it is building its adjacencies. The sentence extraction builder does not require POS, but requires pre-split sentences. 
9 | 10 | The "tests" are not currently testing anything, but serve as demonstration code for how to run the software. Note that textRank() has a default number of iterations -- it doesn't try to test for convergence. This is just to keep it simple; it would be easy to modify it to test for convergence instead, but for now you can pass in any number you want if that default isn't suitable (see test examples). 11 | 12 | Build using Grunt: 13 | ``` 14 | $ npm install 15 | $ grunt 16 | 17 | ``` 18 | Here's what the output looks like for the first 5 extracted sentences when performing the sentence extraction task: 19 | 20 | ``` 21 | automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document 22 | 23 | two particular types of summarization often addressed in the literature are keyphrase extraction where the goal is to select individual words or phrases to tag a document and document summarization where the goal is to select whole sentences to create a short paragraph summary 24 | 25 | in general abstraction can condense a text more strongly than extraction but the programs that can do this are harder to develop as they require the use of natural language generation technology which itself is a growing field 26 | 27 | while some work has been done in abstractive summarization creating an abstract synopsis like that of a human the majority of summarization systems are extractive selecting a subset of sentences to place in a summary 28 | 29 | apart from fully automated summarizers fas there are systems that aid users with the task of summarization mahs = machine aided human summarization for example by highlighting candidate passages to be included in the summary and there are systems that depend on post-processing by a human hams = human aided machine summarization 30 | 31 | ``` 32 | You are welcome to use this code for whatever nefarious purposes, but please 
'use strict';

// Under CommonJS (Node) lodash is loaded through require(). In a browser a global `_`
// must already exist; the bare `var _;` declaration leaves such a global untouched.
var _;
if (typeof require === 'function' && typeof exports === 'object' && typeof module === 'object') {
    _ = require('lodash');
}

/**
 * Runs the TextRank iteration over a prebuilt graph and returns its vertices ranked by score.
 *
 * Scores start at random values and are updated in place for a fixed number of passes;
 * no convergence test is performed.
 *
 * @param {Array} V Graph as an array of vertices. Each vertex carries a `name` plus `out` and
 *     `in` arrays of `{index, weight}` edges, where `index` is the position of the
 *     neighbouring vertex in `V`. keyExGraph() and sentExGraph() build this shape.
 * @param {number} [niter] Number of passes over the graph. Falsy values select 200.
 * @param {number} [dampening] Damping factor d. Falsy values select 0.85.
 * @returns {Array} One `{name, vertex, score}` record per vertex, sorted by descending score.
 */
function textRank(V, niter, dampening) {

    var d = dampening || 0.85;
    var K = niter || 200;
    var denom = [];
    var ws = [];
    var inbound = [];

    // Sum of the weights of a list of edges.
    function totalWeight(edges) {
        var acc = 0.0;
        edges.forEach(function (edge) {
            acc += edge.weight;
        });
        return acc;
    }

    V.forEach(function (vertex, j) {
        denom[j] = totalWeight(vertex.out);
        ws[j] = {name: vertex.name, vertex: j, score: Math.random()};
    });

    // outTo[j][i] is the first edge j -> i listed in V[j].out.
    var outTo = V.map(function (vertex) {
        var byTarget = Object.create(null);
        vertex.out.forEach(function (edge) {
            if (!(edge.index in byTarget)) {
                byTarget[edge.index] = edge;
            }
        });
        return byTarget;
    });

    // The fraction of vertex j's score that flows along j -> i is the same on every pass,
    // so it is resolved once here instead of being looked up again in each iteration.
    V.forEach(function (vertex, i) {
        inbound[i] = [];
        vertex.in.forEach(function (inEdge) {
            var j = inEdge.index;
            var outEdge = outTo[j][i];
            // A vertex whose outgoing weights sum to zero (e.g. a sentence sharing no word
            // with any other) has nothing to hand out. Dividing by that sum would yield NaN
            // and poison every score in the graph, so such a vertex contributes nothing.
            if (outEdge && isFinite(denom[j]) && denom[j] !== 0) {
                inbound[i].push({from: j, share: outEdge.weight / denom[j]});
            }
        });
    });

    for (var k = 0; k < K; ++k) {
        for (var i = 0; i < V.length; ++i) {
            var acc = 0.0;
            var sources = inbound[i];
            for (var s = 0; s < sources.length; ++s) {
                acc += sources[s].share * ws[sources[s].from].score;
            }
            ws[i].score = (1 - d) + d * acc;
        }
    }

    ws.sort(function (x, y) {
        return y.score - x.score;
    });
    return ws;
}

/**
 * Builds the fully connected, symmetric sentence-similarity graph used for sentence
 * extraction.
 *
 * The weight between two sentences is the number of distinct tokens they share divided by
 * log(|s1|) + log(|s2|). When that normaliser is not a positive number (two one-word
 * sentences, or an empty sentence) the weight is 0.
 *
 * @param {Array} sentences Array of sentences, each an array of tokens.
 * @returns {Array} One `{name, out, in}` vertex per sentence, in input order, where `name`
 *     is the sentence's token array. A single sentence yields one vertex with no edges.
 */
function sentExGraph(sentences) {
    function sim(s1, s2) {
        var norm = Math.log(s1.length) + Math.log(s2.length);
        // norm is 0 for two one-word sentences and -Infinity when a sentence is empty;
        // either would produce NaN, Infinity or -0 weights.
        if (!(norm > 0)) {
            return 0;
        }
        return _.intersection(s1, s2).length / norm;
    }

    // Create every vertex up front so a sentence is represented even when there is no
    // other sentence to pair it with.
    var V = sentences.map(function (sentence) {
        return {name: sentence, out: [], in: []};
    });

    for (var i = 0; i < sentences.length; ++i) {
        for (var j = i + 1; j < sentences.length; ++j) {
            var score = sim(sentences[i], sentences[j]);
            // Symmetric
            V[i].out.push({index: j, weight: score});
            V[i].in.push({index: j, weight: score});
            V[j].in.push({index: i, weight: score});
            V[j].out.push({index: i, weight: score});
        }
    }
    return V;
}

/**
 * Builds the undirected, unweighted co-occurrence graph used for keyword extraction.
 *
 * Only tokens whose POS tag starts with 'N' or 'J', or equals 'ADJ' or 'CD', become
 * vertices. Two such tokens are linked when they lie at most floor(win / 2) positions apart
 * in the unfiltered token stream. Vertices are numbered in the order their terms first
 * appear in an edge.
 *
 * @param {Array} text Array of `{term, pos}` tokens.
 * @param {number} [win] Window size. Falsy values select 2, i.e. adjacent tokens.
 * @returns {Array} One `{me, name, out, in}` vertex per linked term; every edge has weight 1.
 */
function keyExGraph(text, win) {

    var V = [];
    var sz = text.length;
    // Whole number of positions to look either way. Flooring keeps odd window sizes from
    // producing fractional (undefined) token indices.
    var reach = Math.floor((win || 2) / 2);
    // Prototype-less maps, so terms such as 'constructor' or 'toString' are not mistaken
    // for entries that are already present.
    var term2idx = Object.create(null);
    var seenPairs = Object.create(null);
    var pairs = [];

    // True when the token passes the syntactic filter.
    function isCandidate(token) {
        return token.pos.match(/^[NJ]/) !== null || token.pos === 'ADJ' || token.pos === 'CD';
    }

    // Index of the vertex for `term`, creating the vertex on first use.
    function vertexIndex(term) {
        if (!(term in term2idx)) {
            term2idx[term] = V.length;
            V.push({me: V.length, name: term, out: [], in: []});
        }
        return term2idx[term];
    }

    for (var i = 0; i < sz; ++i) {
        var token = text[i];
        if (!isCandidate(token)) {
            continue;
        }
        var minWin = Math.max(0, i - reach);
        var maxWin = Math.min(sz, i + reach);
        for (var j = minWin; j < maxWin; ++j) {
            if (i === j) {
                continue;
            }
            var other = text[j];
            if (!isCandidate(other)) {
                continue;
            }
            var pair = [token.term, other.term];
            pair.sort();
            // The pair itself is stored, so no separator has to be split back out of the
            // key and terms may contain any characters.
            var key = JSON.stringify(pair);
            if (!seenPairs[key]) {
                seenPairs[key] = true;
                pairs.push(pair);
            }
        }
    }

    pairs.forEach(function (edge) {
        var a = vertexIndex(edge[0]);
        var b = vertexIndex(edge[1]);
        V[a].out.push({index: b, weight: 1});
        V[a].in.push({index: b, weight: 1});
        V[b].out.push({index: a, weight: 1});
        V[b].in.push({index: a, weight: 1});
    });

    return V;
}

if (typeof module !== 'undefined' && module.exports) {
    module.exports.textRank = textRank;
    module.exports.keyExGraph = keyExGraph;
    module.exports.sentExGraph = sentExGraph;
}
"version": "0.0.1", 5 | "description": "TextRank implementation in Javascript", 6 | "scripts": { 7 | "test": "mocha test --recursive" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/dpressel/textrank-js.git" 12 | }, 13 | "author": "Dan Pressel", 14 | "dependencies": { 15 | }, 16 | "devDependencies": { 17 | "lodash": "3.0.0", 18 | "chai": "~1.7.2", 19 | "should": "~3.1.3", 20 | "mocha": "~1.13.0", 21 | "matchdep": "~0.1.1", 22 | "grunt": "~0.4.0", 23 | "grunt-contrib-uglify": "~0.1.1", 24 | "grunt-contrib-jshint": "~0.1.1", 25 | "grunt-contrib-connect": "0.1.2", 26 | "grunt-contrib-clean": "0.4.0", 27 | "grunt-mocha-test": "~0.9.0" 28 | } 29 | } 30 | 31 | -------------------------------------------------------------------------------- /test/Automatic_Summarization-sents.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "automatic", 4 | "summarization", 5 | "is", 6 | "the", 7 | "process", 8 | "of", 9 | "reducing", 10 | "a", 11 | "text", 12 | "document", 13 | "with", 14 | "a", 15 | "computer", 16 | "program", 17 | "in", 18 | "order", 19 | "to", 20 | "create", 21 | "a", 22 | "summary", 23 | "that", 24 | "retains", 25 | "the", 26 | "most", 27 | "important", 28 | "points", 29 | "of", 30 | "the", 31 | "original", 32 | "document" 33 | ], 34 | [ 35 | "as", 36 | "the", 37 | "problem", 38 | "of", 39 | "information", 40 | "overload", 41 | "has", 42 | "grown", 43 | "and", 44 | "as", 45 | "the", 46 | "quantity", 47 | "of", 48 | "data", 49 | "has", 50 | "increased", 51 | "so", 52 | "has", 53 | "interest", 54 | "in", 55 | "automatic", 56 | "summarization" 57 | ], 58 | [ 59 | "technologies", 60 | "that", 61 | "can", 62 | "make", 63 | "a", 64 | "coherent", 65 | "summary", 66 | "take", 67 | "into", 68 | "account", 69 | "variables", 70 | "such", 71 | "as", 72 | "length", 73 | "writing", 74 | "style", 75 | "and", 76 | "syntax" 77 | ], 78 | [ 79 | "an", 80 | "example", 81 | "of", 82 | "the", 83 | "use", 
84 | "of", 85 | "summarization", 86 | "technology", 87 | "is", 88 | "search", 89 | "engines", 90 | "such", 91 | "as", 92 | "google" 93 | ], 94 | [ 95 | "document", 96 | "summarization", 97 | "is", 98 | "another" 99 | ], 100 | [ 101 | "generally", 102 | "there", 103 | "are", 104 | "two", 105 | "approaches", 106 | "to", 107 | "automatic", 108 | "summarization", 109 | ":", 110 | "extraction", 111 | "and", 112 | "abstraction" 113 | ], 114 | [ 115 | "extractive", 116 | "methods", 117 | "work", 118 | "by", 119 | "selecting", 120 | "a", 121 | "subset", 122 | "of", 123 | "existing", 124 | "words", 125 | "phrases", 126 | "or", 127 | "sentences", 128 | "in", 129 | "the", 130 | "original", 131 | "text", 132 | "to", 133 | "form", 134 | "the", 135 | "summary" 136 | ], 137 | [ 138 | "in", 139 | "contrast", 140 | "abstractive", 141 | "methods", 142 | "build", 143 | "an", 144 | "internal", 145 | "semantic", 146 | "representation", 147 | "and", 148 | "then", 149 | "use", 150 | "natural", 151 | "language", 152 | "generation", 153 | "techniques", 154 | "to", 155 | "create", 156 | "a", 157 | "summary", 158 | "that", 159 | "is", 160 | "closer", 161 | "to", 162 | "what", 163 | "a", 164 | "human", 165 | "might", 166 | "generate" 167 | ], 168 | [ 169 | "such", 170 | "a", 171 | "summary", 172 | "might", 173 | "contain", 174 | "words", 175 | "not", 176 | "explicitly", 177 | "present", 178 | "in", 179 | "the", 180 | "original" 181 | ], 182 | [ 183 | "research", 184 | "into", 185 | "abstractive", 186 | "methods", 187 | "is", 188 | "an", 189 | "increasingly", 190 | "important", 191 | "and", 192 | "active", 193 | "research", 194 | "area", 195 | "however", 196 | "due", 197 | "to", 198 | "complexity", 199 | "constraints", 200 | "research", 201 | "to", 202 | "date", 203 | "has", 204 | "focused", 205 | "primarily", 206 | "on", 207 | "extractive", 208 | "methods" 209 | ], 210 | [ 211 | "methods", 212 | "of", 213 | "automatic", 214 | "summarization", 215 | "include", 216 | "extraction-based", 217 | 
"abstraction-based", 218 | "maximum", 219 | "entropy-based", 220 | "and", 221 | "aided", 222 | "summarization" 223 | ], 224 | [ 225 | "two", 226 | "particular", 227 | "types", 228 | "of", 229 | "summarization", 230 | "often", 231 | "addressed", 232 | "in", 233 | "the", 234 | "literature", 235 | "are", 236 | "keyphrase", 237 | "extraction", 238 | "where", 239 | "the", 240 | "goal", 241 | "is", 242 | "to", 243 | "select", 244 | "individual", 245 | "words", 246 | "or", 247 | "phrases", 248 | "to", 249 | "tag", 250 | "a", 251 | "document", 252 | "and", 253 | "document", 254 | "summarization", 255 | "where", 256 | "the", 257 | "goal", 258 | "is", 259 | "to", 260 | "select", 261 | "whole", 262 | "sentences", 263 | "to", 264 | "create", 265 | "a", 266 | "short", 267 | "paragraph", 268 | "summary" 269 | ], 270 | [ 271 | "abstraction-based", 272 | "summarization", 273 | "extraction", 274 | "techniques", 275 | "merely", 276 | "copy", 277 | "the", 278 | "information", 279 | "deemed", 280 | "most", 281 | "important", 282 | "by", 283 | "the", 284 | "system", 285 | "to", 286 | "the", 287 | "summary", 288 | "for", 289 | "example", 290 | "key", 291 | "clauses", 292 | "sentences", 293 | "or", 294 | "paragraphs", 295 | "while", 296 | "abstraction", 297 | "involves", 298 | "paraphrasing", 299 | "sections", 300 | "of", 301 | "the", 302 | "source", 303 | "document" 304 | ], 305 | [ 306 | "in", 307 | "general", 308 | "abstraction", 309 | "can", 310 | "condense", 311 | "a", 312 | "text", 313 | "more", 314 | "strongly", 315 | "than", 316 | "extraction", 317 | "but", 318 | "the", 319 | "programs", 320 | "that", 321 | "can", 322 | "do", 323 | "this", 324 | "are", 325 | "harder", 326 | "to", 327 | "develop", 328 | "as", 329 | "they", 330 | "require", 331 | "the", 332 | "use", 333 | "of", 334 | "natural", 335 | "language", 336 | "generation", 337 | "technology", 338 | "which", 339 | "itself", 340 | "is", 341 | "a", 342 | "growing", 343 | "field" 344 | ], 345 | [ 346 | "while", 347 | "some", 
348 | "work", 349 | "has", 350 | "been", 351 | "done", 352 | "in", 353 | "abstractive", 354 | "summarization", 355 | "creating", 356 | "an", 357 | "abstract", 358 | "synopsis", 359 | "like", 360 | "that", 361 | "of", 362 | "a", 363 | "human", 364 | "the", 365 | "majority", 366 | "of", 367 | "summarization", 368 | "systems", 369 | "are", 370 | "extractive", 371 | "selecting", 372 | "a", 373 | "subset", 374 | "of", 375 | "sentences", 376 | "to", 377 | "place", 378 | "in", 379 | "a", 380 | "summary" 381 | ], 382 | [ 383 | "even", 384 | "though", 385 | "automating", 386 | "abstractive", 387 | "summarization", 388 | "is", 389 | "the", 390 | "goal", 391 | "of", 392 | "summarization", 393 | "research", 394 | "most", 395 | "practical", 396 | "systems", 397 | "are", 398 | "based", 399 | "on", 400 | "some", 401 | "form", 402 | "of", 403 | "extractive", 404 | "summarization" 405 | ], 406 | [ 407 | "extracted", 408 | "sentences", 409 | "can", 410 | "form", 411 | "a", 412 | "valid", 413 | "summary", 414 | "in", 415 | "themselves", 416 | "or", 417 | "form", 418 | "a", 419 | "basis", 420 | "for", 421 | "further", 422 | "condensation", 423 | "operations" 424 | ], 425 | [ 426 | "furthermore", 427 | "evaluation", 428 | "of", 429 | "extracted", 430 | "summaries", 431 | "can", 432 | "be", 433 | "automated", 434 | "since", 435 | "it", 436 | "is", 437 | "essentially", 438 | "a", 439 | "classification", 440 | "task" 441 | ], 442 | [ 443 | "during", 444 | "the", 445 | "duc", 446 | "2001", 447 | "and", 448 | "2002", 449 | "evaluation", 450 | "workshops", 451 | "tno", 452 | "developed", 453 | "a", 454 | "sentence", 455 | "extraction", 456 | "system", 457 | "for", 458 | "multi-document", 459 | "summarization", 460 | "in", 461 | "the", 462 | "news", 463 | "domain" 464 | ], 465 | [ 466 | "the", 467 | "system", 468 | "was", 469 | "based", 470 | "on", 471 | "a", 472 | "hybrid", 473 | "system", 474 | "using", 475 | "a", 476 | "naive", 477 | "bayes", 478 | "classifier", 479 | "and", 480 | 
"statistical", 481 | "language", 482 | "models", 483 | "for", 484 | "modeling", 485 | "salience" 486 | ], 487 | [ 488 | "although", 489 | "the", 490 | "system", 491 | "exhibited", 492 | "good", 493 | "results", 494 | "the", 495 | "researchers", 496 | "wanted", 497 | "to", 498 | "explore", 499 | "the", 500 | "effectiveness", 501 | "of", 502 | "a", 503 | "maximum", 504 | "entropy", 505 | "me", 506 | "classifier", 507 | "for", 508 | "the", 509 | "meeting", 510 | "summarization", 511 | "task", 512 | "as", 513 | "me", 514 | "is", 515 | "known", 516 | "to", 517 | "be", 518 | "robust", 519 | "against", 520 | "feature", 521 | "dependencies" 522 | ], 523 | [ 524 | "maximum", 525 | "entropy", 526 | "has", 527 | "also", 528 | "been", 529 | "applied", 530 | "successfully", 531 | "for", 532 | "summarization", 533 | "in", 534 | "the", 535 | "broadcast", 536 | "news", 537 | "domain" 538 | ], 539 | [ 540 | "machine", 541 | "learning", 542 | "techniques", 543 | "from", 544 | "closely", 545 | "related", 546 | "fields", 547 | "such", 548 | "as", 549 | "information", 550 | "retrieval", 551 | "or", 552 | "text", 553 | "mining", 554 | "have", 555 | "been", 556 | "successfully", 557 | "adapted", 558 | "to", 559 | "help", 560 | "automatic", 561 | "summarization" 562 | ], 563 | [ 564 | "apart", 565 | "from", 566 | "fully", 567 | "automated", 568 | "summarizers", 569 | "fas", 570 | "there", 571 | "are", 572 | "systems", 573 | "that", 574 | "aid", 575 | "users", 576 | "with", 577 | "the", 578 | "task", 579 | "of", 580 | "summarization", 581 | "mahs", 582 | "=", 583 | "machine", 584 | "aided", 585 | "human", 586 | "summarization", 587 | "for", 588 | "example", 589 | "by", 590 | "highlighting", 591 | "candidate", 592 | "passages", 593 | "to", 594 | "be", 595 | "included", 596 | "in", 597 | "the", 598 | "summary", 599 | "and", 600 | "there", 601 | "are", 602 | "systems", 603 | "that", 604 | "depend", 605 | "on", 606 | "post-processing", 607 | "by", 608 | "a", 609 | "human", 610 | "hams", 611 | 
"=", 612 | "human", 613 | "aided", 614 | "machine", 615 | "summarization" 616 | ], 617 | [ 618 | "there", 619 | "are", 620 | "different", 621 | "types", 622 | "of", 623 | "summaries", 624 | "depending", 625 | "what", 626 | "the", 627 | "summarization", 628 | "program", 629 | "focuses", 630 | "on", 631 | "to", 632 | "make", 633 | "the", 634 | "summary", 635 | "of", 636 | "the", 637 | "text", 638 | "for", 639 | "example", 640 | "generic", 641 | "summaries", 642 | "or", 643 | "query", 644 | "relevant", 645 | "summaries", 646 | "sometimes", 647 | "called", 648 | "query-based", 649 | "summaries" 650 | ], 651 | [ 652 | "summarization", 653 | "systems", 654 | "are", 655 | "able", 656 | "to", 657 | "create", 658 | "both", 659 | "query", 660 | "relevant", 661 | "text", 662 | "summaries", 663 | "and", 664 | "generic", 665 | "machine-generated", 666 | "summaries", 667 | "depending", 668 | "on", 669 | "what", 670 | "the", 671 | "user", 672 | "needs" 673 | ], 674 | [ 675 | "summarization", 676 | "of", 677 | "multimedia", 678 | "documents", 679 | "e" 680 | ], 681 | [ 682 | "pictures", 683 | "or", 684 | "movies", 685 | "is", 686 | "also", 687 | "possible" 688 | ], 689 | [ 690 | "some", 691 | "systems", 692 | "will", 693 | "generate", 694 | "a", 695 | "summary", 696 | "based", 697 | "on", 698 | "a", 699 | "single", 700 | "source", 701 | "document", 702 | "while", 703 | "others", 704 | "can", 705 | "use", 706 | "multiple", 707 | "source", 708 | "documents", 709 | "for", 710 | "example", 711 | "a", 712 | "cluster", 713 | "of", 714 | "news", 715 | "stories", 716 | "on", 717 | "the", 718 | "same", 719 | "topic" 720 | ], 721 | [ 722 | "these", 723 | "systems", 724 | "are", 725 | "known", 726 | "as", 727 | "multi-document", 728 | "summarization", 729 | "systems" 730 | ], 731 | [ 732 | "image", 733 | "collection", 734 | "summarization", 735 | "is", 736 | "other", 737 | "application", 738 | "example", 739 | "of", 740 | "automatic", 741 | "summarization" 742 | ], 743 | [ 744 | "it", 745 | 
"consists", 746 | "in", 747 | "selecting", 748 | "a", 749 | "representative", 750 | "set", 751 | "of", 752 | "images", 753 | "from", 754 | "a", 755 | "larger", 756 | "set", 757 | "of", 758 | "images" 759 | ], 760 | [ 761 | "a", 762 | "summary", 763 | "in", 764 | "this", 765 | "context", 766 | "is", 767 | "useful", 768 | "to", 769 | "show", 770 | "the", 771 | "most", 772 | "representative", 773 | "images", 774 | "of", 775 | "results", 776 | "in", 777 | "an", 778 | "image", 779 | "collection", 780 | "exploration", 781 | "system" 782 | ], 783 | [ 784 | "keyphrase", 785 | "extraction", 786 | ":", 787 | "task", 788 | "description", 789 | "and", 790 | "example" 791 | ], 792 | [ 793 | "the", 794 | "task", 795 | "is", 796 | "the", 797 | "following" 798 | ], 799 | [ 800 | "you", 801 | "are", 802 | "given", 803 | "a", 804 | "piece", 805 | "of", 806 | "text", 807 | "such", 808 | "as", 809 | "a", 810 | "journal", 811 | "article", 812 | "and", 813 | "you", 814 | "must", 815 | "produce", 816 | "a", 817 | "list", 818 | "of", 819 | "keywords", 820 | "or", 821 | "keyphrases", 822 | "that", 823 | "capture", 824 | "the", 825 | "primary", 826 | "topics", 827 | "discussed", 828 | "in", 829 | "the", 830 | "text" 831 | ], 832 | [ 833 | "in", 834 | "the", 835 | "case", 836 | "of", 837 | "research", 838 | "articles", 839 | "many", 840 | "authors", 841 | "provide", 842 | "manually", 843 | "assigned", 844 | "keywords", 845 | "but", 846 | "most", 847 | "text", 848 | "lacks", 849 | "pre-existing", 850 | "keyphrases" 851 | ], 852 | [ 853 | "for", 854 | "example", 855 | "news", 856 | "articles", 857 | "rarely", 858 | "have", 859 | "keyphrases", 860 | "attached", 861 | "but", 862 | "it", 863 | "would", 864 | "be", 865 | "useful", 866 | "to", 867 | "be", 868 | "able", 869 | "to", 870 | "automatically", 871 | "do", 872 | "so", 873 | "for", 874 | "a", 875 | "number", 876 | "of", 877 | "applications", 878 | "discussed", 879 | "below" 880 | ], 881 | [ 882 | "consider", 883 | "the", 884 | "example", 
885 | "text", 886 | "from", 887 | "a", 888 | "recent", 889 | "news", 890 | "article", 891 | ":", 892 | "the", 893 | "army", 894 | "corps", 895 | "of", 896 | "engineers", 897 | "rushing", 898 | "to", 899 | "meet", 900 | "president", 901 | "bush's", 902 | "promise", 903 | "to", 904 | "protect", 905 | "new", 906 | "orleans", 907 | "by", 908 | "the", 909 | "start", 910 | "of", 911 | "the", 912 | "2006", 913 | "hurricane", 914 | "season", 915 | "installed", 916 | "defective", 917 | "flood-control", 918 | "pumps", 919 | "last", 920 | "year", 921 | "despite", 922 | "warnings", 923 | "from", 924 | "its", 925 | "own", 926 | "expert", 927 | "that", 928 | "the", 929 | "equipment", 930 | "would", 931 | "fail", 932 | "during", 933 | "a", 934 | "storm", 935 | "according", 936 | "to", 937 | "documents", 938 | "obtained", 939 | "by", 940 | "the", 941 | "associated", 942 | "press" 943 | ], 944 | [ 945 | "an", 946 | "extractive", 947 | "keyphrase", 948 | "extractor", 949 | "might", 950 | "select", 951 | "army", 952 | "corps", 953 | "of", 954 | "engineers", 955 | "president", 956 | "bush", 957 | "new", 958 | "orleans", 959 | "and", 960 | "defective", 961 | "flood-control", 962 | "pumps", 963 | "as", 964 | "keyphrases" 965 | ], 966 | [ 967 | "these", 968 | "are", 969 | "pulled", 970 | "directly", 971 | "from", 972 | "the", 973 | "text" 974 | ], 975 | [ 976 | "in", 977 | "contrast", 978 | "an", 979 | "abstractive", 980 | "keyphrase", 981 | "system", 982 | "would", 983 | "somehow", 984 | "internalize", 985 | "the", 986 | "content", 987 | "and", 988 | "generate", 989 | "keyphrases", 990 | "that", 991 | "might", 992 | "be", 993 | "more", 994 | "descriptive", 995 | "and", 996 | "more", 997 | "like", 998 | "what", 999 | "a", 1000 | "human", 1001 | "would", 1002 | "produce", 1003 | "such", 1004 | "as", 1005 | "political", 1006 | "negligence", 1007 | "or", 1008 | "inadequate", 1009 | "protection", 1010 | "from", 1011 | "floods" 1012 | ], 1013 | [ 1014 | "note", 1015 | "that", 1016 | "these", 
1017 | "terms", 1018 | "do", 1019 | "not", 1020 | "appear", 1021 | "in", 1022 | "the", 1023 | "text", 1024 | "and", 1025 | "require", 1026 | "a", 1027 | "deep", 1028 | "understanding", 1029 | "which", 1030 | "makes", 1031 | "it", 1032 | "difficult", 1033 | "for", 1034 | "a", 1035 | "computer", 1036 | "to", 1037 | "produce", 1038 | "such", 1039 | "keyphrases" 1040 | ], 1041 | [ 1042 | "keyphrases", 1043 | "have", 1044 | "many", 1045 | "applications", 1046 | "such", 1047 | "as", 1048 | "to", 1049 | "improve", 1050 | "document", 1051 | "browsing", 1052 | "by", 1053 | "providing", 1054 | "a", 1055 | "short", 1056 | "summary" 1057 | ], 1058 | [ 1059 | "also", 1060 | "keyphrases", 1061 | "can", 1062 | "improve", 1063 | "information", 1064 | "retrieval", 1065 | "if", 1066 | "documents", 1067 | "have", 1068 | "keyphrases", 1069 | "assigned", 1070 | "a", 1071 | "user", 1072 | "could", 1073 | "search", 1074 | "by", 1075 | "keyphrase", 1076 | "to", 1077 | "produce", 1078 | "more", 1079 | "reliable", 1080 | "hits", 1081 | "than", 1082 | "a", 1083 | "full-text", 1084 | "search" 1085 | ], 1086 | [ 1087 | "also", 1088 | "automatic", 1089 | "keyphrase", 1090 | "extraction", 1091 | "can", 1092 | "be", 1093 | "useful", 1094 | "in", 1095 | "generating", 1096 | "index", 1097 | "entries", 1098 | "for", 1099 | "a", 1100 | "large", 1101 | "text", 1102 | "corpus" 1103 | ], 1104 | [ 1105 | "beginning", 1106 | "with", 1107 | "the", 1108 | "turney", 1109 | "paper", 1110 | "many", 1111 | "researchers", 1112 | "have", 1113 | "approached", 1114 | "keyphrase", 1115 | "extraction", 1116 | "as", 1117 | "a", 1118 | "supervised", 1119 | "machine", 1120 | "learning", 1121 | "problem" 1122 | ], 1123 | [ 1124 | "given", 1125 | "a", 1126 | "document", 1127 | "we", 1128 | "construct", 1129 | "an", 1130 | "example", 1131 | "for", 1132 | "each", 1133 | "unigram", 1134 | "bigram", 1135 | "and", 1136 | "trigram", 1137 | "found", 1138 | "in", 1139 | "the", 1140 | "text", 1141 | "though", 1142 | "other", 1143 
| "text", 1144 | "units", 1145 | "are", 1146 | "also", 1147 | "possible", 1148 | "as", 1149 | "discussed", 1150 | "below" 1151 | ], 1152 | [ 1153 | "we", 1154 | "then", 1155 | "compute", 1156 | "various", 1157 | "features", 1158 | "describing", 1159 | "each", 1160 | "example", 1161 | "e" 1162 | ], 1163 | [ 1164 | "does", 1165 | "the", 1166 | "phrase", 1167 | "begin", 1168 | "with", 1169 | "an", 1170 | "upper-case", 1171 | "letter" 1172 | ], 1173 | [ 1174 | "we", 1175 | "assume", 1176 | "there", 1177 | "are", 1178 | "known", 1179 | "keyphrases", 1180 | "available", 1181 | "for", 1182 | "a", 1183 | "set", 1184 | "of", 1185 | "training", 1186 | "documents" 1187 | ], 1188 | [ 1189 | "using", 1190 | "the", 1191 | "known", 1192 | "keyphrases", 1193 | "we", 1194 | "can", 1195 | "assign", 1196 | "positive", 1197 | "or", 1198 | "negative", 1199 | "labels", 1200 | "to", 1201 | "the", 1202 | "examples" 1203 | ], 1204 | [ 1205 | "then", 1206 | "we", 1207 | "learn", 1208 | "a", 1209 | "classifier", 1210 | "that", 1211 | "can", 1212 | "discriminate", 1213 | "between", 1214 | "positive", 1215 | "and", 1216 | "negative", 1217 | "examples", 1218 | "as", 1219 | "a", 1220 | "function", 1221 | "of", 1222 | "the", 1223 | "features" 1224 | ], 1225 | [ 1226 | "some", 1227 | "classifiers", 1228 | "make", 1229 | "a", 1230 | "binary", 1231 | "classification", 1232 | "for", 1233 | "a", 1234 | "test", 1235 | "example", 1236 | "while", 1237 | "others", 1238 | "assign", 1239 | "a", 1240 | "probability", 1241 | "of", 1242 | "being", 1243 | "a", 1244 | "keyphrase" 1245 | ], 1246 | [ 1247 | "for", 1248 | "instance", 1249 | "in", 1250 | "the", 1251 | "above", 1252 | "text", 1253 | "we", 1254 | "might", 1255 | "learn", 1256 | "a", 1257 | "rule", 1258 | "that", 1259 | "says", 1260 | "phrases", 1261 | "with", 1262 | "initial", 1263 | "capital", 1264 | "letters", 1265 | "are", 1266 | "likely", 1267 | "to", 1268 | "be", 1269 | "keyphrases" 1270 | ], 1271 | [ 1272 | "after", 1273 | "training", 1274 | 
"a", 1275 | "learner", 1276 | "we", 1277 | "can", 1278 | "select", 1279 | "keyphrases", 1280 | "for", 1281 | "test", 1282 | "documents", 1283 | "in", 1284 | "the", 1285 | "following", 1286 | "manner" 1287 | ], 1288 | [ 1289 | "we", 1290 | "apply", 1291 | "the", 1292 | "same", 1293 | "example-generation", 1294 | "strategy", 1295 | "to", 1296 | "the", 1297 | "test", 1298 | "documents", 1299 | "then", 1300 | "run", 1301 | "each", 1302 | "example", 1303 | "through", 1304 | "the", 1305 | "learner" 1306 | ], 1307 | [ 1308 | "we", 1309 | "can", 1310 | "determine", 1311 | "the", 1312 | "keyphrases", 1313 | "by", 1314 | "looking", 1315 | "at", 1316 | "binary", 1317 | "classification", 1318 | "decisions", 1319 | "or", 1320 | "probabilities", 1321 | "returned", 1322 | "from", 1323 | "our", 1324 | "learned", 1325 | "model" 1326 | ], 1327 | [ 1328 | "if", 1329 | "probabilities", 1330 | "are", 1331 | "given", 1332 | "a", 1333 | "threshold", 1334 | "is", 1335 | "used", 1336 | "to", 1337 | "select", 1338 | "the", 1339 | "keyphrases" 1340 | ], 1341 | [ 1342 | "keyphrase", 1343 | "extractors", 1344 | "are", 1345 | "generally", 1346 | "evaluated", 1347 | "using", 1348 | "precision", 1349 | "and", 1350 | "recall" 1351 | ], 1352 | [ 1353 | "precision", 1354 | "measures", 1355 | "how", 1356 | "many", 1357 | "of", 1358 | "the", 1359 | "proposed", 1360 | "keyphrases", 1361 | "are", 1362 | "actually", 1363 | "correct" 1364 | ], 1365 | [ 1366 | "recall", 1367 | "measures", 1368 | "how", 1369 | "many", 1370 | "of", 1371 | "the", 1372 | "true", 1373 | "keyphrases", 1374 | "your", 1375 | "system", 1376 | "proposed" 1377 | ], 1378 | [ 1379 | "the", 1380 | "two", 1381 | "measures", 1382 | "can", 1383 | "be", 1384 | "combined", 1385 | "in", 1386 | "an", 1387 | "f-score", 1388 | "which", 1389 | "is", 1390 | "the", 1391 | "harmonic", 1392 | "mean", 1393 | "of", 1394 | "the", 1395 | "two" 1396 | ], 1397 | [ 1398 | "matches", 1399 | "between", 1400 | "the", 1401 | "proposed", 1402 | "keyphrases", 
1403 | "and", 1404 | "the", 1405 | "known", 1406 | "keyphrases", 1407 | "can", 1408 | "be", 1409 | "checked", 1410 | "after", 1411 | "stemming", 1412 | "or", 1413 | "applying", 1414 | "some", 1415 | "other", 1416 | "text", 1417 | "normalization" 1418 | ], 1419 | [ 1420 | "designing", 1421 | "a", 1422 | "supervised", 1423 | "keyphrase", 1424 | "extraction", 1425 | "system", 1426 | "involves", 1427 | "deciding", 1428 | "on", 1429 | "several", 1430 | "choices", 1431 | "some", 1432 | "of", 1433 | "these", 1434 | "apply", 1435 | "to", 1436 | "unsupervised", 1437 | "too", 1438 | ":", 1439 | "what", 1440 | "are", 1441 | "the", 1442 | "examples" 1443 | ], 1444 | [ 1445 | "the", 1446 | "first", 1447 | "choice", 1448 | "is", 1449 | "exactly", 1450 | "how", 1451 | "to", 1452 | "generate", 1453 | "examples" 1454 | ], 1455 | [ 1456 | "turney", 1457 | "and", 1458 | "others", 1459 | "have", 1460 | "used", 1461 | "all", 1462 | "possible", 1463 | "unigrams", 1464 | "bigrams", 1465 | "and", 1466 | "trigrams", 1467 | "without", 1468 | "intervening", 1469 | "punctuation", 1470 | "and", 1471 | "after", 1472 | "removing", 1473 | "stopwords" 1474 | ], 1475 | [ 1476 | "hulth", 1477 | "showed", 1478 | "that", 1479 | "you", 1480 | "can", 1481 | "get", 1482 | "some", 1483 | "improvement", 1484 | "by", 1485 | "selecting", 1486 | "examples", 1487 | "to", 1488 | "be", 1489 | "sequences", 1490 | "of", 1491 | "tokens", 1492 | "that", 1493 | "match", 1494 | "certain", 1495 | "patterns", 1496 | "of", 1497 | "part-of-speech", 1498 | "tags" 1499 | ], 1500 | [ 1501 | "ideally", 1502 | "the", 1503 | "mechanism", 1504 | "for", 1505 | "generating", 1506 | "examples", 1507 | "produces", 1508 | "all", 1509 | "the", 1510 | "known", 1511 | "labeled", 1512 | "keyphrases", 1513 | "as", 1514 | "candidates", 1515 | "though", 1516 | "this", 1517 | "is", 1518 | "often", 1519 | "not", 1520 | "the", 1521 | "case" 1522 | ], 1523 | [ 1524 | "for", 1525 | "example", 1526 | "if", 1527 | "we", 1528 | "use", 1529 | 
"only", 1530 | "unigrams", 1531 | "bigrams", 1532 | "and", 1533 | "trigrams", 1534 | "then", 1535 | "we", 1536 | "will", 1537 | "never", 1538 | "be", 1539 | "able", 1540 | "to", 1541 | "extract", 1542 | "a", 1543 | "known", 1544 | "keyphrase", 1545 | "containing", 1546 | "four", 1547 | "words" 1548 | ], 1549 | [ 1550 | "thus", 1551 | "recall", 1552 | "may", 1553 | "suffer" 1554 | ], 1555 | [ 1556 | "however", 1557 | "generating", 1558 | "too", 1559 | "many", 1560 | "examples", 1561 | "can", 1562 | "also", 1563 | "lead", 1564 | "to", 1565 | "low", 1566 | "precision" 1567 | ], 1568 | [ 1569 | "what", 1570 | "are", 1571 | "the", 1572 | "features" 1573 | ], 1574 | [ 1575 | "we", 1576 | "also", 1577 | "need", 1578 | "to", 1579 | "create", 1580 | "features", 1581 | "that", 1582 | "describe", 1583 | "the", 1584 | "examples", 1585 | "and", 1586 | "are", 1587 | "informative", 1588 | "enough", 1589 | "to", 1590 | "allow", 1591 | "a", 1592 | "learning", 1593 | "algorithm", 1594 | "to", 1595 | "discriminate", 1596 | "keyphrases", 1597 | "from", 1598 | "non-", 1599 | "keyphrases" 1600 | ], 1601 | [ 1602 | "typically", 1603 | "features", 1604 | "involve", 1605 | "various", 1606 | "term", 1607 | "frequencies", 1608 | "how", 1609 | "many", 1610 | "times", 1611 | "a", 1612 | "phrase", 1613 | "appears", 1614 | "in", 1615 | "the", 1616 | "current", 1617 | "text", 1618 | "or", 1619 | "in", 1620 | "a", 1621 | "larger", 1622 | "corpus", 1623 | "the", 1624 | "length", 1625 | "of", 1626 | "the", 1627 | "example", 1628 | "relative", 1629 | "position", 1630 | "of", 1631 | "the", 1632 | "first", 1633 | "occurrence", 1634 | "various", 1635 | "boolean", 1636 | "syntactic", 1637 | "features", 1638 | "e" 1639 | ], 1640 | [ 1641 | "contains", 1642 | "all", 1643 | "caps", 1644 | "etc" 1645 | ], 1646 | [ 1647 | "the", 1648 | "turney", 1649 | "paper", 1650 | "used", 1651 | "about", 1652 | "12", 1653 | "such", 1654 | "features" 1655 | ], 1656 | [ 1657 | "hulth", 1658 | "uses", 1659 | "a", 1660 | 
"reduced", 1661 | "set", 1662 | "of", 1663 | "features", 1664 | "which", 1665 | "were", 1666 | "found", 1667 | "most", 1668 | "successful", 1669 | "in", 1670 | "the", 1671 | "kea", 1672 | "keyphrase", 1673 | "extraction", 1674 | "algorithm", 1675 | "work", 1676 | "derived", 1677 | "from", 1678 | "turney", 1679 | "s", 1680 | "seminal", 1681 | "paper" 1682 | ], 1683 | [ 1684 | "how", 1685 | "many", 1686 | "keyphrases", 1687 | "to", 1688 | "return" 1689 | ], 1690 | [ 1691 | "in", 1692 | "the", 1693 | "end", 1694 | "the", 1695 | "system", 1696 | "will", 1697 | "need", 1698 | "to", 1699 | "return", 1700 | "a", 1701 | "list", 1702 | "of", 1703 | "keyphrases", 1704 | "for", 1705 | "a", 1706 | "test", 1707 | "document", 1708 | "so", 1709 | "we", 1710 | "need", 1711 | "to", 1712 | "have", 1713 | "a", 1714 | "way", 1715 | "to", 1716 | "limit", 1717 | "the", 1718 | "number" 1719 | ], 1720 | [ 1721 | "ensemble", 1722 | "methods", 1723 | "i" 1724 | ], 1725 | [ 1726 | "using", 1727 | "votes", 1728 | "from", 1729 | "several", 1730 | "classifiers", 1731 | "have", 1732 | "been", 1733 | "used", 1734 | "to", 1735 | "produce", 1736 | "numeric", 1737 | "scores", 1738 | "that", 1739 | "can", 1740 | "be", 1741 | "thresholded", 1742 | "to", 1743 | "provide", 1744 | "a", 1745 | "user-provided", 1746 | "number", 1747 | "of", 1748 | "keyphrases" 1749 | ], 1750 | [ 1751 | "this", 1752 | "is", 1753 | "the", 1754 | "technique", 1755 | "used", 1756 | "by", 1757 | "turney", 1758 | "with", 1759 | "c4.5", 1760 | "decision", 1761 | "trees" 1762 | ], 1763 | [ 1764 | "hulth", 1765 | "used", 1766 | "a", 1767 | "single", 1768 | "binary", 1769 | "classifier", 1770 | "so", 1771 | "the", 1772 | "learning", 1773 | "algorithm", 1774 | "implicitly", 1775 | "determines", 1776 | "the", 1777 | "appropriate", 1778 | "number" 1779 | ], 1780 | [ 1781 | "what", 1782 | "learning", 1783 | "algorithm" 1784 | ], 1785 | [ 1786 | "once", 1787 | "examples", 1788 | "and", 1789 | "features", 1790 | "are", 1791 | "created", 
1792 | "we", 1793 | "need", 1794 | "a", 1795 | "way", 1796 | "to", 1797 | "learn", 1798 | "to", 1799 | "predict", 1800 | "keyphrases" 1801 | ], 1802 | [ 1803 | "virtually", 1804 | "any", 1805 | "supervised", 1806 | "learning", 1807 | "algorithm", 1808 | "could", 1809 | "be", 1810 | "used", 1811 | "such", 1812 | "as", 1813 | "decision", 1814 | "trees", 1815 | "naive", 1816 | "bayes", 1817 | "and", 1818 | "rule", 1819 | "induction" 1820 | ], 1821 | [ 1822 | "in", 1823 | "the", 1824 | "case", 1825 | "of", 1826 | "turney's", 1827 | "genex", 1828 | "algorithm", 1829 | "a", 1830 | "genetic", 1831 | "algorithm", 1832 | "is", 1833 | "used", 1834 | "to", 1835 | "learn", 1836 | "parameters", 1837 | "for", 1838 | "a", 1839 | "domain-specific", 1840 | "keyphrase", 1841 | "extraction", 1842 | "algorithm" 1843 | ], 1844 | [ 1845 | "the", 1846 | "extractor", 1847 | "follows", 1848 | "a", 1849 | "series", 1850 | "of", 1851 | "heuristics", 1852 | "to", 1853 | "identify", 1854 | "keyphrases" 1855 | ], 1856 | [ 1857 | "the", 1858 | "genetic", 1859 | "algorithm", 1860 | "optimizes", 1861 | "parameters", 1862 | "for", 1863 | "these", 1864 | "heuristics", 1865 | "with", 1866 | "respect", 1867 | "to", 1868 | "performance", 1869 | "on", 1870 | "training", 1871 | "documents", 1872 | "with", 1873 | "known", 1874 | "key", 1875 | "phrases" 1876 | ], 1877 | [ 1878 | "unsupervised", 1879 | "keyphrase", 1880 | "extraction", 1881 | ":", 1882 | "textrank" 1883 | ], 1884 | [ 1885 | "while", 1886 | "supervised", 1887 | "methods", 1888 | "have", 1889 | "some", 1890 | "nice", 1891 | "properties", 1892 | "like", 1893 | "being", 1894 | "able", 1895 | "to", 1896 | "produce", 1897 | "interpretable", 1898 | "rules", 1899 | "for", 1900 | "what", 1901 | "features", 1902 | "characterize", 1903 | "a", 1904 | "keyphrase", 1905 | "they", 1906 | "also", 1907 | "require", 1908 | "a", 1909 | "large", 1910 | "amount", 1911 | "of", 1912 | "training", 1913 | "data" 1914 | ], 1915 | [ 1916 | "many", 1917 | "documents", 
1918 | "with", 1919 | "known", 1920 | "keyphrases", 1921 | "are", 1922 | "needed" 1923 | ], 1924 | [ 1925 | "furthermore", 1926 | "training", 1927 | "on", 1928 | "a", 1929 | "specific", 1930 | "domain", 1931 | "tends", 1932 | "to", 1933 | "customize", 1934 | "the", 1935 | "extraction", 1936 | "process", 1937 | "to", 1938 | "that", 1939 | "domain", 1940 | "so", 1941 | "the", 1942 | "resulting", 1943 | "classifier", 1944 | "is", 1945 | "not", 1946 | "necessarily", 1947 | "portable", 1948 | "as", 1949 | "some", 1950 | "of", 1951 | "turney's", 1952 | "results", 1953 | "demonstrate" 1954 | ], 1955 | [ 1956 | "unsupervised", 1957 | "keyphrase", 1958 | "extraction", 1959 | "removes", 1960 | "the", 1961 | "need", 1962 | "for", 1963 | "training", 1964 | "data" 1965 | ], 1966 | [ 1967 | "it", 1968 | "approaches", 1969 | "the", 1970 | "problem", 1971 | "from", 1972 | "a", 1973 | "different", 1974 | "angle" 1975 | ], 1976 | [ 1977 | "instead", 1978 | "of", 1979 | "trying", 1980 | "to", 1981 | "learn", 1982 | "explicit", 1983 | "features", 1984 | "that", 1985 | "characterize", 1986 | "keyphrases", 1987 | "the", 1988 | "textrank", 1989 | "algorithm", 1990 | "exploits", 1991 | "the", 1992 | "structure", 1993 | "of", 1994 | "the", 1995 | "text", 1996 | "itself", 1997 | "to", 1998 | "determine", 1999 | "keyphrases", 2000 | "that", 2001 | "appear", 2002 | "central", 2003 | "to", 2004 | "the", 2005 | "text", 2006 | "in", 2007 | "the", 2008 | "same", 2009 | "way", 2010 | "that", 2011 | "pagerank", 2012 | "selects", 2013 | "important", 2014 | "web", 2015 | "pages" 2016 | ], 2017 | [ 2018 | "recall", 2019 | "this", 2020 | "is", 2021 | "based", 2022 | "on", 2023 | "the", 2024 | "notion", 2025 | "of", 2026 | "prestige", 2027 | "or", 2028 | "recommendation", 2029 | "from", 2030 | "social", 2031 | "networks" 2032 | ], 2033 | [ 2034 | "in", 2035 | "this", 2036 | "way", 2037 | "textrank", 2038 | "does", 2039 | "not", 2040 | "rely", 2041 | "on", 2042 | "any", 2043 | "previous", 2044 | 
"training", 2045 | "data", 2046 | "at", 2047 | "all", 2048 | "but", 2049 | "rather", 2050 | "can", 2051 | "be", 2052 | "run", 2053 | "on", 2054 | "any", 2055 | "arbitrary", 2056 | "piece", 2057 | "of", 2058 | "text", 2059 | "and", 2060 | "it", 2061 | "can", 2062 | "produce", 2063 | "output", 2064 | "simply", 2065 | "based", 2066 | "on", 2067 | "the", 2068 | "text's", 2069 | "intrinsic", 2070 | "properties" 2071 | ], 2072 | [ 2073 | "thus", 2074 | "the", 2075 | "algorithm", 2076 | "is", 2077 | "easily", 2078 | "portable", 2079 | "to", 2080 | "new", 2081 | "domains", 2082 | "and", 2083 | "languages" 2084 | ], 2085 | [ 2086 | "textrank", 2087 | "is", 2088 | "a", 2089 | "general", 2090 | "purpose", 2091 | "graph-based", 2092 | "ranking", 2093 | "algorithm", 2094 | "for", 2095 | "nlp" 2096 | ], 2097 | [ 2098 | "essentially", 2099 | "it", 2100 | "runs", 2101 | "pagerank", 2102 | "on", 2103 | "a", 2104 | "graph", 2105 | "specially", 2106 | "designed", 2107 | "for", 2108 | "a", 2109 | "particular", 2110 | "nlp", 2111 | "task" 2112 | ], 2113 | [ 2114 | "for", 2115 | "keyphrase", 2116 | "extraction", 2117 | "it", 2118 | "builds", 2119 | "a", 2120 | "graph", 2121 | "using", 2122 | "some", 2123 | "set", 2124 | "of", 2125 | "text", 2126 | "units", 2127 | "as", 2128 | "vertices" 2129 | ], 2130 | [ 2131 | "edges", 2132 | "are", 2133 | "based", 2134 | "on", 2135 | "some", 2136 | "measure", 2137 | "of", 2138 | "semantic", 2139 | "or", 2140 | "lexical", 2141 | "similarity", 2142 | "between", 2143 | "the", 2144 | "text", 2145 | "unit", 2146 | "vertices" 2147 | ], 2148 | [ 2149 | "unlike", 2150 | "pagerank", 2151 | "the", 2152 | "edges", 2153 | "are", 2154 | "typically", 2155 | "undirected", 2156 | "and", 2157 | "can", 2158 | "be", 2159 | "weighted", 2160 | "to", 2161 | "reflect", 2162 | "a", 2163 | "degree", 2164 | "of", 2165 | "similarity" 2166 | ], 2167 | [ 2168 | "once", 2169 | "the", 2170 | "graph", 2171 | "is", 2172 | "constructed", 2173 | "it", 2174 | "is", 2175 | "used", 2176 
| "to", 2177 | "form", 2178 | "a", 2179 | "stochastic", 2180 | "matrix", 2181 | "combined", 2182 | "with", 2183 | "a", 2184 | "damping", 2185 | "factor", 2186 | "as", 2187 | "in", 2188 | "the", 2189 | "random", 2190 | "surfer", 2191 | "model", 2192 | "and", 2193 | "the", 2194 | "ranking", 2195 | "over", 2196 | "vertices", 2197 | "is", 2198 | "obtained", 2199 | "by", 2200 | "finding", 2201 | "the", 2202 | "eigenvector", 2203 | "corresponding", 2204 | "to", 2205 | "eigenvalue", 2206 | "1", 2207 | "i" 2208 | ], 2209 | [ 2210 | "the", 2211 | "stationary", 2212 | "distribution", 2213 | "of", 2214 | "the", 2215 | "random", 2216 | "walk", 2217 | "on", 2218 | "the", 2219 | "graph" 2220 | ], 2221 | [ 2222 | "what", 2223 | "should", 2224 | "vertices", 2225 | "be" 2226 | ], 2227 | [ 2228 | "the", 2229 | "vertices", 2230 | "should", 2231 | "correspond", 2232 | "to", 2233 | "what", 2234 | "we", 2235 | "want", 2236 | "to", 2237 | "rank" 2238 | ], 2239 | [ 2240 | "potentially", 2241 | "we", 2242 | "could", 2243 | "do", 2244 | "something", 2245 | "similar", 2246 | "to", 2247 | "the", 2248 | "supervised", 2249 | "methods", 2250 | "and", 2251 | "create", 2252 | "a", 2253 | "vertex", 2254 | "for", 2255 | "each", 2256 | "unigram", 2257 | "bigram", 2258 | "trigram", 2259 | "etc" 2260 | ], 2261 | [ 2262 | "however", 2263 | "to", 2264 | "keep", 2265 | "the", 2266 | "graph", 2267 | "small", 2268 | "the", 2269 | "authors", 2270 | "decide", 2271 | "to", 2272 | "rank", 2273 | "individual", 2274 | "unigrams", 2275 | "in", 2276 | "a", 2277 | "first", 2278 | "step", 2279 | "and", 2280 | "then", 2281 | "include", 2282 | "a", 2283 | "second", 2284 | "step", 2285 | "that", 2286 | "merges", 2287 | "highly", 2288 | "ranked", 2289 | "adjacent", 2290 | "unigrams", 2291 | "to", 2292 | "form", 2293 | "multi-word", 2294 | "phrases" 2295 | ], 2296 | [ 2297 | "this", 2298 | "has", 2299 | "a", 2300 | "nice", 2301 | "side", 2302 | "effect", 2303 | "of", 2304 | "allowing", 2305 | "us", 2306 | "to", 2307 | 
"produce", 2308 | "keyphrases", 2309 | "of", 2310 | "arbitrary", 2311 | "length" 2312 | ], 2313 | [ 2314 | "for", 2315 | "example", 2316 | "if", 2317 | "we", 2318 | "rank", 2319 | "unigrams", 2320 | "and", 2321 | "find", 2322 | "that", 2323 | "advanced", 2324 | "natural", 2325 | "language", 2326 | "and", 2327 | "processing", 2328 | "all", 2329 | "get", 2330 | "high", 2331 | "ranks", 2332 | "then", 2333 | "we", 2334 | "would", 2335 | "look", 2336 | "at", 2337 | "the", 2338 | "original", 2339 | "text", 2340 | "and", 2341 | "see", 2342 | "that", 2343 | "these", 2344 | "words", 2345 | "appear", 2346 | "consecutively", 2347 | "and", 2348 | "create", 2349 | "a", 2350 | "final", 2351 | "keyphrase", 2352 | "using", 2353 | "all", 2354 | "four", 2355 | "together" 2356 | ], 2357 | [ 2358 | "note", 2359 | "that", 2360 | "the", 2361 | "unigrams", 2362 | "placed", 2363 | "in", 2364 | "the", 2365 | "graph", 2366 | "can", 2367 | "be", 2368 | "filtered", 2369 | "by", 2370 | "part", 2371 | "of", 2372 | "speech" 2373 | ], 2374 | [ 2375 | "the", 2376 | "authors", 2377 | "found", 2378 | "that", 2379 | "adjectives", 2380 | "and", 2381 | "nouns", 2382 | "were", 2383 | "the", 2384 | "best", 2385 | "to", 2386 | "include" 2387 | ], 2388 | [ 2389 | "thus", 2390 | "some", 2391 | "linguistic", 2392 | "knowledge", 2393 | "comes", 2394 | "into", 2395 | "play", 2396 | "in", 2397 | "this", 2398 | "step" 2399 | ], 2400 | [ 2401 | "how", 2402 | "should", 2403 | "we", 2404 | "create", 2405 | "edges" 2406 | ], 2407 | [ 2408 | "edges", 2409 | "are", 2410 | "created", 2411 | "based", 2412 | "on", 2413 | "word", 2414 | "co-occurrence", 2415 | "in", 2416 | "this", 2417 | "application", 2418 | "of", 2419 | "textrank" 2420 | ], 2421 | [ 2422 | "two", 2423 | "vertices", 2424 | "are", 2425 | "connected", 2426 | "by", 2427 | "an", 2428 | "edge", 2429 | "if", 2430 | "the", 2431 | "unigrams", 2432 | "appear", 2433 | "within", 2434 | "a", 2435 | "window", 2436 | "of", 2437 | "size", 2438 | "n", 2439 | "in", 2440 
| "the", 2441 | "original", 2442 | "text" 2443 | ], 2444 | [ 2445 | "n", 2446 | "is", 2447 | "typically", 2448 | "around", 2449 | "2", 2450 | "10" 2451 | ], 2452 | [ 2453 | "thus", 2454 | "natural", 2455 | "and", 2456 | "language", 2457 | "might", 2458 | "be", 2459 | "linked", 2460 | "in", 2461 | "a", 2462 | "text", 2463 | "about", 2464 | "nlp" 2465 | ], 2466 | [ 2467 | "natural", 2468 | "and", 2469 | "processing", 2470 | "would", 2471 | "also", 2472 | "be", 2473 | "linked", 2474 | "because", 2475 | "they", 2476 | "would", 2477 | "both", 2478 | "appear", 2479 | "in", 2480 | "the", 2481 | "same", 2482 | "string", 2483 | "of", 2484 | "n", 2485 | "words" 2486 | ], 2487 | [ 2488 | "these", 2489 | "edges", 2490 | "build", 2491 | "on", 2492 | "the", 2493 | "notion", 2494 | "of", 2495 | "text", 2496 | "cohesion", 2497 | "and", 2498 | "the", 2499 | "idea", 2500 | "that", 2501 | "words", 2502 | "that", 2503 | "appear", 2504 | "near", 2505 | "each", 2506 | "other", 2507 | "are", 2508 | "likely", 2509 | "related", 2510 | "in", 2511 | "a", 2512 | "meaningful", 2513 | "way", 2514 | "and", 2515 | "recommend", 2516 | "each", 2517 | "other", 2518 | "to", 2519 | "the", 2520 | "reader" 2521 | ], 2522 | [ 2523 | "how", 2524 | "are", 2525 | "the", 2526 | "final", 2527 | "keyphrases", 2528 | "formed" 2529 | ], 2530 | [ 2531 | "since", 2532 | "this", 2533 | "method", 2534 | "simply", 2535 | "ranks", 2536 | "the", 2537 | "individual", 2538 | "vertices", 2539 | "we", 2540 | "need", 2541 | "a", 2542 | "way", 2543 | "to", 2544 | "threshold", 2545 | "or", 2546 | "produce", 2547 | "a", 2548 | "limited", 2549 | "number", 2550 | "of", 2551 | "keyphrases" 2552 | ], 2553 | [ 2554 | "the", 2555 | "technique", 2556 | "chosen", 2557 | "is", 2558 | "to", 2559 | "set", 2560 | "a", 2561 | "count", 2562 | "t", 2563 | "to", 2564 | "be", 2565 | "a", 2566 | "user-specified", 2567 | "fraction", 2568 | "of", 2569 | "the", 2570 | "total", 2571 | "number", 2572 | "of", 2573 | "vertices", 2574 | "in", 2575 | 
"the", 2576 | "graph" 2577 | ], 2578 | [ 2579 | "then", 2580 | "the", 2581 | "top", 2582 | "t", 2583 | "vertices/unigrams", 2584 | "are", 2585 | "selected", 2586 | "based", 2587 | "on", 2588 | "their", 2589 | "stationary", 2590 | "probabilities" 2591 | ], 2592 | [ 2593 | "a", 2594 | "post-", 2595 | "processing", 2596 | "step", 2597 | "is", 2598 | "then", 2599 | "applied", 2600 | "to", 2601 | "merge", 2602 | "adjacent", 2603 | "instances", 2604 | "of", 2605 | "these", 2606 | "t", 2607 | "unigrams" 2608 | ], 2609 | [ 2610 | "as", 2611 | "a", 2612 | "result", 2613 | "potentially", 2614 | "more", 2615 | "or", 2616 | "less", 2617 | "than", 2618 | "t", 2619 | "final", 2620 | "keyphrases", 2621 | "will", 2622 | "be", 2623 | "produced", 2624 | "but", 2625 | "the", 2626 | "number", 2627 | "should", 2628 | "be", 2629 | "roughly", 2630 | "proportional", 2631 | "to", 2632 | "the", 2633 | "length", 2634 | "of", 2635 | "the", 2636 | "original", 2637 | "text" 2638 | ], 2639 | [ 2640 | "it", 2641 | "is", 2642 | "not", 2643 | "initially", 2644 | "clear", 2645 | "why", 2646 | "applying", 2647 | "pagerank", 2648 | "to", 2649 | "a", 2650 | "co-occurrence", 2651 | "graph", 2652 | "would", 2653 | "produce", 2654 | "useful", 2655 | "keyphrases" 2656 | ], 2657 | [ 2658 | "one", 2659 | "way", 2660 | "to", 2661 | "think", 2662 | "about", 2663 | "it", 2664 | "is", 2665 | "the", 2666 | "following" 2667 | ], 2668 | [ 2669 | "a", 2670 | "word", 2671 | "that", 2672 | "appears", 2673 | "multiple", 2674 | "times", 2675 | "throughout", 2676 | "a", 2677 | "text", 2678 | "may", 2679 | "have", 2680 | "many", 2681 | "different", 2682 | "co-occurring", 2683 | "neighbors" 2684 | ], 2685 | [ 2686 | "for", 2687 | "example", 2688 | "in", 2689 | "a", 2690 | "text", 2691 | "about", 2692 | "machine", 2693 | "learning", 2694 | "the", 2695 | "unigram", 2696 | "learning", 2697 | "might", 2698 | "co-occur", 2699 | "with", 2700 | "machine", 2701 | "supervised", 2702 | "un-supervised", 2703 | "and", 2704 | 
"semi-supervised", 2705 | "in", 2706 | "four", 2707 | "different", 2708 | "sentences" 2709 | ], 2710 | [ 2711 | "thus", 2712 | "the", 2713 | "learning", 2714 | "vertex", 2715 | "would", 2716 | "be", 2717 | "a", 2718 | "central", 2719 | "hub", 2720 | "that", 2721 | "connects", 2722 | "to", 2723 | "these", 2724 | "other", 2725 | "modifying", 2726 | "words" 2727 | ], 2728 | [ 2729 | "running", 2730 | "pagerank/textrank", 2731 | "on", 2732 | "the", 2733 | "graph", 2734 | "is", 2735 | "likely", 2736 | "to", 2737 | "rank", 2738 | "learning", 2739 | "highly" 2740 | ], 2741 | [ 2742 | "similarly", 2743 | "if", 2744 | "the", 2745 | "text", 2746 | "contains", 2747 | "the", 2748 | "phrase", 2749 | "supervised", 2750 | "classification", 2751 | "then", 2752 | "there", 2753 | "would", 2754 | "be", 2755 | "an", 2756 | "edge", 2757 | "between", 2758 | "supervised", 2759 | "and", 2760 | "classification" 2761 | ], 2762 | [ 2763 | "if", 2764 | "classification", 2765 | "appears", 2766 | "several", 2767 | "other", 2768 | "places", 2769 | "and", 2770 | "thus", 2771 | "has", 2772 | "many", 2773 | "neighbors", 2774 | "its", 2775 | "importance", 2776 | "would", 2777 | "contribute", 2778 | "to", 2779 | "the", 2780 | "importance", 2781 | "of", 2782 | "supervised" 2783 | ], 2784 | [ 2785 | "if", 2786 | "it", 2787 | "ends", 2788 | "up", 2789 | "with", 2790 | "a", 2791 | "high", 2792 | "rank", 2793 | "it", 2794 | "will", 2795 | "be", 2796 | "selected", 2797 | "as", 2798 | "one", 2799 | "of", 2800 | "the", 2801 | "top", 2802 | "t", 2803 | "unigrams", 2804 | "along", 2805 | "with", 2806 | "learning", 2807 | "and", 2808 | "probably", 2809 | "classification" 2810 | ], 2811 | [ 2812 | "in", 2813 | "the", 2814 | "final", 2815 | "post-processing", 2816 | "step", 2817 | "we", 2818 | "would", 2819 | "then", 2820 | "end", 2821 | "up", 2822 | "with", 2823 | "keyphrases", 2824 | "supervised", 2825 | "learning", 2826 | "and", 2827 | "supervised", 2828 | "classification" 2829 | ], 2830 | [ 2831 | "in", 2832 
| "short", 2833 | "the", 2834 | "co-occurrence", 2835 | "graph", 2836 | "will", 2837 | "contain", 2838 | "densely", 2839 | "connected", 2840 | "regions", 2841 | "for", 2842 | "terms", 2843 | "that", 2844 | "appear", 2845 | "often", 2846 | "and", 2847 | "in", 2848 | "different", 2849 | "contexts" 2850 | ], 2851 | [ 2852 | "a", 2853 | "random", 2854 | "walk", 2855 | "on", 2856 | "this", 2857 | "graph", 2858 | "will", 2859 | "have", 2860 | "a", 2861 | "stationary", 2862 | "distribution", 2863 | "that", 2864 | "assigns", 2865 | "large", 2866 | "probabilities", 2867 | "to", 2868 | "the", 2869 | "terms", 2870 | "in", 2871 | "the", 2872 | "centers", 2873 | "of", 2874 | "the", 2875 | "clusters" 2876 | ], 2877 | [ 2878 | "this", 2879 | "is", 2880 | "similar", 2881 | "to", 2882 | "densely", 2883 | "connected", 2884 | "web", 2885 | "pages", 2886 | "getting", 2887 | "ranked", 2888 | "highly", 2889 | "by", 2890 | "pagerank" 2891 | ], 2892 | [ 2893 | "like", 2894 | "keyphrase", 2895 | "extraction", 2896 | "document", 2897 | "summarization", 2898 | "hopes", 2899 | "to", 2900 | "identify", 2901 | "the", 2902 | "essence", 2903 | "of", 2904 | "a", 2905 | "text" 2906 | ], 2907 | [ 2908 | "the", 2909 | "only", 2910 | "real", 2911 | "difference", 2912 | "is", 2913 | "that", 2914 | "now", 2915 | "we", 2916 | "are", 2917 | "dealing", 2918 | "with", 2919 | "larger", 2920 | "text", 2921 | "units", 2922 | "whole", 2923 | "sentences", 2924 | "instead", 2925 | "of", 2926 | "words", 2927 | "and", 2928 | "phrases" 2929 | ], 2930 | [ 2931 | "before", 2932 | "getting", 2933 | "into", 2934 | "the", 2935 | "details", 2936 | "of", 2937 | "some", 2938 | "summarization", 2939 | "methods", 2940 | "we", 2941 | "will", 2942 | "mention", 2943 | "how", 2944 | "summarization", 2945 | "systems", 2946 | "are", 2947 | "typically", 2948 | "evaluated" 2949 | ], 2950 | [ 2951 | "the", 2952 | "most", 2953 | "common", 2954 | "way", 2955 | "is", 2956 | "using", 2957 | "the", 2958 | "so-called", 2959 | "rouge", 2960 
| "recall-oriented", 2961 | "understudy", 2962 | "for", 2963 | "gisting", 2964 | "evaluation", 2965 | "measure" 2966 | ], 2967 | [ 2968 | "this", 2969 | "is", 2970 | "a", 2971 | "recall-based", 2972 | "measure", 2973 | "that", 2974 | "determines", 2975 | "how", 2976 | "well", 2977 | "a", 2978 | "system-generated", 2979 | "summary", 2980 | "covers", 2981 | "the", 2982 | "content", 2983 | "present", 2984 | "in", 2985 | "one", 2986 | "or", 2987 | "more", 2988 | "human-generated", 2989 | "model", 2990 | "summaries", 2991 | "known", 2992 | "as", 2993 | "references" 2994 | ], 2995 | [ 2996 | "it", 2997 | "is", 2998 | "recall-based", 2999 | "to", 3000 | "encourage", 3001 | "systems", 3002 | "to", 3003 | "include", 3004 | "all", 3005 | "the", 3006 | "important", 3007 | "topics", 3008 | "in", 3009 | "the", 3010 | "text" 3011 | ], 3012 | [ 3013 | "recall", 3014 | "can", 3015 | "be", 3016 | "computed", 3017 | "with", 3018 | "respect", 3019 | "to", 3020 | "unigram", 3021 | "bigram", 3022 | "trigram", 3023 | "or", 3024 | "4-gram", 3025 | "matching" 3026 | ], 3027 | [ 3028 | "for", 3029 | "example", 3030 | "rouge-1", 3031 | "is", 3032 | "computed", 3033 | "as", 3034 | "division", 3035 | "of", 3036 | "count", 3037 | "of", 3038 | "unigrams", 3039 | "in", 3040 | "reference", 3041 | "that", 3042 | "appear", 3043 | "in", 3044 | "system", 3045 | "and", 3046 | "count", 3047 | "of", 3048 | "unigrams", 3049 | "in", 3050 | "reference", 3051 | "summary" 3052 | ], 3053 | [ 3054 | "if", 3055 | "there", 3056 | "are", 3057 | "multiple", 3058 | "references", 3059 | "the", 3060 | "rouge-1", 3061 | "scores", 3062 | "are", 3063 | "averaged" 3064 | ], 3065 | [ 3066 | "because", 3067 | "rouge", 3068 | "is", 3069 | "based", 3070 | "only", 3071 | "on", 3072 | "content", 3073 | "overlap", 3074 | "it", 3075 | "can", 3076 | "determine", 3077 | "if", 3078 | "the", 3079 | "same", 3080 | "general", 3081 | "concepts", 3082 | "are", 3083 | "discussed", 3084 | "between", 3085 | "an", 3086 | "automatic", 3087 | 
"summary", 3088 | "and", 3089 | "a", 3090 | "reference", 3091 | "summary", 3092 | "but", 3093 | "it", 3094 | "cannot", 3095 | "determine", 3096 | "if", 3097 | "the", 3098 | "result", 3099 | "is", 3100 | "coherent", 3101 | "or", 3102 | "the", 3103 | "sentences", 3104 | "flow", 3105 | "together", 3106 | "in", 3107 | "a", 3108 | "sensible", 3109 | "manner" 3110 | ], 3111 | [ 3112 | "high-order", 3113 | "n-gram", 3114 | "rouge", 3115 | "measures", 3116 | "try", 3117 | "to", 3118 | "judge", 3119 | "fluency", 3120 | "to", 3121 | "some", 3122 | "degree" 3123 | ], 3124 | [ 3125 | "note", 3126 | "that", 3127 | "rouge", 3128 | "is", 3129 | "similar", 3130 | "to", 3131 | "the", 3132 | "bleu", 3133 | "measure", 3134 | "for", 3135 | "machine", 3136 | "translation", 3137 | "but", 3138 | "bleu", 3139 | "is", 3140 | "precision-", 3141 | "based", 3142 | "because", 3143 | "translation", 3144 | "systems", 3145 | "favor", 3146 | "accuracy" 3147 | ], 3148 | [ 3149 | "a", 3150 | "promising", 3151 | "line", 3152 | "in", 3153 | "document", 3154 | "summarization", 3155 | "is", 3156 | "adaptive", 3157 | "document/text", 3158 | "summarization" 3159 | ], 3160 | [ 3161 | "the", 3162 | "idea", 3163 | "of", 3164 | "adaptive", 3165 | "summarization", 3166 | "involves", 3167 | "preliminary", 3168 | "recognition", 3169 | "of", 3170 | "document/text", 3171 | "genre", 3172 | "and", 3173 | "subsequent", 3174 | "application", 3175 | "of", 3176 | "summarization", 3177 | "algorithms", 3178 | "optimized", 3179 | "for", 3180 | "this", 3181 | "genre" 3182 | ], 3183 | [ 3184 | "first", 3185 | "summarizes", 3186 | "that", 3187 | "perform", 3188 | "adaptive", 3189 | "summarization", 3190 | "have", 3191 | "been", 3192 | "created" 3193 | ], 3194 | [ 3195 | "supervised", 3196 | "text", 3197 | "summarization", 3198 | "is", 3199 | "very", 3200 | "much", 3201 | "like", 3202 | "supervised", 3203 | "keyphrase", 3204 | "extraction" 3205 | ], 3206 | [ 3207 | "basically", 3208 | "if", 3209 | "you", 3210 | "have", 3211 | 
"a", 3212 | "collection", 3213 | "of", 3214 | "documents", 3215 | "and", 3216 | "human-generated", 3217 | "summaries", 3218 | "for", 3219 | "them", 3220 | "you", 3221 | "can", 3222 | "learn", 3223 | "features", 3224 | "of", 3225 | "sentences", 3226 | "that", 3227 | "make", 3228 | "them", 3229 | "good", 3230 | "candidates", 3231 | "for", 3232 | "inclusion", 3233 | "in", 3234 | "the", 3235 | "summary" 3236 | ], 3237 | [ 3238 | "features", 3239 | "might", 3240 | "include", 3241 | "the", 3242 | "position", 3243 | "in", 3244 | "the", 3245 | "document", 3246 | "i" 3247 | ], 3248 | [ 3249 | "the", 3250 | "first", 3251 | "few", 3252 | "sentences", 3253 | "are", 3254 | "probably", 3255 | "important", 3256 | "the", 3257 | "number", 3258 | "of", 3259 | "words", 3260 | "in", 3261 | "the", 3262 | "sentence", 3263 | "etc" 3264 | ], 3265 | [ 3266 | "the", 3267 | "main", 3268 | "difficulty", 3269 | "in", 3270 | "supervised", 3271 | "extractive", 3272 | "summarization", 3273 | "is", 3274 | "that", 3275 | "the", 3276 | "known", 3277 | "summaries", 3278 | "must", 3279 | "be", 3280 | "manually", 3281 | "created", 3282 | "by", 3283 | "extracting", 3284 | "sentences", 3285 | "so", 3286 | "the", 3287 | "sentences", 3288 | "in", 3289 | "an", 3290 | "original", 3291 | "training", 3292 | "document", 3293 | "can", 3294 | "be", 3295 | "labeled", 3296 | "as", 3297 | "in", 3298 | "summary", 3299 | "or", 3300 | "not", 3301 | "in", 3302 | "summary" 3303 | ], 3304 | [ 3305 | "this", 3306 | "is", 3307 | "not", 3308 | "typically", 3309 | "how", 3310 | "people", 3311 | "create", 3312 | "summaries", 3313 | "so", 3314 | "simply", 3315 | "using", 3316 | "journal", 3317 | "abstracts", 3318 | "or", 3319 | "existing", 3320 | "summaries", 3321 | "is", 3322 | "usually", 3323 | "not", 3324 | "sufficient" 3325 | ], 3326 | [ 3327 | "the", 3328 | "sentences", 3329 | "in", 3330 | "these", 3331 | "summaries", 3332 | "do", 3333 | "not", 3334 | "necessarily", 3335 | "match", 3336 | "up", 3337 | "with", 3338 | 
"sentences", 3339 | "in", 3340 | "the", 3341 | "original", 3342 | "text", 3343 | "so", 3344 | "it", 3345 | "would", 3346 | "be", 3347 | "difficult", 3348 | "to", 3349 | "assign", 3350 | "labels", 3351 | "to", 3352 | "examples", 3353 | "for", 3354 | "training" 3355 | ], 3356 | [ 3357 | "note", 3358 | "however", 3359 | "that", 3360 | "these", 3361 | "natural", 3362 | "summaries", 3363 | "can", 3364 | "still", 3365 | "be", 3366 | "used", 3367 | "for", 3368 | "evaluation", 3369 | "purposes", 3370 | "since", 3371 | "rouge-1", 3372 | "only", 3373 | "cares", 3374 | "about", 3375 | "unigrams" 3376 | ], 3377 | [ 3378 | "the", 3379 | "unsupervised", 3380 | "approach", 3381 | "to", 3382 | "summarization", 3383 | "is", 3384 | "also", 3385 | "quite", 3386 | "similar", 3387 | "in", 3388 | "spirit", 3389 | "to", 3390 | "unsupervised", 3391 | "keyphrase", 3392 | "extraction", 3393 | "and", 3394 | "gets", 3395 | "around", 3396 | "the", 3397 | "issue", 3398 | "of", 3399 | "costly", 3400 | "training", 3401 | "data" 3402 | ], 3403 | [ 3404 | "some", 3405 | "unsupervised", 3406 | "summarization", 3407 | "approaches", 3408 | "are", 3409 | "based", 3410 | "on", 3411 | "finding", 3412 | "a", 3413 | "centroid", 3414 | "sentence", 3415 | "which", 3416 | "is", 3417 | "the", 3418 | "mean", 3419 | "word", 3420 | "vector", 3421 | "of", 3422 | "all", 3423 | "the", 3424 | "sentences", 3425 | "in", 3426 | "the", 3427 | "document" 3428 | ], 3429 | [ 3430 | "then", 3431 | "the", 3432 | "sentences", 3433 | "can", 3434 | "be", 3435 | "ranked", 3436 | "with", 3437 | "regard", 3438 | "to", 3439 | "their", 3440 | "similarity", 3441 | "to", 3442 | "this", 3443 | "centroid", 3444 | "sentence" 3445 | ], 3446 | [ 3447 | "a", 3448 | "more", 3449 | "principled", 3450 | "way", 3451 | "to", 3452 | "estimate", 3453 | "sentence", 3454 | "importance", 3455 | "is", 3456 | "using", 3457 | "random", 3458 | "walks", 3459 | "and", 3460 | "eigenvector", 3461 | "centrality" 3462 | ], 3463 | [ 3464 | "lexrank", 3465 | 
"is", 3466 | "an", 3467 | "algorithm", 3468 | "essentially", 3469 | "identical", 3470 | "to", 3471 | "textrank", 3472 | "and", 3473 | "both", 3474 | "use", 3475 | "this", 3476 | "approach", 3477 | "for", 3478 | "document", 3479 | "summarization" 3480 | ], 3481 | [ 3482 | "the", 3483 | "two", 3484 | "methods", 3485 | "were", 3486 | "developed", 3487 | "by", 3488 | "different", 3489 | "groups", 3490 | "at", 3491 | "the", 3492 | "same", 3493 | "time", 3494 | "and", 3495 | "lexrank", 3496 | "simply", 3497 | "focused", 3498 | "on", 3499 | "summarization", 3500 | "but", 3501 | "could", 3502 | "just", 3503 | "as", 3504 | "easily", 3505 | "be", 3506 | "used", 3507 | "for", 3508 | "keyphrase", 3509 | "extraction", 3510 | "or", 3511 | "any", 3512 | "other", 3513 | "nlp", 3514 | "ranking", 3515 | "task" 3516 | ], 3517 | [ 3518 | "what", 3519 | "are", 3520 | "the", 3521 | "vertices" 3522 | ], 3523 | [ 3524 | "in", 3525 | "both", 3526 | "lexrank", 3527 | "and", 3528 | "textrank", 3529 | "a", 3530 | "graph", 3531 | "is", 3532 | "constructed", 3533 | "by", 3534 | "creating", 3535 | "a", 3536 | "vertex", 3537 | "for", 3538 | "each", 3539 | "sentence", 3540 | "in", 3541 | "the", 3542 | "document" 3543 | ], 3544 | [ 3545 | "what", 3546 | "are", 3547 | "the", 3548 | "edges" 3549 | ], 3550 | [ 3551 | "the", 3552 | "edges", 3553 | "between", 3554 | "sentences", 3555 | "are", 3556 | "based", 3557 | "on", 3558 | "some", 3559 | "form", 3560 | "of", 3561 | "semantic", 3562 | "similarity", 3563 | "or", 3564 | "content", 3565 | "overlap" 3566 | ], 3567 | [ 3568 | "while", 3569 | "lexrank", 3570 | "uses", 3571 | "cosine", 3572 | "similarity", 3573 | "of", 3574 | "tf-idf", 3575 | "vectors", 3576 | "textrank", 3577 | "uses", 3578 | "a", 3579 | "very", 3580 | "similar", 3581 | "measure", 3582 | "based", 3583 | "on", 3584 | "the", 3585 | "number", 3586 | "of", 3587 | "words", 3588 | "two", 3589 | "sentences", 3590 | "have", 3591 | "in", 3592 | "common", 3593 | "normalized", 3594 | "by", 3595 | 
"the", 3596 | "sentences'", 3597 | "lengths" 3598 | ], 3599 | [ 3600 | "the", 3601 | "lexrank", 3602 | "paper", 3603 | "explored", 3604 | "using", 3605 | "unweighted", 3606 | "edges", 3607 | "after", 3608 | "applying", 3609 | "a", 3610 | "threshold", 3611 | "to", 3612 | "the", 3613 | "cosine", 3614 | "values", 3615 | "but", 3616 | "also", 3617 | "experimented", 3618 | "with", 3619 | "using", 3620 | "edges", 3621 | "with", 3622 | "weights", 3623 | "equal", 3624 | "to", 3625 | "the", 3626 | "similarity", 3627 | "score" 3628 | ], 3629 | [ 3630 | "textrank", 3631 | "uses", 3632 | "continuous", 3633 | "similarity", 3634 | "scores", 3635 | "as", 3636 | "weights" 3637 | ], 3638 | [ 3639 | "how", 3640 | "are", 3641 | "summaries", 3642 | "formed" 3643 | ], 3644 | [ 3645 | "in", 3646 | "both", 3647 | "algorithms", 3648 | "the", 3649 | "sentences", 3650 | "are", 3651 | "ranked", 3652 | "by", 3653 | "applying", 3654 | "pagerank", 3655 | "to", 3656 | "the", 3657 | "resulting", 3658 | "graph" 3659 | ], 3660 | [ 3661 | "a", 3662 | "summary", 3663 | "is", 3664 | "formed", 3665 | "by", 3666 | "combining", 3667 | "the", 3668 | "top", 3669 | "ranking", 3670 | "sentences", 3671 | "using", 3672 | "a", 3673 | "threshold", 3674 | "or", 3675 | "length", 3676 | "cutoff", 3677 | "to", 3678 | "limit", 3679 | "the", 3680 | "size", 3681 | "of", 3682 | "the", 3683 | "summary" 3684 | ], 3685 | [ 3686 | "it", 3687 | "is", 3688 | "worth", 3689 | "noting", 3690 | "that", 3691 | "textrank", 3692 | "was", 3693 | "applied", 3694 | "to", 3695 | "summarization", 3696 | "exactly", 3697 | "as", 3698 | "described", 3699 | "here", 3700 | "while", 3701 | "lexrank", 3702 | "was", 3703 | "used", 3704 | "as", 3705 | "part", 3706 | "of", 3707 | "a", 3708 | "larger", 3709 | "summarization", 3710 | "system", 3711 | "mead", 3712 | "that", 3713 | "combines", 3714 | "the", 3715 | "lexrank", 3716 | "score", 3717 | "stationary", 3718 | "probability", 3719 | "with", 3720 | "other", 3721 | "features", 3722 | "like", 3723 
| "sentence", 3724 | "position", 3725 | "and", 3726 | "length", 3727 | "using", 3728 | "a", 3729 | "linear", 3730 | "combination", 3731 | "with", 3732 | "either", 3733 | "user-specified", 3734 | "or", 3735 | "automatically", 3736 | "tuned", 3737 | "weights" 3738 | ], 3739 | [ 3740 | "in", 3741 | "this", 3742 | "case", 3743 | "some", 3744 | "training", 3745 | "documents", 3746 | "might", 3747 | "be", 3748 | "needed", 3749 | "though", 3750 | "the", 3751 | "textrank", 3752 | "results", 3753 | "show", 3754 | "the", 3755 | "additional", 3756 | "features", 3757 | "are", 3758 | "not", 3759 | "absolutely", 3760 | "necessary" 3761 | ], 3762 | [ 3763 | "another", 3764 | "important", 3765 | "distinction", 3766 | "is", 3767 | "that", 3768 | "textrank", 3769 | "was", 3770 | "used", 3771 | "for", 3772 | "single", 3773 | "document", 3774 | "summarization", 3775 | "while", 3776 | "lexrank", 3777 | "has", 3778 | "been", 3779 | "applied", 3780 | "to", 3781 | "multi-document", 3782 | "summarization" 3783 | ], 3784 | [ 3785 | "the", 3786 | "task", 3787 | "remains", 3788 | "the", 3789 | "same", 3790 | "in", 3791 | "both", 3792 | "cases", 3793 | "only", 3794 | "the", 3795 | "number", 3796 | "of", 3797 | "sentences", 3798 | "to", 3799 | "choose", 3800 | "from", 3801 | "has", 3802 | "grown" 3803 | ], 3804 | [ 3805 | "however", 3806 | "when", 3807 | "summarizing", 3808 | "multiple", 3809 | "documents", 3810 | "there", 3811 | "is", 3812 | "a", 3813 | "greater", 3814 | "risk", 3815 | "of", 3816 | "selecting", 3817 | "duplicate", 3818 | "or", 3819 | "highly", 3820 | "redundant", 3821 | "sentences", 3822 | "to", 3823 | "place", 3824 | "in", 3825 | "the", 3826 | "same", 3827 | "summary" 3828 | ], 3829 | [ 3830 | "imagine", 3831 | "you", 3832 | "have", 3833 | "a", 3834 | "cluster", 3835 | "of", 3836 | "news", 3837 | "articles", 3838 | "on", 3839 | "a", 3840 | "particular", 3841 | "event", 3842 | "and", 3843 | "you", 3844 | "want", 3845 | "to", 3846 | "produce", 3847 | "one", 3848 | "summary" 
3849 | ], 3850 | [ 3851 | "each", 3852 | "article", 3853 | "is", 3854 | "likely", 3855 | "to", 3856 | "have", 3857 | "many", 3858 | "similar", 3859 | "sentences", 3860 | "and", 3861 | "you", 3862 | "would", 3863 | "only", 3864 | "want", 3865 | "to", 3866 | "include", 3867 | "distinct", 3868 | "ideas", 3869 | "in", 3870 | "the", 3871 | "summary" 3872 | ], 3873 | [ 3874 | "to", 3875 | "address", 3876 | "this", 3877 | "issue", 3878 | "lexrank", 3879 | "applies", 3880 | "a", 3881 | "heuristic", 3882 | "post-processing", 3883 | "step", 3884 | "that", 3885 | "builds", 3886 | "up", 3887 | "a", 3888 | "summary", 3889 | "by", 3890 | "adding", 3891 | "sentences", 3892 | "in", 3893 | "rank", 3894 | "order", 3895 | "but", 3896 | "discards", 3897 | "any", 3898 | "sentences", 3899 | "that", 3900 | "are", 3901 | "too", 3902 | "similar", 3903 | "to", 3904 | "ones", 3905 | "already", 3906 | "placed", 3907 | "in", 3908 | "the", 3909 | "summary" 3910 | ], 3911 | [ 3912 | "the", 3913 | "method", 3914 | "used", 3915 | "is", 3916 | "called", 3917 | "cross-sentence", 3918 | "information", 3919 | "subsumption", 3920 | "csis" 3921 | ], 3922 | [ 3923 | "these", 3924 | "methods", 3925 | "work", 3926 | "based", 3927 | "on", 3928 | "the", 3929 | "idea", 3930 | "that", 3931 | "sentences", 3932 | "recommend", 3933 | "other", 3934 | "similar", 3935 | "sentences", 3936 | "to", 3937 | "the", 3938 | "reader" 3939 | ], 3940 | [ 3941 | "thus", 3942 | "if", 3943 | "one", 3944 | "sentence", 3945 | "is", 3946 | "very", 3947 | "similar", 3948 | "to", 3949 | "many", 3950 | "others", 3951 | "it", 3952 | "will", 3953 | "likely", 3954 | "be", 3955 | "a", 3956 | "sentence", 3957 | "of", 3958 | "great", 3959 | "importance" 3960 | ], 3961 | [ 3962 | "the", 3963 | "importance", 3964 | "of", 3965 | "this", 3966 | "sentence", 3967 | "also", 3968 | "stems", 3969 | "from", 3970 | "the", 3971 | "importance", 3972 | "of", 3973 | "the", 3974 | "sentences", 3975 | "recommending", 3976 | "it" 3977 | ], 3978 | [ 3979 | 
"thus", 3980 | "to", 3981 | "get", 3982 | "ranked", 3983 | "highly", 3984 | "and", 3985 | "placed", 3986 | "in", 3987 | "a", 3988 | "summary", 3989 | "a", 3990 | "sentence", 3991 | "must", 3992 | "be", 3993 | "similar", 3994 | "to", 3995 | "many", 3996 | "sentences", 3997 | "that", 3998 | "are", 3999 | "in", 4000 | "turn", 4001 | "also", 4002 | "similar", 4003 | "to", 4004 | "many", 4005 | "other", 4006 | "sentences" 4007 | ], 4008 | [ 4009 | "this", 4010 | "makes", 4011 | "intuitive", 4012 | "sense", 4013 | "and", 4014 | "allows", 4015 | "the", 4016 | "algorithms", 4017 | "to", 4018 | "be", 4019 | "applied", 4020 | "to", 4021 | "any", 4022 | "arbitrary", 4023 | "new", 4024 | "text" 4025 | ], 4026 | [ 4027 | "the", 4028 | "methods", 4029 | "are", 4030 | "domain-independent", 4031 | "and", 4032 | "easily", 4033 | "portable" 4034 | ], 4035 | [ 4036 | "one", 4037 | "could", 4038 | "imagine", 4039 | "the", 4040 | "features", 4041 | "indicating", 4042 | "important", 4043 | "sentences", 4044 | "in", 4045 | "the", 4046 | "news", 4047 | "domain", 4048 | "might", 4049 | "vary", 4050 | "considerably", 4051 | "from", 4052 | "the", 4053 | "biomedical", 4054 | "domain" 4055 | ], 4056 | [ 4057 | "however", 4058 | "the", 4059 | "unsupervised", 4060 | "recommendation", 4061 | "-based", 4062 | "approach", 4063 | "applies", 4064 | "to", 4065 | "any", 4066 | "domain" 4067 | ], 4068 | [ 4069 | "multi-document", 4070 | "summarization", 4071 | "is", 4072 | "an", 4073 | "automatic", 4074 | "procedure", 4075 | "aimed", 4076 | "at", 4077 | "extraction", 4078 | "of", 4079 | "information", 4080 | "from", 4081 | "multiple", 4082 | "texts", 4083 | "written", 4084 | "about", 4085 | "the", 4086 | "same", 4087 | "topic" 4088 | ], 4089 | [ 4090 | "resulting", 4091 | "summary", 4092 | "report", 4093 | "allows", 4094 | "individual", 4095 | "users", 4096 | "such", 4097 | "as", 4098 | "professional", 4099 | "information", 4100 | "consumers", 4101 | "to", 4102 | "quickly", 4103 | "familiarize", 4104 | 
"themselves", 4105 | "with", 4106 | "information", 4107 | "contained", 4108 | "in", 4109 | "a", 4110 | "large", 4111 | "cluster", 4112 | "of", 4113 | "documents" 4114 | ], 4115 | [ 4116 | "in", 4117 | "such", 4118 | "a", 4119 | "way", 4120 | "multi-document", 4121 | "summarization", 4122 | "systems", 4123 | "are", 4124 | "complementing", 4125 | "the", 4126 | "news", 4127 | "aggregators", 4128 | "performing", 4129 | "the", 4130 | "next", 4131 | "step", 4132 | "down", 4133 | "the", 4134 | "road", 4135 | "of", 4136 | "coping", 4137 | "with", 4138 | "information", 4139 | "overload" 4140 | ], 4141 | [ 4142 | "multi-document", 4143 | "summarization", 4144 | "creates", 4145 | "information", 4146 | "reports", 4147 | "that", 4148 | "are", 4149 | "both", 4150 | "concise", 4151 | "and", 4152 | "comprehensive" 4153 | ], 4154 | [ 4155 | "with", 4156 | "different", 4157 | "opinions", 4158 | "being", 4159 | "put", 4160 | "together", 4161 | "&", 4162 | "outlined", 4163 | "every", 4164 | "topic", 4165 | "is", 4166 | "described", 4167 | "from", 4168 | "multiple", 4169 | "perspectives", 4170 | "within", 4171 | "a", 4172 | "single", 4173 | "document" 4174 | ], 4175 | [ 4176 | "while", 4177 | "the", 4178 | "goal", 4179 | "of", 4180 | "a", 4181 | "brief", 4182 | "summary", 4183 | "is", 4184 | "to", 4185 | "simplify", 4186 | "information", 4187 | "search", 4188 | "and", 4189 | "cut", 4190 | "the", 4191 | "time", 4192 | "by", 4193 | "pointing", 4194 | "to", 4195 | "the", 4196 | "most", 4197 | "relevant", 4198 | "source", 4199 | "documents", 4200 | "comprehensive", 4201 | "multi-document", 4202 | "summary", 4203 | "should", 4204 | "itself", 4205 | "contain", 4206 | "the", 4207 | "required", 4208 | "information", 4209 | "hence", 4210 | "limiting", 4211 | "the", 4212 | "need", 4213 | "for", 4214 | "accessing", 4215 | "original", 4216 | "files", 4217 | "to", 4218 | "cases", 4219 | "when", 4220 | "refinement", 4221 | "is", 4222 | "required" 4223 | ], 4224 | [ 4225 | "automatic", 4226 | 
"summaries", 4227 | "present", 4228 | "information", 4229 | "extracted", 4230 | "from", 4231 | "multiple", 4232 | "sources", 4233 | "algorithmically", 4234 | "without", 4235 | "any", 4236 | "editorial", 4237 | "touch", 4238 | "or", 4239 | "subjective", 4240 | "human", 4241 | "intervention", 4242 | "thus", 4243 | "making", 4244 | "it", 4245 | "completely", 4246 | "unbiased" 4247 | ], 4248 | [ 4249 | "multi-document", 4250 | "extractive", 4251 | "summarization", 4252 | "faces", 4253 | "a", 4254 | "problem", 4255 | "of", 4256 | "potential", 4257 | "redundancy" 4258 | ], 4259 | [ 4260 | "ideally", 4261 | "we", 4262 | "would", 4263 | "like", 4264 | "to", 4265 | "extract", 4266 | "sentences", 4267 | "that", 4268 | "are", 4269 | "both", 4270 | "central", 4271 | "i" 4272 | ], 4273 | [ 4274 | "contain", 4275 | "the", 4276 | "main", 4277 | "ideas", 4278 | "and", 4279 | "diverse", 4280 | "i" 4281 | ], 4282 | [ 4283 | "they", 4284 | "differ", 4285 | "from", 4286 | "one", 4287 | "another" 4288 | ], 4289 | [ 4290 | "lexrank", 4291 | "deals", 4292 | "with", 4293 | "diversity", 4294 | "as", 4295 | "a", 4296 | "heuristic", 4297 | "final", 4298 | "stage", 4299 | "using", 4300 | "csis", 4301 | "and", 4302 | "other", 4303 | "systems", 4304 | "have", 4305 | "used", 4306 | "similar", 4307 | "methods", 4308 | "such", 4309 | "as", 4310 | "maximal", 4311 | "marginal", 4312 | "relevance", 4313 | "mmr", 4314 | "in", 4315 | "trying", 4316 | "to", 4317 | "eliminate", 4318 | "redundancy", 4319 | "in", 4320 | "information", 4321 | "retrieval", 4322 | "results" 4323 | ], 4324 | [ 4325 | "there", 4326 | "is", 4327 | "a", 4328 | "general", 4329 | "purpose", 4330 | "graph-based", 4331 | "ranking", 4332 | "algorithm", 4333 | "like", 4334 | "page/lex/textrank", 4335 | "that", 4336 | "handles", 4337 | "both", 4338 | "centrality", 4339 | "and", 4340 | "diversity", 4341 | "in", 4342 | "a", 4343 | "unified", 4344 | "mathematical", 4345 | "framework", 4346 | "based", 4347 | "on", 4348 | "absorbing", 4349 | 
"markov", 4350 | "chain", 4351 | "random", 4352 | "walks" 4353 | ], 4354 | [ 4355 | "an", 4356 | "absorbing", 4357 | "random", 4358 | "walk", 4359 | "is", 4360 | "like", 4361 | "a", 4362 | "standard", 4363 | "random", 4364 | "walk", 4365 | "except", 4366 | "some", 4367 | "states", 4368 | "are", 4369 | "now", 4370 | "absorbing", 4371 | "states", 4372 | "that", 4373 | "act", 4374 | "as", 4375 | "black", 4376 | "holes", 4377 | "that", 4378 | "cause", 4379 | "the", 4380 | "walk", 4381 | "to", 4382 | "end", 4383 | "abruptly", 4384 | "at", 4385 | "that", 4386 | "state" 4387 | ], 4388 | [ 4389 | "the", 4390 | "algorithm", 4391 | "is", 4392 | "called", 4393 | "grasshopper" 4394 | ], 4395 | [ 4396 | "in", 4397 | "addition", 4398 | "to", 4399 | "explicitly", 4400 | "promoting", 4401 | "diversity", 4402 | "during", 4403 | "the", 4404 | "ranking", 4405 | "process", 4406 | "grasshopper", 4407 | "incorporates", 4408 | "a", 4409 | "prior", 4410 | "ranking", 4411 | "based", 4412 | "on", 4413 | "sentence", 4414 | "position", 4415 | "in", 4416 | "the", 4417 | "case", 4418 | "of", 4419 | "summarization" 4420 | ], 4421 | [ 4422 | "the", 4423 | "most", 4424 | "common", 4425 | "way", 4426 | "to", 4427 | "evaluate", 4428 | "the", 4429 | "informativeness", 4430 | "of", 4431 | "automatic", 4432 | "summaries", 4433 | "is", 4434 | "to", 4435 | "compare", 4436 | "them", 4437 | "with", 4438 | "human-made", 4439 | "model", 4440 | "summaries" 4441 | ], 4442 | [ 4443 | "evaluation", 4444 | "techniques", 4445 | "fall", 4446 | "into", 4447 | "intrinsic", 4448 | "and", 4449 | "extrinsic", 4450 | "inter-texual", 4451 | "and", 4452 | "intra-texual" 4453 | ], 4454 | [ 4455 | "an", 4456 | "intrinsic", 4457 | "evaluation", 4458 | "tests", 4459 | "the", 4460 | "summarization", 4461 | "system", 4462 | "in", 4463 | "of", 4464 | "itself", 4465 | "while", 4466 | "an", 4467 | "extrinsic", 4468 | "evaluation", 4469 | "tests", 4470 | "the", 4471 | "summarization", 4472 | "based", 4473 | "on", 4474 | "how", 4475 
| "it", 4476 | "affects", 4477 | "the", 4478 | "completion", 4479 | "of", 4480 | "some", 4481 | "other", 4482 | "task" 4483 | ], 4484 | [ 4485 | "intrinsic", 4486 | "evaluations", 4487 | "have", 4488 | "assessed", 4489 | "mainly", 4490 | "the", 4491 | "coherence", 4492 | "and", 4493 | "informativeness", 4494 | "of", 4495 | "summaries" 4496 | ], 4497 | [ 4498 | "extrinsic", 4499 | "evaluations", 4500 | "on", 4501 | "the", 4502 | "other", 4503 | "hand", 4504 | "have", 4505 | "tested", 4506 | "the", 4507 | "impact", 4508 | "of", 4509 | "summarization", 4510 | "on", 4511 | "tasks", 4512 | "like", 4513 | "relevance", 4514 | "assessment", 4515 | "reading", 4516 | "comprehension", 4517 | "etc" 4518 | ], 4519 | [ 4520 | "intra-textual", 4521 | "methods", 4522 | "assess", 4523 | "the", 4524 | "output", 4525 | "of", 4526 | "a", 4527 | "specific", 4528 | "summarization", 4529 | "system", 4530 | "and", 4531 | "the", 4532 | "inter-texual", 4533 | "ones", 4534 | "focus", 4535 | "on", 4536 | "contrastive", 4537 | "analysis", 4538 | "of", 4539 | "outputs", 4540 | "of", 4541 | "several", 4542 | "summarization", 4543 | "systems" 4544 | ], 4545 | [ 4546 | "human", 4547 | "judgement", 4548 | "often", 4549 | "has", 4550 | "wide", 4551 | "variance", 4552 | "on", 4553 | "what", 4554 | "is", 4555 | "considered", 4556 | "a", 4557 | "good", 4558 | "summary", 4559 | "which", 4560 | "means", 4561 | "that", 4562 | "making", 4563 | "the", 4564 | "evaluation", 4565 | "process", 4566 | "automatic", 4567 | "is", 4568 | "particularly", 4569 | "difficult" 4570 | ], 4571 | [ 4572 | "manual", 4573 | "evaluation", 4574 | "can", 4575 | "be", 4576 | "used", 4577 | "but", 4578 | "this", 4579 | "is", 4580 | "both", 4581 | "time", 4582 | "and", 4583 | "labor-intensive", 4584 | "as", 4585 | "it", 4586 | "requires", 4587 | "humans", 4588 | "to", 4589 | "read", 4590 | "not", 4591 | "only", 4592 | "the", 4593 | "summaries", 4594 | "but", 4595 | "also", 4596 | "the", 4597 | "source", 4598 | "documents" 4599 | ], 
4600 | [ 4601 | "other", 4602 | "issues", 4603 | "are", 4604 | "those", 4605 | "concerning", 4606 | "coherence", 4607 | "and", 4608 | "coverage" 4609 | ], 4610 | [ 4611 | "one", 4612 | "of", 4613 | "the", 4614 | "metrics", 4615 | "used", 4616 | "in", 4617 | "nist's", 4618 | "annual", 4619 | "document", 4620 | "understanding", 4621 | "conferences", 4622 | "in", 4623 | "which", 4624 | "research", 4625 | "groups", 4626 | "submit", 4627 | "their", 4628 | "systems", 4629 | "for", 4630 | "both", 4631 | "summarization", 4632 | "and", 4633 | "translation", 4634 | "tasks", 4635 | "is", 4636 | "the", 4637 | "rouge", 4638 | "metric", 4639 | "recall-oriented", 4640 | "understudy", 4641 | "for", 4642 | "gisting", 4643 | "evaluation" 4644 | ], 4645 | [ 4646 | "it", 4647 | "essentially", 4648 | "calculates", 4649 | "n-gram", 4650 | "overlaps", 4651 | "between", 4652 | "automatically", 4653 | "generated", 4654 | "summaries", 4655 | "and", 4656 | "previously-written", 4657 | "human", 4658 | "summaries" 4659 | ], 4660 | [ 4661 | "a", 4662 | "high", 4663 | "level", 4664 | "of", 4665 | "overlap", 4666 | "should", 4667 | "indicate", 4668 | "a", 4669 | "high", 4670 | "level", 4671 | "of", 4672 | "shared", 4673 | "concepts", 4674 | "between", 4675 | "the", 4676 | "two", 4677 | "summaries" 4678 | ], 4679 | [ 4680 | "note", 4681 | "that", 4682 | "overlap", 4683 | "metrics", 4684 | "like", 4685 | "this", 4686 | "are", 4687 | "unable", 4688 | "to", 4689 | "provide", 4690 | "any", 4691 | "feedback", 4692 | "on", 4693 | "a", 4694 | "summary's", 4695 | "coherence" 4696 | ], 4697 | [ 4698 | "anaphor", 4699 | "resolution", 4700 | "remains", 4701 | "another", 4702 | "problem", 4703 | "yet", 4704 | "to", 4705 | "be", 4706 | "fully", 4707 | "solved" 4708 | ], 4709 | [ 4710 | "evaluating", 4711 | "summaries", 4712 | "either", 4713 | "manually", 4714 | "or", 4715 | "automatically", 4716 | "is", 4717 | "a", 4718 | "hard", 4719 | "task" 4720 | ], 4721 | [ 4722 | "the", 4723 | "main", 4724 | 
"difficulty", 4725 | "in", 4726 | "evaluation", 4727 | "comes", 4728 | "from", 4729 | "the", 4730 | "impossibility", 4731 | "of", 4732 | "building", 4733 | "a", 4734 | "fair", 4735 | "gold-standard", 4736 | "against", 4737 | "which", 4738 | "the", 4739 | "results", 4740 | "of", 4741 | "the", 4742 | "systems", 4743 | "can", 4744 | "be", 4745 | "compared" 4746 | ], 4747 | [ 4748 | "furthermore", 4749 | "it", 4750 | "is", 4751 | "also", 4752 | "very", 4753 | "hard", 4754 | "to", 4755 | "determine", 4756 | "what", 4757 | "a", 4758 | "correct", 4759 | "summary", 4760 | "is", 4761 | "because", 4762 | "there", 4763 | "is", 4764 | "always", 4765 | "the", 4766 | "possibility", 4767 | "of", 4768 | "a", 4769 | "system", 4770 | "to", 4771 | "generate", 4772 | "a", 4773 | "good", 4774 | "summary", 4775 | "that", 4776 | "is", 4777 | "quite", 4778 | "different", 4779 | "from", 4780 | "any", 4781 | "human", 4782 | "summary", 4783 | "used", 4784 | "as", 4785 | "an", 4786 | "approximation", 4787 | "to", 4788 | "the", 4789 | "correct", 4790 | "output" 4791 | ], 4792 | [ 4793 | "content", 4794 | "selection", 4795 | "is", 4796 | "not", 4797 | "a", 4798 | "deterministic", 4799 | "problem" 4800 | ], 4801 | [ 4802 | "people", 4803 | "are", 4804 | "subjective", 4805 | "and", 4806 | "different", 4807 | "authors", 4808 | "would", 4809 | "choose", 4810 | "different", 4811 | "sentences" 4812 | ], 4813 | [ 4814 | "and", 4815 | "individuals", 4816 | "may", 4817 | "not", 4818 | "be", 4819 | "consistent" 4820 | ], 4821 | [ 4822 | "a", 4823 | "particular", 4824 | "person", 4825 | "may", 4826 | "chose", 4827 | "different", 4828 | "sentences", 4829 | "at", 4830 | "different", 4831 | "times" 4832 | ], 4833 | [ 4834 | "two", 4835 | "distinct", 4836 | "sentences", 4837 | "expressed", 4838 | "in", 4839 | "different", 4840 | "words", 4841 | "can", 4842 | "express", 4843 | "the", 4844 | "same", 4845 | "meaning" 4846 | ], 4847 | [ 4848 | "this", 4849 | "phenomenon", 4850 | "is", 4851 | "known", 4852 | "as", 
4853 | "paraphrasing" 4854 | ], 4855 | [ 4856 | "we", 4857 | "can", 4858 | "find", 4859 | "an", 4860 | "approach", 4861 | "to", 4862 | "automatically", 4863 | "evaluating", 4864 | "summaries", 4865 | "using", 4866 | "paraphrases", 4867 | "paraeval" 4868 | ], 4869 | [ 4870 | "most", 4871 | "summarization", 4872 | "systems", 4873 | "perform", 4874 | "an", 4875 | "extractive", 4876 | "approach", 4877 | "selecting", 4878 | "and", 4879 | "copying", 4880 | "important", 4881 | "sentences", 4882 | "from", 4883 | "the", 4884 | "source", 4885 | "documents" 4886 | ], 4887 | [ 4888 | "although", 4889 | "humans", 4890 | "can", 4891 | "also", 4892 | "cut", 4893 | "and", 4894 | "paste", 4895 | "relevant", 4896 | "information", 4897 | "of", 4898 | "a", 4899 | "text", 4900 | "most", 4901 | "of", 4902 | "the", 4903 | "times", 4904 | "they", 4905 | "rephrase", 4906 | "sentences", 4907 | "when", 4908 | "necessary", 4909 | "or", 4910 | "they", 4911 | "join", 4912 | "different", 4913 | "related", 4914 | "information", 4915 | "into", 4916 | "one", 4917 | "sentence" 4918 | ], 4919 | [ 4920 | "domain", 4921 | "independent", 4922 | "summarization", 4923 | "techniques", 4924 | "generally", 4925 | "apply", 4926 | "sets", 4927 | "of", 4928 | "general", 4929 | "features", 4930 | "which", 4931 | "can", 4932 | "be", 4933 | "used", 4934 | "to", 4935 | "identify", 4936 | "information-rich", 4937 | "text", 4938 | "segments" 4939 | ], 4940 | [ 4941 | "recent", 4942 | "research", 4943 | "focus", 4944 | "has", 4945 | "drifted", 4946 | "to", 4947 | "domain-specific", 4948 | "summarization", 4949 | "techniques", 4950 | "that", 4951 | "utilize", 4952 | "the", 4953 | "available", 4954 | "knowledge", 4955 | "specific", 4956 | "to", 4957 | "the", 4958 | "domain", 4959 | "of", 4960 | "text" 4961 | ], 4962 | [ 4963 | "for", 4964 | "example", 4965 | "automatic", 4966 | "summarization", 4967 | "research", 4968 | "on", 4969 | "medical", 4970 | "text", 4971 | "generally", 4972 | "attempts", 4973 | "to", 4974 | 
"utilize", 4975 | "the", 4976 | "various", 4977 | "sources", 4978 | "of", 4979 | "codified", 4980 | "medical", 4981 | "knowledge", 4982 | "and", 4983 | "ontologies" 4984 | ], 4985 | [ 4986 | "the", 4987 | "main", 4988 | "drawback", 4989 | "of", 4990 | "the", 4991 | "evaluation", 4992 | "systems", 4993 | "existing", 4994 | "so", 4995 | "far", 4996 | "is", 4997 | "that", 4998 | "we", 4999 | "need", 5000 | "at", 5001 | "least", 5002 | "one", 5003 | "reference", 5004 | "summary", 5005 | "and", 5006 | "for", 5007 | "some", 5008 | "methods", 5009 | "more", 5010 | "than", 5011 | "one", 5012 | "to", 5013 | "be", 5014 | "able", 5015 | "to", 5016 | "compare", 5017 | "summaries", 5018 | "with", 5019 | "models" 5020 | ], 5021 | [ 5022 | "this", 5023 | "is", 5024 | "a", 5025 | "hard", 5026 | "and", 5027 | "expensive", 5028 | "task" 5029 | ], 5030 | [ 5031 | "much", 5032 | "effort", 5033 | "has", 5034 | "to", 5035 | "be", 5036 | "done", 5037 | "in", 5038 | "order", 5039 | "to", 5040 | "have", 5041 | "corpus", 5042 | "of", 5043 | "texts", 5044 | "and", 5045 | "their", 5046 | "corresponding", 5047 | "summaries" 5048 | ], 5049 | [ 5050 | "furthermore", 5051 | "for", 5052 | "some", 5053 | "methods", 5054 | "not", 5055 | "only", 5056 | "do", 5057 | "we", 5058 | "need", 5059 | "to", 5060 | "have", 5061 | "human-made", 5062 | "summaries", 5063 | "available", 5064 | "for", 5065 | "comparison", 5066 | "but", 5067 | "also", 5068 | "manual", 5069 | "annotation", 5070 | "has", 5071 | "to", 5072 | "be", 5073 | "performed", 5074 | "in", 5075 | "some", 5076 | "of", 5077 | "them", 5078 | "e" 5079 | ], 5080 | [ 5081 | "scu", 5082 | "in", 5083 | "the", 5084 | "pyramid", 5085 | "method" 5086 | ], 5087 | [ 5088 | "in", 5089 | "any", 5090 | "case", 5091 | "what", 5092 | "the", 5093 | "evaluation", 5094 | "methods", 5095 | "need", 5096 | "as", 5097 | "an", 5098 | "input", 5099 | "is", 5100 | "a", 5101 | "set", 5102 | "of", 5103 | "summaries", 5104 | "to", 5105 | "serve", 5106 | "as", 5107 | "gold", 
5108 | "standards", 5109 | "and", 5110 | "a", 5111 | "set", 5112 | "of", 5113 | "automatic", 5114 | "summaries" 5115 | ], 5116 | [ 5117 | "moreover", 5118 | "they", 5119 | "all", 5120 | "perform", 5121 | "a", 5122 | "quantitative", 5123 | "evaluation", 5124 | "with", 5125 | "regard", 5126 | "to", 5127 | "different", 5128 | "similarity", 5129 | "metrics" 5130 | ], 5131 | [ 5132 | "to", 5133 | "overcome", 5134 | "these", 5135 | "problems", 5136 | "we", 5137 | "think", 5138 | "that", 5139 | "the", 5140 | "quantitative", 5141 | "evaluation", 5142 | "might", 5143 | "not", 5144 | "be", 5145 | "the", 5146 | "only", 5147 | "way", 5148 | "to", 5149 | "evaluate", 5150 | "summaries", 5151 | "and", 5152 | "a", 5153 | "qualitative", 5154 | "automatic", 5155 | "evaluation", 5156 | "would", 5157 | "be", 5158 | "also", 5159 | "important" 5160 | ] 5161 | ] -------------------------------------------------------------------------------- /test/tr-test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'); 2 | var should = require('should'); 3 | var expect = require('chai').expect; 4 | var tr = require('../index'); 5 | var LONG_KEY_EX = require('./Automatic_Summarization-tokens.json'); 6 | var LONG_SENT_EX = require('./Automatic_Summarization-sents.json'); 7 | var fs = require('fs'); 8 | 9 | // TODO: right now just example code, no real unit tests! 
// Example-driven smoke checks for the text-rank module. Each case builds a
// graph from a pre-tokenized fixture, ranks it, and prints the leading
// entries. Nothing is asserted here, so a case only fails when a call throws.
describe('textrank', function () {
    // Upper bound on how many ranked entries each example prints
    var TOP_N = 10;

    // Returns a copy holding at most TOP_N entries from the front of `ranked`
    function takeTop(ranked) {
        var count = Math.min(ranked.length, TOP_N);
        return ranked.slice(0, count);
    }

    // Comparator ordering entries by ascending `vertex`
    function byVertex(left, right) {
        return left.vertex - right.vertex;
    }

    describe('keyword extraction', function () {

        it('should compute top keywords', function () {
            // Build the keyword graph from the token fixture
            var keywordGraph = tr.keyExGraph(LONG_KEY_EX);
            // Rank the graph, then keep only the leading entries
            var best = takeTop(tr.textRank(keywordGraph));

            console.log('Top rated hits!');
            best.forEach(function (entry) {
                console.log(entry.name + ' --> ' + entry.score);
            });
        });
    });

    describe('sentence extraction', function () {

        it('should compute top sentences', function () {
            // Build the sentence graph from the pre-split sentence fixture
            var sentenceGraph = tr.sentExGraph(LONG_SENT_EX);
            // The second argument is the iteration count used by this example (40)
            var best = takeTop(tr.textRank(sentenceGraph, 40));

            // In-place sort by vertex so the kept sentences print in article order
            best.sort(byVertex);

            console.log('Top rated hits!\n');
            best.forEach(function (entry) {
                // Each name is joined with spaces to print one sentence per entry
                console.log(entry.name.join(' ') + '\n');
            });
        });
    });

});