├── .jshintrc
├── Gruntfile.js
├── README.md
├── index.js
├── package.json
└── test
    ├── Automatic_Summarization-sents.json
    ├── Automatic_Summarization-tokens.json
    └── tr-test.js


/.jshintrc:
--------------------------------------------------------------------------------
 1 | {
 2 |     "node": true,
 3 |     "browser": false,
 4 |     "es5": true,
 5 |     "esnext": true,
 6 |     "bitwise": true,
 7 |     "camelcase": true,
 8 |     "curly": true,
 9 |     "eqeqeq": true,
10 |     "immed": true,
11 |     "indent": 4,
12 |     "latedef": true,
13 |     "newcap": true,
14 |     "noarg": true,
15 |     "quotmark": "single",
16 |     "regexp": true,
17 |     "undef": true,
18 |     "unused": true,
19 |     "strict": true,
20 |     "trailing": true,
21 |     "smarttabs": true,
22 |     "white": false,
23 |     "globals": { "window": false }
24 | }


--------------------------------------------------------------------------------
/Gruntfile.js:
--------------------------------------------------------------------------------
 1 | 'use strict';
 2 | 
 3 | module.exports = function (grunt) {
 4 |     // load all grunt tasks
 5 |     require('matchdep').filterDev('grunt-*').forEach(grunt.loadNpmTasks);
 6 | 
 7 |     grunt.initConfig({
 8 |         clean: {
 9 |             dist: ['.tmp', 'dist/*'],
10 |             server: '.tmp'
11 |         },
12 |         uglify: {
13 |             my_target: {
14 |                 files: {
15 |                     'dist/textrank.min.js': ['index.js']
16 |                 }
17 |             }
18 |         },
19 |         jshint: {
20 |             options: {
21 |                 jshintrc: '.jshintrc'
22 |             },
23 |             all: [
24 |                 'lib/*.js'
25 |             ]
26 |         },
27 |         mochaTest: {
28 |             test: {
29 |                 options: {
30 |                     reporter: 'spec'
31 |                 },
32 |                 src: ['test/*.js']
33 |             }
34 |         }
35 |     });
36 | 
37 |     grunt.registerTask('test', [
38 |         'clean',
39 |         'mochaTest'
40 |     ]);
41 | 
42 |     grunt.registerTask('build', [
43 |         'clean:dist',
44 |         'uglify'
45 |     ]);
46 | 
47 |     grunt.registerTask('default', [
48 |         'jshint',
49 |         'test',
50 |         'build'
51 |     ]);
52 | };
53 | 
54 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | textrank-js
 2 | ===========
 3 | 
 4 | TextRank is an algorithm for Text Summarization, by Rada Mihalcea & Paul Tarau.  The code here is based on their paper "TextRank: Bringing Order into Texts".  I've noticed that there are many implementations out there, but this one is intended to demonstrate the algorithm without any additional baggage.  Also, unlike many other implementations I have seen, this has no algorithm dependencies and could also work in the browser. I wanted to show how elegant, simple and clean the algorithm is, so I kept it short -- about 130 lines of JavaScript (ES5).  It currently depends only on lodash ('_'), a standard JS library used in many (most?) projects, for a few choice zingy one-liners.
 5 | 
 6 | The algorithm itself can extend to any type of graph, as they note in their paper, but I have provided the two types of graphs explored in the paper: a keyword extraction graph, which is an undirected graph derived from collocation, and a sentence extraction graph using the similarity weighting (as described in the paper) on the edges.  There is a function for building a graph of each type, and once the graph has been built, the textRank function performs the algorithm on the generated graph.
 7 | 
 8 | Note that this code only implements the TextRank algorithm itself; the sentences must be properly formatted upfront.  I have provided example tokenization for both tasks in the test directory, both derived from tokenizing the Wikipedia entry for "Automatic summarization", both minimally processed using a custom (very minimal) tokenizer and OpenNLP's default models for sentence splitting and POS, and converted to JSON.  As long as you provide the format that this code expects, you should be able to use whatever library you want to preprocess.  The keyword extraction builder needs the format to include POS tags since it filters the content while it is building its adjacencies.  The sentence extraction builder does not require POS, but requires pre-split sentences.
 9 | 
10 | The "tests" are not currently testing anything, but serve as demonstration code for how to run the software.  Note that textRank() has a default number of iterations -- it doesn't try to test for convergence.  This is just to keep it simple; it would be easy to modify it to test for convergence instead, but for now you can pass in any number you want if that default isn't suitable (see test examples).
11 | 
12 | Build using Grunt:
13 | ```
14 | $ npm install
15 | $ grunt
16 | 
17 | ```
18 | Here's what the output looks like for the first 5 extracted sentences when performing the sentence extraction task:
19 | 
20 | ```
21 | automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document
22 | 
23 | two particular types of summarization often addressed in the literature are keyphrase extraction where the goal is to select individual words or phrases to tag a document and document summarization where the goal is to select whole sentences to create a short paragraph summary
24 | 
25 | in general abstraction can condense a text more strongly than extraction but the programs that can do this are harder to develop as they require the use of natural language generation technology which itself is a growing field
26 | 
27 | while some work has been done in abstractive summarization creating an abstract synopsis like that of a human the majority of summarization systems are extractive selecting a subset of sentences to place in a summary
28 | 
29 | apart from fully automated summarizers fas there are systems that aid users with the task of summarization mahs = machine aided human summarization for example by highlighting candidate passages to be included in the summary and there are systems that depend on post-processing by a human hams = human aided machine summarization
30 | 
31 | ```
32 | You are welcome to use this code for whatever nefarious purposes, but please attribute it to this implementation if you do.
33 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | 'use strict';
  2 | if (typeof require === 'function' && typeof exports === 'object' && typeof module === 'object') {
  3 |     var _ = require('lodash');
  4 | }
  /**
   * Ranks the vertices of a weighted graph with the TextRank iteration.
   *
   * Every sweep recomputes each vertex score as
   *     (1 - d) + d * SUM_j( w(j->i) / SUM_k w(j->k) * score(j) )
   * where j runs over the vertices listed in V[i].in.  Scores start from
   * random values and are updated in place, so a vertex already sees the
   * new scores of the vertices processed before it in the same sweep.
   * A fixed number of sweeps is run; convergence is not tested.
   *
   * @param {Array} V  graph vertices.  V[i] is {name, out, in} where `out`
   *     and `in` are arrays of {index, weight} edges and `index` is a
   *     position in V.  Edge weights are always read from the source
   *     vertex's `out` list.
   * @param {Number} [niter=200]  number of sweeps (a falsy value selects
   *     the default).
   * @param {Number} [dampening=0.85]  damping factor d (a falsy value
   *     selects the default).
   * @returns {Array} one {name, vertex, score} record per vertex, sorted by
   *     descending score; `vertex` is the vertex's position in V.
   */
  function textRank(V, niter, dampening) {

      var d = dampening || 0.85;
      var sweeps = niter || 200;
      var outTotal = [];  // outTotal[j]: sum of the weights on V[j].out
      var ranks = [];

      // Sum of the weights of a list of edges.
      function totalWeight(edges) {
          var acc = 0;
          edges.forEach(function (edge) {
              acc += edge.weight;
          });
          return acc;
      }

      // Weight of the first edge from -> to in V[from].out, or 0 if there is none.
      function edgeWeight(from, to) {
          var out = V[from].out;
          for (var e = 0; e < out.length; ++e) {
              if (out[e].index === to) {
                  return out[e].weight;
              }
          }
          return 0;
      }

      // Weighted share of the neighbours' scores that flows into vertex i.
      function incoming(i) {
          var acc = 0;
          V[i].in.forEach(function (edge) {
              var j = edge.index;
              // A vertex whose outgoing weights sum to zero has nothing to
              // pass on; dividing by that zero total would yield NaN and
              // poison the score of every vertex reachable from it.
              if (!outTotal[j]) {
                  return;
              }
              acc += edgeWeight(j, i) / outTotal[j] * ranks[j].score;
          });
          return acc;
      }

      V.forEach(function (vertex, j) {
          outTotal[j] = totalWeight(vertex.out);
          ranks[j] = {name: vertex.name, vertex: j, score: Math.random()};
      });

      for (var k = 0; k < sweeps; ++k) {
          for (var i = 0; i < V.length; ++i) {
              ranks[i].score = (1 - d) + d * incoming(i);
          }
      }

      ranks.sort(function (x, y) {
          return y.score - x.score;
      });
      return ranks;
  }
 47 | 
 /**
  * Builds the sentence-extraction graph: one vertex per sentence, with every
  * pair of sentences joined by a symmetric edge weighted by their similarity
  *     (number of distinct shared tokens) / (ln|s1| + ln|s2|).
  *
  * @param {Array} sentences  array of sentences, each an array of tokens.
  * @returns {Array} one vertex {name: sentence, out: [...], in: [...]} per
  *     sentence, in input order; every edge is {index, weight} where `index`
  *     is the position of the other sentence.
  */
 function sentExGraph(sentences) {
     // Number of distinct tokens of `a` that also occur in `b`.
     function overlap(a, b) {
         return a.filter(function (tok, pos) {
             return a.indexOf(tok) === pos && b.indexOf(tok) !== -1;
         }).length;
     }

     function sim(s1, s2) {
         var norm = Math.log(s1.length) + Math.log(s2.length);
         // norm is 0 for two one-token sentences and -Infinity when either
         // sentence is empty; the ratio is undefined there, so score it 0
         // instead of emitting an Infinity/NaN edge weight.
         if (!(norm > 0)) {
             return 0;
         }
         return overlap(s1, s2) / norm;
     }

     // Create every vertex up front so a lone sentence is still represented.
     var V = sentences.map(function (sentence) {
         return {name: sentence, out: [], in: []};
     });

     for (var i = 0; i < sentences.length; ++i) {
         for (var j = i + 1; j < sentences.length; ++j) {
             var score = sim(sentences[i], sentences[j]);
             // Symmetric
             V[i].out.push({index: j, weight: score});
             V[i].in.push({index: j, weight: score});
             V[j].in.push({index: i, weight: score});
             V[j].out.push({index: i, weight: score});
         }
     }
     return V;
 }
 68 | 
 /**
  * Builds the keyword-extraction graph: an undirected, unit-weight
  * co-occurrence graph over the candidate terms of a POS-tagged token list.
  *
  * A token is a candidate when its POS tag starts with 'N' or 'J', or is
  * exactly 'ADJ' or 'CD'.  Two different candidate terms are linked when
  * they occur within floor(win / 2) positions of each other.  Each pair is
  * linked once, however often it co-occurs; a term next to itself is not
  * linked to itself.
  *
  * @param {Array} text  tokens in document order, each {term, pos}.
  * @param {Number} [win=2]  co-occurrence window size (a falsy value selects
  *     the default).
  * @returns {Array} vertices {me, name, out, in}, numbered in the order their
  *     terms first appear in a link; `me` is the vertex's own position and
  *     every edge is {index, weight: 1}.  Terms without any link get no
  *     vertex.
  */
 function keyExGraph(text, win) {

     var V = [];
     var pairs = [];                      // unique term pairs, in discovery order
     var seenPair = Object.create(null);
     // Prototype-less map so that terms such as 'constructor' or 'toString'
     // are not mistaken for already-registered entries.
     var term2idx = Object.create(null);
     var nextIdx = 0;
     var sz = text.length;
     // Whole number of positions to look around a token; keeps the window
     // bounds integral for odd window sizes.
     var reach = Math.floor((win || 2) / 2);

     function isCandidate(token) {
         var pos = token.pos;
         return /^[NJ]/.test(pos) || pos === 'ADJ' || pos === 'CD';
     }

     function vertexIndex(term) {
         if (!(term in term2idx)) {
             term2idx[term] = nextIdx++;
         }
         return term2idx[term];
     }

     for (var i = 0; i < sz; ++i) {
         if (!isCandidate(text[i])) {
             continue;
         }
         var minWin = Math.max(0, i - reach);
         var maxWin = Math.min(sz, i + reach);
         for (var j = minWin; j < maxWin; ++j) {
             if (i === j || !isCandidate(text[j])) {
                 continue;
             }
             var pair = [text[i].term, text[j].term];
             pair.sort();
             if (pair[0] === pair[1]) {
                 // Same term on both ends: a self-loop carries no co-occurrence.
                 continue;
             }
             // Serialised pair as the de-duplication key; unlike joining on a
             // separator string it cannot collide with the terms' own text.
             var key = JSON.stringify(pair);
             if (!seenPair[key]) {
                 seenPair[key] = true;
                 pairs.push(pair);
             }
         }
     }

     pairs.forEach(function (pair) {
         var a = vertexIndex(pair[0]);
         var b = vertexIndex(pair[1]);
         V[a] = V[a] || {me: a, name: pair[0], out: [], in: []};
         V[b] = V[b] || {me: b, name: pair[1], out: [], in: []};
         V[a].out.push({index: b, weight: 1});
         V[a].in.push({index: b, weight: 1});
         V[b].out.push({index: a, weight: 1});
         V[b].in.push({index: a, weight: 1});
     });

     return V;
 }
122 | 
// Publish the three public functions on module.exports when a `module`
// object with an `exports` property is defined; otherwise do nothing.
(function (target) {
    if (!target) {
        return;
    }
    target.textRank = textRank;
    target.keyExGraph = keyExGraph;
    target.sentExGraph = sentExGraph;
}(typeof module !== 'undefined' && module.exports));
128 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "textrank",
 3 |   "main": "./index.js",
 4 |   "version": "0.0.1",
 5 |   "description": "TextRank implementation in Javascript",
 6 |   "scripts": {
 7 |     "test": "mocha test --recursive"
 8 |   },
 9 |   "repository": {
10 |     "type": "git",
11 |     "url": "https://github.com/dpressel/textrank-js.git"
12 |   },
13 |   "author": "Dan Pressel",
14 |   "dependencies": {
15 |   },
16 |   "devDependencies": {
17 |     "lodash": "3.0.0",
18 |     "chai": "~1.7.2",
19 |     "should": "~3.1.3",
20 |     "mocha": "~1.13.0",
21 |     "matchdep": "~0.1.1",
22 |     "grunt": "~0.4.0",
23 |     "grunt-contrib-uglify": "~0.1.1",
24 |     "grunt-contrib-jshint": "~0.1.1",
25 |     "grunt-contrib-connect": "0.1.2",
26 |     "grunt-contrib-clean": "0.4.0",
27 |     "grunt-mocha-test": "~0.9.0"
28 |   }
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/test/Automatic_Summarization-sents.json:
--------------------------------------------------------------------------------
   1 | [
   2 |   [
   3 |     "automatic",
   4 |     "summarization",
   5 |     "is",
   6 |     "the",
   7 |     "process",
   8 |     "of",
   9 |     "reducing",
  10 |     "a",
  11 |     "text",
  12 |     "document",
  13 |     "with",
  14 |     "a",
  15 |     "computer",
  16 |     "program",
  17 |     "in",
  18 |     "order",
  19 |     "to",
  20 |     "create",
  21 |     "a",
  22 |     "summary",
  23 |     "that",
  24 |     "retains",
  25 |     "the",
  26 |     "most",
  27 |     "important",
  28 |     "points",
  29 |     "of",
  30 |     "the",
  31 |     "original",
  32 |     "document"
  33 |   ],
  34 |   [
  35 |     "as",
  36 |     "the",
  37 |     "problem",
  38 |     "of",
  39 |     "information",
  40 |     "overload",
  41 |     "has",
  42 |     "grown",
  43 |     "and",
  44 |     "as",
  45 |     "the",
  46 |     "quantity",
  47 |     "of",
  48 |     "data",
  49 |     "has",
  50 |     "increased",
  51 |     "so",
  52 |     "has",
  53 |     "interest",
  54 |     "in",
  55 |     "automatic",
  56 |     "summarization"
  57 |   ],
  58 |   [
  59 |     "technologies",
  60 |     "that",
  61 |     "can",
  62 |     "make",
  63 |     "a",
  64 |     "coherent",
  65 |     "summary",
  66 |     "take",
  67 |     "into",
  68 |     "account",
  69 |     "variables",
  70 |     "such",
  71 |     "as",
  72 |     "length",
  73 |     "writing",
  74 |     "style",
  75 |     "and",
  76 |     "syntax"
  77 |   ],
  78 |   [
  79 |     "an",
  80 |     "example",
  81 |     "of",
  82 |     "the",
  83 |     "use",
  84 |     "of",
  85 |     "summarization",
  86 |     "technology",
  87 |     "is",
  88 |     "search",
  89 |     "engines",
  90 |     "such",
  91 |     "as",
  92 |     "google"
  93 |   ],
  94 |   [
  95 |     "document",
  96 |     "summarization",
  97 |     "is",
  98 |     "another"
  99 |   ],
 100 |   [
 101 |     "generally",
 102 |     "there",
 103 |     "are",
 104 |     "two",
 105 |     "approaches",
 106 |     "to",
 107 |     "automatic",
 108 |     "summarization",
 109 |     ":",
 110 |     "extraction",
 111 |     "and",
 112 |     "abstraction"
 113 |   ],
 114 |   [
 115 |     "extractive",
 116 |     "methods",
 117 |     "work",
 118 |     "by",
 119 |     "selecting",
 120 |     "a",
 121 |     "subset",
 122 |     "of",
 123 |     "existing",
 124 |     "words",
 125 |     "phrases",
 126 |     "or",
 127 |     "sentences",
 128 |     "in",
 129 |     "the",
 130 |     "original",
 131 |     "text",
 132 |     "to",
 133 |     "form",
 134 |     "the",
 135 |     "summary"
 136 |   ],
 137 |   [
 138 |     "in",
 139 |     "contrast",
 140 |     "abstractive",
 141 |     "methods",
 142 |     "build",
 143 |     "an",
 144 |     "internal",
 145 |     "semantic",
 146 |     "representation",
 147 |     "and",
 148 |     "then",
 149 |     "use",
 150 |     "natural",
 151 |     "language",
 152 |     "generation",
 153 |     "techniques",
 154 |     "to",
 155 |     "create",
 156 |     "a",
 157 |     "summary",
 158 |     "that",
 159 |     "is",
 160 |     "closer",
 161 |     "to",
 162 |     "what",
 163 |     "a",
 164 |     "human",
 165 |     "might",
 166 |     "generate"
 167 |   ],
 168 |   [
 169 |     "such",
 170 |     "a",
 171 |     "summary",
 172 |     "might",
 173 |     "contain",
 174 |     "words",
 175 |     "not",
 176 |     "explicitly",
 177 |     "present",
 178 |     "in",
 179 |     "the",
 180 |     "original"
 181 |   ],
 182 |   [
 183 |     "research",
 184 |     "into",
 185 |     "abstractive",
 186 |     "methods",
 187 |     "is",
 188 |     "an",
 189 |     "increasingly",
 190 |     "important",
 191 |     "and",
 192 |     "active",
 193 |     "research",
 194 |     "area",
 195 |     "however",
 196 |     "due",
 197 |     "to",
 198 |     "complexity",
 199 |     "constraints",
 200 |     "research",
 201 |     "to",
 202 |     "date",
 203 |     "has",
 204 |     "focused",
 205 |     "primarily",
 206 |     "on",
 207 |     "extractive",
 208 |     "methods"
 209 |   ],
 210 |   [
 211 |     "methods",
 212 |     "of",
 213 |     "automatic",
 214 |     "summarization",
 215 |     "include",
 216 |     "extraction-based",
 217 |     "abstraction-based",
 218 |     "maximum",
 219 |     "entropy-based",
 220 |     "and",
 221 |     "aided",
 222 |     "summarization"
 223 |   ],
 224 |   [
 225 |     "two",
 226 |     "particular",
 227 |     "types",
 228 |     "of",
 229 |     "summarization",
 230 |     "often",
 231 |     "addressed",
 232 |     "in",
 233 |     "the",
 234 |     "literature",
 235 |     "are",
 236 |     "keyphrase",
 237 |     "extraction",
 238 |     "where",
 239 |     "the",
 240 |     "goal",
 241 |     "is",
 242 |     "to",
 243 |     "select",
 244 |     "individual",
 245 |     "words",
 246 |     "or",
 247 |     "phrases",
 248 |     "to",
 249 |     "tag",
 250 |     "a",
 251 |     "document",
 252 |     "and",
 253 |     "document",
 254 |     "summarization",
 255 |     "where",
 256 |     "the",
 257 |     "goal",
 258 |     "is",
 259 |     "to",
 260 |     "select",
 261 |     "whole",
 262 |     "sentences",
 263 |     "to",
 264 |     "create",
 265 |     "a",
 266 |     "short",
 267 |     "paragraph",
 268 |     "summary"
 269 |   ],
 270 |   [
 271 |     "abstraction-based",
 272 |     "summarization",
 273 |     "extraction",
 274 |     "techniques",
 275 |     "merely",
 276 |     "copy",
 277 |     "the",
 278 |     "information",
 279 |     "deemed",
 280 |     "most",
 281 |     "important",
 282 |     "by",
 283 |     "the",
 284 |     "system",
 285 |     "to",
 286 |     "the",
 287 |     "summary",
 288 |     "for",
 289 |     "example",
 290 |     "key",
 291 |     "clauses",
 292 |     "sentences",
 293 |     "or",
 294 |     "paragraphs",
 295 |     "while",
 296 |     "abstraction",
 297 |     "involves",
 298 |     "paraphrasing",
 299 |     "sections",
 300 |     "of",
 301 |     "the",
 302 |     "source",
 303 |     "document"
 304 |   ],
 305 |   [
 306 |     "in",
 307 |     "general",
 308 |     "abstraction",
 309 |     "can",
 310 |     "condense",
 311 |     "a",
 312 |     "text",
 313 |     "more",
 314 |     "strongly",
 315 |     "than",
 316 |     "extraction",
 317 |     "but",
 318 |     "the",
 319 |     "programs",
 320 |     "that",
 321 |     "can",
 322 |     "do",
 323 |     "this",
 324 |     "are",
 325 |     "harder",
 326 |     "to",
 327 |     "develop",
 328 |     "as",
 329 |     "they",
 330 |     "require",
 331 |     "the",
 332 |     "use",
 333 |     "of",
 334 |     "natural",
 335 |     "language",
 336 |     "generation",
 337 |     "technology",
 338 |     "which",
 339 |     "itself",
 340 |     "is",
 341 |     "a",
 342 |     "growing",
 343 |     "field"
 344 |   ],
 345 |   [
 346 |     "while",
 347 |     "some",
 348 |     "work",
 349 |     "has",
 350 |     "been",
 351 |     "done",
 352 |     "in",
 353 |     "abstractive",
 354 |     "summarization",
 355 |     "creating",
 356 |     "an",
 357 |     "abstract",
 358 |     "synopsis",
 359 |     "like",
 360 |     "that",
 361 |     "of",
 362 |     "a",
 363 |     "human",
 364 |     "the",
 365 |     "majority",
 366 |     "of",
 367 |     "summarization",
 368 |     "systems",
 369 |     "are",
 370 |     "extractive",
 371 |     "selecting",
 372 |     "a",
 373 |     "subset",
 374 |     "of",
 375 |     "sentences",
 376 |     "to",
 377 |     "place",
 378 |     "in",
 379 |     "a",
 380 |     "summary"
 381 |   ],
 382 |   [
 383 |     "even",
 384 |     "though",
 385 |     "automating",
 386 |     "abstractive",
 387 |     "summarization",
 388 |     "is",
 389 |     "the",
 390 |     "goal",
 391 |     "of",
 392 |     "summarization",
 393 |     "research",
 394 |     "most",
 395 |     "practical",
 396 |     "systems",
 397 |     "are",
 398 |     "based",
 399 |     "on",
 400 |     "some",
 401 |     "form",
 402 |     "of",
 403 |     "extractive",
 404 |     "summarization"
 405 |   ],
 406 |   [
 407 |     "extracted",
 408 |     "sentences",
 409 |     "can",
 410 |     "form",
 411 |     "a",
 412 |     "valid",
 413 |     "summary",
 414 |     "in",
 415 |     "themselves",
 416 |     "or",
 417 |     "form",
 418 |     "a",
 419 |     "basis",
 420 |     "for",
 421 |     "further",
 422 |     "condensation",
 423 |     "operations"
 424 |   ],
 425 |   [
 426 |     "furthermore",
 427 |     "evaluation",
 428 |     "of",
 429 |     "extracted",
 430 |     "summaries",
 431 |     "can",
 432 |     "be",
 433 |     "automated",
 434 |     "since",
 435 |     "it",
 436 |     "is",
 437 |     "essentially",
 438 |     "a",
 439 |     "classification",
 440 |     "task"
 441 |   ],
 442 |   [
 443 |     "during",
 444 |     "the",
 445 |     "duc",
 446 |     "2001",
 447 |     "and",
 448 |     "2002",
 449 |     "evaluation",
 450 |     "workshops",
 451 |     "tno",
 452 |     "developed",
 453 |     "a",
 454 |     "sentence",
 455 |     "extraction",
 456 |     "system",
 457 |     "for",
 458 |     "multi-document",
 459 |     "summarization",
 460 |     "in",
 461 |     "the",
 462 |     "news",
 463 |     "domain"
 464 |   ],
 465 |   [
 466 |     "the",
 467 |     "system",
 468 |     "was",
 469 |     "based",
 470 |     "on",
 471 |     "a",
 472 |     "hybrid",
 473 |     "system",
 474 |     "using",
 475 |     "a",
 476 |     "naive",
 477 |     "bayes",
 478 |     "classifier",
 479 |     "and",
 480 |     "statistical",
 481 |     "language",
 482 |     "models",
 483 |     "for",
 484 |     "modeling",
 485 |     "salience"
 486 |   ],
 487 |   [
 488 |     "although",
 489 |     "the",
 490 |     "system",
 491 |     "exhibited",
 492 |     "good",
 493 |     "results",
 494 |     "the",
 495 |     "researchers",
 496 |     "wanted",
 497 |     "to",
 498 |     "explore",
 499 |     "the",
 500 |     "effectiveness",
 501 |     "of",
 502 |     "a",
 503 |     "maximum",
 504 |     "entropy",
 505 |     "me",
 506 |     "classifier",
 507 |     "for",
 508 |     "the",
 509 |     "meeting",
 510 |     "summarization",
 511 |     "task",
 512 |     "as",
 513 |     "me",
 514 |     "is",
 515 |     "known",
 516 |     "to",
 517 |     "be",
 518 |     "robust",
 519 |     "against",
 520 |     "feature",
 521 |     "dependencies"
 522 |   ],
 523 |   [
 524 |     "maximum",
 525 |     "entropy",
 526 |     "has",
 527 |     "also",
 528 |     "been",
 529 |     "applied",
 530 |     "successfully",
 531 |     "for",
 532 |     "summarization",
 533 |     "in",
 534 |     "the",
 535 |     "broadcast",
 536 |     "news",
 537 |     "domain"
 538 |   ],
 539 |   [
 540 |     "machine",
 541 |     "learning",
 542 |     "techniques",
 543 |     "from",
 544 |     "closely",
 545 |     "related",
 546 |     "fields",
 547 |     "such",
 548 |     "as",
 549 |     "information",
 550 |     "retrieval",
 551 |     "or",
 552 |     "text",
 553 |     "mining",
 554 |     "have",
 555 |     "been",
 556 |     "successfully",
 557 |     "adapted",
 558 |     "to",
 559 |     "help",
 560 |     "automatic",
 561 |     "summarization"
 562 |   ],
 563 |   [
 564 |     "apart",
 565 |     "from",
 566 |     "fully",
 567 |     "automated",
 568 |     "summarizers",
 569 |     "fas",
 570 |     "there",
 571 |     "are",
 572 |     "systems",
 573 |     "that",
 574 |     "aid",
 575 |     "users",
 576 |     "with",
 577 |     "the",
 578 |     "task",
 579 |     "of",
 580 |     "summarization",
 581 |     "mahs",
 582 |     "=",
 583 |     "machine",
 584 |     "aided",
 585 |     "human",
 586 |     "summarization",
 587 |     "for",
 588 |     "example",
 589 |     "by",
 590 |     "highlighting",
 591 |     "candidate",
 592 |     "passages",
 593 |     "to",
 594 |     "be",
 595 |     "included",
 596 |     "in",
 597 |     "the",
 598 |     "summary",
 599 |     "and",
 600 |     "there",
 601 |     "are",
 602 |     "systems",
 603 |     "that",
 604 |     "depend",
 605 |     "on",
 606 |     "post-processing",
 607 |     "by",
 608 |     "a",
 609 |     "human",
 610 |     "hams",
 611 |     "=",
 612 |     "human",
 613 |     "aided",
 614 |     "machine",
 615 |     "summarization"
 616 |   ],
 617 |   [
 618 |     "there",
 619 |     "are",
 620 |     "different",
 621 |     "types",
 622 |     "of",
 623 |     "summaries",
 624 |     "depending",
 625 |     "what",
 626 |     "the",
 627 |     "summarization",
 628 |     "program",
 629 |     "focuses",
 630 |     "on",
 631 |     "to",
 632 |     "make",
 633 |     "the",
 634 |     "summary",
 635 |     "of",
 636 |     "the",
 637 |     "text",
 638 |     "for",
 639 |     "example",
 640 |     "generic",
 641 |     "summaries",
 642 |     "or",
 643 |     "query",
 644 |     "relevant",
 645 |     "summaries",
 646 |     "sometimes",
 647 |     "called",
 648 |     "query-based",
 649 |     "summaries"
 650 |   ],
 651 |   [
 652 |     "summarization",
 653 |     "systems",
 654 |     "are",
 655 |     "able",
 656 |     "to",
 657 |     "create",
 658 |     "both",
 659 |     "query",
 660 |     "relevant",
 661 |     "text",
 662 |     "summaries",
 663 |     "and",
 664 |     "generic",
 665 |     "machine-generated",
 666 |     "summaries",
 667 |     "depending",
 668 |     "on",
 669 |     "what",
 670 |     "the",
 671 |     "user",
 672 |     "needs"
 673 |   ],
 674 |   [
 675 |     "summarization",
 676 |     "of",
 677 |     "multimedia",
 678 |     "documents",
 679 |     "e"
 680 |   ],
 681 |   [
 682 |     "pictures",
 683 |     "or",
 684 |     "movies",
 685 |     "is",
 686 |     "also",
 687 |     "possible"
 688 |   ],
 689 |   [
 690 |     "some",
 691 |     "systems",
 692 |     "will",
 693 |     "generate",
 694 |     "a",
 695 |     "summary",
 696 |     "based",
 697 |     "on",
 698 |     "a",
 699 |     "single",
 700 |     "source",
 701 |     "document",
 702 |     "while",
 703 |     "others",
 704 |     "can",
 705 |     "use",
 706 |     "multiple",
 707 |     "source",
 708 |     "documents",
 709 |     "for",
 710 |     "example",
 711 |     "a",
 712 |     "cluster",
 713 |     "of",
 714 |     "news",
 715 |     "stories",
 716 |     "on",
 717 |     "the",
 718 |     "same",
 719 |     "topic"
 720 |   ],
 721 |   [
 722 |     "these",
 723 |     "systems",
 724 |     "are",
 725 |     "known",
 726 |     "as",
 727 |     "multi-document",
 728 |     "summarization",
 729 |     "systems"
 730 |   ],
 731 |   [
 732 |     "image",
 733 |     "collection",
 734 |     "summarization",
 735 |     "is",
 736 |     "other",
 737 |     "application",
 738 |     "example",
 739 |     "of",
 740 |     "automatic",
 741 |     "summarization"
 742 |   ],
 743 |   [
 744 |     "it",
 745 |     "consists",
 746 |     "in",
 747 |     "selecting",
 748 |     "a",
 749 |     "representative",
 750 |     "set",
 751 |     "of",
 752 |     "images",
 753 |     "from",
 754 |     "a",
 755 |     "larger",
 756 |     "set",
 757 |     "of",
 758 |     "images"
 759 |   ],
 760 |   [
 761 |     "a",
 762 |     "summary",
 763 |     "in",
 764 |     "this",
 765 |     "context",
 766 |     "is",
 767 |     "useful",
 768 |     "to",
 769 |     "show",
 770 |     "the",
 771 |     "most",
 772 |     "representative",
 773 |     "images",
 774 |     "of",
 775 |     "results",
 776 |     "in",
 777 |     "an",
 778 |     "image",
 779 |     "collection",
 780 |     "exploration",
 781 |     "system"
 782 |   ],
 783 |   [
 784 |     "keyphrase",
 785 |     "extraction",
 786 |     ":",
 787 |     "task",
 788 |     "description",
 789 |     "and",
 790 |     "example"
 791 |   ],
 792 |   [
 793 |     "the",
 794 |     "task",
 795 |     "is",
 796 |     "the",
 797 |     "following"
 798 |   ],
 799 |   [
 800 |     "you",
 801 |     "are",
 802 |     "given",
 803 |     "a",
 804 |     "piece",
 805 |     "of",
 806 |     "text",
 807 |     "such",
 808 |     "as",
 809 |     "a",
 810 |     "journal",
 811 |     "article",
 812 |     "and",
 813 |     "you",
 814 |     "must",
 815 |     "produce",
 816 |     "a",
 817 |     "list",
 818 |     "of",
 819 |     "keywords",
 820 |     "or",
 821 |     "keyphrases",
 822 |     "that",
 823 |     "capture",
 824 |     "the",
 825 |     "primary",
 826 |     "topics",
 827 |     "discussed",
 828 |     "in",
 829 |     "the",
 830 |     "text"
 831 |   ],
 832 |   [
 833 |     "in",
 834 |     "the",
 835 |     "case",
 836 |     "of",
 837 |     "research",
 838 |     "articles",
 839 |     "many",
 840 |     "authors",
 841 |     "provide",
 842 |     "manually",
 843 |     "assigned",
 844 |     "keywords",
 845 |     "but",
 846 |     "most",
 847 |     "text",
 848 |     "lacks",
 849 |     "pre-existing",
 850 |     "keyphrases"
 851 |   ],
 852 |   [
 853 |     "for",
 854 |     "example",
 855 |     "news",
 856 |     "articles",
 857 |     "rarely",
 858 |     "have",
 859 |     "keyphrases",
 860 |     "attached",
 861 |     "but",
 862 |     "it",
 863 |     "would",
 864 |     "be",
 865 |     "useful",
 866 |     "to",
 867 |     "be",
 868 |     "able",
 869 |     "to",
 870 |     "automatically",
 871 |     "do",
 872 |     "so",
 873 |     "for",
 874 |     "a",
 875 |     "number",
 876 |     "of",
 877 |     "applications",
 878 |     "discussed",
 879 |     "below"
 880 |   ],
 881 |   [
 882 |     "consider",
 883 |     "the",
 884 |     "example",
 885 |     "text",
 886 |     "from",
 887 |     "a",
 888 |     "recent",
 889 |     "news",
 890 |     "article",
 891 |     ":",
 892 |     "the",
 893 |     "army",
 894 |     "corps",
 895 |     "of",
 896 |     "engineers",
 897 |     "rushing",
 898 |     "to",
 899 |     "meet",
 900 |     "president",
 901 |     "bush's",
 902 |     "promise",
 903 |     "to",
 904 |     "protect",
 905 |     "new",
 906 |     "orleans",
 907 |     "by",
 908 |     "the",
 909 |     "start",
 910 |     "of",
 911 |     "the",
 912 |     "2006",
 913 |     "hurricane",
 914 |     "season",
 915 |     "installed",
 916 |     "defective",
 917 |     "flood-control",
 918 |     "pumps",
 919 |     "last",
 920 |     "year",
 921 |     "despite",
 922 |     "warnings",
 923 |     "from",
 924 |     "its",
 925 |     "own",
 926 |     "expert",
 927 |     "that",
 928 |     "the",
 929 |     "equipment",
 930 |     "would",
 931 |     "fail",
 932 |     "during",
 933 |     "a",
 934 |     "storm",
 935 |     "according",
 936 |     "to",
 937 |     "documents",
 938 |     "obtained",
 939 |     "by",
 940 |     "the",
 941 |     "associated",
 942 |     "press"
 943 |   ],
 944 |   [
 945 |     "an",
 946 |     "extractive",
 947 |     "keyphrase",
 948 |     "extractor",
 949 |     "might",
 950 |     "select",
 951 |     "army",
 952 |     "corps",
 953 |     "of",
 954 |     "engineers",
 955 |     "president",
 956 |     "bush",
 957 |     "new",
 958 |     "orleans",
 959 |     "and",
 960 |     "defective",
 961 |     "flood-control",
 962 |     "pumps",
 963 |     "as",
 964 |     "keyphrases"
 965 |   ],
 966 |   [
 967 |     "these",
 968 |     "are",
 969 |     "pulled",
 970 |     "directly",
 971 |     "from",
 972 |     "the",
 973 |     "text"
 974 |   ],
 975 |   [
 976 |     "in",
 977 |     "contrast",
 978 |     "an",
 979 |     "abstractive",
 980 |     "keyphrase",
 981 |     "system",
 982 |     "would",
 983 |     "somehow",
 984 |     "internalize",
 985 |     "the",
 986 |     "content",
 987 |     "and",
 988 |     "generate",
 989 |     "keyphrases",
 990 |     "that",
 991 |     "might",
 992 |     "be",
 993 |     "more",
 994 |     "descriptive",
 995 |     "and",
 996 |     "more",
 997 |     "like",
 998 |     "what",
 999 |     "a",
1000 |     "human",
1001 |     "would",
1002 |     "produce",
1003 |     "such",
1004 |     "as",
1005 |     "political",
1006 |     "negligence",
1007 |     "or",
1008 |     "inadequate",
1009 |     "protection",
1010 |     "from",
1011 |     "floods"
1012 |   ],
1013 |   [
1014 |     "note",
1015 |     "that",
1016 |     "these",
1017 |     "terms",
1018 |     "do",
1019 |     "not",
1020 |     "appear",
1021 |     "in",
1022 |     "the",
1023 |     "text",
1024 |     "and",
1025 |     "require",
1026 |     "a",
1027 |     "deep",
1028 |     "understanding",
1029 |     "which",
1030 |     "makes",
1031 |     "it",
1032 |     "difficult",
1033 |     "for",
1034 |     "a",
1035 |     "computer",
1036 |     "to",
1037 |     "produce",
1038 |     "such",
1039 |     "keyphrases"
1040 |   ],
1041 |   [
1042 |     "keyphrases",
1043 |     "have",
1044 |     "many",
1045 |     "applications",
1046 |     "such",
1047 |     "as",
1048 |     "to",
1049 |     "improve",
1050 |     "document",
1051 |     "browsing",
1052 |     "by",
1053 |     "providing",
1054 |     "a",
1055 |     "short",
1056 |     "summary"
1057 |   ],
1058 |   [
1059 |     "also",
1060 |     "keyphrases",
1061 |     "can",
1062 |     "improve",
1063 |     "information",
1064 |     "retrieval",
1065 |     "if",
1066 |     "documents",
1067 |     "have",
1068 |     "keyphrases",
1069 |     "assigned",
1070 |     "a",
1071 |     "user",
1072 |     "could",
1073 |     "search",
1074 |     "by",
1075 |     "keyphrase",
1076 |     "to",
1077 |     "produce",
1078 |     "more",
1079 |     "reliable",
1080 |     "hits",
1081 |     "than",
1082 |     "a",
1083 |     "full-text",
1084 |     "search"
1085 |   ],
1086 |   [
1087 |     "also",
1088 |     "automatic",
1089 |     "keyphrase",
1090 |     "extraction",
1091 |     "can",
1092 |     "be",
1093 |     "useful",
1094 |     "in",
1095 |     "generating",
1096 |     "index",
1097 |     "entries",
1098 |     "for",
1099 |     "a",
1100 |     "large",
1101 |     "text",
1102 |     "corpus"
1103 |   ],
1104 |   [
1105 |     "beginning",
1106 |     "with",
1107 |     "the",
1108 |     "turney",
1109 |     "paper",
1110 |     "many",
1111 |     "researchers",
1112 |     "have",
1113 |     "approached",
1114 |     "keyphrase",
1115 |     "extraction",
1116 |     "as",
1117 |     "a",
1118 |     "supervised",
1119 |     "machine",
1120 |     "learning",
1121 |     "problem"
1122 |   ],
1123 |   [
1124 |     "given",
1125 |     "a",
1126 |     "document",
1127 |     "we",
1128 |     "construct",
1129 |     "an",
1130 |     "example",
1131 |     "for",
1132 |     "each",
1133 |     "unigram",
1134 |     "bigram",
1135 |     "and",
1136 |     "trigram",
1137 |     "found",
1138 |     "in",
1139 |     "the",
1140 |     "text",
1141 |     "though",
1142 |     "other",
1143 |     "text",
1144 |     "units",
1145 |     "are",
1146 |     "also",
1147 |     "possible",
1148 |     "as",
1149 |     "discussed",
1150 |     "below"
1151 |   ],
1152 |   [
1153 |     "we",
1154 |     "then",
1155 |     "compute",
1156 |     "various",
1157 |     "features",
1158 |     "describing",
1159 |     "each",
1160 |     "example",
1161 |     "e"
1162 |   ],
1163 |   [
1164 |     "does",
1165 |     "the",
1166 |     "phrase",
1167 |     "begin",
1168 |     "with",
1169 |     "an",
1170 |     "upper-case",
1171 |     "letter"
1172 |   ],
1173 |   [
1174 |     "we",
1175 |     "assume",
1176 |     "there",
1177 |     "are",
1178 |     "known",
1179 |     "keyphrases",
1180 |     "available",
1181 |     "for",
1182 |     "a",
1183 |     "set",
1184 |     "of",
1185 |     "training",
1186 |     "documents"
1187 |   ],
1188 |   [
1189 |     "using",
1190 |     "the",
1191 |     "known",
1192 |     "keyphrases",
1193 |     "we",
1194 |     "can",
1195 |     "assign",
1196 |     "positive",
1197 |     "or",
1198 |     "negative",
1199 |     "labels",
1200 |     "to",
1201 |     "the",
1202 |     "examples"
1203 |   ],
1204 |   [
1205 |     "then",
1206 |     "we",
1207 |     "learn",
1208 |     "a",
1209 |     "classifier",
1210 |     "that",
1211 |     "can",
1212 |     "discriminate",
1213 |     "between",
1214 |     "positive",
1215 |     "and",
1216 |     "negative",
1217 |     "examples",
1218 |     "as",
1219 |     "a",
1220 |     "function",
1221 |     "of",
1222 |     "the",
1223 |     "features"
1224 |   ],
1225 |   [
1226 |     "some",
1227 |     "classifiers",
1228 |     "make",
1229 |     "a",
1230 |     "binary",
1231 |     "classification",
1232 |     "for",
1233 |     "a",
1234 |     "test",
1235 |     "example",
1236 |     "while",
1237 |     "others",
1238 |     "assign",
1239 |     "a",
1240 |     "probability",
1241 |     "of",
1242 |     "being",
1243 |     "a",
1244 |     "keyphrase"
1245 |   ],
1246 |   [
1247 |     "for",
1248 |     "instance",
1249 |     "in",
1250 |     "the",
1251 |     "above",
1252 |     "text",
1253 |     "we",
1254 |     "might",
1255 |     "learn",
1256 |     "a",
1257 |     "rule",
1258 |     "that",
1259 |     "says",
1260 |     "phrases",
1261 |     "with",
1262 |     "initial",
1263 |     "capital",
1264 |     "letters",
1265 |     "are",
1266 |     "likely",
1267 |     "to",
1268 |     "be",
1269 |     "keyphrases"
1270 |   ],
1271 |   [
1272 |     "after",
1273 |     "training",
1274 |     "a",
1275 |     "learner",
1276 |     "we",
1277 |     "can",
1278 |     "select",
1279 |     "keyphrases",
1280 |     "for",
1281 |     "test",
1282 |     "documents",
1283 |     "in",
1284 |     "the",
1285 |     "following",
1286 |     "manner"
1287 |   ],
1288 |   [
1289 |     "we",
1290 |     "apply",
1291 |     "the",
1292 |     "same",
1293 |     "example-generation",
1294 |     "strategy",
1295 |     "to",
1296 |     "the",
1297 |     "test",
1298 |     "documents",
1299 |     "then",
1300 |     "run",
1301 |     "each",
1302 |     "example",
1303 |     "through",
1304 |     "the",
1305 |     "learner"
1306 |   ],
1307 |   [
1308 |     "we",
1309 |     "can",
1310 |     "determine",
1311 |     "the",
1312 |     "keyphrases",
1313 |     "by",
1314 |     "looking",
1315 |     "at",
1316 |     "binary",
1317 |     "classification",
1318 |     "decisions",
1319 |     "or",
1320 |     "probabilities",
1321 |     "returned",
1322 |     "from",
1323 |     "our",
1324 |     "learned",
1325 |     "model"
1326 |   ],
1327 |   [
1328 |     "if",
1329 |     "probabilities",
1330 |     "are",
1331 |     "given",
1332 |     "a",
1333 |     "threshold",
1334 |     "is",
1335 |     "used",
1336 |     "to",
1337 |     "select",
1338 |     "the",
1339 |     "keyphrases"
1340 |   ],
1341 |   [
1342 |     "keyphrase",
1343 |     "extractors",
1344 |     "are",
1345 |     "generally",
1346 |     "evaluated",
1347 |     "using",
1348 |     "precision",
1349 |     "and",
1350 |     "recall"
1351 |   ],
1352 |   [
1353 |     "precision",
1354 |     "measures",
1355 |     "how",
1356 |     "many",
1357 |     "of",
1358 |     "the",
1359 |     "proposed",
1360 |     "keyphrases",
1361 |     "are",
1362 |     "actually",
1363 |     "correct"
1364 |   ],
1365 |   [
1366 |     "recall",
1367 |     "measures",
1368 |     "how",
1369 |     "many",
1370 |     "of",
1371 |     "the",
1372 |     "true",
1373 |     "keyphrases",
1374 |     "your",
1375 |     "system",
1376 |     "proposed"
1377 |   ],
1378 |   [
1379 |     "the",
1380 |     "two",
1381 |     "measures",
1382 |     "can",
1383 |     "be",
1384 |     "combined",
1385 |     "in",
1386 |     "an",
1387 |     "f-score",
1388 |     "which",
1389 |     "is",
1390 |     "the",
1391 |     "harmonic",
1392 |     "mean",
1393 |     "of",
1394 |     "the",
1395 |     "two"
1396 |   ],
1397 |   [
1398 |     "matches",
1399 |     "between",
1400 |     "the",
1401 |     "proposed",
1402 |     "keyphrases",
1403 |     "and",
1404 |     "the",
1405 |     "known",
1406 |     "keyphrases",
1407 |     "can",
1408 |     "be",
1409 |     "checked",
1410 |     "after",
1411 |     "stemming",
1412 |     "or",
1413 |     "applying",
1414 |     "some",
1415 |     "other",
1416 |     "text",
1417 |     "normalization"
1418 |   ],
1419 |   [
1420 |     "designing",
1421 |     "a",
1422 |     "supervised",
1423 |     "keyphrase",
1424 |     "extraction",
1425 |     "system",
1426 |     "involves",
1427 |     "deciding",
1428 |     "on",
1429 |     "several",
1430 |     "choices",
1431 |     "some",
1432 |     "of",
1433 |     "these",
1434 |     "apply",
1435 |     "to",
1436 |     "unsupervised",
1437 |     "too",
1438 |     ":",
1439 |     "what",
1440 |     "are",
1441 |     "the",
1442 |     "examples"
1443 |   ],
1444 |   [
1445 |     "the",
1446 |     "first",
1447 |     "choice",
1448 |     "is",
1449 |     "exactly",
1450 |     "how",
1451 |     "to",
1452 |     "generate",
1453 |     "examples"
1454 |   ],
1455 |   [
1456 |     "turney",
1457 |     "and",
1458 |     "others",
1459 |     "have",
1460 |     "used",
1461 |     "all",
1462 |     "possible",
1463 |     "unigrams",
1464 |     "bigrams",
1465 |     "and",
1466 |     "trigrams",
1467 |     "without",
1468 |     "intervening",
1469 |     "punctuation",
1470 |     "and",
1471 |     "after",
1472 |     "removing",
1473 |     "stopwords"
1474 |   ],
1475 |   [
1476 |     "hulth",
1477 |     "showed",
1478 |     "that",
1479 |     "you",
1480 |     "can",
1481 |     "get",
1482 |     "some",
1483 |     "improvement",
1484 |     "by",
1485 |     "selecting",
1486 |     "examples",
1487 |     "to",
1488 |     "be",
1489 |     "sequences",
1490 |     "of",
1491 |     "tokens",
1492 |     "that",
1493 |     "match",
1494 |     "certain",
1495 |     "patterns",
1496 |     "of",
1497 |     "part-of-speech",
1498 |     "tags"
1499 |   ],
1500 |   [
1501 |     "ideally",
1502 |     "the",
1503 |     "mechanism",
1504 |     "for",
1505 |     "generating",
1506 |     "examples",
1507 |     "produces",
1508 |     "all",
1509 |     "the",
1510 |     "known",
1511 |     "labeled",
1512 |     "keyphrases",
1513 |     "as",
1514 |     "candidates",
1515 |     "though",
1516 |     "this",
1517 |     "is",
1518 |     "often",
1519 |     "not",
1520 |     "the",
1521 |     "case"
1522 |   ],
1523 |   [
1524 |     "for",
1525 |     "example",
1526 |     "if",
1527 |     "we",
1528 |     "use",
1529 |     "only",
1530 |     "unigrams",
1531 |     "bigrams",
1532 |     "and",
1533 |     "trigrams",
1534 |     "then",
1535 |     "we",
1536 |     "will",
1537 |     "never",
1538 |     "be",
1539 |     "able",
1540 |     "to",
1541 |     "extract",
1542 |     "a",
1543 |     "known",
1544 |     "keyphrase",
1545 |     "containing",
1546 |     "four",
1547 |     "words"
1548 |   ],
1549 |   [
1550 |     "thus",
1551 |     "recall",
1552 |     "may",
1553 |     "suffer"
1554 |   ],
1555 |   [
1556 |     "however",
1557 |     "generating",
1558 |     "too",
1559 |     "many",
1560 |     "examples",
1561 |     "can",
1562 |     "also",
1563 |     "lead",
1564 |     "to",
1565 |     "low",
1566 |     "precision"
1567 |   ],
1568 |   [
1569 |     "what",
1570 |     "are",
1571 |     "the",
1572 |     "features"
1573 |   ],
1574 |   [
1575 |     "we",
1576 |     "also",
1577 |     "need",
1578 |     "to",
1579 |     "create",
1580 |     "features",
1581 |     "that",
1582 |     "describe",
1583 |     "the",
1584 |     "examples",
1585 |     "and",
1586 |     "are",
1587 |     "informative",
1588 |     "enough",
1589 |     "to",
1590 |     "allow",
1591 |     "a",
1592 |     "learning",
1593 |     "algorithm",
1594 |     "to",
1595 |     "discriminate",
1596 |     "keyphrases",
1597 |     "from",
1598 |     "non-",
1599 |     "keyphrases"
1600 |   ],
1601 |   [
1602 |     "typically",
1603 |     "features",
1604 |     "involve",
1605 |     "various",
1606 |     "term",
1607 |     "frequencies",
1608 |     "how",
1609 |     "many",
1610 |     "times",
1611 |     "a",
1612 |     "phrase",
1613 |     "appears",
1614 |     "in",
1615 |     "the",
1616 |     "current",
1617 |     "text",
1618 |     "or",
1619 |     "in",
1620 |     "a",
1621 |     "larger",
1622 |     "corpus",
1623 |     "the",
1624 |     "length",
1625 |     "of",
1626 |     "the",
1627 |     "example",
1628 |     "relative",
1629 |     "position",
1630 |     "of",
1631 |     "the",
1632 |     "first",
1633 |     "occurrence",
1634 |     "various",
1635 |     "boolean",
1636 |     "syntactic",
1637 |     "features",
1638 |     "e"
1639 |   ],
1640 |   [
1641 |     "contains",
1642 |     "all",
1643 |     "caps",
1644 |     "etc"
1645 |   ],
1646 |   [
1647 |     "the",
1648 |     "turney",
1649 |     "paper",
1650 |     "used",
1651 |     "about",
1652 |     "12",
1653 |     "such",
1654 |     "features"
1655 |   ],
1656 |   [
1657 |     "hulth",
1658 |     "uses",
1659 |     "a",
1660 |     "reduced",
1661 |     "set",
1662 |     "of",
1663 |     "features",
1664 |     "which",
1665 |     "were",
1666 |     "found",
1667 |     "most",
1668 |     "successful",
1669 |     "in",
1670 |     "the",
1671 |     "kea",
1672 |     "keyphrase",
1673 |     "extraction",
1674 |     "algorithm",
1675 |     "work",
1676 |     "derived",
1677 |     "from",
1678 |     "turney",
1679 |     "s",
1680 |     "seminal",
1681 |     "paper"
1682 |   ],
1683 |   [
1684 |     "how",
1685 |     "many",
1686 |     "keyphrases",
1687 |     "to",
1688 |     "return"
1689 |   ],
1690 |   [
1691 |     "in",
1692 |     "the",
1693 |     "end",
1694 |     "the",
1695 |     "system",
1696 |     "will",
1697 |     "need",
1698 |     "to",
1699 |     "return",
1700 |     "a",
1701 |     "list",
1702 |     "of",
1703 |     "keyphrases",
1704 |     "for",
1705 |     "a",
1706 |     "test",
1707 |     "document",
1708 |     "so",
1709 |     "we",
1710 |     "need",
1711 |     "to",
1712 |     "have",
1713 |     "a",
1714 |     "way",
1715 |     "to",
1716 |     "limit",
1717 |     "the",
1718 |     "number"
1719 |   ],
1720 |   [
1721 |     "ensemble",
1722 |     "methods",
1723 |     "i"
1724 |   ],
1725 |   [
1726 |     "using",
1727 |     "votes",
1728 |     "from",
1729 |     "several",
1730 |     "classifiers",
1731 |     "have",
1732 |     "been",
1733 |     "used",
1734 |     "to",
1735 |     "produce",
1736 |     "numeric",
1737 |     "scores",
1738 |     "that",
1739 |     "can",
1740 |     "be",
1741 |     "thresholded",
1742 |     "to",
1743 |     "provide",
1744 |     "a",
1745 |     "user-provided",
1746 |     "number",
1747 |     "of",
1748 |     "keyphrases"
1749 |   ],
1750 |   [
1751 |     "this",
1752 |     "is",
1753 |     "the",
1754 |     "technique",
1755 |     "used",
1756 |     "by",
1757 |     "turney",
1758 |     "with",
1759 |     "c4.5",
1760 |     "decision",
1761 |     "trees"
1762 |   ],
1763 |   [
1764 |     "hulth",
1765 |     "used",
1766 |     "a",
1767 |     "single",
1768 |     "binary",
1769 |     "classifier",
1770 |     "so",
1771 |     "the",
1772 |     "learning",
1773 |     "algorithm",
1774 |     "implicitly",
1775 |     "determines",
1776 |     "the",
1777 |     "appropriate",
1778 |     "number"
1779 |   ],
1780 |   [
1781 |     "what",
1782 |     "learning",
1783 |     "algorithm"
1784 |   ],
1785 |   [
1786 |     "once",
1787 |     "examples",
1788 |     "and",
1789 |     "features",
1790 |     "are",
1791 |     "created",
1792 |     "we",
1793 |     "need",
1794 |     "a",
1795 |     "way",
1796 |     "to",
1797 |     "learn",
1798 |     "to",
1799 |     "predict",
1800 |     "keyphrases"
1801 |   ],
1802 |   [
1803 |     "virtually",
1804 |     "any",
1805 |     "supervised",
1806 |     "learning",
1807 |     "algorithm",
1808 |     "could",
1809 |     "be",
1810 |     "used",
1811 |     "such",
1812 |     "as",
1813 |     "decision",
1814 |     "trees",
1815 |     "naive",
1816 |     "bayes",
1817 |     "and",
1818 |     "rule",
1819 |     "induction"
1820 |   ],
1821 |   [
1822 |     "in",
1823 |     "the",
1824 |     "case",
1825 |     "of",
1826 |     "turney's",
1827 |     "genex",
1828 |     "algorithm",
1829 |     "a",
1830 |     "genetic",
1831 |     "algorithm",
1832 |     "is",
1833 |     "used",
1834 |     "to",
1835 |     "learn",
1836 |     "parameters",
1837 |     "for",
1838 |     "a",
1839 |     "domain-specific",
1840 |     "keyphrase",
1841 |     "extraction",
1842 |     "algorithm"
1843 |   ],
1844 |   [
1845 |     "the",
1846 |     "extractor",
1847 |     "follows",
1848 |     "a",
1849 |     "series",
1850 |     "of",
1851 |     "heuristics",
1852 |     "to",
1853 |     "identify",
1854 |     "keyphrases"
1855 |   ],
1856 |   [
1857 |     "the",
1858 |     "genetic",
1859 |     "algorithm",
1860 |     "optimizes",
1861 |     "parameters",
1862 |     "for",
1863 |     "these",
1864 |     "heuristics",
1865 |     "with",
1866 |     "respect",
1867 |     "to",
1868 |     "performance",
1869 |     "on",
1870 |     "training",
1871 |     "documents",
1872 |     "with",
1873 |     "known",
1874 |     "key",
1875 |     "phrases"
1876 |   ],
1877 |   [
1878 |     "unsupervised",
1879 |     "keyphrase",
1880 |     "extraction",
1881 |     ":",
1882 |     "textrank"
1883 |   ],
1884 |   [
1885 |     "while",
1886 |     "supervised",
1887 |     "methods",
1888 |     "have",
1889 |     "some",
1890 |     "nice",
1891 |     "properties",
1892 |     "like",
1893 |     "being",
1894 |     "able",
1895 |     "to",
1896 |     "produce",
1897 |     "interpretable",
1898 |     "rules",
1899 |     "for",
1900 |     "what",
1901 |     "features",
1902 |     "characterize",
1903 |     "a",
1904 |     "keyphrase",
1905 |     "they",
1906 |     "also",
1907 |     "require",
1908 |     "a",
1909 |     "large",
1910 |     "amount",
1911 |     "of",
1912 |     "training",
1913 |     "data"
1914 |   ],
1915 |   [
1916 |     "many",
1917 |     "documents",
1918 |     "with",
1919 |     "known",
1920 |     "keyphrases",
1921 |     "are",
1922 |     "needed"
1923 |   ],
1924 |   [
1925 |     "furthermore",
1926 |     "training",
1927 |     "on",
1928 |     "a",
1929 |     "specific",
1930 |     "domain",
1931 |     "tends",
1932 |     "to",
1933 |     "customize",
1934 |     "the",
1935 |     "extraction",
1936 |     "process",
1937 |     "to",
1938 |     "that",
1939 |     "domain",
1940 |     "so",
1941 |     "the",
1942 |     "resulting",
1943 |     "classifier",
1944 |     "is",
1945 |     "not",
1946 |     "necessarily",
1947 |     "portable",
1948 |     "as",
1949 |     "some",
1950 |     "of",
1951 |     "turney's",
1952 |     "results",
1953 |     "demonstrate"
1954 |   ],
1955 |   [
1956 |     "unsupervised",
1957 |     "keyphrase",
1958 |     "extraction",
1959 |     "removes",
1960 |     "the",
1961 |     "need",
1962 |     "for",
1963 |     "training",
1964 |     "data"
1965 |   ],
1966 |   [
1967 |     "it",
1968 |     "approaches",
1969 |     "the",
1970 |     "problem",
1971 |     "from",
1972 |     "a",
1973 |     "different",
1974 |     "angle"
1975 |   ],
1976 |   [
1977 |     "instead",
1978 |     "of",
1979 |     "trying",
1980 |     "to",
1981 |     "learn",
1982 |     "explicit",
1983 |     "features",
1984 |     "that",
1985 |     "characterize",
1986 |     "keyphrases",
1987 |     "the",
1988 |     "textrank",
1989 |     "algorithm",
1990 |     "exploits",
1991 |     "the",
1992 |     "structure",
1993 |     "of",
1994 |     "the",
1995 |     "text",
1996 |     "itself",
1997 |     "to",
1998 |     "determine",
1999 |     "keyphrases",
2000 |     "that",
2001 |     "appear",
2002 |     "central",
2003 |     "to",
2004 |     "the",
2005 |     "text",
2006 |     "in",
2007 |     "the",
2008 |     "same",
2009 |     "way",
2010 |     "that",
2011 |     "pagerank",
2012 |     "selects",
2013 |     "important",
2014 |     "web",
2015 |     "pages"
2016 |   ],
2017 |   [
2018 |     "recall",
2019 |     "this",
2020 |     "is",
2021 |     "based",
2022 |     "on",
2023 |     "the",
2024 |     "notion",
2025 |     "of",
2026 |     "prestige",
2027 |     "or",
2028 |     "recommendation",
2029 |     "from",
2030 |     "social",
2031 |     "networks"
2032 |   ],
2033 |   [
2034 |     "in",
2035 |     "this",
2036 |     "way",
2037 |     "textrank",
2038 |     "does",
2039 |     "not",
2040 |     "rely",
2041 |     "on",
2042 |     "any",
2043 |     "previous",
2044 |     "training",
2045 |     "data",
2046 |     "at",
2047 |     "all",
2048 |     "but",
2049 |     "rather",
2050 |     "can",
2051 |     "be",
2052 |     "run",
2053 |     "on",
2054 |     "any",
2055 |     "arbitrary",
2056 |     "piece",
2057 |     "of",
2058 |     "text",
2059 |     "and",
2060 |     "it",
2061 |     "can",
2062 |     "produce",
2063 |     "output",
2064 |     "simply",
2065 |     "based",
2066 |     "on",
2067 |     "the",
2068 |     "text's",
2069 |     "intrinsic",
2070 |     "properties"
2071 |   ],
2072 |   [
2073 |     "thus",
2074 |     "the",
2075 |     "algorithm",
2076 |     "is",
2077 |     "easily",
2078 |     "portable",
2079 |     "to",
2080 |     "new",
2081 |     "domains",
2082 |     "and",
2083 |     "languages"
2084 |   ],
2085 |   [
2086 |     "textrank",
2087 |     "is",
2088 |     "a",
2089 |     "general",
2090 |     "purpose",
2091 |     "graph-based",
2092 |     "ranking",
2093 |     "algorithm",
2094 |     "for",
2095 |     "nlp"
2096 |   ],
2097 |   [
2098 |     "essentially",
2099 |     "it",
2100 |     "runs",
2101 |     "pagerank",
2102 |     "on",
2103 |     "a",
2104 |     "graph",
2105 |     "specially",
2106 |     "designed",
2107 |     "for",
2108 |     "a",
2109 |     "particular",
2110 |     "nlp",
2111 |     "task"
2112 |   ],
2113 |   [
2114 |     "for",
2115 |     "keyphrase",
2116 |     "extraction",
2117 |     "it",
2118 |     "builds",
2119 |     "a",
2120 |     "graph",
2121 |     "using",
2122 |     "some",
2123 |     "set",
2124 |     "of",
2125 |     "text",
2126 |     "units",
2127 |     "as",
2128 |     "vertices"
2129 |   ],
2130 |   [
2131 |     "edges",
2132 |     "are",
2133 |     "based",
2134 |     "on",
2135 |     "some",
2136 |     "measure",
2137 |     "of",
2138 |     "semantic",
2139 |     "or",
2140 |     "lexical",
2141 |     "similarity",
2142 |     "between",
2143 |     "the",
2144 |     "text",
2145 |     "unit",
2146 |     "vertices"
2147 |   ],
2148 |   [
2149 |     "unlike",
2150 |     "pagerank",
2151 |     "the",
2152 |     "edges",
2153 |     "are",
2154 |     "typically",
2155 |     "undirected",
2156 |     "and",
2157 |     "can",
2158 |     "be",
2159 |     "weighted",
2160 |     "to",
2161 |     "reflect",
2162 |     "a",
2163 |     "degree",
2164 |     "of",
2165 |     "similarity"
2166 |   ],
2167 |   [
2168 |     "once",
2169 |     "the",
2170 |     "graph",
2171 |     "is",
2172 |     "constructed",
2173 |     "it",
2174 |     "is",
2175 |     "used",
2176 |     "to",
2177 |     "form",
2178 |     "a",
2179 |     "stochastic",
2180 |     "matrix",
2181 |     "combined",
2182 |     "with",
2183 |     "a",
2184 |     "damping",
2185 |     "factor",
2186 |     "as",
2187 |     "in",
2188 |     "the",
2189 |     "random",
2190 |     "surfer",
2191 |     "model",
2192 |     "and",
2193 |     "the",
2194 |     "ranking",
2195 |     "over",
2196 |     "vertices",
2197 |     "is",
2198 |     "obtained",
2199 |     "by",
2200 |     "finding",
2201 |     "the",
2202 |     "eigenvector",
2203 |     "corresponding",
2204 |     "to",
2205 |     "eigenvalue",
2206 |     "1",
2207 |     "i"
2208 |   ],
2209 |   [
2210 |     "the",
2211 |     "stationary",
2212 |     "distribution",
2213 |     "of",
2214 |     "the",
2215 |     "random",
2216 |     "walk",
2217 |     "on",
2218 |     "the",
2219 |     "graph"
2220 |   ],
2221 |   [
2222 |     "what",
2223 |     "should",
2224 |     "vertices",
2225 |     "be"
2226 |   ],
2227 |   [
2228 |     "the",
2229 |     "vertices",
2230 |     "should",
2231 |     "correspond",
2232 |     "to",
2233 |     "what",
2234 |     "we",
2235 |     "want",
2236 |     "to",
2237 |     "rank"
2238 |   ],
2239 |   [
2240 |     "potentially",
2241 |     "we",
2242 |     "could",
2243 |     "do",
2244 |     "something",
2245 |     "similar",
2246 |     "to",
2247 |     "the",
2248 |     "supervised",
2249 |     "methods",
2250 |     "and",
2251 |     "create",
2252 |     "a",
2253 |     "vertex",
2254 |     "for",
2255 |     "each",
2256 |     "unigram",
2257 |     "bigram",
2258 |     "trigram",
2259 |     "etc"
2260 |   ],
2261 |   [
2262 |     "however",
2263 |     "to",
2264 |     "keep",
2265 |     "the",
2266 |     "graph",
2267 |     "small",
2268 |     "the",
2269 |     "authors",
2270 |     "decide",
2271 |     "to",
2272 |     "rank",
2273 |     "individual",
2274 |     "unigrams",
2275 |     "in",
2276 |     "a",
2277 |     "first",
2278 |     "step",
2279 |     "and",
2280 |     "then",
2281 |     "include",
2282 |     "a",
2283 |     "second",
2284 |     "step",
2285 |     "that",
2286 |     "merges",
2287 |     "highly",
2288 |     "ranked",
2289 |     "adjacent",
2290 |     "unigrams",
2291 |     "to",
2292 |     "form",
2293 |     "multi-word",
2294 |     "phrases"
2295 |   ],
2296 |   [
2297 |     "this",
2298 |     "has",
2299 |     "a",
2300 |     "nice",
2301 |     "side",
2302 |     "effect",
2303 |     "of",
2304 |     "allowing",
2305 |     "us",
2306 |     "to",
2307 |     "produce",
2308 |     "keyphrases",
2309 |     "of",
2310 |     "arbitrary",
2311 |     "length"
2312 |   ],
2313 |   [
2314 |     "for",
2315 |     "example",
2316 |     "if",
2317 |     "we",
2318 |     "rank",
2319 |     "unigrams",
2320 |     "and",
2321 |     "find",
2322 |     "that",
2323 |     "advanced",
2324 |     "natural",
2325 |     "language",
2326 |     "and",
2327 |     "processing",
2328 |     "all",
2329 |     "get",
2330 |     "high",
2331 |     "ranks",
2332 |     "then",
2333 |     "we",
2334 |     "would",
2335 |     "look",
2336 |     "at",
2337 |     "the",
2338 |     "original",
2339 |     "text",
2340 |     "and",
2341 |     "see",
2342 |     "that",
2343 |     "these",
2344 |     "words",
2345 |     "appear",
2346 |     "consecutively",
2347 |     "and",
2348 |     "create",
2349 |     "a",
2350 |     "final",
2351 |     "keyphrase",
2352 |     "using",
2353 |     "all",
2354 |     "four",
2355 |     "together"
2356 |   ],
2357 |   [
2358 |     "note",
2359 |     "that",
2360 |     "the",
2361 |     "unigrams",
2362 |     "placed",
2363 |     "in",
2364 |     "the",
2365 |     "graph",
2366 |     "can",
2367 |     "be",
2368 |     "filtered",
2369 |     "by",
2370 |     "part",
2371 |     "of",
2372 |     "speech"
2373 |   ],
2374 |   [
2375 |     "the",
2376 |     "authors",
2377 |     "found",
2378 |     "that",
2379 |     "adjectives",
2380 |     "and",
2381 |     "nouns",
2382 |     "were",
2383 |     "the",
2384 |     "best",
2385 |     "to",
2386 |     "include"
2387 |   ],
2388 |   [
2389 |     "thus",
2390 |     "some",
2391 |     "linguistic",
2392 |     "knowledge",
2393 |     "comes",
2394 |     "into",
2395 |     "play",
2396 |     "in",
2397 |     "this",
2398 |     "step"
2399 |   ],
2400 |   [
2401 |     "how",
2402 |     "should",
2403 |     "we",
2404 |     "create",
2405 |     "edges"
2406 |   ],
2407 |   [
2408 |     "edges",
2409 |     "are",
2410 |     "created",
2411 |     "based",
2412 |     "on",
2413 |     "word",
2414 |     "co-occurrence",
2415 |     "in",
2416 |     "this",
2417 |     "application",
2418 |     "of",
2419 |     "textrank"
2420 |   ],
2421 |   [
2422 |     "two",
2423 |     "vertices",
2424 |     "are",
2425 |     "connected",
2426 |     "by",
2427 |     "an",
2428 |     "edge",
2429 |     "if",
2430 |     "the",
2431 |     "unigrams",
2432 |     "appear",
2433 |     "within",
2434 |     "a",
2435 |     "window",
2436 |     "of",
2437 |     "size",
2438 |     "n",
2439 |     "in",
2440 |     "the",
2441 |     "original",
2442 |     "text"
2443 |   ],
2444 |   [
2445 |     "n",
2446 |     "is",
2447 |     "typically",
2448 |     "around",
2449 |     "2",
2450 |     "10"
2451 |   ],
2452 |   [
2453 |     "thus",
2454 |     "natural",
2455 |     "and",
2456 |     "language",
2457 |     "might",
2458 |     "be",
2459 |     "linked",
2460 |     "in",
2461 |     "a",
2462 |     "text",
2463 |     "about",
2464 |     "nlp"
2465 |   ],
2466 |   [
2467 |     "natural",
2468 |     "and",
2469 |     "processing",
2470 |     "would",
2471 |     "also",
2472 |     "be",
2473 |     "linked",
2474 |     "because",
2475 |     "they",
2476 |     "would",
2477 |     "both",
2478 |     "appear",
2479 |     "in",
2480 |     "the",
2481 |     "same",
2482 |     "string",
2483 |     "of",
2484 |     "n",
2485 |     "words"
2486 |   ],
2487 |   [
2488 |     "these",
2489 |     "edges",
2490 |     "build",
2491 |     "on",
2492 |     "the",
2493 |     "notion",
2494 |     "of",
2495 |     "text",
2496 |     "cohesion",
2497 |     "and",
2498 |     "the",
2499 |     "idea",
2500 |     "that",
2501 |     "words",
2502 |     "that",
2503 |     "appear",
2504 |     "near",
2505 |     "each",
2506 |     "other",
2507 |     "are",
2508 |     "likely",
2509 |     "related",
2510 |     "in",
2511 |     "a",
2512 |     "meaningful",
2513 |     "way",
2514 |     "and",
2515 |     "recommend",
2516 |     "each",
2517 |     "other",
2518 |     "to",
2519 |     "the",
2520 |     "reader"
2521 |   ],
2522 |   [
2523 |     "how",
2524 |     "are",
2525 |     "the",
2526 |     "final",
2527 |     "keyphrases",
2528 |     "formed"
2529 |   ],
2530 |   [
2531 |     "since",
2532 |     "this",
2533 |     "method",
2534 |     "simply",
2535 |     "ranks",
2536 |     "the",
2537 |     "individual",
2538 |     "vertices",
2539 |     "we",
2540 |     "need",
2541 |     "a",
2542 |     "way",
2543 |     "to",
2544 |     "threshold",
2545 |     "or",
2546 |     "produce",
2547 |     "a",
2548 |     "limited",
2549 |     "number",
2550 |     "of",
2551 |     "keyphrases"
2552 |   ],
2553 |   [
2554 |     "the",
2555 |     "technique",
2556 |     "chosen",
2557 |     "is",
2558 |     "to",
2559 |     "set",
2560 |     "a",
2561 |     "count",
2562 |     "t",
2563 |     "to",
2564 |     "be",
2565 |     "a",
2566 |     "user-specified",
2567 |     "fraction",
2568 |     "of",
2569 |     "the",
2570 |     "total",
2571 |     "number",
2572 |     "of",
2573 |     "vertices",
2574 |     "in",
2575 |     "the",
2576 |     "graph"
2577 |   ],
2578 |   [
2579 |     "then",
2580 |     "the",
2581 |     "top",
2582 |     "t",
2583 |     "vertices/unigrams",
2584 |     "are",
2585 |     "selected",
2586 |     "based",
2587 |     "on",
2588 |     "their",
2589 |     "stationary",
2590 |     "probabilities"
2591 |   ],
2592 |   [
2593 |     "a",
2594 |     "post-",
2595 |     "processing",
2596 |     "step",
2597 |     "is",
2598 |     "then",
2599 |     "applied",
2600 |     "to",
2601 |     "merge",
2602 |     "adjacent",
2603 |     "instances",
2604 |     "of",
2605 |     "these",
2606 |     "t",
2607 |     "unigrams"
2608 |   ],
2609 |   [
2610 |     "as",
2611 |     "a",
2612 |     "result",
2613 |     "potentially",
2614 |     "more",
2615 |     "or",
2616 |     "less",
2617 |     "than",
2618 |     "t",
2619 |     "final",
2620 |     "keyphrases",
2621 |     "will",
2622 |     "be",
2623 |     "produced",
2624 |     "but",
2625 |     "the",
2626 |     "number",
2627 |     "should",
2628 |     "be",
2629 |     "roughly",
2630 |     "proportional",
2631 |     "to",
2632 |     "the",
2633 |     "length",
2634 |     "of",
2635 |     "the",
2636 |     "original",
2637 |     "text"
2638 |   ],
2639 |   [
2640 |     "it",
2641 |     "is",
2642 |     "not",
2643 |     "initially",
2644 |     "clear",
2645 |     "why",
2646 |     "applying",
2647 |     "pagerank",
2648 |     "to",
2649 |     "a",
2650 |     "co-occurrence",
2651 |     "graph",
2652 |     "would",
2653 |     "produce",
2654 |     "useful",
2655 |     "keyphrases"
2656 |   ],
2657 |   [
2658 |     "one",
2659 |     "way",
2660 |     "to",
2661 |     "think",
2662 |     "about",
2663 |     "it",
2664 |     "is",
2665 |     "the",
2666 |     "following"
2667 |   ],
2668 |   [
2669 |     "a",
2670 |     "word",
2671 |     "that",
2672 |     "appears",
2673 |     "multiple",
2674 |     "times",
2675 |     "throughout",
2676 |     "a",
2677 |     "text",
2678 |     "may",
2679 |     "have",
2680 |     "many",
2681 |     "different",
2682 |     "co-occurring",
2683 |     "neighbors"
2684 |   ],
2685 |   [
2686 |     "for",
2687 |     "example",
2688 |     "in",
2689 |     "a",
2690 |     "text",
2691 |     "about",
2692 |     "machine",
2693 |     "learning",
2694 |     "the",
2695 |     "unigram",
2696 |     "learning",
2697 |     "might",
2698 |     "co-occur",
2699 |     "with",
2700 |     "machine",
2701 |     "supervised",
2702 |     "un-supervised",
2703 |     "and",
2704 |     "semi-supervised",
2705 |     "in",
2706 |     "four",
2707 |     "different",
2708 |     "sentences"
2709 |   ],
2710 |   [
2711 |     "thus",
2712 |     "the",
2713 |     "learning",
2714 |     "vertex",
2715 |     "would",
2716 |     "be",
2717 |     "a",
2718 |     "central",
2719 |     "hub",
2720 |     "that",
2721 |     "connects",
2722 |     "to",
2723 |     "these",
2724 |     "other",
2725 |     "modifying",
2726 |     "words"
2727 |   ],
2728 |   [
2729 |     "running",
2730 |     "pagerank/textrank",
2731 |     "on",
2732 |     "the",
2733 |     "graph",
2734 |     "is",
2735 |     "likely",
2736 |     "to",
2737 |     "rank",
2738 |     "learning",
2739 |     "highly"
2740 |   ],
2741 |   [
2742 |     "similarly",
2743 |     "if",
2744 |     "the",
2745 |     "text",
2746 |     "contains",
2747 |     "the",
2748 |     "phrase",
2749 |     "supervised",
2750 |     "classification",
2751 |     "then",
2752 |     "there",
2753 |     "would",
2754 |     "be",
2755 |     "an",
2756 |     "edge",
2757 |     "between",
2758 |     "supervised",
2759 |     "and",
2760 |     "classification"
2761 |   ],
2762 |   [
2763 |     "if",
2764 |     "classification",
2765 |     "appears",
2766 |     "several",
2767 |     "other",
2768 |     "places",
2769 |     "and",
2770 |     "thus",
2771 |     "has",
2772 |     "many",
2773 |     "neighbors",
2774 |     "its",
2775 |     "importance",
2776 |     "would",
2777 |     "contribute",
2778 |     "to",
2779 |     "the",
2780 |     "importance",
2781 |     "of",
2782 |     "supervised"
2783 |   ],
2784 |   [
2785 |     "if",
2786 |     "it",
2787 |     "ends",
2788 |     "up",
2789 |     "with",
2790 |     "a",
2791 |     "high",
2792 |     "rank",
2793 |     "it",
2794 |     "will",
2795 |     "be",
2796 |     "selected",
2797 |     "as",
2798 |     "one",
2799 |     "of",
2800 |     "the",
2801 |     "top",
2802 |     "t",
2803 |     "unigrams",
2804 |     "along",
2805 |     "with",
2806 |     "learning",
2807 |     "and",
2808 |     "probably",
2809 |     "classification"
2810 |   ],
2811 |   [
2812 |     "in",
2813 |     "the",
2814 |     "final",
2815 |     "post-processing",
2816 |     "step",
2817 |     "we",
2818 |     "would",
2819 |     "then",
2820 |     "end",
2821 |     "up",
2822 |     "with",
2823 |     "keyphrases",
2824 |     "supervised",
2825 |     "learning",
2826 |     "and",
2827 |     "supervised",
2828 |     "classification"
2829 |   ],
2830 |   [
2831 |     "in",
2832 |     "short",
2833 |     "the",
2834 |     "co-occurrence",
2835 |     "graph",
2836 |     "will",
2837 |     "contain",
2838 |     "densely",
2839 |     "connected",
2840 |     "regions",
2841 |     "for",
2842 |     "terms",
2843 |     "that",
2844 |     "appear",
2845 |     "often",
2846 |     "and",
2847 |     "in",
2848 |     "different",
2849 |     "contexts"
2850 |   ],
2851 |   [
2852 |     "a",
2853 |     "random",
2854 |     "walk",
2855 |     "on",
2856 |     "this",
2857 |     "graph",
2858 |     "will",
2859 |     "have",
2860 |     "a",
2861 |     "stationary",
2862 |     "distribution",
2863 |     "that",
2864 |     "assigns",
2865 |     "large",
2866 |     "probabilities",
2867 |     "to",
2868 |     "the",
2869 |     "terms",
2870 |     "in",
2871 |     "the",
2872 |     "centers",
2873 |     "of",
2874 |     "the",
2875 |     "clusters"
2876 |   ],
2877 |   [
2878 |     "this",
2879 |     "is",
2880 |     "similar",
2881 |     "to",
2882 |     "densely",
2883 |     "connected",
2884 |     "web",
2885 |     "pages",
2886 |     "getting",
2887 |     "ranked",
2888 |     "highly",
2889 |     "by",
2890 |     "pagerank"
2891 |   ],
2892 |   [
2893 |     "like",
2894 |     "keyphrase",
2895 |     "extraction",
2896 |     "document",
2897 |     "summarization",
2898 |     "hopes",
2899 |     "to",
2900 |     "identify",
2901 |     "the",
2902 |     "essence",
2903 |     "of",
2904 |     "a",
2905 |     "text"
2906 |   ],
2907 |   [
2908 |     "the",
2909 |     "only",
2910 |     "real",
2911 |     "difference",
2912 |     "is",
2913 |     "that",
2914 |     "now",
2915 |     "we",
2916 |     "are",
2917 |     "dealing",
2918 |     "with",
2919 |     "larger",
2920 |     "text",
2921 |     "units",
2922 |     "whole",
2923 |     "sentences",
2924 |     "instead",
2925 |     "of",
2926 |     "words",
2927 |     "and",
2928 |     "phrases"
2929 |   ],
2930 |   [
2931 |     "before",
2932 |     "getting",
2933 |     "into",
2934 |     "the",
2935 |     "details",
2936 |     "of",
2937 |     "some",
2938 |     "summarization",
2939 |     "methods",
2940 |     "we",
2941 |     "will",
2942 |     "mention",
2943 |     "how",
2944 |     "summarization",
2945 |     "systems",
2946 |     "are",
2947 |     "typically",
2948 |     "evaluated"
2949 |   ],
2950 |   [
2951 |     "the",
2952 |     "most",
2953 |     "common",
2954 |     "way",
2955 |     "is",
2956 |     "using",
2957 |     "the",
2958 |     "so-called",
2959 |     "rouge",
2960 |     "recall-oriented",
2961 |     "understudy",
2962 |     "for",
2963 |     "gisting",
2964 |     "evaluation",
2965 |     "measure"
2966 |   ],
2967 |   [
2968 |     "this",
2969 |     "is",
2970 |     "a",
2971 |     "recall-based",
2972 |     "measure",
2973 |     "that",
2974 |     "determines",
2975 |     "how",
2976 |     "well",
2977 |     "a",
2978 |     "system-generated",
2979 |     "summary",
2980 |     "covers",
2981 |     "the",
2982 |     "content",
2983 |     "present",
2984 |     "in",
2985 |     "one",
2986 |     "or",
2987 |     "more",
2988 |     "human-generated",
2989 |     "model",
2990 |     "summaries",
2991 |     "known",
2992 |     "as",
2993 |     "references"
2994 |   ],
2995 |   [
2996 |     "it",
2997 |     "is",
2998 |     "recall-based",
2999 |     "to",
3000 |     "encourage",
3001 |     "systems",
3002 |     "to",
3003 |     "include",
3004 |     "all",
3005 |     "the",
3006 |     "important",
3007 |     "topics",
3008 |     "in",
3009 |     "the",
3010 |     "text"
3011 |   ],
3012 |   [
3013 |     "recall",
3014 |     "can",
3015 |     "be",
3016 |     "computed",
3017 |     "with",
3018 |     "respect",
3019 |     "to",
3020 |     "unigram",
3021 |     "bigram",
3022 |     "trigram",
3023 |     "or",
3024 |     "4-gram",
3025 |     "matching"
3026 |   ],
3027 |   [
3028 |     "for",
3029 |     "example",
3030 |     "rouge-1",
3031 |     "is",
3032 |     "computed",
3033 |     "as",
3034 |     "division",
3035 |     "of",
3036 |     "count",
3037 |     "of",
3038 |     "unigrams",
3039 |     "in",
3040 |     "reference",
3041 |     "that",
3042 |     "appear",
3043 |     "in",
3044 |     "system",
3045 |     "and",
3046 |     "count",
3047 |     "of",
3048 |     "unigrams",
3049 |     "in",
3050 |     "reference",
3051 |     "summary"
3052 |   ],
3053 |   [
3054 |     "if",
3055 |     "there",
3056 |     "are",
3057 |     "multiple",
3058 |     "references",
3059 |     "the",
3060 |     "rouge-1",
3061 |     "scores",
3062 |     "are",
3063 |     "averaged"
3064 |   ],
3065 |   [
3066 |     "because",
3067 |     "rouge",
3068 |     "is",
3069 |     "based",
3070 |     "only",
3071 |     "on",
3072 |     "content",
3073 |     "overlap",
3074 |     "it",
3075 |     "can",
3076 |     "determine",
3077 |     "if",
3078 |     "the",
3079 |     "same",
3080 |     "general",
3081 |     "concepts",
3082 |     "are",
3083 |     "discussed",
3084 |     "between",
3085 |     "an",
3086 |     "automatic",
3087 |     "summary",
3088 |     "and",
3089 |     "a",
3090 |     "reference",
3091 |     "summary",
3092 |     "but",
3093 |     "it",
3094 |     "cannot",
3095 |     "determine",
3096 |     "if",
3097 |     "the",
3098 |     "result",
3099 |     "is",
3100 |     "coherent",
3101 |     "or",
3102 |     "the",
3103 |     "sentences",
3104 |     "flow",
3105 |     "together",
3106 |     "in",
3107 |     "a",
3108 |     "sensible",
3109 |     "manner"
3110 |   ],
3111 |   [
3112 |     "high-order",
3113 |     "n-gram",
3114 |     "rouge",
3115 |     "measures",
3116 |     "try",
3117 |     "to",
3118 |     "judge",
3119 |     "fluency",
3120 |     "to",
3121 |     "some",
3122 |     "degree"
3123 |   ],
3124 |   [
3125 |     "note",
3126 |     "that",
3127 |     "rouge",
3128 |     "is",
3129 |     "similar",
3130 |     "to",
3131 |     "the",
3132 |     "bleu",
3133 |     "measure",
3134 |     "for",
3135 |     "machine",
3136 |     "translation",
3137 |     "but",
3138 |     "bleu",
3139 |     "is",
3140 |     "precision-",
3141 |     "based",
3142 |     "because",
3143 |     "translation",
3144 |     "systems",
3145 |     "favor",
3146 |     "accuracy"
3147 |   ],
3148 |   [
3149 |     "a",
3150 |     "promising",
3151 |     "line",
3152 |     "in",
3153 |     "document",
3154 |     "summarization",
3155 |     "is",
3156 |     "adaptive",
3157 |     "document/text",
3158 |     "summarization"
3159 |   ],
3160 |   [
3161 |     "the",
3162 |     "idea",
3163 |     "of",
3164 |     "adaptive",
3165 |     "summarization",
3166 |     "involves",
3167 |     "preliminary",
3168 |     "recognition",
3169 |     "of",
3170 |     "document/text",
3171 |     "genre",
3172 |     "and",
3173 |     "subsequent",
3174 |     "application",
3175 |     "of",
3176 |     "summarization",
3177 |     "algorithms",
3178 |     "optimized",
3179 |     "for",
3180 |     "this",
3181 |     "genre"
3182 |   ],
3183 |   [
3184 |     "first",
3185 |     "summarizes",
3186 |     "that",
3187 |     "perform",
3188 |     "adaptive",
3189 |     "summarization",
3190 |     "have",
3191 |     "been",
3192 |     "created"
3193 |   ],
3194 |   [
3195 |     "supervised",
3196 |     "text",
3197 |     "summarization",
3198 |     "is",
3199 |     "very",
3200 |     "much",
3201 |     "like",
3202 |     "supervised",
3203 |     "keyphrase",
3204 |     "extraction"
3205 |   ],
3206 |   [
3207 |     "basically",
3208 |     "if",
3209 |     "you",
3210 |     "have",
3211 |     "a",
3212 |     "collection",
3213 |     "of",
3214 |     "documents",
3215 |     "and",
3216 |     "human-generated",
3217 |     "summaries",
3218 |     "for",
3219 |     "them",
3220 |     "you",
3221 |     "can",
3222 |     "learn",
3223 |     "features",
3224 |     "of",
3225 |     "sentences",
3226 |     "that",
3227 |     "make",
3228 |     "them",
3229 |     "good",
3230 |     "candidates",
3231 |     "for",
3232 |     "inclusion",
3233 |     "in",
3234 |     "the",
3235 |     "summary"
3236 |   ],
3237 |   [
3238 |     "features",
3239 |     "might",
3240 |     "include",
3241 |     "the",
3242 |     "position",
3243 |     "in",
3244 |     "the",
3245 |     "document",
3246 |     "i"
3247 |   ],
3248 |   [
3249 |     "the",
3250 |     "first",
3251 |     "few",
3252 |     "sentences",
3253 |     "are",
3254 |     "probably",
3255 |     "important",
3256 |     "the",
3257 |     "number",
3258 |     "of",
3259 |     "words",
3260 |     "in",
3261 |     "the",
3262 |     "sentence",
3263 |     "etc"
3264 |   ],
3265 |   [
3266 |     "the",
3267 |     "main",
3268 |     "difficulty",
3269 |     "in",
3270 |     "supervised",
3271 |     "extractive",
3272 |     "summarization",
3273 |     "is",
3274 |     "that",
3275 |     "the",
3276 |     "known",
3277 |     "summaries",
3278 |     "must",
3279 |     "be",
3280 |     "manually",
3281 |     "created",
3282 |     "by",
3283 |     "extracting",
3284 |     "sentences",
3285 |     "so",
3286 |     "the",
3287 |     "sentences",
3288 |     "in",
3289 |     "an",
3290 |     "original",
3291 |     "training",
3292 |     "document",
3293 |     "can",
3294 |     "be",
3295 |     "labeled",
3296 |     "as",
3297 |     "in",
3298 |     "summary",
3299 |     "or",
3300 |     "not",
3301 |     "in",
3302 |     "summary"
3303 |   ],
3304 |   [
3305 |     "this",
3306 |     "is",
3307 |     "not",
3308 |     "typically",
3309 |     "how",
3310 |     "people",
3311 |     "create",
3312 |     "summaries",
3313 |     "so",
3314 |     "simply",
3315 |     "using",
3316 |     "journal",
3317 |     "abstracts",
3318 |     "or",
3319 |     "existing",
3320 |     "summaries",
3321 |     "is",
3322 |     "usually",
3323 |     "not",
3324 |     "sufficient"
3325 |   ],
3326 |   [
3327 |     "the",
3328 |     "sentences",
3329 |     "in",
3330 |     "these",
3331 |     "summaries",
3332 |     "do",
3333 |     "not",
3334 |     "necessarily",
3335 |     "match",
3336 |     "up",
3337 |     "with",
3338 |     "sentences",
3339 |     "in",
3340 |     "the",
3341 |     "original",
3342 |     "text",
3343 |     "so",
3344 |     "it",
3345 |     "would",
3346 |     "be",
3347 |     "difficult",
3348 |     "to",
3349 |     "assign",
3350 |     "labels",
3351 |     "to",
3352 |     "examples",
3353 |     "for",
3354 |     "training"
3355 |   ],
3356 |   [
3357 |     "note",
3358 |     "however",
3359 |     "that",
3360 |     "these",
3361 |     "natural",
3362 |     "summaries",
3363 |     "can",
3364 |     "still",
3365 |     "be",
3366 |     "used",
3367 |     "for",
3368 |     "evaluation",
3369 |     "purposes",
3370 |     "since",
3371 |     "rouge-1",
3372 |     "only",
3373 |     "cares",
3374 |     "about",
3375 |     "unigrams"
3376 |   ],
3377 |   [
3378 |     "the",
3379 |     "unsupervised",
3380 |     "approach",
3381 |     "to",
3382 |     "summarization",
3383 |     "is",
3384 |     "also",
3385 |     "quite",
3386 |     "similar",
3387 |     "in",
3388 |     "spirit",
3389 |     "to",
3390 |     "unsupervised",
3391 |     "keyphrase",
3392 |     "extraction",
3393 |     "and",
3394 |     "gets",
3395 |     "around",
3396 |     "the",
3397 |     "issue",
3398 |     "of",
3399 |     "costly",
3400 |     "training",
3401 |     "data"
3402 |   ],
3403 |   [
3404 |     "some",
3405 |     "unsupervised",
3406 |     "summarization",
3407 |     "approaches",
3408 |     "are",
3409 |     "based",
3410 |     "on",
3411 |     "finding",
3412 |     "a",
3413 |     "centroid",
3414 |     "sentence",
3415 |     "which",
3416 |     "is",
3417 |     "the",
3418 |     "mean",
3419 |     "word",
3420 |     "vector",
3421 |     "of",
3422 |     "all",
3423 |     "the",
3424 |     "sentences",
3425 |     "in",
3426 |     "the",
3427 |     "document"
3428 |   ],
3429 |   [
3430 |     "then",
3431 |     "the",
3432 |     "sentences",
3433 |     "can",
3434 |     "be",
3435 |     "ranked",
3436 |     "with",
3437 |     "regard",
3438 |     "to",
3439 |     "their",
3440 |     "similarity",
3441 |     "to",
3442 |     "this",
3443 |     "centroid",
3444 |     "sentence"
3445 |   ],
3446 |   [
3447 |     "a",
3448 |     "more",
3449 |     "principled",
3450 |     "way",
3451 |     "to",
3452 |     "estimate",
3453 |     "sentence",
3454 |     "importance",
3455 |     "is",
3456 |     "using",
3457 |     "random",
3458 |     "walks",
3459 |     "and",
3460 |     "eigenvector",
3461 |     "centrality"
3462 |   ],
3463 |   [
3464 |     "lexrank",
3465 |     "is",
3466 |     "an",
3467 |     "algorithm",
3468 |     "essentially",
3469 |     "identical",
3470 |     "to",
3471 |     "textrank",
3472 |     "and",
3473 |     "both",
3474 |     "use",
3475 |     "this",
3476 |     "approach",
3477 |     "for",
3478 |     "document",
3479 |     "summarization"
3480 |   ],
3481 |   [
3482 |     "the",
3483 |     "two",
3484 |     "methods",
3485 |     "were",
3486 |     "developed",
3487 |     "by",
3488 |     "different",
3489 |     "groups",
3490 |     "at",
3491 |     "the",
3492 |     "same",
3493 |     "time",
3494 |     "and",
3495 |     "lexrank",
3496 |     "simply",
3497 |     "focused",
3498 |     "on",
3499 |     "summarization",
3500 |     "but",
3501 |     "could",
3502 |     "just",
3503 |     "as",
3504 |     "easily",
3505 |     "be",
3506 |     "used",
3507 |     "for",
3508 |     "keyphrase",
3509 |     "extraction",
3510 |     "or",
3511 |     "any",
3512 |     "other",
3513 |     "nlp",
3514 |     "ranking",
3515 |     "task"
3516 |   ],
3517 |   [
3518 |     "what",
3519 |     "are",
3520 |     "the",
3521 |     "vertices"
3522 |   ],
3523 |   [
3524 |     "in",
3525 |     "both",
3526 |     "lexrank",
3527 |     "and",
3528 |     "textrank",
3529 |     "a",
3530 |     "graph",
3531 |     "is",
3532 |     "constructed",
3533 |     "by",
3534 |     "creating",
3535 |     "a",
3536 |     "vertex",
3537 |     "for",
3538 |     "each",
3539 |     "sentence",
3540 |     "in",
3541 |     "the",
3542 |     "document"
3543 |   ],
3544 |   [
3545 |     "what",
3546 |     "are",
3547 |     "the",
3548 |     "edges"
3549 |   ],
3550 |   [
3551 |     "the",
3552 |     "edges",
3553 |     "between",
3554 |     "sentences",
3555 |     "are",
3556 |     "based",
3557 |     "on",
3558 |     "some",
3559 |     "form",
3560 |     "of",
3561 |     "semantic",
3562 |     "similarity",
3563 |     "or",
3564 |     "content",
3565 |     "overlap"
3566 |   ],
3567 |   [
3568 |     "while",
3569 |     "lexrank",
3570 |     "uses",
3571 |     "cosine",
3572 |     "similarity",
3573 |     "of",
3574 |     "tf-idf",
3575 |     "vectors",
3576 |     "textrank",
3577 |     "uses",
3578 |     "a",
3579 |     "very",
3580 |     "similar",
3581 |     "measure",
3582 |     "based",
3583 |     "on",
3584 |     "the",
3585 |     "number",
3586 |     "of",
3587 |     "words",
3588 |     "two",
3589 |     "sentences",
3590 |     "have",
3591 |     "in",
3592 |     "common",
3593 |     "normalized",
3594 |     "by",
3595 |     "the",
3596 |     "sentences'",
3597 |     "lengths"
3598 |   ],
3599 |   [
3600 |     "the",
3601 |     "lexrank",
3602 |     "paper",
3603 |     "explored",
3604 |     "using",
3605 |     "unweighted",
3606 |     "edges",
3607 |     "after",
3608 |     "applying",
3609 |     "a",
3610 |     "threshold",
3611 |     "to",
3612 |     "the",
3613 |     "cosine",
3614 |     "values",
3615 |     "but",
3616 |     "also",
3617 |     "experimented",
3618 |     "with",
3619 |     "using",
3620 |     "edges",
3621 |     "with",
3622 |     "weights",
3623 |     "equal",
3624 |     "to",
3625 |     "the",
3626 |     "similarity",
3627 |     "score"
3628 |   ],
3629 |   [
3630 |     "textrank",
3631 |     "uses",
3632 |     "continuous",
3633 |     "similarity",
3634 |     "scores",
3635 |     "as",
3636 |     "weights"
3637 |   ],
3638 |   [
3639 |     "how",
3640 |     "are",
3641 |     "summaries",
3642 |     "formed"
3643 |   ],
3644 |   [
3645 |     "in",
3646 |     "both",
3647 |     "algorithms",
3648 |     "the",
3649 |     "sentences",
3650 |     "are",
3651 |     "ranked",
3652 |     "by",
3653 |     "applying",
3654 |     "pagerank",
3655 |     "to",
3656 |     "the",
3657 |     "resulting",
3658 |     "graph"
3659 |   ],
3660 |   [
3661 |     "a",
3662 |     "summary",
3663 |     "is",
3664 |     "formed",
3665 |     "by",
3666 |     "combining",
3667 |     "the",
3668 |     "top",
3669 |     "ranking",
3670 |     "sentences",
3671 |     "using",
3672 |     "a",
3673 |     "threshold",
3674 |     "or",
3675 |     "length",
3676 |     "cutoff",
3677 |     "to",
3678 |     "limit",
3679 |     "the",
3680 |     "size",
3681 |     "of",
3682 |     "the",
3683 |     "summary"
3684 |   ],
3685 |   [
3686 |     "it",
3687 |     "is",
3688 |     "worth",
3689 |     "noting",
3690 |     "that",
3691 |     "textrank",
3692 |     "was",
3693 |     "applied",
3694 |     "to",
3695 |     "summarization",
3696 |     "exactly",
3697 |     "as",
3698 |     "described",
3699 |     "here",
3700 |     "while",
3701 |     "lexrank",
3702 |     "was",
3703 |     "used",
3704 |     "as",
3705 |     "part",
3706 |     "of",
3707 |     "a",
3708 |     "larger",
3709 |     "summarization",
3710 |     "system",
3711 |     "mead",
3712 |     "that",
3713 |     "combines",
3714 |     "the",
3715 |     "lexrank",
3716 |     "score",
3717 |     "stationary",
3718 |     "probability",
3719 |     "with",
3720 |     "other",
3721 |     "features",
3722 |     "like",
3723 |     "sentence",
3724 |     "position",
3725 |     "and",
3726 |     "length",
3727 |     "using",
3728 |     "a",
3729 |     "linear",
3730 |     "combination",
3731 |     "with",
3732 |     "either",
3733 |     "user-specified",
3734 |     "or",
3735 |     "automatically",
3736 |     "tuned",
3737 |     "weights"
3738 |   ],
3739 |   [
3740 |     "in",
3741 |     "this",
3742 |     "case",
3743 |     "some",
3744 |     "training",
3745 |     "documents",
3746 |     "might",
3747 |     "be",
3748 |     "needed",
3749 |     "though",
3750 |     "the",
3751 |     "textrank",
3752 |     "results",
3753 |     "show",
3754 |     "the",
3755 |     "additional",
3756 |     "features",
3757 |     "are",
3758 |     "not",
3759 |     "absolutely",
3760 |     "necessary"
3761 |   ],
3762 |   [
3763 |     "another",
3764 |     "important",
3765 |     "distinction",
3766 |     "is",
3767 |     "that",
3768 |     "textrank",
3769 |     "was",
3770 |     "used",
3771 |     "for",
3772 |     "single",
3773 |     "document",
3774 |     "summarization",
3775 |     "while",
3776 |     "lexrank",
3777 |     "has",
3778 |     "been",
3779 |     "applied",
3780 |     "to",
3781 |     "multi-document",
3782 |     "summarization"
3783 |   ],
3784 |   [
3785 |     "the",
3786 |     "task",
3787 |     "remains",
3788 |     "the",
3789 |     "same",
3790 |     "in",
3791 |     "both",
3792 |     "cases",
3793 |     "only",
3794 |     "the",
3795 |     "number",
3796 |     "of",
3797 |     "sentences",
3798 |     "to",
3799 |     "choose",
3800 |     "from",
3801 |     "has",
3802 |     "grown"
3803 |   ],
3804 |   [
3805 |     "however",
3806 |     "when",
3807 |     "summarizing",
3808 |     "multiple",
3809 |     "documents",
3810 |     "there",
3811 |     "is",
3812 |     "a",
3813 |     "greater",
3814 |     "risk",
3815 |     "of",
3816 |     "selecting",
3817 |     "duplicate",
3818 |     "or",
3819 |     "highly",
3820 |     "redundant",
3821 |     "sentences",
3822 |     "to",
3823 |     "place",
3824 |     "in",
3825 |     "the",
3826 |     "same",
3827 |     "summary"
3828 |   ],
3829 |   [
3830 |     "imagine",
3831 |     "you",
3832 |     "have",
3833 |     "a",
3834 |     "cluster",
3835 |     "of",
3836 |     "news",
3837 |     "articles",
3838 |     "on",
3839 |     "a",
3840 |     "particular",
3841 |     "event",
3842 |     "and",
3843 |     "you",
3844 |     "want",
3845 |     "to",
3846 |     "produce",
3847 |     "one",
3848 |     "summary"
3849 |   ],
3850 |   [
3851 |     "each",
3852 |     "article",
3853 |     "is",
3854 |     "likely",
3855 |     "to",
3856 |     "have",
3857 |     "many",
3858 |     "similar",
3859 |     "sentences",
3860 |     "and",
3861 |     "you",
3862 |     "would",
3863 |     "only",
3864 |     "want",
3865 |     "to",
3866 |     "include",
3867 |     "distinct",
3868 |     "ideas",
3869 |     "in",
3870 |     "the",
3871 |     "summary"
3872 |   ],
3873 |   [
3874 |     "to",
3875 |     "address",
3876 |     "this",
3877 |     "issue",
3878 |     "lexrank",
3879 |     "applies",
3880 |     "a",
3881 |     "heuristic",
3882 |     "post-processing",
3883 |     "step",
3884 |     "that",
3885 |     "builds",
3886 |     "up",
3887 |     "a",
3888 |     "summary",
3889 |     "by",
3890 |     "adding",
3891 |     "sentences",
3892 |     "in",
3893 |     "rank",
3894 |     "order",
3895 |     "but",
3896 |     "discards",
3897 |     "any",
3898 |     "sentences",
3899 |     "that",
3900 |     "are",
3901 |     "too",
3902 |     "similar",
3903 |     "to",
3904 |     "ones",
3905 |     "already",
3906 |     "placed",
3907 |     "in",
3908 |     "the",
3909 |     "summary"
3910 |   ],
3911 |   [
3912 |     "the",
3913 |     "method",
3914 |     "used",
3915 |     "is",
3916 |     "called",
3917 |     "cross-sentence",
3918 |     "information",
3919 |     "subsumption",
3920 |     "csis"
3921 |   ],
3922 |   [
3923 |     "these",
3924 |     "methods",
3925 |     "work",
3926 |     "based",
3927 |     "on",
3928 |     "the",
3929 |     "idea",
3930 |     "that",
3931 |     "sentences",
3932 |     "recommend",
3933 |     "other",
3934 |     "similar",
3935 |     "sentences",
3936 |     "to",
3937 |     "the",
3938 |     "reader"
3939 |   ],
3940 |   [
3941 |     "thus",
3942 |     "if",
3943 |     "one",
3944 |     "sentence",
3945 |     "is",
3946 |     "very",
3947 |     "similar",
3948 |     "to",
3949 |     "many",
3950 |     "others",
3951 |     "it",
3952 |     "will",
3953 |     "likely",
3954 |     "be",
3955 |     "a",
3956 |     "sentence",
3957 |     "of",
3958 |     "great",
3959 |     "importance"
3960 |   ],
3961 |   [
3962 |     "the",
3963 |     "importance",
3964 |     "of",
3965 |     "this",
3966 |     "sentence",
3967 |     "also",
3968 |     "stems",
3969 |     "from",
3970 |     "the",
3971 |     "importance",
3972 |     "of",
3973 |     "the",
3974 |     "sentences",
3975 |     "recommending",
3976 |     "it"
3977 |   ],
3978 |   [
3979 |     "thus",
3980 |     "to",
3981 |     "get",
3982 |     "ranked",
3983 |     "highly",
3984 |     "and",
3985 |     "placed",
3986 |     "in",
3987 |     "a",
3988 |     "summary",
3989 |     "a",
3990 |     "sentence",
3991 |     "must",
3992 |     "be",
3993 |     "similar",
3994 |     "to",
3995 |     "many",
3996 |     "sentences",
3997 |     "that",
3998 |     "are",
3999 |     "in",
4000 |     "turn",
4001 |     "also",
4002 |     "similar",
4003 |     "to",
4004 |     "many",
4005 |     "other",
4006 |     "sentences"
4007 |   ],
4008 |   [
4009 |     "this",
4010 |     "makes",
4011 |     "intuitive",
4012 |     "sense",
4013 |     "and",
4014 |     "allows",
4015 |     "the",
4016 |     "algorithms",
4017 |     "to",
4018 |     "be",
4019 |     "applied",
4020 |     "to",
4021 |     "any",
4022 |     "arbitrary",
4023 |     "new",
4024 |     "text"
4025 |   ],
4026 |   [
4027 |     "the",
4028 |     "methods",
4029 |     "are",
4030 |     "domain-independent",
4031 |     "and",
4032 |     "easily",
4033 |     "portable"
4034 |   ],
4035 |   [
4036 |     "one",
4037 |     "could",
4038 |     "imagine",
4039 |     "the",
4040 |     "features",
4041 |     "indicating",
4042 |     "important",
4043 |     "sentences",
4044 |     "in",
4045 |     "the",
4046 |     "news",
4047 |     "domain",
4048 |     "might",
4049 |     "vary",
4050 |     "considerably",
4051 |     "from",
4052 |     "the",
4053 |     "biomedical",
4054 |     "domain"
4055 |   ],
4056 |   [
4057 |     "however",
4058 |     "the",
4059 |     "unsupervised",
4060 |     "recommendation",
4061 |     "-based",
4062 |     "approach",
4063 |     "applies",
4064 |     "to",
4065 |     "any",
4066 |     "domain"
4067 |   ],
4068 |   [
4069 |     "multi-document",
4070 |     "summarization",
4071 |     "is",
4072 |     "an",
4073 |     "automatic",
4074 |     "procedure",
4075 |     "aimed",
4076 |     "at",
4077 |     "extraction",
4078 |     "of",
4079 |     "information",
4080 |     "from",
4081 |     "multiple",
4082 |     "texts",
4083 |     "written",
4084 |     "about",
4085 |     "the",
4086 |     "same",
4087 |     "topic"
4088 |   ],
4089 |   [
4090 |     "resulting",
4091 |     "summary",
4092 |     "report",
4093 |     "allows",
4094 |     "individual",
4095 |     "users",
4096 |     "such",
4097 |     "as",
4098 |     "professional",
4099 |     "information",
4100 |     "consumers",
4101 |     "to",
4102 |     "quickly",
4103 |     "familiarize",
4104 |     "themselves",
4105 |     "with",
4106 |     "information",
4107 |     "contained",
4108 |     "in",
4109 |     "a",
4110 |     "large",
4111 |     "cluster",
4112 |     "of",
4113 |     "documents"
4114 |   ],
4115 |   [
4116 |     "in",
4117 |     "such",
4118 |     "a",
4119 |     "way",
4120 |     "multi-document",
4121 |     "summarization",
4122 |     "systems",
4123 |     "are",
4124 |     "complementing",
4125 |     "the",
4126 |     "news",
4127 |     "aggregators",
4128 |     "performing",
4129 |     "the",
4130 |     "next",
4131 |     "step",
4132 |     "down",
4133 |     "the",
4134 |     "road",
4135 |     "of",
4136 |     "coping",
4137 |     "with",
4138 |     "information",
4139 |     "overload"
4140 |   ],
4141 |   [
4142 |     "multi-document",
4143 |     "summarization",
4144 |     "creates",
4145 |     "information",
4146 |     "reports",
4147 |     "that",
4148 |     "are",
4149 |     "both",
4150 |     "concise",
4151 |     "and",
4152 |     "comprehensive"
4153 |   ],
4154 |   [
4155 |     "with",
4156 |     "different",
4157 |     "opinions",
4158 |     "being",
4159 |     "put",
4160 |     "together",
4161 |     "&",
4162 |     "outlined",
4163 |     "every",
4164 |     "topic",
4165 |     "is",
4166 |     "described",
4167 |     "from",
4168 |     "multiple",
4169 |     "perspectives",
4170 |     "within",
4171 |     "a",
4172 |     "single",
4173 |     "document"
4174 |   ],
4175 |   [
4176 |     "while",
4177 |     "the",
4178 |     "goal",
4179 |     "of",
4180 |     "a",
4181 |     "brief",
4182 |     "summary",
4183 |     "is",
4184 |     "to",
4185 |     "simplify",
4186 |     "information",
4187 |     "search",
4188 |     "and",
4189 |     "cut",
4190 |     "the",
4191 |     "time",
4192 |     "by",
4193 |     "pointing",
4194 |     "to",
4195 |     "the",
4196 |     "most",
4197 |     "relevant",
4198 |     "source",
4199 |     "documents",
4200 |     "comprehensive",
4201 |     "multi-document",
4202 |     "summary",
4203 |     "should",
4204 |     "itself",
4205 |     "contain",
4206 |     "the",
4207 |     "required",
4208 |     "information",
4209 |     "hence",
4210 |     "limiting",
4211 |     "the",
4212 |     "need",
4213 |     "for",
4214 |     "accessing",
4215 |     "original",
4216 |     "files",
4217 |     "to",
4218 |     "cases",
4219 |     "when",
4220 |     "refinement",
4221 |     "is",
4222 |     "required"
4223 |   ],
4224 |   [
4225 |     "automatic",
4226 |     "summaries",
4227 |     "present",
4228 |     "information",
4229 |     "extracted",
4230 |     "from",
4231 |     "multiple",
4232 |     "sources",
4233 |     "algorithmically",
4234 |     "without",
4235 |     "any",
4236 |     "editorial",
4237 |     "touch",
4238 |     "or",
4239 |     "subjective",
4240 |     "human",
4241 |     "intervention",
4242 |     "thus",
4243 |     "making",
4244 |     "it",
4245 |     "completely",
4246 |     "unbiased"
4247 |   ],
4248 |   [
4249 |     "multi-document",
4250 |     "extractive",
4251 |     "summarization",
4252 |     "faces",
4253 |     "a",
4254 |     "problem",
4255 |     "of",
4256 |     "potential",
4257 |     "redundancy"
4258 |   ],
4259 |   [
4260 |     "ideally",
4261 |     "we",
4262 |     "would",
4263 |     "like",
4264 |     "to",
4265 |     "extract",
4266 |     "sentences",
4267 |     "that",
4268 |     "are",
4269 |     "both",
4270 |     "central",
4271 |     "i"
4272 |   ],
4273 |   [
4274 |     "contain",
4275 |     "the",
4276 |     "main",
4277 |     "ideas",
4278 |     "and",
4279 |     "diverse",
4280 |     "i"
4281 |   ],
4282 |   [
4283 |     "they",
4284 |     "differ",
4285 |     "from",
4286 |     "one",
4287 |     "another"
4288 |   ],
4289 |   [
4290 |     "lexrank",
4291 |     "deals",
4292 |     "with",
4293 |     "diversity",
4294 |     "as",
4295 |     "a",
4296 |     "heuristic",
4297 |     "final",
4298 |     "stage",
4299 |     "using",
4300 |     "csis",
4301 |     "and",
4302 |     "other",
4303 |     "systems",
4304 |     "have",
4305 |     "used",
4306 |     "similar",
4307 |     "methods",
4308 |     "such",
4309 |     "as",
4310 |     "maximal",
4311 |     "marginal",
4312 |     "relevance",
4313 |     "mmr",
4314 |     "in",
4315 |     "trying",
4316 |     "to",
4317 |     "eliminate",
4318 |     "redundancy",
4319 |     "in",
4320 |     "information",
4321 |     "retrieval",
4322 |     "results"
4323 |   ],
4324 |   [
4325 |     "there",
4326 |     "is",
4327 |     "a",
4328 |     "general",
4329 |     "purpose",
4330 |     "graph-based",
4331 |     "ranking",
4332 |     "algorithm",
4333 |     "like",
4334 |     "page/lex/textrank",
4335 |     "that",
4336 |     "handles",
4337 |     "both",
4338 |     "centrality",
4339 |     "and",
4340 |     "diversity",
4341 |     "in",
4342 |     "a",
4343 |     "unified",
4344 |     "mathematical",
4345 |     "framework",
4346 |     "based",
4347 |     "on",
4348 |     "absorbing",
4349 |     "markov",
4350 |     "chain",
4351 |     "random",
4352 |     "walks"
4353 |   ],
4354 |   [
4355 |     "an",
4356 |     "absorbing",
4357 |     "random",
4358 |     "walk",
4359 |     "is",
4360 |     "like",
4361 |     "a",
4362 |     "standard",
4363 |     "random",
4364 |     "walk",
4365 |     "except",
4366 |     "some",
4367 |     "states",
4368 |     "are",
4369 |     "now",
4370 |     "absorbing",
4371 |     "states",
4372 |     "that",
4373 |     "act",
4374 |     "as",
4375 |     "black",
4376 |     "holes",
4377 |     "that",
4378 |     "cause",
4379 |     "the",
4380 |     "walk",
4381 |     "to",
4382 |     "end",
4383 |     "abruptly",
4384 |     "at",
4385 |     "that",
4386 |     "state"
4387 |   ],
4388 |   [
4389 |     "the",
4390 |     "algorithm",
4391 |     "is",
4392 |     "called",
4393 |     "grasshopper"
4394 |   ],
4395 |   [
4396 |     "in",
4397 |     "addition",
4398 |     "to",
4399 |     "explicitly",
4400 |     "promoting",
4401 |     "diversity",
4402 |     "during",
4403 |     "the",
4404 |     "ranking",
4405 |     "process",
4406 |     "grasshopper",
4407 |     "incorporates",
4408 |     "a",
4409 |     "prior",
4410 |     "ranking",
4411 |     "based",
4412 |     "on",
4413 |     "sentence",
4414 |     "position",
4415 |     "in",
4416 |     "the",
4417 |     "case",
4418 |     "of",
4419 |     "summarization"
4420 |   ],
4421 |   [
4422 |     "the",
4423 |     "most",
4424 |     "common",
4425 |     "way",
4426 |     "to",
4427 |     "evaluate",
4428 |     "the",
4429 |     "informativeness",
4430 |     "of",
4431 |     "automatic",
4432 |     "summaries",
4433 |     "is",
4434 |     "to",
4435 |     "compare",
4436 |     "them",
4437 |     "with",
4438 |     "human-made",
4439 |     "model",
4440 |     "summaries"
4441 |   ],
4442 |   [
4443 |     "evaluation",
4444 |     "techniques",
4445 |     "fall",
4446 |     "into",
4447 |     "intrinsic",
4448 |     "and",
4449 |     "extrinsic",
4450 |     "inter-texual",
4451 |     "and",
4452 |     "intra-texual"
4453 |   ],
4454 |   [
4455 |     "an",
4456 |     "intrinsic",
4457 |     "evaluation",
4458 |     "tests",
4459 |     "the",
4460 |     "summarization",
4461 |     "system",
4462 |     "in",
4463 |     "of",
4464 |     "itself",
4465 |     "while",
4466 |     "an",
4467 |     "extrinsic",
4468 |     "evaluation",
4469 |     "tests",
4470 |     "the",
4471 |     "summarization",
4472 |     "based",
4473 |     "on",
4474 |     "how",
4475 |     "it",
4476 |     "affects",
4477 |     "the",
4478 |     "completion",
4479 |     "of",
4480 |     "some",
4481 |     "other",
4482 |     "task"
4483 |   ],
4484 |   [
4485 |     "intrinsic",
4486 |     "evaluations",
4487 |     "have",
4488 |     "assessed",
4489 |     "mainly",
4490 |     "the",
4491 |     "coherence",
4492 |     "and",
4493 |     "informativeness",
4494 |     "of",
4495 |     "summaries"
4496 |   ],
4497 |   [
4498 |     "extrinsic",
4499 |     "evaluations",
4500 |     "on",
4501 |     "the",
4502 |     "other",
4503 |     "hand",
4504 |     "have",
4505 |     "tested",
4506 |     "the",
4507 |     "impact",
4508 |     "of",
4509 |     "summarization",
4510 |     "on",
4511 |     "tasks",
4512 |     "like",
4513 |     "relevance",
4514 |     "assessment",
4515 |     "reading",
4516 |     "comprehension",
4517 |     "etc"
4518 |   ],
4519 |   [
4520 |     "intra-textual",
4521 |     "methods",
4522 |     "assess",
4523 |     "the",
4524 |     "output",
4525 |     "of",
4526 |     "a",
4527 |     "specific",
4528 |     "summarization",
4529 |     "system",
4530 |     "and",
4531 |     "the",
4532 |     "inter-texual",
4533 |     "ones",
4534 |     "focus",
4535 |     "on",
4536 |     "contrastive",
4537 |     "analysis",
4538 |     "of",
4539 |     "outputs",
4540 |     "of",
4541 |     "several",
4542 |     "summarization",
4543 |     "systems"
4544 |   ],
4545 |   [
4546 |     "human",
4547 |     "judgement",
4548 |     "often",
4549 |     "has",
4550 |     "wide",
4551 |     "variance",
4552 |     "on",
4553 |     "what",
4554 |     "is",
4555 |     "considered",
4556 |     "a",
4557 |     "good",
4558 |     "summary",
4559 |     "which",
4560 |     "means",
4561 |     "that",
4562 |     "making",
4563 |     "the",
4564 |     "evaluation",
4565 |     "process",
4566 |     "automatic",
4567 |     "is",
4568 |     "particularly",
4569 |     "difficult"
4570 |   ],
4571 |   [
4572 |     "manual",
4573 |     "evaluation",
4574 |     "can",
4575 |     "be",
4576 |     "used",
4577 |     "but",
4578 |     "this",
4579 |     "is",
4580 |     "both",
4581 |     "time",
4582 |     "and",
4583 |     "labor-intensive",
4584 |     "as",
4585 |     "it",
4586 |     "requires",
4587 |     "humans",
4588 |     "to",
4589 |     "read",
4590 |     "not",
4591 |     "only",
4592 |     "the",
4593 |     "summaries",
4594 |     "but",
4595 |     "also",
4596 |     "the",
4597 |     "source",
4598 |     "documents"
4599 |   ],
4600 |   [
4601 |     "other",
4602 |     "issues",
4603 |     "are",
4604 |     "those",
4605 |     "concerning",
4606 |     "coherence",
4607 |     "and",
4608 |     "coverage"
4609 |   ],
4610 |   [
4611 |     "one",
4612 |     "of",
4613 |     "the",
4614 |     "metrics",
4615 |     "used",
4616 |     "in",
4617 |     "nist's",
4618 |     "annual",
4619 |     "document",
4620 |     "understanding",
4621 |     "conferences",
4622 |     "in",
4623 |     "which",
4624 |     "research",
4625 |     "groups",
4626 |     "submit",
4627 |     "their",
4628 |     "systems",
4629 |     "for",
4630 |     "both",
4631 |     "summarization",
4632 |     "and",
4633 |     "translation",
4634 |     "tasks",
4635 |     "is",
4636 |     "the",
4637 |     "rouge",
4638 |     "metric",
4639 |     "recall-oriented",
4640 |     "understudy",
4641 |     "for",
4642 |     "gisting",
4643 |     "evaluation"
4644 |   ],
4645 |   [
4646 |     "it",
4647 |     "essentially",
4648 |     "calculates",
4649 |     "n-gram",
4650 |     "overlaps",
4651 |     "between",
4652 |     "automatically",
4653 |     "generated",
4654 |     "summaries",
4655 |     "and",
4656 |     "previously-written",
4657 |     "human",
4658 |     "summaries"
4659 |   ],
4660 |   [
4661 |     "a",
4662 |     "high",
4663 |     "level",
4664 |     "of",
4665 |     "overlap",
4666 |     "should",
4667 |     "indicate",
4668 |     "a",
4669 |     "high",
4670 |     "level",
4671 |     "of",
4672 |     "shared",
4673 |     "concepts",
4674 |     "between",
4675 |     "the",
4676 |     "two",
4677 |     "summaries"
4678 |   ],
4679 |   [
4680 |     "note",
4681 |     "that",
4682 |     "overlap",
4683 |     "metrics",
4684 |     "like",
4685 |     "this",
4686 |     "are",
4687 |     "unable",
4688 |     "to",
4689 |     "provide",
4690 |     "any",
4691 |     "feedback",
4692 |     "on",
4693 |     "a",
4694 |     "summary's",
4695 |     "coherence"
4696 |   ],
4697 |   [
4698 |     "anaphor",
4699 |     "resolution",
4700 |     "remains",
4701 |     "another",
4702 |     "problem",
4703 |     "yet",
4704 |     "to",
4705 |     "be",
4706 |     "fully",
4707 |     "solved"
4708 |   ],
4709 |   [
4710 |     "evaluating",
4711 |     "summaries",
4712 |     "either",
4713 |     "manually",
4714 |     "or",
4715 |     "automatically",
4716 |     "is",
4717 |     "a",
4718 |     "hard",
4719 |     "task"
4720 |   ],
4721 |   [
4722 |     "the",
4723 |     "main",
4724 |     "difficulty",
4725 |     "in",
4726 |     "evaluation",
4727 |     "comes",
4728 |     "from",
4729 |     "the",
4730 |     "impossibility",
4731 |     "of",
4732 |     "building",
4733 |     "a",
4734 |     "fair",
4735 |     "gold-standard",
4736 |     "against",
4737 |     "which",
4738 |     "the",
4739 |     "results",
4740 |     "of",
4741 |     "the",
4742 |     "systems",
4743 |     "can",
4744 |     "be",
4745 |     "compared"
4746 |   ],
4747 |   [
4748 |     "furthermore",
4749 |     "it",
4750 |     "is",
4751 |     "also",
4752 |     "very",
4753 |     "hard",
4754 |     "to",
4755 |     "determine",
4756 |     "what",
4757 |     "a",
4758 |     "correct",
4759 |     "summary",
4760 |     "is",
4761 |     "because",
4762 |     "there",
4763 |     "is",
4764 |     "always",
4765 |     "the",
4766 |     "possibility",
4767 |     "of",
4768 |     "a",
4769 |     "system",
4770 |     "to",
4771 |     "generate",
4772 |     "a",
4773 |     "good",
4774 |     "summary",
4775 |     "that",
4776 |     "is",
4777 |     "quite",
4778 |     "different",
4779 |     "from",
4780 |     "any",
4781 |     "human",
4782 |     "summary",
4783 |     "used",
4784 |     "as",
4785 |     "an",
4786 |     "approximation",
4787 |     "to",
4788 |     "the",
4789 |     "correct",
4790 |     "output"
4791 |   ],
4792 |   [
4793 |     "content",
4794 |     "selection",
4795 |     "is",
4796 |     "not",
4797 |     "a",
4798 |     "deterministic",
4799 |     "problem"
4800 |   ],
4801 |   [
4802 |     "people",
4803 |     "are",
4804 |     "subjective",
4805 |     "and",
4806 |     "different",
4807 |     "authors",
4808 |     "would",
4809 |     "choose",
4810 |     "different",
4811 |     "sentences"
4812 |   ],
4813 |   [
4814 |     "and",
4815 |     "individuals",
4816 |     "may",
4817 |     "not",
4818 |     "be",
4819 |     "consistent"
4820 |   ],
4821 |   [
4822 |     "a",
4823 |     "particular",
4824 |     "person",
4825 |     "may",
4826 |     "chose",
4827 |     "different",
4828 |     "sentences",
4829 |     "at",
4830 |     "different",
4831 |     "times"
4832 |   ],
4833 |   [
4834 |     "two",
4835 |     "distinct",
4836 |     "sentences",
4837 |     "expressed",
4838 |     "in",
4839 |     "different",
4840 |     "words",
4841 |     "can",
4842 |     "express",
4843 |     "the",
4844 |     "same",
4845 |     "meaning"
4846 |   ],
4847 |   [
4848 |     "this",
4849 |     "phenomenon",
4850 |     "is",
4851 |     "known",
4852 |     "as",
4853 |     "paraphrasing"
4854 |   ],
4855 |   [
4856 |     "we",
4857 |     "can",
4858 |     "find",
4859 |     "an",
4860 |     "approach",
4861 |     "to",
4862 |     "automatically",
4863 |     "evaluating",
4864 |     "summaries",
4865 |     "using",
4866 |     "paraphrases",
4867 |     "paraeval"
4868 |   ],
4869 |   [
4870 |     "most",
4871 |     "summarization",
4872 |     "systems",
4873 |     "perform",
4874 |     "an",
4875 |     "extractive",
4876 |     "approach",
4877 |     "selecting",
4878 |     "and",
4879 |     "copying",
4880 |     "important",
4881 |     "sentences",
4882 |     "from",
4883 |     "the",
4884 |     "source",
4885 |     "documents"
4886 |   ],
4887 |   [
4888 |     "although",
4889 |     "humans",
4890 |     "can",
4891 |     "also",
4892 |     "cut",
4893 |     "and",
4894 |     "paste",
4895 |     "relevant",
4896 |     "information",
4897 |     "of",
4898 |     "a",
4899 |     "text",
4900 |     "most",
4901 |     "of",
4902 |     "the",
4903 |     "times",
4904 |     "they",
4905 |     "rephrase",
4906 |     "sentences",
4907 |     "when",
4908 |     "necessary",
4909 |     "or",
4910 |     "they",
4911 |     "join",
4912 |     "different",
4913 |     "related",
4914 |     "information",
4915 |     "into",
4916 |     "one",
4917 |     "sentence"
4918 |   ],
4919 |   [
4920 |     "domain",
4921 |     "independent",
4922 |     "summarization",
4923 |     "techniques",
4924 |     "generally",
4925 |     "apply",
4926 |     "sets",
4927 |     "of",
4928 |     "general",
4929 |     "features",
4930 |     "which",
4931 |     "can",
4932 |     "be",
4933 |     "used",
4934 |     "to",
4935 |     "identify",
4936 |     "information-rich",
4937 |     "text",
4938 |     "segments"
4939 |   ],
4940 |   [
4941 |     "recent",
4942 |     "research",
4943 |     "focus",
4944 |     "has",
4945 |     "drifted",
4946 |     "to",
4947 |     "domain-specific",
4948 |     "summarization",
4949 |     "techniques",
4950 |     "that",
4951 |     "utilize",
4952 |     "the",
4953 |     "available",
4954 |     "knowledge",
4955 |     "specific",
4956 |     "to",
4957 |     "the",
4958 |     "domain",
4959 |     "of",
4960 |     "text"
4961 |   ],
4962 |   [
4963 |     "for",
4964 |     "example",
4965 |     "automatic",
4966 |     "summarization",
4967 |     "research",
4968 |     "on",
4969 |     "medical",
4970 |     "text",
4971 |     "generally",
4972 |     "attempts",
4973 |     "to",
4974 |     "utilize",
4975 |     "the",
4976 |     "various",
4977 |     "sources",
4978 |     "of",
4979 |     "codified",
4980 |     "medical",
4981 |     "knowledge",
4982 |     "and",
4983 |     "ontologies"
4984 |   ],
4985 |   [
4986 |     "the",
4987 |     "main",
4988 |     "drawback",
4989 |     "of",
4990 |     "the",
4991 |     "evaluation",
4992 |     "systems",
4993 |     "existing",
4994 |     "so",
4995 |     "far",
4996 |     "is",
4997 |     "that",
4998 |     "we",
4999 |     "need",
5000 |     "at",
5001 |     "least",
5002 |     "one",
5003 |     "reference",
5004 |     "summary",
5005 |     "and",
5006 |     "for",
5007 |     "some",
5008 |     "methods",
5009 |     "more",
5010 |     "than",
5011 |     "one",
5012 |     "to",
5013 |     "be",
5014 |     "able",
5015 |     "to",
5016 |     "compare",
5017 |     "summaries",
5018 |     "with",
5019 |     "models"
5020 |   ],
5021 |   [
5022 |     "this",
5023 |     "is",
5024 |     "a",
5025 |     "hard",
5026 |     "and",
5027 |     "expensive",
5028 |     "task"
5029 |   ],
5030 |   [
5031 |     "much",
5032 |     "effort",
5033 |     "has",
5034 |     "to",
5035 |     "be",
5036 |     "done",
5037 |     "in",
5038 |     "order",
5039 |     "to",
5040 |     "have",
5041 |     "corpus",
5042 |     "of",
5043 |     "texts",
5044 |     "and",
5045 |     "their",
5046 |     "corresponding",
5047 |     "summaries"
5048 |   ],
5049 |   [
5050 |     "furthermore",
5051 |     "for",
5052 |     "some",
5053 |     "methods",
5054 |     "not",
5055 |     "only",
5056 |     "do",
5057 |     "we",
5058 |     "need",
5059 |     "to",
5060 |     "have",
5061 |     "human-made",
5062 |     "summaries",
5063 |     "available",
5064 |     "for",
5065 |     "comparison",
5066 |     "but",
5067 |     "also",
5068 |     "manual",
5069 |     "annotation",
5070 |     "has",
5071 |     "to",
5072 |     "be",
5073 |     "performed",
5074 |     "in",
5075 |     "some",
5076 |     "of",
5077 |     "them",
5078 |     "e"
5079 |   ],
5080 |   [
5081 |     "scu",
5082 |     "in",
5083 |     "the",
5084 |     "pyramid",
5085 |     "method"
5086 |   ],
5087 |   [
5088 |     "in",
5089 |     "any",
5090 |     "case",
5091 |     "what",
5092 |     "the",
5093 |     "evaluation",
5094 |     "methods",
5095 |     "need",
5096 |     "as",
5097 |     "an",
5098 |     "input",
5099 |     "is",
5100 |     "a",
5101 |     "set",
5102 |     "of",
5103 |     "summaries",
5104 |     "to",
5105 |     "serve",
5106 |     "as",
5107 |     "gold",
5108 |     "standards",
5109 |     "and",
5110 |     "a",
5111 |     "set",
5112 |     "of",
5113 |     "automatic",
5114 |     "summaries"
5115 |   ],
5116 |   [
5117 |     "moreover",
5118 |     "they",
5119 |     "all",
5120 |     "perform",
5121 |     "a",
5122 |     "quantitative",
5123 |     "evaluation",
5124 |     "with",
5125 |     "regard",
5126 |     "to",
5127 |     "different",
5128 |     "similarity",
5129 |     "metrics"
5130 |   ],
5131 |   [
5132 |     "to",
5133 |     "overcome",
5134 |     "these",
5135 |     "problems",
5136 |     "we",
5137 |     "think",
5138 |     "that",
5139 |     "the",
5140 |     "quantitative",
5141 |     "evaluation",
5142 |     "might",
5143 |     "not",
5144 |     "be",
5145 |     "the",
5146 |     "only",
5147 |     "way",
5148 |     "to",
5149 |     "evaluate",
5150 |     "summaries",
5151 |     "and",
5152 |     "a",
5153 |     "qualitative",
5154 |     "automatic",
5155 |     "evaluation",
5156 |     "would",
5157 |     "be",
5158 |     "also",
5159 |     "important"
5160 |   ]
5161 | ]


--------------------------------------------------------------------------------
/test/tr-test.js:
--------------------------------------------------------------------------------
 1 | var assert = require('assert');
 2 | var should = require('should');
 3 | var expect = require('chai').expect;
 4 | var tr = require('../index');
 5 | var LONG_KEY_EX = require('./Automatic_Summarization-tokens.json');
 6 | var LONG_SENT_EX = require('./Automatic_Summarization-sents.json');
 7 | var fs = require('fs');
 8 | 
 9 | // TODO: right now just example code, no real unit tests!
describe('textrank', function () {
    // How many of the leading ranked entries each example prints.
    var TOP_N = 10;

    /**
     * Returns the leading `count` entries of `ranked`, or all of them when
     * the list holds fewer than `count` items. The input is not modified.
     */
    function takeTop(ranked, count) {
        var limit = Math.min(ranked.length, count);
        return ranked.slice(0, limit);
    }

    /**
     * Comparator ordering entries by ascending `vertex`.
     * NOTE(review): `vertex` is presumably the sentence's position in the
     * input document; confirm against the graph builder in index.js.
     */
    function byVertex(left, right) {
        return left.vertex - right.vertex;
    }

    describe('keyword extraction', function () {

        it('should compute top keywords', function () {
            // Build the keyword graph from the token fixture, then rank it.
            // No explicit iteration count is passed to textRank here.
            var keywordGraph = tr.keyExGraph(LONG_KEY_EX);
            var ranked = tr.textRank(keywordGraph);
            var topKeywords = takeTop(ranked, TOP_N);

            console.log('Top rated hits!');
            topKeywords.forEach(function (entry) {
                console.log(entry.name + ' --> ' + entry.score);
            });
        });
    });

    describe('sentence extraction', function () {

        it('should compute top sentences', function () {
            // Build the sentence graph from the sentence fixture, then rank
            // it, passing 40 as the second argument (the iteration count).
            var sentenceGraph = tr.sentExGraph(LONG_SENT_EX);
            var ranked = tr.textRank(sentenceGraph, 40);
            var topSentences = takeTop(ranked, TOP_N);

            // Put the selected sentences back into article order before
            // printing; the sort happens in place on the sliced copy.
            topSentences.sort(byVertex);

            console.log('Top rated hits!\n');
            topSentences.forEach(function (entry) {
                // Each entry's name is an array of tokens; print it as a line.
                console.log(entry.name.join(' ') + '\n');
            });
        });
    });

});
45 | 


--------------------------------------------------------------------------------