├── LICENSE.md
├── README.md
├── driver.js
├── index.html
└── summarizer.js
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT: http://arnavroy.mit-license.org
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TextSummarizer
2 | ==============
3 |
4 | Client-side extractive text summarization in JavaScript, based on [TextRank](http://www.cse.unt.edu/~rada/papers/mihalcea.emnlp04.pdf). No network I/O is involved; all computation happens on the local machine.
5 |
--------------------------------------------------------------------------------
/driver.js:
--------------------------------------------------------------------------------
$(document).ready(initJS);

/**
 * Page bootstrap: wires the placeholder behaviour of the input box and the
 * click handler of the summarize button.
 *
 * On click, the input text is split into sentences, ranked with
 * Summarizer.Utility.calculatePageRank, and the top-ranked half of the
 * sentences is written to the output box in original text order.
 */
function initJS() {
  // Configure this object for tweaking summarization params.
  var configObj = {
    "maxIter": 100,
    "dampingFactor": 0.85,
    "delta": 0.5
  };

  var summarizeButton = $("#summarizeButton")[0];
  var inputText = $("#input")[0];
  var outputText = $("#output")[0];

  // Wrap the raw elements once instead of re-wrapping on every access.
  var $input = $(inputText);
  var $output = $(outputText);

  // Gaining focus clears the default hint text.
  $input.focus(function () {
    console.log("Focus event!");
    var currentValue = $input.val();
    console.log(currentValue);
    if (currentValue == "Insert the text to be summarized here...") {
      $input.val("");
    }
  });

  // Losing focus with an empty box restores the default hint text.
  $input.blur(function () {
    console.log("Blur event!");
    var currentValue = $input.val();
    console.log(currentValue);
    if (currentValue == "") {
      $input.val("Insert the text to be summarized here...");
    }
  });

  $(summarizeButton).on("click", function (event) {
    var inputToSummarize = $.trim($input.val());
    if (inputToSummarize.length == 0) {
      $output.val("No text to be summarized...");
      return;
    }

    // Invoke the summarizer algo.
    var sentences = Summarizer.Utility.getSentences(inputToSummarize);
    var graph = Summarizer.Utility.makeGraph(sentences);
    var result = Summarizer.Utility.calculatePageRank(
      graph,
      configObj.maxIter,
      configObj.dampingFactor,
      configObj.delta
    );

    // Flatten the result map into a list, remembering each sentence's
    // position in the original text.
    var ranked = [];
    var position = 0;
    _.each(result, function (v, k) {
      ranked.push({
        "sentence": v.sentence,
        "PR": v.PR,
        "idx": position++
      });
      console.log("sentence: " + v.sentence + ", PR: " + v.PR);
    });

    // Highest PR first.
    ranked.sort(function (a, b) {
      return b.PR - a.PR;
    });

    // Keep half of the sentences; if that rounds down to zero, keep them all.
    var halfNumLines = Math.floor(ranked.length / 2);
    if (halfNumLines == 0) {
      halfNumLines = ranked.length;
    }

    // Take the top-ranked sentences and put them back in the order in which
    // they occurred in the original text.
    var selected = ranked.slice(0, halfNumLines);
    selected.sort(function (a, b) {
      return a.idx - b.idx;
    });

    var finalResult = selected.reduce(function (soFar, entry) {
      return soFar + entry.sentence + ". ";
    }, "");
    $output.val(finalResult);
  });
}
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
16 |
18 |
20 |
21 |
22 |
24 |
25 |
26 |
27 |
28 |
Text summarization on the client side
29 |
30 |
31 |
32 |
42 |
43 |
44 |
45 |
46 |
47 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/summarizer.js:
--------------------------------------------------------------------------------
// Namespace for the TextRank-style extractive summarizer.
var Summarizer = {};
Summarizer.Utility = {};

/**
 * Get the text content of an HTML string.
 *
 * The markup is parsed by assigning it to the innerHTML of a detached DIV.
 *
 * NOTE(review): assigning untrusted markup to innerHTML can still trigger
 * inline event handlers (e.g. <img onerror=...>) even on a detached element.
 * Only pass trusted markup, or sanitize it first.
 *
 * @param {string} someHtmlDoc HTML markup.
 * @returns {string} The element's textContent, falling back to innerText.
 */
Summarizer.Utility.getTextFromHtml = function (someHtmlDoc) {
  var tmp = document.createElement("DIV");
  tmp.innerHTML = someHtmlDoc;
  return tmp.textContent || tmp.innerText;
};

/**
 * Split text into sentences.
 *
 * The text is cut at every ".", "?", "!" and newline; each piece is trimmed
 * and empty pieces are dropped. The terminating punctuation is not kept.
 *
 * @param {string} text Text to split; null/undefined yields an empty list.
 * @returns {string[]} A plain array of non-empty, trimmed sentences.
 */
Summarizer.Utility.getSentences = function (text) {
  if (text == null) {
    return [];
  }
  return String(text)
    .split(/\. |\.|\?|!|\n/g)
    .map(function (piece) {
      return piece.trim();
    })
    .filter(function (piece) {
      return piece.length > 0;
    });
};

/**
 * Calculate the similarity between two sentences.
 *
 * Sentences are split on single spaces. The score is the number of distinct
 * words of sentence1 that also occur in sentence2 (exact, case-sensitive
 * match), divided by log(|words1|) + log(|words2|).
 *
 * @param {string} sentence1 First sentence.
 * @param {string} sentence2 Second sentence.
 * @returns {number} Similarity score; 0 when the normalizer is 0, which
 *     happens when both sentences consist of a single word.
 */
Summarizer.Utility.calculateSimilarity = function (sentence1, sentence2) {
  var words1 = sentence1.split(" ");
  var words2 = sentence2.split(" ");

  // Distinct words of words1 that are also present in words2.
  var sharedWords = words1.filter(function (word, position) {
    return words1.indexOf(word) === position && words2.indexOf(word) !== -1;
  });

  var sumOfLengths = Math.log(words1.length) + Math.log(words2.length);
  if (sumOfLengths === 0) {
    return 0;
  }
  return sharedWords.length / sumOfLengths;
};

/**
 * Build the sentence similarity graph.
 *
 * Every pair of sentences is connected in both directions with the pair's
 * similarity score as the edge weight. Adjacency lists are only created for
 * sentences that take part in at least one pair, so a single-sentence input
 * produces a graph with no adjacency lists at all.
 *
 * @param {ArrayLike<string>} sentences Sentences, indexed 0..length-1.
 * @returns {Object} Map from sentence index to an array of
 *     {node, weight} edges, plus a `sentenceIdLookup` property holding the
 *     `sentences` argument itself.
 */
Summarizer.Utility.makeGraph = function (sentences) {
  var graph = {};
  for (var idx1 = 0; idx1 < sentences.length; ++idx1) {
    for (var idx2 = idx1 + 1; idx2 < sentences.length; ++idx2) {
      if (graph[idx1] == undefined) {
        graph[idx1] = [];
      }
      if (graph[idx2] == undefined) {
        graph[idx2] = [];
      }

      var similarityScore = Summarizer.Utility.calculateSimilarity(
        sentences[idx1],
        sentences[idx2]
      );
      graph[idx1].push({
        "node": idx2,
        "weight": similarityScore
      });
      graph[idx2].push({
        "node": idx1,
        "weight": similarityScore
      });
    }
  }
  // Include a lookup from the sentence id to the actual sentence.
  graph.sentenceIdLookup = sentences;
  return graph;
};

/**
 * Page Rank calculation driver.
 *
 * Every node starts with a rank of 1.0. runPageRankOnce is repeated until
 * the largest per-node change is at most delta / (number of nodes), or
 * until maxIterations passes have been made.
 *
 * @param {Object} graph Graph as produced by makeGraph.
 * @param {number} maxIterations Upper bound on the number of passes.
 * @param {number} dampingFactor Damping factor applied to neighbour ranks.
 * @param {number} delta Convergence tolerance, divided by the node count.
 * @returns {Object} Map from sentence index to {PR, sentence}, where PR is
 *     the final rank divided by the number of nodes.
 */
Summarizer.Utility.calculatePageRank = function (graph, maxIterations,
    dampingFactor, delta) {
  var pageRankStruct = {};
  var totalWeight = {};
  var totalNumNodes = graph.sentenceIdLookup.length; // Number of nodes.
  var idx;

  for (idx = 0; idx < totalNumNodes; ++idx) {
    pageRankStruct[idx] = {
      "oldPR": 1.0,
      "newPR": 0.0
    };
    totalWeight[idx] = 0.0;
  }

  // Sum of the edge weights attached to each node.
  for (idx = 0; idx < totalNumNodes; ++idx) {
    var adjacencyList = graph[idx];
    if (adjacencyList == undefined) {
      continue;
    }
    // Each entry of the adjacency list holds the neighbour's index ("node")
    // and the similarity score ("weight").
    for (var edge = 0; edge < adjacencyList.length; ++edge) {
      totalWeight[idx] += adjacencyList[edge]["weight"];
    }
  }

  for (var iter = 0; iter < maxIterations; ++iter) {
    // Declared locally: the undeclared assignment used previously leaked a
    // global and threw a ReferenceError in strict mode / ES modules.
    var maxPRChange = Summarizer.Utility.runPageRankOnce(graph, pageRankStruct,
      totalWeight, totalNumNodes, dampingFactor);
    if (maxPRChange <= (delta / totalNumNodes)) {
      break; // Converged.
    }
  }

  var pageRankResults = {};
  for (idx = 0; idx < totalNumNodes; ++idx) {
    pageRankResults[idx] = {
      "PR": pageRankStruct[idx]["oldPR"] / totalNumNodes,
      "sentence": graph.sentenceIdLookup[idx]
    };
  }
  return pageRankResults;
};

/**
 * Single iteration of Page Rank.
 *
 * Mutates pageRankStruct in place: after the call every node's "oldPR"
 * holds its updated rank and "newPR" is reset to 0.0.
 *
 * @param {Object} graph Graph as produced by makeGraph.
 * @param {Object} pageRankStruct Map from node index to {oldPR, newPR}.
 * @param {Object} totalWeight Map from node index to the sum of its edge
 *     weights.
 * @param {number} totalNumNodes Number of nodes.
 * @param {number} dampingFactor Damping factor applied to neighbour ranks.
 * @returns {number} The largest absolute rank change of any node.
 */
Summarizer.Utility.runPageRankOnce = function (graph, pageRankStruct,
    totalWeight, totalNumNodes, dampingFactor) {
  var sinkContrib = 0.0;
  var idx;

  for (idx = 0; idx < totalNumNodes; ++idx) {
    var neighbours = graph[idx];
    if (neighbours == undefined || neighbours.length === 0) {
      // Sink: its rank is pooled and shared out over all nodes below. Its
      // own newPR stays at the 0.0 it was reset to by the previous pass.
      sinkContrib += pageRankStruct[idx]["oldPR"];
      continue;
    }

    var wt = 0.0;
    // Now iterate over all the nodes that are pointing to this node.
    for (var edge = 0; edge < neighbours.length; ++edge) {
      var adjNode = neighbours[edge];
      var node = adjNode["node"];
      // Total weight shared by this adjacent node and its neighbours.
      var sharedWt = totalWeight[node];
      if (sharedWt !== 0) { // To prevent NaN from a division by zero.
        wt += (adjNode["weight"] / sharedWt) * pageRankStruct[node]["oldPR"];
      }
    }
    wt *= dampingFactor;
    wt += (1 - dampingFactor);
    // Update the structure with the new PR.
    pageRankStruct[idx]["newPR"] = wt;
  }

  // Apply the sink contribution to every node.
  sinkContrib /= totalNumNodes;
  var maxChange = 0.0;
  for (idx = 0; idx < totalNumNodes; ++idx) {
    var entry = pageRankStruct[idx];
    entry["newPR"] += sinkContrib;
    // Track the max PR change to report back.
    var change = Math.abs(entry["newPR"] - entry["oldPR"]);
    if (change > maxChange) {
      maxChange = change;
    }
    // Set old PR to new PR for the next iteration.
    entry["oldPR"] = entry["newPR"];
    entry["newPR"] = 0.0;
  }
  return maxChange;
};
152 |
--------------------------------------------------------------------------------