$(document).ready(initJS);

/**
 * Wire up the summarizer page once the DOM is ready.
 *
 * Installs placeholder handling on the "#input" field and a click handler on
 * "#summarizeButton" that runs the TextRank pipeline from Summarizer.Utility
 * and writes the resulting summary into "#output".
 */
function initJS() {
    // Configure this object for tweaking summarization params.
    var configObj = {
        "maxIter": 100,
        "dampingFactor": 0.85,
        "delta": 0.5
    };
    var placeholderText = "Insert the text to be summarized here...";
    var emptyInputMessage = "No text to be summarized...";

    var summarizeButton = $("#summarizeButton")[0];
    var inputText = $("#input")[0];
    var outputText = $("#output")[0];

    // On focus the default input text should go away.
    $(inputText).focus(function () {
        if ($(inputText).val() === placeholderText) {
            $(inputText).val("");
        }
    });

    // On blur the default input text should be injected.
    $(inputText).blur(function () {
        if ($(inputText).val() === "") {
            $(inputText).val(placeholderText);
        }
    });

    $(summarizeButton).on("click", function () {
        var inputToSummarize = $.trim($(inputText).val());

        // The untouched placeholder is not user content, so treat it as empty.
        var hasUserText = inputToSummarize.length > 0 &&
            inputToSummarize !== placeholderText;
        var sentences = hasUserText ?
            Summarizer.Utility.getSentences(inputToSummarize) : [];

        // Input made only of separators (e.g. "...") yields no sentences.
        if (sentences.length === 0) {
            $(outputText).val(emptyInputMessage);
            return;
        }

        // Invoke the summarizer algo.
        var graph = Summarizer.Utility.makeGraph(sentences);
        var result = Summarizer.Utility.calculatePageRank(graph, configObj.maxIter,
            configObj.dampingFactor, configObj.delta);

        // The result is keyed by sentence index; remember each sentence's
        // position so the original order can be restored after ranking.
        var ranked = Object.keys(result).map(function (key, position) {
            return {
                "sentence": result[key].sentence,
                "PR": result[key].PR,
                "idx": position
            };
        });

        // Sort in descending order of PR.
        ranked.sort(function (a, b) {
            return b.PR - a.PR;
        });

        // Just returning half the original number of lines (but at least
        // everything when there is only a single sentence).
        var halfNumLines = Math.floor(ranked.length / 2);
        if (halfNumLines === 0) {
            halfNumLines = ranked.length;
        }

        // Keep the top-ranked half and put it back in the order in which the
        // sentences occur in the original text.
        var selected = ranked.slice(0, halfNumLines);
        selected.sort(function (a, b) {
            return a.idx - b.idx;
        });

        var finalResult = selected.map(function (entry) {
            return entry.sentence + ". ";
        }).join("");
        $(outputText).val(finalResult);
    });
}
28 |

Text summarization on the client side

29 |
30 |
31 |
32 |
33 | 34 | 35 |
36 |
37 | 38 |
39 |
40 | 41 |
42 | 43 |
44 | 45 | 46 |
47 |
48 | 49 |
50 |
51 | 52 |
53 |
54 |
55 |
56 |
57 |
58 |
// Extractive text summarization helpers based on TextRank: sentences are the
// graph nodes, word overlap is the edge weight, and a weighted PageRank scores
// every sentence. Only ES5 built-ins are used.
var Summarizer = {};
Summarizer.Utility = {};

/**
 * Get text from an HTML document.
 *
 * The markup is parsed inside a separate document obtained from
 * document.implementation.createHTMLDocument instead of an element owned by
 * the live page, so the input is never attached to the page being displayed.
 *
 * @param {string} someHtmlDoc HTML markup.
 * @returns {string} Text content of the markup ("" when there is none).
 */
Summarizer.Utility.getTextFromHtml = function (someHtmlDoc) {
    var detachedDoc = document.implementation.createHTMLDocument("");
    detachedDoc.body.innerHTML = someHtmlDoc;
    return detachedDoc.body.textContent || detachedDoc.body.innerText || "";
};

/**
 * Get sentences from text.
 *
 * The text is split on ".", "?", "!" and newlines; every piece is trimmed and
 * empty pieces are dropped. The terminators themselves are not kept.
 *
 * @param {string} text Text to split.
 * @returns {string[]} Plain array of non-empty, trimmed sentences.
 */
Summarizer.Utility.getSentences = function (text) {
    return text.split(/\. |\.|\?|!|\n/g)
        .map(function (piece) {
            return piece.trim();
        })
        .filter(function (piece) {
            return piece.length > 0;
        });
};

/**
 * Calculate similarity between 2 sentences.
 *
 * The score is the number of distinct words of sentence1 that also occur in
 * sentence2, divided by log(wordCount1) + log(wordCount2). Words are compared
 * case-sensitively.
 *
 * @param {string} sentence1 First sentence.
 * @param {string} sentence2 Second sentence.
 * @returns {number} Similarity score; 0 when the normalizer is 0 (both
 *     sentences consist of a single word).
 */
Summarizer.Utility.calculateSimilarity = function (sentence1, sentence2) {
    // Split on runs of whitespace so that repeated spaces or tabs do not
    // produce empty-string "words" that would count as shared vocabulary.
    var words1 = sentence1.trim().split(/\s+/);
    var words2 = sentence2.trim().split(/\s+/);

    // Distinct words of sentence1 that are also present in sentence2.
    var sharedWords = words1.filter(function (word, position) {
        return words1.indexOf(word) === position && words2.indexOf(word) !== -1;
    });

    var sumOfLogLengths = Math.log(words1.length) + Math.log(words2.length);
    if (sumOfLogLengths === 0) {
        return 0;
    }
    return sharedWords.length / sumOfLogLengths;
};

/**
 * Make the sentence graph.
 *
 * Every pair of sentences is connected in both directions with the pair's
 * similarity score as weight. graph[i] is the adjacency list of sentence i,
 * an array of {node, weight} objects; it is only created for sentences that
 * take part in at least one pair, so it is absent when there are fewer than
 * two sentences.
 *
 * @param {string[]} sentences Sentences (any array-like with length works).
 * @returns {Object} Graph keyed by sentence index, plus sentenceIdLookup.
 */
Summarizer.Utility.makeGraph = function (sentences) {
    var graph = {};
    for (var idx1 = 0; idx1 < sentences.length; ++idx1) {
        for (var idx2 = idx1 + 1; idx2 < sentences.length; ++idx2) {
            if (graph[idx1] === undefined) {
                graph[idx1] = [];
            }
            if (graph[idx2] === undefined) {
                graph[idx2] = [];
            }
            var similarityScore = Summarizer.Utility.calculateSimilarity(
                sentences[idx1], sentences[idx2]);
            graph[idx1].push({
                "node": idx2,
                "weight": similarityScore
            });
            graph[idx2].push({
                "node": idx1,
                "weight": similarityScore
            });
        }
    }
    // Include a lookup from the sentenceId to the actual sentence.
    graph.sentenceIdLookup = sentences;
    return graph;
};

/**
 * Page Rank calculation driver.
 *
 * Runs runPageRankOnce until the largest per-node change is at most
 * delta / numberOfNodes or maxIterations passes have been made; in the
 * latter case the scores of the last pass are reported.
 *
 * @param {Object} graph Graph as produced by makeGraph.
 * @param {number} maxIterations Upper bound on the number of passes.
 * @param {number} dampingFactor Damping factor of the PageRank update.
 * @param {number} delta Convergence tolerance (divided by the node count).
 * @returns {Object} Map from sentence index to {PR, sentence}, where PR is
 *     the final score divided by the number of nodes. Empty for no nodes.
 */
Summarizer.Utility.calculatePageRank = function (graph, maxIterations,
    dampingFactor, delta) {
    var pageRankResults = {};
    var totalNumNodes = graph.sentenceIdLookup.length; // Number of nodes.
    if (totalNumNodes === 0) {
        // Nothing to rank; also avoids dividing by zero below.
        return pageRankResults;
    }

    var pageRankStruct = {};
    var totalWeight = {};
    var idx;
    for (idx = 0; idx < totalNumNodes; ++idx) {
        pageRankStruct[idx] = {
            "oldPR": 1.0,
            "newPR": 0.0
        };
        // The adjacency list is an array of objects holding the neighbour's
        // index ("node") and the similarity score ("weight"); nodes without
        // a list get a total weight of 0.
        totalWeight[idx] = (graph[idx] || []).reduce(function (sum, edge) {
            return sum + edge["weight"];
        }, 0.0);
    }

    var changeThreshold = delta / totalNumNodes;
    for (var iter = 0; iter < maxIterations; ++iter) {
        var maxPRChange = Summarizer.Utility.runPageRankOnce(graph, pageRankStruct,
            totalWeight, totalNumNodes, dampingFactor);
        if (maxPRChange <= changeThreshold) {
            break; // Converged.
        }
    }

    for (idx = 0; idx < totalNumNodes; ++idx) {
        pageRankResults[idx] = {
            "PR": pageRankStruct[idx]["oldPR"] / totalNumNodes,
            "sentence": graph.sentenceIdLookup[idx]
        };
    }
    return pageRankResults;
};

/**
 * Single iteration of Page Rank.
 *
 * Updates pageRankStruct in place: after the call every node's "oldPR" holds
 * the freshly computed score and "newPR" is reset to 0.
 *
 * @param {Object} graph Graph as produced by makeGraph.
 * @param {Object} pageRankStruct Map from node index to {oldPR, newPR}.
 * @param {Object} totalWeight Map from node index to the sum of its edge weights.
 * @param {number} totalNumNodes Number of nodes.
 * @param {number} dampingFactor Damping factor of the update.
 * @returns {number} Largest absolute score change of any node in this pass.
 */
Summarizer.Utility.runPageRankOnce = function (graph, pageRankStruct,
    totalWeight, totalNumNodes, dampingFactor) {
    var sinkContrib = 0.0;
    var idx;
    for (idx = 0; idx < totalNumNodes; ++idx) {
        var neighbours = graph[idx];
        if (neighbours === undefined || neighbours === null || neighbours.length === 0) {
            // Sink: its score is spread over all nodes below; its own newPR
            // stays at 0 until that share is added.
            sinkContrib += pageRankStruct[idx]["oldPR"];
            continue;
        }
        var wt = 0.0;
        // Now iterate over all the nodes that are pointing to this node.
        neighbours.forEach(function (adjNode) {
            var node = adjNode["node"];
            // Get the total weight shared by this adjacent node and its neighbours.
            var sharedWt = totalWeight[node];
            if (sharedWt !== 0) { // To prevent NaN
                wt += (adjNode["weight"] / sharedWt) * pageRankStruct[node]["oldPR"];
            }
        });
        wt *= dampingFactor;
        wt += (1 - dampingFactor);
        // Update the structure w/ the new PR.
        pageRankStruct[idx]["newPR"] = wt;
    }

    // Apply the sink contrib overall.
    sinkContrib /= totalNumNodes;
    var maxChange = 0.0;
    for (idx = 0; idx < totalNumNodes; ++idx) {
        var entry = pageRankStruct[idx];
        entry["newPR"] += sinkContrib;
        // Report back the max PR change.
        var change = Math.abs(entry["newPR"] - entry["oldPR"]);
        if (change > maxChange) {
            maxChange = change;
        }
        // Set old PR to new PR for next iteration.
        entry["oldPR"] = entry["newPR"];
        entry["newPR"] = 0.0;
    }
    return maxChange;
};