├── .eslintignore ├── .eslintrc ├── .travis.yml ├── example.js ├── .gitignore ├── package.json ├── index.js ├── LICENSE ├── README.md └── example.json /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules/** 2 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | env: 2 | node: true 3 | 4 | rules: 5 | no-use-before-define: [1, nofunc] 6 | quotes: [2, single] 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | 3 | node_js: 4 | - "0.10" 5 | 6 | script: 7 | - npm i nsp -g 8 | - npm shrinkwrap --dev 9 | - nsp audit-shrinkwrap || true 10 | - npm outdated --depth 0 11 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var getPage = require('summarizer').getPage; 4 | 5 | var uri = 'http://nodejs.org/api/documentation.html'; 6 | 7 | getPage(uri).then(function (data) { 8 | console.log(JSON.stringify(data, null, 2)); 9 | }, console.error); 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # Compiled binary addons (http://nodejs.org/api/addons.html) 20 | build/Release 21 | 22 | # Dependency directory 23 | # 
var Promise = require('promise');
var summarize = require('summarize');
var summary = require('node-summary');
var superagent = require('superagent');
var unfluff = require('unfluff');

// Promise-returning wrapper around node-summary's `summarize`, which is
// invoked below as summarize(title, text) with the callback supplied by
// Promise.denodeify.
var summarizeP = Promise.denodeify(summary.summarize);

/**
 * Fetches a remote page and builds a summary object for it.
 *
 * The resolved value is the object returned by `unfluff` for the page markup,
 * extended with three properties:
 *   - `raw`:     the response body text exactly as received,
 *   - `stats`:   the result of `summarize(raw)`,
 *   - `summary`: the result of node-summary for the extracted title and text.
 *
 * @param {string} uri Address of the page to fetch.
 * @returns {Promise} Resolves with the augmented page-content object.
 *   Rejects when the request fails, when content extraction or statistics
 *   generation throws, or when node-summary reports an error.
 */
module.exports.getPage = function (uri) {
  'use strict';

  // Return the chain itself rather than wrapping it in `new Promise(...)`.
  // With the wrapper, a synchronous throw from unfluff() or summarize() only
  // rejected the inner (discarded) promise, so the promise handed to the
  // caller never settled.
  return superagentGetP(uri).then(function (res) {
    var html = res.text;
    var pageContent = unfluff(html);

    pageContent.raw = html;
    pageContent.stats = summarize(html);

    return summarizeP(pageContent.title, pageContent.text).then(function (digest) {
      pageContent.summary = digest;
      return pageContent;
    });
  });
};

/**
 * Promise adapter for `superagent.get(uri).end(callback)`.
 *
 * NOTE(review): whether non-2xx responses are delivered through `err` depends
 * on the installed superagent version - confirm before relying on it.
 *
 * @param {string} uri Address to request.
 * @returns {Promise} Resolves with the superagent response object; rejects
 *   with the error superagent passes to the callback.
 */
function superagentGetP(uri) {
  'use strict';

  return new Promise(function (resolve, reject) {
    superagent.get(uri).end(function (err, res) {
      if (err) {
        return reject(err);
      }
      resolve(res);
    });
  });
}
23 | 24 | For more information, please refer to <http://unlicense.org> 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # summarizer 2 | 3 | [![Build Status: Travis](https://travis-ci.org/pdehaan/summarizer.svg?branch=master)](https://travis-ci.org/pdehaan/summarizer) 4 | 5 | Scrapes a remote page and creates a summary with statistics. 6 | 7 | This package uses a combination of the following modules: 8 | 9 | - [summarize](https://www.npmjs.org/package/summarize) — Summarize html content. 10 | - [node-summary](https://www.npmjs.org/package/node-summary) — Summarizes text using a naive summarization algorithm. 11 | - [unfluff](https://www.npmjs.org/package/unfluff) — A web page content extractor. 12 | 13 | ## Installation 14 | 15 | ```sh 16 | $ npm i summarizer 17 | ``` 18 | 19 | ### [Example usage](/example.js) 20 | 21 | ```js 22 | 'use strict'; 23 | 24 | var getPage = require('summarizer').getPage; 25 | 26 | var uri = 'http://nodejs.org/api/documentation.html'; 27 | 28 | getPage(uri).then(function (data) { 29 | console.log(JSON.stringify(data, null, 2)); 30 | }, console.error); 31 | ``` 32 | 33 | #### [Example output](/example.json) 34 | 35 | ```json 36 | { 37 | "title": "About this Documentation Node.js v0.10.31 Manual & Documentation", 38 | "lang": "en", 39 | "canonicalLink": "http://nodejs.org/api/documentation.html", 40 | "tags": [], 41 | "image": null, 42 | "videos": [], 43 | "text": "The goal of this documentation is to comprehensively explain the Node.js API, both from a reference as well as a conceptual point of view. ...", 44 | "raw": "\n\n...", 45 | "stats": { 46 | "ok": true, 47 | "sentiment": 0.018134715025906734, 48 | "title": "About this Documentation Node.js v0.10.31 Manual & Documentation", 49 | "topics": [ 50 | "Stability", 51 | "change", 52 | "..." 
53 | ], 54 | "words": 414, 55 | "difficulty": 0.6416666666666667, 56 | "minutes": 4, 57 | "image": null 58 | }, 59 | "summary": "About this Documentation Node.js v0.10.31 Manual & Documentation..." 60 | } 61 | 62 | ``` 63 | -------------------------------------------------------------------------------- /example.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "About this Documentation Node.js v0.10.31 Manual & Documentation", 3 | "lang": "en", 4 | "canonicalLink": "http://nodejs.org/api/documentation.html", 5 | "tags": [], 6 | "image": null, 7 | "videos": [], 8 | "text": "The goal of this documentation is to comprehensively explain the Node.js API, both from a reference as well as a conceptual point of view. Each section describes a built-in module or high-level concept.\n\nWhere appropriate, property types, method arguments, and the arguments provided to event handlers are detailed in a list underneath the topic heading.\n\nEvery .html document has a corresponding .json document presenting the same information in a structured manner. This feature is experimental, and added for the benefit of IDEs and other utilities that wish to do programmatic things with the documentation.\n\nEvery .html and .json file is generated based on the corresponding\n\n.markdown file in the doc/api/ folder in node's source tree. The documentation is generated using the tools/doc/generate.js program. The HTML template is located at doc/template.html.\n\nThroughout the documentation, you will see indications of a section's stability. The Node.js API is still somewhat changing, and as it matures, certain parts are more reliable than others. Some are so proven, and so relied upon, that they are unlikely to ever change at all. 
Others are brand new and experimental, or known to be hazardous and in the process of being redesigned.\n\nThe stability indices are as follows:\n\nStability: 0 - Deprecated\n\nThis feature is known to be problematic, and changes are\n\nplanned. Do not rely on it. Use of the feature may cause warnings. Backwards\n\ncompatibility should not be expected.Stability: 1 - Experimental\n\nThis feature was introduced recently, and may change\n\nor be removed in future versions. Please try it out and provide feedback.\n\nIf it addresses a use-case that is important to you, tell the node core team.Stability: 2 - Unstable\n\nThe API is in the process of settling, but has not yet had\n\nsufficient real-world testing to be considered stable. Backwards-compatibility\n\nwill be maintained if reasonable.Stability: 3 - Stable\n\nThe API has proven satisfactory, but cleanup in the underlying\n\ncode may cause minor changes. Backwards-compatibility is guaranteed.Stability: 4 - API Frozen\n\nThis API has been tested extensively in production and is\n\nunlikely to ever have to change.Stability: 5 - Locked\n\nUnless serious bugs are found, this code will not ever\n\nchange. Please do not suggest changes in this area; they will be refused.\n\nStability: 1 - Experimental\n\nEvery HTML file in the markdown has a corresponding JSON file with the same data.\n\nThis feature is new as of node v0.6.12. It is experimental.", 9 | "raw": "\n\n\n \n About this Documentation Node.js v0.10.31 Manual & Documentation\n \n \n \n\n\n
\n \n \"node.js\"\n \n
\n
\n \n\n
\n
\n

Node.js v0.10.31 Manual & Documentation

\n \n
\n
\n\n
\n

Table of Contents

\n \n\n
\n\n
\n

About this Documentation#

\n\n\n

The goal of this documentation is to comprehensively explain the Node.js\nAPI, both from a reference as well as a conceptual point of view. Each\nsection describes a built-in module or high-level concept.\n\n

\n

Where appropriate, property types, method arguments, and the arguments\nprovided to event handlers are detailed in a list underneath the topic\nheading.\n\n

\n

Every .html document has a corresponding .json document presenting\nthe same information in a structured manner. This feature is\nexperimental, and added for the benefit of IDEs and other utilities that\nwish to do programmatic things with the documentation.\n\n

\n

Every .html and .json file is generated based on the corresponding\n.markdown file in the doc/api/ folder in node's source tree. The\ndocumentation is generated using the tools/doc/generate.js program.\nThe HTML template is located at doc/template.html.\n\n

\n

Stability Index#

\n\n\n

Throughout the documentation, you will see indications of a section's\nstability. The Node.js API is still somewhat changing, and as it\nmatures, certain parts are more reliable than others. Some are so\nproven, and so relied upon, that they are unlikely to ever change at\nall. Others are brand new and experimental, or known to be hazardous\nand in the process of being redesigned.\n\n

\n

The stability indices are as follows:\n\n

\n
Stability: 0 - Deprecated\nThis feature is known to be problematic, and changes are\nplanned.  Do not rely on it.  Use of the feature may cause warnings.  Backwards\ncompatibility should not be expected.
Stability: 1 - Experimental\nThis feature was introduced recently, and may change\nor be removed in future versions.  Please try it out and provide feedback.\nIf it addresses a use-case that is important to you, tell the node core team.
Stability: 2 - Unstable\nThe API is in the process of settling, but has not yet had\nsufficient real-world testing to be considered stable. Backwards-compatibility\nwill be maintained if reasonable.
Stability: 3 - Stable\nThe API has proven satisfactory, but cleanup in the underlying\ncode may cause minor changes.  Backwards-compatibility is guaranteed.
Stability: 4 - API Frozen\nThis API has been tested extensively in production and is\nunlikely to ever have to change.
Stability: 5 - Locked\nUnless serious bugs are found, this code will not ever\nchange.  Please do not suggest changes in this area; they will be refused.

JSON Output#

\n
Stability: 1 - Experimental

Every HTML file in the markdown has a corresponding JSON file with the\nsame data.\n\n

\n

This feature is new as of node v0.6.12. It is experimental.\n

\n\n
\n
\n
\n
\n Joyent\n \n\n

Copyright Joyent, Inc, Node.js is a trademark of Joyent, Inc. View license.

\n
\n\n \n \n \n \n\n\n\n", 10 | "stats": { 11 | "ok": true, 12 | "sentiment": 0.018134715025906734, 13 | "title": "About this Documentation Node.js v0.10.31 Manual & Documentation", 14 | "topics": [ 15 | "Stability", 16 | "change", 17 | "API", 18 | "feature", 19 | "documentation", 20 | "node", 21 | "html", 22 | "Node", 23 | "code", 24 | "Backwards-compatibility", 25 | "Experimental", 26 | "cause", 27 | "proces", 28 | "stability", 29 | "template", 30 | "HTML", 31 | "markdown", 32 | "json", 33 | "argument", 34 | "section", 35 | "js api", 36 | "Documentation" 37 | ], 38 | "words": 414, 39 | "difficulty": 0.6416666666666667, 40 | "minutes": 4, 41 | "image": null 42 | }, 43 | "summary": "About this Documentation Node.js v0.10.31 Manual & Documentation\n Each section describes a built-in module or high-level concept.\n This feature is experimental, and added for the benefit of IDEs and other utilities that wish to do programmatic things with the documentation.\n The documentation is generated using the tools/doc/generate.js program. \n The Node.js API is still somewhat changing, and as it matures, certain parts are more reliable than others. \n Use of the feature may cause warnings. \n Please try it out and provide feedback.\n Please do not suggest changes in this area; they will be refused.\n It is experimental." 44 | } 45 | --------------------------------------------------------------------------------