├── package.json
├── README.md
├── LICENSE
└── index.js


/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "keywordsextract",
 3 |   "version": "1.0.0",
 4 |   "description": "Extract keywords from website",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1"
 8 |   },
 9 |   "bin": {
10 |     "keywordsextract": "./index.js"
11 |   },
12 |   "dependencies": {
13 |     "node-readability": "^3.0.0",
14 |     "gramophone": "^0.0.3",
15 |     "commander": "^2.19.0",
16 |     "sanitizer": "^0.1.3"
17 |   },
18 |   "keywords": [
19 |     "keywords",
20 |     "extract",
21 |     "node"
22 |   ],
23 |   "author": "Vladimir Carrer <vladocar@gmail.com> (http://www.vcarrer.com)",
24 |   "license": "MIT"
25 | }
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | keywords-extract - extract keywords from any website
 4 | =======================================
 5 | 
 6 | Extract keywords with Node.js in the console
 7 | 
 8 | * * *
 9 | 
10 | Command line tool for keyword extraction. This Node.js project relies mainly on two modules: "node-readability", which isolates the main text of a page and strips away the surrounding noise, and "gramophone", which extracts the keywords from that text.
11 | 
12 | 
13 | ### Install & Use
14 | 
15 | ```shell
16 | $ npm i -g keywordsextract
17 | ```
18 | 
19 | Example:
20 | 
21 | ```shell
22 | $ keywordsextract --url https://en.wikipedia.org/wiki/Search_engine_optimization --n 3,4
23 | ```
24 | You can use two parameters:
25 | 
26 | --url the address (URL) of the page to analyze
27 | 
28 | --n the number of words per keyword, given as a range; for example, --n 1,4 means keywords from 1 word up to a maximum of 4 words.
29 | 
30 | 
31 | All the keywords are displayed in the console, and a text file named after the page title (title.txt) is created containing the keywords.
32 | 
33 | ### License
34 | 
35 | This project is licensed under the MIT License
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Vladimir Carrer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | 
 3 | var read = require('node-readability'),
 4 |     sanitizer = require('sanitizer'),
 5 |     keyword = require('gramophone'),
 6 |     program = require('commander'),
 7 |     urlvalue = "",
 8 |     ngramsvalue;
 9 | 
10 | program
11 |     .option('--url, [url]', 'The url')
12 |     .option('--n, [ngrams]', 'Words')
13 |     .parse(process.argv);
14 | 
15 | if (program.url) urlvalue = program.url;
16 | else process.exit(console.log('Please add --url parameter. Something like this: $ keywordsextract --url https://en.wikipedia.org/wiki/Search_engine_optimization'));
17 | 
18 | if (program.ngrams) ngramsvalue = program.ngrams
19 | else ngramsvalue = 2, 3;
20 | 
21 | 
22 | read(urlvalue, function(err, article, title, meta) {
23 | 
24 |     var title1 = article.title;
25 |     var total = stripHTML(article.title + " " + article.content);
26 | 
27 |     var extraction_result = keyword.extract(total, {
28 |         stem: true,
29 |         ngrams: [2, 3]
30 |     });
31 | 
32 |     console.log(extraction_result);
33 | 
34 |     var fs = require('fs');
35 |     fs.writeFile(title1 + ".txt", extraction_result, function(err) {
36 |         if (err) {
37 |             return console.log(err);
38 |         }
39 | 
40 |         console.log(title1 + ".txt file was saved!");
41 |     });
42 | 
43 | });
44 | 
45 | 
46 | 
47 | function stripHTML(html) {
48 |     var clean = sanitizer.sanitize(html, function(str) {
49 |         return str;
50 |     });
51 | 
52 |     clean = clean.replace(/<(?:.|\n)*?>/gm, "");
53 |     clean = clean.replace(/(?:(?:\r\n|\r|\n)\s*){2,}/ig, "\n");
54 |     return clean.trim();
55 | }
56 | 


--------------------------------------------------------------------------------