├── package.json ├── README.md ├── LICENSE └── index.js /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "keywordsextract", 3 | "version": "1.0.0", 4 | "description": "Extract keywords from website", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "bin": { 10 | "keywordsextract": "./index.js" 11 | }, 12 | "dependencies": { 13 | "node-readability": "^3.0.0", 14 | "gramophone": "^0.0.3", 15 | "commander": "^2.19.0", 16 | "sanitizer": "^0.1.3" 17 | }, 18 | "keywords": [ 19 | "keywords", 20 | "extract", 21 | "node" 22 | ], 23 | "author": "Vladimir Carrer (http://www.vcarrer.com)", 24 | "license": "MIT" 25 | } 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | keywords-extract - extract keywords from any website 4 | ======================================= 5 | 6 | Extract keywords with Node.js in the console 7 | 8 | * * * 9 | 10 | Command line tool for keyword extraction. This Node.js project relies mainly on two Node modules: "node-readability", which extracts the main text of a page and strips out the surrounding noise, and "gramophone", which extracts the keywords from that text. 11 | 12 | 13 | ### Install & Use 14 | 15 | ```shell 16 | $ npm i -g keywordsextract 17 | ``` 18 | 19 | Example: 20 | 21 | ```shell 22 | $ keywordsextract --url https://en.wikipedia.org/wiki/Search_engine_optimization --n 3,4 23 | ``` 24 | You can use two parameters: 25 | 26 | --url the URL of the page to analyse 27 | 28 | --n the number of words per keyword, for example (--n 1,4) means from 1 word up to a maximum of 4 words 29 | 30 | 31 | All the keywords will be displayed in the console, and a text file named after the article title (`<title>.txt`) will be created containing the keywords. 
32 | 33 | ### License 34 | 35 | This project is licensed under the MIT License 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Vladimir Carrer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
#!/usr/bin/env node

/**
 * keywordsextract - command line keyword extraction.
 *
 * Downloads the page given by --url, reduces it to its main article text with
 * node-readability, strips the remaining markup, extracts keywords with
 * gramophone, prints them to the console and saves them to "<title>.txt" in
 * the current working directory.
 *
 * Options:
 *   --url  address of the page to analyse (required)
 *   --n    n-gram sizes as "min,max" (e.g. "1,4" = phrases of 1 to 4 words)
 *          or a single size (e.g. "3"); defaults to 2,3
 */

var fs = require('fs'),
    read = require('node-readability'),
    sanitizer = require('sanitizer'),
    keyword = require('gramophone'),
    program = require('commander');

// n-gram sizes used when --n is not supplied.
var DEFAULT_NGRAMS = [2, 3];

// Upper bound for the generated file name (without extension); most file
// systems reject names longer than 255 bytes.
var MAX_FILE_NAME_LENGTH = 200;

/**
 * Turn the raw --n value into the list of n-gram sizes gramophone expects.
 *
 * @param {*} value Raw option value: "min,max", a single size such as "3",
 *     or a non-string (undefined / true) when the option carried no value.
 * @returns {number[]|null} Every size from min to max inclusive, the default
 *     sizes when no value was given, or null when the value is malformed.
 */
function parseNgrams(value) {
    if (typeof value !== 'string') {
        return DEFAULT_NGRAMS.slice();
    }

    var parts = value.split(',').map(function (part) {
        return part.trim();
    });

    var isValid = parts.length >= 1 && parts.length <= 2 && parts.every(function (part) {
        // Digits only, and at least 1 word per keyword.
        return /^\d+$/.test(part) && parseInt(part, 10) >= 1;
    });
    if (!isValid) {
        return null;
    }

    var first = parseInt(parts[0], 10);
    var second = parts.length === 2 ? parseInt(parts[1], 10) : first;
    var min = Math.min(first, second);
    var max = Math.max(first, second);

    var sizes = [];
    for (var size = min; size <= max; size++) {
        sizes.push(size);
    }
    return sizes;
}

/**
 * Build a file name (without extension) from an article title.
 *
 * The title comes from a remote page, so path separators, characters that are
 * reserved on common file systems and control characters are replaced to keep
 * the output inside the current directory.
 *
 * @param {*} title Article title as reported by node-readability.
 * @returns {string} A non-empty name that is safe to use as a single path segment.
 */
function safeFileName(title) {
    var name = String(title == null ? '' : title)
        .replace(/[\/\\:*?"<>|\u0000-\u001f]/g, '_')
        .trim()
        .slice(0, MAX_FILE_NAME_LENGTH);

    return name || 'keywords';
}

/**
 * Remove HTML markup from a string and collapse runs of blank lines.
 *
 * @param {string} html Text that may contain HTML.
 * @returns {string} Plain text with surrounding whitespace trimmed.
 */
function stripHTML(html) {
    // Identity URI policy: URLs are kept as they are while the markup is sanitized.
    var clean = sanitizer.sanitize(html, function (str) {
        return str;
    });

    // Drop every remaining tag (tags may span several lines).
    clean = clean.replace(/<(?:.|\n)*?>/gm, "");
    // Collapse two or more consecutive line breaks (with optional whitespace) into one.
    clean = clean.replace(/(?:(?:\r\n|\r|\n)\s*){2,}/ig, "\n");
    return clean.trim();
}

program
    .option('--url, [url]', 'The url')
    .option('--n, [ngrams]', 'Words')
    .parse(process.argv);

// "--url" without a value is reported by commander as `true`, so require a string.
if (typeof program.url !== 'string' || program.url === '') {
    console.error('Please add --url parameter. Something like this: $ keywordsextract --url https://en.wikipedia.org/wiki/Search_engine_optimization');
    process.exit(1);
}

var urlvalue = program.url;

// commander 2.x derives the property name from the long flag ("--n" -> program.n);
// program.ngrams is read as a fallback in case the flag is ever renamed.
var ngramsvalue = parseNgrams(program.n !== undefined ? program.n : program.ngrams);

if (ngramsvalue === null) {
    console.error('Invalid --n parameter. Use "min,max" with positive numbers, something like this: --n 1,4');
    process.exit(1);
}

read(urlvalue, function (err, article) {
    if (err || !article) {
        console.error('Could not read ' + urlvalue + ': ' + (err && err.message ? err.message : err));
        process.exitCode = 1;
        return;
    }

    var title1 = article.title;
    // node-readability yields `false` when it cannot find the main content.
    var content = article.content || '';
    var total = stripHTML(title1 + " " + content);

    // Release the underlying jsdom document once title and content are read.
    if (typeof article.close === 'function') {
        article.close();
    }

    var extraction_result = keyword.extract(total, {
        stem: true,
        ngrams: ngramsvalue
    });

    console.log(extraction_result);

    var fileName = safeFileName(title1) + ".txt";

    // fs.writeFile needs a string or buffer; join explicitly with "," which is
    // the format the implicit array-to-string coercion used to produce.
    fs.writeFile(fileName, extraction_result.join(','), function (writeErr) {
        if (writeErr) {
            console.error(writeErr);
            process.exitCode = 1;
            return;
        }

        console.log(fileName + " file was saved!");
    });
});