├── .gitignore ├── LICENSE ├── README.md ├── bin └── cli.js ├── index.html ├── lib └── fetch-dom.js ├── package.json └── tests ├── server.js └── target.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | package-lock.json 3 | logs 4 | *.log 5 | npm-debug.log* 6 | yarn-debug.log* 7 | yarn-error.log* 8 | 9 | # Runtime data 10 | pids 11 | *.pid 12 | *.seed 13 | *.pid.lock 14 | 15 | # Directory for instrumented libs generated by jscoverage/JSCover 16 | lib-cov 17 | 18 | # Coverage directory used by tools like istanbul 19 | coverage 20 | 21 | # nyc test coverage 22 | .nyc_output 23 | 24 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 25 | .grunt 26 | 27 | # Bower dependency directory (https://bower.io/) 28 | bower_components 29 | 30 | # node-waf configuration 31 | .lock-wscript 32 | 33 | # Compiled binary addons (https://nodejs.org/api/addons.html) 34 | build/Release 35 | 36 | # Dependency directories 37 | node_modules/ 38 | jspm_packages/ 39 | 40 | # TypeScript v1 declaration files 41 | typings/ 42 | 43 | # Optional npm cache directory 44 | .npm 45 | 46 | # Optional eslint cache 47 | .eslintcache 48 | 49 | # Optional REPL history 50 | .node_repl_history 51 | 52 | # Output of 'npm pack' 53 | *.tgz 54 | 55 | # Yarn Integrity file 56 | .yarn-integrity 57 | 58 | # dotenv environment variables file 59 | .env 60 | 61 | # next.js build output 62 | .next 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Stefano Cudini 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fetch-dom 2 | Magic utility that extract javascript global variables from a remote html page. 
3 | 4 | [![npm version](https://badge.fury.io/js/node-fetch-dom.svg)](https://badge.fury.io/js/node-fetch-dom) 5 | 6 | *Licensed under the MIT license.* 7 | 8 | *Copyright [Stefano Cudini](https://opengeo.tech/stefano-cudini/)* 9 | 10 | ## Source code 11 | [Github](https://github.com/stefanocudini/node-fetch-dom) 12 | [NPM](https://npmjs.org/package/fetch-dom) 13 | 14 | # Usage 15 | ## Install in command line 16 | 17 | ```bash 18 | $ npm install -g @stefcud/fetch-dom 19 | ``` 20 | **run command** 21 | parameters is a url page and a global variable name 22 | ```bash 23 | $ fetchdom https://opengeo.tech/index.html allTags 24 | ``` 25 | the output is a json 26 | 27 | **html in remote page** 28 | ```html 29 | 32 | ``` 33 | 34 | ## Usage 35 | ```bash 36 | $ npm install --save fetch-dom 37 | ``` 38 | 39 | ## Integrate in your code 40 | ```javascript 41 | 42 | var fetchdom = require('@stefcud/fetch-dom'); 43 | 44 | /* 45 | by default return the window object 46 | */ 47 | fetchdom('https://opengeo.tech/', function(window) { 48 | 49 | console.log(window.document.body); 50 | }); 51 | 52 | /* 53 | specify sub property of default DOM 54 | */ 55 | fetchdom('https://opengeo.tech/', 'location.href', function(href) { 56 | 57 | console.log(href); 58 | 59 | }); 60 | 61 | /* 62 | parsing of global remote variables (in remote page is defined window.allTags = {...}; ) 63 | */ 64 | fetchdom('https://opengeo.tech/', 'allTags', function(tags) { 65 | 66 | console.log(tags); 67 | 68 | }); 69 | ``` 70 | -------------------------------------------------------------------------------- /bin/cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var fetchdom = require('../lib/fetch-dom'); 4 | 5 | if(process.argv.length<3) { 6 | process.stderr.write("pass url and var name\n"); 7 | return; 8 | } 9 | 10 | var url = process.argv[2] || undefined, 11 | opts = { 12 | subvar: process.argv[3] || undefined, 13 | wait: process.argv[4] || 
undefined, 14 | }; 15 | 16 | fetchdom(url, opts, function(dom) { 17 | 18 | var json = JSON.stringify(dom, null, 4); 19 | 20 | process.stdout.write(json) 21 | 22 | }); 23 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 |

fetch-dom

2 |

Library for Node.js that retrieves the DOM and global variables from a remote HTML page.

3 |

npm version

4 |

Licensed under the MIT license.

5 |

Copyright Stefano Cudini

6 |

Source code

7 |

Github
NPM

8 |

Usage

9 |

Install in command line

10 |
$ npm install -g @stefcud/fetch-dom
11 | 
12 |

run command
The parameters are the page URL and a global variable name.

13 |
$ fetchdom https://opengeo.tech/index.html allTags
14 | 
15 |

Integrate in your application

16 |
$ npm install --save @stefcud/fetch-dom
17 | 
18 |

application code

19 |

20 | var fetchdom = require('@stefcud/fetch-dom');
21 | 
22 | /*
23 |   by default return the window object
24 | */
25 | fetchdom('https://opengeo.tech/', function(window) {
26 | 
27 |     console.log(window.document.body);
28 | });
29 | 
30 | /*
31 |   specify a sub-property of the default DOM
32 | */
33 | fetchdom('https://opengeo.tech/', 'location.href', function(href) {
34 | 
35 |     console.log(href);
36 | 
37 | });
38 | 
39 | /*
40 |   parse global variables of the remote page (the remote page defines window.allTags = {...}; )
41 | */
42 | fetchdom('https://opengeo.tech/', 'allTags', function(tags) {
43 | 
44 |     console.log(tags);
45 | 
46 | });
47 | 
48 | -------------------------------------------------------------------------------- /lib/fetch-dom.js: -------------------------------------------------------------------------------- 1 | 2 | const Nightmare = require('nightmare') 3 | const nightmare = Nightmare({ 4 | height: 1280, 5 | width: 1280, 6 | //show: true 7 | }); 8 | 9 | Object.getPath = function(o, s) { 10 | 11 | if(!s) return Object.keys(o); 12 | 13 | s = s.replace(/\[(\w+)\]/g, '.$1'); // convert indexes to properties 14 | s = s.replace(/^\./, ''); // strip a leading dot 15 | var a = s.split('.'); 16 | for (var i = 0, n = a.length; i < n; ++i) { 17 | var k = a[i]; 18 | if (k in o) { 19 | o = o[k]; 20 | } else { 21 | return; 22 | } 23 | } 24 | return o; 25 | } 26 | 27 | module.exports = function(url, opts, cb) { 28 | 29 | if(opts && typeof opts === 'function') { 30 | cb = opts; 31 | } 32 | 33 | if(url) { 34 | nightmare 35 | .useragent(opts.userAgent || '') 36 | .goto(url) 37 | .wait(opts.wait || 'body') 38 | .evaluate(function() { 39 | return this; 40 | }) 41 | .end() 42 | .then(window => { 43 | 44 | var o = Object.getPath(window, opts.subvar); 45 | 46 | if(cb) { 47 | cb(o); 48 | } 49 | }) 50 | .catch(error => { 51 | console.error('Error: ', error) 52 | }); 53 | } 54 | 55 | }; 56 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@stefcud/fetch-dom", 3 | "version": "1.3.5", 4 | "description": "Magic utility that extract javascript global variables from a remote html page.", 5 | "main": "lib/fetch-dom", 6 | "scripts": { 7 | "build": "markdown README.md > index.html", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "bin": { 11 | "fetchdom": "bin/cli.js" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git+https://github.com/stefanocudini/node-fetch-dom.git" 16 | }, 17 | "keywords": [ 18 | "nodejs", 19 | "fetch", 20 | "dom", 
21 | "crawler", 22 | "scraping" 23 | ], 24 | "author": { 25 | "name": "Stefano Cudini", 26 | "email": "stefano.cudini@gmail.com", 27 | "url": "https://opengeo.tech/" 28 | }, 29 | "license": "MIT", 30 | "bugs": { 31 | "url": "https://github.com/stefanocudini/node-fetch-dom/issues" 32 | }, 33 | "homepage": "https://github.com/stefanocudini/node-fetch-dom#readme", 34 | "dependencies": { 35 | "nightmare": "^3.0.1" 36 | }, 37 | "devDependencies": { 38 | "markdown-to-html": "0.0.13", 39 | "restify": "^7.2.1" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/server.js: -------------------------------------------------------------------------------- 1 | 2 | var restify = require('restify'); 3 | var fetchdom = require('../lib/fetch-dom'); 4 | 5 | 6 | var server = restify.createServer(); 7 | 8 | server.use(restify.plugins.queryParser()); 9 | 10 | server.get('/*', function(req, res, next) { 11 | 12 | console.log('request', req.query); 13 | 14 | if(!req.query.url || ! req.query.dom) { 15 | console.log('specify url and dom params'); 16 | return next(); 17 | } 18 | 19 | fetchdom(req.query.url, req.query.dom, function(dom) { 20 | 21 | var json = JSON.stringify(dom, null, 4); 22 | 23 | res.send(dom); 24 | 25 | return next(); 26 | }); 27 | }); 28 | 29 | server.listen(3000, function() { 30 | console.log('%s listening at %s', server.name, server.url); 31 | }); 32 | -------------------------------------------------------------------------------- /tests/target.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 14 | 15 | 16 | 17 |
18 | 19 |
20 | 65 | 66 | 67 | --------------------------------------------------------------------------------