├── .gitignore ├── LICENSE ├── README.md ├── bin └── cli.js ├── index.html ├── lib └── fetch-dom.js ├── package.json └── tests ├── server.js └── target.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | package-lock.json 3 | logs 4 | *.log 5 | npm-debug.log* 6 | yarn-debug.log* 7 | yarn-error.log* 8 | 9 | # Runtime data 10 | pids 11 | *.pid 12 | *.seed 13 | *.pid.lock 14 | 15 | # Directory for instrumented libs generated by jscoverage/JSCover 16 | lib-cov 17 | 18 | # Coverage directory used by tools like istanbul 19 | coverage 20 | 21 | # nyc test coverage 22 | .nyc_output 23 | 24 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 25 | .grunt 26 | 27 | # Bower dependency directory (https://bower.io/) 28 | bower_components 29 | 30 | # node-waf configuration 31 | .lock-wscript 32 | 33 | # Compiled binary addons (https://nodejs.org/api/addons.html) 34 | build/Release 35 | 36 | # Dependency directories 37 | node_modules/ 38 | jspm_packages/ 39 | 40 | # TypeScript v1 declaration files 41 | typings/ 42 | 43 | # Optional npm cache directory 44 | .npm 45 | 46 | # Optional eslint cache 47 | .eslintcache 48 | 49 | # Optional REPL history 50 | .node_repl_history 51 | 52 | # Output of 'npm pack' 53 | *.tgz 54 | 55 | # Yarn Integrity file 56 | .yarn-integrity 57 | 58 | # dotenv environment variables file 59 | .env 60 | 61 | # next.js build output 62 | .next 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Stefano Cudini 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fetch-dom 2 | Magic utility that extract javascript global variables from a remote html page. 
3 | 4 | [](https://badge.fury.io/js/node-fetch-dom) 5 | 6 | *Licensed under the MIT license.* 7 | 8 | *Copyright [Stefano Cudini](https://opengeo.tech/stefano-cudini/)* 9 | 10 | ## Source code 11 | [Github](https://github.com/stefanocudini/node-fetch-dom) 12 | [NPM](https://npmjs.org/package/fetch-dom) 13 | 14 | # Usage 15 | ## Install in command line 16 | 17 | ```bash 18 | $ npm install -g @stefcud/fetch-dom 19 | ``` 20 | **run command** 21 | parameters is a url page and a global variable name 22 | ```bash 23 | $ fetchdom https://opengeo.tech/index.html allTags 24 | ``` 25 | the output is a json 26 | 27 | **html in remote page** 28 | ```html 29 | 32 | ``` 33 | 34 | ## Usage 35 | ```bash 36 | $ npm install --save fetch-dom 37 | ``` 38 | 39 | ## Integrate in your code 40 | ```javascript 41 | 42 | var fetchdom = require('@stefcud/fetch-dom'); 43 | 44 | /* 45 | by default return the window object 46 | */ 47 | fetchdom('https://opengeo.tech/', function(window) { 48 | 49 | console.log(window.document.body); 50 | }); 51 | 52 | /* 53 | specify sub property of default DOM 54 | */ 55 | fetchdom('https://opengeo.tech/', 'location.href', function(href) { 56 | 57 | console.log(href); 58 | 59 | }); 60 | 61 | /* 62 | parsing of global remote variables (in remote page is defined window.allTags = {...}; ) 63 | */ 64 | fetchdom('https://opengeo.tech/', 'allTags', function(tags) { 65 | 66 | console.log(tags); 67 | 68 | }); 69 | ``` 70 | -------------------------------------------------------------------------------- /bin/cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var fetchdom = require('../lib/fetch-dom'); 4 | 5 | if(process.argv.length<3) { 6 | process.stderr.write("pass url and var name\n"); 7 | return; 8 | } 9 | 10 | var url = process.argv[2] || undefined, 11 | opts = { 12 | subvar: process.argv[3] || undefined, 13 | wait: process.argv[4] || undefined, 14 | }; 15 | 16 | fetchdom(url, opts, function(dom) 
{ 17 | 18 | var json = JSON.stringify(dom, null, 4); 19 | 20 | process.stdout.write(json) 21 | 22 | }); 23 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 |
Library for Node.js that retrieves the DOM and global variables from a remote HTML page.
3 | 4 |Licensed under the MIT license.
5 |Copyright Stefano Cudini
6 |$ npm install -g @stefcud/fetch-dom
11 |
12 | run command
page url and global variable name
$ fetchdom https://opengeo.tech/index.html allTags
14 |
15 | $ npm install --save fetch-dom
17 |
18 | application code
19 |
20 | var fetchdom = require('@stefcud/fetch-dom');
21 |
22 | /*
23 | by default return the window object
24 | */
25 | fetchdom('https://opengeo.tech/', function(window) {
26 |
27 | console.log(window.document.body);
28 | });
29 |
30 | /*
31 | specify sub property of default DOM
32 | */
33 | fetchdom('https://opengeo.tech/', 'location.href', function(href) {
34 |
35 | console.log(href);
36 |
37 | });
38 |
39 | /*
40 | parsing of global remote variables (in remote page is defined window.allTags = {...}; )
41 | */
42 | fetchdom('https://opengeo.tech/', 'allTags', function(tags) {
43 |
44 | console.log(tags);
45 |
46 | });
47 |
48 |
--------------------------------------------------------------------------------
/lib/fetch-dom.js:
--------------------------------------------------------------------------------
1 |
2 | const Nightmare = require('nightmare')
// Options handed to the Nightmare factory: a 1280x1280 browser window.
const browserOptions = {
	width: 1280,
	height: 1280,
	//show: true
};

// Module-level Nightmare instance built from the options above.
const nightmare = Nightmare(browserOptions);
8 |
/**
 * Resolve a dotted / bracketed property path against a value.
 *
 * @param {*} o - Root value to walk.
 * @param {string} [s] - Path such as `a.b[0].c`. Bracket indexes are rewritten
 *   to dot segments and a single leading dot is ignored.
 * @returns {*} `Object.keys(o)` when `s` is empty or omitted; otherwise the
 *   value found at the path, or `undefined` as soon as a segment is missing.
 */
Object.getPath = function(o, s) {

	if(!s) return Object.keys(o);

	const segments = s
		.replace(/\[(\w+)\]/g, '.$1')	// convert indexes to properties
		.replace(/^\./, '')				// strip a leading dot
		.split('.');

	let current = o;

	for (const key of segments) {
		// The `in` operator throws a TypeError on null, undefined and
		// primitives. Treat null/undefined as "path not found" and wrap
		// primitives with Object() so lookups such as 'href.length' resolve.
		if (current == null || !(key in Object(current))) {
			return undefined;
		}
		current = current[key];
	}

	return current;
}
26 |
27 | module.exports = function(url, opts, cb) {
28 |
29 | if(opts && typeof opts === 'function') {
30 | cb = opts;
31 | }
32 |
33 | if(url) {
34 | nightmare
35 | .useragent(opts.userAgent || '')
36 | .goto(url)
37 | .wait(opts.wait || 'body')
38 | .evaluate(function() {
39 | return this;
40 | })
41 | .end()
42 | .then(window => {
43 |
44 | var o = Object.getPath(window, opts.subvar);
45 |
46 | if(cb) {
47 | cb(o);
48 | }
49 | })
50 | .catch(error => {
51 | console.error('Error: ', error)
52 | });
53 | }
54 |
55 | };
56 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@stefcud/fetch-dom",
3 | "version": "1.3.5",
4 | "description": "Magic utility that extracts JavaScript global variables from a remote HTML page.",
5 | "main": "lib/fetch-dom",
6 | "scripts": {
7 | "build": "markdown README.md > index.html",
8 | "test": "echo \"Error: no test specified\" && exit 1"
9 | },
10 | "bin": {
11 | "fetchdom": "bin/cli.js"
12 | },
13 | "repository": {
14 | "type": "git",
15 | "url": "git+https://github.com/stefanocudini/node-fetch-dom.git"
16 | },
17 | "keywords": [
18 | "nodejs",
19 | "fetch",
20 | "dom",
21 | "crawler",
22 | "scraping"
23 | ],
24 | "author": {
25 | "name": "Stefano Cudini",
26 | "email": "stefano.cudini@gmail.com",
27 | "url": "https://opengeo.tech/"
28 | },
29 | "license": "MIT",
30 | "bugs": {
31 | "url": "https://github.com/stefanocudini/node-fetch-dom/issues"
32 | },
33 | "homepage": "https://github.com/stefanocudini/node-fetch-dom#readme",
34 | "dependencies": {
35 | "nightmare": "^3.0.1"
36 | },
37 | "devDependencies": {
38 | "markdown-to-html": "0.0.13",
39 | "restify": "^7.2.1"
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/tests/server.js:
--------------------------------------------------------------------------------
1 |
// Manual test server: GET /?url=<page url>&dom=<property path> loads the page
// through fetch-dom and responds with the value found at that path.
// NOTE(review): `url` comes straight from the query string, so this server
// will fetch any address it is given. Keep it to local testing.
const restify = require('restify');
const fetchdom = require('../lib/fetch-dom');


const server = restify.createServer();

server.use(restify.plugins.queryParser());

server.get('/*', function(req, res, next) {

	console.log('request', req.query);

	if(!req.query.url || !req.query.dom) {
		console.log('specify url and dom params');
		// Answer the request; calling next() alone leaves the client waiting.
		res.send(400, {error: 'specify url and dom params'});
		return next();
	}

	// Pass the path as an options object so it is honoured as `subvar`.
	fetchdom(req.query.url, {subvar: req.query.dom}, function(dom) {

		res.send(dom);

		return next();
	});
});

server.listen(3000, function() {
	console.log('%s listening at %s', server.name, server.url);
});
32 |
--------------------------------------------------------------------------------
/tests/target.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |