├── .gitignore ├── examples ├── single.js ├── multiple.js └── crawl.js ├── package.json ├── test.js ├── LICENSE.md ├── node.js ├── domp.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /examples/single.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('../domp'); 5 | domp('https://en.wikipedia.org', function(dom) { 6 | console.log(...dom.map(node => node.name)); 7 | // html head meta title script ... 8 | }); 9 | }()); 10 | -------------------------------------------------------------------------------- /examples/multiple.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('../domp'); 5 | 6 | var urls = [ 7 | 'https://en.wikipedia.org', 8 | 'https://de.wikipedia.org/' 9 | ]; 10 | 11 | // #1 12 | domp(urls, function(dom) { 13 | console.log(...dom.map(node => node.name)); 14 | console.log(); 15 | // html head meta title script ... 
16 | }); 17 | 18 | // #2 19 | for (var request of domp(urls)) 20 | request.then(function () { 21 | // resolved 22 | }); 23 | }()); 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "domp", 3 | "version": "0.4.0", 4 | "description": "Web scraping, crawling and DOM tree manipulation for Node.js.", 5 | "main": "domp.js", 6 | "repository": { 7 | "type": "git", 8 | "url": "git+https://github.com/mateogianolio/domp.git" 9 | }, 10 | "author": "Mateo Gianolio", 11 | "license": "MIT", 12 | "bugs": { 13 | "url": "https://github.com/mateogianolio/domp/issues" 14 | }, 15 | "homepage": "https://github.com/mateogianolio/domp#readme", 16 | "dependencies": { 17 | "htmlparser2": "^3.9.0", 18 | "request": "^2.69.0", 19 | "robots-txt": "^0.1.5" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('./domp'), 5 | url = 'https://en.wikipedia.org/wiki/Web_scraping'; 6 | 7 | function link(node) { 8 | return node.name === 'a' && 9 | node.href && 10 | node.href.indexOf('http') === 0; 11 | } 12 | 13 | // careful, will crawl everything 14 | domp.crawl(url, function (pages, next) { 15 | // pages is iterator of promises 16 | for (var page of pages) 17 | page.then(function (dom) { 18 | // submit new urls to crawl to the next() function 19 | var links = [...dom.filter(link)].map(node => node.href); 20 | next(links); 21 | }); 22 | }); 23 | }()); 24 | -------------------------------------------------------------------------------- /examples/crawl.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('../domp'), 5 | url = 'https://en.wikipedia.org'; 6 | 7 | 
function resolve(next) { 8 | return function (dom) { 9 | var title = dom.find('title').next().value, 10 | links = [...dom.filter(node => node.href && node.href.indexOf('/wiki/') === 0)]; 11 | 12 | var link = links[Math.floor(Math.random() * links.length)]; 13 | 14 | console.log(title.text); 15 | console.log(link.href); 16 | console.log(); 17 | 18 | next(url + link.href); 19 | }; 20 | } 21 | 22 | domp.crawl('https://en.wikipedia.org', function(requests, next) { 23 | for (var request of requests) 24 | request.then(resolve(next)); 25 | }); 26 | }()); 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | #The MIT License (MIT) 2 | 3 | *Copyright (c) 2016 Mateo Gianolio* 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /node.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | class Node { 5 | constructor(name, attribs) { 6 | this.name = name; 7 | this.children = []; 8 | for (var key in attribs) 9 | if (attribs.hasOwnProperty(key)) 10 | this[key] = attribs[key]; 11 | } 12 | 13 | [Symbol.iterator]() { 14 | return this.traverse(this); 15 | } 16 | 17 | get siblings() { 18 | var copy = this; 19 | return (function* () { 20 | if (!copy.parent) 21 | return; 22 | 23 | for (var child of copy.parent.children) 24 | if (copy.name !== child.name) 25 | yield child; 26 | }()); 27 | } 28 | 29 | append(node) { 30 | node.parent = this; 31 | this.children.push(node); 32 | } 33 | 34 | map(f) { 35 | var copy = this; 36 | return (function* () { 37 | for (var node of copy) 38 | yield f(node); 39 | }()); 40 | } 41 | 42 | filter(f) { 43 | var copy = this; 44 | return (function* () { 45 | for (var node of copy) 46 | if (f(node)) 47 | yield node; 48 | }()); 49 | } 50 | 51 | find(name) { 52 | return this.filter(node => node.name === name); 53 | } 54 | 55 | *traverse(node) { 56 | yield node; 57 | for (var child of node.children) 58 | yield *this.traverse(child); 59 | } 60 | } 61 | 62 | module.exports = Node; 63 | }()); 64 | -------------------------------------------------------------------------------- /domp.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var request = require('request'), 5 | robots = require('robots-txt')(); 6 | 7 | var Node = require('./node'), 8 | Parser = require('htmlparser2').Parser; 9 | 10 | function parse(body) { 11 | var dom; 12 | var parser = new Parser({ 13 | onopentag: function (name, attribs) { 14 | if (!dom) { 15 | dom = new Node(name, attribs); 16 | return; 17 | } 18 | 19 | var node = new Node(name, attribs); 20 | dom.append(node); 21 
| dom = node; 22 | }, 23 | ontext: function (text) { 24 | if (dom) 25 | dom.text = dom.text ? (dom.text + text) : text; 26 | }, 27 | onclosetag: function (name) { 28 | dom = dom.parent ? dom.parent : dom; 29 | } 30 | }); 31 | 32 | parser.write(body); 33 | parser.end(); 34 | 35 | return dom; 36 | } 37 | 38 | function get(url) { 39 | return new Promise(function (resolve, reject) { 40 | robots 41 | .isAllowed('domp', url) 42 | .then(function (allowed) { 43 | if (!allowed) 44 | return; 45 | 46 | request(url, function (error, response, body) { 47 | if (error || response.statusCode !== 200) 48 | return reject(error); 49 | resolve(parse(body)); 50 | }); 51 | }); 52 | }); 53 | } 54 | 55 | function iterate(urls) { 56 | return (function* () { 57 | for (var url of urls) 58 | yield get(url); 59 | }()); 60 | } 61 | 62 | module.exports = function (urls, callback) { 63 | urls = typeof urls === 'string' ? [urls] : urls; 64 | if (!callback) 65 | return iterate(urls); 66 | 67 | for (var page of iterate(urls)) 68 | page.then(callback); 69 | }; 70 | 71 | function crawl(url, callback) { 72 | url = typeof url === 'string' ? [url] : url; 73 | callback(iterate(url), function (urls) { 74 | if (urls && urls.length) 75 | crawl(urls, callback); 76 | }); 77 | } 78 | 79 | module.exports.crawl = crawl; 80 | }()); 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # domp 2 | 3 | Web scraping, crawling and DOM tree manipulation for Node.js. Uses [htmlparser2](https://github.com/fb55/htmlparser2) for HTML parsing and [robots-txt](https://github.com/Woorank/robots-txt) for `robots.txt` checking. 
4 | 5 | ```bash 6 | $ npm install domp 7 | ``` 8 | 9 | ```javascript 10 | var domp = require('domp'); 11 | ``` 12 | 13 | ### Usage 14 | 15 | #### [Get single page (`examples/single.js`)](https://github.com/mateogianolio/domp/blob/master/examples/single.js) 16 | 17 | ```javascript 18 | domp(url, function(dom) { 19 | console.log(...dom.map(node => node.name)); 20 | // html head meta title script ... 21 | }); 22 | ``` 23 | 24 | #### [Get multiple pages (`examples/multiple.js`)](https://github.com/mateogianolio/domp/blob/master/examples/multiple.js) 25 | 26 | You can scrape an `Array` of urls by 27 | 28 | 1. providing a callback: 29 | 30 | ```javascript 31 | domp(urls, function(dom) { 32 | // called twice 33 | }) 34 | ``` 35 | 36 | 2. looping through an iterator 37 | 38 | ```javascript 39 | for (var page of domp(urls)) 40 | page.then(function (dom) { 41 | // resolved 42 | }, function (error) { 43 | // rejected 44 | }); 45 | ``` 46 | 47 | #### [Crawling (`examples/crawl.js`)](https://github.com/mateogianolio/domp/blob/master/examples/crawl.js) 48 | 49 | ```javascript 50 | function resolve(next) { 51 | return function (dom) { 52 | var title = dom.find('title').next().value, 53 | links = [...dom.filter(node => node.href && node.href.indexOf('http') === 0)]; 54 | 55 | // get random link 56 | var link = links[Math.floor(Math.random() * links.length)]; 57 | 58 | console.log(title.text); 59 | console.log(link.href); 60 | 61 | // submit link(s) to be scraped next 62 | next(link.href); 63 | }; 64 | } 65 | 66 | domp.crawl('https://en.wikipedia.org', function(requests, next) { 67 | for (var request of requests) 68 | request.then(resolve(next)); 69 | }); 70 | ``` 71 | 72 | ### DOM Tree traversal 73 | 74 | Standard traversal using `for ... of`: 75 | 76 | ```javascript 77 | for (var node of dom) 78 | console.log(node); 79 | ``` 80 | 81 | Sibling (children with same parent) traversal using `for ... 
of`: 82 | 83 | ```javascript 84 | for (var sibling of node.siblings) 85 | console.log(sibling); 86 | ``` 87 | 88 | Tag name traversal using `for ... of` and `find(name)`: 89 | 90 | ```javascript 91 | for (var node of dom.find('p')) 92 | console.log(node); 93 | ``` 94 | 95 | 96 | ### DOM Manipulation 97 | 98 | DOM nodes (see `node.js`) implement mapping similar to what we're used to from `Array.prototype.map`, but instead of returning an `Array` it returns an `Iterable`. The `Iterable` can either be unpacked into an `Array` using the spread operator (`...`) or be used as a normal iterator. 99 | 100 | ```javascript 101 | var names = dom.map(node => node.name); 102 | 103 | names = [...names]; 104 | // names = ['html', 'head', 'meta', 'title', ...] 105 | 106 | for (var name of names) 107 | console.log(name); 108 | // html 109 | // head 110 | // ... 111 | ``` 112 | 113 | Filtering works pretty much the same (returns `Iterable`): 114 | 115 | ```javascript 116 | // get all 'p' tags 117 | var paragraphs = dom.filter(node => node.name === 'p'); 118 | 119 | // traverse 120 | for (var p of paragraphs) 121 | console.log(p); 122 | ``` 123 | 124 | There's also the short `find(name)` that can be used to find tag names in the tree: 125 | 126 | ```javascript 127 | for (var node of dom.find('p')) 128 | console.log(node); 129 | ``` 130 | --------------------------------------------------------------------------------