├── .gitignore ├── examples ├── single.js ├── multiple.js └── crawl.js ├── package.json ├── test.js ├── LICENSE.md ├── node.js ├── domp.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /examples/single.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('../domp'); 5 | domp('https://en.wikipedia.org', function(dom) { 6 | console.log(...dom.map(node => node.name)); 7 | // html head meta title script ... 8 | }); 9 | }()); 10 | -------------------------------------------------------------------------------- /examples/multiple.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('../domp'); 5 | 6 | var urls = [ 7 | 'https://en.wikipedia.org', 8 | 'https://de.wikipedia.org/' 9 | ]; 10 | 11 | // #1 12 | domp(urls, function(dom) { 13 | console.log(...dom.map(node => node.name)); 14 | console.log(); 15 | // html head meta title script ... 
16 | }); 17 | 18 | // #2 19 | for (var request of domp(urls)) 20 | request.then(function () { 21 | // resolved 22 | }); 23 | }()); 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "domp", 3 | "version": "0.4.0", 4 | "description": "Web scraping, crawling and DOM tree manipulation for Node.js.", 5 | "main": "domp.js", 6 | "repository": { 7 | "type": "git", 8 | "url": "git+https://github.com/mateogianolio/domp.git" 9 | }, 10 | "author": "Mateo Gianolio", 11 | "license": "MIT", 12 | "bugs": { 13 | "url": "https://github.com/mateogianolio/domp/issues" 14 | }, 15 | "homepage": "https://github.com/mateogianolio/domp#readme", 16 | "dependencies": { 17 | "htmlparser2": "^3.9.0", 18 | "request": "^2.69.0", 19 | "robots-txt": "^0.1.5" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('./domp'), 5 | url = 'https://en.wikipedia.org/wiki/Web_scraping'; 6 | 7 | function link(node) { 8 | return node.name === 'a' && 9 | node.href && 10 | node.href.indexOf('http') === 0; 11 | } 12 | 13 | // careful, will crawl everything 14 | domp.crawl(url, function (pages, next) { 15 | // pages is iterator of promises 16 | for (var page of pages) 17 | page.then(function (dom) { 18 | // submit new urls to crawl to the next() function 19 | var links = [...dom.filter(link)].map(node => node.href); 20 | next(links); 21 | }); 22 | }); 23 | }()); 24 | -------------------------------------------------------------------------------- /examples/crawl.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var domp = require('../domp'), 5 | url = 'https://en.wikipedia.org'; 6 | 7 | 
function resolve(next) { 8 | return function (dom) { 9 | var title = dom.find('title').next().value, 10 | links = [...dom.filter(node => node.href && node.href.indexOf('/wiki/') === 0)]; 11 | 12 | var link = links[Math.floor(Math.random() * links.length)]; 13 | 14 | console.log(title.text); 15 | console.log(link.href); 16 | console.log(); 17 | 18 | next(url + link.href); 19 | }; 20 | } 21 | 22 | domp.crawl('https://en.wikipedia.org', function(requests, next) { 23 | for (var request of requests) 24 | request.then(resolve(next)); 25 | }); 26 | }()); 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | #The MIT License (MIT) 2 | 3 | *Copyright (c) 2016 Mateo Gianolio* 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /node.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | class Node { 5 | constructor(name, attribs) { 6 | this.name = name; 7 | this.children = []; 8 | for (var key in attribs) 9 | if (attribs.hasOwnProperty(key)) 10 | this[key] = attribs[key]; 11 | } 12 | 13 | [Symbol.iterator]() { 14 | return this.traverse(this); 15 | } 16 | 17 | get siblings() { 18 | var copy = this; 19 | return (function* () { 20 | if (!copy.parent) 21 | return; 22 | 23 | for (var child of copy.parent.children) 24 | if (copy.name !== child.name) 25 | yield child; 26 | }()); 27 | } 28 | 29 | append(node) { 30 | node.parent = this; 31 | this.children.push(node); 32 | } 33 | 34 | map(f) { 35 | var copy = this; 36 | return (function* () { 37 | for (var node of copy) 38 | yield f(node); 39 | }()); 40 | } 41 | 42 | filter(f) { 43 | var copy = this; 44 | return (function* () { 45 | for (var node of copy) 46 | if (f(node)) 47 | yield node; 48 | }()); 49 | } 50 | 51 | find(name) { 52 | return this.filter(node => node.name === name); 53 | } 54 | 55 | *traverse(node) { 56 | yield node; 57 | for (var child of node.children) 58 | yield *this.traverse(child); 59 | } 60 | } 61 | 62 | module.exports = Node; 63 | }()); 64 | -------------------------------------------------------------------------------- /domp.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 'use strict'; 3 | 4 | var request = require('request'), 5 | robots = require('robots-txt')(); 6 | 7 | var Node = require('./node'), 8 | Parser = require('htmlparser2').Parser; 9 | 10 | function parse(body) { 11 | var dom; 12 | var parser = new Parser({ 13 | onopentag: function (name, attribs) { 14 | if (!dom) { 15 | dom = new Node(name, attribs); 16 | return; 17 | } 18 | 19 | var node = new Node(name, attribs); 20 | dom.append(node); 21 
| dom = node; 22 | }, 23 | ontext: function (text) { 24 | if (dom) 25 | dom.text = dom.text ? (dom.text + text) : text; 26 | }, 27 | onclosetag: function (name) { 28 | dom = dom.parent ? dom.parent : dom; 29 | } 30 | }); 31 | 32 | parser.write(body); 33 | parser.end(); 34 | 35 | return dom; 36 | } 37 | 38 | function get(url) { 39 | return new Promise(function (resolve, reject) { 40 | robots 41 | .isAllowed('domp', url) 42 | .then(function (allowed) { 43 | if (!allowed) 44 | return; 45 | 46 | request(url, function (error, response, body) { 47 | if (error || response.statusCode !== 200) 48 | return reject(error); 49 | resolve(parse(body)); 50 | }); 51 | }); 52 | }); 53 | } 54 | 55 | function iterate(urls) { 56 | return (function* () { 57 | for (var url of urls) 58 | yield get(url); 59 | }()); 60 | } 61 | 62 | module.exports = function (urls, callback) { 63 | urls = typeof urls === 'string' ? [urls] : urls; 64 | if (!callback) 65 | return iterate(urls); 66 | 67 | for (var page of iterate(urls)) 68 | page.then(callback); 69 | }; 70 | 71 | function crawl(url, callback) { 72 | url = typeof url === 'string' ? [url] : url; 73 | callback(iterate(url), function (urls) { 74 | if (urls && urls.length) 75 | crawl(urls, callback); 76 | }); 77 | } 78 | 79 | module.exports.crawl = crawl; 80 | }()); 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # domp 2 | 3 | Web scraping, crawling and DOM tree manipulation for Node.js. Uses [htmlparser2](https://github.com/fb55/htmlparser2) for HTML parsing and [robots-txt](https://github.com/Woorank/robots-txt) for `robots.txt` checking. 
4 | 5 | ```bash 6 | $ npm install domp 7 | ``` 8 | 9 | ```javascript 10 | var domp = require('domp'); 11 | ``` 12 | 13 | ### Usage 14 | 15 | #### [Get single page (`examples/single.js`)](https://github.com/mateogianolio/domp/blob/master/examples/single.js) 16 | 17 | ```javascript 18 | domp(url, function(dom) { 19 | console.log(...dom.map(node => node.name)); 20 | // html head meta title script ... 21 | }); 22 | ``` 23 | 24 | #### [Get multiple pages (`examples/multiple.js`)](https://github.com/mateogianolio/domp/blob/master/examples/multiple.js) 25 | 26 | You can scrape an `Array` of urls by 27 | 28 | 1. providing a callback: 29 | 30 | ```javascript 31 | domp(urls, function(dom) { 32 | // called twice 33 | }) 34 | ``` 35 | 36 | 2. looping through an iterator 37 | 38 | ```javascript 39 | for (var page of domp(urls)) 40 | page.then(function (dom) { 41 | // resolved 42 | }, function (error) { 43 | // rejected 44 | }); 45 | ``` 46 | 47 | #### [Crawling (`examples/crawl.js`)](https://github.com/mateogianolio/domp/blob/master/examples/crawl.js) 48 | 49 | ```javascript 50 | function resolve(next) { 51 | return function (dom) { 52 | var title = dom.find('title').next().value, 53 | links = [...dom.filter(node => node.href && node.href.indexOf('http') === 0)]; 54 | 55 | // get random link 56 | var link = links[Math.floor(Math.random() * links.length)]; 57 | 58 | console.log(title.text); 59 | console.log(link.href); 60 | 61 | // submit link(s) to be scraped next 62 | next(link.href); 63 | }; 64 | } 65 | 66 | domp.crawl('https://en.wikipedia.org', function(requests, next) { 67 | for (var request of requests) 68 | request.then(resolve(next)); 69 | }); 70 | ``` 71 | 72 | ### DOM Tree traversal 73 | 74 | Standard traversal using `for ... of`: 75 | 76 | ```javascript 77 | for (var node of dom) 78 | console.log(node); 79 | ``` 80 | 81 | Sibling (children with same parent) traversal using `for ... 
of`: 82 | 83 | ```javascript 84 | for (var sibling of node.siblings) 85 | console.log(sibling); 86 | ``` 87 | 88 | Tag name traversal using `for ... of` and `find(name)`: 89 | 90 | ```javascript 91 | for (var node of dom.find('p')) 92 | console.log(node); 93 | ``` 94 | 95 | 96 | ### DOM Manipulation 97 | 98 | DOM nodes (see `node.js`) implement mapping similar to what we're used to from `Array.prototype.map`, but instead of returning an `Array` it returns an `Iterable`. The `Iterable` can either be unpacked into an `Array` using the spread operator (`...`) or be used as a normal iterator. 99 | 100 | ```javascript 101 | var names = dom.map(node => node.name); 102 | 103 | names = [...names]; 104 | // names = ['html', 'head', 'meta', 'title', ...] 105 | 106 | for (var name of names) 107 | console.log(name); 108 | // html 109 | // head 110 | // ... 111 | ``` 112 | 113 | Filtering works pretty much the same (returns `Iterable`): 114 | 115 | ```javascript 116 | // get all 'p' tags 117 | var paragraphs = dom.filter(node => node.name === 'p'); 118 | 119 | // traverse 120 | for (var p of paragraphs) 121 | console.log(p); 122 | ``` 123 | 124 | There's also the short `find(name)` that can be used to find tag names in the tree: 125 | 126 | ```javascript 127 | for (var node of dom.find('p')) 128 | console.log(node); 129 | ``` 130 | --------------------------------------------------------------------------------