├── README.md ├── examples ├── missing_parameters.js └── pulp_fiction.js ├── index.js └── package.json /README.md: -------------------------------------------------------------------------------- 1 | # web-scraper-js 2 | 3 | A lightweight, no BS, simple to use web scraping library written in node, which simply does its 4 | job, nothing more, nothing less. 5 | 6 | # Disclaimer 7 | 8 | Please make sure to use this package within legal and ethical boundaries. 9 | 10 | # Install 11 | 12 | `npm install --save web-scraper-js` 13 | 14 | # Usage 15 | 16 | You'll have to specify if the data you want to scrape is (rendered) text or attribute values. 17 | 18 | ## scrape([params]) 19 | 20 | This is the only functionality this package provides. 21 | 22 | |parameter |type |description |required | 23 | |:----------------|:------:|:-----------------------------------------------------|:-------:| 24 | |`url `|`string`|The url to scrape from |`true `| 25 | |`tags `|`object`|An object, yielding the information on what to scrape |`true `| 26 | |`tags.text `|`object`|Text elements to be scraped |`false `| 27 | |`tags.attribute `|`object`|Attributes of elements to be scraped |`false `| 28 | |`tags.singleton `|`object`|Category of content that only occurs once |`false `| 29 | |`tags.collection`|`object`|Category of content that can occur multiple times |`false `| 30 | 31 | In order to successfully scrape something you'll have to provide selectors. 32 | Since you're reading this I assume you know what selectors are, so I'll just go on. 33 | 34 | The `tags.text` and `tags.attribute` objects take different key value pairs. 35 | 36 | ### tags.text 37 | 38 | This object takes key value pairs of the form: `{'name': 'selector'}`, where the key is a name of 39 | your choice. 40 | However, it should have a meaningful name, since you will be accessing it later in the response. 
41 | Also as of now you should not declare the same names in `tags.text` and `tags.attribute` since 42 | one will overwrite the other. 43 | 44 | The selector part should be obvious; browsers typically allow you to just copy selectors using their 45 | dev tools. 46 | 47 | ### tags.attribute 48 | 49 | This object takes key value pairs of the form: `{'name': ['selector', 'attribute']}`. 50 | 51 | This time the value is a tuple containing the selector and the attribute from which to collect 52 | data. 53 | Since an element can have multiple attributes you'll have to declare which one to use, simple as 54 | that. 55 | 56 | ### tags.collection and tags.singleton 57 | 58 | These are objects to provide more meaning to the search, where singleton tells the code that 59 | these items should only occur once, whereas collection means there might be multiple entries of 60 | the same structure. 61 | 62 | Both contain objects structured like the previous `tags.text` and `tags.attribute` objects, as 63 | you can see in the examples. 64 | 65 | ### Examples 66 | 67 | The following examples scrape a couple of details about the movie Pulp Fiction from IMDb. 68 | 69 | ```js 70 | (async () => { 71 | 72 | let result = await webscraper.scrape({ 73 | url: 'https://www.imdb.com/title/tt0110912/', 74 | tags: { 75 | text: { 76 | "movie-rating-value": 'span[itemprop="ratingValue"]', 77 | "movie-character": ".character a" 78 | }, 79 | attribute: { 80 | "movie-title": ["meta[property='og:title']", "content"], 81 | "movie-actor": [".primary_photo > a > img", "alt"] 82 | } 83 | } 84 | }); 85 | 86 | console.log(result); 87 | })(); 88 | ``` 89 | 90 | The code above will print the following output: 91 | 92 | ```js 93 | { 94 | "movie-rating-value": [ "8.9" ], 95 | "movie-character": [ 96 | "Pumpkin", "Honey Bunny", "Waitress", //... 
101 | ] 102 | } 103 | ``` 104 | 105 | As you can see, the result is a simple object that uses your declared names as keys; each value is 106 | an array holding the results of the respective selector, since one selector can match multiple elements. 107 | 108 | There also is a more semantically sensitive way to declare the contents you want to have scraped. 109 | With this method you declare if the respective elements should occur just once (singleton) or if 110 | there might be more than one element of the same kind (collection). 111 | 112 | ```js 113 | let webscraper = require('web-scraper-js'); 114 | 115 | (async () => { 116 | 117 | let result = await webscraper.scrape({ 118 | url: 'https://www.imdb.com/title/tt0110912/', 119 | tags: { 120 | singleton: { 121 | text: { 122 | "movie-rating-value": 'span[itemprop="ratingValue"]' 123 | }, 124 | attribute: { 125 | "movie-title": ["meta[property='og:title']", "content"] 126 | } 127 | }, 128 | collection: { 129 | text: { 130 | "movie-character": ".character a" 131 | }, 132 | attribute: { 133 | "movie-actor": [".primary_photo > a > img", "alt"] 134 | } 135 | } 136 | } 137 | }); 138 | 139 | console.log(result); 140 | 141 | })(); 142 | ``` 143 | 144 | For the elements declared as singleton, a single value is returned instead of an array; collection type 145 | elements are returned as an array. 146 | 147 | ```js 148 | { 149 | "movie-rating-value": "8.9", 150 | "movie-character": [ 151 | "Pumpkin", "Honey Bunny", "Waitress", //... 152 | ], 153 | "movie-title": "Pulp Fiction (1994) - IMDb", 154 | "movie-actor": [ 155 | "Tim Roth", "Amanda Plummer", "Laura Lovelace", //... 
/*
 * Example: calling scrape() with incomplete parameters.
 *
 * Three calls are made, each lacking something scrape() checks for:
 * no tags, no url, and no parameter object at all. Whatever comes back
 * (a result or the thrown error) is printed with console.log.
 */
const webscraper = require('../index.js');

/* Runs one scrape call with the given arguments and logs its outcome. */
const attempt = async (...args) => {

    try {
        const result = await webscraper.scrape(...args);
        console.log(result);
    } catch (e) {
        console.log(e);
    }
};

attempt({ url: 'url' });  // url only, tags missing
attempt({ tags: {} });    // tags only, url missing
attempt();                // nothing passed at all
}); 45 | 46 | console.log(result); 47 | })(); 48 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const request = require('request-promise'); 2 | 3 | const jsdom = require('jsdom'); 4 | 5 | const { 6 | JSDOM 7 | } = jsdom; 8 | 9 | /* 10 | * TODO: if names are the same merge? 11 | */ 12 | exports.scrape = async (params) => { 13 | 14 | if (!params) { 15 | throw new Error('Parameters missing!'); 16 | } 17 | 18 | if (!params.url || !params.tags) { 19 | throw new Error('Parameters missing!'); 20 | } 21 | 22 | const { 23 | url, 24 | tags 25 | } = params; 26 | 27 | let body = await request(params.url); 28 | 29 | if (body === null || body === undefined) 30 | return; 31 | 32 | const vdom = new JSDOM(body); 33 | let $ = require('jquery')(vdom.window); 34 | 35 | let sample = {}; 36 | 37 | /* set of items where a quantity of one is expected */ 38 | if (tags.singleton) { 39 | 40 | Object.keys(tags.singleton.text).forEach(tag => { 41 | 42 | sample[tag] = $(tags.singleton.text[tag]).text(); 43 | }); 44 | 45 | Object.keys(tags.singleton.attribute).forEach(tag => { 46 | 47 | let query = tags.singleton.attribute[tag]; 48 | sample[tag] = $(query[0]).attr(query[1]); 49 | }); 50 | } 51 | 52 | 53 | /* set of items where a quantity of >1 is possible */ 54 | if (tags.collection) { 55 | 56 | Object.keys(tags.collection.text).forEach(tag => { 57 | 58 | sample[tag] = []; 59 | $(tags.collection.text[tag]).each(function(index, item) { 60 | sample[tag].push(item.textContent); 61 | }); 62 | }); 63 | 64 | Object.keys(tags.collection.attribute).forEach(tag => { 65 | 66 | sample[tag] = []; 67 | let query = tags.collection.attribute[tag]; 68 | $(query[0]).each(function() { // no syntactic sugar here (won't work)! 
69 | 70 | let val = $(this).attr(query[1]); 71 | if (val) { 72 | sample[tag].push(val); 73 | } 74 | }); 75 | }); 76 | } 77 | 78 | /* every item specified is simply put into an array */ 79 | if (tags.text) { 80 | 81 | Object.keys(tags.text).forEach(tag => { 82 | 83 | sample[tag] = []; 84 | $(tags.text[tag]).each(function(index, item) { 85 | 86 | sample[tag].push(item.textContent); 87 | }); 88 | }); 89 | } 90 | 91 | if (tags.attribute) { 92 | 93 | Object.keys(tags.attribute).forEach(tag => { 94 | 95 | sample[tag] = []; 96 | 97 | let query = tags.attribute[tag]; 98 | $(query[0]).each(function() { // no syntactic sugar here (won't work)! 99 | 100 | let val = $(this).attr(query[1]); 101 | if (val) { 102 | sample[tag].push(val); 103 | } 104 | }); 105 | }); 106 | } 107 | 108 | return sample; 109 | } 110 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "web-scraper-js", 3 | "version": "1.1.0", 4 | "description": "A lightweight and simple to use web scraping library.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "Juliette Opdenplatz", 10 | "license": "MIT", 11 | "dependencies": { 12 | "jquery": "^3.4.1", 13 | "jsdom": "^15.1.1", 14 | "request": "^2.88.0", 15 | "request-promise": "^4.2.4" 16 | }, 17 | "devDependencies": {}, 18 | "repository": { 19 | "type": "git", 20 | "url": "git+https://github.com/julietcetera/web-scraper-js.git" 21 | }, 22 | "keywords": [ 23 | "web", 24 | "scraping" 25 | ], 26 | "bugs": { 27 | "url": "https://github.com/julietcetera/web-scraper-js/issues" 28 | }, 29 | "homepage": "https://github.com/julietcetera/web-scraper-js#readme" 30 | } 31 | --------------------------------------------------------------------------------