├── README.md
├── examples
    ├── missing_parameters.js
    └── pulp_fiction.js
├── index.js
└── package.json


/README.md:
--------------------------------------------------------------------------------
  1 | # web-scraper-js
  2 | 
  3 | A lightweight, no BS, simple to use web scraping library written in node, which simply does its
  4 | job, nothing more, nothing less.
  5 | 
  6 | # Disclaimer
  7 | 
  8 | Please make sure to use this package within legal and ethical boundaries.
  9 | 
 10 | # Install
 11 | 
 12 | `npm install --save web-scraper-js`
 13 | 
 14 | # Usage
 15 | 
 16 | You'll have to specify whether the data you want to scrape is (rendered) text or attribute values.
 17 | 
 18 | ## scrape([params])
 19 | 
 20 | This is the only functionality this package provides.
 21 | 
 22 | |parameter        |type    |description                                           |required |
 23 | |:----------------|:------:|:-----------------------------------------------------|:-------:|
 24 | |`url            `|`string`|The URL to scrape from                                |`true   `|
 25 | |`tags           `|`object`|An object, yielding the information on what to scrape |`true   `|
 26 | |`tags.text      `|`object`|Text elements to be scraped                           |`false  `|
 27 | |`tags.attribute `|`object`|Attributes of elements to be scraped                  |`false  `|
 28 | |`tags.singleton `|`object`|Category of content that only occurs once             |`false  `|
 29 | |`tags.collection`|`object`|Category of content that can occur multiple times     |`false  `|
 30 | 
 31 | In order to successfully scrape something you'll have to provide selectors.
 32 | Since you're reading this I assume you know what that is and just go on.
 33 | 
 34 | The `tags.text` and `tags.attribute` objects take different key value pairs.
 35 | 
 36 | ### tags.text
 37 | 
 38 | This object takes key value pairs of the form: `{'name': 'selector'}`, where the key is a name of
 39 | your choice.
 40 | However, it should have a meaningful name, since you will be accessing it later in the response.
 41 | Also as of now you should not declare the same names in `tags.text` and `tags.attribute` since
 42 | one will overwrite the other.
 43 | 
 44 | The selector part should be obvious, typically browsers allow you to just copy them using their
 45 | dev tools.
 46 | 
 47 | ### tags.attribute
 48 | 
 49 | This object takes key value pairs of the form: `{'name': ['selector', 'attribute']}`.
 50 | 
 51 | This time the value is a tuple containing the selector and the attribute from which to collect
 52 | data.
 53 | Since an element can have multiple attributes, you'll have to declare which one to use, simple as
 54 | that.
 55 | 
 56 | ### tags.collection and tags.singleton
 57 | 
 58 | These are objects to provide more meaning to the search, where singleton tells the code that
 59 | these items should only occur once, whereas collection means there might be multiple entries of
 60 | the same structure.
 61 | 
 62 | Both contain objects structured like the previous `tags.text` and `tags.attribute` objects, as
 63 | you can see in the examples.
 64 | 
 65 | ### Examples
 66 | 
 67 | The following examples scrape a couple of details about the movie Pulp Fiction from IMDb.
 68 | 
 69 | ```js
 70 | (async () => {
 71 |     
 72 |     let result = await webscraper.scrape({
 73 |         url: 'https://www.imdb.com/title/tt0110912/',
 74 |         tags: {
 75 |             text: {
 76 |                 "movie-rating-value": 'span[itemprop="ratingValue"]',
 77 |                 "movie-character": ".character a"
 78 |             },
 79 |             attribute: {
 80 |                 "movie-title": ["meta[property='og:title']", "content"],
 81 |                 "movie-actor": [".primary_photo > a > img", "alt"]
 82 |             }
 83 |         }
 84 |     });
 85 | 
 86 |     console.log(result);
 87 | })();
 88 | ```
 89 | 
 90 | The code above will print the following output:
 91 | 
 92 | ```js
 93 | {
 94 |   "movie-rating-value": [ "8.9" ],
 95 |   "movie-character": [
 96 |      "Pumpkin", "Honey Bunny", "Waitress", //...
 97 |    ],
 98 |   "movie-title": [ "Pulp Fiction (1994) - IMDb" ],
 99 |   "movie-actor": [
100 |      "Tim Roth", "Amanda Plummer", "Laura Lovelace", //...
101 |    ]
102 | }
103 | ```
104 | 
105 | As you can see it's a simple object, using your declared names as keys and, respectively, the
106 | results of their selectors inside an array since there can be multiple results for one selector.
107 | 
108 | There also is a more semantically sensitive way to declare the contents you want to have scraped.
109 | With this method you declare whether the respective elements should occur just once (singleton) or
110 | whether there might be more than one element of the same kind (collection).
111 | 
112 | ```js
113 | let webscraper = require('web-scraper-js');
114 | 
115 | (async () => {
116 |     
117 |     let result = await webscraper.scrape({
118 |         url: 'https://www.imdb.com/title/tt0110912/',
119 |         tags: {
120 |             singleton: {
121 |                 text: {
122 |                     "movie-rating-value": 'span[itemprop="ratingValue"]'
123 |                 },
124 |                 attribute: {
125 |                     "movie-title": ["meta[property='og:title']", "content"]
126 |                 }
127 |             },
128 |             collection: {
129 |                 text: {
130 |                     "movie-character": ".character a"
131 |                 },
132 |                 attribute: {
133 |                     "movie-actor": [".primary_photo > a > img", "alt"]
134 |                 }
135 |             }
136 |         }
137 |     });
138 | 
139 |     console.log(result);
140 | 
141 | })();
142 | ```
143 | 
144 | For elements declared as singleton, a single value (a string) is returned; for elements declared
145 | as collection, an array of values is returned.
146 | 
147 | ```js
148 | {
149 |   "movie-rating-value": "8.9",
150 |   "movie-character": [
151 |      "Pumpkin", "Honey Bunny", "Waitress", //...
152 |    ],
153 |   "movie-title": "Pulp Fiction (1994) - IMDb",
154 |   "movie-actor": [
155 |      "Tim Roth", "Amanda Plummer", "Laura Lovelace", //...
156 |    ]
157 | }
158 | ```
159 | 


--------------------------------------------------------------------------------
/examples/missing_parameters.js:
--------------------------------------------------------------------------------
 1 | let webscraper = require('../index.js');
 2 | 
 3 | (async () => {
 4 |     
 5 |     try {
 6 |         let result = await webscraper.scrape({ url: 'url' });
 7 |         console.log(result);
 8 |     } catch (e) {
 9 |         console.log(e);
10 |     }
11 | })();
12 | 
13 | (async () => {
14 |     
15 |     try {
16 |         let result = await webscraper.scrape({ tags: {} });
17 |         console.log(result);
18 |     } catch (e) {
19 |         console.log(e);
20 |     }
21 | })();
22 | 
23 | (async () => {
24 |     
25 |     try {
26 |         let result = await webscraper.scrape();
27 |         console.log(result);
28 |     } catch (e) {
29 |         console.log(e);
30 |     }
31 | })();
32 | 


--------------------------------------------------------------------------------
/examples/pulp_fiction.js:
--------------------------------------------------------------------------------
 1 | let webscraper = require('../index.js');
 2 | 
 3 | (async () => {
 4 |     
 5 |     let result = await webscraper.scrape({
 6 |         url: 'https://www.imdb.com/title/tt0110912/',
 7 |         tags: {
 8 |             singleton: {
 9 |                 text: {
10 |                     "movie-rating-value": 'span[itemprop="ratingValue"]'
11 |                 },
12 |                 attribute: {
13 |                     "movie-title": ["meta[property='og:title']", "content"]
14 |                 }
15 |             },
16 |             collection: {
17 |                 text: {
18 |                     "movie-character": ".character a"
19 |                 },
20 |                 attribute: {
21 |                     "movie-actor": [".primary_photo > a > img", "alt"]
22 |                 }
23 |             }
24 |         }
25 |     });
26 | 
27 |     console.log(result);
28 | })();
29 | 
30 | (async () => {
31 |     
32 |     let result = await webscraper.scrape({
33 |         url: 'https://www.imdb.com/title/tt0110912/',
34 |         tags: {
35 |             text: {
36 |                 "movie-rating-value": 'span[itemprop="ratingValue"]',
37 |                 "movie-character": ".character a"
38 |             },
39 |             attribute: {
40 |                 "movie-title": ["meta[property='og:title']", "content"],
41 |                 "movie-actor": [".primary_photo > a > img", "alt"]
42 |             }
43 |         }
44 |     });
45 | 
46 |     console.log(result);
47 | })();
48 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | const request = require('request-promise');
  2 | 
  3 | const jsdom = require('jsdom');
  4 | 
  5 | const {
  6 |     JSDOM
  7 | } = jsdom;
  8 | 
  9 | /*
 10 |  * TODO: if names are the same merge?
 11 |  */
 12 | exports.scrape = async (params) => {
 13 | 
 14 |     if (!params) {
 15 |        throw new Error('Parameters missing!');
 16 |     }
 17 | 
 18 |     if (!params.url || !params.tags) {
 19 |        throw new Error('Parameters missing!');
 20 |     }
 21 | 
 22 |     const {
 23 |         url,
 24 |         tags
 25 |     } = params;
 26 | 
 27 |     let body = await request(params.url);
 28 | 
 29 |     if (body === null || body === undefined)
 30 |         return;
 31 |     
 32 |     const vdom = new JSDOM(body);
 33 |     let $ = require('jquery')(vdom.window);
 34 | 
 35 |     let sample = {};
 36 | 
 37 |     /* set of items where a quantity of one is expected */
 38 |     if (tags.singleton) {
 39 | 
 40 |         Object.keys(tags.singleton.text).forEach(tag => {
 41 | 
 42 |             sample[tag] = $(tags.singleton.text[tag]).text();
 43 |         });
 44 | 
 45 |         Object.keys(tags.singleton.attribute).forEach(tag => {
 46 | 
 47 |             let query = tags.singleton.attribute[tag];
 48 |             sample[tag] = $(query[0]).attr(query[1]);
 49 |         });
 50 |     }
 51 | 
 52 | 
 53 |     /* set of items where a quantity of >1 is possible */
 54 |     if (tags.collection) {
 55 | 
 56 |         Object.keys(tags.collection.text).forEach(tag => {
 57 | 
 58 |             sample[tag] = [];
 59 |             $(tags.collection.text[tag]).each(function(index, item) {
 60 |                 sample[tag].push(item.textContent);
 61 |             });
 62 |         });
 63 | 
 64 |         Object.keys(tags.collection.attribute).forEach(tag => {
 65 | 
 66 |             sample[tag] = [];
 67 |             let query = tags.collection.attribute[tag];
 68 |             $(query[0]).each(function() { // no syntactic sugar here (won't work)!
 69 | 
 70 |                 let val = $(this).attr(query[1]);
 71 |                 if (val) {
 72 |                     sample[tag].push(val);
 73 |                 }
 74 |             });
 75 |         });
 76 |     }
 77 | 
 78 |     /* every item specified is simply put into an array */
 79 |     if (tags.text) { 
 80 | 
 81 |         Object.keys(tags.text).forEach(tag => {
 82 | 
 83 |             sample[tag] = [];
 84 |             $(tags.text[tag]).each(function(index, item) {
 85 | 
 86 |                 sample[tag].push(item.textContent);
 87 |             });
 88 |         });
 89 |     }
 90 | 
 91 |     if (tags.attribute) {
 92 | 
 93 |         Object.keys(tags.attribute).forEach(tag => {
 94 | 
 95 |             sample[tag] = [];
 96 | 
 97 |             let query = tags.attribute[tag];
 98 |             $(query[0]).each(function() { // no syntactic sugar here (won't work)!
 99 | 
100 |                 let val = $(this).attr(query[1]);
101 |                 if (val) {
102 |                     sample[tag].push(val);
103 |                 }
104 |             });
105 |         });
106 |     }
107 | 
108 |     return sample;
109 | }
110 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "web-scraper-js",
 3 |   "version": "1.1.0",
 4 |   "description": "A lightweight and simple to use web scraping library.",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1"
 8 |   },
 9 |   "author": "Juliette Opdenplatz",
10 |   "license": "MIT",
11 |   "dependencies": {
12 |     "jquery": "^3.4.1",
13 |     "jsdom": "^15.1.1",
14 |     "request": "^2.88.0",
15 |     "request-promise": "^4.2.4"
16 |   },
17 |   "devDependencies": {},
18 |   "repository": {
19 |     "type": "git",
20 |     "url": "git+https://github.com/julietcetera/web-scraper-js.git"
21 |   },
22 |   "keywords": [
23 |     "web",
24 |     "scraping"
25 |   ],
26 |   "bugs": {
27 |     "url": "https://github.com/julietcetera/web-scraper-js/issues"
28 |   },
29 |   "homepage": "https://github.com/julietcetera/web-scraper-js#readme"
30 | }
31 | 


--------------------------------------------------------------------------------