├── SUq.gif
├── .gitignore
├── .npmignore
├── recipes
│   ├── request-options.js
│   ├── microformat-dump.js
│   ├── youtube.js
│   ├── generic.js
│   ├── images.js
│   └── wordpress-microformat.js
├── bin
│   ├── usage.txt
│   └── suq.js
├── lib
│   ├── parseMeta.js
│   ├── parseTwitterCard.js
│   ├── parseOpenGraph.js
│   ├── parseOembed.js
│   ├── parseTags.js
│   ├── cleanMicrodata.js
│   └── cleanMicroformats.js
├── tests
│   ├── cleanMicrodata.js
│   ├── parseTwitterCard.js
│   ├── parseMeta.js
│   ├── parseOembed.js
│   ├── cleanMicroformats.js
│   ├── parseTags.js
│   ├── parseOpenGraph.js
│   └── fixtures
│       ├── sample.js
│       ├── cleanedMicrodata.json
│       └── rawMicrodata.json
├── LICENSE
├── package.json
├── index.js
└── README.md
/SUq.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MattMcFarland/SUq/HEAD/SUq.gif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | .c9
4 | npm-debug.log
5 | tests/sites/*.json
6 | .npmrc
7 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | .c9
4 | npm-debug.log
5 | task.txt
6 | tests/sites/*.json
--------------------------------------------------------------------------------
/recipes/request-options.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 |
3 | suq('http://www.nytimes.com/2016/01/31/books/review/the-powers-that-were.html', function (err, data, body) {
4 |
5 | console.log(data);
6 |
7 | }, { jar: true });
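 8 |
 9 | // `{ jar: true }` is a standard `request` option (it enables a cookie jar);
10 | // going by this recipe's name, the third argument is presumably passed
11 | // straight through to the underlying `request` call.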
--------------------------------------------------------------------------------
/recipes/microformat-dump.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 | var cheerio = require('cheerio');
3 | var _ = require('lodash');
4 |
5 | suq('https://blog.agilebits.com/2015/06/17/1password-inter-process-communication-discussion/', function (err, data, body) {
6 |
7 | console.log(JSON.stringify(data));
8 |
9 | });
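10 |
11 | // Dumping the full result as JSON is the quickest way to inspect which
12 | // microformats (and other metadata) a page exposes before writing a more
13 | // targeted recipe.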
--------------------------------------------------------------------------------
/bin/usage.txt:
--------------------------------------------------------------------------------
1 | Usage: suq [url] {OPTIONS}
2 |
3 | Options:
4 |
5 | --url, -u Scrapes the url provided.
6 | Optionally url can be the first parameter.
7 |
8 | --output, -o Writes the scraped data to a file
9 | If unspecified, suq prints to stdout.
10 |
11 | --version, -v Displays version information.
12 |
13 | --help, -h Displays this message.
14 |
15 | Specify a parameter.
16 |
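17 | Examples:
18 |
19 |   suq https://example.com
20 |   suq --url https://example.com --output results.json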
--------------------------------------------------------------------------------
/recipes/youtube.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 |
3 | suq('https://www.youtube.com/watch?v=Xft3asYLKo0', function (err, data) {
4 |
5 | if (!err) {
6 | var props = data.microdata.items[0].properties;
7 | console.log('\n\ntitle:', props.name[0]);
8 | console.log('\nthumbnail:', props.thumbnailUrl[0]);
9 | console.log('embedURL:', props.embedURL[0]);
10 | console.log('\ndescription:', props.description[0]);
11 | console.log('\ndatePublished:', props.datePublished[0]);
12 | }
13 | });
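14 |
15 | // The properties read above come from the schema.org VideoObject microdata
16 | // that YouTube embeds in its watch pages; microdata values arrive as arrays,
17 | // hence the [0] indexing.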
--------------------------------------------------------------------------------
/lib/parseMeta.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('meta').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | if ($el.attr('name') && $el.attr('content')) {
16 | result[$el.attr('name')] = $el.attr('content');
17 | }
18 |
19 | });
20 |
21 | callback(null, result);
22 |
23 | } catch (e) {
24 | console.log(e);
25 | callback(e);
26 | }
27 |
28 | };
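29 |
30 | // `$` is expected to be a cheerio instance of the fetched page (the tests
31 | // build one in tests/fixtures/sample.js); every <meta name="..." content="...">
32 | // pair found in <head> ends up as a key/value on the result object.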
--------------------------------------------------------------------------------
/recipes/generic.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 | var cheerio = require('cheerio');
3 | var _ = require('lodash');
4 |
5 | suq('http://odonatagame.blogspot.com/2015/07/oh-thats-right-were-not-dead.html', function (err, data, body) {
6 |
7 | var $ = cheerio.load(body);
8 |
9 | var scraped = {
10 | title: data.meta.title || data.headers.h1[0],
11 | description: data.meta.description || $('p').text().replace(/([\r\n\t])+/ig,'').substring(0,255) +'...',
12 | images: _.sampleSize(data.images, 8) // _.sample no longer takes a count in lodash 4
13 | };
14 |
15 | console.log(scraped);
16 |
17 | });
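18 |
19 | // Falling back from <meta> description/title to the first heading and the
20 | // first 255 characters of paragraph text is a reasonable strategy for pages
21 | // that ship little or no metadata.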
--------------------------------------------------------------------------------
/tests/cleanMicrodata.js:
--------------------------------------------------------------------------------
1 | const test = require('tape');
2 | const cleanMicrodata = require('../lib/cleanMicrodata');
3 | const rawMicrodata = require('./fixtures/rawMicrodata.json');
4 | const cleanedMicrodata = require('./fixtures/cleanedMicrodata.json');
5 |
6 | test('cleanMicrodata.js', function (t) {
7 | t.plan(2);
8 |
9 | cleanMicrodata(rawMicrodata, (err, data) => {
10 | t.equal(err, null, 'should return callback without error');
11 | t.deepEqual(data, cleanedMicrodata, 'should return callback with cleaned microdata');
12 | });
13 | });
14 |
15 |
16 |
--------------------------------------------------------------------------------
/lib/parseTwitterCard.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('meta').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | if ($el.attr('name') && $el.attr('content') && $el.attr('name').indexOf('twitter:') > -1) {
16 | result[$el.attr('name')] = $el.attr('content');
17 | }
18 |
19 | });
20 |
21 | callback(null, result);
22 |
23 | } catch (e) {
24 | console.log(e);
25 | callback(e);
26 | }
27 |
28 | };
29 |
--------------------------------------------------------------------------------
/lib/parseOpenGraph.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('meta').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | if ($el.attr('property') && $el.attr('content') && $el.attr('property').indexOf('og:') > -1) {
16 | result[$el.attr('property')] = $el.attr('content');
17 | }
18 |
19 | });
20 |
21 | callback(null, result);
22 |
23 | } catch (e) {
24 | console.log(e);
25 | callback(e);
26 | }
27 |
28 | };
29 |
--------------------------------------------------------------------------------
/tests/parseTwitterCard.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseTwitterCard = require('../lib/parseTwitterCard');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | 'twitter:card': 'summary',
7 | 'twitter:site': '@nytimesbits',
8 | 'twitter:creator': '@nickbilton',
9 | }
10 |
11 | test('parseTwitterCard.js', function (t) {
12 | t.plan(2);
13 | parseTwitterCard(parsedHtml, (err, data) => {
14 | t.equal(err, null, 'should return callback without error');
15 | t.deepEqual(expected, data);
16 | })
17 | });
18 |
--------------------------------------------------------------------------------
/tests/parseMeta.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseMeta = require('../lib/parseMeta');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | description: 'Free Web tutorials',
7 | keywords: 'HTML,CSS,XML,JavaScript',
8 | author: 'Hege Refsnes',
9 | 'twitter:card': 'summary',
10 | 'twitter:site': '@nytimesbits',
11 | 'twitter:creator': '@nickbilton'
12 | }
13 |
14 | test('parseMeta.js', function (t) {
15 | t.plan(2);
16 | parseMeta(parsedHtml, (err, data) => {
17 | t.equal(err, null, 'should return callback without error');
18 | t.deepEqual(expected, data);
19 | })
20 | });
21 |
22 |
23 |
--------------------------------------------------------------------------------
/tests/parseOembed.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseOembed = require('../lib/parseOembed');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 |
6 | const expected = {
7 | 'text/xml+oembed': "https://namchey.com/api/oembed?url=https%3A%2F%2Fnamchey.com%2Fitineraries%2Ftilicho&format=xml",
8 | 'text/json+oembed': "https://namchey.com/api/oembed?url=https%3A%2F%2Fnamchey.com%2Fitineraries%2Ftilicho&format=json"
9 | };
10 |
11 | test('parseOembed.js', function (t) {
12 | t.plan(2);
13 | parseOembed(parsedHtml, (err, data) => {
14 | t.equal(err, null, 'should return callback without error');
15 | t.deepEqual(expected, data);
16 | });
17 | });
18 |
--------------------------------------------------------------------------------
/recipes/images.js:
--------------------------------------------------------------------------------
1 | // How to scrape image tag URLS from a website:
2 |
3 | var suq = require('suq');
4 | var _ = require('lodash');
5 |
6 | var url = "http://www.ufirstgroup.com";
7 |
8 | suq(url, function (err, json, body) {
9 |
10 | if (!err) {
11 | var images = json.images;
12 |
13 | console.log('\nThe Image tag URLs in the page, converted to json: \n\n', JSON.stringify(images, null, 2));
14 |
15 | console.log('\n\nList of individual Image tag URLs, pulled from the JSON using Lodash and converted into valid HTML: \n\n');
16 |
17 | _.each(images, function (src) {
18 | console.log('<img src="' + src + '" />');
19 | });
20 |
21 | }
22 |
23 | });
--------------------------------------------------------------------------------
/tests/cleanMicroformats.js:
--------------------------------------------------------------------------------
1 | const fs = require("fs");
2 | const test = require("tape");
3 | const cleanMicroformats = require("../lib/cleanMicroformats");
4 |
5 | test("cleanMicroformats.js", function(t) {
6 | t.plan(2);
7 | const body = '<a class="h-card" href="http://glennjones.net">Glenn</a>';
8 |
9 | cleanMicroformats(body, (err, data) => {
10 | t.equal(err, null, "should return callback without error");
11 |
12 | const expected = [
13 | {
14 | id: 1,
15 | type: "h-card",
16 | props: { name: "Glenn", url: "http://glennjones.net" },
17 | path: ["0", "type"],
18 | length: 2,
19 | level: 2
20 | }
21 | ];
22 | t.deepEqual(data, expected);
23 | });
24 | });
25 |
--------------------------------------------------------------------------------
/tests/parseTags.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseTags = require('../lib/parseTags');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | title: 'My cool website',
7 | headers: { h1: [ 'Lorem' ], h2: [ 'images' ], h3: [], h4: [], h5: [], h6: [] },
8 | images: [ '/cat1.jpg', '/cat2.jpg', '/cat3.jpg', '/cat4.jpg' ],
9 | links: [ { text: '', title: undefined, href: '#' },
10 | { text: '', title: undefined, href: '#' },
11 | { text: '', title: undefined, href: '#' },
12 | { text: '', title: undefined, href: '#' },
13 | { text: 'more stuff', title: undefined, href: '/more' } ]
14 | }
15 |
16 | test('parseTags.js', function (t) {
17 | t.plan(2);
18 | parseTags(parsedHtml, (err, data) => {
19 | t.equal(err, null, 'should return callback without error');
20 | t.deepEqual(expected, data);
21 | })
22 | });
23 |
--------------------------------------------------------------------------------
/tests/parseOpenGraph.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseOpenGraph = require('../lib/parseOpenGraph');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | 'og:url': 'http://bits.blogs.nytimes.com/2011/12/08/a-twitter-for-my-sister/',
7 | 'og:title': 'A Twitter for My Sister',
8 | 'og:description': 'In the early days, Twitter grew so quickly that it was almost impossible to add new features because engineers spent their time trying to keep the rocket ship from stalling.',
9 | 'og:image': 'http://graphics8.nytimes.com/images/2011/12/08/technology/bits-newtwitter/bits-newtwitter-tmagArticle.jpg'
10 | }
11 |
12 | test('parseOpenGraph.js', function (t) {
13 | t.plan(2);
14 | parseOpenGraph(parsedHtml, (err, data) => {
15 | t.equal(err, null, 'should return callback without error');
16 | t.deepEqual(expected, data);
17 | })
18 | });
19 |
--------------------------------------------------------------------------------
/lib/parseOembed.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('link').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | //xml type
16 | if ($el.attr('type') && $el.attr('href') && $el.attr('rel') === 'alternate' && $el.attr('type').indexOf('text/xml+oembed') > -1) {
17 | result[$el.attr('type')] = $el.attr('href');
18 | }
19 |
20 | //json type
21 | if ($el.attr('type') && $el.attr('href') && $el.attr('rel') === 'alternate' && ($el.attr('type').indexOf('text/json+oembed') > -1 || $el.attr('type').indexOf('application/json+oembed') > -1)) {
22 | result[$el.attr('type')] = $el.attr('href');
23 | }
24 |
25 | });
26 |
27 | callback(null, result);
28 |
29 | } catch (e) {
30 | console.log(e);
31 | callback(e);
32 | }
33 |
34 | };
35 |
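36 | // oEmbed providers advertise their endpoints with <link rel="alternate"
37 | // type="application/json+oembed" href="..."> (or the older text/xml+oembed
38 | // and text/json+oembed variants), which is exactly what this parser collects.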
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This software is released under the MIT license:
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so,
8 | subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
--------------------------------------------------------------------------------
/recipes/wordpress-microformat.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 | var _ = require('lodash');
3 |
4 | function hget(model) {
5 | if (model[0]) {
6 | return (model[0])
7 | } else {
8 | return {}
9 | }
10 | }
11 |
12 |
13 | suq('/a-wordpress/blog-post', function (err, data) {
14 |
15 | var feed, posts, postData, postProps, author, post;
16 |
17 | feed = hget(_.filter(data.microformat.items, _.matches({type: ["h-feed"]})));
18 |
19 | if (feed) {
20 |
21 | posts = _.filter(feed.children, _.matches({type: ["h-entry"]}));
22 |
23 | if (posts) {
24 | postData = hget(posts);
25 | }
26 |
27 | if (postData) {
28 |
29 | postProps = postData.properties;
30 |
31 | if (postProps) {
32 |
33 | author = hget(_.filter(postProps.author, _.matches({type: ["h-card"]})));
34 |
35 | post = {
36 | title: hget(postProps.name),
37 | category: hget(postProps.category),
38 | excerpt: hget(postProps.content).value.substring(0, 150) + '...',
39 | author: hget(author.properties.name),
40 | url: hget(data.microformat.rels.canonical),
41 | images: _.sampleSize(data.images, 4)
42 | };
43 |
44 | console.log(post);
45 |
46 | }
47 |
48 | }
49 | }
50 |
51 | });
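52 |
53 | // WordPress themes that emit microformats2 mark the post list up as an h-feed
54 | // of h-entry children, each carrying an h-card author; the recipe above walks
55 | // that tree and prints a summary object for the first post.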
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "suq",
3 | "main": "./index",
4 | "version": "1.4.1",
5 | "description": "A Scraping Utility for lazy people",
6 | "scripts": {
7 | "test": "tape tests/*.js | tap-diff"
8 | },
9 | "keywords": [
10 | "htmlparser",
11 | "jquery",
12 | "selector",
13 | "scraper",
14 | "parser",
15 | "html",
16 | "microdata",
17 | "microformats",
18 | "opengraph",
19 | "twittercard",
20 | "meta"
21 | ],
22 | "bin": {
23 | "suq": "./bin/suq.js"
24 | },
25 | "files": [
26 | "index.js",
27 | "lib",
28 | "bin",
29 | "recipes"
30 | ],
31 | "author": {
32 | "name": "Matt McFarland",
33 | "email": "contact@mattmcfarland.com"
34 | },
35 | "contributors": [
36 | {
37 | "name": "Matt McFarland",
38 | "email": "contact@mattmcfarland.com"
39 | },
40 | {
41 | "name": "Tom Sutton",
42 | "url": "https://github.com/tomsutton1984"
43 | },
44 | {
45 | "name": "Oscar Illescas",
46 | "url": "https://github.com/oillescas"
47 | },
48 | {
49 | "name": "Gary Moon",
50 | "url": "https://github.com/garymoon"
51 | }
52 | ],
53 | "license": "MIT",
54 | "dependencies": {
55 | "cheerio": "^0.22.0",
56 | "lodash": "^4.17.11",
57 | "microdata-node": "^1.0.0",
58 | "microformat-node": "^2.0.1",
59 | "minimist": "^1.2.0",
60 | "request": "^2.88.0",
61 | "traverse": "^0.6.6",
62 | "xss": "^1.0.6"
63 | },
64 | "repository": {
65 | "type": "git",
66 | "url": "git://github.com/MattMcFarland/SUq.git"
67 | },
68 | "devDependencies": {
69 | "chalk": "^2.4.2",
70 | "tap-diff": "^0.1.1",
71 | "tape": "^4.10.2"
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/lib/parseTags.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 | try {
5 |
6 | var
7 | $head = $('head'),
8 | $body = $('body'),
9 | result = {
10 | title: '',
11 | headers: { h1: [], h2: [], h3: [], h4: [], h5: [], h6: [] },
12 | images: [],
13 | links :[]
14 | };
15 |
16 | result.title = $head.find('title').text().replace(/\n|\t/g, "");
17 |
18 | // collect the text of every heading level, h1 through h6
19 | ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].forEach(function (tag) {
20 | $body.find(tag).each(function(i, el) {
21 | result.headers[tag].push($(el).text().replace(/\n|\t/g, ""));
22 | });
23 | });
24 |
25 | $body.find('img').each(function(i, el) {
26 | result.images.push($(el).attr('src'));
27 | });
28 |
29 | $body.find('a[href!=""]').each(function(i, el) {
30 | result.links.push({
31 | text: $(el).text().trim(),
32 | title: $(el).attr('title'),
33 | href: $(el).attr('href')
34 | });
35 | });
36 |
37 | result.links = _.compact(result.links);
38 | result.images = _.compact(result.images);
39 |
40 | callback(null, result);
41 |
42 | } catch (e) {
43 | callback(e);
44 | }
45 |
46 | };
--------------------------------------------------------------------------------
/bin/suq.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | var argv = require('minimist')(process.argv.slice(2)),
4 | url = argv.url || argv.u || process.argv.slice(2)[0],
5 | output = argv.output || argv.o || false,
6 | fs = require('fs'),
7 | path = require('path'),
8 | suq = require('../');
9 |
10 |
11 | if (argv._[0] === 'help' || argv.h || argv.help
12 | || (process.argv.length <= 2 && process.stdin.isTTY)) {
13 | return fs.createReadStream(__dirname + '/usage.txt')
14 | .pipe(process.stdout)
15 | .on('close', function () { process.exit(1) });
16 | }
17 |
18 | if (argv.version || argv.v) {
19 | return console.log(require('../package.json').version);
20 | }
21 |
22 |
23 | if (url) {
24 |
25 | if (url.indexOf('http') !== 0) {
26 | console.error('SUq Error ['+ url + ']\n', 'Be sure to use http:// or https://');
27 | } else {
28 | suq(url, function (err, data) {
29 |
30 | if (!err) {
31 | if (!data) {
32 | console.error('SUq Error ['+ url + ']\n', 'response empty');
33 | } else {
34 | if (!output) {
35 | console.log(JSON.stringify(data, null, 2));
36 | } else {
37 | fs.writeFile(path.resolve(process.cwd(), output), JSON.stringify(data), function(err){
38 | if (err) { console.error('SUq Error ['+ url + ']\n', err); } else { console.log('File ' + output + ' successfully written!'); }
39 | })
40 | }
41 | }
42 | } else {
43 | console.log('SUq Error ['+ url + ']\n', err);
44 | }
45 | });
46 | }
47 |
48 |
49 | } else {
50 | return fs.createReadStream(__dirname + '/usage.txt')
51 | .pipe(process.stdout)
52 | .on('close', function () { process.exit(1) });
53 | }
54 |
55 |
--------------------------------------------------------------------------------
/tests/fixtures/sample.js:
--------------------------------------------------------------------------------
1 | const html = `
2 |
3 |
Ipsum???
24 |