├── SUq.gif
├── .gitignore
├── .npmignore
├── recipes
│   ├── request-options.js
│   ├── microformat-dump.js
│   ├── youtube.js
│   ├── generic.js
│   ├── images.js
│   └── wordpress-microformat.js
├── bin
│   ├── usage.txt
│   └── suq.js
├── lib
│   ├── parseMeta.js
│   ├── parseTwitterCard.js
│   ├── parseOpenGraph.js
│   ├── parseOembed.js
│   ├── parseTags.js
│   ├── cleanMicrodata.js
│   └── cleanMicroformats.js
├── tests
│   ├── cleanMicrodata.js
│   ├── parseTwitterCard.js
│   ├── parseMeta.js
│   ├── parseOembed.js
│   ├── cleanMicroformats.js
│   ├── parseTags.js
│   ├── parseOpenGraph.js
│   └── fixtures
│       ├── sample.js
│       ├── cleanedMicrodata.json
│       └── rawMicrodata.json
├── LICENSE
├── package.json
├── index.js
└── README.md
/SUq.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MattMcFarland/SUq/HEAD/SUq.gif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | .c9
4 | npm-debug.log
5 | tests/sites/*.json
6 | .npmrc
7 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | .c9
4 | npm-debug.log
5 | task.txt
6 | tests/sites/*.json
--------------------------------------------------------------------------------
/recipes/request-options.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 |
3 | suq('http://www.nytimes.com/2016/01/31/books/review/the-powers-that-were.html', function (err, data, body) {
4 |
5 | console.log(data);
6 |
7 | }, { jar: true });
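 8 |
 9 | // `{ jar: true }` is a standard `request` option (it enables a cookie jar);
10 | // going by this recipe's name, the third argument is presumably passed
11 | // straight through to the underlying `request` call.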
--------------------------------------------------------------------------------
/recipes/microformat-dump.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 | var cheerio = require('cheerio');
3 | var _ = require('lodash');
4 |
5 | suq('https://blog.agilebits.com/2015/06/17/1password-inter-process-communication-discussion/', function (err, data, body) {
6 |
7 | console.log(JSON.stringify(data));
8 |
9 | });
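10 |
11 | // Dumping the full result as JSON is the quickest way to inspect which
12 | // microformats (and other metadata) a page exposes before writing a more
13 | // targeted recipe.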
--------------------------------------------------------------------------------
/bin/usage.txt:
--------------------------------------------------------------------------------
1 | Usage: suq [url] {OPTIONS}
2 |
3 | Options:
4 |
5 | --url, -u Scrapes the url provided.
6 | Optionally url can be the first parameter.
7 |
8 | --output, -o Writes the scraped data to a file
9 | If unspecified, suq prints to stdout.
10 |
11 | --version, -v Displays version information.
12 |
13 | --help, -h Displays this message.
14 |
15 | Specify a parameter.
16 |
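17 | Examples:
18 |
19 |   suq https://example.com
20 |   suq --url https://example.com --output results.json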
--------------------------------------------------------------------------------
/recipes/youtube.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 |
3 | suq('https://www.youtube.com/watch?v=Xft3asYLKo0', function (err, data) {
4 |
5 | if (!err) {
6 | var props = data.microdata.items[0].properties;
7 | console.log('\n\ntitle:', props.name[0]);
8 | console.log('\nthumbnail:', props.thumbnailUrl[0]);
9 | console.log('embedURL:', props.embedURL[0]);
10 | console.log('\ndescription:', props.description[0]);
11 | console.log('\ndatePublished:', props.datePublished[0]);
12 | }
13 | });
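14 |
15 | // The properties read above come from the schema.org VideoObject microdata
16 | // that YouTube embeds in its watch pages; microdata values arrive as arrays,
17 | // hence the [0] indexing.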
--------------------------------------------------------------------------------
/lib/parseMeta.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('meta').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | if ($el.attr('name') && $el.attr('content')) {
16 | result[$el.attr('name')] = $el.attr('content');
17 | }
18 |
19 | });
20 |
21 | callback(null, result);
22 |
23 | } catch (e) {
24 | console.log(e);
25 | callback(e);
26 | }
27 |
28 | };
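29 |
30 | // `$` is expected to be a cheerio instance of the fetched page (the tests
31 | // build one in tests/fixtures/sample.js); every <meta name="..." content="...">
32 | // pair found in <head> ends up as a key/value on the result object.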
--------------------------------------------------------------------------------
/recipes/generic.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 | var cheerio = require('cheerio');
3 | var _ = require('lodash');
4 |
5 | suq('http://odonatagame.blogspot.com/2015/07/oh-thats-right-were-not-dead.html', function (err, data, body) {
6 |
7 | var $ = cheerio.load(body);
8 |
9 | var scraped = {
10 | title: data.meta.title || data.headers.h1[0],
11 | description: data.meta.description || $('p').text().replace(/([\r\n\t])+/ig,'').substring(0,255) +'...',
12 | images: _.sampleSize(data.images, 8) // _.sample no longer takes a count in lodash 4
13 | };
14 |
15 | console.log(scraped);
16 |
17 | });
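18 |
19 | // Falling back from <meta> description/title to the first heading and the
20 | // first 255 characters of paragraph text is a reasonable strategy for pages
21 | // that ship little or no metadata.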
--------------------------------------------------------------------------------
/tests/cleanMicrodata.js:
--------------------------------------------------------------------------------
1 | const test = require('tape');
2 | const cleanMicrodata = require('../lib/cleanMicrodata');
3 | const rawMicrodata = require('./fixtures/rawMicrodata.json');
4 | const cleanedMicrodata = require('./fixtures/cleanedMicrodata.json');
5 |
6 | test('cleanMicrodata.js', function (t) {
7 | t.plan(2);
8 |
9 | cleanMicrodata(rawMicrodata, (err, data) => {
10 | t.equal(err, null, 'should return callback without error');
11 | t.deepEqual(data, cleanedMicrodata, 'should return callback with cleaned microdata');
12 | });
13 | });
14 |
15 |
16 |
--------------------------------------------------------------------------------
/lib/parseTwitterCard.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('meta').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | if ($el.attr('name') && $el.attr('content') && $el.attr('name').indexOf('twitter:') > -1) {
16 | result[$el.attr('name')] = $el.attr('content');
17 | }
18 |
19 | });
20 |
21 | callback(null, result);
22 |
23 | } catch (e) {
24 | console.log(e);
25 | callback(e);
26 | }
27 |
28 | };
29 |
--------------------------------------------------------------------------------
/lib/parseOpenGraph.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('meta').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | if ($el.attr('property') && $el.attr('content') && $el.attr('property').indexOf('og:') > -1) {
16 | result[$el.attr('property')] = $el.attr('content');
17 | }
18 |
19 | });
20 |
21 | callback(null, result);
22 |
23 | } catch (e) {
24 | console.log(e);
25 | callback(e);
26 | }
27 |
28 | };
29 |
--------------------------------------------------------------------------------
/tests/parseTwitterCard.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseTwitterCard = require('../lib/parseTwitterCard');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | 'twitter:card': 'summary',
7 | 'twitter:site': '@nytimesbits',
8 | 'twitter:creator': '@nickbilton',
9 | }
10 |
11 | test('parseTwitterCard.js', function (t) {
12 | t.plan(2);
13 | parseTwitterCard(parsedHtml, (err, data) => {
14 | t.equal(err, null, 'should return callback without error');
15 | t.deepEqual(expected, data);
16 | })
17 | });
18 |
--------------------------------------------------------------------------------
/tests/parseMeta.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseMeta = require('../lib/parseMeta');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | description: 'Free Web tutorials',
7 | keywords: 'HTML,CSS,XML,JavaScript',
8 | author: 'Hege Refsnes',
9 | 'twitter:card': 'summary',
10 | 'twitter:site': '@nytimesbits',
11 | 'twitter:creator': '@nickbilton'
12 | }
13 |
14 | test('parseMeta.js', function (t) {
15 | t.plan(2);
16 | parseMeta(parsedHtml, (err, data) => {
17 | t.equal(err, null, 'should return callback without error');
18 | t.deepEqual(expected, data);
19 | })
20 | });
21 |
22 |
23 |
--------------------------------------------------------------------------------
/tests/parseOembed.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseOembed = require('../lib/parseOembed');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 |
6 | const expected = {
7 | 'text/xml+oembed': "https://namchey.com/api/oembed?url=https%3A%2F%2Fnamchey.com%2Fitineraries%2Ftilicho&format=xml",
8 | 'text/json+oembed': "https://namchey.com/api/oembed?url=https%3A%2F%2Fnamchey.com%2Fitineraries%2Ftilicho&format=json"
9 | };
10 |
11 | test('parseOembed.js', function (t) {
12 | t.plan(2);
13 | parseOembed(parsedHtml, (err, data) => {
14 | t.equal(err, null, 'should return callback without error');
15 | t.deepEqual(expected, data);
16 | });
17 | });
18 |
--------------------------------------------------------------------------------
/recipes/images.js:
--------------------------------------------------------------------------------
1 | // How to scrape image tag URLS from a website:
2 |
3 | var suq = require('suq');
4 | var _ = require('lodash');
5 |
6 | var url = "http://www.ufirstgroup.com";
7 |
8 | suq(url, function (err, json, body) {
9 |
10 | if (!err) {
11 | var images = json.images;
12 |
13 | console.log('\nThe Image tag URLs in the page, converted to json: \n\n', JSON.stringify(images, null, 2));
14 |
15 | console.log('\n\nList of individual Image tag URLs, pulled from the JSON using Lodash and converted into valid HTML: \n\n');
16 |
17 | _.each(images, function (src) {
18 | console.log('<img src="' + src + '" />');
19 | });
20 |
21 | }
22 |
23 | });
--------------------------------------------------------------------------------
/tests/cleanMicroformats.js:
--------------------------------------------------------------------------------
1 | const fs = require("fs");
2 | const test = require("tape");
3 | const cleanMicroformats = require("../lib/cleanMicroformats");
4 |
5 | test("cleanMicroformats.js", function(t) {
6 | t.plan(2);
7 | const body = '<a class="h-card" href="http://glennjones.net">Glenn</a>';
8 |
9 | cleanMicroformats(body, (err, data) => {
10 | t.equal(err, null, "should return callback without error");
11 |
12 | const expected = [
13 | {
14 | id: 1,
15 | type: "h-card",
16 | props: { name: "Glenn", url: "http://glennjones.net" },
17 | path: ["0", "type"],
18 | length: 2,
19 | level: 2
20 | }
21 | ];
22 | t.deepEqual(data, expected);
23 | });
24 | });
25 |
--------------------------------------------------------------------------------
/tests/parseTags.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseTags = require('../lib/parseTags');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | title: 'My cool website',
7 | headers: { h1: [ 'Lorem' ], h2: [ 'images' ], h3: [], h4: [], h5: [], h6: [] },
8 | images: [ '/cat1.jpg', '/cat2.jpg', '/cat3.jpg', '/cat4.jpg' ],
9 | links: [ { text: '', title: undefined, href: '#' },
10 | { text: '', title: undefined, href: '#' },
11 | { text: '', title: undefined, href: '#' },
12 | { text: '', title: undefined, href: '#' },
13 | { text: 'more stuff', title: undefined, href: '/more' } ]
14 | }
15 |
16 | test('parseTags.js', function (t) {
17 | t.plan(2);
18 | parseTags(parsedHtml, (err, data) => {
19 | t.equal(err, null, 'should return callback without error');
20 | t.deepEqual(expected, data);
21 | })
22 | });
23 |
--------------------------------------------------------------------------------
/tests/parseOpenGraph.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const test = require('tape');
3 | const parseOpenGraph = require('../lib/parseOpenGraph');
4 | const { $: parsedHtml } = require('./fixtures/sample');
5 | const expected = {
6 | 'og:url': 'http://bits.blogs.nytimes.com/2011/12/08/a-twitter-for-my-sister/',
7 | 'og:title': 'A Twitter for My Sister',
8 | 'og:description': 'In the early days, Twitter grew so quickly that it was almost impossible to add new features because engineers spent their time trying to keep the rocket ship from stalling.',
9 | 'og:image': 'http://graphics8.nytimes.com/images/2011/12/08/technology/bits-newtwitter/bits-newtwitter-tmagArticle.jpg'
10 | }
11 |
12 | test('parseOpenGraph.js', function (t) {
13 | t.plan(2);
14 | parseOpenGraph(parsedHtml, (err, data) => {
15 | t.equal(err, null, 'should return callback without error');
16 | t.deepEqual(expected, data);
17 | })
18 | });
19 |
--------------------------------------------------------------------------------
/lib/parseOembed.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 |
5 | try {
6 |
7 | var
8 | $head = $('head'),
9 | result = {};
10 |
11 |
12 | $head.find('link').each(function(i, el) {
13 | var $el = $(el);
14 |
15 | //xml type
16 | if ($el.attr('type') && $el.attr('href') && $el.attr('rel') === 'alternate' && $el.attr('type').indexOf('text/xml+oembed') > -1) {
17 | result[$el.attr('type')] = $el.attr('href');
18 | }
19 |
20 | //json type
21 | if ($el.attr('type') && $el.attr('href') && $el.attr('rel') === 'alternate' && ($el.attr('type').indexOf('text/json+oembed') > -1 || $el.attr('type').indexOf('application/json+oembed') > -1)) {
22 | result[$el.attr('type')] = $el.attr('href');
23 | }
24 |
25 | });
26 |
27 | callback(null, result);
28 |
29 | } catch (e) {
30 | console.log(e);
31 | callback(e);
32 | }
33 |
34 | };
35 |
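36 | // oEmbed providers advertise their endpoints with <link rel="alternate"
37 | // type="application/json+oembed" href="..."> (or the older text/xml+oembed
38 | // and text/json+oembed variants), which is exactly what this parser collects.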
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This software is released under the MIT license:
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so,
8 | subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
--------------------------------------------------------------------------------
/recipes/wordpress-microformat.js:
--------------------------------------------------------------------------------
1 | var suq = require('suq');
2 | var _ = require('lodash');
3 |
4 | function hget(model) {
5 | if (model[0]) {
6 | return (model[0])
7 | } else {
8 | return {}
9 | }
10 | }
11 |
12 |
13 | suq('/a-wordpress/blog-post', function (err, data) {
14 |
15 | var feed, posts, postData, postProps, author, post;
16 |
17 | feed = hget(_.filter(data.microformat.items, _.matches({type: ["h-feed"]})));
18 |
19 | if (feed) {
20 |
21 | posts = _.filter(feed.children, _.matches({type: ["h-entry"]}));
22 |
23 | if (posts) {
24 | postData = hget(posts);
25 | }
26 |
27 | if (postData) {
28 |
29 | postProps = postData.properties;
30 |
31 | if (postProps) {
32 |
33 | author = hget(_.filter(postProps.author, _.matches({type: ["h-card"]})));
34 |
35 | post = {
36 | title: hget(postProps.name),
37 | category: hget(postProps.category),
38 | excerpt: hget(postProps.content).value.substring(0, 150) + '...',
39 | author: hget(author.properties.name),
40 | url: hget(data.microformat.rels.canonical),
41 | images: _.sampleSize(data.images, 4)
42 | };
43 |
44 | console.log(post);
45 |
46 | }
47 |
48 | }
49 | }
50 |
51 | });
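52 |
53 | // WordPress themes that emit microformats2 mark the post list up as an h-feed
54 | // of h-entry children, each carrying an h-card author; the recipe above walks
55 | // that tree and prints a summary object for the first post.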
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "suq",
3 | "main": "./index",
4 | "version": "1.4.1",
5 | "description": "A Scraping Utility for lazy people",
6 | "scripts": {
7 | "test": "tape tests/*.js | tap-diff"
8 | },
9 | "keywords": [
10 | "htmlparser",
11 | "jquery",
12 | "selector",
13 | "scraper",
14 | "parser",
15 | "html",
16 | "microdata",
17 | "microformats",
18 | "opengraph",
19 | "twittercard",
20 | "meta"
21 | ],
22 | "bin": {
23 | "suq": "./bin/suq.js"
24 | },
25 | "files": [
26 | "index.js",
27 | "lib",
28 | "bin",
29 | "recipes"
30 | ],
31 | "author": {
32 | "name": "Matt McFarland",
33 | "email": "contact@mattmcfarland.com"
34 | },
35 | "contributors": [
36 | {
37 | "name": "Matt McFarland",
38 | "email": "contact@mattmcfarland.com"
39 | },
40 | {
41 | "name": "Tom Sutton",
42 | "url": "https://github.com/tomsutton1984"
43 | },
44 | {
45 | "name": "Oscar Illescas",
46 | "url": "https://github.com/oillescas"
47 | },
48 | {
49 | "name": "Gary Moon",
50 | "url": "https://github.com/garymoon"
51 | }
52 | ],
53 | "license": "MIT",
54 | "dependencies": {
55 | "cheerio": "^0.22.0",
56 | "lodash": "^4.17.11",
57 | "microdata-node": "^1.0.0",
58 | "microformat-node": "^2.0.1",
59 | "minimist": "^1.2.0",
60 | "request": "^2.88.0",
61 | "traverse": "^0.6.6",
62 | "xss": "^1.0.6"
63 | },
64 | "repository": {
65 | "type": "git",
66 | "url": "git://github.com/MattMcFarland/SUq.git"
67 | },
68 | "devDependencies": {
69 | "chalk": "^2.4.2",
70 | "tap-diff": "^0.1.1",
71 | "tape": "^4.10.2"
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/lib/parseTags.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
3 | module.exports = function ($, callback) {
4 | try {
5 |
6 | var
7 | $head = $('head'),
8 | $body = $('body'),
9 | result = {
10 | title: '',
11 | headers: { h1: [], h2: [], h3: [], h4: [], h5: [], h6: [] },
12 | images: [],
13 | links :[]
14 | };
15 |
16 | result.title = $head.find('title').text().replace(/\n|\t/g, "");
17 |
18 | // collect the text of every heading level, h1 through h6
19 | ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].forEach(function (tag) {
20 | $body.find(tag).each(function(i, el) {
21 | result.headers[tag].push($(el).text().replace(/\n|\t/g, ""));
22 | });
23 | });
24 |
25 | $body.find('img').each(function(i, el) {
26 | result.images.push($(el).attr('src'));
27 | });
28 |
29 | $body.find('a[href!=""]').each(function(i, el) {
30 | result.links.push({
31 | text: $(el).text().trim(),
32 | title: $(el).attr('title'),
33 | href: $(el).attr('href')
34 | });
35 | });
36 |
37 | result.links = _.compact(result.links);
38 | result.images = _.compact(result.images);
39 |
40 | callback(null, result);
41 |
42 | } catch (e) {
43 | callback(e);
44 | }
45 |
46 | };
--------------------------------------------------------------------------------
/bin/suq.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | var argv = require('minimist')(process.argv.slice(2)),
4 | url = argv.url || argv.u || process.argv.slice(2)[0],
5 | output = argv.output || argv.o || false,
6 | fs = require('fs'),
7 | path = require('path'),
8 | suq = require('../');
9 |
10 |
11 | if (argv._[0] === 'help' || argv.h || argv.help
12 | || (process.argv.length <= 2 && process.stdin.isTTY)) {
13 | return fs.createReadStream(__dirname + '/usage.txt')
14 | .pipe(process.stdout)
15 | .on('close', function () { process.exit(1) });
16 | }
17 |
18 | if (argv.version || argv.v) {
19 | return console.log(require('../package.json').version);
20 | }
21 |
22 |
23 | if (url) {
24 |
25 | if (url.indexOf('http') !== 0) {
26 | console.error('SUq Error ['+ url + ']\n', 'Be sure to use http:// or https://');
27 | } else {
28 | suq(url, function (err, data) {
29 |
30 | if (!err) {
31 | if (!data) {
32 | console.error('SUq Error ['+ url + ']\n', 'response empty');
33 | } else {
34 | if (!output) {
35 | console.log(JSON.stringify(data, null, 2));
36 | } else {
37 | fs.writeFile(path.resolve(process.cwd(), output), JSON.stringify(data), function(err){
38 | if (err) { console.error('SUq Error ['+ url + ']\n', err); } else { console.log('File ' + output + ' successfully written!'); }
39 | })
40 | }
41 | }
42 | } else {
43 | console.log('SUq Error ['+ url + ']\n', err);
44 | }
45 | });
46 | }
47 |
48 |
49 | } else {
50 | return fs.createReadStream(__dirname + '/usage.txt')
51 | .pipe(process.stdout)
52 | .on('close', function () { process.exit(1) });
53 | }
54 |
55 |
--------------------------------------------------------------------------------
/tests/fixtures/sample.js:
--------------------------------------------------------------------------------
1 | const html = `
2 |
3 |
Ipsum???
24 |