├── package.json ├── apps ├── BlankScraper.js ├── FullPageContent.js ├── SimpleEightyApp.js ├── Sample80Flag.js ├── templates │ ├── RegexMatcher.js │ └── KeywordMatcher.js ├── EmailsAndPageContent.js ├── LinkCollector.js ├── CrawlInternalLinks.js ├── StopOnExternalDomain.js ├── LinkTracer.js ├── EmailCollector.js ├── LinkCollector-External.js ├── LossyPageContent.js ├── ExternalLinkCollector.js ├── FileFinder.js ├── TextFromURLListOnly.js ├── HeaderData.js ├── LinksAndKeywords.js ├── LossyPageContentInternalLinks.js ├── InternalLinkCollector.js ├── SampleScraper.js ├── DomainCollector.js ├── DocumentData.js ├── KeywordCount.js ├── KeywordCountWith80Flag.js ├── KeywordCountPass80Flag.js ├── LossyDocumentData.js ├── CrawlImages.js ├── SiteSpecificScrapers │ ├── rentdotcom.js │ └── IMDBScraper.js └── DocumentsAndImages.js ├── .eslintrc.js └── README.md /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "eighty-app": "1.18.2" 4 | } 5 | } -------------------------------------------------------------------------------- /apps/BlankScraper.js: -------------------------------------------------------------------------------- 1 | const EightyApp = require('eighty-app'); 2 | const app = new EightyApp(); 3 | 4 | app.processDocument = function(html, url, headers, status, $) { 5 | const $html = this.parseHtml(html, $); 6 | const data = {}; 7 | 8 | return data; 9 | } 10 | 11 | app.parseLinks = function (html, url, headers, status, $) { 12 | const $html = this.parseHtml(html, $); 13 | const links = []; 14 | 15 | return links; 16 | } 17 | 18 | module.exports = function () { 19 | return app; 20 | } -------------------------------------------------------------------------------- /apps/FullPageContent.js: -------------------------------------------------------------------------------- 1 | const EightyApp = require('eighty-app'); 2 | const app = new EightyApp(); 3 | 4 | app.processDocument = function(html, url, 
headers, status, cheerio, extras) { 5 | return { html } 6 | }; 7 | app.parseLinks = function(html, url, headers, status, cheerio, extras) { 8 | const $ = cheerio; 9 | const $html = app.parseHtml(html, $); 10 | const links = []; 11 | 12 | // gets all links in the html document 13 | $html.find('a').each(function(i, obj) { 14 | const link = app.makeLink(url, $(this).attr('href')); 15 | if(link != null) { 16 | links.push(link); 17 | } 18 | }); 19 | 20 | return links; 21 | } 22 | 23 | module.exports = function() { 24 | return app; 25 | }; 26 | -------------------------------------------------------------------------------- /apps/SimpleEightyApp.js: -------------------------------------------------------------------------------- 1 | // This 80app returns the full HTML of each URL crawled 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | return html; 6 | } 7 | 8 | this.parseLinks = function(html, url, headers, status, jQuery) { 9 | var app = this; 10 | var $ = jQuery; 11 | var $html = app.parseHtml(html, $); 12 | var links = []; 13 | 14 | // gets all links in the html document 15 | $html.find('a').each(function(i, obj) { 16 | var link = app.makeLink(url, $(this).attr('href')); 17 | if(link != null) { 18 | links.push(link); 19 | } 20 | }); 21 | 22 | return links; 23 | } 24 | } 25 | 26 | module.exports = function (EightyAppBase) { 27 | EightyApp.prototype = new EightyAppBase(); 28 | return new EightyApp(); 29 | } 30 | -------------------------------------------------------------------------------- /apps/Sample80Flag.js: -------------------------------------------------------------------------------- 1 | // An example for passing an 80flag value around through the parseLinks method 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | return html; 6 | } 7 | 8 | this.parseLinks = function(html, url, headers, status, jQuery) { 9 | var app = this; 10 | 
var $ = jQuery; 11 | var $html = app.parseHtml(html, $); 12 | var links = []; 13 | 14 | // gets all links in the html document 15 | $html.find('a').each(function(i, obj) { 16 | var link = app.makeLink(url, $(this).attr('href')); 17 | 18 | if(link != null) { 19 | // append 80flag 20 | link = app.append80FlagToLink("your_value_here", link); 21 | links.push(link); 22 | } 23 | }); 24 | 25 | return links; 26 | } 27 | } 28 | 29 | module.exports = function (EightyAppBase) { 30 | EightyApp.prototype = new EightyAppBase(); 31 | return new EightyApp(); 32 | } 33 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "env": { 3 | "node": true, 4 | "es6": true, 5 | "mocha": true 6 | }, 7 | "parserOptions": { 8 | "ecmaVersion": 8, 9 | "ecmaFeatures": { 10 | "experimentalObjectRestSpread": true 11 | } 12 | }, 13 | "extends": "eslint:recommended", 14 | "rules": { 15 | "indent": [ 16 | "error", 17 | 4 18 | ], 19 | "linebreak-style": [ 20 | "error", 21 | "unix" 22 | ], 23 | "quotes": [ 24 | "error", 25 | "single" 26 | ], 27 | "semi": [ 28 | "error", 29 | "always" 30 | ], 31 | "keyword-spacing": [ 32 | "error" 33 | ], 34 | "object-curly-spacing": [ 35 | "error", 36 | "always" 37 | ], 38 | "no-trailing-spaces": [ 39 | "error" 40 | ], 41 | "no-console": "off" 42 | } 43 | }; -------------------------------------------------------------------------------- /apps/templates/RegexMatcher.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all matches of a regex pattern found on a page 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | // gets all regex matches in the html document 11 | var regexpattern = /your_regex_here/g; 12 | var 
matches = html.match(regexpattern); 13 | object.matches = matches; 14 | 15 | return JSON.stringify(object); 16 | } 17 | 18 | this.parseLinks = function(html, url, headers, status, jQuery) { 19 | var app = this; 20 | var $ = jQuery; 21 | var $html = app.parseHtml(html, $); 22 | var links = []; 23 | 24 | // gets all links in the html document 25 | $html.find('a').each(function(i, obj) { 26 | var link = app.makeLink(url, $(this).attr('href')); 27 | 28 | if(link != null) { 29 | links.push(link); 30 | } 31 | }); 32 | 33 | return links; 34 | } 35 | } 36 | 37 | module.exports = function (EightyAppBase) { 38 | EightyApp.prototype = new EightyAppBase(); 39 | return new EightyApp(); 40 | } -------------------------------------------------------------------------------- /apps/EmailsAndPageContent.js: -------------------------------------------------------------------------------- 1 | // Returns a list of email and full page content 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | // Get crawl date 11 | object.dateCrawled = app.formatDate(Date.now()); 12 | 13 | // Get emails 14 | var emailList = []; 15 | emailList = html.match(/[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9]+)*(\.[a-z]{2,})/gi); 16 | object.emailList = emailList; 17 | 18 | // Get page content 19 | object.html = html; 20 | 21 | return JSON.stringify(object); 22 | } 23 | 24 | this.parseLinks = function(html, url, headers, status, jQuery) { 25 | var app = this; 26 | var $ = jQuery; 27 | var $html = app.parseHtml(html, $); 28 | var links = []; 29 | 30 | // gets all links in the html document 31 | $html.find('a').each(function(i, obj) { 32 | var link = app.makeLink(url, $(this).attr('href')); 33 | 34 | if(link != null) { 35 | links.push(link); 36 | } 37 | }); 38 | 39 | return links; 40 | } 41 | } 42 | 43 | module.exports = function (EightyAppBase) 
{ 44 | EightyApp.prototype = new EightyAppBase(); 45 | return new EightyApp(); 46 | } 47 | -------------------------------------------------------------------------------- /apps/LinkCollector.js: -------------------------------------------------------------------------------- 1 | const EightyApp = require('eighty-app'); 2 | const app = new EightyApp(); 3 | 4 | 5 | app.processDocument = function(html, url, headers, status, cheerio, extras) { 6 | const $ = cheerio; 7 | const $html = app.parseHtml(html, cheerio); 8 | const data = {}; 9 | // gets all links in the html document 10 | var links = []; 11 | $html.find('a').each(function(i, obj) { 12 | var link = app.makeLink(url, $(this).attr('href')); 13 | var text = $(this).text(); 14 | var linkObject = {}; 15 | linkObject.link = link; 16 | linkObject.text = text; 17 | if (link != null) { 18 | links.push(linkObject); 19 | } 20 | }); 21 | data.links = links; 22 | return JSON.stringify(data); 23 | }; 24 | 25 | app.parseLinks = function(html, url, headers, status, cheerio, extras) { 26 | const $ = cheerio; 27 | const $html = app.parseHtml(html, $); 28 | const links = []; 29 | // gets all links in the html document 30 | $html.find('a').each(function(i, obj) { 31 | const link = app.makeLink(url, $(this).attr('href')); 32 | if(link != null) { 33 | links.push(link); 34 | } 35 | }); 36 | return links; 37 | } 38 | module.exports = function() { 39 | return app; 40 | }; 41 | -------------------------------------------------------------------------------- /apps/CrawlInternalLinks.js: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * This 80app's parseLinks only returns URLs that have the same domain as * 3 | * the current URL being crawled. 
* 4 | ************************************************************************** 5 | */ 6 | 7 | const EightyApp = require('eighty-app'); 8 | const app = new EightyApp(); 9 | 10 | app.processDocument = function (html, url, headers, status, $) { 11 | return { html }; 12 | } 13 | 14 | app.parseLinks = function (html, url, headers, status, $) { 15 | const $html = this.parseHtml(html, $); 16 | const links = []; 17 | 18 | const r = /:\/\/(.[^/]+)/; 19 | const urlDomain = url.match(r)[1]; 20 | const normalizedUrlDomain = urlDomain.toLowerCase(); 21 | 22 | // gets all links in the html document 23 | $html.find('a').each(function (i, obj) { 24 | const link = app.makeLink(url, $(this).attr('href')); 25 | 26 | if (link) { 27 | const linkDomain = link.match(r)[1]; 28 | 29 | if (linkDomain.toLowerCase() === normalizedUrlDomain) { 30 | links.push(link); 31 | } 32 | } 33 | }); 34 | 35 | return links; 36 | } 37 | 38 | module.exports = function () { 39 | return app; 40 | } -------------------------------------------------------------------------------- /apps/templates/KeywordMatcher.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all matches of a keyword list found in the text of a page 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | // Get text 11 | var text = ""; 12 | $html.find('p,div,h1,h2,h3,h4,h5,li,td').each(function(i) { 13 | text += " " + $(this).text(); 14 | }); 15 | 16 | // gets all keyword matches in the text 17 | var keywordList = [ 18 | "80legs", 19 | "web" 20 | ]; 21 | var matches = []; 22 | for (i = 0; i < keywordList.length; i++) { 23 | var regex = new RegExp(keywordList[i],"gi"); 24 | matches.push(text.match(regex)); 25 | } 26 | object.matches = matches; 27 | 28 | return JSON.stringify(object); 29 | } 30 | 31 | this.parseLinks = function(html, url, headers, 
status, jQuery) { 32 | var app = this; 33 | var $ = jQuery; 34 | var $html = app.parseHtml(html, $); 35 | var links = []; 36 | 37 | // gets all links in the html document 38 | $html.find('a').each(function(i, obj) { 39 | var link = app.makeLink(url, $(this).attr('href')); 40 | 41 | if(link != null) { 42 | links.push(link); 43 | } 44 | }); 45 | 46 | return links; 47 | } 48 | } 49 | 50 | module.exports = function (EightyAppBase) { 51 | EightyApp.prototype = new EightyAppBase(); 52 | return new EightyApp(); 53 | } 54 | -------------------------------------------------------------------------------- /apps/StopOnExternalDomain.js: -------------------------------------------------------------------------------- 1 | // Keeps crawling until it hits an external domain and then stops. 2 | // Returns status code for every URL crawled. 3 | 4 | var EightyApp = function() { 5 | this.processDocument = function(html, url, headers, status, jQuery) { 6 | var app = this; 7 | $ = jQuery; 8 | var $html = app.parseHtml(html, $); 9 | var object = {}; 10 | 11 | // Get crawl date 12 | object.dateCrawled = app.formatDate(Date.now()); 13 | 14 | // Get status code 15 | object.status = status; 16 | 17 | return JSON.stringify(object); 18 | } 19 | 20 | this.parseLinks = function(html, url, headers, status, jQuery) { 21 | var app = this; 22 | var $ = jQuery; 23 | var $html = app.parseHtml(html, $); 24 | var links = []; 25 | 26 | var r = /:\/\/(.[^/]+)/; 27 | var urlDomain = url.match(r)[1]; 28 | 29 | var eightyvalue = app.get80Value(url); 30 | if (eightyvalue == null) eightyvalue = url; 31 | 32 | var startingURLDomain = eightyvalue.match(r)[1]; 33 | 34 | if (startingURLDomain == urlDomain) { 35 | // gets all links in the html document 36 | $html.find('a').each(function(i, obj) { 37 | var link = app.makeLink(url, $(this).attr('href')); 38 | 39 | if (link != null) { 40 | link = app.append80FlagToLink(eightyvalue, link); 41 | links.push(link); 42 | } 43 | }); 44 | } else { 45 | // do nothing (return empty 
set) if on external domain 46 | } 47 | 48 | return links; 49 | } 50 | } 51 | 52 | module.exports = function (EightyAppBase) { 53 | EightyApp.prototype = new EightyAppBase(); 54 | return new EightyApp(); 55 | } 56 | -------------------------------------------------------------------------------- /apps/LinkTracer.js: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * The processDocument returns a timestamp for when the URL was crawled, * 3 | * the original URL that led to this URL being crawled, and the HTML. * 4 | *************************************************************************/ 5 | 6 | var EightyApp = function() { 7 | this.processDocument = function(html, url, headers, status, jQuery) { 8 | var app = this; 9 | $ = jQuery; 10 | var $html = app.parseHtml(html, $); 11 | var object = {}; 12 | 13 | // Get crawl date 14 | object.dateCrawled = app.formatDate(Date.now()); 15 | 16 | // Get original URL 17 | var eightyvalue = app.get80Value(url); 18 | if (eightyvalue == null) eightyvalue = url; 19 | object.startingURL = eightyvalue; 20 | 21 | // Get HTML 22 | object.html = html; 23 | 24 | return JSON.stringify(object); 25 | } 26 | 27 | this.parseLinks = function(html, url, headers, status, jQuery) { 28 | var app = this; 29 | var $ = jQuery; 30 | var $html = app.parseHtml(html, $); 31 | var links = []; 32 | 33 | // gets all links in the html document 34 | $html.find('a').each(function(i, obj) { 35 | var link = app.makeLink(url, $(this).attr('href')); 36 | if (link != null) { 37 | var eightyvalue = app.get80Value(url); 38 | if (eightyvalue == null) eightyvalue = url; 39 | link = app.append80FlagToLink(eightyvalue, link); 40 | links.push(link); 41 | } 42 | }); 43 | 44 | return links; 45 | } 46 | } 47 | 48 | module.exports = function (EightyAppBase) { 49 | EightyApp.prototype = new EightyAppBase(); 50 | return new EightyApp(); 51 | } 52 | 
-------------------------------------------------------------------------------- /apps/EmailCollector.js: -------------------------------------------------------------------------------- 1 | // Returns a list of emails for every page on the domains of the URL list 2 | const EightyApp = require('eighty-app'); 3 | const app = new EightyApp(); 4 | 5 | const EMAIL_REGEX = /([A-Za-z0-9_\-.])+@([A-Za-z0-9_\-.])+\.([A-Za-z]{2,4})/ig; 6 | 7 | app.processDocument = function(html, url, headers, status, cheerio, extras) { 8 | let data = {}; 9 | const { isEmail } = extras.validator; 10 | 11 | // Get emails 12 | const emails = html.match(EMAIL_REGEX); 13 | 14 | if (emails) { 15 | 16 | data.emails = emails.filter(email => isEmail(email)); 17 | } 18 | 19 | // It's possible that one page could contain the same email address multiple times, so we deduplicate them. 20 | return app.removeAllDuplicates(data); 21 | }; 22 | 23 | app.parseLinks = function(html, url, headers, status, jQuery) { 24 | let $ = jQuery; 25 | let $html = app.parseHtml(html, $); 26 | let links = []; 27 | 28 | let r = /:\/\/(.[^/]+)/; 29 | let urlDomain = url.match(r)[1]; 30 | 31 | // gets all links in the html document 32 | $html.find('a').each(function(i, obj) { 33 | // console.log($(this).attr('href')); 34 | let link = app.makeLink(url, $(this).attr('href')); 35 | 36 | if (link != null) { 37 | let linkDomain = link.match(r); 38 | if (linkDomain && linkDomain.length > 1) { 39 | linkDomain = linkDomain[1]; 40 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) 41 | links.push(link); 42 | } 43 | } 44 | }); 45 | 46 | return links; 47 | }; 48 | 49 | module.exports = () => { 50 | return app; 51 | }; 52 | -------------------------------------------------------------------------------- /apps/LinkCollector-External.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all links found on a page that do not belong to the domain of the URL being crawled 2 | 3 | 
var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | // gets all links in the html document 11 | var links = []; 12 | var r = /:\/\/(.[^/]+)/; 13 | var urlDomain = url.match(r)[1] 14 | 15 | $html.find('a').each(function(i, obj) { 16 | // console.log($(this).attr('href')); 17 | var link = app.makeLink(url, $(this).attr('href')); 18 | 19 | if(link != null) { 20 | var linkDomain = link.match(r)[1] 21 | if (urlDomain != linkDomain) { 22 | links.push(link); 23 | } 24 | } 25 | }); 26 | object.links = links; 27 | 28 | return JSON.stringify(object); 29 | } 30 | 31 | this.parseLinks = function(html, url, headers, status, jQuery) { 32 | var app = this; 33 | var $ = jQuery; 34 | var $html = app.parseHtml(html, $); 35 | var links = []; 36 | 37 | // gets all links in the html document 38 | $html.find('a').each(function(i, obj) { 39 | var link = app.makeLink(url, $(this).attr('href')); 40 | 41 | if(link != null) { 42 | links.push(link); 43 | } 44 | }); 45 | 46 | return links; 47 | } 48 | } 49 | 50 | module.exports = function (EightyAppBase) { 51 | EightyApp.prototype = new EightyAppBase(); 52 | return new EightyApp(); 53 | } -------------------------------------------------------------------------------- /apps/LossyPageContent.js: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * The processDocument for this 80app returns the HTML with all style, * 3 | * blocks, script blocks, and HTML tags stripped out. 
* 4 | ************************************************************************** 5 | */ 6 | 7 | var EightyApp = function() { 8 | this.processDocument = function(html, url, headers, status, jQuery) { 9 | var app = this; 10 | $ = jQuery; 11 | var $html = app.parseHtml(html, $); 12 | var object = {}; 13 | 14 | // Get crawl date 15 | object.dateCrawled = app.formatDate(Date.now()); 16 | 17 | // Get lossy content by removing html tags and javascript 18 | var lossyHTML = html; 19 | lossyHTML = lossyHTML.replace(/<style[\s\S]*?<\/style>/gi,""); 20 | lossyHTML = lossyHTML.replace(/<script[\s\S]*?<\/script>/gi,""); 21 | lossyHTML = lossyHTML.replace(/<[\s\S]*?>/g,""); 22 | object.lossyHTML = lossyHTML; 23 | 24 | return app.replaceSpecialCharacters(JSON.stringify(object)); 25 | } 26 | 27 | this.parseLinks = function(html, url, headers, status, jQuery) { 28 | var app = this; 29 | var $ = jQuery; 30 | var $html = app.parseHtml(html, $); 31 | var links = []; 32 | 33 | // gets all links in the html document 34 | $html.find('a').each(function(i, obj) { 35 | var link = app.makeLink(url, $(this).attr('href')); 36 | 37 | if(link != null) { 38 | links.push(link); 39 | } 40 | }); 41 | 42 | return links; 43 | } 44 | } 45 | 46 | module.exports = function (EightyAppBase) { 47 | EightyApp.prototype = new EightyAppBase(); 48 | return new EightyApp(); 49 | } 50 | -------------------------------------------------------------------------------- /apps/ExternalLinkCollector.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all links from external domains found on a page 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | var links = []; 10 | 11 | // gets all external links in the html document 12 | var r = /:\/\/(.[^/]+)/; 13 | if (url.match(r) != null) { 14 | var urlDomain = url.match(r)[1] 15 | 
$html.find('a').each(function(i, obj) { 16 | // console.log($(this).attr('href')); 17 | var link = app.makeLink(url, $(this).attr('href')); 18 | 19 | if(link != null) { 20 | if (link.match(r) != null) { 21 | var linkDomain = link.match(r)[1]; 22 | if (urlDomain != linkDomain) { 23 | links.push(link); 24 | } 25 | } 26 | } 27 | }); 28 | } 29 | object.externalLinks = links; 30 | 31 | return JSON.stringify(object); 32 | } 33 | 34 | this.parseLinks = function(html, url, headers, status, jQuery) { 35 | var app = this; 36 | var $ = jQuery; 37 | var $html = app.parseHtml(html, $); 38 | var links = []; 39 | 40 | // gets all links in the html document 41 | $html.find('a').each(function(i, obj) { 42 | var link = app.makeLink(url, $(this).attr('href')); 43 | 44 | if(link != null) { 45 | links.push(link); 46 | } 47 | }); 48 | 49 | return links; 50 | } 51 | } 52 | 53 | module.exports = function (EightyAppBase) { 54 | EightyApp.prototype = new EightyAppBase(); 55 | return new EightyApp(); 56 | } 57 | -------------------------------------------------------------------------------- /apps/FileFinder.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all links to files (e.g., .docx, .pptx, .pdf, etc.) 
found on a page 2 | 3 | var EightyApp = function() { 4 | 5 | function containsFileExtensions(url) { 6 | var extensions = ['.pdf', '.doc', '.ppt', '.xls', '.docx', '.pptx', '.xlsx']; 7 | for (i = 0; i < extensions.length; i++) { 8 | if (url.match(extensions[i])) { 9 | return true; 10 | } 11 | } 12 | return false; 13 | } 14 | 15 | this.processDocument = function(html, url, headers, status, jQuery) { 16 | var app = this; 17 | var $ = jQuery; 18 | var $html = app.parseHtml(html, $); 19 | var object = {}; 20 | 21 | // gets all links in the html document 22 | var links = []; 23 | $html.find('a').each(function(i, obj) { 24 | var link = app.makeLink(url, $(this).attr('href')); 25 | if (link && containsFileExtensions(link)) { 26 | if (link != null) { 27 | links.push(link); 28 | } 29 | } 30 | }); 31 | object.links = links; 32 | 33 | return JSON.stringify(object); 34 | } 35 | 36 | this.parseLinks = function(html, url, headers, status, jQuery) { 37 | var app = this; 38 | var $ = jQuery; 39 | var $html = app.parseHtml(html, $); 40 | var links = []; 41 | 42 | // gets all links in the html document 43 | $html.find('a').each(function(i, obj) { 44 | var link = app.makeLink(url, $(this).attr('href')); 45 | 46 | if(link != null) { 47 | links.push(link); 48 | } 49 | }); 50 | 51 | return links; 52 | } 53 | } 54 | 55 | module.exports = function (EightyAppBase) { 56 | EightyApp.prototype = new EightyAppBase(); 57 | return new EightyApp(); 58 | } 59 | -------------------------------------------------------------------------------- /apps/TextFromURLListOnly.js: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * This 80app's parseLinks only returns URLs that have the same domain as * 3 | * the current URL being crawled. * 4 | * * 5 | * The processDocument returns text from each URL crawled. 
* 6 | ************************************************************************** 7 | */ 8 | 9 | var EightyApp = function() { 10 | this.processDocument = function(html, url, headers, status, jQuery) { 11 | var app = this, 12 | $ = jQuery, 13 | $html = app.parseHtml(html, $), 14 | object = {}; 15 | 16 | // Get crawl date 17 | object.dateCrawled = app.formatDate(Date.now()); 18 | 19 | // Get text 20 | var text = ""; 21 | $html.find('p,h1,h2,h3,h4,h5,li,td').each(function(i) { 22 | text += " " + $(this).text(); 23 | }); 24 | object.text = text; 25 | 26 | return JSON.stringify(object); 27 | } 28 | 29 | this.parseLinks = function(html, url, headers, status, jQuery) { 30 | var app = this, 31 | $ = jQuery, 32 | $html = app.parseHtml(html, $), 33 | links = [], 34 | 35 | r = /:\/\/(.[^\/]+)/, 36 | urlDomain = url.match(r)[1]; 37 | 38 | // gets all links in the html document 39 | var link, 40 | linkDomain; 41 | $html.find('a').each(function(i, obj) { 42 | link = app.makeLink(url, $(this).attr('href')); 43 | 44 | if(link != null) { 45 | linkDomain = link.match(r)[1] 46 | if (urlDomain == linkDomain) { 47 | links.push(link); 48 | } 49 | } 50 | }); 51 | 52 | return links; 53 | } 54 | } 55 | 56 | module.exports = function (EightyAppBase) { 57 | EightyApp.prototype = new EightyAppBase(); 58 | return new EightyApp(); 59 | } 60 | -------------------------------------------------------------------------------- /apps/HeaderData.js: -------------------------------------------------------------------------------- 1 | // This 80app returns the header data from each URL crawled 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | if(typeof headers == 'string' || headers instanceof String) { 11 | var headersArray = headers.split("\r\n"); 12 | for (var i = 0; i < headersArray.length; i++) { 13 | var keyvalArray = 
headersArray[i].split(": "); 14 | var key = keyvalArray[0]; 15 | var value = keyvalArray[1]; 16 | object[key] = value; 17 | } 18 | 19 | return JSON.stringify(object); 20 | } 21 | 22 | return JSON.stringify(headers); 23 | } 24 | 25 | this.parseLinks = function(html, url, headers, status, jQuery) { 26 | var app = this; 27 | var $ = jQuery; 28 | var $html = app.parseHtml(html, $); 29 | var links = []; 30 | 31 | // gets all links in the html document 32 | $html.find('a').each(function(i, obj) { 33 | var link = app.makeLink(url, $(this).attr('href')); 34 | if(link != null) { 35 | links.push(link); 36 | } 37 | }); 38 | 39 | return links; 40 | } 41 | } 42 | 43 | module.exports = function (EightyAppBase) { 44 | EightyApp.prototype = new EightyAppBase(); 45 | return new EightyApp(); 46 | } -------------------------------------------------------------------------------- /apps/LinksAndKeywords.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all links and keywords, with their counts, found on a page 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | // gets all links in the html document 11 | var links = []; 12 | $html.find('a').each(function(i, obj) { 13 | // console.log($(this).attr('href')); 14 | var link = app.makeLink(url, $(this).attr('href')); 15 | 16 | if(link != null) { 17 | links.push(link); 18 | } 19 | }); 20 | object.links = links; 21 | 22 | // Get keyword frequency 23 | var keywordCount = {}; 24 | $html.find('p,h1,h2,h3,h4,h5,td,div').each(function() { 25 | var textBlockArray = $(this).text().split(/,?\s+/); 26 | for (var i = 0; i < textBlockArray.length; i++) { 27 | var keyword = textBlockArray[i].toLowerCase(); 28 | if (keyword in keywordCount) { 29 | keywordCount[keyword] = keywordCount[keyword] + 1; 30 | } else { 31 | 
keywordCount[keyword] = 1; 32 | } 33 | } 34 | }); 35 | object.keywordCount = keywordCount; 36 | 37 | return JSON.stringify(object); 38 | } 39 | 40 | this.parseLinks = function(html, url, headers, status, jQuery) { 41 | var app = this; 42 | var $ = jQuery; 43 | var $html = app.parseHtml(html, $); 44 | var links = []; 45 | 46 | // gets all links in the html document 47 | $html.find('a').each(function(i, obj) { 48 | var link = app.makeLink(url, $(this).attr('href')); 49 | 50 | if(link != null) { 51 | links.push(link); 52 | } 53 | }); 54 | 55 | return links; 56 | } 57 | } 58 | 59 | module.exports = function (EightyAppBase) { 60 | EightyApp.prototype = new EightyAppBase(); 61 | return new EightyApp(); 62 | } 63 | -------------------------------------------------------------------------------- /apps/LossyPageContentInternalLinks.js: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * The processDocument for this 80app returns the HTML with all style, * 3 | * blocks, script blocks, and HTML tags stripped out. 
* 4 | ************************************************************************** 5 | */ 6 | 7 | var EightyApp = function() { 8 | this.processDocument = function(html, url, headers, status, jQuery) { 9 | var app = this; 10 | $ = jQuery; 11 | var $html = app.parseHtml(html, $); 12 | var object = {}; 13 | 14 | // Get crawl date 15 | object.dateCrawled = app.formatDate(Date.now()); 16 | 17 | // Get lossy content by removing html tags and javascript 18 | var lossyHTML = html; 19 | lossyHTML = lossyHTML.replace(/<style[\s\S]*?<\/style>/gi,""); 20 | lossyHTML = lossyHTML.replace(/<script[\s\S]*?<\/script>/gi,""); 21 | lossyHTML = lossyHTML.replace(/<[\s\S]*?>/g,""); 22 | object.lossyHTML = lossyHTML; 23 | 24 | return app.replaceSpecialCharacters(JSON.stringify(object)); 25 | } 26 | 27 | this.parseLinks = function(html, url, headers, status, jQuery) { 28 | var app = this; 29 | var $ = jQuery; 30 | var $html = app.parseHtml(html, $); 31 | var links = []; 32 | 33 | var r = /:\/\/(.[^/]+)/; 34 | var urlDomain = url.match(r)[1] 35 | 36 | // gets all links in the html document 37 | $html.find('a').each(function(i, obj) { 38 | // console.log($(this).attr('href')); 39 | var link = app.makeLink(url, $(this).attr('href')); 40 | 41 | if (link != null) { 42 | var linkDomain = link.match(r); 43 | if (linkDomain && linkDomain.length > 1) { 44 | linkDomain = linkDomain[1]; 45 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) 46 | links.push(link); 47 | } 48 | } 49 | }); 50 | 51 | return links; 52 | } 53 | } 54 | 55 | module.exports = function (EightyAppBase) { 56 | EightyApp.prototype = new EightyAppBase(); 57 | return new EightyApp(); 58 | } 59 | -------------------------------------------------------------------------------- /apps/InternalLinkCollector.js: -------------------------------------------------------------------------------- 1 | // This 80app returns all internal links found on a page 2 | 3 | var EightyApp = function() { 4 | this.processDocument = function(html, url, headers, status, jQuery) { 5 | var app = 
this; 6 | var $ = jQuery; 7 | var $html = app.parseHtml(html, $); 8 | var object = {}; 9 | 10 | // gets all internal links in the html document 11 | var r = /:\/\/(.[^/]+)/; 12 | var urlDomain = url.match(r)[1] 13 | var links = []; 14 | $html.find('a').each(function(i, obj) { 15 | if ($(this).attr('href')) { 16 | var link = app.makeLink(url, $(this).attr('href')); 17 | if (link != null) { 18 | var linkDomain = link.match(r); 19 | if (linkDomain && linkDomain.length > 1) { 20 | linkDomain = linkDomain[1]; 21 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) 22 | links.push(link); 23 | } 24 | } 25 | } 26 | }); 27 | 28 | object.internalLinks = links; 29 | 30 | return JSON.stringify(object); 31 | } 32 | 33 | this.parseLinks = function(html, url, headers, status, jQuery) { 34 | var app = this; 35 | var $ = jQuery; 36 | var $html = app.parseHtml(html, $); 37 | var links = []; 38 | 39 | var r = /:\/\/(.[^/]+)/; 40 | var urlDomain = url.match(r)[1] 41 | 42 | // gets all links in the html document 43 | $html.find('a').each(function(i, obj) { 44 | // console.log($(this).attr('href')); 45 | var link = app.makeLink(url, $(this).attr('href')); 46 | 47 | if (link != null) { 48 | var linkDomain = link.match(r); 49 | if (linkDomain && linkDomain.length > 1) { 50 | linkDomain = linkDomain[1]; 51 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) 52 | links.push(link); 53 | } 54 | } 55 | }); 56 | 57 | return links; 58 | } 59 | } 60 | 61 | module.exports = function (EightyAppBase) { 62 | EightyApp.prototype = new EightyAppBase(); 63 | return new EightyApp(); 64 | } 65 | -------------------------------------------------------------------------------- /apps/SampleScraper.js: -------------------------------------------------------------------------------- 1 | // This is sample code for building a web scraper. 2 | // 3 | // For this sample, we use http://www.houzz.com/pro/jeff-halper/exterior-worlds-landscaping-and-design 4 | // as a sample listing we want to scrape. 
5 | // 6 | // For the full crawler, we will assume the crawl starts from http://www.houzz.com/professionals/ 7 | 8 | var EightyApp = function() { 9 | this.processDocument = function(html, url, headers, status, jQuery) { 10 | 11 | // We only want to collect data from listing pages 12 | if (url.match("/pro/")) { 13 | 14 | // First we construct an HTML object so we can use Jquery 15 | var app = this; 16 | $ = jQuery; 17 | var $html = app.parseHtml(html, $); 18 | var object = {}; 19 | 20 | // Then we use JQuery to find all the attributes we want 21 | object.name = $html.find('h1').text(); 22 | object.address = $html.find('span[itemprop="streetAddress"]').text(); 23 | object.city = $html.find('span[itemprop="addressLocality"]').text(); 24 | object.state = $html.find('span[itemprop="addressRegion"]').text(); 25 | object.postalcode = $html.find('span[itemprop="postalCode"]').text(); 26 | object.contact = $html.find('dt:contains("Contact:")').next().text(); 27 | 28 | // Finally, we return the object as a string 29 | return JSON.stringify(object); 30 | } 31 | } 32 | 33 | this.parseLinks = function(html, url, headers, status, jQuery) { 34 | 35 | // We construct the HTML object for Jquery again 36 | var app = this; 37 | var $ = jQuery; 38 | var $html = app.parseHtml(html, $); 39 | var links = []; 40 | 41 | // We add all the pages in the directory 42 | $html.find('a.pageNumber').each(function(i, obj) { 43 | var link = app.makeLink(url, $(this).attr('href')); 44 | if(link != null) { 45 | links.push(link); 46 | } 47 | }); 48 | 49 | // We add all the listings in the directory 50 | $html.find('a.pro-title').each(function(i, obj) { 51 | var link = app.makeLink(url, $(this).attr('href')); 52 | if(link != null) { 53 | links.push(link); 54 | } 55 | }); 56 | 57 | return links; 58 | } 59 | } 60 | 61 | module.exports = function (EightyAppBase) { 62 | EightyApp.prototype = new EightyAppBase(); 63 | return new EightyApp(); 64 | }; 
-------------------------------------------------------------------------------- /apps/DomainCollector.js: -------------------------------------------------------------------------------- 1 | 2 | // This 80app returns the count of every domain linked from each URL crawled 3 | // The 80app will only crawl to links on the current domain. 4 | 5 | var EightyApp = function() { 6 | this.processDocument = function(html, url, headers, status, jQuery) { 7 | var app = this; 8 | var $ = jQuery; 9 | var $html = app.parseHtml(html, $); 10 | var object = {}; 11 | 12 | // gets all links in the html document 13 | var domainCount = {}; 14 | var r = /:\/\/(.[^/]+)/; 15 | 16 | $html.find('a').each(function(i, obj) { 17 | var link = app.makeLink(url, $(this).attr('href')); 18 | 19 | if (link != null) { 20 | var linkDomain = link.match(r); 21 | if (linkDomain && linkDomain.length > 1) { 22 | linkDomain = linkDomain[1]; 23 | } 24 | 25 | if (linkDomain in domainCount) { 26 | domainCount[linkDomain] = domainCount[linkDomain] + 1; 27 | } else { 28 | domainCount[linkDomain] = 1; 29 | } 30 | } 31 | }); 32 | object.domainCount = domainCount; 33 | 34 | return JSON.stringify(object); 35 | }; 36 | 37 | this.parseLinks = function(html, url, headers, status, jQuery) { 38 | var app = this; 39 | var $ = jQuery; 40 | var $html = app.parseHtml(html, $); 41 | var links = []; 42 | 43 | var r = /:\/\/(.[^/]+)/; 44 | var urlDomain = url.match(r)[1]; 45 | 46 | // gets all links in the html document 47 | $html.find('a').each(function(i, obj) { 48 | // console.log($(this).attr('href')); 49 | var link = app.makeLink(url, $(this).attr('href')); 50 | 51 | if (link != null) { 52 | var linkDomain = link.match(r); 53 | 54 | if (linkDomain && linkDomain.length > 1) { 55 | linkDomain = linkDomain[1]; 56 | } 57 | 58 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) { 59 | links.push(link); 60 | } 61 | } 62 | }); 63 | 64 | return links; 65 | }; 66 | }; 67 | 68 | module.exports = function (EightyAppBase) { 69 
| EightyApp.prototype = new EightyAppBase(); 70 | return new EightyApp(); 71 | }; 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | EightyApps 2 | ========== 3 | 4 | ### Basic 80app format 5 | 6 | ```javascript 7 | const EightyApp = require('eighty-app'); 8 | const app = new EightyApp(); 9 | 10 | app.processDocument = function(html, url, headers, status, cheerio, extras) { 11 | const data = {}; 12 | return JSON.stringify(data); 13 | }; 14 | app.parseLinks = function(html, url, headers, status, cheerio, extras) { 15 | return []; 16 | } 17 | module.exports = function() { 18 | return app; 19 | }; 20 | ``` 21 | 22 | ### Testing 23 | To test your 80apps, you should use our [test site](http://80apptester.80legs.com/). 24 | 25 | ### Note about "img" tags 26 | Note that if you use the parseHtml method in EightyApp.js, "img" tags will be changed to "img80" tags. This is so the crawlers do not load the images when using the EightyApp to parse the html response (strangely "img" tags seem to be the only html elements affected by this). If you need to reference an "img" tag by its tag type explicitly (i.e. not by its class, id, or some other attribute) in some html, it will instead be an "img80" tag, but everything else should be the same. 27 | 28 | ### Currently Available Cheerio (i.e. jQuery) Methods 29 | The new version of Voltron - Mauler - uses an extended version of Cheerio, a lighter weight version of jQuery. You will still write functions in the same manner as before (ex: $html.find('selector').parent()); however, in order for Cheerio to obtain its faster speed, it only uses certain core jQuery functions. 
You can see the list of already implemented Cheerio functions that are available to you here: http://cheeriojs.github.io/cheerio/ 30 | 31 | Our extended version of Cheerio also includes a number of other functions that are available for your use. These include: 32 | 33 | * .not 34 | * .makeArray 35 | * .each 36 | * .filter 37 | * .prop 38 | 39 | We are currently working on implementing the pseudo selectors :eq and :first (ex: $html.find('div:eq(1)')); however, they are NOT currently implemented and WILL cause errors. Check here or our knowledge base for updated Cheerio functionality. 40 | 41 | ### CrawlImages.js 42 | 43 | To successfully use this app, the image url submitted must contain an 'EightyFlag', a tag that indicates the url is an image. 44 | 45 | `http://www.example.com/exampleimage.jpg?80flag=type:image` 46 | 47 | -------------------------------------------------------------------------------- /apps/DocumentData.js: -------------------------------------------------------------------------------- 1 | // This 80app returns the following attributes from each URL crawled: 2 | // * title 3 | // * meta tags 4 | // * links (everything in an 'a' tag) 5 | 6 | var EightyApp = function() { 7 | this.processDocument = function(html, url, headers, status, jQuery) { 8 | var app = this; 9 | var $ = jQuery; 10 | var $html = app.parseHtml(html, $); 11 | var object = {}; 12 | 13 | object.title = $html.filter('title').text(); 14 | object.meta_description = $html.filter('meta[name="description"]').attr('content'); 15 | object.meta_keywords= $html.filter('meta[name="keywords"]').attr('content'); 16 | var meta_tags = []; 17 | $html.filter('meta').each(function(i, obj) { 18 | var meta_obj = {}; 19 | meta_obj.name = $(this).attr('name'); 20 | meta_obj.content = $(this).attr('content'); 21 | meta_tags.push(meta_obj); 22 | }); 23 | object.meta_tags = meta_tags; 24 | 25 | // gets all links in the html document 26 | var links = []; 27 | $html.find('a').each(function(i, obj) { 28 | 
var link = app.makeLink(url, $(this).attr('href')); 29 | if(link != null) { 30 | links.push(link); 31 | } 32 | }); 33 | object.links = links; 34 | 35 | return JSON.stringify(object); 36 | } 37 | 38 | this.parseLinks = function(html, url, headers, status, jQuery) { 39 | var app = this; 40 | var $ = jQuery; 41 | var $html = app.parseHtml(html, $); 42 | var links = []; 43 | 44 | // gets all links in the html document 45 | $html.find('a').each(function(i, obj) { 46 | var link = app.makeLink(url, $(this).attr('href')); 47 | 48 | if(link != null) { 49 | links.push(link); 50 | } 51 | }); 52 | 53 | return links; 54 | } 55 | } 56 | 57 | 58 | module.exports = function(EightyAppBase) { 59 | EightyApp.prototype = new EightyAppBase(); 60 | return new EightyApp(); 61 | } -------------------------------------------------------------------------------- /apps/KeywordCount.js: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * This 80app's parseLinks only returns URLs that have the same domain as * 3 | * the current URL being crawled. * 4 | * * 5 | * The processDocument returns a timestamp for when the URL was crawled, * 6 | * and a list of words on the page with their frequencies. 
* 7 | ************************************************************************** 8 | */ 9 | 10 | var EightyApp = function() { 11 | this.processDocument = function(html, url, headers, status, jQuery) { 12 | var app = this; 13 | $ = jQuery; 14 | var $html = app.parseHtml(html, $); 15 | var object = {}; 16 | 17 | // Get crawl date 18 | object.dateCrawled = app.formatDate(Date.now()); 19 | 20 | // Get keyword frequency 21 | var keywordCount = {}; 22 | $html.find('p,h1,h2,h3,h4,h5,td,div').each(function() { 23 | var textBlockArray = $(this).text().split(/,?\s+/); 24 | for (var i = 0; i < textBlockArray.length; i++) { 25 | var keyword = textBlockArray[i].toLowerCase(); 26 | if (keyword in keywordCount) { 27 | keywordCount[keyword] = keywordCount[keyword] + 1; 28 | } else { 29 | keywordCount[keyword] = 1; 30 | } 31 | } 32 | }); 33 | object.keywordCount = keywordCount; 34 | 35 | return JSON.stringify(object); 36 | } 37 | 38 | this.parseLinks = function(html, url, headers, status, jQuery) { 39 | var app = this; 40 | var $ = jQuery; 41 | var $html = app.parseHtml(html, $); 42 | var links = []; 43 | 44 | var r = /:\/\/(.[^/]+)/; 45 | var urlDomain = url.match(r)[1] 46 | 47 | // gets all links in the html document 48 | $html.find('a').each(function(i, obj) { 49 | // console.log($(this).attr('href')); 50 | var link = app.makeLink(url, $(this).attr('href')); 51 | 52 | if (link != null) { 53 | var linkDomain = link.match(r); 54 | if (linkDomain && linkDomain.length > 1) { 55 | linkDomain = linkDomain[1]; 56 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) 57 | links.push(link); 58 | } 59 | } 60 | }); 61 | 62 | return links; 63 | } 64 | } 65 | 66 | module.exports = function (EightyAppBase) { 67 | EightyApp.prototype = new EightyAppBase(); 68 | return new EightyApp(); 69 | } 70 | -------------------------------------------------------------------------------- /apps/KeywordCountWith80Flag.js: -------------------------------------------------------------------------------- 
1 | /* ************************************************************************ 2 | * This 80app's parseLinks only returns URLs that have the same domain as * 3 | * the current URL being crawled. * 4 | * * 5 | * The processDocument returns a timestamp for when the URL was crawled, * 6 | * and a list of words on the page with their frequencies. * 7 | ************************************************************************** 8 | */ 9 | 10 | var EightyApp = function() { 11 | this.processDocument = function(html, url, headers, status, jQuery) { 12 | var app = this; 13 | $ = jQuery; 14 | var $html = app.parseHtml(html, $); 15 | var object = {}; 16 | 17 | // Get crawl date 18 | object.dateCrawled = app.formatDate(Date.now()); 19 | 20 | // Get keyword frequency 21 | var keywordCount = {}; 22 | $html.find('p,h1,h2,h3,h4,h5,td,div').each(function() { 23 | var textBlockArray = $(this).text().split(/,?\s+/); 24 | for (var i = 0; i < textBlockArray.length; i++) { 25 | var keyword = textBlockArray[i].toLowerCase(); 26 | if (keyword in keywordCount) { 27 | keywordCount[keyword] = keywordCount[keyword] + 1; 28 | } else { 29 | keywordCount[keyword] = 1; 30 | } 31 | } 32 | }); 33 | object.keywordCount = keywordCount; 34 | 35 | return JSON.stringify(object); 36 | } 37 | 38 | this.parseLinks = function(html, url, headers, status, jQuery) { 39 | var app = this; 40 | var $ = jQuery; 41 | var $html = app.parseHtml(html, $); 42 | var links = []; 43 | 44 | var r = /:\/\/(.[^/]+)/; 45 | var urlDomain = url.match(r)[1] 46 | 47 | // gets all links in the html document 48 | $html.find('a').each(function(i, obj) { 49 | var link = app.makeLink(url, $(this).attr('href')); 50 | 51 | if(link != null) { 52 | var linkDomain = link.match(r); 53 | if (linkDomain && linkDomain.length > 1) { 54 | linkDomain = linkDomain[1]; 55 | // only crawl link if domain is the same of current URL 56 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) { 57 | link = app.append80FlagToLink("your value here", 
link); 58 | links.push(link); 59 | } 60 | } 61 | } 62 | }); 63 | 64 | return links; 65 | } 66 | } 67 | 68 | module.exports = function (EightyAppBase) { 69 | EightyApp.prototype = new EightyAppBase(); 70 | return new EightyApp(); 71 | } 72 | -------------------------------------------------------------------------------- /apps/KeywordCountPass80Flag.js: -------------------------------------------------------------------------------- 1 | /* ************************************************************************ 2 | * This 80app's parseLinks only returns URLs that have the same domain as * 3 | * the current URL being crawled. * 4 | * * 5 | * The processDocument returns a timestamp for when the URL was crawled, * 6 | * a list of words on the page with their frequencies, and the HTML. * 7 | ************************************************************************** 8 | */ 9 | 10 | var EightyApp = function() { 11 | this.processDocument = function(html, url, headers, status, jQuery) { 12 | var app = this; 13 | $ = jQuery; 14 | var $html = app.parseHtml(html, $); 15 | var object = {}; 16 | 17 | // Get crawl date 18 | object.dateCrawled = app.formatDate(Date.now()); 19 | 20 | // Get keyword frequency 21 | var keywordCount = {}; 22 | $html.find('p,h1,h2,h3,h4,h5,td,div').each(function() { 23 | var textBlockArray = $(this).text().split(/,?\s+/); 24 | for (var i = 0; i < textBlockArray.length; i++) { 25 | var keyword = textBlockArray[i].toLowerCase(); 26 | if (keyword in keywordCount) { 27 | keywordCount[keyword] = keywordCount[keyword] + 1; 28 | } else { 29 | keywordCount[keyword] = 1; 30 | } 31 | } 32 | }); 33 | object.keywordCount = keywordCount; 34 | 35 | // Get HTML 36 | object.html = html; 37 | 38 | return JSON.stringify(object); 39 | } 40 | 41 | this.parseLinks = function(html, url, headers, status, jQuery) { 42 | var app = this; 43 | var $ = jQuery; 44 | var $html = app.parseHtml(html, $); 45 | var links = []; 46 | 47 | console.log('URL:\t' + url); 48 | 49 | var r = 
/:\/\/(.[^/]+)/; 50 | var urlDomain = url.match(r)[1] 51 | 52 | // gets all links in the html document 53 | $html.find('a').each(function(i, obj) { 54 | var link = app.makeLink(url, $(this).attr('href')); 55 | 56 | if(link != null) { 57 | var linkDomain = link.match(r); 58 | if (linkDomain && linkDomain.length > 1) { 59 | linkDomain = linkDomain[1]; 60 | // only crawl link if domain is the same of current URL 61 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) { 62 | var eightyvalue = app.get80Value(url); 63 | link = app.append80FlagToLink(eightyvalue, link); 64 | links.push(link); 65 | } 66 | } 67 | } 68 | }); 69 | 70 | return links; 71 | } 72 | } 73 | 74 | module.exports = function (EightyAppBase) { 75 | EightyApp.prototype = new EightyAppBase(); 76 | return new EightyApp(); 77 | } 78 | -------------------------------------------------------------------------------- /apps/LossyDocumentData.js: -------------------------------------------------------------------------------- 1 | // This 80app returns the following attributes from each URL crawled: 2 | // * title 3 | // * meta tags 4 | // * links (everything in an 'a' tag) 5 | 6 | var EightyApp = function() { 7 | this.processDocument = function(html, url, headers, status, jQuery) { 8 | var app = this; 9 | var $ = jQuery; 10 | var $html = app.parseHtml(html, $); 11 | var object = {}; 12 | 13 | object.date_crawled = app.formatDate(Date.now()); 14 | object.title = $html.filter('title').text(); 15 | object.meta_description = $html.filter('meta[name="description"]').attr('content'); 16 | object.meta_keywords = $html.filter('meta[name="keywords"]').attr('content'); 17 | object.meta_subject = $html.filter('meta[name="subject"]').attr('content'); 18 | 19 | // Get lossy content by removing html tags and javascript 20 | var lossyHTML = html; 21 | lossyHTML = lossyHTML.replace(//gi,""); 22 | lossyHTML = lossyHTML.replace(//gi,""); 23 | lossyHTML = lossyHTML.replace(/<[\s\S]*?>/g,""); 24 | object.lossyHTML = 
lossyHTML; 25 | 26 | var meta_tags = []; 27 | $html.filter('meta').each(function(i, obj) { 28 | var meta_obj = {}; 29 | meta_obj.name = $(this).attr('name'); 30 | meta_obj.content = $(this).attr('content'); 31 | meta_tags.push(meta_obj); 32 | }); 33 | object.meta_tags = meta_tags; 34 | 35 | // gets all links in the html document 36 | var links = []; 37 | $html.find('a').each(function(i, obj) { 38 | var link = app.makeLink(url, $(this).attr('href')); 39 | if(link != null) { 40 | links.push(link); 41 | } 42 | }); 43 | object.links = links; 44 | 45 | return JSON.stringify(object); 46 | } 47 | 48 | this.parseLinks = function(html, url, headers, status, jQuery) { 49 | var app = this; 50 | var $ = jQuery; 51 | var $html = app.parseHtml(html, $); 52 | var links = []; 53 | 54 | // gets all links in the html document 55 | $html.find('a').each(function(i, obj) { 56 | var link = app.makeLink(url, $(this).attr('href')); 57 | 58 | if(link != null) { 59 | links.push(link); 60 | } 61 | }); 62 | 63 | return links; 64 | } 65 | } 66 | 67 | module.exports = function (EightyAppBase) { 68 | EightyApp.prototype = new EightyAppBase(); 69 | return new EightyApp(); 70 | } 71 | -------------------------------------------------------------------------------- /apps/CrawlImages.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This 80app can be used to crawl images. The following properties of an image will be returned: 3 | * - base64: the image encoded using base64 4 | * - byteSize: the size of the image 5 | * - width: the width of the image in pixels 6 | * - height: the height of the image in pixels 7 | * - type: the true type of the image i.e. jpg, png etc. This goes off the metadata of the 8 | * image itself as opposed to the extension in the url 9 | * 10 | * If the content of the url cannot be identified as an image, the 80app will return an 11 | * empty object. 
12 | * 13 | * Note: This 80app will ONLY work against urls that have ?80flag=type:image tacked onto the 14 | * end of the url 15 | * 16 | * Supported image formats: 17 | * - bmp 18 | * - cur 19 | * - dds 20 | * - gif 21 | * - icns 22 | * - ico 23 | * - jpg 24 | * - png 25 | * - psd 26 | * - svg 27 | * - tiff 28 | * - webp 29 | */ 30 | 31 | const EightyApp = require('eighty-app'); 32 | const sizeOf = require('image-size'); 33 | 34 | const app = new EightyApp(); 35 | 36 | app.processDocument = function (arrayBuffer, url, headers, status) { 37 | if (status > 299) { 38 | return {}; 39 | } 40 | 41 | const base64 = arrayBufferToBase64(arrayBuffer); 42 | 43 | if (!base64) { 44 | return {}; 45 | } 46 | 47 | const dimensions = sizeOf(arrayBuffer); 48 | 49 | const data = { 50 | ...dimensions, 51 | byteSize: arrayBuffer.length, 52 | base64, 53 | sourceURL: url.replace("?80flag=type:image", "") 54 | }; 55 | 56 | 57 | return JSON.stringify(data); 58 | } 59 | 60 | app.parseLinks = function (html, url, headers, status, cheerio) { 61 | if (status > 299) { 62 | return []; 63 | } 64 | 65 | url = url.replace("?80flag=type:image", ""); 66 | if (url.match(/\.(bmp|cur|dds|gif|icns|ico|jpg|png|psd|svg|tiff|webp)$/)) { 67 | return []; 68 | } 69 | 70 | const links = []; 71 | const $html = app.parseHtml(html, cheerio); 72 | 73 | $html.find('img').each(function () { 74 | // Add the 80flag to the image url to let the crawler know to base 64 encode the image. 
75 | // The 80flag is also the filter used in the processDocument section 76 | const link = app.append80FlagToLink("type:image", cheerio(this).attr("src")); 77 | links.push(link); 78 | }); 79 | 80 | // gets all links in the html document 81 | $html.find('a').each(function () { 82 | const link = app.makeLink(url, cheerio(this).attr('href')); 83 | 84 | if (link != null) { 85 | links.push(link); 86 | } 87 | }); 88 | 89 | return app.eliminateDuplicates(links);; 90 | } 91 | 92 | function arrayBufferToBase64(arrayBuffer) { 93 | var base64 = '' 94 | var encodings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/' 95 | 96 | var bytes = new Uint8Array(arrayBuffer) 97 | var byteLength = bytes.byteLength 98 | var byteRemainder = byteLength % 3 99 | var mainLength = byteLength - byteRemainder 100 | 101 | var a, b, c, d 102 | var chunk 103 | 104 | // Main loop deals with bytes in chunks of 3 105 | for (var i = 0; i < mainLength; i = i + 3) { 106 | // Combine the three bytes into a single integer 107 | chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2] 108 | 109 | // Use bitmasks to extract 6-bit segments from the triplet 110 | a = (chunk & 16515072) >> 18 // 16515072 = (2^6 - 1) << 18 111 | b = (chunk & 258048) >> 12 // 258048 = (2^6 - 1) << 12 112 | c = (chunk & 4032) >> 6 // 4032 = (2^6 - 1) << 6 113 | d = chunk & 63 // 63 = 2^6 - 1 114 | 115 | // Convert the raw binary segments to the appropriate ASCII encoding 116 | base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d] 117 | } 118 | 119 | // Deal with the remaining bytes and padding 120 | if (byteRemainder == 1) { 121 | chunk = bytes[mainLength] 122 | 123 | a = (chunk & 252) >> 2 // 252 = (2^6 - 1) << 2 124 | 125 | // Set the 4 least significant bits to zero 126 | b = (chunk & 3) << 4 // 3 = 2^2 - 1 127 | 128 | base64 += encodings[a] + encodings[b] + '==' 129 | } else if (byteRemainder == 2) { 130 | chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1] 131 | 132 | a = (chunk & 
64512) >> 10 // 64512 = (2^6 - 1) << 10 133 | b = (chunk & 1008) >> 4 // 1008 = (2^6 - 1) << 4 134 | 135 | // Set the 2 least significant bits to zero 136 | c = (chunk & 15) << 2 // 15 = 2^4 - 1 137 | 138 | base64 += encodings[a] + encodings[b] + encodings[c] + '=' 139 | } 140 | 141 | return base64 142 | } 143 | 144 | module.exports = function () { 145 | return app; 146 | } -------------------------------------------------------------------------------- /apps/SiteSpecificScrapers/rentdotcom.js: -------------------------------------------------------------------------------- 1 | var EightyApp = function() { 2 | this.processDocument = function(html, url, headers, status, cheerio) { 3 | var app = this; 4 | $ = cheerio; 5 | var $html = app.parseHtml(html, $); 6 | var objects = []; 7 | 8 | if (/([0-9]+)$/.test(url)) { 9 | var headerGroup = $html.find(".header-group"), 10 | leasingDetails = $html.find(".leasing-details"), 11 | propertyName = $(headerGroup).find("h1[itemprop=name]").first().text().trim(), 12 | address = $(headerGroup).find("span[itemprop=streetAddress]").first().text(), 13 | locality = $(headerGroup).find("span[itemprop=addressLocality]").first().text(), 14 | region = $(headerGroup).find("span[itemprop=addressRegion]").first().text(), 15 | postalCode = $(headerGroup).find("span[itemprop=postalCode]").first().text(), 16 | phone = $html.find('#pdp-leasing-info .tel').first().text(), 17 | managedBy = $html.find(".leasing-info-check-avail").prev().find("p").text(), 18 | petPolicy = $(leasingDetails).find('p').eq(0).text().trim().replace("\n", " ", "gm"), 19 | leasingTerms = $(leasingDetails).find('p').last().prev().text(), 20 | hours = $html.find(".office-hours").text().trim().replace("\n", " ", "gm"); 21 | 22 | 23 | var images = []; 24 | 25 | $html.find(".vert-wrap").each(function() { 26 | $(this).data("getimg") && images.push( $(this).data("getimg").replace("//", "") ); 27 | }); 28 | 29 | var floorPlanDivs = $html.find('div[data-trackgroup=floorPlans]'), 30 
| floorPlans = [], 31 | floorPlanBeds = [], 32 | floorPlanBaths = [], 33 | floorPlanRents = [], 34 | floorPlanSqfts = [], 35 | floorPlanDeposits = [], 36 | floorPlanAvailabilities = [], 37 | floorPlan; 38 | 39 | $(floorPlanDivs).find(".fp-bed dd").each(function(){ 40 | floorPlanBeds.push( $(this).text() ); 41 | }); 42 | 43 | $(floorPlanDivs).find(".fp-bath dd").each(function(){ 44 | floorPlanBaths.push( $(this).text() ); 45 | }); 46 | 47 | $(floorPlanDivs).find(".fp-rent dd").each(function(){ 48 | floorPlanRents.push( $(this).text() ); 49 | }); 50 | 51 | $(floorPlanDivs).find(".fp-sqft dd").each(function(){ 52 | floorPlanSqfts.push( $(this).text() ); 53 | }); 54 | $(floorPlanDivs).find(".fp-deposit dd").each(function(){ 55 | floorPlanDeposits.push( $(this).text() ); 56 | }); 57 | 58 | var amenities = $html.find(".det-content-list").children(), 59 | features = [], 60 | feature; 61 | 62 | for (var i = 0; i < amenities.length; i++) { 63 | if ( amenities[i].name === "dt" ) { 64 | feature = {}; 65 | features.push(feature); 66 | features[features.length - 1]["key"] = $(amenities[i]).text(); 67 | features[features.length - 1]["value"] = []; 68 | } 69 | else if ( amenities[i].name === "dd" ) { 70 | features[features.length - 1]["value"].push( $(amenities[i]).text() ); 71 | } 72 | } 73 | 74 | pageReviews = $html.find(".individual-review"); 75 | 76 | if ( $(pageReviews).length > 0 ) { 77 | var reviews = [], 78 | review; 79 | 80 | $(pageReviews).each(function(){ 81 | review = {}; 82 | review.username = $(this).find(".resident-name").text(); 83 | review.rating = $(this).find("meta[itemprop=reviewRating]").attr("content"); 84 | review.text = $(this).find(".blurb").text() + $(this).find(".remainder").text(); 85 | review.numHelpful = $(this).find(".thumbsUp.count").text(); 86 | review.dateSeen = new Date(); 87 | 88 | reviews.push(review); 89 | }); 90 | } 91 | 92 | $html.find('div.row[data-trackgroup=floorPlans]').each(function(i){ 93 | object = {}; 94 | object.propertyType = 
"apartment"; 95 | object.propertyName = propertyName; 96 | object.managedBy = managedBy; 97 | object.address = address; 98 | object.locality = locality; 99 | object.region = region; 100 | object.country = "US"; 101 | object.postalcode = postalCode; 102 | object.phone = phone; 103 | object.hours = hours; 104 | object.images = images; 105 | object.petPolicy = petPolicy; 106 | object.leasingTerms = leasingTerms; 107 | object.unitName = $(this).find(".fp-prop-name").text(); 108 | object.numBedrooms = floorPlanBeds[i]; 109 | object.numBathrooms = floorPlanBaths[i]; 110 | object.features = features; 111 | if (reviews) { 112 | object.reviews = reviews; 113 | } 114 | 115 | price = {}; 116 | price.price = floorPlanRents[i].replace("$", "", "gm") + " USD"; 117 | price.dateSeen = new Date(); 118 | object.price = price; 119 | 120 | object.size = floorPlanSqfts[i] + " sqft"; 121 | object.deposit = floorPlanDeposits[i]; 122 | 123 | objects.push(object); 124 | }); 125 | 126 | } 127 | return JSON.stringify(objects); 128 | }; 129 | 130 | this.parseLinks = function(html, url, headers, status, cheerio) { 131 | var app = this; 132 | $ = cheerio; 133 | var $html = app.parseHtml(html, $); 134 | var links = []; 135 | 136 | if (/(sitemap)/.test(url)) { 137 | $html.find("#sitemap-links-cont li a").each(function() { 138 | if ( $(this).attr("href").charAt(0) == "/" ) { 139 | links.push( "http://www.rent.com" + $(this).attr("href") ); 140 | } 141 | else { 142 | links.push( "http://www.rent.com/" + $(this).attr("href") ); 143 | } 144 | }); 145 | } 146 | 147 | console.log("parseLinks"); 148 | return links; 149 | }; 150 | }; 151 | 152 | module.exports = function (EightyAppBase) { 153 | EightyApp.prototype = new EightyAppBase(); 154 | return new EightyApp(); 155 | } -------------------------------------------------------------------------------- /apps/DocumentsAndImages.js: -------------------------------------------------------------------------------- 1 | // This 80app collects document data 
and images 2 | 3 | var EightyApp = function() { 4 | 5 | this.processDocument = function(html, url, headers, status, jQuery) { 6 | // Create empty object for storing image information. 7 | var object = {}; 8 | 9 | // Check for 80flag that will indicate that the url is an image. 10 | if (url.match("80flag=type:image")) { 11 | 12 | // Append the correct prefix based on image type. 13 | var encodedString = ""; 14 | if (url.match(".jpg")) { 15 | encodedString = "data:image/jpg;base64," + html; 16 | } else if (url.match(".png")) { 17 | encodedString = "data:image/png;base64," + html; 18 | } else if (url.match(".gif")) { 19 | encodedString = "data:image/gif;base64," + html; 20 | } 21 | 22 | // Generate a new image object, set source as the base64 encoded string. 23 | var image = new Image(); 24 | image.src = encodedString; 25 | 26 | // Convert the encoded string to binary. 27 | var binary = atob(encodedString.split(",")[1]); 28 | var len = binary.length; 29 | var buffer = new ArrayBuffer(len); 30 | var view = new Uint8Array(buffer); 31 | 32 | // Get turn the binary into a blog to get the image size in bytes. 33 | for (var i = 0; i < len; i++) { 34 | view[i] = binary.charCodeAt(i); 35 | } 36 | var myBlob = new Blob([view]); 37 | 38 | // Add information to object to be returned. 
39 | object.byteSize = myBlob.size; 40 | object.width = image.width; 41 | object.height = image.height; 42 | object.base64 = encodedString; 43 | object.binary = binary; 44 | object.source_url = url.replace("?80flag=type:image", ""); 45 | 46 | } else { 47 | 48 | var app = this; 49 | var $ = jQuery; 50 | var $html = app.parseHtml(html, $); 51 | var object = {}; 52 | 53 | object.title = $html.filter('title').text(); 54 | object.meta_description = $html.filter('meta[name="description"]').attr('content'); 55 | object.meta_keywords= $html.filter('meta[name="keywords"]').attr('content'); 56 | var meta_tags = []; 57 | $html.filter('meta').each(function(i, obj) { 58 | var meta_obj = {}; 59 | meta_obj.name = $(this).attr('name'); 60 | meta_obj.content = $(this).attr('content'); 61 | meta_tags.push(meta_obj); 62 | }); 63 | object.meta_tags = meta_tags; 64 | 65 | // gets all links in the html document 66 | var links = []; 67 | $html.find('a').each(function(i, obj) { 68 | var link = app.makeLink(url, $(this).attr('href')); 69 | if(link != null) { 70 | links.push(link); 71 | } 72 | }); 73 | object.links = links; 74 | object.numLinks = links.length; 75 | 76 | // 80Legs converts tags into tags, so find the img80 tags. 77 | var images = []; 78 | $html.find('img80').each(function(i, obj) { 79 | // Add the 80flag to the image url to let the crawler know to base 64 encode the image. 80 | // The 80flag is also the filter used in the processDocument section 81 | var imagelink = app.append80FlagToLink("type:image", $(this).attr("src")); 82 | images.push(app.makeLink(url, imagelink)); 83 | }); 84 | images = app.eliminateDuplicates(images); 85 | object.images = images; 86 | object.numImages = images.length; 87 | 88 | object.html = html; 89 | } 90 | 91 | return JSON.stringify(object); 92 | } 93 | 94 | this.parseLinks = function(html, url, headers, status, jQuery) { 95 | var links = []; 96 | 97 | // If already on an image url, just return the empty links array. 
98 | if (url.match(/\.(jpg|png|gif)/g)) { 99 | return links; 100 | } else { 101 | // Use jQuery to parse the document 102 | var app = this; 103 | $ = jQuery; 104 | var $html = app.parseHtml(html, $); 105 | 106 | // 80Legs converts tags into tags, so find the img80 tags. 107 | $html.find('img80').each(function(i, obj) { 108 | // Add the 80flag to the image url to let the crawler know to base 64 encode the image. 109 | // The 80flag is also the filter used in the processDocument section 110 | var link = app.append80FlagToLink("type:image", $(this).attr("src")); 111 | links.push(app.makeLink(url, link)); 112 | }); 113 | links = app.eliminateDuplicates(links); 114 | 115 | var r = /:\/\/(.[^/]+)/; 116 | var urlDomain = url.match(r)[1] 117 | 118 | // gets all links in the html document 119 | $html.find('a').each(function(i, obj) { 120 | // console.log($(this).attr('href')); 121 | var link = app.makeLink(url, $(this).attr('href')); 122 | 123 | if (link != null) { 124 | var linkDomain = link.match(r); 125 | if (linkDomain && linkDomain.length > 1) { 126 | linkDomain = linkDomain[1]; 127 | if (urlDomain.toLowerCase() == linkDomain.toLowerCase()) 128 | links.push(link); 129 | } 130 | } 131 | }); 132 | 133 | return links; 134 | } 135 | 136 | }; 137 | }; 138 | 139 | module.exports = function (EightyAppBase) { 140 | EightyApp.prototype = new EightyAppBase(); 141 | return new EightyApp(); 142 | } 143 | -------------------------------------------------------------------------------- /apps/SiteSpecificScrapers/IMDBScraper.js: -------------------------------------------------------------------------------- 1 | var EightyApp = function() { 2 | 3 | this.processDocument = function(html, url, headers, status, jQuery) { 4 | var app = this; 5 | $ = jQuery; 6 | var $html = app.parseHtml(html, $); 7 | var object = {}; 8 | 9 | if (url.match("/name/")) { 10 | 11 | if (url.match("/bio")) { 12 | 13 | object.data_type = "actor"; 14 | object.name = 
$html.find('h3[itemprop="name"]').text().trim(); 15 | 16 | object.overviewDetails = []; 17 | $html.find('table[id="overviewTable"] tr').each(function(i, obj) { 18 | var overviewDetail = {}; 19 | overviewDetail.label = $(this).find('td.label').text().trim(); 20 | overviewDetail.value = $(this).find('td').first().next().text().trim(); 21 | object.overviewDetails.push(overviewDetail); 22 | }); 23 | 24 | object.spouseDetails = []; 25 | $html.find('table[id="tableSpouses"] tr').each(function(i, obj) { 26 | var spouseDetail = {}; 27 | spouseDetail.label = $(this).find('td').first().text().trim(); 28 | spouseDetail.value = $(this).find('td').first().next().text().trim(); 29 | object.spouseDetails.push(spouseDetail); 30 | }); 31 | 32 | object.miniBioDetails = []; 33 | $html.find('a[name="mini_bio"]').next().nextUntil('a').each(function(i, obj) { 34 | var miniBioDetail = {}; 35 | miniBioDetail.value = $(this).text().trim(); 36 | object.miniBioDetails.push(miniBioDetail); 37 | }); 38 | 39 | object.spouseDetails = []; 40 | $html.find('table[id="tableSpouses"] tr').each(function(i, obj) { 41 | var spouseDetail = {}; 42 | spouseDetail.name = $(this).find('td').first().text().trim(); 43 | spouseDetail.value = $(this).find('td').first().next().text().trim(); 44 | object.spouseDetails.push(spouseDetail); 45 | }); 46 | 47 | object.trademarkDetails = []; 48 | $html.find('a[name="trademark"]').next().nextUntil('a').each(function(i, obj) { 49 | var trademarkDetail = {}; 50 | trademarkDetail.value = $(this).text().trim(); 51 | object.trademarkDetails.push(trademarkDetail); 52 | }); 53 | 54 | object.personalQuotes = []; 55 | $html.find('a[name="quotes"]').next().nextUntil('a').each(function(i, obj) { 56 | var personalQuote = {}; 57 | personalQuote.value = $(this).text().trim(); 58 | object.personalQuotes.push(personalQuote); 59 | }); 60 | 61 | object.triviaDetails = []; 62 | $html.find('a[name="trivia"]').next().nextUntil('a').each(function(i, obj) { 63 | var triviaDetail = {}; 64 | 
triviaDetail.value = $(this).text().trim(); 65 | object.triviaDetails.push(triviaDetail); 66 | }); 67 | 68 | object.bioDetails = []; 69 | $html.find('div[class="soda odd"]').each(function(i, obj) { 70 | var bioDetail = {}; 71 | bioDetail.value = $(this).text().trim(); 72 | object.bioDetails.push(bioDetail); 73 | }); 74 | $html.find('div[class="soda even"]').each(function(i, obj) { 75 | var bioDetail = {}; 76 | bioDetail.value = $(this).text().trim(); 77 | object.bioDetails.push(bioDetail); 78 | }); 79 | 80 | object.salaries = []; 81 | $html.find('table[id="salariesTable"] tr').each(function(i, obj) { 82 | var salary = {}; 83 | salary.project = $(this).find('td').first().text().trim(); 84 | salary.value = $(this).find('td').first().next().text().trim(); 85 | object.salaries.push(salary); 86 | }); 87 | 88 | } else if (url.match("/awards")) { 89 | 90 | object.data_type = "actor"; 91 | object.name = $html.find('h3[itemprop="name"]').text().trim(); 92 | 93 | object.awards = []; 94 | $html.find('table[class="awards"] tr').each(function(i, obj) { 95 | var award = {}; 96 | award.year = $(this).find('td.award_year').text().trim(); 97 | award.outcome = $(this).find('td.award_outcome').text().trim(); 98 | award.description = $(this).find('td.award_description').text().trim(); 99 | object.awards.push(award); 100 | }); 101 | 102 | } else if (url.match("/externalsites")) { 103 | 104 | object.data_type = "actor"; 105 | object.name = $html.find('h3[itemprop="name"]').text().trim(); 106 | 107 | object.externalSites = []; 108 | $html.find('ul.simpleList li').each(function(i, obj) { 109 | var externalSite = {}; 110 | externalSite.URL = $(this).find('a').attr('href'); 111 | externalSite.text = $(this).find('a').text().trim(); 112 | object.externalSites.push(externalSite); 113 | }); 114 | 115 | } else { 116 | object.data_type = "actor"; 117 | object.name = $html.find('span[itemprop="name"]').text().trim(); 118 | object.description = $html.find('div[id="name-bio-text"]').text().trim(); 
119 | object.born = $html.find('div[id="name-born-info"]').text().trim(); 120 | object.otherWorks = $html.find('div[id="details-other-works"]').text().trim(); 121 | object.publicityListings = $html.find('div[id="details-publicity-listings"]').text().trim(); 122 | object.alternateNames = $html.find('div[id="details-akas"]').text().trim(); 123 | object.height = $html.find('div[id="details-height"]').text().trim(); 124 | 125 | object.filmography = []; 126 | $html.find('div[class="filmo-row odd"]').each(function(i, obj) { 127 | var filmographyCredit = {}; 128 | filmographyCredit.date = $(this).find('span.year_column').text().trim(); 129 | filmographyCredit.title = $(this).find('b').text().trim(); 130 | filmographyCredit.role = $(this).find('a').first().text().trim(); 131 | object.filmography.push(filmographyCredit); 132 | }); 133 | $html.find('div[class="filmo-row even"]').each(function(i, obj) { 134 | var filmographyCredit = {}; 135 | filmographyCredit.date = $(this).find('span.year_column').text().trim(); 136 | filmographyCredit.title = $(this).find('b').text().trim(); 137 | filmographyCredit.role = $(this).find('a').first().text().trim(); 138 | object.filmography.push(filmographyCredit); 139 | }); 140 | 141 | } 142 | } else if (url.match("title/")) { 143 | 144 | if (url.match("fullcredits")) { 145 | 146 | object.data_type = "movie"; 147 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 148 | 149 | object.cast = []; 150 | $html.find('tr.odd').each(function(i, obj) { 151 | var castmember = {}; 152 | castmember.name = $(this).find('td[itemprop="actor"] span[itemprop="name"]').text().trim(); 153 | castmember.role = $(this).find('td.character div').text().trim(); 154 | object.cast.push(castmember); 155 | }); 156 | $html.find('tr.even').each(function(i, obj) { 157 | var castmember = {}; 158 | castmember.name = $(this).find('td[itemprop="actor"] span[itemprop="name"]').text().trim(); 159 | castmember.role = $(this).find('td.character div').text().trim(); 
160 | object.cast.push(castmember); 161 | }); 162 | 163 | } else if (url.match("/trivia?ref_=tt_ql_trv_1")){ 164 | 165 | object.data_type = "movie"; 166 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 167 | 168 | object.trivia = []; 169 | $html.find('div[class="sodatext"]').each(function(i, obj) { 170 | object.trivia.push($(this).text().trim()); 171 | }); 172 | 173 | } else if (url.match("/quotes")){ 174 | 175 | object.data_type = "movie"; 176 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 177 | 178 | object.quotes = []; 179 | $html.find('div[class="sodatext"]').each(function(i, obj) { 180 | object.quotes.push($(this).text().trim()); 181 | }); 182 | 183 | } else if (url.match("/plotsummary")){ 184 | 185 | object.data_type = "movie"; 186 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 187 | 188 | object.plots = []; 189 | $html.find('p[class="plotSummary"]').each(function(i, obj) { 190 | object.plots.push($(this).text().trim()); 191 | }); 192 | 193 | } else if (url.match("/releaseinfo")){ 194 | 195 | object.data_type = "movie"; 196 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 197 | 198 | object.releasedates = []; 199 | $html.find('table[id="release_dates"] tr').each(function(i, obj) { 200 | var releasedate = {}; 201 | releasedate.location = $(this).find('td').first().text().trim(); 202 | releasedate.date = $(this).find('td').next().text().trim(); 203 | object.releasedates.push(releasedate); 204 | }); 205 | 206 | } else if (url.match("/companycredits")){ 207 | 208 | object.data_type = "movie"; 209 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 210 | 211 | object.companycredits = []; 212 | $html.find('ul.simpleList li').each(function(i, obj) { 213 | object.companycredits.push($(this).text().trim()); 214 | }); 215 | 216 | } else if (url.match("/parentalguide")){ 217 | 218 | object.data_type = "movie"; 219 | object.title = $html.find('h3[itemprop="name"] 
a').text().trim(); 220 | 221 | object.certifications = []; 222 | $html.find('div.info div.info-content a').each(function(i, obj) { 223 | if ($(this).attr('href').match("certificates")) { 224 | object.certifications.push($(this).text().trim()); 225 | } 226 | }); 227 | 228 | } else if (url.match("/locations")){ 229 | 230 | object.data_type = "movie"; 231 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 232 | 233 | object.locations = []; 234 | $html.find('div[class="soda sodavote even"]').each(function(i, obj) { 235 | object.locations.push($(this).find('dt').text().trim()); 236 | }); 237 | $html.find('div[class="soda sodavote odd"]').each(function(i, obj) { 238 | object.locations.push($(this).find('dt').text().trim()); 239 | }); 240 | 241 | } else if (url.match("/awards")){ 242 | 243 | object.data_type = "movie"; 244 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 245 | 246 | object.awards = []; 247 | $html.find('td.award_description').each(function(i, obj) { 248 | var award = {}; 249 | award.value = $(this).text().trim(); 250 | award.status = $(this).parent().find('td.title_award_outcome').text().trim(); 251 | object.awards.push(award); 252 | }); 253 | 254 | } else if (url.match("/technical")){ 255 | 256 | object.data_type = "movie"; 257 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 258 | 259 | object.technicalspecs = []; 260 | $html.find('table[class="dataTable labelValueTable"] tr').each(function(i, obj) { 261 | var technicalspec = {}; 262 | technicalspec.key = $(this).find('td.label').text().trim(); 263 | technicalspec.value = $(this).find('td').next().text().trim(); 264 | object.technicalspecs.push(technicalspec); 265 | }); 266 | 267 | } else if (url.match("/soundtrack")){ 268 | 269 | object.data_type = "movie"; 270 | object.title = $html.find('h3[itemprop="name"] a').text().trim(); 271 | 272 | object.soundtrack = []; 273 | $html.find('div[class="soundTrack soda odd"]').each(function(i, obj) { 274 | 
object.soundtrack.push($(this).text().trim()); 275 | }); 276 | $html.find('div[class="soundTrack soda odd"]').each(function(i, obj) { 277 | object.soundtrack.push($(this).text().trim()); 278 | }); 279 | 280 | } else { 281 | 282 | object.data_type = "movie"; 283 | object.title = $html.find('h1.header span[itemprop="name"]').text().trim(); 284 | object.date = $html.find('meta[itemprop="datePublished"]').attr('content').trim(); 285 | object.runningtime = $html.find('time[itemprop="duration"]').text().trim(); 286 | 287 | object.categories = []; 288 | $html.find('div[itemprop="genre"] a').each(function(i, obj) { 289 | object.categories.push($(this).text()); 290 | }); 291 | 292 | object.description = $html.find('p[itemprop="description"]').text().trim(); 293 | object.director = $html.find('div[itemprop="director"] span.itemprop').text().trim(); 294 | 295 | object.writers = []; 296 | $html.find('div[itemprop="creator"] a').each(function(i, obj) { 297 | object.writers.push($(this).find('span').text().trim()); 298 | }); 299 | 300 | object.rating = $html.find('span[itemprop="ratingValue"]').text().trim(); 301 | 302 | object.details = []; 303 | $html.find('div[id="titleDetails"] div.txt-block').each(function(i, obj) { 304 | var detail = {}; 305 | detail.key = $(this).find('h4').text().trim().replace(":",""); 306 | detail.value = $(this).text().trim().replace(/^.*?:/,""); 307 | object.details.push(detail); 308 | }); 309 | 310 | } 311 | } 312 | 313 | return JSON.stringify(object); 314 | }; 315 | 316 | this.parseLinks = function(html, url, headers, status, jQuery) { 317 | var app = this; 318 | $ = jQuery; 319 | var $html = app.parseHtml(html, $); 320 | var links = []; 321 | 322 | if (url.match("/title")) { 323 | 324 | $html.find('div.see-more a').each(function(i, obj) { 325 | var link = app.makeLink(url, $(this).attr('href')); 326 | links.push(link); 327 | }); 328 | 329 | $html.find('ul[class="quicklinks"] a').each(function(i, obj) { 330 | var link = app.makeLink(url, 
$(this).attr('href')); 331 | links.push(link); 332 | }); 333 | 334 | 335 | } else if (url.match("search/name")) { 336 | 337 | if (url.match("start=")) { 338 | $html.find('td[class="name"] a').each(function(i, obj) { 339 | if ($(this).attr('href').match("/name/")) { 340 | var link = app.makeLink(url, $(this).attr('href')); 341 | links.push(link); 342 | } 343 | }); 344 | 345 | var link = app.makeLink(url, $html.find('span.pagination a').attr('href')); 346 | links.push(link); 347 | } 348 | 349 | } else { 350 | 351 | $html.find('div[id="maindetails_quicklinks"] a').each(function(i, obj) { 352 | var link = app.makeLink(url, $(this).attr('href')); 353 | links.push(link); 354 | }); 355 | 356 | $html.find('ul[class="quicklinks"] a').each(function(i, obj) { 357 | var link = app.makeLink(url, $(this).attr('href')); 358 | links.push(link); 359 | }); 360 | 361 | $html.find('div[class="filmo-row odd"] a').each(function(i, obj) { 362 | var link = app.makeLink(url, $(this).attr('href')); 363 | if (!link.match("/character")) { 364 | links.push(link); 365 | } 366 | }); 367 | $html.find('div[class="filmo-row even"] a').each(function(i, obj) { 368 | var link = app.makeLink(url, $(this).attr('href')); 369 | if (!link.match("/character")) { 370 | links.push(link); 371 | } 372 | }); 373 | 374 | } 375 | 376 | for (var i = 0; i < links.length; i++) { 377 | links[i] = links[i].slice(0,links[i].indexOf('?ref_=')); 378 | } 379 | 380 | return links; 381 | }; 382 | }; 383 | 384 | module.exports = function (EightyAppBase) { 385 | EightyApp.prototype = new EightyAppBase(); 386 | return new EightyApp(); 387 | } 388 | --------------------------------------------------------------------------------