├── .gitignore ├── README.md ├── demo └── server.js ├── index.js ├── lib ├── cleaner │ ├── clean-formatting.js │ ├── index.js │ ├── prep-for-parsing.js │ ├── remove-attributes.js │ ├── remove-empty-elements.js │ ├── remove-navigational-elements.js │ ├── remove-post-data-elements.js │ └── remove-social-elements.js └── parser │ ├── author.js │ ├── content.js │ ├── summary.js │ └── title.js ├── package.json └── test.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | data/ 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # article-extractor 2 | 3 | > A Node.js module to retrieve article content and metadata from a URL. 4 | 5 | **This module is under heavy development! Its quality and API will probably change a lot, so keep an eye out for any changes.** 6 | 7 | To see what features are coming up next, or if you'd like to suggest one yourself, go here: https://github.com/thomastuts/article-extractor/issues/3 8 | 9 | ## Demo 10 | You can see `article-extractor` in action here: 11 | ``` 12 | GET http://article-extractor.thomastuts.com/parse?url=AN_ARTICLE_URL 13 | ``` 14 | 15 | 16 | ## Installation 17 | `npm install --save article-extractor` 18 | 19 | ## Extracting data 20 | ```js 21 | var extractor = require('article-extractor'); 22 | 23 | extractor.extractData('http://paulgraham.com/altair.html', function (err, data) { 24 | console.log(data); 25 | }); 26 | 27 | ``` 28 | 29 | ## Extract result 30 | The result looks like this: 31 | ```json 32 | { 33 | "domain": "thomastuts.com", 34 | "author": "Thomas Tuts", 35 | "title": "Article Extractor Demo", 36 | "summary": "A Node.js module to retrieve article content and metadata from a URL.", 37 | "content": "
<div>This is the article content.</div>
" 38 | } 39 | ``` 40 | -------------------------------------------------------------------------------- /demo/server.js: -------------------------------------------------------------------------------- 1 | var restify = require('restify'); 2 | var extractor = require('../index'); 3 | var server = restify.createServer(); 4 | 5 | server.use(restify.queryParser()); 6 | 7 | server.get('/parse', function (req, res, next) { 8 | var articleUrl = req.query.url; 9 | 10 | extractor.extractData(articleUrl, function (err, data) { 11 | res.json(data); 12 | }); 13 | }); 14 | 15 | server.listen(5050); 16 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var url = require('url'); 3 | var cleaner = require('./lib/cleaner'); 4 | var author = require('./lib/parser/author'); 5 | var content = require('./lib/parser/content'); 6 | var title = require('./lib/parser/title'); 7 | var summary = require('./lib/parser/summary'); 8 | 9 | module.exports = { 10 | extractData: function (articleUrl, callback) { 11 | request(articleUrl, function (err, response, body) { 12 | var data = {}; 13 | var preppedHtml = cleaner.prepForParsing(body); 14 | 15 | data.domain = url.parse(articleUrl).host; 16 | data.author = author.getAuthor(preppedHtml); 17 | data.title = title.getTitle(preppedHtml); 18 | data.content = content.getArticleContent(preppedHtml, data.host); 19 | data.summary = summary.getSummary(preppedHtml, data.content); 20 | 21 | callback(null, data); 22 | }); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /lib/cleaner/clean-formatting.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Cleans up parsed HTML formatting by removing newlines. 3 | * 4 | * @param rawHtml 5 | * @returns {string} 6 | */ 7 | module.exports = function (rawHtml) { 8 | rawHtml = rawHtml 9 | .replace(/\n/g, '') 10 | .trim(); 11 | 12 | return rawHtml; 13 | }; 14 | -------------------------------------------------------------------------------- /lib/cleaner/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Remove newlines and other useless stuff 3 | * Remove all attributes on inline elements 4 | * Remove unwanted attributes (style, width, height, ...) 
5 | * Remove content that is not related to the article ('Click to...') 6 | * Remove links in images 7 | * Remove header elements with the article's title in them 8 | * 9 | */ 10 | 11 | var prepForParsing = require('./prep-for-parsing'); 12 | var removeAttributes = require('./remove-attributes'); 13 | var cleanFormatting = require('./clean-formatting'); 14 | var removeSocialElements = require('./remove-social-elements'); 15 | var removeNavigationalElements = require('./remove-navigational-elements'); 16 | var removeEmptyElements = require('./remove-empty-elements'); 17 | 18 | module.exports = { 19 | prepForParsing: prepForParsing, 20 | cleanAfterParsing: function (rawHtml, host) { 21 | rawHtml = removeAttributes(rawHtml); 22 | rawHtml = removeSocialElements(rawHtml); 23 | rawHtml = removeNavigationalElements(rawHtml, host); 24 | rawHtml = removeEmptyElements(rawHtml); 25 | rawHtml = cleanFormatting(rawHtml); 26 | 27 | return rawHtml; 28 | } 29 | }; 30 | -------------------------------------------------------------------------------- /lib/cleaner/prep-for-parsing.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | 3 | var elementsToRemove = [ 4 | 'script', 5 | 'header', 6 | 'footer' 7 | ]; 8 | 9 | var blacklistRegex = /ads|social|comment/i; 10 | 11 | /** 12 | * Prepares a raw HTML string by removing any unnecessary items, like scripts, headers and footers. Also tries to remove 13 | * any elements that are most likely uninteresting (comments, ads, social stuff, ...). 14 | * 15 | * @param rawHtml 16 | */ 17 | module.exports = function (rawHtml) { 18 | var $ = cheerio.load(rawHtml); 19 | 20 | var $body = $('body'); 21 | 22 | elementsToRemove.forEach(function (elementToRemove) { 23 | $body.find(elementToRemove).remove(); 24 | }); 25 | 26 | $body.find('*').filter(function () { 27 | var idAndClasses = $(this).attr('id') + $(this).attr('class'); 28 | if (idAndClasses) { 29 | return idAndClasses.match(blacklistRegex); 30 | } 31 | else { 32 | return false; 33 | } 34 | }).remove(); 35 | 36 | return $.html(); 37 | } 38 | -------------------------------------------------------------------------------- /lib/cleaner/remove-attributes.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | var _ = require('lodash'); 3 | 4 | var attributesToKeep = [ 5 | 'src', 6 | 'href', 7 | 'target' 8 | ]; 9 | 10 | /** 11 | * Removes all attributes from a given HTML string, except for the ones we're still interested in, such as img src, 12 | * anchor hrefs, ... 13 | * 14 | * @param rawHtml 15 | */ 16 | module.exports = function (rawHtml) { 17 | var $ = cheerio.load(rawHtml); 18 | 19 | $('*').each(function () { 20 | var element = this; 21 | var attributes = _.chain(element.attribs) 22 | .keys() 23 | .difference(attributesToKeep) 24 | .value(); 25 | 26 | attributes.forEach(function (attribute) { 27 | $(element).removeAttr(attribute); 28 | }); 29 | }); 30 | 31 | return $.html(); 32 | } 33 | -------------------------------------------------------------------------------- /lib/cleaner/remove-empty-elements.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | var _ = require('lodash'); 3 | 4 | /** 5 | * Removes all empty elements. 
6 | * 7 | * @param rawHtml 8 | */ 9 | module.exports = function (rawHtml) { 10 | var $ = cheerio.load(rawHtml); 11 | 12 | $('*').each(function () { 13 | var children = $(this).children().length; 14 | var content = $(this).text().replace(/\t|\s/g, ''); 15 | var isImage = $(this)[0].tagName === 'img'; 16 | 17 | if (!children && !content && !isImage) { 18 | $(this).remove(); 19 | } 20 | }); 21 | 22 | return $.html(); 23 | } 24 | -------------------------------------------------------------------------------- /lib/cleaner/remove-navigational-elements.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | var _ = require('lodash'); 3 | 4 | /** 5 | * Removes all elements that are used for navigation (such as 'to top' links, article tags, ...) 6 | * 7 | * @param rawHtml 8 | */ 9 | module.exports = function (rawHtml, host) { 10 | var $ = cheerio.load(rawHtml); 11 | 12 | // Filter out 'back to top' links 13 | $('a').filter(function () { 14 | var hasTopInText = $(this).text().toLowerCase().indexOf('top') > -1; 15 | var hasHashInHref = $(this).attr('href').indexOf('#') > -1; 16 | return hasTopInText && hasHashInHref; 17 | }).remove(); 18 | 19 | // Filter out any links that have the `rel="tag"` attribute, or link back to the same host with 'tag' in the URL. 20 | $('a').each(function () { 21 | var relTag = $(this).attr('rel'); 22 | var href = $(this).attr('href'); 23 | 24 | var isRelTag = relTag === 'tag'; 25 | var isPartOfList = $(this).parents('ul').length > 0; 26 | var containsUrlWithTag = href.indexOf(host) > -1 && href.indexOf('tag') > -1; 27 | 28 | if (isRelTag || containsUrlWithTag) { 29 | if (isPartOfList) { 30 | $(this).parents('ul').remove(); 31 | } 32 | else { 33 | $(this).remove(); 34 | } 35 | } 36 | 37 | // Remove any other elements with a `tags` class. 38 | $('.tags').remove(); 39 | }); 40 | 41 | return $.html(); 42 | } 43 | -------------------------------------------------------------------------------- /lib/cleaner/remove-post-data-elements.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | var _ = require('lodash'); 3 | 4 | /** 5 | * Removes all post elements that include author, the post's date, ... 6 | * 7 | * @param rawHtml 8 | */ 9 | module.exports = function (rawHtml, host) { 10 | var $ = cheerio.load(rawHtml); 11 | 12 | $('*[property="author"]').remove(); 13 | $('*[rel="author"]').remove(); 14 | $('datetime').remove(); 15 | $('.date').remove(); 16 | 17 | return $.html(); 18 | } 19 | -------------------------------------------------------------------------------- /lib/cleaner/remove-social-elements.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | var _ = require('lodash'); 3 | 4 | var shareUrls = [ 5 | 'twitter.com/intent', 6 | 'facebook.com/sharer' 7 | ]; 8 | 9 | /** 10 | * Removes all elements that contain any social keywords. 
11 | *
12 | * @param rawHtml
13 | */
14 | module.exports = function (rawHtml) {
15 |   var $ = cheerio.load(rawHtml);
16 | 
17 |   $('*').each(function () {
18 |     var text = $(this).text().toLowerCase();
19 |     var possibleSocialElement = text.indexOf('share on') > -1;
20 | 
21 |     if (possibleSocialElement) {
22 |       var anchors = $(this).find('a');
23 |       anchors.each(function () {
24 |         var $anchor = $(this);
25 |         var href = $anchor.attr('href');
26 | 
27 |         _.each(shareUrls, function (shareUrl) {
28 |           if (href && href.indexOf(shareUrl) > -1) {
29 |             $anchor.remove();
30 |           }
31 |         });
32 |       });
33 |     }
34 |   });
35 | 
36 |   return $.html();
37 | }
38 | 
--------------------------------------------------------------------------------
/lib/parser/author.js:
--------------------------------------------------------------------------------
 1 | var cheerio = require('cheerio');
 2 | 
 3 | /**
 4 | * Tries to get the author from three sources: the `<meta name="author">` tag, any anchors with the `rel="author"`
 5 | * attribute or, as a last resort, the text value from a DOM element with an `author` class.
 6 | *
 7 | * @param html
 8 | * @returns {string}
 9 | */
10 | function getAuthor(html) {
11 |   var $ = cheerio.load(html);
12 | 
13 |   var metatagAuthor = $('meta[name="author"]').attr('content');
14 |   var semanticAuthor = $('*[rel="author"]').eq(0).text();
15 |   var classAuthor = $('.author').eq(0).text();
16 |   return metatagAuthor || semanticAuthor || classAuthor;
17 | }
18 | 
19 | module.exports = {
20 |   getAuthor: getAuthor
21 | };
22 | 
--------------------------------------------------------------------------------
/lib/parser/content.js:
--------------------------------------------------------------------------------
 1 | var cheerio = require('cheerio');
 2 | var _ = require('lodash');
 3 | var cleaner = require('../cleaner');
 4 | 
 5 | /**
 6 | * Gets a likely candidate for the article's content based on a DOM element's 'article score' (based on Readability's
 7 | * implementation at https://code.google.com/p/arc90labs-readability/source/browse/branches/haiti/js/readability.js).
 8 | * This algorithm assumes that the article is written in `<p>` tags. If it's not, it will return `undefined`.
9 | *
10 | * TODO: add additional score parameters based on paragraph length, comma occurrences and so on (see Readability above)
11 | *
12 | * @param rawHtml
13 | * @returns {*}
14 | */
15 | function getLikelyCandidate(rawHtml) {
16 | var $ = cheerio.load(rawHtml);
17 | var $body = $('body');
18 | var candidates = [];
19 |
20 | $body.find('p').each(function () {
21 | var paragraph = $(this);
22 | var parentNode = $(this).get(0).parentNode;
23 |
24 | if (!parentNode.extracted) {
25 | parentNode.extracted = {
26 | score: 0
27 | };
28 | candidates.push(parentNode);
29 | }
30 |
31 | var paragraphLength = paragraph.text().length;
32 | parentNode.extracted.score += paragraphLength;
33 | });
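// At this point every parent element that contains at least one <p> carries an `extracted.score` equal to the
// combined text length of its child paragraphs; the highest-scoring parent is treated as the article container.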
34 |
35 | if (candidates.length > 0) {
36 | var sortedByScore = _.sortBy(candidates, function (candidate) {
37 | return candidate.extracted.score;
38 | }).reverse();
39 |
40 | return $(sortedByScore[0]).html();
41 | }
42 | }
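// For example, for markup like `<div><p>long paragraph...</p><p>another one...</p></div><aside><p>short</p></aside>`,
// the <div> scores the combined length of its two paragraphs while the <aside> only scores its short one, so the
// <div>'s inner HTML is returned as the likely article content.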
43 |
44 | /**
45 | * Loops over every node in the DOM and measures the length of its own text (excluding text in child elements), then
46 | * picks the node with the longest text in the hope that it actually contains the article. This is only used as a
47 | * fallback and is not very reliable; it should be revisited in the future.
48 | *
49 | * This implementation was mostly tested on Paul Graham's essays, so it may not work well on other sites. Articles
50 | * that use proper paragraph elements never reach this fallback, because `getLikelyCandidate` handles
51 | * them first.
52 | *
53 | * @param rawHtml
54 | * @returns {string}
55 | */
56 | function getContentByLongestLength(rawHtml) {
57 | // Fallback: scan every element and keep the one with the longest direct text content.
58 | var longestTextLength = 0;
59 | var $longest = null;
60 | var $ = cheerio.load(rawHtml);
61 |
62 | $('*').each(function () {
63 | var textLength = $(this).clone().children().remove().end().text().length;
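// The clone()/children().remove()/end() chain measures only this element's own text, ignoring text that belongs to
// its child elements, so large wrapper elements don't win simply because they contain everything.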
64 | if (textLength > longestTextLength) {
65 | $longest = $(this);
66 | longestTextLength = textLength;
67 | }
68 | });
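// `$longest` now points at the element with the most direct text; its inner HTML is used as the fallback article body.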
69 |
70 | var content = $longest.html();
71 |
72 | // Replace any existing newlines with a space
73 | content = content.replace(/\r?\n|\r/g, ' ');
74 |
75 | // Replace any multiple breaks with newlines
76 | content = content.replace(/(<br\s*\/?>)\1+/g, '\n');
77 |
78 | // Replace any single breaks with newlines
79 | content = content.replace(/(<br\s*\/?>)/g, '\n');
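// The fallback content is now a single line in which '\n' only marks the paragraph boundaries introduced above.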
80 |
81 | // Replace all paragraphs divided by newlines with actual paragraphs
82 | var paragraphs = content.split('\n');
83 |
84 | var contentInParagraphs = paragraphs.map(function (paragraph) {
85 | return '<p>' + paragraph + '</p>
'; 86 | }).join(''); 87 | 88 | return contentInParagraphs; 89 | } 90 | 91 | function getArticleContent(rawHtml, host) { 92 | var content = getLikelyCandidate(rawHtml) || getContentByLongestLength(rawHtml); 93 | content = cleaner.cleanAfterParsing(content, host); 94 | return content; 95 | } 96 | 97 | module.exports = { 98 | getArticleContent: getArticleContent 99 | }; 100 | -------------------------------------------------------------------------------- /lib/parser/summary.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | 3 | var metatags = [ 4 | 'description', 5 | 'twitter:description', 6 | 'og:description' 7 | ]; 8 | 9 | /** 10 | * Gets the summary based on social metatags that are found in most blogs for sharing purposes. 11 | * 12 | * @param rawHtml 13 | * @returns {string} 14 | */ 15 | function getSummaryFromMetatags(rawHtml) { 16 | var $ = cheerio.load(rawHtml); 17 | 18 | for (var i = 0; i < metatags.length; i++) { 19 | var metatag = metatags[i]; 20 | var metaName = $('meta[name="' + metatag + '"]').attr('content'); 21 | var metaProperty = $('meta[property="' + metatag + '"]').attr('content'); 22 | 23 | if (metaName || metaProperty) { 24 | return metaName || metaProperty; 25 | } 26 | } 27 | } 28 | 29 | /** 30 | * Gets the summary by retrieving the article's content and returning the first interesting paragraph. Most definitely 31 | * not a silver bullet here, but at least it gets the job done in case there's no better option. 32 | * 33 | * @param rawHtml 34 | * @returns {string} 35 | */ 36 | function getSummaryFromContent(content) { 37 | var $ = cheerio.load(content); 38 | 39 | var interestingParagraphs = $('p').filter(function () { 40 | return $(this).text().length > 25; 41 | }); 42 | 43 | return $(interestingParagraphs).eq(0).text(); 44 | } 45 | 46 | module.exports = { 47 | getSummary: function (rawHtml, content) { 48 | var summaryFromMetags = getSummaryFromMetatags(rawHtml); 49 | 50 | if (summaryFromMetags) { 51 | return summaryFromMetags; 52 | } 53 | else { 54 | return getSummaryFromContent(content); 55 | } 56 | } 57 | }; 58 | -------------------------------------------------------------------------------- /lib/parser/title.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | 3 | var titleMetatags = [ 4 | 'og:title', 5 | 'twitter:title' 6 | ]; 7 | 8 | var sitenameMetatags = [ 9 | 'og:site_name', 10 | 'twitter:domain' 11 | ]; 12 | 13 | /** 14 | * Removes the site's name from the article title, and keeps removing the last character in the title until it hits 15 | * an alphabetic character. This is done to remove any delimiters that are usually used to add the site's name to the 16 | * article title (for example: This Is An Article | WIRED). 17 | * 18 | * @param articleTitle 19 | * @param siteName 20 | * @returns {string} 21 | */ 22 | function removeSiteNameFromTitle(articleTitle, siteName) { 23 | articleTitle = articleTitle.replace(siteName, ''); 24 | var lastChar = articleTitle.charAt(articleTitle.length - 1); 25 | 26 | while (!/[a-zA-Z|?|!|.]/.test(lastChar)) { 27 | articleTitle = articleTitle.substring(0, articleTitle.length - 1); 28 | lastChar = articleTitle.charAt(articleTitle.length - 1); 29 | } 30 | 31 | return articleTitle; 32 | } 33 | 34 | /** 35 | * Gets the site name based on metatags. 
36 | * 37 | * @param rawHtml 38 | * @returns {string} 39 | */ 40 | function getSiteName(rawHtml) { 41 | var $ = cheerio.load(rawHtml); 42 | 43 | for (var i = 0; i < sitenameMetatags.length; i++) { 44 | var metatag = sitenameMetatags[i]; 45 | var sitename = $('meta[property="' + metatag + '"]').attr('content'); 46 | 47 | if (sitename) { 48 | return sitename; 49 | } 50 | } 51 | } 52 | 53 | /** 54 | * Gets the article's title from metatags used for social sharing. 55 | * 56 | * @param rawHtml 57 | * @returns {string} 58 | */ 59 | function getTitleFromMetaTags(rawHtml) { 60 | var $ = cheerio.load(rawHtml); 61 | var title; 62 | var siteName = getSiteName(rawHtml); 63 | 64 | for (var i = 0; i < titleMetatags.length; i++) { 65 | var metatag = titleMetatags[i]; 66 | title = $('meta[property="' + metatag + '"]').attr('content'); 67 | 68 | if (title) { 69 | break; 70 | } 71 | } 72 | 73 | if (siteName) { 74 | title = removeSiteNameFromTitle(title, siteName); 75 | } 76 | 77 | return title; 78 | } 79 | 80 | /** 81 | * Gets the article name from the window's title. 82 | * 83 | * @param rawHtml 84 | * @returns {string} 85 | */ 86 | function getTitleFromWindowTitle(rawHtml) { 87 | var $ = cheerio.load(rawHtml); 88 | var title = $('title').text(); 89 | var siteName = getSiteName(rawHtml); 90 | 91 | if (siteName) { 92 | title = removeSiteNameFromTitle(title, siteName); 93 | } 94 | 95 | return title; 96 | } 97 | 98 | module.exports = { 99 | getTitle: function (rawHtml) { 100 | return getTitleFromMetaTags(rawHtml) || getTitleFromWindowTitle(rawHtml); 101 | } 102 | }; 103 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "article-extractor", 3 | "version": "1.0.2", 4 | "description": "Extract metadata and content from web articles.", 5 | "main": "index.js", 6 | "keywords": [ 7 | "article", 8 | "blog", 9 | "parsing", 10 | "extract", 11 | "web", 12 | "scrape" 13 | ], 14 | "repository": { 15 | "type": "git", 16 | "url": "https://github.com/thomastuts/article-extractor.git" 17 | }, 18 | "author": "Thomas Tuts