├── .gitignore ├── .npmignore ├── History.md ├── Makefile ├── Readme.md ├── example.js ├── index.js ├── package.json └── plugins ├── alchemy.js ├── readability.js ├── request.js └── tika.js /.gitignore: -------------------------------------------------------------------------------- 1 | .env.local 2 | node_modules 3 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | support 2 | test 3 | examples 4 | *.sock 5 | -------------------------------------------------------------------------------- /History.md: -------------------------------------------------------------------------------- 1 | 2 | 0.0.2 / 2015-04-02 3 | ================== 4 | 5 | * update readme. more abstract 6 | 7 | 0.0.1 / 2010-01-03 8 | ================== 9 | 10 | * Initial release 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | @./node_modules/.bin/mocha \ 4 | --require should \ 5 | --reporter spec 6 | 7 | .PHONY: test -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # page-pipe 3 | 4 | pass pages through a pluggable pipeline to extract information from them. 
5 | 6 | ## Current plugins 7 | 8 | - [Alchemy](http://www.alchemyapi.com/) 9 | - [Readability](http://readability.com/developers/api/parser) 10 | - [Tika](https://tika.apache.org) 11 | 12 | ## Installation 13 | 14 | ``` 15 | npm install page-pipe 16 | ``` 17 | 18 | ## API 19 | 20 | ```js 21 | pagepipe = Pagepipe() 22 | .use(readability({ key: READABILITY_API_KEY })) 23 | .use(alchemy({ key: ALCHEMY_API_KEY })) 24 | .use(tika({ url: TIKA_SERVER })); 25 | 26 | 27 | pagepipe('http://en.wikipedia.org/wiki/Sloth', function(err, data) { 28 | if (err) throw err; 29 | console.log(data); 30 | }) 31 | ``` 32 | 33 | ## License 34 | 35 | (The MIT License) 36 | 37 | Copyright (c) 2015 Matthew Mueller <matt@lapwinglabs.com> 38 | 39 | Permission is hereby granted, free of charge, to any person obtaining 40 | a copy of this software and associated documentation files (the 41 | 'Software'), to deal in the Software without restriction, including 42 | without limitation the rights to use, copy, modify, merge, publish, 43 | distribute, sublicense, and/or sell copies of the Software, and to 44 | permit persons to whom the Software is furnished to do so, subject to 45 | the following conditions: 46 | 47 | The above copyright notice and this permission notice shall be 48 | included in all copies or substantial portions of the Software. 49 | 50 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 51 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 52 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 53 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 54 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 55 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 56 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
57 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Environment variables 3 | */ 4 | 5 | require('localenv'); 6 | 7 | var envvar = require('envvar'); 8 | envvar.string('ALCHEMY_KEY'); 9 | envvar.string('READABILITY_PARSER_KEY'); 10 | envvar.string('TIKA_SERVER'); 11 | 12 | /** 13 | * Module Dependencies 14 | */ 15 | 16 | var Pagepipe = require('./'); 17 | 18 | /** 19 | * URL 20 | */ 21 | 22 | var url = 'http://seekingalpha.com/article/3046116-heres-why-facebooks-advertising-revenues-could-grow-to-over-40-billion-by-2021'; 23 | 24 | /** 25 | * Plugins 26 | */ 27 | 28 | var readability = require('./plugins/readability'); 29 | var alchemy = require('./plugins/alchemy'); 30 | var tika = require('./plugins/tika'); 31 | 32 | /** 33 | * Request 34 | */ 35 | 36 | pagepipe = Pagepipe() 37 | .use(tika({ 38 | url: process.env.TIKA_SERVER 39 | })) 40 | .use(alchemy({ 41 | key: process.env.ALCHEMY_KEY 42 | })) 43 | .use(readability({ 44 | key: process.env.READABILITY_PARSER_KEY 45 | })) 46 | 47 | /** 48 | * Make the request 49 | */ 50 | 51 | pagepipe(url, function(err, obj) { 52 | if (err) { 53 | console.log(err); 54 | // throw err; 55 | } else { 56 | console.log(JSON.stringify(obj, true, 2)); 57 | } 58 | }) 59 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Module Dependencies 3 | */ 4 | 5 | var context = require('http-context'); 6 | var assign = require('object-assign'); 7 | var Ware = require('ware'); 8 | var noop = function(){}; 9 | 10 | /** 11 | * Export `Pagepipe` 12 | */ 13 | 14 | module.exports = Pagepipe; 15 | 16 | /** 17 | * Default plugins 18 | */ 19 | 20 | var request = require('./plugins/request'); 21 | 22 | /** 23 | * Initialize the Page Pipe 24 | * with a `url`. 
25 | * 26 | * @param {Object} defaults 27 | * @return {Function} 28 | */ 29 | 30 | function Pagepipe(defaults) { 31 | var ware = Ware(); 32 | var plugins = []; 33 | var req = noop; 34 | 35 | // add in the defaults 36 | plugins.push(request()); 37 | 38 | function pagepipe(url, fn) { 39 | var ctx = context(); 40 | 41 | ctx = assign(ctx, defaults); 42 | ctx.method = 'get'; 43 | ctx.state = {}; 44 | ctx.url = url; 45 | 46 | // provide a request hook 47 | req(ctx); 48 | 49 | // run the plugins 50 | ware 51 | .use(plugins) 52 | .run(ctx, done); 53 | 54 | function done(err, ctx) { 55 | if (err) return fn(err); 56 | ctx.state.url = ctx.url; 57 | fn(null, ctx.state); 58 | } 59 | 60 | return pagepipe; 61 | } 62 | 63 | pagepipe.use = function(fn) { 64 | if (!arguments.length) return plugins; 65 | plugins.push(fn); 66 | return pagepipe; 67 | } 68 | 69 | pagepipe.request = function(fn) { 70 | if (!arguments.length) return req; 71 | req = fn; 72 | return pagepipe; 73 | } 74 | 75 | return pagepipe; 76 | } 77 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "page-pipe", 3 | "version": "0.0.2", 4 | "description": "pass pages through a pipeline to extract information from them", 5 | "keywords": [], 6 | "author": "Matthew Mueller ", 7 | "repository": { 8 | "type": "git", 9 | "url": "git://github.com/lapwinglabs/page-pipe.git" 10 | }, 11 | "dependencies": { 12 | "concat-stream": "^1.4.7", 13 | "debug": "^2.1.3", 14 | "http-context": "^1.1.0", 15 | "object-assign": "^2.0.0", 16 | "request": "^2.54.0", 17 | "ware": "^1.2.0" 18 | }, 19 | "devDependencies": { 20 | "envvar": "^1.0.0", 21 | "localenv": "^0.2.2", 22 | "mocha": "*", 23 | "should": "*", 24 | "superagent": "^1.1.0" 25 | }, 26 | "main": "index" 27 | } -------------------------------------------------------------------------------- /plugins/alchemy.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Module Dependencies 3 | */ 4 | 5 | var debug = require('debug')('page-pipe:alchemy'); 6 | var superagent = require('superagent'); 7 | var assign = require('object-assign'); 8 | var fmt = require('util').format; 9 | var assert = require('assert'); 10 | 11 | /** 12 | * Export `Alchemy` 13 | */ 14 | 15 | module.exports = Alchemy; 16 | 17 | /** 18 | * API endpoint 19 | */ 20 | 21 | var api = 'http://access.alchemyapi.com/calls/url/URLGetRankedNamedEntities?apikey=%s&outputMode=json&sentiment=1"ations=1&url=%s'; 22 | 23 | /** 24 | * Attach `alchemy` information 25 | * 26 | * @param {Object} options 27 | * @return {Function} 28 | */ 29 | 30 | function Alchemy(options) { 31 | options = options || {}; 32 | assert(options.key, 'Alchemy requires an API key: http://www.alchemyapi.com/api/register.html') 33 | 34 | function alchemy(ctx, fn) { 35 | var endpoint = fmt(api, options.key, ctx.url); 36 | 37 | // request the data 38 | superagent.get(endpoint, function(err, res) { 39 | if (err) return fn(err); 40 | else if (error(res.status)) return fn(new Error(res.status + ': ' + res.statusText)); 41 | 42 | var body = res.body; 43 | 44 | ctx.state = assign(ctx.state, { 45 | entities: body.entities, 46 | language: body.language 47 | }); 48 | 49 | fn(null, ctx); 50 | }); 51 | 52 | } 53 | 54 | return alchemy; 55 | } 56 | 57 | /** 58 | * Check the error statsu 59 | * 60 | * @param {Number} status 61 | * @return {Boolean} 62 | */ 63 | 64 | function error(status) { 65 | var type = status / 100 | 0; 66 | return 4 == type || 5 == type; 67 | } 68 | -------------------------------------------------------------------------------- /plugins/readability.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Module Dependencies 3 | */ 4 | 5 | var superagent = require('superagent'); 6 | var assign = require('object-assign'); 7 | var fmt = require('util').format; 
8 | var assert = require('assert'); 9 | 10 | /** 11 | * Module Dependencies 12 | */ 13 | 14 | module.exports = Readability; 15 | 16 | /** 17 | * API endpoint 18 | */ 19 | 20 | var api = 'https://readability.com/api/content/v1/parser?url=%s&token=%s'; 21 | 22 | /** 23 | * Pass th 24 | */ 25 | 26 | function Readability(options) { 27 | options = options || {}; 28 | assert(options.key, 'Readability requires an API key: https://readability.com/settings/account'); 29 | 30 | function readability(ctx, fn) { 31 | var endpoint = fmt(api, ctx.url, options.key); 32 | 33 | // request the data 34 | superagent.get(endpoint, function(err, res) { 35 | if (err) return fn(err); 36 | else if (error(res.status)) return fn(new Error(res.status + ': ' + res.statusText)); 37 | 38 | var body = JSON.parse(res.text); 39 | ctx.state = assign(ctx.state, body); 40 | 41 | fn(null, ctx); 42 | }); 43 | } 44 | 45 | return readability; 46 | } 47 | 48 | /** 49 | * Check the error statsu 50 | * 51 | * @param {Number} status 52 | * @return {Boolean} 53 | */ 54 | 55 | function error(status) { 56 | var type = status / 100 | 0; 57 | return 4 == type || 5 == type; 58 | } 59 | -------------------------------------------------------------------------------- /plugins/request.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Module Dependencies 3 | */ 4 | 5 | var superagent = require('superagent'); 6 | 7 | /** 8 | * Export `Request` 9 | */ 10 | 11 | module.exports = Request; 12 | 13 | /** 14 | * Create the request 15 | */ 16 | 17 | function Request() { 18 | 19 | function request(ctx, fn) { 20 | if (ctx.body) return fn(null, ctx); 21 | 22 | superagent[ctx.method](ctx.url, function (err, res, body) { 23 | if (err) return fn(err); 24 | ctx.status = res.statusCode; 25 | ctx.body = res.body; 26 | fn(null, ctx); 27 | }); 28 | } 29 | 30 | return request; 31 | } 32 | -------------------------------------------------------------------------------- /plugins/tika.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Module Dependencies 3 | */ 4 | 5 | var debug = require('debug')('page-pipe:tika'); 6 | var concat = require('concat-stream'); 7 | var assign = require('object-assign'); 8 | var request = require('request'); 9 | var assert = require('assert'); 10 | 11 | /** 12 | * Export `tika` 13 | */ 14 | 15 | module.exports = Tika; 16 | 17 | /** 18 | * Attach `tika` information 19 | * 20 | * @param {Object} options 21 | */ 22 | 23 | function Tika(options) { 24 | options = options || {}; 25 | options.url = options.url; 26 | assert(options.url, 'Tika requires a configured server url: http://wiki.apache.org/tika/TikaJAXRS'); 27 | 28 | function tika(ctx, fn) { 29 | var opts = { 30 | url: options.url + '/meta', 31 | headers: { 32 | 'Accept': 'application/json', 33 | 'Content-Type': ctx.url 34 | } 35 | } 36 | 37 | // pipe request of page url to tika server 38 | request.get(ctx.url).pipe(request.put(opts).on('response', response)); 39 | 40 | // response 41 | function response(res) { 42 | if (error(res.statusCode)) { 43 | return fn(new Error(res.statusCode + ': ' + res.statusMessage)); 44 | } 45 | 46 | res.pipe(concat(function(meta) { 47 | var json = JSON.parse(meta.toString()); 48 | ctx.state = assign(ctx.state, json); 49 | fn(null, ctx); 50 | })) 51 | } 52 | } 53 | 54 | return tika; 55 | } 56 | 57 | /** 58 | * Check the error statsu 59 | * 60 | * @param {Number} status 61 | * @return {Boolean} 62 | */ 63 | 64 | function error(status) { 65 | var type = status / 100 | 0; 66 | return 4 == type || 5 == type; 67 | } 68 | --------------------------------------------------------------------------------