├── .gitignore ├── deploy.sh ├── README.md ├── package.json ├── lambda.js ├── devserver.js └── scraper ├── index.js └── phantomjs-script.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .tmp/ 3 | dist/ 4 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PACKAGE=lambda-phantom-scraper.zip 4 | OUTPUT=dist 5 | 6 | aws lambda update-function-code \ 7 | --region us-east-1 \ 8 | --function-name lambda-phantom-scraper \ 9 | --zip-file fileb://$PWD/$OUTPUT/$PACKAGE 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Lambda Phantom Scraper 2 | ====================== 3 | 4 | An example of PhantomJS/Node.js web scraper for AWS Lambda. 5 | 6 | This repository contains the source code for "Scraping the Web with AWS Lambda and PhantomJS" [talk](https://speakerdeck.com/akrylysov/scraping-the-web-with-aws-lambda-and-phantomjs) given at Greater Philadelphia AWS User Group meetup on May 25, 2016. 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lambda-phantom-scraper", 3 | "version": "0.1.0", 4 | "description": "An example of PhantomJS/Node.js web scraper for AWS Lambda", 5 | "author": { 6 | "name": "Artem Krylysov" 7 | }, 8 | "dependencies": { 9 | "phantomjs-prebuilt": "2.1.7" 10 | }, 11 | "devDependencies": { 12 | "express": "4.13.4", 13 | "body-parser": "1.15.1 " 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /lambda.js: -------------------------------------------------------------------------------- 1 | var scrapper = require('./scraper'); 2 | 3 | exports.handler = function(event, context, callback) { 4 | if (event.url) { 5 | scrapper.scrape(event.url, function(err, result) { 6 | if (err) { 7 | return callback(null, {error: result}); 8 | } 9 | callback(null, {result: result}); 10 | }) 11 | } 12 | else { 13 | callback(null, {error: 'bad query'}); 14 | } 15 | }; 16 | -------------------------------------------------------------------------------- /devserver.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var bodyParser = require('body-parser'); 3 | var lambda = require('./lambda'); 4 | var app = express(); 5 | 6 | app.use(bodyParser.json()); 7 | 8 | app.post('/', function(req, res) { 9 | lambda.handler(req.body, {}, function(err, result) { 10 | if (err) { 11 | return res.send(err); 12 | } 13 | res.send(result); 14 | }); 15 | }); 16 | 17 | app.listen(3000); 18 | -------------------------------------------------------------------------------- /scraper/index.js: -------------------------------------------------------------------------------- 1 | var path = require('path'); 2 | var childProcess = require('child_process'); 3 | var phantomJsPath = require('phantomjs-prebuilt').path; 4 | 5 | exports.scrape = function(url, callback) { 6 | var childArgs = [path.join(__dirname, 'phantomjs-script.js')]; 7 | var phantom = childProcess.execFile(phantomJsPath, childArgs, { 8 | env: { 9 | URL: url 10 | }, 11 | maxBuffer: 2048*1024 12 | }); 13 | 14 | var stdout = ''; 15 | var stderr = ''; 16 | 17 | phantom.stdout.on('data', function(data) { 18 | stdout += data; 19 | }); 20 | 21 | phantom.stderr.on('data', function(data) { 22 | stderr += data; 23 | }); 24 | 25 | phantom.on('uncaughtException', function(err) { 26 | console.log('uncaught exception: ' + err); 27 | }); 28 | 29 | phantom.on('exit', function(exitCode) { 30 | if (exitCode !== 0) { 31 | return callback(true, stderr); 32 | } 33 | callback(null, stdout); 34 | }); 35 | }; 36 | -------------------------------------------------------------------------------- /scraper/phantomjs-script.js: -------------------------------------------------------------------------------- 1 | var system = require('system'); 2 | var env = system.env; 3 | var page = require('webpage').create(); 4 | 5 | page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'; 6 | page.settings.resourceTimeout = 10000; 7 | page.viewportSize = { 8 | width: 1366, 9 | height: 768 10 | }; 11 | 12 | var error = ''; 13 | 14 | page.onResourceError = function(resourceError) { 15 | error = resourceError.errorString; 16 | }; 17 | 18 | page.open(env.URL, function(status) { 19 | if (status == 'success') { 20 | function checkReadyState() { 21 | var readyState = page.evaluate(function() { 22 | return document.readyState; 23 | }); 24 | if (readyState == 'complete') { 25 | var result = page.evaluate(function() { 26 | return document.documentElement.outerHTML; 27 | }); 28 | system.stdout.write(result); 29 | phantom.exit(0); 30 | } 31 | else { 32 | setTimeout(checkReadyState, 50); 33 | } 34 | } 35 | checkReadyState(); 36 | } 37 | else { 38 | system.stderr.write(error); 39 | phantom.exit(1); 40 | } 41 | }); 42 | --------------------------------------------------------------------------------