├── .gitignore ├── doc ├── es6.png ├── pp.png ├── sf-jj.gif ├── ali-pay.png ├── es6-pdf.png ├── pp-trace.png ├── pp-trace2.png └── wx-pay.jpeg ├── data ├── sf-juejin │ ├── err.png │ ├── sf.png │ └── done.png ├── zhentaoo │ └── zhentaoo.png └── monitor │ └── success │ ├── ZT-2017-9-19T21:17:10.png │ ├── ZT-2017-9-19T21:18:50.png │ └── ZT-2017-9-19T21:23:43.png ├── src ├── monitor │ ├── views │ │ ├── index.jade │ │ ├── error.jade │ │ └── layout.jade │ ├── public │ │ └── stylesheets │ │ │ └── style.css │ ├── routes │ │ ├── index.js │ │ └── monitor.js │ ├── scripts │ │ └── monitor.js │ ├── app.js │ └── bin │ │ └── www ├── trace.js ├── zhentaoo.js ├── es6-crawl.js ├── sf-juejin.js └── shuabi.js ├── package.json ├── tools └── tools.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /doc/es6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/es6.png -------------------------------------------------------------------------------- /doc/pp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/pp.png -------------------------------------------------------------------------------- /doc/sf-jj.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/sf-jj.gif -------------------------------------------------------------------------------- /doc/ali-pay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/ali-pay.png -------------------------------------------------------------------------------- /doc/es6-pdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/es6-pdf.png -------------------------------------------------------------------------------- /doc/pp-trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/pp-trace.png -------------------------------------------------------------------------------- /doc/pp-trace2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/pp-trace2.png -------------------------------------------------------------------------------- /doc/wx-pay.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/doc/wx-pay.jpeg -------------------------------------------------------------------------------- /data/sf-juejin/err.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/sf-juejin/err.png -------------------------------------------------------------------------------- /data/sf-juejin/sf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/sf-juejin/sf.png -------------------------------------------------------------------------------- /data/sf-juejin/done.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/sf-juejin/done.png -------------------------------------------------------------------------------- /data/zhentaoo/zhentaoo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/zhentaoo/zhentaoo.png -------------------------------------------------------------------------------- /src/monitor/views/index.jade: -------------------------------------------------------------------------------- 1 | extends layout 2 | 3 | block content 4 | h1= title 5 | p Welcome to #{title} 6 | -------------------------------------------------------------------------------- /src/monitor/views/error.jade: -------------------------------------------------------------------------------- 1 | extends layout 2 | 3 | block content 4 | h1= message 5 | h2= error.status 6 | pre #{error.stack} 7 | -------------------------------------------------------------------------------- /data/monitor/success/ZT-2017-9-19T21:17:10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/monitor/success/ZT-2017-9-19T21:17:10.png -------------------------------------------------------------------------------- /data/monitor/success/ZT-2017-9-19T21:18:50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/monitor/success/ZT-2017-9-19T21:18:50.png -------------------------------------------------------------------------------- /data/monitor/success/ZT-2017-9-19T21:23:43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhentaoo/puppeteer-deep/HEAD/data/monitor/success/ZT-2017-9-19T21:23:43.png -------------------------------------------------------------------------------- /src/monitor/public/stylesheets/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding: 50px; 3 | font: 14px "Lucida Grande", Helvetica, Arial, sans-serif; 4 | } 5 | 6 | a { 7 | color: #00B7FF; 8 | } 9 | -------------------------------------------------------------------------------- /src/monitor/views/layout.jade: -------------------------------------------------------------------------------- 1 | doctype html 2 | html 3 | head 4 | title= title 5 | link(rel='stylesheet', href='/stylesheets/style.css') 6 | body 7 | block content 8 | -------------------------------------------------------------------------------- /src/monitor/routes/index.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var router = express.Router(); 3 | 4 | /* GET home page. */ 5 | router.get('/', function(req, res, next) { 6 | res.render('index', { title: 'Monitor' }); 7 | }); 8 | 9 | module.exports = router; 10 | -------------------------------------------------------------------------------- /src/monitor/routes/monitor.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var router = express.Router(); 3 | 4 | /* GET users listing. */ 5 | router.get('/', function(req, res, next) { 6 | res.render('index', { title: `Monitor${req.query.img}` }); 7 | }); 8 | 9 | module.exports = router; 10 | -------------------------------------------------------------------------------- /src/trace.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | (async() => { 4 | const browser = await puppeteer.launch({headless: false}); 5 | const page = await browser.newPage(); 6 | 7 | await page.tracing.start({path: './data/trace/trace.json'}); 8 | await page.goto('http://www.zhentaoo.com'); 9 | await page.tracing.stop(); 10 | })(); 11 | -------------------------------------------------------------------------------- /src/zhentaoo.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | (async() => { 4 | const browser = await puppeteer.launch({headless: false}); 5 | const page = await browser.newPage(); 6 | await page.goto('http://www.zhentaoo.com', {waitUntil: 'networkidle'}); 7 | 8 | await page.screenshot({path: './data/zhentaoo/zhentaoo.png', type: 'png'}); 9 | 10 | // browser.close(); 11 | })(); 12 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "puppeteer-deep", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node src/zhentaoo.js", 8 | "test": "npm run juejin", 9 | "sf-juejin": "node src/sf-juejin.js", 10 | "zhentaoo": "node src/zhentaoo.js", 11 | "es6": "rm -rf ./es6-pdf/* && node src/es6-crawl.js", 12 | "trace": "node src/trace.js", 13 | "monitor": "node src/monitor/bin/www", 14 | "shuabi": "node src/shuabi.js", 15 | "shuabi0": "node src/shuabi0.js" 16 | }, 17 | "author": "", 18 | "license": "ISC", 19 | "dependencies": { 20 | "body-parser": "~1.15.2", 21 | "cookie-parser": "~1.4.3", 22 | "debug": "~2.2.0", 23 | "express": "~4.14.0", 24 | "jade": "~1.11.0", 25 | "morgan": "~1.7.0", 26 | "puppeteer": "^0.9.0", 27 | "request": "^2.81.0", 28 | "request-promise": "^4.2.1", 29 | "serve-favicon": "~2.3.0", 30 | "shelljs": "^0.7.8" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /tools/tools.js: -------------------------------------------------------------------------------- 1 | class Tools { 2 | static timeout(delay) { 3 | return new Promise((resolve, reject) => { 4 | setTimeout(() => { 5 | try { 6 | resolve(1) 7 | } catch (e) { 8 | reject(0) 9 | } 10 | }, delay) 11 | }) 12 | } 13 | 14 | /** 15 | * [TimeTools description] 16 | * @param {[type]} timestamp 12312312312312 17 | * @param {[type]} formatStr Y年M月D日 18 | * 19 | * M: month 1~12 20 | * Y: year 2017 21 | * D: date 0 ~ 31 22 | */ 23 | static moment(formatStr, timestamp) { 24 | let date = new Date(timestamp || new Date().getTime()) 25 | 26 | let M = date.getMonth() + 1 27 | 28 | let Y = date.getFullYear() 29 | 30 | let D = date.getDate() 31 | 32 | let h = date.getHours() 33 | 34 | let m = date.getMinutes() 35 | 36 | let s = date.getSeconds() 37 | 38 | return formatStr.replace('M', M).replace('Y', Y).replace('D', D).replace('h', h).replace('m', m).replace('s', s) 39 | } 40 | } 41 | 42 | module.exports = Tools; 43 | -------------------------------------------------------------------------------- /src/es6-crawl.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | var {timeout} = require('../tools/tools.js'); 3 | 4 | puppeteer.launch().then(async browser => { 5 | let page = await browser.newPage(); 6 | 7 | await page.goto('http://es6.ruanyifeng.com/#README'); 8 | await timeout(2000); 9 | 10 | let aTags = await page.evaluate(() => { 11 | let as = [...document.querySelectorAll('ol li a')]; 12 | return as.map((a) =>{ 13 | return { 14 | href: a.href.trim(), 15 | name: a.text 16 | } 17 | }); 18 | }); 19 | 20 | await page.pdf({path: `./data/es6-pdf/${aTags[0].name}.pdf`}); 21 | page.close() 22 | 23 | // 这里也可以使用promise all,但cpu可能吃紧,谨慎操作 24 | for (var i = 1; i < aTags.length; i++) { 25 | page = await browser.newPage() 26 | 27 | var a = aTags[i]; 28 | 29 | await page.goto(a.href); 30 | 31 | await timeout(2000); 32 | 33 | await page.pdf({path: `./data/es6-pdf/${a.name}.pdf`}); 34 | 35 | page.close(); 36 | } 37 | 38 | browser.close(); 39 | }); 40 | -------------------------------------------------------------------------------- /src/monitor/scripts/monitor.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | let {timeout, moment} = require('../../../tools/tools.js'); 3 | let rp = require('request-promise'); 4 | 5 | function monitor() { 6 | puppeteer.launch().then(async browser => { 7 | let page = await browser.newPage(); 8 | let date = moment("Y-M-DTh:m:s"); 9 | 10 | // 进入网站后,等待三秒 11 | await page.goto('http://www.zhentaoo.com/'); 12 | await timeout(3000); 13 | 14 | // 取出首页的文章title,如果有title为空,则截图存入err,mongo,结束本次任务 15 | let info = await page.evaluate(() => { 16 | let post = [...document.querySelectorAll('.post-title')]; 17 | return post.map((a) => a.innerText ); 18 | }); 19 | 20 | for (let i = 0; i < info.length; i++) { 21 | if (!info[i]) { 22 | let options = { 23 | uri: 'http://127.0.0.1:3000/monitor', 24 | qs: { 25 | img: `ZT-${date}.png` 26 | } 27 | }; 28 | 29 | rq(options); 30 | await page.screenshot({path: `./data/monitor/err/ZT-${date}.png`, type: 'png'}); 31 | browser.close(); 32 | } 33 | } 34 | 35 | // 如果正常则截图,结束任务 36 | await page.screenshot({path: `./data/monitor/success/ZT-${date}.png`, type: 'png'}); 37 | browser.close(); 38 | }); 39 | } 40 | 41 | monitor(); 42 | setInterval(monitor, 1000 * 60 * 5); 43 | -------------------------------------------------------------------------------- /src/monitor/app.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var path = require('path'); 3 | var favicon = require('serve-favicon'); 4 | var logger = require('morgan'); 5 | var cookieParser = require('cookie-parser'); 6 | var bodyParser = require('body-parser'); 7 | 8 | var index = require('./routes/index'); 9 | var monitor = require('./routes/monitor'); 10 | var child_process = require('child_process'); 11 | 12 | /** 13 | * 运行scripts脚本,定时抓取www.zhentaoo.com的首页 14 | */ 15 | child_process.spawn('node', ['./src/monitor/scripts/monitor.js']); 16 | 17 | /** 18 | * 启动express实例 19 | */ 20 | var app = express(); 21 | 22 | // view engine setup 23 | app.set('views', path.join(__dirname, 'views')); 24 | app.set('view engine', 'jade'); 25 | 26 | // uncomment after placing your favicon in /public 27 | //app.use(favicon(path.join(__dirname, 'public', 'favicon.ico'))); 28 | app.use(logger('dev')); 29 | app.use(bodyParser.json()); 30 | app.use(bodyParser.urlencoded({ extended: false })); 31 | app.use(cookieParser()); 32 | app.use(express.static(path.join(__dirname, 'public'))); 33 | 34 | app.use('/', index); 35 | app.use('/monitor', monitor); 36 | 37 | // catch 404 and forward to error handler 38 | app.use(function(req, res, next) { 39 | var err = new Error('Not Found'); 40 | err.status = 404; 41 | next(err); 42 | }); 43 | 44 | // error handler 45 | app.use(function(err, req, res, next) { 46 | // set locals, only providing error in development 47 | res.locals.message = err.message; 48 | res.locals.error = req.app.get('env') === 'development' ? err : {}; 49 | 50 | // render the error page 51 | res.status(err.status || 500); 52 | res.render('error'); 53 | }); 54 | 55 | module.exports = app; 56 | -------------------------------------------------------------------------------- /src/monitor/bin/www: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Module dependencies. 5 | */ 6 | 7 | var app = require('../app'); 8 | var debug = require('debug')('xxx:server'); 9 | var http = require('http'); 10 | 11 | /** 12 | * Get port from environment and store in Express. 13 | */ 14 | 15 | var port = normalizePort(process.env.PORT || '3000'); 16 | app.set('port', port); 17 | 18 | /** 19 | * Create HTTP server. 20 | */ 21 | 22 | var server = http.createServer(app); 23 | 24 | /** 25 | * Listen on provided port, on all network interfaces. 26 | */ 27 | 28 | server.listen(port); 29 | server.on('error', onError); 30 | server.on('listening', onListening); 31 | 32 | /** 33 | * Normalize a port into a number, string, or false. 34 | */ 35 | 36 | function normalizePort(val) { 37 | var port = parseInt(val, 10); 38 | 39 | if (isNaN(port)) { 40 | // named pipe 41 | return val; 42 | } 43 | 44 | if (port >= 0) { 45 | // port number 46 | return port; 47 | } 48 | 49 | return false; 50 | } 51 | 52 | /** 53 | * Event listener for HTTP server "error" event. 54 | */ 55 | 56 | function onError(error) { 57 | if (error.syscall !== 'listen') { 58 | throw error; 59 | } 60 | 61 | var bind = typeof port === 'string' 62 | ? 'Pipe ' + port 63 | : 'Port ' + port; 64 | 65 | // handle specific listen errors with friendly messages 66 | switch (error.code) { 67 | case 'EACCES': 68 | console.error(bind + ' requires elevated privileges'); 69 | process.exit(1); 70 | break; 71 | case 'EADDRINUSE': 72 | console.error(bind + ' is already in use'); 73 | process.exit(1); 74 | break; 75 | default: 76 | throw error; 77 | } 78 | } 79 | 80 | /** 81 | * Event listener for HTTP server "listening" event. 82 | */ 83 | 84 | function onListening() { 85 | var addr = server.address(); 86 | var bind = typeof addr === 'string' 87 | ? 'pipe ' + addr 88 | : 'port ' + addr.port; 89 | debug('Listening on ' + bind); 90 | } 91 | -------------------------------------------------------------------------------- /src/sf-juejin.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer') 2 | var {timeout} = require('../tools/tools.js'); 3 | 4 | var delay = 1000 5 | // 以下拿掘金开刀,贡献私人测试账号 6 | // puppeteer.launch().then(async browser => { 7 | puppeteer.launch({headless: false}).then(async browser => { 8 | var page = await browser.newPage() 9 | page.setViewport({width: 1200, height: 600}) 10 | 11 | /** 1. 到sf获取最新的前端文章 **/ 12 | try { 13 | await page.goto('https://segmentfault.com/news/frontend') 14 | await timeout(delay) 15 | 16 | var SfFeArticleList = await page.evaluate(() => { 17 | var list = [...document.querySelectorAll('.news__list .news__item-title a')] 18 | 19 | return list.map(el => { 20 | return {href: el.href.trim(), title: el.innerText} 21 | }) 22 | }) 23 | 24 | console.log('SfFeArticleList:', SfFeArticleList); 25 | 26 | await page.screenshot({path: './data/sf-juejin/sf.png', type: 'png'}); 27 | } catch (e) { 28 | console.log('sf err:', e); 29 | } 30 | 31 | /** 登录juejin **/ 32 | try { 33 | await timeout(3000) 34 | await page.goto('https://juejin.im') 35 | await timeout(3000) 36 | 37 | var login = await page.$('.login') 38 | await login.click() 39 | 40 | var loginPhoneOrEmail = await page.$('[name=loginPhoneOrEmail]') 41 | console.log('loginPhoneOrEmail:', loginPhoneOrEmail); 42 | await loginPhoneOrEmail.click() 43 | await page.type('18516697699@163.com', {delay: 20}) 44 | 45 | var password = await page.$('[placeholder=请输入密码]') 46 | console.log('password:', password); 47 | await password.click() 48 | await page.type('aaa123456', {delay: 20}) 49 | 50 | var authLogin = await page.$('.panel .btn') 51 | console.log('authLogin:', authLogin); 52 | await authLogin.click() 53 | 54 | } catch (e) {} 55 | 56 | /** 随机推荐一篇从sf拿来的文章到掘金 **/ 57 | try { 58 | await timeout(2500) 59 | var seed = Math.floor(Math.random() * 30) 60 | var theArtile = SfFeArticleList[seed] 61 | 62 | var add = await page.$('.main-nav .more') 63 | await add.click() 64 | 65 | var addLink = await page.$('.more-list .item') 66 | await addLink.click() 67 | 68 | await timeout(2500) 69 | 70 | var shareUrl = await page.$('.entry-form-input .url-input') 71 | await shareUrl.click() 72 | await page.type(theArtile.href, {delay: 20}) 73 | 74 | await page.press('Tab') 75 | await page.type(theArtile.title, {delay: 20}) 76 | 77 | await page.press('Tab') 78 | await page.type(theArtile.title, {delay: 20}) 79 | 80 | await page.evaluate(() => { 81 | let li = [...document.querySelectorAll('.category-list-box .category-list .item')] 82 | li.forEach(el => { 83 | if (el.innerText == '前端') 84 | el.click() 85 | }) 86 | }) 87 | 88 | var submitBtn = await page.$('.submit-btn') 89 | await submitBtn.click() 90 | 91 | } catch (e) { 92 | await page.screenshot({path: './data/sf-juejin/err.png', type: 'png'}); 93 | } 94 | 95 | await page.screenshot({path: './data/sf-juejin/done.png', type: 'png'}); 96 | // await page.close() 97 | // browser.close() 98 | }) 99 | -------------------------------------------------------------------------------- /src/shuabi.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | var {timeout} = require('../tools/tools.js'); 3 | 4 | // var fxUrl = 'https://yeecall.gl.yeecall.com/activity/share/5a59a9d14cc2562b0fd32ec5' 5 | // var meUrl = 'https://yeecall.gl.yeecall.com/activity/share/5a599e2664325570dd5b6c25' 6 | 7 | var url1 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a2507edea6a2010998444' 8 | var url2 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a24deebf2136b273c05ce' 9 | var url3 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a297debf2136b273c26ab' 10 | var url4 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a2986ebf2136b273c26f0' 11 | var url5 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a29add6ab05313934d7f6' 12 | var url6 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a29c4ebf2136b273c2851' 13 | var url7 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a2ec6edea6a201099cf70' 14 | var url8 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a2ecdd6ab0531393500ad' 15 | var url9 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a2edad6ab05313935010a' 16 | var url10 = 'https://yeecall.gl.yeecall.com/activity/share?t=5a5a2ee2edea6a201099d040' 17 | 18 | var token = [ 19 | '0x1AA59c01fa169fB6A4a2E2D7DB9D02db9A9e', 20 | '0x1BD55G01fa169fkjA7a2E2D7SBG8R0dgy928', 21 | '0x1TG85G01faII2fkjA7a3G2DD76BG8D10dgyO', 22 | '0x1TON15GMUXaIO3BkjA7a3M8UD78YB8D1B3Gy', 23 | '0x100im57UNXaIa7bkjA7a3M8UD78YB8D1B3Gy', 24 | '0x5197fa565CED81cf5d599169e73754F0EDdC', 25 | '0xD3Da927d3698832fA68568152E43E9824b30', 26 | '0x743fFa1891640d0eA108d69362a287ea1063', 27 | '0x56B86E766860D8866B42e87B041BBa8065AE', 28 | ] 29 | 30 | function rdToken() { 31 | var seed = [ 32 | 1,2,3,4,5,6,7,8,9,0, 33 | 'a','b','c','d','e','f','g', 34 | 'h','i','j','k','l','m','n', 35 | 'o','p','q','r','s','t', 36 | 'u','v','w','x','y','z', 37 | 'A','B','C','D','E','F','G', 38 | 'H','I','J','K','L','M','N', 39 | 'O','P','Q','R','S','T', 40 | 'U','V','W','X','Y','Z', 41 | ] 42 | var rd1 = Math.floor( Math.random() * 62 ) 43 | var rd2 = Math.floor( Math.random() * 62 ) 44 | var rd3 = Math.floor( Math.random() * 62 ) 45 | var rd4 = Math.floor( Math.random() * 62 ) 46 | var tookenRand = Math.floor( Math.random() * 9 ) 47 | 48 | return token[tookenRand] + seed[rd1] + seed[rd2] + seed[rd3] + seed[rd4] 49 | } 50 | var count = 0 51 | puppeteer.launch().then(async browser => { 52 | // puppeteer.launch({headless: false}).then(async browser => { 53 | let p1 = await browser.newPage(); 54 | let p2 = await browser.newPage(); 55 | let p3 = await browser.newPage(); 56 | let p4 = await browser.newPage(); 57 | let p5 = await browser.newPage(); 58 | let p6 = await browser.newPage(); 59 | let p7 = await browser.newPage(); 60 | let p8 = await browser.newPage(); 61 | let p9 = await browser.newPage(); 62 | let p10 = await browser.newPage(); 63 | 64 | oneVisit(p1, url1) 65 | oneVisit(p2, url2) 66 | oneVisit(p3, url3) 67 | oneVisit(p4, url4) 68 | oneVisit(p5, url5) 69 | oneVisit(p6, url6) 70 | oneVisit(p7, url7) 71 | oneVisit(p8, url8) 72 | oneVisit(p9, url9) 73 | oneVisit(p10, url10) 74 | 75 | rdLoop() 76 | 77 | function rdLoop() { 78 | var time = Math.floor( Math.random() * 40 ) + 5 79 | console.log('time:', time); 80 | 81 | setTimeout(function () { 82 | oneVisit(p1, url1) 83 | oneVisit(p2, url2) 84 | oneVisit(p3, url3) 85 | oneVisit(p4, url4) 86 | oneVisit(p5, url5) 87 | oneVisit(p6, url6) 88 | oneVisit(p7, url7) 89 | oneVisit(p8, url8) 90 | oneVisit(p9, url9) 91 | oneVisit(p10, url10) 92 | 93 | console.log('time:', time); 94 | rdLoop() 95 | }, time * 1000); 96 | } 97 | 98 | async function oneVisit(page, url) { 99 | await page.goto(url); 100 | 101 | var input = await page.$('input') 102 | await input.click() 103 | await page.type(rdToken(), {delay: 20}) 104 | await timeout(500); 105 | 106 | var submit = await page.$('button') 107 | await submit.click() 108 | await timeout(500); 109 | 110 | var ICO_TOKEN = await page.evaluate(() => { 111 | if (localStorage.ICO_TOKEN) { 112 | delete localStorage.ICO_TOKEN 113 | location.reload() 114 | } 115 | return localStorage.ICO_TOKEN 116 | }) 117 | } 118 | }); 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 首先介绍Puppeteer 2 | - Puppeteer是一个node库,他提供了一组用来操纵Chrome的API,理论上使用它可以做任何Chrome可以做的事 3 | - 有点类似于PhantomJS,但Puppeteer由Chrome官方团队进行维护,前景更好 4 | - Puppeteer的应用场景会非常多,就爬虫领域来说,远比一般的爬虫工具功能更丰富,性能分析、自动化测试也不在话下 5 | - [Puppeteer官方文档请猛戳这里](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions) 6 | 7 | ## 本项目会针对Puppeteer应用场景做几个可用的DEMO 8 | 1. 高级爬虫(有别于传统爬虫.使用Puppeteer可以拿到渲染后的效果。而传统爬虫相当于只能拿到http response,对字符串进行解析) 9 | 2. UI自动化测试(使用Puppeteer可以模拟用户操作,模拟表单填写) 10 | 3. 页面性能分析 (使用chrome的timeline,也就是Puppeteer提供的trace API) 11 | 12 | ## 项目Repo && Usage 13 | 1. git clone https://github.com/zhentaoo/puppeteer-deep 14 | 2. npm install (puppeteer在win下100+M、mac下70+M,请耐心等候) 15 | 16 | - npm run sf-juejin (推荐segmentfault的热门文章到掘金) 17 | - npm run monitor (前端监控、报警) 18 | - npm run es6 (爬取了阮一峰老师的《ES6标准入门》并打印PDF) 19 | - npm run zhentaoo (打印 www.zhentaoo.com 首页的图片) 20 | - npm run trace (生成 www.zhentaoo.com 的trace.json,并分析性能) 21 | 22 | ## 一、 UI自动化测试--自动推荐segmentfault的热门文章到掘金 23 | #### 1. 废话不多说,先上动图/视频看效果 24 | GIF图片比较大,如果不能加载成功,也可以到微博看下录制的视频 25 | http://weibo.com/tv/v/FiHMz7dcq?fid=1034:dcc08a8eee118263f6071fb6fafcc9a9 26 | 27 | 28 | 29 | #### 2. 开始介绍,第一步,爬取 segmentfault 前30篇热门文章 30 | - 跳转到https://segmentfault.com/news/frontend 31 | - 接着分析SF首页的Dom结构,爬取每篇文章的链接 32 | - 然后取出每篇文章最重要的 href,title 等信息 33 | - 具体代码如下: 34 | ```js 35 | await page.goto('https://segmentfault.com/news/frontend') 36 | 37 | var SfFeArticleList = await page.evaluate(() => { 38 | var list = [...document.querySelectorAll('.news__list .news__item-title a')] 39 | return list.map(el => { 40 | return {href: el.href.trim(), title: el.innerText} 41 | }) 42 | }) 43 | 44 | await page.screenshot({path: './sf-juejin/sf.png', type: 'png'}); 45 | ``` 46 | 47 | #### 3. 登录掘金 (这里我事先注册了个测试账号,大家可以替换成自己的) 48 | - 跳转到掘金,模拟点击登录按钮 49 | - 接着,会弹出一个的登录dialog,模拟输入用户名密码 50 | - 模拟点击登录,稍等....嗯...掘金应该把cookie写好了.... 51 | - 代码如下: 52 | ```js 53 | await page.goto('https://juejin.im') 54 | 55 | var login = await page.$('.login') 56 | await login.click() 57 | 58 | var loginPhoneOrEmail = await page.$('[name=loginPhoneOrEmail]') 59 | await loginPhoneOrEmail.click() 60 | await page.type('18516697699@163.com', {delay: 20}) 61 | 62 | var password = await page.$('[placeholder=请输入密码]') 63 | await password.click() 64 | await page.type('123456', {delay: 20}) 65 | 66 | var authLogin = await page.$('.panel .btn') 67 | await authLogin.click() 68 | ``` 69 | #### 4.推荐文章(使用第一步从SF爬取的文章信息) 70 | - 模拟点击推荐文章 按钮 “+” 71 | - 这时从SF拿到的文章信息就派上用场了,随机取出一篇: Math.floor(Math.random() * 30) 72 | - 模拟填写推荐表单,点击发布 73 | - 嗯,有时会提示该文章已被分享,那就换一篇吧,再执行一次。 74 | - 代码如下 75 | ```js 76 | var seed = Math.floor(Math.random() * 30) 77 | var theArtile = SfFeArticleList[seed] 78 | 79 | var add = await page.$('.main-nav .ion-android-add') 80 | await add.click() 81 | 82 | var shareUrl = await page.$('.entry-form-input .url-input') 83 | await shareUrl.click() 84 | await page.type(theArtile.href, {delay: 20}) 85 | 86 | await page.press('Tab') 87 | await page.type(theArtile.title, {delay: 20}) 88 | 89 | await page.press('Tab') 90 | await page.type(theArtile.title, {delay: 20}) 91 | 92 | await page.evaluate(() => { 93 | let li = [...document.querySelectorAll('.category-list-box .category-list .item')] 94 | li.forEach(el => { 95 | if (el.innerText == '前端') 96 | el.click() 97 | }) 98 | }) 99 | 100 | var submitBtn = await page.$('.submit-btn') 101 | await submitBtn.click() 102 | ``` 103 | 104 | ## 二、 前端监控系统 105 | 106 | #### 代码 https://github.com/zhentaoo/hawk-eye 107 | 108 | #### 1. 为什么要有前端监控系统? 109 | > 目前市面上以及各大公司流行的监控系统,都是API层的监控,包括调用量、数据、响应时长..... 110 | > 似乎只要接口没问题,整个系统就是稳定运行的,一切皆大欢喜 111 | > 但事实并非如此,CDN、DNS、Webview等等这些条件,都可能导致前端渲染失败、白屏 112 | > 离用户最近的一层--前端,却迟迟没有被加入监控列表,无形中流失多少用户..... 113 | 114 | #### 2. Node Express Server 115 | - 使用 `express monitor` 命令,生成express项目模版 116 | - 安装并启动mongodb,推荐 robomongo 可视化工具 117 | - 提供两个接口,1当Puppeteer发现网页渲染有异常则调用,2获取系统监控状态 118 | 119 | #### 3. 定时脚本 120 | - 设置定是脚本,每隔5分钟,访问 www.zhentaoo.com,抓取关键信息,并生成截图 121 | - 如果信息获取失败,则将截图保存至err目录,并记入数据库中 122 | ```js 123 | function monitor() { 124 | puppeteer.launch().then(async browser => { 125 | let page = await browser.newPage(); 126 | 127 | await page.goto('http://www.zhentaoo.com/'); 128 | await timeout(2000); 129 | 130 | let aTags = await page.evaluate(() => { 131 | let as = [...document.querySelectorAll('ol li a')]; 132 | return as.map((a) =>{ 133 | return { 134 | href: a.href.trim(), 135 | name: a.text 136 | } 137 | }); 138 | }); 139 | 140 | await page.screenshot({path: './data/zhentaoo/zhentaoo.png', type: 'png'}); 141 | browser.close(); 142 | }); 143 | } 144 | 145 | monitor(); 146 | setInterval(monitor, 1000 * 60 * 5); 147 | ``` 148 | 149 | #### 4. 进阶:与Chrome插件集成 150 | > 如果单纯的监控系统,每每需要点击,然后去看监控的情况,想必也有些麻烦 151 | > 那么为何不做个Chrome插件,显示监控状态呢? 152 | > 好吧,可以看我的另一个repo,https://github.com/zhentaoo/bitcoin-price,学习如何写一个chrome插件 153 | > 然后监控系统提供API给Chrome插件使用 154 | 155 | ## 三、高级爬虫--爬取《ES6标准入门》并打印成PDF 156 | 157 | #### 1. 运行Puppeteer,使用launch 158 | ```js 159 | puppeteer.launch().then(async browser => { 160 | ...... 161 | what you want 162 | ...... 163 | }) 164 | ``` 165 | 166 | #### 2. 跳转至 [阮一峰老师的ES6博客](http://es6.ruanyifeng.com/#README),使用goto 167 | ```js 168 | let page = await browser.newPage(); 169 | await page.goto('http://es6.ruanyifeng.com/#README'); 170 | ``` 171 | 172 | #### 3. 分析博客左侧导航栏的dom结构,并拿到所有链接的href、title信息 173 | ```js 174 | let as = [...document.querySelectorAll('ol li a')]; 175 | return as.map((a) =>{ 176 | return { 177 | href: a.href.trim(), 178 | name: a.text 179 | } 180 | }); 181 | ``` 182 | 183 | #### 4. 使用Puppeteer打印当前页面的PDF,使用pdf 184 | ```js 185 | await page.pdf({path: `./es6-pdf/${aTags[0].name}.pdf`}); 186 | ``` 187 | 188 | #### 5. 最终结果,将20多页博客打印成PDF 189 | 190 | 191 | 192 | ## 四、性能分析--Puppeteer Trace API 193 | 194 | #### 1. 简单介绍 Trace API 195 | > Trace API其实很简单,主要是使用Chrome Performance,生成当前页面的 性能追踪 文件, 196 | 然后将该文件上传给Chrome,就可以利用Chrome的开发者工具分析火焰图、各种数据参数 197 | 198 | #### 2. API: 使用 tracing start,stop生成trace.json 199 | ```js 200 | await page.tracing.start({path: './data/trace/trace.json'}); 201 | await page.goto('http://www.zhentaoo.com'); 202 | await page.tracing.stop(); 203 | ``` 204 | 205 | 206 | #### 3. 将trace.json上传给chrome,如下图 207 | 208 | 209 | #### 4. Chrome Performance/Timeline 使用教程 210 | 关于Chrome Performance/Timeline的使用又是一个大篇幅,这里提供一个教程 211 | - [Chrome 开发者工具](https://developers.google.com/web/tools/chrome-devtools/?hl=zh-cn) 212 | - [如何查看性能](https://developers.google.com/web/tools/chrome-devtools/?hl=zh-cn) 213 | - [分析运行时性能](https://developers.google.com/web/tools/chrome-devtools/evaluate-performance/timeline-tool?hl=zh-cn) 214 | - [诊断强制的同步布局](https://developers.google.com/web/tools/chrome-devtools/rendering-tools/forced-synchronous-layouts?hl=zh-cn) 215 | 216 | 217 | ## 结语 218 | 1. 为了效果展示,这里使用的headless: false模式,实际使用时可以同时开n个page,模拟操作,大家可以尝试改改,也可以给我提PR 219 | 2. 目前已经带领大家,使用Puppeteer完成爬虫 和 UI自动化测试,接下来可能会出第三篇,应该会是关于前端性能分析 220 | 3. 其实Puppeteer的应用场景远不止这些,大家也可以使用它在各自的领域大放异彩!!! 221 | 4. 希望掘金小编不会打我.... 222 | 223 | ## 赞助 224 | 如果你觉得该项目对你有用,欢迎打赏作者,你的打赏是开源的强大动力~~~ 225 |

226 | 227 | --------------------------------------------------------------------------------